mirror of
https://github.com/recp/cglm.git
synced 2025-10-04 09:08:53 +00:00
arm: optimize vec4 div with NEON
This commit is contained in:
4
CREDITS
4
CREDITS
@@ -79,3 +79,7 @@ https://stackoverflow.com/a/57793352/2676533
 https://stackoverflow.com/questions/32536265/how-to-convert-mm-shuffle-ps-sse-intrinsic-to-neon-intrinsic

 http://github.com/microsoft/DirectXMath
+
+16. ARM NEON Div
+
+http://github.com/microsoft/DirectXMath
@@ -131,6 +131,23 @@ glmm_norm_inf(float32x4_t a) {
|
||||
return glmm_hmax(glmm_abs(a));
|
||||
}
|
||||
|
||||
static inline
float32x4_t
glmm_div(float32x4_t a, float32x4_t b) {
#if CGLM_ARM64
  /* AArch64 NEON provides a native vector divide */
  return vdivq_f32(a, b);
#else
  /* ARMv7 NEON has no divide instruction: start from the hardware
     reciprocal estimate of b and apply 2 iterations of Newton-Raphson
     refinement (vrecpsq_f32 computes 2 - r*b), then multiply by a.
     NOTE(review): result is an approximation of a/b, not IEEE-exact. */
  float32x4_t r0, r1;
  r0 = vrecpeq_f32(b);        /* r0 ≈ 1/b (estimate)        */
  r1 = vrecpsq_f32(r0, b);    /* first refinement step      */
  r0 = vmulq_f32(r1, r0);
  r1 = vrecpsq_f32(r0, b);    /* second refinement step     */
  r0 = vmulq_f32(r1, r0);
  return vmulq_f32(a, r0);    /* a * (1/b)                  */
#endif
}
|
||||
|
||||
static inline
|
||||
float32x4_t
|
||||
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
|
||||
|
@@ -209,6 +209,12 @@ glmm_store3(float v[3], __m128 vx) {
|
||||
_mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
|
||||
}
|
||||
|
||||
/* component-wise division: dest = a / b (SSE) */
static inline
__m128
glmm_div(__m128 a, __m128 b) {
  __m128 quotient;
  quotient = _mm_div_ps(a, b);
  return quotient;
}
|
||||
|
||||
/* enable FMA macro for MSVC? */
|
||||
#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
|
||||
# define __FMA__ 1
|
||||
|
@@ -473,8 +473,8 @@ glm_vec4_scale_as(vec4 v, float s, vec4 dest) {
|
||||
CGLM_INLINE
|
||||
void
|
||||
glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
|
||||
#if defined( __SSE__ ) || defined( __SSE2__ )
|
||||
glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b)));
|
||||
#if defined(CGLM_SIMD)
|
||||
glmm_store(dest, glmm_div(glmm_load(a), glmm_load(b)));
|
||||
#else
|
||||
dest[0] = a[0] / b[0];
|
||||
dest[1] = a[1] / b[1];
|
||||
|
Reference in New Issue
Block a user