mirror of
https://github.com/recp/cglm.git
synced 2025-10-04 17:09:40 +00:00
arm: optimize vec4 div with NEON
This commit is contained in:
4
CREDITS
4
CREDITS
@@ -79,3 +79,7 @@ https://stackoverflow.com/a/57793352/2676533
|
|||||||
|
|
||||||
https://stackoverflow.com/questions/32536265/how-to-convert-mm-shuffle-ps-sse-intrinsic-to-neon-intrinsic
|
https://stackoverflow.com/questions/32536265/how-to-convert-mm-shuffle-ps-sse-intrinsic-to-neon-intrinsic
|
||||||
http://github.com/microsoft/DirectXMath
|
http://github.com/microsoft/DirectXMath
|
||||||
|
|
||||||
|
16. ARM NEON Div
|
||||||
|
|
||||||
|
http://github.com/microsoft/DirectXMath
|
||||||
|
@@ -131,6 +131,23 @@ glmm_norm_inf(float32x4_t a) {
|
|||||||
return glmm_hmax(glmm_abs(a));
|
return glmm_hmax(glmm_abs(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
float32x4_t
|
||||||
|
glmm_div(float32x4_t a, float32x4_t b) {
|
||||||
|
#if CGLM_ARM64
|
||||||
|
return vdivq_f32(a, b);
|
||||||
|
#else
|
||||||
|
/* 2 iterations of Newton-Raphson refinement of reciprocal */
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vrecpeq_f32(b);
|
||||||
|
r1 = vrecpsq_f32(r0, b);
|
||||||
|
r0 = vmulq_f32(r1, r0);
|
||||||
|
r1 = vrecpsq_f32(r0, b);
|
||||||
|
r0 = vmulq_f32(r1, r0);
|
||||||
|
return vmulq_f32(a, r0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
float32x4_t
|
float32x4_t
|
||||||
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
|
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
|
||||||
|
@@ -209,6 +209,12 @@ glmm_store3(float v[3], __m128 vx) {
|
|||||||
_mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
|
_mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
__m128
|
||||||
|
glmm_div(__m128 a, __m128 b) {
|
||||||
|
return _mm_div_ps(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
/* enable FMA macro for MSVC? */
|
/* enable FMA macro for MSVC? */
|
||||||
#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
|
#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
|
||||||
# define __FMA__ 1
|
# define __FMA__ 1
|
||||||
|
@@ -473,8 +473,8 @@ glm_vec4_scale_as(vec4 v, float s, vec4 dest) {
|
|||||||
CGLM_INLINE
|
CGLM_INLINE
|
||||||
void
|
void
|
||||||
glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
|
glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
|
||||||
#if defined( __SSE__ ) || defined( __SSE2__ )
|
#if defined(CGLM_SIMD)
|
||||||
glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b)));
|
glmm_store(dest, glmm_div(glmm_load(a), glmm_load(b)));
|
||||||
#else
|
#else
|
||||||
dest[0] = a[0] / b[0];
|
dest[0] = a[0] / b[0];
|
||||||
dest[1] = a[1] / b[1];
|
dest[1] = a[1] / b[1];
|
||||||
|
Reference in New Issue
Block a user