From 13ed79a61a5d79831babeecf2570a43f407bb027 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 12 Mar 2023 16:43:47 +0300 Subject: [PATCH] arm: fix checking arm64 --- include/cglm/affine-mat.h | 8 ++--- include/cglm/mat2.h | 6 ++-- include/cglm/mat4.h | 20 ++++++------- include/cglm/quat.h | 4 +-- include/cglm/simd/arm.h | 6 ++++ include/cglm/simd/intrin.h | 52 +++++++++++++++++++++------------ include/cglm/simd/neon/affine.h | 2 +- include/cglm/simd/neon/mat2.h | 2 +- include/cglm/simd/neon/mat4.h | 2 +- include/cglm/simd/neon/quat.h | 2 +- include/cglm/vec4-ext.h | 2 +- include/cglm/vec4.h | 38 ++++++++++++------------ 12 files changed, 82 insertions(+), 62 deletions(-) diff --git a/include/cglm/affine-mat.h b/include/cglm/affine-mat.h index 51b5742..75607e7 100644 --- a/include/cglm/affine-mat.h +++ b/include/cglm/affine-mat.h @@ -26,7 +26,7 @@ # include "simd/avx/affine.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/affine.h" #endif @@ -53,7 +53,7 @@ glm_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mul_avx(m1, m2, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mul_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -109,7 +109,7 @@ void glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mul_rot_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mul_rot_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -158,7 +158,7 @@ void glm_inv_tr(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_inv_tr_sse2(mat); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_inv_tr_neon(mat); #else CGLM_ALIGN_MAT mat3 r; diff --git a/include/cglm/mat2.h b/include/cglm/mat2.h index f76382b..871d6bd 100644 --- a/include/cglm/mat2.h +++ b/include/cglm/mat2.h @@ -40,7 
+40,7 @@ # include "simd/sse2/mat2.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/mat2.h" #endif @@ -134,7 +134,7 @@ void glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat2_mul_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat2_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], @@ -224,7 +224,7 @@ void glm_mat2_scale(mat2 m, float s) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s))); #else m[0][0] = m[0][0] * s; diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index b73c888..c7c8abd 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -60,7 +60,7 @@ # include "simd/avx/mat4.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/mat4.h" #endif @@ -129,7 +129,7 @@ glm_mat4_copy(mat4 mat, mat4 dest) { glmm_store(dest[1], glmm_load(mat[1])); glmm_store(dest[2], glmm_load(mat[2])); glmm_store(dest[3], glmm_load(mat[3])); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest[0], vld1q_f32(mat[0])); vst1q_f32(dest[1], vld1q_f32(mat[1])); vst1q_f32(dest[2], vld1q_f32(mat[2])); @@ -199,7 +199,7 @@ glm_mat4_zero(mat4 mat) { glmm_store(mat[1], x0); glmm_store(mat[2], x0); glmm_store(mat[3], x0); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glmm_128 x0; x0 = vdupq_n_f32(0.0f); vst1q_f32(mat[0], x0); @@ -301,7 +301,7 @@ glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_avx(m1, m2, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mul_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -379,7 +379,7 @@ void glm_mat4_mulv(mat4 m, vec4 v, vec4 
dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mulv_sse2(m, v, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_mulv_neon(m, v, dest); #else vec4 res; @@ -499,7 +499,7 @@ void glm_mat4_transpose_to(mat4 m, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_transp_neon(m, dest); #else dest[0][0] = m[0][0]; dest[1][0] = m[0][1]; @@ -523,7 +523,7 @@ void glm_mat4_transpose(mat4 m) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, m); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_transp_neon(m, m); #else mat4 d; @@ -564,7 +564,7 @@ glm_mat4_scale(mat4 m, float s) { glm_mat4_scale_avx(m, s); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_scale_sse2(m, s); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_scale_neon(m, s); #else glm_mat4_scale_p(m, s); @@ -583,7 +583,7 @@ float glm_mat4_det(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glm_mat4_det_sse2(mat); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) return glm_mat4_det_neon(mat); #else /* [square] det(A) = det(At) */ @@ -618,7 +618,7 @@ void glm_mat4_inv(mat4 mat, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_inv_sse2(mat, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_inv_neon(mat, dest); #else float t[6]; diff --git a/include/cglm/quat.h b/include/cglm/quat.h index 9488e23..c76fa03 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -66,7 +66,7 @@ # include "simd/sse2/quat.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/quat.h" #endif @@ -440,7 +440,7 @@ glm_quat_mul(versor p, versor q, versor dest) { */ #if defined( __SSE__ ) || defined( __SSE2__ ) glm_quat_mul_sse2(p, q, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_quat_mul_neon(p, q, dest); #else 
dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1]; diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 01525d3..8ba5494 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -10,6 +10,12 @@ #include "intrin.h" #ifdef CGLM_SIMD_ARM +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) +# define CGLM_ARM64 1 +#else +# define CGLM_ARM64 0 +#endif + #define glmm_load(p) vld1q_f32(p) #define glmm_store(p, a) vst1q_f32(p, a) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 73fb675..80ef95e 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -63,29 +63,43 @@ #endif /* ARM Neon */ -/* TODO: check _M_ARM and compiling should work if there is no ARM64 and NEON */ -#if defined(__ARM_NEON) || defined(__ARM_NEON__) \ - || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) -# include <arm_neon.h> -# ifndef __ARM_NEON -# define __ARM_NEON +#if defined(_WIN32) +/* TODO: non-ARM stuff already imported, will this be better option */ +/* # include <intrin.h> */ + +# if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) +# include <arm64intr.h> +# include <arm64_neon.h> +# ifndef CGLM_NEON_FP +# define CGLM_NEON_FP 1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM +# endif +# elif defined(_M_ARM) +# include <armintr.h> +# include <arm_neon.h> +# if !defined(CGLM_NEON_FP) && (defined(__ARM_NEON_FP) || defined(vaddq_f32)) /* only when not preset by the user; vaddq_f32 is defined as macro, we pick it */ +# define CGLM_NEON_FP 1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM +# endif # endif -# ifndef __ARM_NEON_FP -# define __ARM_NEON_FP 1 -# define CGLM_NEON_FP 1 -# endif -# ifndef CGLM_ARM64 -# define CGLM_ARM64 1 -# endif -# ifndef CGLM_SIMD_ARM -# define CGLM_SIMD_ARM -# endif -# ifndef CGLM_SIMD_NEON -# define CGLM_SIMD_NEON 1 + +#else /* non-windows */ +# if defined(__ARM_NEON) || defined(__ARM_NEON__) +# include <arm_neon.h> +# if defined(__ARM_NEON_FP) +# define CGLM_NEON_FP 
1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM +# endif # endif #endif -#if defined(CGLM_SIMD_x86) || defined(CGLM_SIMD_NEON) +#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP) # ifndef CGLM_SIMD # define CGLM_SIMD # endif diff --git a/include/cglm/simd/neon/affine.h b/include/cglm/simd/neon/affine.h index e55ea6f..b0a65a6 100644 --- a/include/cglm/simd/neon/affine.h +++ b/include/cglm/simd/neon/affine.h @@ -7,7 +7,7 @@ #ifndef cglm_affine_neon_h #define cglm_affine_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/mat2.h b/include/cglm/simd/neon/mat2.h index d73e411..7d0d9eb 100644 --- a/include/cglm/simd/neon/mat2.h +++ b/include/cglm/simd/neon/mat2.h @@ -7,7 +7,7 @@ #ifndef cglm_mat2_neon_h #define cglm_mat2_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index e9f3f8a..2d1184e 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -7,7 +7,7 @@ #ifndef cglm_mat4_neon_h #define cglm_mat4_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/quat.h b/include/cglm/simd/neon/quat.h index f73988d..fbaf390 100644 --- a/include/cglm/simd/neon/quat.h +++ b/include/cglm/simd/neon/quat.h @@ -7,7 +7,7 @@ #ifndef cglm_quat_neon_h #define cglm_quat_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index 43d7214..e4e20cb 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -249,7 +249,7 @@ void glm_vec4_abs(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, glmm_abs(glmm_load(v))); -#elif defined(CGLM_SIMD_NEON) +#elif 
defined(CGLM_NEON_FP) vst1q_f32(dest, vabsq_f32(vld1q_f32(v))); #else dest[0] = fabsf(v[0]); diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 2272556..8e95ec5 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -139,7 +139,7 @@ void glm_vec4_copy(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, glmm_load(v)); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vld1q_f32(v)); #else dest[0] = v[0]; @@ -176,7 +176,7 @@ void glm_vec4_zero(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_setzero_ps()); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(0.0f)); #else v[0] = 0.0f; @@ -196,7 +196,7 @@ void glm_vec4_one(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_set1_ps(1.0f)); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(1.0f)); #else v[0] = 1.0f; @@ -322,7 +322,7 @@ void glm_vec4_add(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] + b[0]; @@ -344,7 +344,7 @@ void glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] + s; @@ -366,7 +366,7 @@ void glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vsubq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] - b[0]; @@ -388,7 +388,7 @@ void glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined( 
__SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] - s; @@ -410,7 +410,7 @@ void glm_vec4_mul(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmulq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] * b[0]; @@ -432,7 +432,7 @@ void glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] * s; @@ -516,7 +516,7 @@ glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_add_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vaddq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -544,7 +544,7 @@ glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_sub_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vsubq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -616,7 +616,7 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_max_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vmaxq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -644,7 +644,7 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_min_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, 
vaddq_f32(vld1q_f32(dest), vminq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -667,7 +667,7 @@ void glm_vec4_negate_to(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vnegq_f32(vld1q_f32(v))); #else dest[0] = -v[0]; @@ -748,7 +748,7 @@ float glm_vec4_distance(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) return glmm_norm(vsubq_f32(glmm_load(a), glmm_load(b))); #else return sqrtf(glm_pow2(a[0] - b[0]) @@ -770,7 +770,7 @@ float glm_vec4_distance2(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) return glmm_norm2(vsubq_f32(glmm_load(a), glmm_load(b))); #else return glm_pow2(a[0] - b[0]) @@ -792,7 +792,7 @@ void glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = glm_max(a[0], b[0]); @@ -814,7 +814,7 @@ void glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = glm_min(a[0], b[0]); @@ -837,7 +837,7 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), _mm_set1_ps(maxVal))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), 
vdupq_n_f32(minVal)), vdupq_n_f32(maxVal))); #else