From 8a117017ea8094ba84188e321245ba295ecac559 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 7 Mar 2023 13:11:08 +0300 Subject: [PATCH 1/8] fix building ARM NEON on windows & msvc --- .vscode/settings.json | 3 +- include/cglm/simd/arm.h | 6 + include/cglm/simd/intrin.h | 8 +- include/cglm/simd/neon/mat4.h | 4 +- include/cglm/simd/neon/quat.h | 2 +- win/cglm-test.vcxproj | 224 +++++++++++++++++++++++++++++++- win/cglm.sln | 34 ++++- win/cglm.vcxproj | 234 +++++++++++++++++++++++++++++++++- 8 files changed, 501 insertions(+), 14 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 5b62438..c6af4c0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,5 @@ { "C_Cpp.default.configurationProvider": "vector-of-bool.cmake-tools", - "restructuredtext.confPath": "${workspaceFolder}/docs/source" + "restructuredtext.confPath": "${workspaceFolder}/docs/source", + "workbench.colorTheme": "Visual Studio Dark - C++" } \ No newline at end of file diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 50cec46..fdb13f0 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -38,6 +38,12 @@ #define glmm_combine_lh(x, y) vcombine_f32(vget_low_f32(x), vget_high_f32(y)) #define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y)) +#if defined(_WIN32) && defined(_MSC_VER) +# define glmm_float32x4_init(x, y, z, w) { .n128_f32 = { x, y, z, w } } +#else +# define glmm_float32x4_init(x, y, z, w) { x, y, z, w } +#endif + static inline float32x4_t glmm_abs(float32x4_t v) { diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index a6ca5b0..8f2fd55 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -17,7 +17,13 @@ # ifndef __SSE__ # define __SSE__ # endif +# elif defined(_M_ARM64) +# ifndef __ARM_NEON +/* TODO: is this valid */ +# define __ARM_NEON +# define __ARM_NEON_FP # endif +#endif /* do not use alignment for older visual studio versions */ # if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ # define CGLM_ALL_UNALIGNED @@ -63,7 +69,7 @@ #endif /* ARM Neon */ -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__ARM_NEON__) # include # if defined(__ARM_NEON_FP) # define CGLM_NEON_FP 1 diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 5b9f014..6691c4f 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -108,7 +108,7 @@ glm_mat4_det_neon(mat4 mat) { float32x4_t r0, r1, r2, r3, x0, x1, x2; float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34; float32x4x2_t a1; - float32x4_t x3 = { 0.f, -0.f, 0.f, -0.f }; + float32x4_t x3 = glmm_float32x4_init(0.f, -0.f, 0.f, -0.f); /* 127 <- 0, [square] det(A) = det(At) */ r0 = glmm_load(mat[0]); /* d c b a */ @@ -181,7 +181,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { x0, x1, x2, x3, x4, x5, x6, x7, x8; float32x4x2_t a1; float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh; - float32x4_t x9 = { -0.f, 0.f, -0.f, 0.f }; + float32x4_t x9 = glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f); x8 = vrev64q_f32(x9); diff --git a/include/cglm/simd/neon/quat.h b/include/cglm/simd/neon/quat.h index f6b9e99..e5adf61 100644 --- a/include/cglm/simd/neon/quat.h +++ b/include/cglm/simd/neon/quat.h @@ -23,7 +23,7 @@ glm_quat_mul_neon(versor p, versor q, versor dest) { */ glmm_128 xp, xq, xqr, r, x, y, z, s2, s3; - glmm_128 s1 = {-0.f, 0.f, 0.f, -0.f}; + glmm_128 s1 = glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f); float32x2_t qh, ql; xp = glmm_load(p); /* 3 2 1 0 */ diff --git a/win/cglm-test.vcxproj b/win/cglm-test.vcxproj index e38d77d..bdcdf8a 100644 --- a/win/cglm-test.vcxproj +++ b/win/cglm-test.vcxproj @@ -1,10 +1,34 @@ + + Debug + ARM + + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM + + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -70,26 +94,65 @@ Application true - v142 + v143 Unicode Application false - v142 + v143 true Unicode Application true - v142 + v143 + Unicode + + + Application + true + v143 + Unicode + + + Application + true + v143 + Unicode + + + Application + true + v143 Unicode Application false - v142 + v143 + true + Unicode + + + Application + false + v143 + true + Unicode + + + Application + false + v143 + true + Unicode + + + Application + false + v143 true Unicode @@ -107,19 +170,55 @@ + + + + + + + + + + + + + + + + + + false + + false + + + false + + + false + true true + + true + + + true + + + true + false @@ -144,6 +243,69 @@ %(AdditionalDependencies) + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ../include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + %(AdditionalDependencies) + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ../include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + %(AdditionalDependencies) + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ../include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + %(AdditionalDependencies) + + @@ -180,6 +342,60 @@ %(AdditionalDependencies) + + + + + Level3 + Disabled + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ../include;%(AdditionalIncludeDirectories) + Default + + + Console + true + %(AdditionalDependencies) + + + + + + + Level3 + Disabled + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ../include;%(AdditionalIncludeDirectories) + Default + + + Console + true + %(AdditionalDependencies) + + + + + + + Level3 + Disabled + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + ../include;%(AdditionalIncludeDirectories) + Default + + + Console + true + %(AdditionalDependencies) + + diff --git a/win/cglm.sln b/win/cglm.sln index 04f08b4..b34e272 100644 --- a/win/cglm.sln +++ b/win/cglm.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29123.88 +# Visual Studio Version 17 +VisualStudioVersion = 17.6.33417.168 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cglm", "cglm.vcxproj", "{CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}" EndProject @@ -9,24 +9,54 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cglm-test", "cglm-test.vcxp EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|ARM = Debug|ARM + Debug|ARM64 = Debug|ARM64 + Debug|ARM64EC = Debug|ARM64EC Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 + Release|ARM = Release|ARM + Release|ARM64 = Release|ARM64 + Release|ARM64EC = Release|ARM64EC Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|ARM.ActiveCfg = Debug|ARM + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|ARM.Build.0 = Debug|ARM + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|ARM64.Build.0 = Debug|ARM64 + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|x64.ActiveCfg = Debug|x64 {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|x64.Build.0 = Debug|x64 {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|x86.ActiveCfg = Debug|Win32 {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Debug|x86.Build.0 = Debug|Win32 + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|ARM.ActiveCfg = Release|ARM + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|ARM.Build.0 = Release|ARM + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|ARM64.ActiveCfg = Release|ARM64 + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|ARM64.Build.0 = Release|ARM64 + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|ARM64EC.Build.0 = Release|ARM64EC {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|x64.ActiveCfg = Release|x64 {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|x64.Build.0 = Release|x64 {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|x86.ActiveCfg = Release|Win32 {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}.Release|x86.Build.0 = Release|Win32 + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|ARM.ActiveCfg = Debug|ARM + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|ARM.Build.0 = Debug|ARM + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|ARM64.Build.0 = Debug|ARM64 + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|x64.ActiveCfg = Debug|x64 {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|x64.Build.0 = Debug|x64 {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|x86.ActiveCfg = Debug|Win32 {200E0DF1-7532-44E6-8273-84FB92C5557E}.Debug|x86.Build.0 = Debug|Win32 + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|ARM.ActiveCfg = Release|ARM + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|ARM.Build.0 = Release|ARM + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|ARM64.ActiveCfg = Release|ARM64 + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|ARM64.Build.0 = Release|ARM64 + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|ARM64EC.Build.0 = Release|ARM64EC {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|x64.ActiveCfg = Release|x64 {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|x64.Build.0 = Release|x64 {200E0DF1-7532-44E6-8273-84FB92C5557E}.Release|x86.ActiveCfg = Release|Win32 diff --git a/win/cglm.vcxproj b/win/cglm.vcxproj index 60c4af0..fea6218 100644 --- a/win/cglm.vcxproj +++ b/win/cglm.vcxproj @@ -1,10 +1,34 @@ - + + + Debug + ARM + + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM + + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -213,7 +237,7 @@ {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC} Win32Proj cglm - 10.0.17763.0 + 10.0 @@ -235,6 +259,24 @@ v141 Unicode + + DynamicLibrary + true + v141 + Unicode + + + DynamicLibrary + true + v141 + Unicode + + + DynamicLibrary + true + v141 + Unicode + DynamicLibrary false @@ -242,6 +284,27 @@ true Unicode + + DynamicLibrary + false + v141 + true + Unicode + + + DynamicLibrary + false + v141 + true + Unicode + + + DynamicLibrary + false + v141 + true + Unicode + @@ -256,9 +319,27 @@ + + + + + + + + + + + + + + + + + + true @@ -269,6 +350,18 @@ NativeRecommendedRules.ruleset true + + NativeRecommendedRules.ruleset + true + + + NativeRecommendedRules.ruleset + true + + + NativeRecommendedRules.ruleset + true + false NativeRecommendedRules.ruleset @@ -279,6 +372,21 @@ NativeRecommendedRules.ruleset true + + false + NativeRecommendedRules.ruleset + true + + + false + NativeRecommendedRules.ruleset + true + + + false + NativeRecommendedRules.ruleset + true + NotUsing @@ -314,6 +422,63 @@ Windows + + + Level3 + MaxSpeed + _DEBUG;_WINDOWS;_USRDLL;CGLM_EXPORTS;%(PreprocessorDefinitions) + AnySuitable + true + true + + + CompileAsC + None + Default + true + + + Windows + + + + + Level3 + MaxSpeed + _DEBUG;_WINDOWS;_USRDLL;CGLM_EXPORTS;%(PreprocessorDefinitions) + AnySuitable + true + true + + + CompileAsC + None + Default + true + + + Windows + + + + + Level3 + MaxSpeed + _DEBUG;_WINDOWS;_USRDLL;CGLM_EXPORTS;%(PreprocessorDefinitions) + AnySuitable + true + true + + + CompileAsC + None + Default + true + + + Windows + + Level3 @@ -354,7 +519,70 @@ true + + + Level3 + NotUsing + MaxSpeed + true + true + NDEBUG;_WINDOWS;_USRDLL;CGLM_EXPORTS;%(PreprocessorDefinitions) + None + + + CompileAsC + true + Default + + + Windows + true + true + + + + + Level3 + NotUsing + MaxSpeed + true + true + NDEBUG;_WINDOWS;_USRDLL;CGLM_EXPORTS;%(PreprocessorDefinitions) + None + + + CompileAsC + true + Default + + + Windows + true + true + + + + + Level3 + NotUsing + MaxSpeed + true + true + NDEBUG;_WINDOWS;_USRDLL;CGLM_EXPORTS;%(PreprocessorDefinitions) + None + + + CompileAsC + true + Default + + + Windows + true + true + + - \ No newline at end of file + From a30baffafa0b5f0536098bedbe3896f434706512 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 9 Mar 2023 21:56:25 +0300 Subject: [PATCH 2/8] arm: update ARM/NEON macros --- include/cglm/simd/arm.h | 4 ---- include/cglm/simd/intrin.h | 20 ++++++++++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index fdb13f0..676270c 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -10,10 +10,6 @@ #include "intrin.h" #ifdef CGLM_SIMD_ARM -#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) -# define CGLM_ARM64 1 -#endif - #define glmm_load(p) vld1q_f32(p) #define glmm_store(p, a) vst1q_f32(p, a) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 8f2fd55..1d955d0 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -69,13 +69,21 @@ #endif /* ARM Neon */ -#if defined(__ARM_NEON) || defined(__ARM_NEON__) +/* TODO: check _M_ARM and compiling should work if there is no ARM64 and NEON */ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) \ + || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) # include -# if defined(__ARM_NEON_FP) -# define CGLM_NEON_FP 1 -# ifndef CGLM_SIMD_ARM -# define CGLM_SIMD_ARM -# endif +# ifndef __ARM_NEON +# define __ARM_NEON +# endif +# ifndef __ARM_NEON_FP +# define __ARM_NEON_FP 1 +# endif +# ifndef CGLM_ARM64 +# define CGLM_ARM64 1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM # endif #endif From 9ed325018d51775b337caaf91c29fd74106e0a28 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 9 Mar 2023 22:06:20 +0300 Subject: [PATCH 3/8] Update intrin.h --- include/cglm/simd/intrin.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 1d955d0..f4c0fe9 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -17,12 +17,6 @@ # ifndef __SSE__ # define __SSE__ # endif -# elif defined(_M_ARM64) -# ifndef __ARM_NEON -/* TODO: is this valid */ -# define __ARM_NEON -# define __ARM_NEON_FP -# endif #endif /* do not use alignment for older visual studio versions */ # if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ From 4a6fc485fd458e7f43d8f4868767501ccf42fb5c Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 9 Mar 2023 22:16:39 +0300 Subject: [PATCH 4/8] use CGLM_SIMD_NEON instead of CGLM_NEON_FP --- include/cglm/affine-mat.h | 8 +++---- include/cglm/mat2.h | 6 +++--- include/cglm/mat4.h | 20 ++++++++--------- include/cglm/quat.h | 4 ++-- include/cglm/simd/intrin.h | 6 +++++- include/cglm/simd/neon/affine.h | 2 +- include/cglm/simd/neon/mat2.h | 2 +- include/cglm/simd/neon/mat4.h | 2 +- include/cglm/simd/neon/quat.h | 2 +- include/cglm/vec4-ext.h | 2 +- include/cglm/vec4.h | 38 ++++++++++++++++----------------- 11 files changed, 48 insertions(+), 44 deletions(-) diff --git a/include/cglm/affine-mat.h b/include/cglm/affine-mat.h index 75607e7..51b5742 100644 --- a/include/cglm/affine-mat.h +++ b/include/cglm/affine-mat.h @@ -26,7 +26,7 @@ # include "simd/avx/affine.h" #endif -#ifdef CGLM_NEON_FP +#ifdef CGLM_SIMD_NEON # include "simd/neon/affine.h" #endif @@ -53,7 +53,7 @@ glm_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mul_avx(m1, m2, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mul_sse2(m1, m2, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -109,7 +109,7 @@ void glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mul_rot_sse2(m1, m2, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mul_rot_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -158,7 +158,7 @@ void glm_inv_tr(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_inv_tr_sse2(mat); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_inv_tr_neon(mat); #else CGLM_ALIGN_MAT mat3 r; diff --git a/include/cglm/mat2.h b/include/cglm/mat2.h index 871d6bd..f76382b 100644 --- a/include/cglm/mat2.h +++ b/include/cglm/mat2.h @@ -40,7 +40,7 @@ # include "simd/sse2/mat2.h" #endif -#ifdef CGLM_NEON_FP +#ifdef CGLM_SIMD_NEON # include "simd/neon/mat2.h" #endif @@ -134,7 +134,7 @@ void glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat2_mul_sse2(m1, m2, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat2_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], @@ -224,7 +224,7 @@ void glm_mat2_scale(mat2 m, float s) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s))); #else m[0][0] = m[0][0] * s; diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index c7c8abd..b73c888 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -60,7 +60,7 @@ # include "simd/avx/mat4.h" #endif -#ifdef CGLM_NEON_FP +#ifdef CGLM_SIMD_NEON # include "simd/neon/mat4.h" #endif @@ -129,7 +129,7 @@ glm_mat4_copy(mat4 mat, mat4 dest) { glmm_store(dest[1], glmm_load(mat[1])); glmm_store(dest[2], glmm_load(mat[2])); glmm_store(dest[3], glmm_load(mat[3])); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest[0], vld1q_f32(mat[0])); vst1q_f32(dest[1], vld1q_f32(mat[1])); vst1q_f32(dest[2], vld1q_f32(mat[2])); @@ -199,7 +199,7 @@ glm_mat4_zero(mat4 mat) { glmm_store(mat[1], x0); glmm_store(mat[2], x0); glmm_store(mat[3], x0); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glmm_128 x0; x0 = vdupq_n_f32(0.0f); vst1q_f32(mat[0], x0); @@ -301,7 +301,7 @@ glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_avx(m1, m2, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mul_sse2(m1, m2, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat4_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -379,7 +379,7 @@ void glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mulv_sse2(m, v, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat4_mulv_neon(m, v, dest); #else vec4 res; @@ -499,7 +499,7 @@ void glm_mat4_transpose_to(mat4 m, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat4_transp_neon(m, dest); #else dest[0][0] = m[0][0]; dest[1][0] = m[0][1]; @@ -523,7 +523,7 @@ void glm_mat4_transpose(mat4 m) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, m); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat4_transp_neon(m, m); #else mat4 d; @@ -564,7 +564,7 @@ glm_mat4_scale(mat4 m, float s) { glm_mat4_scale_avx(m, s); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_scale_sse2(m, s); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat4_scale_neon(m, s); #else glm_mat4_scale_p(m, s); @@ -583,7 +583,7 @@ float glm_mat4_det(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glm_mat4_det_sse2(mat); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) return glm_mat4_det_neon(mat); #else /* [square] det(A) = det(At) */ @@ -618,7 +618,7 @@ void glm_mat4_inv(mat4 mat, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_inv_sse2(mat, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_mat4_inv_neon(mat, dest); #else float t[6]; diff --git a/include/cglm/quat.h b/include/cglm/quat.h index c76fa03..9488e23 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -66,7 +66,7 @@ # include "simd/sse2/quat.h" #endif -#ifdef CGLM_NEON_FP +#ifdef CGLM_SIMD_NEON # include "simd/neon/quat.h" #endif @@ -440,7 +440,7 @@ glm_quat_mul(versor p, versor q, versor dest) { */ #if defined( __SSE__ ) || defined( __SSE2__ ) glm_quat_mul_sse2(p, q, dest); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) glm_quat_mul_neon(p, q, dest); #else dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1]; diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index f4c0fe9..73fb675 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -72,6 +72,7 @@ # endif # ifndef __ARM_NEON_FP # define __ARM_NEON_FP 1 +# define CGLM_NEON_FP 1 # endif # ifndef CGLM_ARM64 # define CGLM_ARM64 1 @@ -79,9 +80,12 @@ # ifndef CGLM_SIMD_ARM # define CGLM_SIMD_ARM # endif +# ifndef CGLM_SIMD_NEON +# define CGLM_SIMD_NEON 1 +# endif #endif -#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP) +#if defined(CGLM_SIMD_x86) || defined(CGLM_SIMD_NEON) # ifndef CGLM_SIMD # define CGLM_SIMD # endif diff --git a/include/cglm/simd/neon/affine.h b/include/cglm/simd/neon/affine.h index da0a350..e55ea6f 100644 --- a/include/cglm/simd/neon/affine.h +++ b/include/cglm/simd/neon/affine.h @@ -7,7 +7,7 @@ #ifndef cglm_affine_neon_h #define cglm_affine_neon_h -#if defined(__ARM_NEON_FP) +#if defined(CGLM_SIMD_NEON) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/mat2.h b/include/cglm/simd/neon/mat2.h index 471ebea..d73e411 100644 --- a/include/cglm/simd/neon/mat2.h +++ b/include/cglm/simd/neon/mat2.h @@ -7,7 +7,7 @@ #ifndef cglm_mat2_neon_h #define cglm_mat2_neon_h -#if defined(__ARM_NEON_FP) +#if defined(CGLM_SIMD_NEON) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 6691c4f..e9f3f8a 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -7,7 +7,7 @@ #ifndef cglm_mat4_neon_h #define cglm_mat4_neon_h -#if defined(__ARM_NEON_FP) +#if defined(CGLM_SIMD_NEON) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/quat.h b/include/cglm/simd/neon/quat.h index e5adf61..f73988d 100644 --- a/include/cglm/simd/neon/quat.h +++ b/include/cglm/simd/neon/quat.h @@ -7,7 +7,7 @@ #ifndef cglm_quat_neon_h #define cglm_quat_neon_h -#if defined(__ARM_NEON_FP) +#if defined(CGLM_SIMD_NEON) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index e4e20cb..43d7214 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -249,7 +249,7 @@ void glm_vec4_abs(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, glmm_abs(glmm_load(v))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vabsq_f32(vld1q_f32(v))); #else dest[0] = fabsf(v[0]); diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 8e95ec5..2272556 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -139,7 +139,7 @@ void glm_vec4_copy(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, glmm_load(v)); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vld1q_f32(v)); #else dest[0] = v[0]; @@ -176,7 +176,7 @@ void glm_vec4_zero(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_setzero_ps()); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(v, vdupq_n_f32(0.0f)); #else v[0] = 0.0f; @@ -196,7 +196,7 @@ void glm_vec4_one(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_set1_ps(1.0f)); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(v, vdupq_n_f32(1.0f)); #else v[0] = 1.0f; @@ -322,7 +322,7 @@ void glm_vec4_add(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vaddq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] + b[0]; @@ -344,7 +344,7 @@ void glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] + s; @@ -366,7 +366,7 @@ void glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vsubq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] - b[0]; @@ -388,7 +388,7 @@ void glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] - s; @@ -410,7 +410,7 @@ void glm_vec4_mul(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vmulq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] * b[0]; @@ -432,7 +432,7 @@ void glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] * s; @@ -516,7 +516,7 @@ glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_add_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vaddq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -544,7 +544,7 @@ glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_sub_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vsubq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -616,7 +616,7 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_max_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vmaxq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -644,7 +644,7 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_min_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vminq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -667,7 +667,7 @@ void glm_vec4_negate_to(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vnegq_f32(vld1q_f32(v))); #else dest[0] = -v[0]; @@ -748,7 +748,7 @@ float glm_vec4_distance(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) return glmm_norm(vsubq_f32(glmm_load(a), glmm_load(b))); #else return sqrtf(glm_pow2(a[0] - b[0]) @@ -770,7 +770,7 @@ float glm_vec4_distance2(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) return glmm_norm2(vsubq_f32(glmm_load(a), glmm_load(b))); #else return glm_pow2(a[0] - b[0]) @@ -792,7 +792,7 @@ void glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = glm_max(a[0], b[0]); @@ -814,7 +814,7 @@ void glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = glm_min(a[0], b[0]); @@ -837,7 +837,7 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), _mm_set1_ps(maxVal))); -#elif defined(CGLM_NEON_FP) +#elif defined(CGLM_SIMD_NEON) vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)), vdupq_n_f32(maxVal))); #else From ba993b3ea9937c4d5e6cefd193f29e7c2e85ba56 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 11 Mar 2023 14:15:30 +0300 Subject: [PATCH 5/8] arm: use intrin to set/init vec4 as @gottfriedleibniz suggests --- include/cglm/simd/arm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 676270c..01525d3 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -35,7 +35,13 @@ #define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y)) #if defined(_WIN32) && defined(_MSC_VER) -# define glmm_float32x4_init(x, y, z, w) { .n128_f32 = { x, y, z, w } } +/* # define glmm_float32x4_init(x, y, z, w) { .n128_f32 = { x, y, z, w } } */ +CGLM_INLINE +float32x4_t +glmm_float32x4_init(float x, float y, float z, float w) { + CGLM_ALIGN(16) float v[4] = {x, y, z, w}; + return vld1q_f32(v); +} #else # define glmm_float32x4_init(x, y, z, w) { x, y, z, w } #endif From 13ed79a61a5d79831babeecf2570a43f407bb027 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 12 Mar 2023 16:43:47 +0300 Subject: [PATCH 6/8] arm: fix checking arm64 --- include/cglm/affine-mat.h | 8 ++--- include/cglm/mat2.h | 6 ++-- include/cglm/mat4.h | 20 ++++++------- include/cglm/quat.h | 4 +-- include/cglm/simd/arm.h | 6 ++++ include/cglm/simd/intrin.h | 52 +++++++++++++++++++++------------ include/cglm/simd/neon/affine.h | 2 +- include/cglm/simd/neon/mat2.h | 2 +- include/cglm/simd/neon/mat4.h | 2 +- include/cglm/simd/neon/quat.h | 2 +- include/cglm/vec4-ext.h | 2 +- include/cglm/vec4.h | 38 ++++++++++++------------ 12 files changed, 82 insertions(+), 62 deletions(-) diff --git a/include/cglm/affine-mat.h b/include/cglm/affine-mat.h index 51b5742..75607e7 100644 --- a/include/cglm/affine-mat.h +++ b/include/cglm/affine-mat.h @@ -26,7 +26,7 @@ # include "simd/avx/affine.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/affine.h" #endif @@ -53,7 +53,7 @@ glm_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mul_avx(m1, m2, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mul_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -109,7 +109,7 @@ void glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mul_rot_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mul_rot_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -158,7 +158,7 @@ void glm_inv_tr(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_inv_tr_sse2(mat); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_inv_tr_neon(mat); #else CGLM_ALIGN_MAT mat3 r; diff --git a/include/cglm/mat2.h b/include/cglm/mat2.h index f76382b..871d6bd 100644 --- a/include/cglm/mat2.h +++ b/include/cglm/mat2.h @@ -40,7 +40,7 @@ # include "simd/sse2/mat2.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/mat2.h" #endif @@ -134,7 +134,7 @@ void glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat2_mul_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat2_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], @@ -224,7 +224,7 @@ void glm_mat2_scale(mat2 m, float s) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s))); #else m[0][0] = m[0][0] * s; diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index b73c888..c7c8abd 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -60,7 +60,7 @@ # include "simd/avx/mat4.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/mat4.h" #endif @@ -129,7 +129,7 @@ glm_mat4_copy(mat4 mat, mat4 dest) { glmm_store(dest[1], glmm_load(mat[1])); glmm_store(dest[2], glmm_load(mat[2])); glmm_store(dest[3], glmm_load(mat[3])); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest[0], vld1q_f32(mat[0])); vst1q_f32(dest[1], vld1q_f32(mat[1])); vst1q_f32(dest[2], vld1q_f32(mat[2])); @@ -199,7 +199,7 @@ glm_mat4_zero(mat4 mat) { glmm_store(mat[1], x0); glmm_store(mat[2], x0); glmm_store(mat[3], x0); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glmm_128 x0; x0 = vdupq_n_f32(0.0f); vst1q_f32(mat[0], x0); @@ -301,7 +301,7 @@ glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_avx(m1, m2, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mul_sse2(m1, m2, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_mul_neon(m1, m2, dest); #else float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], @@ -379,7 +379,7 @@ void glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mulv_sse2(m, v, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_mulv_neon(m, v, dest); #else vec4 res; @@ -499,7 +499,7 @@ void glm_mat4_transpose_to(mat4 m, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_transp_neon(m, dest); #else dest[0][0] = m[0][0]; dest[1][0] = m[0][1]; @@ -523,7 +523,7 @@ void glm_mat4_transpose(mat4 m) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, m); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_transp_neon(m, m); #else mat4 d; @@ -564,7 +564,7 @@ glm_mat4_scale(mat4 m, float s) { glm_mat4_scale_avx(m, s); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_scale_sse2(m, s); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_scale_neon(m, s); #else glm_mat4_scale_p(m, s); @@ -583,7 +583,7 @@ float glm_mat4_det(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glm_mat4_det_sse2(mat); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) return glm_mat4_det_neon(mat); #else /* [square] det(A) = det(At) */ @@ -618,7 +618,7 @@ void glm_mat4_inv(mat4 mat, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_inv_sse2(mat, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_mat4_inv_neon(mat, dest); #else float t[6]; diff --git a/include/cglm/quat.h b/include/cglm/quat.h index 9488e23..c76fa03 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -66,7 +66,7 @@ # include "simd/sse2/quat.h" #endif -#ifdef CGLM_SIMD_NEON +#ifdef CGLM_NEON_FP # include "simd/neon/quat.h" #endif @@ -440,7 +440,7 @@ glm_quat_mul(versor p, versor q, versor dest) { */ #if defined( __SSE__ ) || defined( __SSE2__ ) glm_quat_mul_sse2(p, q, dest); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) glm_quat_mul_neon(p, q, dest); #else dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1]; diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 01525d3..8ba5494 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -10,6 +10,12 @@ #include "intrin.h" #ifdef CGLM_SIMD_ARM +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) +# define CGLM_ARM64 1 +#else +# define CGLM_ARM64 0 +#endif + #define glmm_load(p) vld1q_f32(p) #define glmm_store(p, a) vst1q_f32(p, a) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 73fb675..80ef95e 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -63,29 +63,43 @@ #endif /* ARM Neon */ -/* TODO: check _M_ARM and compiling should work if there is no ARM64 and NEON */ -#if defined(__ARM_NEON) || defined(__ARM_NEON__) \ - || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__) -# include -# ifndef __ARM_NEON -# define __ARM_NEON +#if defined(_WIN32) +/* TODO: non-ARM stuff already inported, will this be better option */ +/* # include */ + +# if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) +# include +# include +# ifndef CGLM_NEON_FP +# define CGLM_NEON_FP 1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM +# endif +# elif defined(_M_ARM) +# include +# include +# if defined(CGLM_NEON_FP) && (defined(__ARM_NEON_FP) || defined(vaddq_f32)) /* vaddq_f32 is defined as macro, we pick it */ +# define CGLM_NEON_FP 1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM +# endif # endif -# ifndef __ARM_NEON_FP -# define __ARM_NEON_FP 1 -# define CGLM_NEON_FP 1 -# endif -# ifndef CGLM_ARM64 -# define CGLM_ARM64 1 -# endif -# ifndef CGLM_SIMD_ARM -# define CGLM_SIMD_ARM -# endif -# ifndef CGLM_SIMD_NEON -# define CGLM_SIMD_NEON 1 + +#else /* non-windows */ +# if defined(__ARM_NEON) || defined(__ARM_NEON__) +# include +# if defined(__ARM_NEON_FP) +# define CGLM_NEON_FP 1 +# endif +# ifndef CGLM_SIMD_ARM +# define CGLM_SIMD_ARM +# endif # endif #endif -#if defined(CGLM_SIMD_x86) || defined(CGLM_SIMD_NEON) +#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP) # ifndef CGLM_SIMD # define CGLM_SIMD # endif diff --git a/include/cglm/simd/neon/affine.h b/include/cglm/simd/neon/affine.h index e55ea6f..b0a65a6 100644 --- a/include/cglm/simd/neon/affine.h +++ b/include/cglm/simd/neon/affine.h @@ -7,7 +7,7 @@ #ifndef cglm_affine_neon_h #define cglm_affine_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/mat2.h b/include/cglm/simd/neon/mat2.h index d73e411..7d0d9eb 100644 --- a/include/cglm/simd/neon/mat2.h +++ b/include/cglm/simd/neon/mat2.h @@ -7,7 +7,7 @@ #ifndef cglm_mat2_neon_h #define cglm_mat2_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index e9f3f8a..2d1184e 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -7,7 +7,7 @@ #ifndef cglm_mat4_neon_h #define cglm_mat4_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/simd/neon/quat.h b/include/cglm/simd/neon/quat.h index f73988d..fbaf390 100644 --- a/include/cglm/simd/neon/quat.h +++ b/include/cglm/simd/neon/quat.h @@ -7,7 +7,7 @@ #ifndef cglm_quat_neon_h #define cglm_quat_neon_h -#if defined(CGLM_SIMD_NEON) +#if defined(CGLM_NEON_FP) #include "../../common.h" #include "../intrin.h" diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index 43d7214..e4e20cb 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -249,7 +249,7 @@ void glm_vec4_abs(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, glmm_abs(glmm_load(v))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vabsq_f32(vld1q_f32(v))); #else dest[0] = fabsf(v[0]); diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 2272556..8e95ec5 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -139,7 +139,7 @@ void glm_vec4_copy(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, glmm_load(v)); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vld1q_f32(v)); #else dest[0] = v[0]; @@ -176,7 +176,7 @@ void glm_vec4_zero(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_setzero_ps()); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(0.0f)); #else v[0] = 0.0f; @@ -196,7 +196,7 @@ void glm_vec4_one(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_set1_ps(1.0f)); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(1.0f)); #else v[0] = 1.0f; @@ -322,7 +322,7 @@ void glm_vec4_add(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] + b[0]; @@ -344,7 +344,7 @@ void glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] + s; @@ -366,7 +366,7 @@ void glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vsubq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] - b[0]; @@ -388,7 +388,7 @@ void glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] - s; @@ -410,7 +410,7 @@ void glm_vec4_mul(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmulq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = a[0] * b[0]; @@ -432,7 +432,7 @@ void glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else dest[0] = v[0] * s; @@ -516,7 +516,7 @@ glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_add_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vaddq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -544,7 +544,7 @@ glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_sub_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vsubq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -616,7 +616,7 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_max_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vmaxq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -644,7 +644,7 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_min_ps(glmm_load(a), glmm_load(b)))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), vminq_f32(vld1q_f32(a), vld1q_f32(b)))); @@ -667,7 +667,7 @@ void glm_vec4_negate_to(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vnegq_f32(vld1q_f32(v))); #else dest[0] = -v[0]; @@ -748,7 +748,7 @@ float glm_vec4_distance(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) return glmm_norm(vsubq_f32(glmm_load(a), glmm_load(b))); #else return sqrtf(glm_pow2(a[0] - b[0]) @@ -770,7 +770,7 @@ float glm_vec4_distance2(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) return glmm_norm2(vsubq_f32(glmm_load(a), glmm_load(b))); #else return glm_pow2(a[0] - b[0]) @@ -792,7 +792,7 @@ void glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = glm_max(a[0], b[0]); @@ -814,7 +814,7 @@ void glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b))); #else dest[0] = glm_min(a[0], b[0]); @@ -837,7 +837,7 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), _mm_set1_ps(maxVal))); -#elif defined(CGLM_SIMD_NEON) +#elif defined(CGLM_NEON_FP) vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)), vdupq_n_f32(maxVal))); #else From e276b5b4059266fea9911a4694c64eee8fc31515 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 14 Mar 2023 09:54:32 +0300 Subject: [PATCH 7/8] Update intrin.h --- include/cglm/simd/intrin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 80ef95e..bfdc94e 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -79,7 +79,7 @@ # elif defined(_M_ARM) # include # include -# if defined(CGLM_NEON_FP) && (defined(__ARM_NEON_FP) || defined(vaddq_f32)) /* vaddq_f32 is defined as macro, we pick it */ +# ifndef CGLM_NEON_FP # define CGLM_NEON_FP 1 # endif # ifndef CGLM_SIMD_ARM @@ -99,7 +99,7 @@ # endif #endif -#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP) +#if defined(CGLM_SIMD_x86) || defined(CGLM_SIMD_ARM) # ifndef CGLM_SIMD # define CGLM_SIMD # endif From 77b4c5cffbe6db7c52ec35b46d726af72d65ff88 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 16 Mar 2023 13:16:24 +0300 Subject: [PATCH 8/8] reset visual studio tool verisons --- win/cglm-test.vcxproj | 20 ++++++++++---------- win/cglm.sln | 4 ++-- win/cglm.vcxproj | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/win/cglm-test.vcxproj b/win/cglm-test.vcxproj index bdcdf8a..97a3eda 100644 --- a/win/cglm-test.vcxproj +++ b/win/cglm-test.vcxproj @@ -94,65 +94,65 @@ Application true - v143 + v142 Unicode Application false - v143 + v142 true Unicode Application true - v143 + v142 Unicode Application true - v143 + v142 Unicode Application true - v143 + v142 Unicode Application true - v143 + v142 Unicode Application false - v143 + v142 true Unicode Application false - v143 + v142 true Unicode Application false - v143 + v142 true Unicode Application false - v143 + v142 true Unicode diff --git a/win/cglm.sln b/win/cglm.sln index b34e272..22f929b 100644 --- a/win/cglm.sln +++ b/win/cglm.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.6.33417.168 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29123.88 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cglm", "cglm.vcxproj", "{CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC}" EndProject diff --git a/win/cglm.vcxproj b/win/cglm.vcxproj index fea6218..a0a9282 100644 --- a/win/cglm.vcxproj +++ b/win/cglm.vcxproj @@ -237,7 +237,7 @@ {CA8BCAF9-CD25-4133-8F62-3D1449B5D2FC} Win32Proj cglm - 10.0 + 10.0.17763.0