From 3a9e4df3932c26013bbda901db10f2d203712b28 Mon Sep 17 00:00:00 2001
From: myfreeer
Date: Sun, 2 Apr 2023 12:39:20 +0800
Subject: [PATCH] simd128: handle both sse2 and simd128 enabled by Emscripten

https://github.com/recp/cglm/pull/286#issuecomment-1492985403
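
When Emscripten is given both -msse2 and -msimd128 it defines __SSE2__
and __wasm_simd128__ at the same time, so the old guard order selected
the x86 intrinsic branch (which Emscripten emulates on top of simd128)
instead of the native wasm branch. Every dispatch chain below therefore
tests the wasm guard first. A minimal sketch of the resulting order
(illustrative only; dispatch() is not a cglm function):

    #include <stdio.h>

    /* guard ordering applied throughout this patch */
    static void dispatch(void) {
    #if defined(__wasm__) && defined(__wasm_simd128__)
      puts("wasm simd128");  /* native SIMD on wasm targets */
    #elif defined(__AVX__)
      puts("avx");
    #elif defined( __SSE__ ) || defined( __SSE2__ )
      puts("sse/sse2");      /* emulated via simd128 under Emscripten */
    #else
      puts("scalar");
    #endif
    }

The simd/intrin.h hunk applies the same rule at the header level: x86.h
is skipped whenever CGLM_SIMD_WASM is defined.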
---
 include/cglm/affine-mat.h  |  18 ++--
 include/cglm/mat2.h        |  19 +++--
 include/cglm/mat3.h        |   6 +-
 include/cglm/mat4.h        |  68 +++++++--------
 include/cglm/quat.h        |  36 ++++----
 include/cglm/simd/intrin.h |   2 +-
 include/cglm/vec4-ext.h    |  26 +++---
 include/cglm/vec4.h        | 166 ++++++++++++++++++-------------------
 8 files changed, 171 insertions(+), 170 deletions(-)

diff --git a/include/cglm/affine-mat.h b/include/cglm/affine-mat.h
index 5107efc..1cd4973 100644
--- a/include/cglm/affine-mat.h
+++ b/include/cglm/affine-mat.h
@@ -53,12 +53,12 @@
 CGLM_INLINE
 void
 glm_mul(mat4 m1, mat4 m2, mat4 dest) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glm_mul_wasm(m1, m2, dest);
+#elif defined(__AVX__)
   glm_mul_avx(m1, m2, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glm_mul_wasm(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mul_neon(m1, m2, dest);
 #else
@@ -113,10 +113,10 @@
 CGLM_INLINE
 void
 glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mul_rot_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mul_rot_wasm(m1, m2, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mul_rot_sse2(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mul_rot_neon(m1, m2, dest);
 #else
@@ -164,10 +164,10 @@ glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) {
 CGLM_INLINE
 void
 glm_inv_tr(mat4 mat) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_inv_tr_sse2(mat);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_inv_tr_wasm(mat);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_inv_tr_sse2(mat);
 #elif defined(CGLM_NEON_FP)
   glm_inv_tr_neon(mat);
 #else
diff --git a/include/cglm/mat2.h b/include/cglm/mat2.h
index dc71185..918e965 100644
--- a/include/cglm/mat2.h
+++ b/include/cglm/mat2.h
@@ -136,10 +136,10 @@ glm_mat2_zero(mat2 mat) {
 CGLM_INLINE
 void
 glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat2_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat2_mul_wasm(m1, m2, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat2_mul_sse2(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat2_mul_neon(m1, m2, dest);
 #else
@@ -166,10 +166,10 @@ glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) {
 CGLM_INLINE
 void
 glm_mat2_transpose_to(mat2 m, mat2 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat2_transp_sse2(m, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat2_transp_wasm(m, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat2_transp_sse2(m, dest);
 #else
   dest[0][0] = m[0][0];
   dest[0][1] = m[1][0];
@@ -230,10 +230,11 @@ glm_mat2_trace(mat2 m) {
 CGLM_INLINE
 void
 glm_mat2_scale(mat2 m, float s) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]),
+                                  wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]), wasm_f32x4_splat(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s)));
 #else
diff --git a/include/cglm/mat3.h b/include/cglm/mat3.h
index cbfab30..69f0454 100644
--- a/include/cglm/mat3.h
+++ b/include/cglm/mat3.h
@@ -152,10 +152,10 @@ glm_mat3_zero(mat3 mat) {
 CGLM_INLINE
 void
 glm_mat3_mul(mat3 m1, mat3 m2, mat3 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat3_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat3_mul_wasm(m1, m2, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat3_mul_sse2(m1, m2, dest);
 #else
   float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2],
         a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2],
diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index 5eb2879..c75dfa5 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -125,15 +125,15 @@ glm_mat4_ucopy(mat4 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_copy(mat4 mat, mat4 dest) {
-#ifdef __AVX__
-  glmm_store256(dest[0], glmm_load256(mat[0]));
-  glmm_store256(dest[2], glmm_load256(mat[2]));
-#elif defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest[0], glmm_load(mat[0]));
   glmm_store(dest[1], glmm_load(mat[1]));
   glmm_store(dest[2], glmm_load(mat[2]));
   glmm_store(dest[3], glmm_load(mat[3]));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined(__AVX__)
+  glmm_store256(dest[0], glmm_load256(mat[0]));
+  glmm_store256(dest[2], glmm_load256(mat[2]));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest[0], glmm_load(mat[0]));
   glmm_store(dest[1], glmm_load(mat[1]));
   glmm_store(dest[2], glmm_load(mat[2]));
@@ -196,7 +196,14 @@ glm_mat4_identity_array(mat4 * __restrict mat, size_t count) {
 CGLM_INLINE
 void
 glm_mat4_zero(mat4 mat) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glmm_128 x0;
+  x0 = wasm_f32x4_const(0.f, 0.f, 0.f, 0.f);
+  glmm_store(mat[0], x0);
+  glmm_store(mat[1], x0);
+  glmm_store(mat[2], x0);
+  glmm_store(mat[3], x0);
+#elif defined(__AVX__)
   __m256 y0;
   y0 = _mm256_setzero_ps();
   glmm_store256(mat[0], y0);
@@ -208,13 +215,6 @@ glm_mat4_zero(mat4 mat) {
   glmm_store(mat[1], x0);
   glmm_store(mat[2], x0);
   glmm_store(mat[3], x0);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_128 x0;
-  x0 = wasm_f32x4_const(0.f, 0.f, 0.f, 0.f);
-  glmm_store(mat[0], x0);
-  glmm_store(mat[1], x0);
-  glmm_store(mat[2], x0);
-  glmm_store(mat[3], x0);
 #elif defined(CGLM_NEON_FP)
   glmm_128 x0;
   x0 = vdupq_n_f32(0.0f);
@@ -313,12 +313,12 @@ glm_mat4_ins3(mat3 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glm_mat4_mul_wasm(m1, m2, dest);
+#elif defined(__AVX__)
   glm_mat4_mul_avx(m1, m2, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glm_mat4_mul_wasm(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_mul_neon(m1, m2, dest);
 #else
@@ -395,10 +395,10 @@ glm_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_mulv_sse2(m, v, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_mulv_wasm(m, v, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_mulv_sse2(m, v, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_mulv_neon(m, v, dest);
 #else
@@ -517,10 +517,10 @@ glm_mat4_mulv3(mat4 m, vec3 v, float last, vec3 dest) {
 CGLM_INLINE
 void
 glm_mat4_transpose_to(mat4 m, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_transp_sse2(m, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_transp_wasm(m, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_transp_sse2(m, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_transp_neon(m, dest);
 #else
@@ -543,10 +543,10 @@ glm_mat4_transpose_to(mat4 m, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_transpose(mat4 m) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_transp_sse2(m, m);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_transp_wasm(m, m);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_transp_sse2(m, m);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_transp_neon(m, m);
 #else
@@ -584,12 +584,12 @@ glm_mat4_scale_p(mat4 m, float s) {
 CGLM_INLINE
 void
 glm_mat4_scale(mat4 m, float s) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glm_mat4_scale_wasm(m, s);
+#elif defined(__AVX__)
   glm_mat4_scale_avx(m, s);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_scale_sse2(m, s);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glm_mat4_scale_wasm(m, s);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_scale_neon(m, s);
 #else
@@ -607,10 +607,10 @@ glm_mat4_scale(mat4 m, float s) {
 CGLM_INLINE
 float
 glm_mat4_det(mat4 mat) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  return glm_mat4_det_sse2(mat);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glm_mat4_det_wasm(mat);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  return glm_mat4_det_sse2(mat);
 #elif defined(CGLM_NEON_FP)
   return glm_mat4_det_neon(mat);
 #else
@@ -707,10 +707,10 @@ glm_mat4_inv(mat4 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_inv_fast(mat4 mat, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_inv_fast_sse2(mat, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_inv_fast_wasm(mat, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_inv_fast_sse2(mat, dest);
 #else
   glm_mat4_inv(mat, dest);
 #endif
diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index 5ce27ff..bc52a38 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -242,21 +242,7 @@ glm_quat_norm(versor q) {
 CGLM_INLINE
 void
 glm_quat_normalize_to(versor q, versor dest) {
-#if defined( __SSE2__ ) || defined( __SSE2__ )
-  __m128 xdot, x0;
-  float  dot;
-
-  x0   = glmm_load(q);
-  xdot = glmm_vdot(x0, x0);
-  dot  = _mm_cvtss_f32(xdot);
-
-  if (dot <= 0.0f) {
-    glm_quat_identity(dest);
-    return;
-  }
-
-  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_128 xdot, x0;
   float    dot;
 
@@ -271,6 +257,20 @@ glm_quat_normalize_to(versor q, versor dest) {
   }
 
   glmm_store(dest, wasm_f32x4_div(x0, wasm_f32x4_sqrt(xdot)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float  dot;
+
+  x0   = glmm_load(q);
+  xdot = glmm_vdot(x0, x0);
+  dot  = _mm_cvtss_f32(xdot);
+
+  if (dot <= 0.0f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
 #else
   float dot;
 
@@ -457,10 +457,10 @@ glm_quat_mul(versor p, versor q, versor dest) {
      + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
        a1 a2 − b1 b2 − c1 c2 − d1 d2
    */
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_quat_mul_sse2(p, q, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_quat_mul_wasm(p, q, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_quat_mul_sse2(p, q, dest);
 #elif defined(CGLM_NEON_FP)
   glm_quat_mul_neon(p, q, dest);
 #else
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index 17998f5..bf8d119 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -113,7 +113,7 @@
 #  endif
 #endif
 
-#if defined(CGLM_SIMD_x86)
+#if defined(CGLM_SIMD_x86) && !defined(CGLM_SIMD_WASM)
 #  include "x86.h"
 #endif
diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h
index 06b6210..cccbb72 100644
--- a/include/cglm/vec4-ext.h
+++ b/include/cglm/vec4-ext.h
@@ -45,10 +45,10 @@
 CGLM_INLINE
 void
 glm_vec4_broadcast(float val, vec4 d) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(d, _mm_set1_ps(val));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(d, wasm_f32x4_splat(val));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(d, _mm_set1_ps(val));
 #else
   d[0] = d[1] = d[2] = d[3] = val;
 #endif
@@ -63,10 +63,10 @@ glm_vec4_broadcast(float val, vec4 d) {
 CGLM_INLINE
 void
 glm_vec4_fill(vec4 v, float val) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(val));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_splat(val));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_set1_ps(val));
 #else
   v[0] = v[1] = v[2] = v[3] = val;
 #endif
@@ -251,9 +251,9 @@ glm_vec4_sign(vec4 v, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_abs(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, glmm_abs(glmm_load(v)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest, glmm_abs(glmm_load(v)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vabsq_f32(vld1q_f32(v)));
 #else
@@ -290,9 +290,9 @@ glm_vec4_fract(vec4 v, vec4 dest) {
 CGLM_INLINE
 float
 glm_vec4_hadd(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glmm_hadd(glmm_load(v));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   return glmm_hadd(glmm_load(v));
 #else
   return v[0] + v[1] + v[2] + v[3];
@@ -308,10 +308,10 @@ glm_vec4_hadd(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_sqrt(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sqrt_ps(glmm_load(v)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sqrt(glmm_load(v)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sqrt_ps(glmm_load(v)));
 #else
   dest[0] = sqrtf(v[0]);
   dest[1] = sqrtf(v[1]);
diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h
index d6ee080..1d18625 100644
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -137,9 +137,9 @@ glm_vec4_copy3(vec4 a, vec3 dest) {
 CGLM_INLINE
 void
 glm_vec4_copy(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, glmm_load(v));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest, glmm_load(v));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vld1q_f32(v));
@@ -181,10 +181,10 @@ glm_vec4_ucopy(vec4 v, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_zero(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_setzero_ps());
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_const_splat(0.f));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_setzero_ps());
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vdupq_n_f32(0.0f));
 #else
@@ -203,10 +203,10 @@ glm_vec4_zero(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_one(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(1.0f));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_const_splat(1.0f));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_set1_ps(1.0f));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vdupq_n_f32(1.0f));
 #else
@@ -331,10 +331,10 @@ glm_vec4_norm_inf(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_add(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -355,10 +355,10 @@ glm_vec4_add(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_adds(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -379,10 +379,10 @@ glm_vec4_adds(vec4 v, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_sub(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sub(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vsubq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -403,10 +403,10 @@ glm_vec4_sub(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_subs(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sub(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -427,10 +427,10 @@ glm_vec4_subs(vec4 v, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_mul(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_mul(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmulq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -451,10 +451,10 @@ glm_vec4_mul(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_scale(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_mul(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -516,10 +516,10 @@ glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_divs(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_div(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
 #else
   glm_vec4_scale(v, 1.0f / s, dest);
 #endif
@@ -537,14 +537,14 @@ glm_vec4_divs(vec4 v, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_add_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
                      glmm_load(dest),
                      wasm_f32x4_add(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_add_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vaddq_f32(vld1q_f32(a),
@@ -569,14 +569,14 @@ glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_sub_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
                      glmm_load(dest),
                      wasm_f32x4_sub(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_sub_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vsubq_f32(vld1q_f32(a),
@@ -645,14 +645,14 @@ glm_vec4_muladds(vec4 a, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_max_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
                      glmm_load(dest),
                      wasm_f32x4_max(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_max_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vmaxq_f32(vld1q_f32(a),
@@ -677,14 +677,14 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_min_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
                      glmm_load(dest),
                      wasm_f32x4_min(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_min_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vminq_f32(vld1q_f32(a),
@@ -706,11 +706,11 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_negate_to(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_v128_xor(glmm_load(v),
                                  wasm_f32x4_const_splat(-0.0f)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vnegq_f32(vld1q_f32(v)));
 #else
@@ -741,21 +741,7 @@ glm_vec4_negate(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_normalize_to(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  __m128 xdot, x0;
-  float  dot;
-
-  x0   = glmm_load(v);
-  xdot = glmm_vdot(x0, x0);
-  dot  = _mm_cvtss_f32(xdot);
-
-  if (dot == 0.0f) {
-    glmm_store(dest, _mm_setzero_ps());
-    return;
-  }
-
-  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_128 xdot, x0;
   float    dot;
 
@@ -770,6 +756,20 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
   }
 
   glmm_store(dest, wasm_f32x4_div(x0, wasm_f32x4_sqrt(xdot)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float  dot;
+
+  x0   = glmm_load(v);
+  xdot = glmm_vdot(x0, x0);
+  dot  = _mm_cvtss_f32(xdot);
+
+  if (dot == 0.0f) {
+    glmm_store(dest, _mm_setzero_ps());
+    return;
+  }
+
+  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
 #else
   float norm;
 
@@ -805,10 +805,10 @@ glm_vec4_normalize(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_distance(vec4 a, vec4 b) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glmm_norm(wasm_f32x4_sub(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   return glmm_norm(vsubq_f32(glmm_load(a), glmm_load(b)));
 #else
@@ -829,10 +829,10 @@ glm_vec4_distance(vec4 a, vec4 b) {
 CGLM_INLINE
 float
 glm_vec4_distance2(vec4 a, vec4 b) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glmm_norm2(wasm_f32x4_sub(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   return glmm_norm2(vsubq_f32(glmm_load(a), glmm_load(b)));
 #else
@@ -853,10 +853,10 @@ glm_vec4_distance2(vec4 a, vec4 b) {
 CGLM_INLINE
 void
 glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_max(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -877,10 +877,10 @@ glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_minv(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_min(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -901,13 +901,13 @@ glm_vec4_minv(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
-                           _mm_set1_ps(maxVal)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_min(
                   wasm_f32x4_max(glmm_load(v), wasm_f32x4_splat(minVal)),
                   wasm_f32x4_splat(maxVal)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
+                           _mm_set1_ps(maxVal)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)),
                          vdupq_n_f32(maxVal)));