simd128: handle both sse2 and simd128 enabled by Emscripten

https://github.com/recp/cglm/pull/286#issuecomment-1492985403
Author: myfreeer
Date:   2023-04-02 12:39:20 +08:00
Parent: 7ca9a64ecf
Commit: 3a9e4df393
8 changed files with 171 additions and 170 deletions
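Background for the change: Emscripten emulates the x86 SSE intrinsics on top of WASM SIMD, so a build with both `-msse2` and `-msimd128` defines `__SSE2__` and `__wasm_simd128__` at the same time. cglm's dispatch chains tested the x86 macros first, which routed such builds onto the emulated SSE paths instead of the native wasm_f32x4_* intrinsics; this commit reorders every chain so the wasm check wins. A minimal standalone sketch of the effect (not taken from the diff; the file name is invented):

/* simd_dispatch_demo.c - which path does a dispatch chain pick?
 * Under `emcc -msse2 -msimd128` both __SSE2__ and __wasm_simd128__
 * are defined; testing the wasm macros first selects the native path. */
#include <stdio.h>

int main(void) {
#if defined(__wasm__) && defined(__wasm_simd128__)
  puts("wasm simd128 path");
#elif defined(__SSE__) || defined(__SSE2__)
  puts("x86 SSE path (emulated under Emscripten)");
#else
  puts("scalar fallback");
#endif
  return 0;
}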

View File

@@ -53,12 +53,12 @@
 CGLM_INLINE
 void
 glm_mul(mat4 m1, mat4 m2, mat4 dest) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glm_mul_wasm(m1, m2, dest);
+#elif defined(__AVX__)
   glm_mul_avx(m1, m2, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glm_mul_wasm(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mul_neon(m1, m2, dest);
 #else
@@ -113,10 +113,10 @@ glm_mul(mat4 m1, mat4 m2, mat4 dest) {
 CGLM_INLINE
 void
 glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mul_rot_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mul_rot_wasm(m1, m2, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mul_rot_sse2(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mul_rot_neon(m1, m2, dest);
 #else
@@ -164,10 +164,10 @@ glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) {
 CGLM_INLINE
 void
 glm_inv_tr(mat4 mat) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_inv_tr_sse2(mat);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_inv_tr_wasm(mat);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_inv_tr_sse2(mat);
 #elif defined(CGLM_NEON_FP)
   glm_inv_tr_neon(mat);
 #else

View File

@@ -136,10 +136,10 @@ glm_mat2_zero(mat2 mat) {
 CGLM_INLINE
 void
 glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat2_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat2_mul_wasm(m1, m2, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat2_mul_sse2(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat2_mul_neon(m1, m2, dest);
 #else
@@ -166,10 +166,10 @@ glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) {
 CGLM_INLINE
 void
 glm_mat2_transpose_to(mat2 m, mat2 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat2_transp_sse2(m, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat2_transp_wasm(m, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat2_transp_sse2(m, dest);
 #else
   dest[0][0] = m[0][0];
   dest[0][1] = m[1][0];
@@ -230,10 +230,11 @@ glm_mat2_trace(mat2 m) {
 CGLM_INLINE
 void
 glm_mat2_scale(mat2 m, float s) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]),
+                                  wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]), wasm_f32x4_splat(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s)));
 #else

View File

@@ -152,10 +152,10 @@ glm_mat3_zero(mat3 mat) {
 CGLM_INLINE
 void
 glm_mat3_mul(mat3 m1, mat3 m2, mat3 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat3_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat3_mul_wasm(m1, m2, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat3_mul_sse2(m1, m2, dest);
 #else
   float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2],
         a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2],

View File

@@ -125,15 +125,15 @@ glm_mat4_ucopy(mat4 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_copy(mat4 mat, mat4 dest) {
-#ifdef __AVX__
-  glmm_store256(dest[0], glmm_load256(mat[0]));
-  glmm_store256(dest[2], glmm_load256(mat[2]));
-#elif defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest[0], glmm_load(mat[0]));
   glmm_store(dest[1], glmm_load(mat[1]));
   glmm_store(dest[2], glmm_load(mat[2]));
   glmm_store(dest[3], glmm_load(mat[3]));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined(__AVX__)
+  glmm_store256(dest[0], glmm_load256(mat[0]));
+  glmm_store256(dest[2], glmm_load256(mat[2]));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest[0], glmm_load(mat[0]));
   glmm_store(dest[1], glmm_load(mat[1]));
   glmm_store(dest[2], glmm_load(mat[2]));
@@ -196,7 +196,14 @@ glm_mat4_identity_array(mat4 * __restrict mat, size_t count) {
 CGLM_INLINE
 void
 glm_mat4_zero(mat4 mat) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glmm_128 x0;
+  x0 = wasm_f32x4_const(0.f, 0.f, 0.f, 0.f);
+  glmm_store(mat[0], x0);
+  glmm_store(mat[1], x0);
+  glmm_store(mat[2], x0);
+  glmm_store(mat[3], x0);
+#elif defined(__AVX__)
   __m256 y0;
   y0 = _mm256_setzero_ps();
   glmm_store256(mat[0], y0);
@@ -208,13 +215,6 @@ glm_mat4_zero(mat4 mat) {
   glmm_store(mat[1], x0);
   glmm_store(mat[2], x0);
   glmm_store(mat[3], x0);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_128 x0;
-  x0 = wasm_f32x4_const(0.f, 0.f, 0.f, 0.f);
-  glmm_store(mat[0], x0);
-  glmm_store(mat[1], x0);
-  glmm_store(mat[2], x0);
-  glmm_store(mat[3], x0);
 #elif defined(CGLM_NEON_FP)
   glmm_128 x0;
   x0 = vdupq_n_f32(0.0f);
@@ -313,12 +313,12 @@ glm_mat4_ins3(mat3 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glm_mat4_mul_wasm(m1, m2, dest);
+#elif defined(__AVX__)
   glm_mat4_mul_avx(m1, m2, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_mul_sse2(m1, m2, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glm_mat4_mul_wasm(m1, m2, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_mul_neon(m1, m2, dest);
 #else
@@ -395,10 +395,10 @@ glm_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_mulv_sse2(m, v, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_mulv_wasm(m, v, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_mulv_sse2(m, v, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_mulv_neon(m, v, dest);
 #else
@@ -517,10 +517,10 @@ glm_mat4_mulv3(mat4 m, vec3 v, float last, vec3 dest) {
 CGLM_INLINE
 void
 glm_mat4_transpose_to(mat4 m, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_transp_sse2(m, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_transp_wasm(m, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_transp_sse2(m, dest);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_transp_neon(m, dest);
 #else
@@ -543,10 +543,10 @@ glm_mat4_transpose_to(mat4 m, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_transpose(mat4 m) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_transp_sse2(m, m);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_transp_wasm(m, m);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_transp_sse2(m, m);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_transp_neon(m, m);
 #else
@@ -584,12 +584,12 @@ glm_mat4_scale_p(mat4 m, float s) {
 CGLM_INLINE
 void
 glm_mat4_scale(mat4 m, float s) {
-#ifdef __AVX__
+#if defined(__wasm__) && defined(__wasm_simd128__)
+  glm_mat4_scale_wasm(m, s);
+#elif defined(__AVX__)
   glm_mat4_scale_avx(m, s);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_scale_sse2(m, s);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
-  glm_mat4_scale_wasm(m, s);
 #elif defined(CGLM_NEON_FP)
   glm_mat4_scale_neon(m, s);
 #else
@@ -607,10 +607,10 @@ glm_mat4_scale(mat4 m, float s) {
 CGLM_INLINE
 float
 glm_mat4_det(mat4 mat) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  return glm_mat4_det_sse2(mat);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glm_mat4_det_wasm(mat);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  return glm_mat4_det_sse2(mat);
 #elif defined(CGLM_NEON_FP)
   return glm_mat4_det_neon(mat);
 #else
@@ -707,10 +707,10 @@ glm_mat4_inv(mat4 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_inv_fast(mat4 mat, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_mat4_inv_fast_sse2(mat, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_inv_fast_wasm(mat, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_inv_fast_sse2(mat, dest);
 #else
   glm_mat4_inv(mat, dest);
 #endif

View File

@@ -242,21 +242,7 @@ glm_quat_norm(versor q) {
 CGLM_INLINE
 void
 glm_quat_normalize_to(versor q, versor dest) {
-#if defined( __SSE2__ ) || defined( __SSE2__ )
-  __m128 xdot, x0;
-  float dot;
-
-  x0 = glmm_load(q);
-  xdot = glmm_vdot(x0, x0);
-  dot = _mm_cvtss_f32(xdot);
-
-  if (dot <= 0.0f) {
-    glm_quat_identity(dest);
-    return;
-  }
-
-  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_128 xdot, x0;
   float dot;
 
@@ -271,6 +257,20 @@ glm_quat_normalize_to(versor q, versor dest) {
   }
 
   glmm_store(dest, wasm_f32x4_div(x0, wasm_f32x4_sqrt(xdot)));
+#elif defined( __SSE2__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float dot;
+
+  x0 = glmm_load(q);
+  xdot = glmm_vdot(x0, x0);
+  dot = _mm_cvtss_f32(xdot);
+
+  if (dot <= 0.0f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
 #else
   float dot;
 
@@ -457,10 +457,10 @@ glm_quat_mul(versor p, versor q, versor dest) {
    + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
      a1 a2 − b1 b2 − c1 c2 − d1 d2
   */
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_quat_mul_sse2(p, q, dest);
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glm_quat_mul_wasm(p, q, dest);
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glm_quat_mul_sse2(p, q, dest);
 #elif defined(CGLM_NEON_FP)
   glm_quat_mul_neon(p, q, dest);
 #else

View File

@@ -113,7 +113,7 @@
 # endif
 #endif
 
-#if defined(CGLM_SIMD_x86)
+#if defined(CGLM_SIMD_x86) && !defined(CGLM_SIMD_WASM)
 #  include "x86.h"
 #endif
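
The intrin.h guard above is the companion change: under Emscripten the SSE-emulation macros can switch CGLM_SIMD_x86 on even though the build targets wasm, so x86.h must be skipped whenever the WASM SIMD backend is active. A hedged sketch of the interplay; only the two macro names and the final #if line come from the diff, the detection lines are assumed:

/* Assumed detection logic, for illustration only. */
#if defined(__wasm__) && defined(__wasm_simd128__)
#  define CGLM_SIMD_WASM
#endif
#if defined(__SSE__) || defined(__SSE2__) || defined(__AVX__)
#  define CGLM_SIMD_x86
#endif

/* From the diff: with `emcc -msse2 -msimd128` both macros end up
 * defined, so the x86 intrinsics header is pulled in only when the
 * WASM SIMD backend is not in use. */
#if defined(CGLM_SIMD_x86) && !defined(CGLM_SIMD_WASM)
#  include "x86.h"
#endif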

View File

@@ -45,10 +45,10 @@
 CGLM_INLINE
 void
 glm_vec4_broadcast(float val, vec4 d) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(d, _mm_set1_ps(val));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(d, wasm_f32x4_splat(val));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(d, _mm_set1_ps(val));
 #else
   d[0] = d[1] = d[2] = d[3] = val;
 #endif
@@ -63,10 +63,10 @@ glm_vec4_broadcast(float val, vec4 d) {
 CGLM_INLINE
 void
 glm_vec4_fill(vec4 v, float val) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(val));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_splat(val));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_set1_ps(val));
 #else
   v[0] = v[1] = v[2] = v[3] = val;
 #endif
@@ -251,9 +251,9 @@ glm_vec4_sign(vec4 v, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_abs(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, glmm_abs(glmm_load(v)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest, glmm_abs(glmm_load(v)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vabsq_f32(vld1q_f32(v)));
@@ -290,9 +290,9 @@ glm_vec4_fract(vec4 v, vec4 dest) {
 CGLM_INLINE
 float
 glm_vec4_hadd(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glmm_hadd(glmm_load(v));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   return glmm_hadd(glmm_load(v));
 #else
   return v[0] + v[1] + v[2] + v[3];
@@ -308,10 +308,10 @@ glm_vec4_hadd(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_sqrt(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sqrt_ps(glmm_load(v)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sqrt(glmm_load(v)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sqrt_ps(glmm_load(v)));
 #else
   dest[0] = sqrtf(v[0]);
   dest[1] = sqrtf(v[1]);

View File

@@ -137,9 +137,9 @@ glm_vec4_copy3(vec4 a, vec3 dest) {
 CGLM_INLINE
 void
 glm_vec4_copy(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, glmm_load(v));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest, glmm_load(v));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vld1q_f32(v));
@@ -181,10 +181,10 @@ glm_vec4_ucopy(vec4 v, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_zero(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_setzero_ps());
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_const_splat(0.f));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_setzero_ps());
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vdupq_n_f32(0.0f));
 #else
@@ -203,10 +203,10 @@ glm_vec4_zero(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_one(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(1.0f));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_const_splat(1.0f));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_set1_ps(1.0f));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vdupq_n_f32(1.0f));
 #else
@@ -331,10 +331,10 @@ glm_vec4_norm_inf(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_add(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -355,10 +355,10 @@ glm_vec4_add(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_adds(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -379,10 +379,10 @@ glm_vec4_adds(vec4 v, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_sub(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sub(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vsubq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -403,10 +403,10 @@ glm_vec4_sub(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_subs(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sub(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -427,10 +427,10 @@ glm_vec4_subs(vec4 v, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_mul(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_mul(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_mul_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmulq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -451,10 +451,10 @@ glm_vec4_mul(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_scale(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_mul(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -516,10 +516,10 @@ glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_divs(vec4 v, float s, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_div(glmm_load(v), wasm_f32x4_splat(s)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
 #else
   glm_vec4_scale(v, 1.0f / s, dest);
 #endif
@@ -537,14 +537,14 @@ glm_vec4_divs(vec4 v, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_add_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
     glmm_load(dest),
     wasm_f32x4_add(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_add_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vaddq_f32(vld1q_f32(a),
@@ -569,14 +569,14 @@ glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_sub_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
     glmm_load(dest),
     wasm_f32x4_sub(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_sub_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vsubq_f32(vld1q_f32(a),
@@ -645,14 +645,14 @@ glm_vec4_muladds(vec4 a, float s, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_max_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
     glmm_load(dest),
     wasm_f32x4_max(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_max_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vmaxq_f32(vld1q_f32(a),
@@ -677,14 +677,14 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(dest),
-                              _mm_min_ps(glmm_load(a),
-                                         glmm_load(b))));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(
     glmm_load(dest),
     wasm_f32x4_min(glmm_load(a), glmm_load(b))));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_min_ps(glmm_load(a),
+                                         glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vminq_f32(vld1q_f32(a),
@@ -706,11 +706,11 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_negate_to(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_v128_xor(glmm_load(v),
                                  wasm_f32x4_const_splat(-0.0f)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vnegq_f32(vld1q_f32(v)));
 #else
@@ -741,21 +741,7 @@ glm_vec4_negate(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_normalize_to(vec4 v, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  __m128 xdot, x0;
-  float dot;
-
-  x0 = glmm_load(v);
-  xdot = glmm_vdot(x0, x0);
-  dot = _mm_cvtss_f32(xdot);
-
-  if (dot == 0.0f) {
-    glmm_store(dest, _mm_setzero_ps());
-    return;
-  }
-
-  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_128 xdot, x0;
   float dot;
 
@@ -770,6 +756,20 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
   }
 
   glmm_store(dest, wasm_f32x4_div(x0, wasm_f32x4_sqrt(xdot)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float dot;
+
+  x0 = glmm_load(v);
+  xdot = glmm_vdot(x0, x0);
+  dot = _mm_cvtss_f32(xdot);
+
+  if (dot == 0.0f) {
+    glmm_store(dest, _mm_setzero_ps());
+    return;
+  }
+
+  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
 #else
   float norm;
 
@@ -805,10 +805,10 @@ glm_vec4_normalize(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_distance(vec4 a, vec4 b) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glmm_norm(wasm_f32x4_sub(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  return glmm_norm(_mm_sub_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   return glmm_norm(vsubq_f32(glmm_load(a), glmm_load(b)));
 #else
@@ -829,10 +829,10 @@ glm_vec4_distance(vec4 a, vec4 b) {
 CGLM_INLINE
 float
 glm_vec4_distance2(vec4 a, vec4 b) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   return glmm_norm2(wasm_f32x4_sub(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  return glmm_norm2(_mm_sub_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   return glmm_norm2(vsubq_f32(glmm_load(a), glmm_load(b)));
 #else
@@ -853,10 +853,10 @@ glm_vec4_distance2(vec4 a, vec4 b) {
 CGLM_INLINE
 void
 glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_max(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -877,10 +877,10 @@ glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_minv(vec4 a, vec4 b, vec4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_min(glmm_load(a), glmm_load(b)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b)));
 #else
@@ -901,13 +901,13 @@ glm_vec4_minv(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
-                           _mm_set1_ps(maxVal)));
-#elif defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_min(
     wasm_f32x4_max(glmm_load(v), wasm_f32x4_splat(minVal)),
    wasm_f32x4_splat(maxVal)));
+#elif defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
+                           _mm_set1_ps(maxVal)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)),
                          vdupq_n_f32(maxVal)));