From 252bf925fc4ecd9cd939992f3651df15fc84e0c3 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 15:25:23 +0300 Subject: [PATCH] simd, sse2: make alignment optional for load operations --- include/cglm/affine.h | 34 ++++++++--------- include/cglm/mat4.h | 8 ++-- include/cglm/quat.h | 2 +- include/cglm/simd/intrin.h | 12 +++++- include/cglm/simd/sse2/affine.h | 38 +++++++++---------- include/cglm/simd/sse2/mat4.h | 66 ++++++++++++++++----------------- include/cglm/simd/sse2/quat.h | 4 +- include/cglm/vec4-ext.h | 6 +-- include/cglm/vec4.h | 58 ++++++++++++++--------------- 9 files changed, 118 insertions(+), 110 deletions(-) diff --git a/include/cglm/affine.h b/include/cglm/affine.h index 8124930..b200f30 100644 --- a/include/cglm/affine.h +++ b/include/cglm/affine.h @@ -59,18 +59,18 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(dest[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(t[0]), + _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(t[0]), _mm_set1_ps(v[0])), - _mm_mul_ps(_mm_load_ps(t[1]), + _mm_mul_ps(glmm_load(t[1]), _mm_set1_ps(v[1]))), - _mm_add_ps(_mm_mul_ps(_mm_load_ps(t[2]), + _mm_add_ps(_mm_mul_ps(glmm_load(t[2]), _mm_set1_ps(v[2])), - _mm_load_ps(t[3])))) + glmm_load(t[3])))) ; - _mm_store_ps(dest[0], _mm_load_ps(m[0])); - _mm_store_ps(dest[1], _mm_load_ps(m[1])); - _mm_store_ps(dest[2], _mm_load_ps(m[2])); + _mm_store_ps(dest[0], glmm_load(m[0])); + _mm_store_ps(dest[1], glmm_load(m[1])); + _mm_store_ps(dest[2], glmm_load(m[2])); #else vec4 v1, v2, v3; @@ -98,13 +98,13 @@ void glm_translate(mat4 m, vec3 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]), _mm_set1_ps(v[0])), - _mm_mul_ps(_mm_load_ps(m[1]), + _mm_mul_ps(glmm_load(m[1]), _mm_set1_ps(v[1]))), - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), _mm_set1_ps(v[2])), - _mm_load_ps(m[3])))) + glmm_load(m[3])))) ; #else vec4 v1, v2, v3; @@ -130,9 +130,9 @@ void glm_translate_x(mat4 m, float x) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), _mm_set1_ps(x)), - _mm_load_ps(m[3]))) + glmm_load(m[3]))) ; #else vec4 v1; @@ -152,9 +152,9 @@ void glm_translate_y(mat4 m, float y) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[1]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[1]), _mm_set1_ps(y)), - _mm_load_ps(m[3]))) + glmm_load(m[3]))) ; #else vec4 v1; @@ -174,9 +174,9 @@ void glm_translate_z(mat4 m, float z) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), _mm_set1_ps(z)), - _mm_load_ps(m[3]))) + glmm_load(m[3]))) ; #else vec4 v1; diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index d1a72cb..8cafb7c 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -113,10 +113,10 @@ glm_mat4_copy(mat4 mat, mat4 dest) { _mm256_store_ps(dest[0], _mm256_load_ps(mat[0])); _mm256_store_ps(dest[2], _mm256_load_ps(mat[2])); #elif defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest[0], _mm_load_ps(mat[0])); - _mm_store_ps(dest[1], _mm_load_ps(mat[1])); - _mm_store_ps(dest[2], _mm_load_ps(mat[2])); - _mm_store_ps(dest[3], _mm_load_ps(mat[3])); + _mm_store_ps(dest[0], glmm_load(mat[0])); + _mm_store_ps(dest[1], glmm_load(mat[1])); + 
_mm_store_ps(dest[2], glmm_load(mat[2])); + _mm_store_ps(dest[3], glmm_load(mat[3])); #else glm_mat4_ucopy(mat, dest); #endif diff --git a/include/cglm/quat.h b/include/cglm/quat.h index f6dc86e..6bff527 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -198,7 +198,7 @@ glm_quat_normalize_to(versor q, versor dest) { __m128 xdot, x0; float dot; - x0 = _mm_load_ps(q); + x0 = glmm_load(q); xdot = glm_simd_dot(x0, x0); dot = _mm_cvtss_f32(xdot); diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index cf6753f..bf1db60 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -35,7 +35,7 @@ _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ z1, y1, x1, w1) -CGLM_INLINE +static inline __m128 glm_simd_dot(__m128 a, __m128 b) { __m128 x0; @@ -44,7 +44,7 @@ glm_simd_dot(__m128 a, __m128 b) { return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)); } -CGLM_INLINE +static inline __m128 glm_simd_norm(__m128 a) { return _mm_sqrt_ps(glm_simd_dot(a, a)); @@ -69,6 +69,14 @@ glm_simd_store_v3(__m128 vx, vec3 v) { _mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2)); } +#ifdef CGLM_ALL_UNALIGNED +#define glmm_load(p) _mm_loadu_ps(p) +#define glmm_store(p, a) _mm_storeu_ps(p, a) +#else +#define glmm_load(p) _mm_load_ps(p) +#define glmm_store(p, a) _mm_store_ps(p, a) +#endif + #endif /* x86, x64 */ diff --git a/include/cglm/simd/sse2/affine.h b/include/cglm/simd/sse2/affine.h index b746d0f..df8c166 100644 --- a/include/cglm/simd/sse2/affine.h +++ b/include/cglm/simd/sse2/affine.h @@ -18,30 +18,30 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { /* D = R * L (Column-Major) */ __m128 l0, l1, l2, l3, r; - l0 = _mm_load_ps(m1[0]); - l1 = _mm_load_ps(m1[1]); - l2 = _mm_load_ps(m1[2]); - l3 = _mm_load_ps(m1[3]); + l0 = glmm_load(m1[0]); + l1 = glmm_load(m1[1]); + l2 = glmm_load(m1[2]); + l3 = glmm_load(m1[3]); - r = _mm_load_ps(m2[0]); + r = glmm_load(m2[0]); _mm_store_ps(dest[0], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[1]); + r = glmm_load(m2[1]); _mm_store_ps(dest[1], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[2]); + r = glmm_load(m2[2]); _mm_store_ps(dest[2], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[3]); + r = glmm_load(m2[3]); _mm_store_ps(dest[3], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), @@ -55,24 +55,24 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) { /* D = R * L (Column-Major) */ __m128 l0, l1, l2, l3, r; - l0 = _mm_load_ps(m1[0]); - l1 = _mm_load_ps(m1[1]); - l2 = _mm_load_ps(m1[2]); - l3 = _mm_load_ps(m1[3]); + l0 = glmm_load(m1[0]); + l1 = glmm_load(m1[1]); + l2 = glmm_load(m1[2]); + l3 = glmm_load(m1[3]); - r = _mm_load_ps(m2[0]); + r = glmm_load(m2[0]); _mm_store_ps(dest[0], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[1]); + r = glmm_load(m2[1]); _mm_store_ps(dest[1], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[2]); + r = glmm_load(m2[2]); _mm_store_ps(dest[2], 
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), @@ -86,10 +86,10 @@ void glm_inv_tr_sse2(mat4 mat) { __m128 r0, r1, r2, r3, x0, x1; - r0 = _mm_load_ps(mat[0]); - r1 = _mm_load_ps(mat[1]); - r2 = _mm_load_ps(mat[2]); - r3 = _mm_load_ps(mat[3]); + r0 = glmm_load(mat[0]); + r1 = glmm_load(mat[1]); + r2 = glmm_load(mat[2]); + r3 = glmm_load(mat[3]); x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); _MM_TRANSPOSE4_PS(r0, r1, r2, x1); diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 77874a8..404b496 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -20,10 +20,10 @@ glm_mat4_scale_sse2(mat4 m, float s){ __m128 x0; x0 = _mm_set1_ps(s); - _mm_store_ps(m[0], _mm_mul_ps(_mm_load_ps(m[0]), x0)); - _mm_store_ps(m[1], _mm_mul_ps(_mm_load_ps(m[1]), x0)); - _mm_store_ps(m[2], _mm_mul_ps(_mm_load_ps(m[2]), x0)); - _mm_store_ps(m[3], _mm_mul_ps(_mm_load_ps(m[3]), x0)); + _mm_store_ps(m[0], _mm_mul_ps(glmm_load(m[0]), x0)); + _mm_store_ps(m[1], _mm_mul_ps(glmm_load(m[1]), x0)); + _mm_store_ps(m[2], _mm_mul_ps(glmm_load(m[2]), x0)); + _mm_store_ps(m[3], _mm_mul_ps(glmm_load(m[3]), x0)); } CGLM_INLINE @@ -31,10 +31,10 @@ void glm_mat4_transp_sse2(mat4 m, mat4 dest){ __m128 r0, r1, r2, r3; - r0 = _mm_load_ps(m[0]); - r1 = _mm_load_ps(m[1]); - r2 = _mm_load_ps(m[2]); - r3 = _mm_load_ps(m[3]); + r0 = glmm_load(m[0]); + r1 = glmm_load(m[1]); + r2 = glmm_load(m[2]); + r3 = glmm_load(m[3]); _MM_TRANSPOSE4_PS(r0, r1, r2, r3); @@ -51,31 +51,31 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { __m128 l0, l1, l2, l3, r; - l0 = _mm_load_ps(m1[0]); - l1 = _mm_load_ps(m1[1]); - l2 = _mm_load_ps(m1[2]); - l3 = _mm_load_ps(m1[3]); + l0 = glmm_load(m1[0]); + l1 = glmm_load(m1[1]); + l2 = glmm_load(m1[2]); + l3 = glmm_load(m1[3]); - r = _mm_load_ps(m2[0]); + r = glmm_load(m2[0]); _mm_store_ps(dest[0], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); - r = _mm_load_ps(m2[1]); + r = glmm_load(m2[1]); _mm_store_ps(dest[1], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); - r = _mm_load_ps(m2[2]); + r = glmm_load(m2[2]); _mm_store_ps(dest[2], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); - r = _mm_load_ps(m2[3]); + r = glmm_load(m2[3]); _mm_store_ps(dest[3], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), @@ -88,15 +88,15 @@ void glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { __m128 x0, x1, x2; - x0 = _mm_load_ps(v); - x1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + x0 = glmm_load(v); + x1 = _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), _mm_shuffle1_ps1(x0, 0)), - _mm_mul_ps(_mm_load_ps(m[1]), + _mm_mul_ps(glmm_load(m[1]), _mm_shuffle1_ps1(x0, 1))); - x2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + x2 = _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), _mm_shuffle1_ps1(x0, 2)), - _mm_mul_ps(_mm_load_ps(m[3]), + _mm_mul_ps(glmm_load(m[3]), _mm_shuffle1_ps1(x0, 3))); _mm_store_ps(dest, _mm_add_ps(x1, x2)); @@ -108,10 +108,10 @@ glm_mat4_det_sse2(mat4 mat) { __m128 r0, r1, r2, r3, x0, x1, x2; /* 127 <- 0, [square] det(A) = det(At) */ - r0 = 
_mm_load_ps(mat[0]); /* d c b a */ - r1 = _mm_load_ps(mat[1]); /* h g f e */ - r2 = _mm_load_ps(mat[2]); /* l k j i */ - r3 = _mm_load_ps(mat[3]); /* p o n m */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ /* t[1] = j * p - n * l; @@ -166,10 +166,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) { x0, x1, x2, x3, x4, x5, x6, x7; /* 127 <- 0 */ - r0 = _mm_load_ps(mat[0]); /* d c b a */ - r1 = _mm_load_ps(mat[1]); /* h g f e */ - r2 = _mm_load_ps(mat[2]); /* l k j i */ - r3 = _mm_load_ps(mat[3]); /* p o n m */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */ @@ -290,10 +290,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { x0, x1, x2, x3, x4, x5, x6, x7; /* 127 <- 0 */ - r0 = _mm_load_ps(mat[0]); /* d c b a */ - r1 = _mm_load_ps(mat[1]); /* h g f e */ - r2 = _mm_load_ps(mat[2]); /* l k j i */ - r3 = _mm_load_ps(mat[3]); /* p o n m */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */ diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h index 5dbf759..4970eff 100644 --- a/include/cglm/simd/sse2/quat.h +++ b/include/cglm/simd/sse2/quat.h @@ -24,8 +24,8 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) { __m128 xp, xq, x0, r; - xp = _mm_load_ps(p); /* 3 2 1 0 */ - xq = _mm_load_ps(q); + xp = glmm_load(p); /* 3 2 1 0 */ + xq = glmm_load(q); r = _mm_mul_ps(_mm_shuffle1_ps1(xp, 3), xq); diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index 7a6cb3d..1055ebe 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -42,7 +42,7 @@ CGLM_INLINE void glm_vec4_mulv(vec4 a, vec4 b, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); #else d[0] = a[0] * b[0]; d[1] = a[1] * b[1]; @@ -223,7 +223,7 @@ glm_vec4_sign(vec4 v, vec4 dest) { #if defined( __SSE2__ ) || defined( __SSE2__ ) __m128 x0, x1, x2, x3, x4; - x0 = _mm_load_ps(v); + x0 = glmm_load(v); x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f); x2 = _mm_shuffle1_ps1(x1, 2); @@ -249,7 +249,7 @@ CGLM_INLINE void glm_vec4_sqrt(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sqrt_ps(_mm_load_ps(v))); + _mm_store_ps(dest, _mm_sqrt_ps(glmm_load(v))); #else dest[0] = sqrtf(v[0]); dest[1] = sqrtf(v[1]); diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 2e9ca45..b2a9b97 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -111,7 +111,7 @@ CGLM_INLINE void glm_vec4_copy(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_load_ps(v)); + _mm_store_ps(dest, glmm_load(v)); #else dest[0] = v[0]; dest[1] = v[1]; @@ -169,7 +169,7 @@ float glm_vec4_dot(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; - x0 = _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)); + x0 = _mm_mul_ps(glmm_load(a), glmm_load(b)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 
1))); #else @@ -193,7 +193,7 @@ float glm_vec4_norm2(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; - x0 = _mm_load_ps(v); + x0 = glmm_load(v); x0 = _mm_mul_ps(x0, x0); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1))); @@ -214,7 +214,7 @@ float glm_vec4_norm(vec4 vec) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; - x0 = _mm_load_ps(vec); + x0 = glmm_load(vec); return _mm_cvtss_f32(_mm_sqrt_ss(glm_simd_dot(x0, x0))); #else return sqrtf(glm_vec4_norm2(vec)); @@ -232,7 +232,7 @@ CGLM_INLINE void glm_vec4_add(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] + b[0]; dest[1] = a[1] + b[1]; @@ -252,7 +252,7 @@ CGLM_INLINE void glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] + s; dest[1] = v[1] + s; @@ -272,7 +272,7 @@ CGLM_INLINE void glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sub_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] - b[0]; dest[1] = a[1] - b[1]; @@ -292,7 +292,7 @@ CGLM_INLINE void glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sub_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] - s; dest[1] = v[1] - s; @@ -312,7 +312,7 @@ CGLM_INLINE void glm_vec4_mul(vec4 a, vec4 b, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); #else d[0] = a[0] * b[0]; d[1] = a[1] * b[1]; @@ -332,7 +332,7 @@ CGLM_INLINE void glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_mul_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] * s; dest[1] = v[1] * s; @@ -373,7 +373,7 @@ CGLM_INLINE void glm_vec4_div(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_div_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(dest, _mm_div_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] / b[0]; dest[1] = a[1] / b[1]; @@ -393,7 +393,7 @@ CGLM_INLINE void glm_vec4_divs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_div_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s))); #else glm_vec4_scale(v, 1.0f / s, dest); #endif @@ -413,9 +413,9 @@ CGLM_INLINE void glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - _mm_add_ps(_mm_load_ps(a), - _mm_load_ps(b)))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_add_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] + b[0]; dest[1] += a[1] + b[1]; @@ -437,9 +437,9 @@ CGLM_INLINE void glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - 
_mm_sub_ps(_mm_load_ps(a), - _mm_load_ps(b)))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_sub_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] - b[0]; dest[1] += a[1] - b[1]; @@ -461,9 +461,9 @@ CGLM_INLINE void glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - _mm_mul_ps(_mm_load_ps(a), - _mm_load_ps(b)))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_mul_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] * b[0]; dest[1] += a[1] * b[1]; @@ -485,8 +485,8 @@ CGLM_INLINE void glm_vec4_muladds(vec4 a, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - _mm_mul_ps(_mm_load_ps(a), + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_mul_ps(glmm_load(a), _mm_set1_ps(s)))); #else dest[0] += a[0] * s; @@ -505,7 +505,7 @@ CGLM_INLINE void glm_vec4_flipsign(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_xor_ps(_mm_load_ps(v), _mm_set1_ps(-0.0f))); + _mm_store_ps(v, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); #else v[0] = -v[0]; v[1] = -v[1]; @@ -524,7 +524,7 @@ CGLM_INLINE void glm_vec4_flipsign_to(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_xor_ps(_mm_load_ps(v), + _mm_store_ps(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); #else dest[0] = -v[0]; @@ -571,7 +571,7 @@ glm_vec4_normalize_to(vec4 vec, vec4 dest) { __m128 xdot, x0; float dot; - x0 = _mm_load_ps(vec); + x0 = glmm_load(vec); xdot = glm_simd_dot(x0, x0); dot = _mm_cvtss_f32(xdot); @@ -633,7 +633,7 @@ CGLM_INLINE void glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_max_ps(_mm_load_ps(v1), _mm_load_ps(v2))); + _mm_store_ps(dest, _mm_max_ps(glmm_load(v1), glmm_load(v2))); #else dest[0] = glm_max(v1[0], v2[0]); dest[1] = glm_max(v1[1], v2[1]); @@ -653,7 +653,7 @@ CGLM_INLINE void glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_min_ps(_mm_load_ps(v1), _mm_load_ps(v2))); + _mm_store_ps(dest, _mm_min_ps(glmm_load(v1), glmm_load(v2))); #else dest[0] = glm_min(v1[0], v2[0]); dest[1] = glm_min(v1[1], v2[1]); @@ -673,7 +673,7 @@ CGLM_INLINE void glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_min_ps(_mm_max_ps(_mm_load_ps(v), _mm_set1_ps(minVal)), + _mm_store_ps(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), _mm_set1_ps(maxVal))); #else v[0] = glm_clamp(v[0], minVal, maxVal);
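Usage note (not part of the patch): the glmm_load/glmm_store wrappers added to simd/intrin.h select between the aligned and unaligned SSE intrinsics at compile time, and every SSE load in the hunks above is routed through glmm_load. A minimal sketch of how a consumer opts into the unaligned variants follows; CGLM_ALL_UNALIGNED, glmm_load and glm_vec4_add come straight from the patch, while the <cglm/vec4.h> include path, the _Alignas qualifier and main() are illustrative assumptions.

/* Sketch, assuming a typical cglm install layout: defining CGLM_ALL_UNALIGNED
 * before including any cglm header makes glmm_load expand to _mm_loadu_ps
 * instead of _mm_load_ps, so the SSE paths rewritten above no longer require
 * 16-byte aligned source data. */
#define CGLM_ALL_UNALIGNED
#include <cglm/vec4.h>

int main(void) {
  vec4 a = {1.0f, 2.0f, 3.0f, 4.0f};  /* may be unaligned: read via glmm_load */
  vec4 b = {5.0f, 6.0f, 7.0f, 8.0f};
  _Alignas(16) vec4 sum;              /* stores in these hunks still call
                                         _mm_store_ps, so the destination is
                                         kept 16-byte aligned here */

  glm_vec4_add(a, b, sum);            /* SSE path loads a and b with glmm_load */
  return 0;
}

Design observation grounded in the diff: only the load side is switched in this patch; glmm_store is defined in intrin.h but the store sites above still call _mm_store_ps directly, which is why the destination in the sketch stays aligned.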