From 56f0bb0928e127ed993ec6e46aef9ad7d8cfd819 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Tue, 8 May 2018 15:35:17 +0300
Subject: [PATCH] simd, avx: make alignment optional for load/store operations

---
 include/cglm/mat4.h            |  4 ++--
 include/cglm/simd/avx/affine.h | 26 +++++++++++++-------------
 include/cglm/simd/avx/mat4.h   | 28 ++++++++++++++--------------
 include/cglm/simd/intrin.h     | 17 +++++++++++++----
 4 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index 88563cb..f0b6736 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -110,8 +110,8 @@ CGLM_INLINE
 void
 glm_mat4_copy(mat4 mat, mat4 dest) {
 #ifdef __AVX__
-  _mm256_store_ps(dest[0], _mm256_load_ps(mat[0]));
-  _mm256_store_ps(dest[2], _mm256_load_ps(mat[2]));
+  glmm_store256(dest[0], glmm_load256(mat[0]));
+  glmm_store256(dest[2], glmm_load256(mat[2]));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest[0], glmm_load(mat[0]));
   glmm_store(dest[1], glmm_load(mat[1]));
diff --git a/include/cglm/simd/avx/affine.h b/include/cglm/simd/avx/affine.h
index 1b0dcea..5c7f71c 100644
--- a/include/cglm/simd/avx/affine.h
+++ b/include/cglm/simd/avx/affine.h
@@ -21,11 +21,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
 
   __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
 
-  y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */
-  y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */
+  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
+  y1 = glmm_load256(m2[2]); /* p o n m l k j i */
 
-  y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */
-  y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */
+  y2 = glmm_load256(m1[0]); /* h g f e d c b a */
+  y3 = glmm_load256(m1[2]); /* p o n m l k j i */
 
   y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */
   y5 = _mm256_permute2f128_ps(y3, y3, 0b00000000); /* l k j i l k j i */
@@ -37,10 +37,10 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
   y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
 
-  _mm256_store_ps(dest[0],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y4, y8)),
-                                _mm256_mul_ps(y5, y7)));
+  glmm_store256(dest[0],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y4, y8)),
+                              _mm256_mul_ps(y5, y7)));
 
   /* n n n n i i i i */
@@ -52,11 +52,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
   y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
 
-  _mm256_store_ps(dest[2],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y3, y7)),
-                                _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                              _mm256_mul_ps(y5, y9))));
+  glmm_store256(dest[2],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y3, y7)),
+                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+                                            _mm256_mul_ps(y5, y9))));
 }
 
 #endif
diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h
index e2ef9da..b5859a7 100644
--- a/include/cglm/simd/avx/mat4.h
+++ b/include/cglm/simd/avx/mat4.h
@@ -21,11 +21,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
 
   __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
 
-  y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */
-  y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */
+  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
+  y1 = glmm_load256(m2[2]); /* p o n m l k j i */
 
-  y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */
-  y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */
+  y2 = glmm_load256(m1[0]); /* h g f e d c b a */
+  y3 = glmm_load256(m1[2]); /* p o n m l k j i */
 
   y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */
   y5 = _mm256_permute2f128_ps(y3, y3, 0b00000011); /* l k j i p o n m */
@@ -39,11 +39,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
   y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
 
-  _mm256_store_ps(dest[0],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y3, y7)),
-                                _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                              _mm256_mul_ps(y5, y9))));
+  glmm_store256(dest[0],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y3, y7)),
+                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+                                            _mm256_mul_ps(y5, y9))));
 
   /* n n n n i i i i */
   /* p p p p k k k k */
@@ -54,11 +54,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
   y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
 
-  _mm256_store_ps(dest[2],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y3, y7)),
-                                _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                              _mm256_mul_ps(y5, y9))));
+  glmm_store256(dest[2],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y3, y7)),
+                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+                                            _mm256_mul_ps(y5, y9))));
 }
 
 #endif
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index bf1db60..8fd1526 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -70,11 +70,11 @@ glm_simd_store_v3(__m128 vx, vec3 v) {
 }
 
 #ifdef CGLM_ALL_UNALIGNED
-#define glmm_load(p)      _mm_loadu_ps(p)
-#define glmm_store(p, a)  _mm_storeu_ps(p, a)
+#  define glmm_load(p)      _mm_loadu_ps(p)
+#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
 #else
-#define glmm_load(p)      _mm_load_ps(p)
-#define glmm_store(p, a)  _mm_store_ps(p, a)
+#  define glmm_load(p)      _mm_load_ps(p)
+#  define glmm_store(p, a)  _mm_store_ps(p, a)
 #endif
 
 #endif
@@ -86,6 +86,15 @@ glm_simd_store_v3(__m128 vx, vec3 v) {
 
 #ifdef __AVX__
 #  define CGLM_AVX_FP 1
+
+#ifdef CGLM_ALL_UNALIGNED
+#  define glmm_load256(p)      _mm256_loadu_ps(p)
+#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
+#else
+#  define glmm_load256(p)      _mm256_load_ps(p)
+#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
+#endif
+
 #endif
 
 /* ARM Neon */
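
Usage sketch (not part of the patch): with this change, defining CGLM_ALL_UNALIGNED before including cglm makes both the 128-bit glmm_load/glmm_store and the new 256-bit glmm_load256/glmm_store256 macros expand to the unaligned intrinsics, so AVX paths such as glm_mat4_copy and glm_mat4_mul no longer require 32-byte aligned matrices. The heap buffer below is only an illustration of an address with no 32-byte guarantee; it is an assumed example, not code from the patch.

/* define before any cglm header so the glmm_* macros pick the
   unaligned _mm*_loadu_ps / _mm*_storeu_ps intrinsics */
#define CGLM_ALL_UNALIGNED
#include <cglm/cglm.h>
#include <stdlib.h>

int main(void) {
  /* malloc gives no 32-byte guarantee; safe here because glm_mat4_copy
     goes through glmm_load256/glmm_store256 on AVX builds */
  mat4 *m = malloc(2 * sizeof(mat4));
  if (!m)
    return 1;

  glm_mat4_identity(m[0]);
  glm_mat4_copy(m[0], m[1]); /* AVX copy via glmm_load256/glmm_store256 */

  free(m);
  return 0;
}

On most recent x86 cores the unaligned load/store instructions carry little or no penalty when the address happens to be aligned, so leaving CGLM_ALL_UNALIGNED undefined only matters when you can actually guarantee alignment and want the aligned forms to fault on violations.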