From abfa355b8402d688ff80d682084ad16847d87352 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 30 Oct 2018 09:27:55 +0300 Subject: [PATCH] avx: optimize (re-use) mat4_mul registers --- include/cglm/simd/avx/mat4.h | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h index 944769b..9a37e05 100644 --- a/include/cglm/simd/avx/mat4.h +++ b/include/cglm/simd/avx/mat4.h @@ -19,26 +19,32 @@ void glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { /* D = R * L (Column-Major) */ - __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; + __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; + __m256i yi0, yi1, yi2, yi3; - y0 = glmm_load256(m2[0]); /* h g f e d c b a */ - y1 = glmm_load256(m2[2]); /* p o n m l k j i */ + y0 = glmm_load256(m2[0]); /* h g f e d c b a */ + y1 = glmm_load256(m2[2]); /* p o n m l k j i */ - y2 = glmm_load256(m1[0]); /* h g f e d c b a */ - y3 = glmm_load256(m1[2]); /* p o n m l k j i */ + y2 = glmm_load256(m1[0]); /* h g f e d c b a */ + y3 = glmm_load256(m1[2]); /* p o n m l k j i */ /* 0x03: 0b00000011 */ - y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */ - y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */ + y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */ + y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */ + + yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0); + yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2); + yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1); + yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3); /* f f f f a a a a */ /* h h h h c c c c */ /* e e e e b b b b */ /* g g g g d d d d */ - y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); - y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + y6 = _mm256_permutevar_ps(y0, yi0); + y7 = _mm256_permutevar_ps(y0, yi1); + y8 = _mm256_permutevar_ps(y0, yi2); + y9 = _mm256_permutevar_ps(y0, yi3); glmm_store256(dest[0], _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), @@ -50,10 +56,10 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { /* p p p p k k k k */ /* m m m m j j j j */ /* o o o o l l l l */ - y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); - y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + y6 = _mm256_permutevar_ps(y1, yi0); + y7 = _mm256_permutevar_ps(y1, yi1); + y8 = _mm256_permutevar_ps(y1, yi2); + y9 = _mm256_permutevar_ps(y1, yi3); glmm_store256(dest[2], _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),