diff --git a/include/cglm-mat-simd-avx.h b/include/cglm-mat-simd-avx.h new file mode 100644 index 0000000..3085149 --- /dev/null +++ b/include/cglm-mat-simd-avx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c), Recep Aslantas. + * + * MIT License (MIT), http://opensource.org/licenses/MIT + * Full license can be found in the LICENSE file + */ + +#ifndef cglm_mat_simd_avx_h +#define cglm_mat_simd_avx_h + +#ifdef __AVX__ +#include "cglm-intrin.h" +#include + +CGLM_INLINE +void +glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; + + y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */ + y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */ + + y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */ + y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */ + + y4 = _mm256_permute2f128_ps(y0, y0, 0b00000011); /* d c b a h g f e */ + y5 = _mm256_permute2f128_ps(y3, y1, 0b00000011); /* l k j i p o n m */ + + /* f f f f a a a a */ + /* h h h h c c c c */ + /* e e e e b b b b */ + /* g g g g d d d d */ + y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); + y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); + y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + + _mm256_store_ps(dest[0], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); + + /* n n n n i i i i */ + /* p p p p k k k k */ + /* m m m m j j j j */ + /* o o o o l l l l */ + y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); + y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); + y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + + _mm256_store_ps(dest[2], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); +} + +#endif +#endif /* cglm_mat_simd_avx_h */ diff --git a/include/cglm-mat.h b/include/cglm-mat.h index d830746..1848d6a 100644 --- a/include/cglm-mat.h +++ b/include/cglm-mat.h @@ -10,6 +10,7 @@ #include "cglm.h" #include "cglm-mat-simd.h" +#include "cglm-mat-simd-avx.h" #include #define GLM_MAT4_IDENTITY_INIT {1.0f, 0.0f, 0.0f, 0.0f, \ @@ -22,7 +23,9 @@ CGLM_INLINE void glm_mat4_mul(mat4 l, mat4 r, mat4 d) { -#if defined( __SSE__ ) || defined( __SSE2__ ) +#ifdef __AVX__ + glm_mat4_mul_avx(l, r, d); +#elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_mul_sse2(l, r, d); #else d[0][0] = l[0][0] * r[0][0] + l[1][0] * r[0][1] +