diff --git a/include/cglm-mat3-simd-sse2.h b/include/cglm-mat3-simd-sse2.h
index 4c6e7f4..aed2d71 100644
--- a/include/cglm-mat3-simd-sse2.h
+++ b/include/cglm-mat3-simd-sse2.h
@@ -11,6 +11,60 @@
 
 #include "cglm-intrin.h"
 
+/*!
+ * @brief transpose mat3 with SSE shuffles and store the result in dest
+ *
+ * The first eight floats are moved as two unaligned four-float vectors; the
+ * ninth, m[2][2], is copied separately. Both vectors are loaded before
+ * anything is stored, so dest may be the same matrix as m.
+ *
+ * @param[in]  m    source matrix
+ * @param[out] dest transposed matrix
+ */
+CGLM_INLINE
+void
+glm_mat3_transp_to_sse2(mat3 m, mat3 dest) {
+  __m128 x0, x1, x2, x3, x4;
+
+  /*
+   the nine floats in memory order, before and after:
+
+     a b c d       a d g b
+     e f g h  ->   e h c f
+     j             j
+   */
+
+  /* lanes are listed lowest first: x0 = a b c d, x1 = e f g h */
+  x0 = _mm_loadu_ps(&m[0][0]);
+  x1 = _mm_loadu_ps(&m[1][1]);
+
+  /* x2 = b b g g, x3 = a d g b */
+  x2 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 1, 1));
+  x3 = _mm_shuffle_ps(x0, x2, _MM_SHUFFLE(0, 2, 3, 0));
+
+  /* x2 = f f c c, x4 = e h c f */
+  x2 = _mm_shuffle_ps(x1, x0, _MM_SHUFFLE(2, 2, 1, 1));
+  x4 = _mm_shuffle_ps(x1, x2, _MM_SHUFFLE(0, 2, 3, 0));
+
+  _mm_storeu_ps(&dest[0][0], x3);
+  _mm_storeu_ps(&dest[1][1], x4);
+
+  /* j is on the diagonal, so it keeps its position */
+  dest[2][2] = m[2][2];
+}
+
+/*!
+ * @brief transpose mat3 in place with SSE shuffles
+ *
+ * @param[in, out] m matrix to transpose
+ */
+CGLM_INLINE
+void
+glm_mat3_transp_sse2(mat3 m) {
+  /* the _to variant loads everything before it stores, so m can be both */
+  glm_mat3_transp_to_sse2(m, m);
+}
+
 CGLM_INLINE
 void
 glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) {
diff --git a/include/cglm-mat3.h b/include/cglm-mat3.h
index f17f31a..6016d8a 100644
--- a/include/cglm-mat3.h
+++ b/include/cglm-mat3.h
@@ -76,6 +76,70 @@ glm_mat3_mul(mat3 m1, mat3 m2, mat3 dest) {
 #endif
 }
 
+/*!
+ * @brief transpose mat3 and store in dest
+ *
+ * source matrix will not be transposed unless dest is m
+ *
+ * @param[in]  m    matrix
+ * @param[out] dest result (may be the same matrix as m)
+ */
+CGLM_INLINE
+void
+glm_mat3_transpose_to(mat3 m, mat3 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat3_transp_to_sse2(m, dest);
+#else
+  /* dest may be m: read m[0][1], m[0][2] and m[1][2] up front, because
+     the stores below overwrite them before they would be needed */
+  float m01, m02, m12;
+
+  m01 = m[0][1];
+  m02 = m[0][2];
+  m12 = m[1][2];
+
+  dest[0][0] = m[0][0];
+  dest[0][1] = m[1][0];
+  dest[0][2] = m[2][0];
+
+  dest[1][0] = m01;
+  dest[1][1] = m[1][1];
+  dest[1][2] = m[2][1];
+
+  dest[2][0] = m02;
+  dest[2][1] = m12;
+  dest[2][2] = m[2][2];
+#endif
+}
+
+/*!
+ * @brief transpose mat3 and store result in same matrix
+ *
+ * @param[in, out] m source and dest
+ */
+CGLM_INLINE
+void
+glm_mat3_transpose(mat3 m) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat3_transp_sse2(m);
+#else
+  float tmp;
+
+  /* swap each off-diagonal pair; the diagonal is left untouched */
+  tmp     = m[0][1];
+  m[0][1] = m[1][0];
+  m[1][0] = tmp;
+
+  tmp     = m[0][2];
+  m[0][2] = m[2][0];
+  m[2][0] = tmp;
+
+  tmp     = m[1][2];
+  m[1][2] = m[2][1];
+  m[2][1] = tmp;
+#endif
+}
+
 CGLM_INLINE
 void
 glm_mat3_print(mat3 matrix,