From 599524dacf973a96a0e7d6ae7159ad10aa8457a4 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Thu, 10 May 2018 16:27:40 +0300
Subject: [PATCH] docs: add new option to docs

---
 docs/source/opt.rst           | 8 +++++++-
 include/cglm/simd/intrin.h    | 2 +-
 include/cglm/simd/sse2/mat4.h | 6 +++---
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/docs/source/opt.rst b/docs/source/opt.rst
index c614e42..4acb972 100644
--- a/docs/source/opt.rst
+++ b/docs/source/opt.rst
@@ -5,7 +5,7 @@ Options
 
 A few options are provided via macros.
 
-Alignment
+Alignment Option
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 As default, cglm requires types to be aligned. Alignment requirements:
@@ -34,3 +34,9 @@ have to compile cglm with **CGLM_ALL_UNALIGNED** macro.
   ALWAYS USE SAME CONFIGURATION / OPTION for **cglm** if you have multiple
   projects. For instance if you set CGLM_ALL_UNALIGNED in a project
   then set it in other projects too
+
+SSE and SSE2 Shuffle Option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**_mm_shuffle_ps** generates the **shufps** instruction even when both of its
+operands are the same register. You can force cglm to emit **pshufd** instead
+by defining the **CGLM_USE_INT_DOMAIN** macro. It is not defined by default.
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index 2f79b01..f4854bd 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -32,7 +32,7 @@
 #ifdef CGLM_USE_INT_DOMAIN
 # define glmm_shuff1(xmm, z, y, x, w)                                         \
      _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
-                     _MM_SHUFFLE(z, y, x, w)))
+                                        _MM_SHUFFLE(z, y, x, w)))
 #else
 # define glmm_shuff1(xmm, z, y, x, w)                                         \
      _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h
index 20afeae..c2acb9f 100644
--- a/include/cglm/simd/sse2/mat4.h
+++ b/include/cglm/simd/sse2/mat4.h
@@ -168,8 +168,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
   r3 = glmm_load(mat[3]); /* p o n m */
 
   x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */
-  x1 = glmm_shuff1(x0, 1, 3, 3, 3); /* l p p p */
-  x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */
+  x1 = glmm_shuff1(x0, 1, 3, 3, 3);                     /* l p p p */
+  x2 = glmm_shuff1(x0, 0, 2, 2, 2);                     /* k o o o */
   x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */
   x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */
 
@@ -180,7 +180,7 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
   t0 = _mm_sub_ps(_mm_mul_ps(x3, x1), _mm_mul_ps(x2, x0));
 
   x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
-  x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
+  x4 = glmm_shuff1(x4, 0, 2, 2, 2);                     /* j n n n */
   x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
 
   /* t1[1] = j * p - n * l;
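
A minimal consumer-side sketch of the new option (not part of the patch; the program
below is an assumed example, not cglm's own code). Because CGLM_USE_INT_DOMAIN is
consumed via #ifdef in cglm/simd/intrin.h, it has to be visible before the first cglm
include, either as a #define or as a compiler flag such as -DCGLM_USE_INT_DOMAIN.
With it defined, glmm_shuff1 expands to the _mm_castsi128_ps(_mm_shuffle_epi32(...))
form shown above, so single-register shuffles compile to pshufd instead of shufps.

/* sketch: assumed consumer program, SSE2-enabled x86 build using cglm */
#define CGLM_USE_INT_DOMAIN   /* optional; omit to keep the default shufps path */
#include <cglm/cglm.h>

int main(void) {
  mat4 m = GLM_MAT4_IDENTITY_INIT;
  mat4 inv;

  /* glm_mat4_inv_fast() takes the SSE2 path (glm_mat4_inv_fast_sse2 above) when
     SSE2 is enabled; that path calls glmm_shuff1 and is affected by the option */
  glm_mat4_inv_fast(m, inv);
  return 0;
}

Whether pshufd is actually preferable to shufps depends on the target
microarchitecture (integer- vs floating-point-domain bypass latency), which is
presumably why the option is left undefined by default.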