From 8a6fe6948a676be43056cc880bf5fe9295aa15e3 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 20 Sep 2016 03:13:48 +0300 Subject: [PATCH] improve mat4 mul --- include/cglm-intrin.h | 15 ++------- include/cglm-mat-simd.h | 59 +++++++++++++++++++++----------- include/cglm-mat.h | 75 +++++++++++++++++++---------------------- 3 files changed, 77 insertions(+), 72 deletions(-) diff --git a/include/cglm-intrin.h b/include/cglm-intrin.h index a41a518..fccc35c 100644 --- a/include/cglm-intrin.h +++ b/include/cglm-intrin.h @@ -12,23 +12,14 @@ #include /* float */ -#define _mm_madd_ps(L, R0, R1) \ - _mm_add_ps(_mm_mul_ps(_mm_set1_ps(*(L)), R0), \ - _mm_mul_ps(_mm_set1_ps(*(L + 1)), R1)) - -#define _mm_msub_ps(M00, M01, M10, M11) \ - _mm_sub_ps(_mm_mul_ps(M00, M01), \ - _mm_mul_ps(M10, M11)) - #define _mm_shuffle1_ps(a, z, y, x, w) \ _mm_shuffle_ps(a, a, _MM_SHUFFLE(z, y, x, w)) +#define _mm_shuffle1_ps1(a, x) \ + _mm_shuffle_ps(a, a, _MM_SHUFFLE(x, x, x, x)) + #define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \ _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ z1, y1, x1, w1); -#define _mm_madd4_ps(L, R0, R1, R2, R3) \ - _mm_add_ps(_mm_madd_ps(L, R0, R1), \ - _mm_madd_ps(L + 2, R2, R3)) - #endif /* cglm_intrin_h */ diff --git a/include/cglm-mat-simd.h b/include/cglm-mat-simd.h index 0877661..6876ee7 100644 --- a/include/cglm-mat-simd.h +++ b/include/cglm-mat-simd.h @@ -10,24 +10,6 @@ #include "cglm-intrin.h" -#define CGLM_MAT_MUL_SSE_4x4f(L, R, D) \ - do { \ - __m128 r0; \ - __m128 r1; \ - __m128 r2; \ - __m128 r3; \ - \ - r0 = _mm_load_ps(R); \ - r1 = _mm_load_ps(R + 4); \ - r2 = _mm_load_ps(R + 8); \ - r3 = _mm_load_ps(R + 12); \ - \ - _mm_store_ps(D, _mm_madd4_ps(L, r0, r1, r2, r3)); \ - _mm_store_ps(D + 4, _mm_madd4_ps(L + 4, r0, r1, r2, r3)); \ - _mm_store_ps(D + 8, _mm_madd4_ps(L + 8, r0, r1, r2, r3)); \ - _mm_store_ps(D + 12, _mm_madd4_ps(L + 12, r0, r1, r2, r3)); \ - } while (0) - #define CGLM_MAT_TRANSP_SSE_4x4f(M, D) \ do { \ __m128 r0; \ @@ -58,12 +40,51 @@ _mm_store_ps(M[3], _mm_mul_ps(_mm_load_ps(M[3]), xmm0)); \ } while (0) +CGLM_INLINE +void +glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { + /* D = R * L (Column-Major) */ + + __m128 l0, l1, l2, l3, r; + + l0 = _mm_load_ps(m1[0]); + l1 = _mm_load_ps(m1[1]); + l2 = _mm_load_ps(m1[2]); + l3 = _mm_load_ps(m1[3]); + + r = _mm_load_ps(m2[0]); + _mm_store_ps(dest[0], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + r = _mm_load_ps(m2[1]); + _mm_store_ps(dest[1], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + r = _mm_load_ps(m2[2]); + _mm_store_ps(dest[2], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + + r = _mm_load_ps(m2[3]); + _mm_store_ps(dest[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); +} + CGLM_INLINE float glm_mat4_det_sse2(mat4 mat) { __m128 v0, dt, t0, t1, t2, t3, t4, r0, r1, r2, r3; - + r0 = _mm_load_ps(mat[0]); r1 = _mm_load_ps(mat[1]); r2 = _mm_load_ps(mat[2]); diff --git a/include/cglm-mat.h b/include/cglm-mat.h index 25f3cc0..fcec7c0 100644 --- a/include/cglm-mat.h +++ b/include/cglm-mat.h @@ -19,52 +19,45 @@ #define GLM_MAT4_IDENTITY (mat4)GLM_MAT4_IDENTITY_INIT -#define glm_mat4_mul_impl(l, r, d) \ - do { \ - d[0] = l[0] * r[0] + l[1] * r[4] + l[2] * r[8] + l[3] * r[12]; \ - d[1] = l[0] * r[1] + l[1] * r[5] + l[2] * r[9] + l[3] * r[13]; \ - d[2] = l[0] * r[2] + l[1] * r[6] + l[2] * r[10] + l[3] * r[14]; \ - d[3] = l[0] * r[3] + l[1] * r[7] + l[2] * r[11] + l[3] * r[15]; \ - d[4] = l[4] * r[0] + l[5] * r[4] + l[6] * r[8] + l[7] * r[12]; \ - d[5] = l[4] * r[1] + l[5] * r[5] + l[6] * r[9] + l[7] * r[13]; \ - d[6] = l[4] * r[2] + l[5] * r[6] + l[6] * r[10] + l[7] * r[14]; \ - d[7] = l[4] * r[3] + l[5] * r[7] + l[6] * r[11] + l[7] * r[15]; \ - d[8] = l[8] * r[0] + l[9] * r[4] + l[10] * r[8] + l[11] * r[12]; \ - d[9] = l[8] * r[1] + l[9] * r[5] + l[10] * r[9] + l[11] * r[13]; \ - d[10] = l[8] * r[2] + l[9] * r[6] + l[10] * r[10] + l[11] * r[14]; \ - d[11] = l[8] * r[3] + l[9] * r[7] + l[10] * r[11] + l[11] * r[15]; \ - d[12] = l[12] * r[0] + l[13] * r[4] + l[14] * r[8] + l[15] * r[12]; \ - d[13] = l[12] * r[1] + l[13] * r[5] + l[14] * r[9] + l[15] * r[13]; \ - d[14] = l[12] * r[2] + l[13] * r[6] + l[14] * r[10] + l[15] * r[14]; \ - d[15] = l[12] * r[3] + l[13] * r[7] + l[14] * r[11] + l[15] * r[15]; \ - } while (0) - CGLM_INLINE void -glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) { - float * __restrict d; - float * __restrict l; - - d = (float *)dest; - l = (float *)m1; - - if (m1 != m2) { - float * __restrict r; - - r = (float *)m2; - +glm_mat4_mul(mat4 l, mat4 r, mat4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - CGLM_MAT_MUL_SSE_4x4f(l, r, d); + glm_mat4_mul_sse2(l, r, d); #else - glm_mat4_mul_impl(l, r, d); + d[0][0] = l[0][0] * r[0][0] + l[1][0] * r[0][1] + + l[2][0] * r[0][2] + l[3][0] * r[0][3]; + d[1][0] = l[0][0] * r[1][0] + l[1][0] * r[1][1] + + l[2][0] * r[1][2] + l[3][0] * r[1][3]; + d[2][0] = l[0][0] * r[2][0] + l[1][0] * r[2][1] + + l[2][0] * r[2][2] + l[3][0] * r[2][3]; + d[3][0] = l[0][0] * r[3][0] + l[1][0] * r[3][1] + + l[2][0] * r[3][2] + l[3][0] * r[3][3]; + d[0][1] = l[0][1] * r[0][0] + l[1][1] * r[0][1] + + l[2][1] * r[0][2] + l[3][1] * r[0][3]; + d[1][1] = l[0][1] * r[1][0] + l[1][1] * r[1][1] + + l[2][1] * r[1][2] + l[3][1] * r[1][3]; + d[2][1] = l[0][1] * r[2][0] + l[1][1] * r[2][1] + + l[2][1] * r[2][2] + l[3][1] * r[2][3]; + d[3][1] = l[0][1] * r[3][0] + l[1][1] * r[3][1] + + l[2][1] * r[3][2] + l[3][1] * r[3][3]; + d[0][2] = l[0][2] * r[0][0] + l[1][2] * r[0][1] + + l[2][2] * r[0][2] + l[3][2] * r[0][3]; + d[1][2] = l[0][2] * r[1][0] + l[1][2] * r[1][1] + + l[2][2] * r[1][2] + l[3][2] * r[1][3]; + d[2][2] = l[0][2] * r[2][0] + l[1][2] * r[2][1] + + l[2][2] * r[2][2] + l[3][2] * r[2][3]; + d[3][2] = l[0][2] * r[3][0] + l[1][2] * r[3][1] + + l[2][2] * r[3][2] + l[3][2] * r[3][3]; + d[0][3] = l[0][3] * r[0][0] + l[1][3] * r[0][1] + + l[2][3] * r[0][2] + l[3][3] * r[0][3]; + d[1][3] = l[0][3] * r[1][0] + l[1][3] * r[1][1] + + l[2][3] * r[1][2] + l[3][3] * r[1][3]; + d[2][3] = l[0][3] * r[2][0] + l[1][3] * r[2][1] + + l[2][3] * r[2][2] + l[3][3] * r[2][3]; + d[3][3] = l[0][3] * r[3][0] + l[1][3] * r[3][1] + + l[2][3] * r[3][2] + l[3][3] * r[3][3]; #endif - } else { -#if defined( __SSE__ ) || defined( __SSE2__ ) - CGLM_MAT_MUL_SSE_4x4f(l, l, d); -#else - glm_mat4_mul_impl(l, l, d); -#endif - } } CGLM_INLINE