improve mat4 mul

This commit is contained in:
Recep Aslantas
2016-09-20 03:13:48 +03:00
parent b42a599310
commit 8a6fe6948a
3 changed files with 77 additions and 72 deletions

View File

@@ -12,23 +12,14 @@
#include <emmintrin.h> #include <emmintrin.h>
/* float */ /* float */
/*
 * Broadcast the scalars L[0] and L[1] to all four lanes, multiply them with
 * R0 and R1 respectively, and return the lane-wise sum:
 *   result = L[0] * R0 + L[1] * R1
 *
 * L is expanded twice, so it must be free of side effects. Every use of L is
 * parenthesized so that pointer expressions such as `c ? p : q` are offset
 * as a whole instead of having `+ 1` bind to their last operand.
 */
#define _mm_madd_ps(L, R0, R1)                                                \
  _mm_add_ps(_mm_mul_ps(_mm_set1_ps(*(L)), (R0)),                             \
             _mm_mul_ps(_mm_set1_ps(*((L) + 1)), (R1)))
/* Lane-wise difference of two products: (M00 * M01) - (M10 * M11). */
#define _mm_msub_ps(M00, M01, M10, M11)                                       \
  _mm_sub_ps(_mm_mul_ps((M00), (M01)),                                        \
             _mm_mul_ps((M10), (M11)))
#define _mm_shuffle1_ps(a, z, y, x, w) \ #define _mm_shuffle1_ps(a, z, y, x, w) \
_mm_shuffle_ps(a, a, _MM_SHUFFLE(z, y, x, w)) _mm_shuffle_ps(a, a, _MM_SHUFFLE(z, y, x, w))
/*
 * Shuffle `a` with itself using the same lane index x for every output
 * position, i.e. replicate lane x of `a` across the whole vector.
 * x must be a compile-time constant; `a` is expanded twice.
 */
#define _mm_shuffle1_ps1(a, x)                                                \
  _mm_shuffle_ps((a), (a), _MM_SHUFFLE((x), (x), (x), (x)))
#define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \ #define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
_mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
z1, y1, x1, w1); z1, y1, x1, w1);
/*
 * Four-term multiply-add built from two _mm_madd_ps calls:
 *   result = L[0] * R0 + L[1] * R1 + L[2] * R2 + L[3] * R3
 * where each L[k] is broadcast to all lanes by _mm_madd_ps.
 *
 * L is expanded more than once and must be free of side effects. It is
 * parenthesized before the `+ 2` offset so that compound pointer
 * expressions are advanced as a whole.
 */
#define _mm_madd4_ps(L, R0, R1, R2, R3)                                       \
  _mm_add_ps(_mm_madd_ps((L), R0, R1),                                        \
             _mm_madd_ps((L) + 2, R2, R3))
#endif /* cglm_intrin_h */ #endif /* cglm_intrin_h */

View File

@@ -10,24 +10,6 @@
#include "cglm-intrin.h" #include "cglm-intrin.h"
/*
 * 4x4 float product over flat 16-element arrays L, R and D.
 *
 * For each i in 0..3 the four floats starting at D + 4*i receive
 *   L[4*i] * R[0..3] + L[4*i+1] * R[4..7] +
 *   L[4*i+2] * R[8..11] + L[4*i+3] * R[12..15]
 *
 * Notes:
 *  - R is read with _mm_load_ps and D is written with _mm_store_ps, the
 *    aligned SSE load/store intrinsics, so both need 16-byte alignment.
 *  - All of R is loaded before the first store to D.
 *  - The macro declares locals r0..r3; arguments must not refer to
 *    identifiers with those names.
 *  - L, R and D are expanded several times and must be free of side effects.
 *    Each is parenthesized before an offset is added so that compound
 *    pointer expressions are advanced as a whole.
 */
#define CGLM_MAT_MUL_SSE_4x4f(L, R, D)                                        \
  do {                                                                        \
    __m128 r0;                                                                \
    __m128 r1;                                                                \
    __m128 r2;                                                                \
    __m128 r3;                                                                \
                                                                              \
    r0 = _mm_load_ps((R));                                                    \
    r1 = _mm_load_ps((R) + 4);                                                \
    r2 = _mm_load_ps((R) + 8);                                                \
    r3 = _mm_load_ps((R) + 12);                                               \
                                                                              \
    _mm_store_ps((D),      _mm_madd4_ps((L),      r0, r1, r2, r3));           \
    _mm_store_ps((D) + 4,  _mm_madd4_ps((L) + 4,  r0, r1, r2, r3));           \
    _mm_store_ps((D) + 8,  _mm_madd4_ps((L) + 8,  r0, r1, r2, r3));           \
    _mm_store_ps((D) + 12, _mm_madd4_ps((L) + 12, r0, r1, r2, r3));           \
  } while (0)
#define CGLM_MAT_TRANSP_SSE_4x4f(M, D) \ #define CGLM_MAT_TRANSP_SSE_4x4f(M, D) \
do { \ do { \
__m128 r0; \ __m128 r0; \
@@ -58,12 +40,51 @@
_mm_store_ps(M[3], _mm_mul_ps(_mm_load_ps(M[3]), xmm0)); \ _mm_store_ps(M[3], _mm_mul_ps(_mm_load_ps(M[3]), xmm0)); \
} while (0) } while (0)
/*
 * SSE2 4x4 matrix multiply.
 *
 * For every i in 0..3:
 *   dest[i] = m2[i][0] * m1[0] + m2[i][1] * m1[1] +
 *             m2[i][2] * m1[2] + m2[i][3] * m1[3]
 * where m1[k] and dest[i] are treated as 4-float vectors and each scalar
 * m2[i][k] is broadcast to all lanes. The original note on this routine
 * reads "D = R * L (Column-Major)".
 *
 * All four vectors of m1 are loaded before anything is stored, and each
 * m2[i] is loaded before dest[i] is written. Loads and stores use
 * _mm_load_ps/_mm_store_ps, the aligned SSE variants.
 */
CGLM_INLINE
void
glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
  __m128 a0, a1, a2, a3;
  __m128 w, lo, hi;

  a0 = _mm_load_ps(m1[0]);
  a1 = _mm_load_ps(m1[1]);
  a2 = _mm_load_ps(m1[2]);
  a3 = _mm_load_ps(m1[3]);

  /* dest[0] */
  w  = _mm_load_ps(m2[0]);
  lo = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 0), a0),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 1), a1));
  hi = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 2), a2),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 3), a3));
  _mm_store_ps(dest[0], _mm_add_ps(lo, hi));

  /* dest[1] */
  w  = _mm_load_ps(m2[1]);
  lo = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 0), a0),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 1), a1));
  hi = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 2), a2),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 3), a3));
  _mm_store_ps(dest[1], _mm_add_ps(lo, hi));

  /* dest[2] */
  w  = _mm_load_ps(m2[2]);
  lo = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 0), a0),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 1), a1));
  hi = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 2), a2),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 3), a3));
  _mm_store_ps(dest[2], _mm_add_ps(lo, hi));

  /* dest[3] */
  w  = _mm_load_ps(m2[3]);
  lo = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 0), a0),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 1), a1));
  hi = _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(w, 2), a2),
                  _mm_mul_ps(_mm_shuffle1_ps1(w, 3), a3));
  _mm_store_ps(dest[3], _mm_add_ps(lo, hi));
}
CGLM_INLINE CGLM_INLINE
float float
glm_mat4_det_sse2(mat4 mat) { glm_mat4_det_sse2(mat4 mat) {
__m128 v0, dt, t0, t1, t2, t3, t4, r0, r1, r2, r3; __m128 v0, dt, t0, t1, t2, t3, t4, r0, r1, r2, r3;
r0 = _mm_load_ps(mat[0]); r0 = _mm_load_ps(mat[0]);
r1 = _mm_load_ps(mat[1]); r1 = _mm_load_ps(mat[1]);
r2 = _mm_load_ps(mat[2]); r2 = _mm_load_ps(mat[2]);

View File

@@ -19,52 +19,45 @@
#define GLM_MAT4_IDENTITY (mat4)GLM_MAT4_IDENTITY_INIT #define GLM_MAT4_IDENTITY (mat4)GLM_MAT4_IDENTITY_INIT
/*
 * Scalar 4x4 product over flat 16-element arrays:
 *   d[4*i + j] = l[4*i]   * r[j]     + l[4*i+1] * r[4 + j] +
 *                l[4*i+2] * r[8 + j] + l[4*i+3] * r[12 + j]
 * for i, j in 0..3.
 *
 * d must not overlap l or r: elements of d are written while l and r are
 * still being read. The arguments are expanded many times and must be free
 * of side effects. Each one is parenthesized before indexing so that
 * pointer expressions such as `buf + 16` are subscripted as a whole.
 */
#define glm_mat4_mul_impl(l, r, d)                                                        \
  do {                                                                                    \
    (d)[0]  = (l)[0]  * (r)[0] + (l)[1]  * (r)[4] + (l)[2]  * (r)[8]  + (l)[3]  * (r)[12]; \
    (d)[1]  = (l)[0]  * (r)[1] + (l)[1]  * (r)[5] + (l)[2]  * (r)[9]  + (l)[3]  * (r)[13]; \
    (d)[2]  = (l)[0]  * (r)[2] + (l)[1]  * (r)[6] + (l)[2]  * (r)[10] + (l)[3]  * (r)[14]; \
    (d)[3]  = (l)[0]  * (r)[3] + (l)[1]  * (r)[7] + (l)[2]  * (r)[11] + (l)[3]  * (r)[15]; \
    (d)[4]  = (l)[4]  * (r)[0] + (l)[5]  * (r)[4] + (l)[6]  * (r)[8]  + (l)[7]  * (r)[12]; \
    (d)[5]  = (l)[4]  * (r)[1] + (l)[5]  * (r)[5] + (l)[6]  * (r)[9]  + (l)[7]  * (r)[13]; \
    (d)[6]  = (l)[4]  * (r)[2] + (l)[5]  * (r)[6] + (l)[6]  * (r)[10] + (l)[7]  * (r)[14]; \
    (d)[7]  = (l)[4]  * (r)[3] + (l)[5]  * (r)[7] + (l)[6]  * (r)[11] + (l)[7]  * (r)[15]; \
    (d)[8]  = (l)[8]  * (r)[0] + (l)[9]  * (r)[4] + (l)[10] * (r)[8]  + (l)[11] * (r)[12]; \
    (d)[9]  = (l)[8]  * (r)[1] + (l)[9]  * (r)[5] + (l)[10] * (r)[9]  + (l)[11] * (r)[13]; \
    (d)[10] = (l)[8]  * (r)[2] + (l)[9]  * (r)[6] + (l)[10] * (r)[10] + (l)[11] * (r)[14]; \
    (d)[11] = (l)[8]  * (r)[3] + (l)[9]  * (r)[7] + (l)[10] * (r)[11] + (l)[11] * (r)[15]; \
    (d)[12] = (l)[12] * (r)[0] + (l)[13] * (r)[4] + (l)[14] * (r)[8]  + (l)[15] * (r)[12]; \
    (d)[13] = (l)[12] * (r)[1] + (l)[13] * (r)[5] + (l)[14] * (r)[9]  + (l)[15] * (r)[13]; \
    (d)[14] = (l)[12] * (r)[2] + (l)[13] * (r)[6] + (l)[14] * (r)[10] + (l)[15] * (r)[14]; \
    (d)[15] = (l)[12] * (r)[3] + (l)[13] * (r)[7] + (l)[14] * (r)[11] + (l)[15] * (r)[15]; \
  } while (0)
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul(mat4 l, mat4 r, mat4 d) {
float * __restrict d;
float * __restrict l;
d = (float *)dest;
l = (float *)m1;
if (m1 != m2) {
float * __restrict r;
r = (float *)m2;
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
CGLM_MAT_MUL_SSE_4x4f(l, r, d); glm_mat4_mul_sse2(l, r, d);
#else #else
glm_mat4_mul_impl(l, r, d); d[0][0] = l[0][0] * r[0][0] + l[1][0] * r[0][1] +
l[2][0] * r[0][2] + l[3][0] * r[0][3];
d[1][0] = l[0][0] * r[1][0] + l[1][0] * r[1][1] +
l[2][0] * r[1][2] + l[3][0] * r[1][3];
d[2][0] = l[0][0] * r[2][0] + l[1][0] * r[2][1] +
l[2][0] * r[2][2] + l[3][0] * r[2][3];
d[3][0] = l[0][0] * r[3][0] + l[1][0] * r[3][1] +
l[2][0] * r[3][2] + l[3][0] * r[3][3];
d[0][1] = l[0][1] * r[0][0] + l[1][1] * r[0][1] +
l[2][1] * r[0][2] + l[3][1] * r[0][3];
d[1][1] = l[0][1] * r[1][0] + l[1][1] * r[1][1] +
l[2][1] * r[1][2] + l[3][1] * r[1][3];
d[2][1] = l[0][1] * r[2][0] + l[1][1] * r[2][1] +
l[2][1] * r[2][2] + l[3][1] * r[2][3];
d[3][1] = l[0][1] * r[3][0] + l[1][1] * r[3][1] +
l[2][1] * r[3][2] + l[3][1] * r[3][3];
d[0][2] = l[0][2] * r[0][0] + l[1][2] * r[0][1] +
l[2][2] * r[0][2] + l[3][2] * r[0][3];
d[1][2] = l[0][2] * r[1][0] + l[1][2] * r[1][1] +
l[2][2] * r[1][2] + l[3][2] * r[1][3];
d[2][2] = l[0][2] * r[2][0] + l[1][2] * r[2][1] +
l[2][2] * r[2][2] + l[3][2] * r[2][3];
d[3][2] = l[0][2] * r[3][0] + l[1][2] * r[3][1] +
l[2][2] * r[3][2] + l[3][2] * r[3][3];
d[0][3] = l[0][3] * r[0][0] + l[1][3] * r[0][1] +
l[2][3] * r[0][2] + l[3][3] * r[0][3];
d[1][3] = l[0][3] * r[1][0] + l[1][3] * r[1][1] +
l[2][3] * r[1][2] + l[3][3] * r[1][3];
d[2][3] = l[0][3] * r[2][0] + l[1][3] * r[2][1] +
l[2][3] * r[2][2] + l[3][3] * r[2][3];
d[3][3] = l[0][3] * r[3][0] + l[1][3] * r[3][1] +
l[2][3] * r[3][2] + l[3][3] * r[3][3];
#endif #endif
} else {
#if defined( __SSE__ ) || defined( __SSE2__ )
CGLM_MAT_MUL_SSE_4x4f(l, l, d);
#else
glm_mat4_mul_impl(l, l, d);
#endif
}
} }
CGLM_INLINE CGLM_INLINE