mirror of
https://github.com/recp/cglm.git
synced 2025-10-04 09:08:53 +00:00
improve mat4 mul
This commit is contained in:
@@ -12,23 +12,14 @@
|
|||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
/* float */
|
/* float */
|
||||||
/*
 * Two-term multiply-add with scalar weights read from memory:
 *   splat(L[0]) * R0 + splat(L[1]) * R1
 * L is a pointer to at least two floats; R0 and R1 are __m128 values.
 * L is evaluated twice, so it must be free of side effects.
 */
#define _mm_madd_ps(L, R0, R1)                                                \
  _mm_add_ps(_mm_mul_ps(_mm_set1_ps(*(L)), (R0)),                             \
             _mm_mul_ps(_mm_set1_ps(*((L) + 1)), (R1)))

/* Lane-wise difference of two products: M00 * M01 - M10 * M11. */
#define _mm_msub_ps(M00, M01, M10, M11)                                       \
  _mm_sub_ps(_mm_mul_ps((M00), (M01)),                                        \
             _mm_mul_ps((M10), (M11)))

/*
 * Permute the lanes of a single vector. The four indices are forwarded to
 * _MM_SHUFFLE in the given order and must be compile-time constants.
 * `a` is evaluated twice.
 */
#define _mm_shuffle1_ps(a, z, y, x, w)                                        \
  _mm_shuffle_ps((a), (a), _MM_SHUFFLE(z, y, x, w))

/* Broadcast lane `x` (a compile-time constant) of `a` to all four lanes. */
#define _mm_shuffle1_ps1(a, x)                                                \
  _mm_shuffle_ps((a), (a), _MM_SHUFFLE(x, x, x, x))

/*
 * Shuffle `a` and `b` together with (z0, y0, x0, w0), then permute the
 * intermediate result with (z1, y1, x1, w1).
 *
 * The expansion deliberately carries no trailing semicolon so the macro can
 * be used as an ordinary expression (e.g. as a function argument); callers
 * terminate the statement themselves.
 */
#define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                 \
  _mm_shuffle1_ps(_mm_shuffle_ps((a), (b), _MM_SHUFFLE(z0, y0, x0, w0)),      \
                  z1, y1, x1, w1)

/*
 * Four-term multiply-add:
 *   splat(L[0]) * R0 + splat(L[1]) * R1 + splat(L[2]) * R2 + splat(L[3]) * R3
 * evaluated as (term0 + term1) + (term2 + term3).
 * L is a pointer to at least four floats and is evaluated several times.
 */
#define _mm_madd4_ps(L, R0, R1, R2, R3)                                       \
  _mm_add_ps(_mm_madd_ps((L), (R0), (R1)),                                    \
             _mm_madd_ps((L) + 2, (R2), (R3)))
|
|
||||||
|
|
||||||
#endif /* cglm_intrin_h */
|
#endif /* cglm_intrin_h */
|
||||||
|
@@ -10,24 +10,6 @@
|
|||||||
|
|
||||||
#include "cglm-intrin.h"
|
#include "cglm-intrin.h"
|
||||||
|
|
||||||
/*
 * SSE 4x4 float product over flat 16-element arrays, written as a statement
 * macro. For each group of four floats i = 0..3:
 *
 *   D[4*i .. 4*i+3] = L[4*i]   * R[0..3]  + L[4*i+1] * R[4..7]
 *                   + L[4*i+2] * R[8..11] + L[4*i+3] * R[12..15]
 *
 * L, R and D are pointers to float. R and D are accessed with
 * _mm_load_ps/_mm_store_ps, which require 16-byte alignment.
 *
 * Each argument is evaluated exactly once and captured before the vector
 * temporaries are declared, so arbitrary pointer expressions (including ones
 * with low-precedence operators) are safe to pass.
 *
 * All of R is loaded before the first store, and each group of L is read
 * before the matching group of D is written.
 */
#define CGLM_MAT_MUL_SSE_4x4f(L, R, D)                                        \
  do {                                                                        \
    const float *cglm_mul_l = (L);                                            \
    const float *cglm_mul_r = (R);                                            \
    float       *cglm_mul_d = (D);                                            \
    __m128       r0;                                                          \
    __m128       r1;                                                          \
    __m128       r2;                                                          \
    __m128       r3;                                                          \
                                                                              \
    r0 = _mm_load_ps(cglm_mul_r);                                             \
    r1 = _mm_load_ps(cglm_mul_r + 4);                                         \
    r2 = _mm_load_ps(cglm_mul_r + 8);                                         \
    r3 = _mm_load_ps(cglm_mul_r + 12);                                        \
                                                                              \
    _mm_store_ps(cglm_mul_d,                                                  \
                 _mm_madd4_ps(cglm_mul_l, r0, r1, r2, r3));                   \
    _mm_store_ps(cglm_mul_d + 4,                                              \
                 _mm_madd4_ps(cglm_mul_l + 4, r0, r1, r2, r3));               \
    _mm_store_ps(cglm_mul_d + 8,                                              \
                 _mm_madd4_ps(cglm_mul_l + 8, r0, r1, r2, r3));               \
    _mm_store_ps(cglm_mul_d + 12,                                             \
                 _mm_madd4_ps(cglm_mul_l + 12, r0, r1, r2, r3));              \
  } while (0)
|
|
||||||
|
|
||||||
#define CGLM_MAT_TRANSP_SSE_4x4f(M, D) \
|
#define CGLM_MAT_TRANSP_SSE_4x4f(M, D) \
|
||||||
do { \
|
do { \
|
||||||
__m128 r0; \
|
__m128 r0; \
|
||||||
@@ -58,6 +40,45 @@
|
|||||||
_mm_store_ps(M[3], _mm_mul_ps(_mm_load_ps(M[3]), xmm0)); \
|
_mm_store_ps(M[3], _mm_mul_ps(_mm_load_ps(M[3]), xmm0)); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
CGLM_INLINE
|
||||||
|
void
|
||||||
|
glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
|
||||||
|
/* D = R * L (Column-Major) */
|
||||||
|
|
||||||
|
__m128 l0, l1, l2, l3, r;
|
||||||
|
|
||||||
|
l0 = _mm_load_ps(m1[0]);
|
||||||
|
l1 = _mm_load_ps(m1[1]);
|
||||||
|
l2 = _mm_load_ps(m1[2]);
|
||||||
|
l3 = _mm_load_ps(m1[3]);
|
||||||
|
|
||||||
|
r = _mm_load_ps(m2[0]);
|
||||||
|
_mm_store_ps(dest[0],
|
||||||
|
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
|
||||||
|
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
|
||||||
|
r = _mm_load_ps(m2[1]);
|
||||||
|
_mm_store_ps(dest[1],
|
||||||
|
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
|
||||||
|
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
|
||||||
|
r = _mm_load_ps(m2[2]);
|
||||||
|
_mm_store_ps(dest[2],
|
||||||
|
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
|
||||||
|
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
|
||||||
|
|
||||||
|
r = _mm_load_ps(m2[3]);
|
||||||
|
_mm_store_ps(dest[3],
|
||||||
|
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
|
||||||
|
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
|
||||||
|
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
|
||||||
|
}
|
||||||
|
|
||||||
CGLM_INLINE
|
CGLM_INLINE
|
||||||
float
|
float
|
||||||
glm_mat4_det_sse2(mat4 mat) {
|
glm_mat4_det_sse2(mat4 mat) {
|
||||||
|
@@ -19,52 +19,45 @@
|
|||||||
|
|
||||||
#define GLM_MAT4_IDENTITY (mat4)GLM_MAT4_IDENTITY_INIT
|
#define GLM_MAT4_IDENTITY (mat4)GLM_MAT4_IDENTITY_INIT
|
||||||
|
|
||||||
/*
 * Scalar 4x4 float product over flat 16-element arrays, written as a
 * statement macro. For i, j in 0..3:
 *
 *   d[4*i + j] = l[4*i]     * r[j]     + l[4*i + 1] * r[4 + j]
 *              + l[4*i + 2] * r[8 + j] + l[4*i + 3] * r[12 + j]
 *
 * Elements are written in increasing index order (d[0], d[1], ... d[15]),
 * each one computed from l and r as they are at that moment, so `d` must not
 * overlap `l` or `r` (`l` and `r` may be the same array).
 *
 * Every argument is evaluated exactly once and captured in a local pointer,
 * so pointer expressions such as `base + 16` are safe to pass.
 */
#define glm_mat4_mul_impl(l, r, d)                                            \
  do {                                                                        \
    const float *cglm_mul_l = (l);                                            \
    const float *cglm_mul_r = (r);                                            \
    float       *cglm_mul_d = (d);                                            \
    int          cglm_mul_i;                                                  \
    int          cglm_mul_j;                                                  \
                                                                              \
    for (cglm_mul_i = 0; cglm_mul_i < 16; cglm_mul_i += 4) {                  \
      for (cglm_mul_j = 0; cglm_mul_j < 4; cglm_mul_j++) {                    \
        cglm_mul_d[cglm_mul_i + cglm_mul_j] =                                 \
            cglm_mul_l[cglm_mul_i]     * cglm_mul_r[cglm_mul_j]      +        \
            cglm_mul_l[cglm_mul_i + 1] * cglm_mul_r[cglm_mul_j + 4]  +        \
            cglm_mul_l[cglm_mul_i + 2] * cglm_mul_r[cglm_mul_j + 8]  +        \
            cglm_mul_l[cglm_mul_i + 3] * cglm_mul_r[cglm_mul_j + 12];         \
      }                                                                       \
    }                                                                         \
  } while (0)
|
|
||||||
|
|
||||||
CGLM_INLINE
|
CGLM_INLINE
|
||||||
void
|
void
|
||||||
glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) {
|
glm_mat4_mul(mat4 l, mat4 r, mat4 d) {
|
||||||
float * __restrict d;
|
|
||||||
float * __restrict l;
|
|
||||||
|
|
||||||
d = (float *)dest;
|
|
||||||
l = (float *)m1;
|
|
||||||
|
|
||||||
if (m1 != m2) {
|
|
||||||
float * __restrict r;
|
|
||||||
|
|
||||||
r = (float *)m2;
|
|
||||||
|
|
||||||
#if defined( __SSE__ ) || defined( __SSE2__ )
|
#if defined( __SSE__ ) || defined( __SSE2__ )
|
||||||
CGLM_MAT_MUL_SSE_4x4f(l, r, d);
|
glm_mat4_mul_sse2(l, r, d);
|
||||||
#else
|
#else
|
||||||
glm_mat4_mul_impl(l, r, d);
|
d[0][0] = l[0][0] * r[0][0] + l[1][0] * r[0][1] +
|
||||||
|
l[2][0] * r[0][2] + l[3][0] * r[0][3];
|
||||||
|
d[1][0] = l[0][0] * r[1][0] + l[1][0] * r[1][1] +
|
||||||
|
l[2][0] * r[1][2] + l[3][0] * r[1][3];
|
||||||
|
d[2][0] = l[0][0] * r[2][0] + l[1][0] * r[2][1] +
|
||||||
|
l[2][0] * r[2][2] + l[3][0] * r[2][3];
|
||||||
|
d[3][0] = l[0][0] * r[3][0] + l[1][0] * r[3][1] +
|
||||||
|
l[2][0] * r[3][2] + l[3][0] * r[3][3];
|
||||||
|
d[0][1] = l[0][1] * r[0][0] + l[1][1] * r[0][1] +
|
||||||
|
l[2][1] * r[0][2] + l[3][1] * r[0][3];
|
||||||
|
d[1][1] = l[0][1] * r[1][0] + l[1][1] * r[1][1] +
|
||||||
|
l[2][1] * r[1][2] + l[3][1] * r[1][3];
|
||||||
|
d[2][1] = l[0][1] * r[2][0] + l[1][1] * r[2][1] +
|
||||||
|
l[2][1] * r[2][2] + l[3][1] * r[2][3];
|
||||||
|
d[3][1] = l[0][1] * r[3][0] + l[1][1] * r[3][1] +
|
||||||
|
l[2][1] * r[3][2] + l[3][1] * r[3][3];
|
||||||
|
d[0][2] = l[0][2] * r[0][0] + l[1][2] * r[0][1] +
|
||||||
|
l[2][2] * r[0][2] + l[3][2] * r[0][3];
|
||||||
|
d[1][2] = l[0][2] * r[1][0] + l[1][2] * r[1][1] +
|
||||||
|
l[2][2] * r[1][2] + l[3][2] * r[1][3];
|
||||||
|
d[2][2] = l[0][2] * r[2][0] + l[1][2] * r[2][1] +
|
||||||
|
l[2][2] * r[2][2] + l[3][2] * r[2][3];
|
||||||
|
d[3][2] = l[0][2] * r[3][0] + l[1][2] * r[3][1] +
|
||||||
|
l[2][2] * r[3][2] + l[3][2] * r[3][3];
|
||||||
|
d[0][3] = l[0][3] * r[0][0] + l[1][3] * r[0][1] +
|
||||||
|
l[2][3] * r[0][2] + l[3][3] * r[0][3];
|
||||||
|
d[1][3] = l[0][3] * r[1][0] + l[1][3] * r[1][1] +
|
||||||
|
l[2][3] * r[1][2] + l[3][3] * r[1][3];
|
||||||
|
d[2][3] = l[0][3] * r[2][0] + l[1][3] * r[2][1] +
|
||||||
|
l[2][3] * r[2][2] + l[3][3] * r[2][3];
|
||||||
|
d[3][3] = l[0][3] * r[3][0] + l[1][3] * r[3][1] +
|
||||||
|
l[2][3] * r[3][2] + l[3][3] * r[3][3];
|
||||||
#endif
|
#endif
|
||||||
} else {
|
|
||||||
#if defined( __SSE__ ) || defined( __SSE2__ )
|
|
||||||
CGLM_MAT_MUL_SSE_4x4f(l, l, d);
|
|
||||||
#else
|
|
||||||
glm_mat4_mul_impl(l, l, d);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CGLM_INLINE
|
CGLM_INLINE
|
||||||
|
Reference in New Issue
Block a user