Mirror of https://github.com/recp/cglm.git (synced 2025-10-03 16:51:35 +00:00)
Merge pull request #413 from recp/optimize-inv
WIP: More Optimizations and SIMD fixes for MSVC & ARM
@@ -76,7 +76,7 @@ SSE and SSE2 Shuffle Option
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **_mm_shuffle_ps** generates **shufps** instruction even if registers are same.
 You can force it to generate **pshufd** instruction by defining
-**CGLM_USE_INT_DOMAIN** macro. As default it is not defined.
+**CGLM_NO_INT_DOMAIN** macro. As default it is not defined.
 
 SSE3 and SSE4 Dot Product Options
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
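For readers unfamiliar with the two instruction choices the docs mention, here is a standalone sketch (not part of the PR) of the float-domain and integer-domain shuffles the macro toggles between; the function names are illustrative only:

    #include <emmintrin.h>

    /* float domain: compiles to shufps */
    static inline __m128 shuff1_fp(__m128 x) {
      return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3));
    }

    /* int domain: compiles to pshufd, avoids a register copy when src == dst */
    static inline __m128 shuff1_int(__m128 x) {
      return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x),
                                                _MM_SHUFFLE(0, 1, 2, 3)));
    }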
@@ -235,7 +235,7 @@ glm_mat2_scale(mat2 m, float s) {
   glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]),
                                   wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s)));
+  glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s)));
 #else
@@ -334,7 +334,7 @@ glm_mat3_det(mat3 mat) {
         d = mat[1][0], e = mat[1][1], f = mat[1][2],
         g = mat[2][0], h = mat[2][1], i = mat[2][2];
 
-  return a * (e * i - h * f) - d * (b * i - c * h) + g * (b * f - c * e);
+  return a * (e * i - h * f) - d * (b * i - h * c) + g * (b * f - e * c);
 }
 
 /*!
@@ -346,24 +346,22 @@ glm_mat3_det(mat3 mat) {
 CGLM_INLINE
 void
 glm_mat3_inv(mat3 mat, mat3 dest) {
-  float det;
   float a = mat[0][0], b = mat[0][1], c = mat[0][2],
         d = mat[1][0], e = mat[1][1], f = mat[1][2],
-        g = mat[2][0], h = mat[2][1], i = mat[2][2];
+        g = mat[2][0], h = mat[2][1], i = mat[2][2],
 
-  dest[0][0] =   e * i - f * h;
-  dest[0][1] = -(b * i - h * c);
-  dest[0][2] =   b * f - e * c;
-  dest[1][0] = -(d * i - g * f);
-  dest[1][1] =   a * i - c * g;
-  dest[1][2] = -(a * f - d * c);
-  dest[2][0] =   d * h - g * e;
-  dest[2][1] = -(a * h - g * b);
-  dest[2][2] =   a * e - b * d;
+        c1  = e * i - f * h, c2 = d * i - g * f, c3 = d * h - g * e,
+        idt = 1.0f / (a * c1 - b * c2 + c * c3), ndt = -idt;
 
-  det = 1.0f / (a * dest[0][0] + b * dest[1][0] + c * dest[2][0]);
-
-  glm_mat3_scale(dest, det);
+  dest[0][0] = idt * c1;
+  dest[0][1] = ndt * (b * i - h * c);
+  dest[0][2] = idt * (b * f - e * c);
+  dest[1][0] = ndt * c2;
+  dest[1][1] = idt * (a * i - g * c);
+  dest[1][2] = ndt * (a * f - d * c);
+  dest[2][0] = idt * c3;
+  dest[2][1] = ndt * (a * h - g * b);
+  dest[2][2] = idt * (a * e - d * b);
 }
 
 /*!
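As a quick illustration of the reworked glm_mat3_inv (usage sketch, not part of the PR; a diagonal matrix is used so the expected inverse is obvious):

    #include <cglm/cglm.h>
    #include <stdio.h>

    int main(void) {
      /* diagonal scale matrix; its inverse is the reciprocal scale */
      mat3 m = {{2.0f, 0.0f, 0.0f},
                {0.0f, 4.0f, 0.0f},
                {0.0f, 0.0f, 8.0f}};
      mat3 inv;

      glm_mat3_inv(m, inv);
      printf("%f %f %f\n", inv[0][0], inv[1][1], inv[2][2]); /* 0.5 0.25 0.125 */
      return 0;
    }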
@@ -520,6 +520,8 @@ void
 glm_mat4_transpose_to(mat4 m, mat4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_transp_wasm(m, dest);
+#elif defined(__AVX__)
+  glm_mat4_transp_avx(m, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_transp_sse2(m, dest);
 #elif defined(CGLM_NEON_FP)
@@ -546,6 +548,8 @@ void
 glm_mat4_transpose(mat4 m) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glm_mat4_transp_wasm(m, m);
+#elif defined(__AVX__)
+  glm_mat4_transp_avx(m, m);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_transp_sse2(m, m);
 #elif defined(CGLM_NEON_FP)
@@ -652,46 +656,37 @@ glm_mat4_inv(mat4 mat, mat4 dest) {
 #elif defined(CGLM_NEON_FP)
   glm_mat4_inv_neon(mat, dest);
 #else
-  float t[6];
-  float det;
   float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3],
         e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3],
         i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3],
-        m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3];
+        m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3],
 
-  t[0] = k * p - o * l; t[1] = j * p - n * l; t[2] = j * o - n * k;
-  t[3] = i * p - m * l; t[4] = i * o - m * k; t[5] = i * n - m * j;
+        c1  = k * p - l * o, c2  = c * h - d * g, c3  = i * p - l * m,
+        c4  = a * h - d * e, c5  = j * p - l * n, c6  = b * h - d * f,
+        c7  = i * n - j * m, c8  = a * f - b * e, c9  = j * o - k * n,
+        c10 = b * g - c * f, c11 = i * o - k * m, c12 = a * g - c * e,
 
-  dest[0][0] =  f * t[0] - g * t[1] + h * t[2];
-  dest[1][0] =-(e * t[0] - g * t[3] + h * t[4]);
-  dest[2][0] =  e * t[1] - f * t[3] + h * t[5];
-  dest[3][0] =-(e * t[2] - f * t[4] + g * t[5]);
+        idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt;
 
-  dest[0][1] =-(b * t[0] - c * t[1] + d * t[2]);
-  dest[1][1] =  a * t[0] - c * t[3] + d * t[4];
-  dest[2][1] =-(a * t[1] - b * t[3] + d * t[5]);
-  dest[3][1] =  a * t[2] - b * t[4] + c * t[5];
+  dest[0][0] = (f * c1 - g * c5 + h * c9)   * idt;
+  dest[0][1] = (b * c1 - c * c5 + d * c9)   * ndt;
+  dest[0][2] = (n * c2 - o * c6 + p * c10)  * idt;
+  dest[0][3] = (j * c2 - k * c6 + l * c10)  * ndt;
 
-  t[0] = g * p - o * h; t[1] = f * p - n * h; t[2] = f * o - n * g;
-  t[3] = e * p - m * h; t[4] = e * o - m * g; t[5] = e * n - m * f;
+  dest[1][0] = (e * c1 - g * c3 + h * c11)  * ndt;
+  dest[1][1] = (a * c1 - c * c3 + d * c11)  * idt;
+  dest[1][2] = (m * c2 - o * c4 + p * c12)  * ndt;
+  dest[1][3] = (i * c2 - k * c4 + l * c12)  * idt;
 
-  dest[0][2] =  b * t[0] - c * t[1] + d * t[2];
-  dest[1][2] =-(a * t[0] - c * t[3] + d * t[4]);
-  dest[2][2] =  a * t[1] - b * t[3] + d * t[5];
-  dest[3][2] =-(a * t[2] - b * t[4] + c * t[5]);
+  dest[2][0] = (e * c5 - f * c3 + h * c7)   * idt;
+  dest[2][1] = (a * c5 - b * c3 + d * c7)   * ndt;
+  dest[2][2] = (m * c6 - n * c4 + p * c8)   * idt;
+  dest[2][3] = (i * c6 - j * c4 + l * c8)   * ndt;
 
-  t[0] = g * l - k * h; t[1] = f * l - j * h; t[2] = f * k - j * g;
-  t[3] = e * l - i * h; t[4] = e * k - i * g; t[5] = e * j - i * f;
-
-  dest[0][3] =-(b * t[0] - c * t[1] + d * t[2]);
-  dest[1][3] =  a * t[0] - c * t[3] + d * t[4];
-  dest[2][3] =-(a * t[1] - b * t[3] + d * t[5]);
-  dest[3][3] =  a * t[2] - b * t[4] + c * t[5];
-
-  det = 1.0f / (a * dest[0][0] + b * dest[1][0]
-              + c * dest[2][0] + d * dest[3][0]);
-
-  glm_mat4_scale_p(dest, det);
+  dest[3][0] = (e * c9 - f * c11 + g * c7)  * ndt;
+  dest[3][1] = (a * c9 - b * c11 + c * c7)  * idt;
+  dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt;
+  dest[3][3] = (i * c10 - j * c12 + k * c8) * idt;
 #endif
 }
 
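A hedged sanity check for the cofactor-based scalar path above (standalone test sketch, not part of the PR): invert a translation matrix and confirm that M * M^-1 comes back as the identity.

    #include <cglm/cglm.h>
    #include <assert.h>
    #include <math.h>

    int main(void) {
      mat4 m, inv, id;

      glm_translate_make(m, (vec3){1.0f, 2.0f, 3.0f});
      glm_mat4_inv(m, inv);
      glm_mat4_mul(m, inv, id);

      /* every entry should match the identity within float tolerance */
      for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
          assert(fabsf(id[i][j] - (i == j ? 1.0f : 0.0f)) < 1e-6f);
      return 0;
    }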
@@ -63,8 +63,21 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_
 static inline
 float32x4_t
 glmm_vhadd(float32x4_t v) {
+#if CGLM_ARM64
+  float32x4_t p;
+  p = vpaddq_f32(v, v);    /* [a+b, c+d, a+b, c+d] */
+  return vpaddq_f32(p, p); /* [t, t, t, t] */;
+#else
   return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
                    vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
+#endif
+  /* TODO: measure speed of this compare to above */
+  /* return vdupq_n_f32(vaddvq_f32(v)); */
+
+  /*
+  return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
+                   vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
+  */
 /*
  this seems slower:
  v = vaddq_f32(v, vrev64q_f32(v));
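For reference, a standalone sketch (assuming an AArch64 toolchain, since vpaddq_f32 is A64-only; not part of the PR) of how the two pairwise adds broadcast the horizontal sum into every lane:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      const float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float       out[4];

      float32x4_t v = vld1q_f32(in);
      float32x4_t p = vpaddq_f32(v, v); /* {1+2, 3+4, 1+2, 3+4} */
      float32x4_t s = vpaddq_f32(p, p); /* {10, 10, 10, 10}     */

      vst1q_f32(out, s);
      printf("%g\n", out[0]); /* 10 */
      return 0;
    }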
@@ -108,6 +121,12 @@ glmm_dot(float32x4_t a, float32x4_t b) {
   return glmm_hadd(vmulq_f32(a, b));
 }
 
+static inline
+float32x4_t
+glmm_vdot(float32x4_t a, float32x4_t b) {
+  return glmm_vhadd(vmulq_f32(a, b));
+}
+
 static inline
 float
 glmm_norm(float32x4_t a) {
@@ -12,16 +12,46 @@
 #include "../../common.h"
 #include "../intrin.h"
 
-#include <immintrin.h>
-
 CGLM_INLINE
 void
 glm_mat4_scale_avx(mat4 m, float s) {
-  __m256 y0;
-  y0 = _mm256_set1_ps(s);
+  __m256 y0, y1, y2, y3, y4;
 
-  glmm_store256(m[0], _mm256_mul_ps(y0, glmm_load256(m[0])));
-  glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2])));
+  y0 = glmm_load256(m[0]); /* h g f e d c b a */
+  y1 = glmm_load256(m[2]); /* p o n m l k j i */
+
+  y2 = _mm256_broadcast_ss(&s);
+
+  y3 = _mm256_mul_ps(y0, y2);
+  y4 = _mm256_mul_ps(y1, y2);
+
+  glmm_store256(m[0], y3);
+  glmm_store256(m[2], y4);
+}
+
+/* TODO: this must be tested and compared to SSE version, may be slower!!! */
+CGLM_INLINE
+void
+glm_mat4_transp_avx(mat4 m, mat4 dest) {
+  __m256 y0, y1, y2, y3;
+
+  y0 = glmm_load256(m[0]);                   /* h g f e d c b a */
+  y1 = glmm_load256(m[2]);                   /* p o n m l k j i */
+
+  y2 = _mm256_unpacklo_ps(y0, y1);           /* n f m e j b i a */
+  y3 = _mm256_unpackhi_ps(y0, y1);           /* p h o g l d k c */
+
+  y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* l d k c j b i a */
+  y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p h o g n f m e */
+
+  y2 = _mm256_unpacklo_ps(y0, y1);           /* o k g c m i e a */
+  y3 = _mm256_unpackhi_ps(y0, y1);           /* p l h d n j f b */
+
+  y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* n j f b m i e a */
+  y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p l h d o k g c */
+
+  glmm_store256(dest[0], y0);
+  glmm_store256(dest[2], y1);
 }
 
 CGLM_INLINE
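A standalone check of the unpack/permute2f128 transpose pattern used by glm_mat4_transp_avx above (plain C, assumes AVX; not part of the PR):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      float m[16] = { 0,  1,  2,  3,
                      4,  5,  6,  7,
                      8,  9, 10, 11,
                     12, 13, 14, 15};
      float t[16];

      __m256 y0 = _mm256_loadu_ps(m);     /* rows 0 and 1 */
      __m256 y1 = _mm256_loadu_ps(m + 8); /* rows 2 and 3 */

      __m256 y2 = _mm256_unpacklo_ps(y0, y1);
      __m256 y3 = _mm256_unpackhi_ps(y0, y1);

      y0 = _mm256_permute2f128_ps(y2, y3, 0x20);
      y1 = _mm256_permute2f128_ps(y2, y3, 0x31);

      y2 = _mm256_unpacklo_ps(y0, y1);
      y3 = _mm256_unpackhi_ps(y0, y1);

      y0 = _mm256_permute2f128_ps(y2, y3, 0x20);
      y1 = _mm256_permute2f128_ps(y2, y3, 0x31);

      _mm256_storeu_ps(t,     y0);
      _mm256_storeu_ps(t + 8, y1);

      /* first output row is the first input column */
      printf("%g %g %g %g\n", t[0], t[1], t[2], t[3]); /* 0 4 8 12 */
      return 0;
    }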
@@ -29,7 +59,8 @@ void
 glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   /* D = R * L (Column-Major) */
 
-  __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+  __m256  y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13;
+  __m256i yi0, yi1, yi2, yi3;
 
   y0 = glmm_load256(m2[0]); /* h g f e d c b a */
   y1 = glmm_load256(m2[2]); /* p o n m l k j i */
@@ -41,35 +72,43 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
   y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */
 
+  yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0);
+  yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2);
+  yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1);
+  yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3);
+
   /* f f f f a a a a */
   /* h h h h c c c c */
   /* e e e e b b b b */
   /* g g g g d d d d */
-  y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
-  y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
-  y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
-  y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
-
-  glmm_store256(dest[0],
-                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                            _mm256_mul_ps(y3, y7)),
-                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                            _mm256_mul_ps(y5, y9))));
+  y6 = _mm256_permutevar_ps(y0, yi0);
+  y7 = _mm256_permutevar_ps(y0, yi1);
+  y8 = _mm256_permutevar_ps(y0, yi2);
+  y9 = _mm256_permutevar_ps(y0, yi3);
 
   /* n n n n i i i i */
   /* p p p p k k k k */
   /* m m m m j j j j */
   /* o o o o l l l l */
-  y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
-  y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
-  y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
-  y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
+  y10 = _mm256_permutevar_ps(y1, yi0);
+  y11 = _mm256_permutevar_ps(y1, yi1);
+  y12 = _mm256_permutevar_ps(y1, yi2);
+  y13 = _mm256_permutevar_ps(y1, yi3);
 
-  glmm_store256(dest[2],
-                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                            _mm256_mul_ps(y3, y7)),
-                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                            _mm256_mul_ps(y5, y9))));
+  y0 = _mm256_mul_ps(y2, y6);
+  y1 = _mm256_mul_ps(y2, y10);
+
+  y0 = glmm256_fmadd(y3, y7,  y0);
+  y1 = glmm256_fmadd(y3, y11, y1);
+
+  y0 = glmm256_fmadd(y4, y8,  y0);
+  y1 = glmm256_fmadd(y4, y12, y1);
+
+  y0 = glmm256_fmadd(y5, y9,  y0);
+  y1 = glmm256_fmadd(y5, y13, y1);
+
+  glmm_store256(dest[0], y0);
+  glmm_store256(dest[2], y1);
 }
 
 #endif
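glmm256_fmadd above is cglm's 256-bit fused multiply-add helper. The sketch below shows the likely shape of such a wrapper; it is an assumption for illustration, not the library's actual definition:

    #include <immintrin.h>

    /* sketch: a * b + c, fused when FMA is available, mul+add otherwise */
    static inline __m256 fmadd256_sketch(__m256 a, __m256 b, __m256 c) {
    #ifdef __FMA__
      return _mm256_fmadd_ps(a, b, c);
    #else
      return _mm256_add_ps(_mm256_mul_ps(a, b), c);
    #endif
    }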
@@ -8,7 +8,7 @@
 #ifndef cglm_intrin_h
 #define cglm_intrin_h
 
-#if defined( _MSC_VER )
+#if defined(_MSC_VER) && !defined(_M_ARM64EC)
 #  if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
 #    ifndef __SSE__
 #      define __SSE__
@@ -20,13 +20,37 @@
 #    ifndef __SSE__
 #      define __SSE__
 #    endif
-#endif
+#  endif
 /* do not use alignment for older visual studio versions */
-#  if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
+/* also ARM32 also causes similar error, disable it for now on ARM32 too */
+#  if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */
 #    define CGLM_ALL_UNALIGNED
 #  endif
 #endif
 
+#ifdef __AVX__
+#  include <immintrin.h>
+#  define CGLM_AVX_FP 1
+#  ifndef __SSE2__
+#    define __SSE2__
+#  endif
+#  ifndef __SSE3__
+#    define __SSE3__
+#  endif
+#  ifndef __SSE4__
+#    define __SSE4__
+#  endif
+#  ifndef __SSE4_1__
+#    define __SSE4_1__
+#  endif
+#  ifndef __SSE4_2__
+#    define __SSE4_2__
+#  endif
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
 #if defined(__SSE__)
 #  include <xmmintrin.h>
 #  define CGLM_SSE_FP 1
@@ -64,14 +88,6 @@
 #  endif
 #endif
 
-#ifdef __AVX__
-#  include <immintrin.h>
-#  define CGLM_AVX_FP 1
-#  ifndef CGLM_SIMD_x86
-#    define CGLM_SIMD_x86
-#  endif
-#endif
-
 /* ARM Neon */
 #if defined(_WIN32) && defined(_MSC_VER)
 /* TODO: non-ARM stuff already inported, will this be better option */
@@ -100,7 +116,7 @@
 #else /* non-windows */
 #  if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #    include <arm_neon.h>
-#    if defined(__ARM_NEON_FP)
+#    if defined(__ARM_NEON_FP) || defined(__ARM_FP)
 #      define CGLM_NEON_FP 1
 #    endif
 #    ifndef CGLM_SIMD_ARM
@@ -172,6 +172,8 @@ glm_mat4_det_neon(mat4 mat) {
   return glmm_hadd(vmulq_f32(x2, r0));
 }
 
+/* old one */
+#if 0
 CGLM_INLINE
 void
 glm_mat4_inv_neon(mat4 mat, mat4 dest) {
@@ -297,7 +299,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
                  vget_low_f32(vzipq_f32(v2, v3).val[0]));
 
   /*
-  x0 = glmm_div(glmm_set1(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
+  x0 = glmm_div(glmm_set1_rval(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
 
   glmm_store(dest[0], vmulq_f32(v0, x0));
   glmm_store(dest[1], vmulq_f32(v1, x0));
@@ -312,6 +314,155 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
   glmm_store(dest[2], glmm_div(v2, x0));
   glmm_store(dest[3], glmm_div(v3, x0));
 }
+#endif
+
+CGLM_INLINE
+void
+glm_mat4_inv_neon(mat4 mat, mat4 dest) {
+  float32x4_t   r0, r1, r2, r3,
+                v0, v1, v2, v3, v4, v5,
+                t0, t1, t2;
+  float32x4x2_t a0, a1, a2, a3, a4;
+  float32x4_t   s1 = glmm_float32x4_SIGNMASK_PNPN, s2;
+
+#if !CGLM_ARM64
+  float32x2_t   l0, l1;
+#endif
+
+  s2 = vrev64q_f32(s1);
+
+  /* 127 <- 0 */
+  r0 = glmm_load(mat[0]); /* d c b a */
+  r1 = glmm_load(mat[1]); /* h g f e */
+  r2 = glmm_load(mat[2]); /* l k j i */
+  r3 = glmm_load(mat[3]); /* p o n m */
+
+  a1 = vzipq_f32(r0, r2);               /* l d k c, j b i a */
+  a2 = vzipq_f32(r1, r3);               /* p h o g, n f m e */
+  a3 = vzipq_f32(a2.val[0], a1.val[0]); /* j n b f, i m a e */
+  a4 = vzipq_f32(a2.val[1], a1.val[1]); /* l p d h, k o c g */
+
+  v0 = vextq_f32(a1.val[0], a1.val[1], 2); /* k c j b */
+  v1 = vextq_f32(a2.val[0], a2.val[1], 2); /* o g n f */
+  v2 = vextq_f32(a1.val[1], a2.val[0], 2); /* m e l d */
+  v3 = vextq_f32(a2.val[1], a1.val[0], 2); /* i a p h */
+  v4 = vextq_f32(v1, v2, 2);               /* l d o g */
+  v5 = vextq_f32(v0, v3, 2);               /* p h k c */
+
+  /* c2 = c * h - g * d   c12 = a * g - c * e   c8  = a * f - b * e
+     c1 = k * p - o * l   c11 = i * o - k * m   c7  = i * n - j * m
+     c4 = h * a - d * e   c6  = b * h - d * f   c10 = b * g - c * f
+     c3 = p * i - l * m   c5  = j * p - l * n   c9  = j * o - k * n */
+  t0 = vmulq_f32(v5, v3);
+  t1 = vmulq_f32(a1.val[0], a2.val[1]);
+  t2 = vmulq_f32(a1.val[0], v1);
+
+  t0 = glmm_fnmadd(v4, v2, t0);
+  t1 = glmm_fnmadd(a1.val[1], a2.val[0], t1);
+  t2 = glmm_fnmadd(v0, a2.val[0], t2);
+
+  t0 = vrev64q_f32(t0);
+  t1 = vrev64q_f32(t1);
+  t2 = vrev64q_f32(t2);
+
+  /* det */
+  v0 = vrev64q_f32(t2);
+  v1 = vextq_f32(t1, t1, 2);
+  v0 = vmulq_f32(t0, v0);
+  v1 = vrev64q_f32(v1);
+  v1 = vmulq_f32(v1, t1);
+
+  /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
+#if CGLM_ARM64
+  v0 = vpaddq_f32(v0, v0);
+  v0 = vpaddq_f32(v0, v0);
+#else
+  l0 = vget_low_f32(v0);
+  l1 = vget_high_f32(v0);
+
+  l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */
+  l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */
+  l0 = vadd_f32(l0, l1);  /* [sum, sum] */
+
+  v0 = vcombine_f32(l0, l0);
+#endif
+
+  /* c5 * c12 + c6 * c11 */
+#if CGLM_ARM64
+  v1 = vpaddq_f32(v1, v1);
+#else
+  l0 = vget_low_f32(v1);
+  l1 = vget_high_f32(v1);
+
+  l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */
+  l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */
+
+  v1 = vcombine_f32(l0, l1);
+#endif
+
+  v0 = vsubq_f32(v0, v1); /* det */
+
+  /* inv div */
+  v1 = vdupq_n_f32(1.0f);
+  v0 = glmm_div(v1, v0);  /* inv div */
+
+  /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */
+  t0 = vmulq_f32(t0, v0);
+  t1 = vmulq_f32(t1, v0);
+  t2 = vmulq_f32(t2, v0);
+
+  a0 = vzipq_f32(t0, t0); /* c4 c4 c3 c3, c2 c2 c1 c1 */
+  a1 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c12 c12 c11 c11 */
+  a2 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c8 c8 c7 c7 */
+
+  /* result */
+
+  /* dest[0][0] = (f * c1 - g * c5 + h * c9)   * idt;
+     dest[0][1] = (b * c1 - c * c5 + d * c9)   * ndt;
+     dest[0][2] = (n * c2 - o * c6 + p * c10)  * idt;
+     dest[0][3] = (j * c2 - k * c6 + l * c10)  * ndt;
+
+     dest[1][0] = (e * c1 - g * c3 + h * c11)  * ndt;
+     dest[1][1] = (a * c1 - c * c3 + d * c11)  * idt;
+     dest[1][2] = (m * c2 - o * c4 + p * c12)  * ndt;
+     dest[1][3] = (i * c2 - k * c4 + l * c12)  * idt;
+
+     dest[2][0] = (e * c5 - f * c3 + h * c7)   * idt;
+     dest[2][1] = (a * c5 - b * c3 + d * c7)   * ndt;
+     dest[2][2] = (m * c6 - n * c4 + p * c8)   * idt;
+     dest[2][3] = (i * c6 - j * c4 + l * c8)   * ndt;
+
+     dest[3][0] = (e * c9 - f * c11 + g * c7)  * ndt;
+     dest[3][1] = (a * c9 - b * c11 + c * c7)  * idt;
+     dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt;
+     dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */
+
+  r0 = vmulq_f32(a3.val[1], a0.val[0]);
+  r1 = vmulq_f32(a3.val[0], a0.val[0]);
+  r2 = vmulq_f32(a3.val[0], a1.val[1]);
+  r3 = vmulq_f32(a3.val[0], a2.val[1]);
+
+  r0 = glmm_fnmadd(a4.val[0], a1.val[1], r0);
+  r1 = glmm_fnmadd(a4.val[0], a0.val[1], r1);
+  r2 = glmm_fnmadd(a3.val[1], a0.val[1], r2);
+  r3 = glmm_fnmadd(a3.val[1], a1.val[0], r3);
+
+  r0 = glmm_fmadd(a4.val[1], a2.val[1], r0);
+  r1 = glmm_fmadd(a4.val[1], a1.val[0], r1);
+  r2 = glmm_fmadd(a4.val[1], a2.val[0], r2);
+  r3 = glmm_fmadd(a4.val[0], a2.val[0], r3);
+
+  /* 4xor may be fastart then 4mul, see above */
+  r0 = glmm_xor(r0, s1);
+  r1 = glmm_xor(r1, s2);
+  r2 = glmm_xor(r2, s1);
+  r3 = glmm_xor(r3, s2);
+
+  glmm_store(dest[0], r0);
+  glmm_store(dest[1], r1);
+  glmm_store(dest[2], r2);
+  glmm_store(dest[3], r3);
+}
 
 #endif
 #endif /* cglm_mat4_neon_h */
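The final glmm_xor with the alternating sign masks flips the sign of every other lane in one instruction instead of four multiplies by -1. A standalone NEON sketch of that trick (not part of the PR):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      /* sign bit set in lanes 1 and 3: xor negates exactly those lanes */
      const uint32_t mbits[4] = {0x00000000u, 0x80000000u,
                                 0x00000000u, 0x80000000u};
      const float    vals[4]  = {1.0f, 2.0f, 3.0f, 4.0f};
      float          out[4];

      uint32x4_t  mask = vld1q_u32(mbits);
      float32x4_t v    = vld1q_f32(vals);

      v = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v), mask));

      vst1q_f32(out, v);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 -2 3 -4 */
      return 0;
    }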
@@ -18,7 +18,7 @@ CGLM_INLINE
 void
 glm_mat4_scale_sse2(mat4 m, float s) {
   __m128 x0;
-  x0 = _mm_set1_ps(s);
+  x0 = glmm_set1(s);
 
   glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0));
   glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0));
@@ -295,6 +295,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
   glmm_store(dest[3], _mm_mul_ps(v3, x0));
 }
 
+/* old one */
+#if 0
 CGLM_INLINE
 void
 glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
@@ -424,13 +426,148 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
   x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
   x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
 
-  x0 = _mm_div_ps(_mm_set1_ps(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0)));
+  x0 = _mm_div_ps(glmm_set1(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0)));
 
   glmm_store(dest[0], _mm_mul_ps(v0, x0));
   glmm_store(dest[1], _mm_mul_ps(v1, x0));
   glmm_store(dest[2], _mm_mul_ps(v2, x0));
   glmm_store(dest[3], _mm_mul_ps(v3, x0));
 }
+#endif
+
+CGLM_INLINE
+void
+glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
+  __m128 r0, r1, r2, r3, s1, s2,
+         v0, v1, v2, v3, v4, v5,
+         t0, t1, t2,
+         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13;
+
+  /* s1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  s1 = glmm_float32x4_SIGNMASK_NPNP;
+  s2 = glmm_shuff1(s1, 2, 1, 2, 1);
+
+  /* 127 <- 0 */
+  r1 = glmm_load(mat[1]); /* h g f e */
+  r0 = glmm_load(mat[0]); /* d c b a */
+  r3 = glmm_load(mat[3]); /* p o n m */
+  r2 = glmm_load(mat[2]); /* l k j i */
+
+  x4 = _mm_unpackhi_ps(r0, r2); /* l d k c */
+  x5 = _mm_unpacklo_ps(r0, r2); /* j b i a */
+  x6 = _mm_unpackhi_ps(r1, r3); /* p h o g */
+  x7 = _mm_unpacklo_ps(r1, r3); /* n f m e */
+
+  x0 = _mm_unpackhi_ps(x7, x5); /* j n b f */
+  x1 = _mm_unpacklo_ps(x7, x5); /* i m a e */
+  x2 = _mm_unpackhi_ps(x6, x4); /* l p d h */
+  x3 = _mm_unpacklo_ps(x6, x4); /* k o c g */
+
+  /* c2 = c * h - d * g   c12 = a * g - c * e   c8  = a * f - b * e
+     c1 = k * p - l * o   c11 = i * o - k * m   c7  = i * n - j * m
+     c4 = a * h - d * e   c6  = b * h - d * f   c10 = b * g - c * f
+     c3 = i * p - l * m   c5  = j * p - l * n   c9  = j * o - k * n */
+
+  x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */
+  x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */
+
+  x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */
+  x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */
+
+#if 0 /* TODO measure both */
+  x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */
+  x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */
+#else
+  x12 = _mm_movelh_ps(x4, x5); /* i a k c */
+  x13 = _mm_movelh_ps(x6, x7); /* m e o g */
+#endif
+
+  t0 = _mm_mul_ps(x12, x10);
+  t1 = _mm_mul_ps(x5, x6);
+  t2 = _mm_mul_ps(x5, x9);
+
+  t0 = glmm_fnmadd(x11, x13, t0);
+  t1 = glmm_fnmadd(x4, x7, t1);
+  t2 = glmm_fnmadd(x8, x7, t2);
+
+  /* det */
+  /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
+  /* v1: c5 * c12 + c6 * c11 */
+
+  v5 = glmm_set1_rval(1.0f);
+  v0 = glmm_shuff1(t2, 2, 3, 0, 1);
+  v1 = glmm_shuff1(t1, 0, 1, 2, 3);
+  v0 = _mm_mul_ps(t0, v0);
+  v1 = _mm_mul_ps(t1, v1);
+  v2 = glmm_shuff1(v1, 1, 0, 0, 1);
+  v3 = glmm_shuff1(v0, 0, 1, 2, 3);
+  v1 = _mm_add_ps(v1, v2);
+  v0 = _mm_add_ps(v0, v3);
+  v2 = glmm_shuff1(v0, 1, 0, 0, 1);
+  v0 = _mm_add_ps(v0, v2);
+
+  v0 = _mm_sub_ps(v0, v1); /* det */
+  v0 = _mm_div_ps(v5, v0); /* idt */
+
+  /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */
+  t0 = _mm_mul_ps(t0, v0);
+  t1 = _mm_mul_ps(t1, v0);
+  t2 = _mm_mul_ps(t2, v0);
+
+  v0 = glmm_shuff1(t0, 0, 0, 1, 1); /* c2  c2  c1  c1  */
+  v1 = glmm_shuff1(t0, 2, 2, 3, 3); /* c4  c4  c3  c3  */
+  v2 = glmm_shuff1(t1, 0, 0, 1, 1); /* c12 c12 c11 c11 */
+  v3 = glmm_shuff1(t1, 2, 2, 3, 3); /* c6  c6  c5  c5  */
+  v4 = glmm_shuff1(t2, 0, 0, 1, 1); /* c8  c8  c7  c7  */
+  v5 = glmm_shuff1(t2, 2, 2, 3, 3); /* c10 c10 c9  c9  */
+
+  /* result */
+
+  /* dest[0][0] = (f * c1 - g * c5 + h * c9)   * idt;
+     dest[0][1] = (b * c1 - c * c5 + d * c9)   * ndt;
+     dest[0][2] = (n * c2 - o * c6 + p * c10)  * idt;
+     dest[0][3] = (j * c2 - k * c6 + l * c10)  * ndt;
+
+     dest[1][0] = (e * c1 - g * c3 + h * c11)  * ndt;
+     dest[1][1] = (a * c1 - c * c3 + d * c11)  * idt;
+     dest[1][2] = (m * c2 - o * c4 + p * c12)  * ndt;
+     dest[1][3] = (i * c2 - k * c4 + l * c12)  * idt;
+
+     dest[2][0] = (e * c5 - f * c3 + h * c7)   * idt;
+     dest[2][1] = (a * c5 - b * c3 + d * c7)   * ndt;
+     dest[2][2] = (m * c6 - n * c4 + p * c8)   * idt;
+     dest[2][3] = (i * c6 - j * c4 + l * c8)   * ndt;
+
+     dest[3][0] = (e * c9 - f * c11 + g * c7)  * ndt;
+     dest[3][1] = (a * c9 - b * c11 + c * c7)  * idt;
+     dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt;
+     dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */
+
+  r0 = _mm_mul_ps(x0, v0);
+  r1 = _mm_mul_ps(x1, v0);
+  r2 = _mm_mul_ps(x1, v3);
+  r3 = _mm_mul_ps(x1, v5);
+
+  r0 = glmm_fnmadd(x3, v3, r0);
+  r1 = glmm_fnmadd(x3, v1, r1);
+  r2 = glmm_fnmadd(x0, v1, r2);
+  r3 = glmm_fnmadd(x0, v2, r3);
+
+  r0 = glmm_fmadd(x2, v5, r0);
+  r1 = glmm_fmadd(x2, v2, r1);
+  r2 = glmm_fmadd(x2, v4, r2);
+  r3 = glmm_fmadd(x3, v4, r3);
+
+  /* 4xor may be fastart then 4mul, see above */
+  r0 = _mm_xor_ps(r0, s1);
+  r1 = _mm_xor_ps(r1, s2);
+  r2 = _mm_xor_ps(r2, s1);
+  r3 = _mm_xor_ps(r3, s2);
+
+  glmm_store(dest[0], r0);
+  glmm_store(dest[1], r1);
+  glmm_store(dest[2], r2);
+  glmm_store(dest[3], r3);
+}
 #endif
 #endif /* cglm_mat_sse_h */
@@ -14,8 +14,9 @@
 #define glmm_load(p)      wasm_v128_load(p)
 #define glmm_store(p, a)  wasm_v128_store(p, (a))
 
 #define glmm_set1(x)      wasm_f32x4_splat(x)
+#define glmm_set1_rval(x) wasm_f32x4_splat(x)
 #define glmm_128          v128_t
 
 #define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z)
 
@@ -18,31 +18,46 @@
 #  define glmm_store(p, a)  _mm_store_ps(p, a)
 #endif
 
-#define glmm_set1(x) _mm_set1_ps(x)
 #define glmm_128     __m128
 
-#if defined(CGLM_USE_INT_DOMAIN) && defined(__SSE2__)
+#ifdef __AVX__
 #  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
-                                        _MM_SHUFFLE(z, y, x, w)))
+     _mm_permute_ps((xmm), _MM_SHUFFLE(z, y, x, w))
 #else
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
+#  if !defined(CGLM_NO_INT_DOMAIN) && defined(__SSE2__)
+#    define glmm_shuff1(xmm, z, y, x, w)                                      \
+       _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),              \
+                                          _MM_SHUFFLE(z, y, x, w)))
+#  else
+#    define glmm_shuff1(xmm, z, y, x, w)                                      \
+       _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
+#  endif
 #endif
 
 #define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
 
-#define glmm_splat_x(x) glmm_splat(x, 0)
-#define glmm_splat_y(x) glmm_splat(x, 1)
-#define glmm_splat_z(x) glmm_splat(x, 2)
-#define glmm_splat_w(x) glmm_splat(x, 3)
-
-/* glmm_shuff1x() is DEPRECATED!, use glmm_splat() */
-#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
-
-#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
-     glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),           \
-                 z1, y1, x1, w1)
+#ifdef __AVX__
+#  define glmm_set1(x)      _mm_broadcast_ss(&x)
+#  define glmm_set1_ptr(x)  _mm_broadcast_ss(x)
+#  define glmm_set1_rval(x) _mm_set1_ps(x)
+#  ifdef __AVX2__
+#    define glmm_splat_x(x) _mm_broadcastss_ps(x)
+#  else
+#    define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0))
+#  endif
+#  define glmm_splat_y(x)   _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1))
+#  define glmm_splat_z(x)   _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2))
+#  define glmm_splat_w(x)   _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3))
+#else
+#  define glmm_set1(x)      _mm_set1_ps(x)
+#  define glmm_set1_ptr(x)  _mm_set1_ps(*x)
+#  define glmm_set1_rval(x) _mm_set1_ps(x)
+
+#  define glmm_splat_x(x) glmm_splat(x, 0)
+#  define glmm_splat_y(x) glmm_splat(x, 1)
+#  define glmm_splat_z(x) glmm_splat(x, 2)
+#  define glmm_splat_w(x) glmm_splat(x, 3)
+#endif
 
 #ifdef __AVX__
 #  ifdef CGLM_ALL_UNALIGNED
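The split between glmm_set1, glmm_set1_ptr and glmm_set1_rval exists because _mm_broadcast_ss takes the address of its operand, so it cannot accept a literal or other rvalue. A hedged illustration (assumes AVX; the macro names here are hypothetical stand-ins, not cglm's definitions):

    #include <immintrin.h>

    /* AVX broadcast needs an addressable float, literals must fall back
       to _mm_set1_ps */
    #define set1_lval(x) _mm_broadcast_ss(&(x)) /* x must be an lvalue */
    #define set1_rval(x) _mm_set1_ps(x)         /* works for any rvalue */

    __m128 scale_all(float s) {
      __m128 a = set1_lval(s);    /* ok: s is a named variable */
      __m128 b = set1_rval(1.0f); /* ok: literal, no address required */
      return _mm_mul_ps(a, b);
    }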
@@ -86,7 +101,7 @@
 #if defined(__SSE2__)
 #  define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(GLMM_NEGZEROf)) /* _mm_set1_ps(-0.0f) */
 #else
-#  define glmm_float32x4_SIGNMASK_NEG _mm_set1_ps(GLMM_NEGZEROf)
+#  define glmm_float32x4_SIGNMASK_NEG glmm_set1(GLMM_NEGZEROf)
 #endif
 
 #define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf))
@@ -14,7 +14,8 @@
 
 #if defined(_MSC_VER)
 /* do not use alignment for older visual studio versions */
-#  if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
+/* also ARM32 also causes similar error, disable it for now on ARM32 too */
+#  if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */
 #    define CGLM_ALL_UNALIGNED
 #    define CGLM_ALIGN(X) /* no alignment */
 #  else
@@ -52,7 +52,7 @@ glm_vec4_broadcast(float val, vec4 d) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(d, wasm_f32x4_splat(val));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(d, _mm_set1_ps(val));
+  glmm_store(d, glmm_set1(val));
 #else
   d[0] = d[1] = d[2] = d[3] = val;
 #endif
@@ -70,7 +70,7 @@ glm_vec4_fill(vec4 v, float val) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_splat(val));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(val));
+  glmm_store(v, glmm_set1(val));
 #else
   v[0] = v[1] = v[2] = v[3] = val;
 #endif
@@ -216,7 +216,7 @@ glm_vec4_one(vec4 v) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(v, wasm_f32x4_const_splat(1.0f));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(1.0f));
+  glmm_store(v, glmm_set1_rval(1.0f));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vdupq_n_f32(1.0f));
 #else
@@ -368,7 +368,7 @@ glm_vec4_adds(vec4 v, float s, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_add(glmm_load(v), wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
+  glmm_store(dest, _mm_add_ps(glmm_load(v), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -416,7 +416,7 @@ glm_vec4_subs(vec4 v, float s, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_sub(glmm_load(v), wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
+  glmm_store(dest, _mm_sub_ps(glmm_load(v), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -464,7 +464,7 @@ glm_vec4_scale(vec4 v, float s, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_mul(glmm_load(v), wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
+  glmm_store(dest, _mm_mul_ps(glmm_load(v), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -1064,8 +1064,8 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
   glmm_store(v, glmm_min(glmm_max(glmm_load(v), wasm_f32x4_splat(minVal)),
                          wasm_f32x4_splat(maxVal)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, glmm_min(glmm_max(glmm_load(v), _mm_set1_ps(minVal)),
-                         _mm_set1_ps(maxVal)));
+  glmm_store(v, glmm_min(glmm_max(glmm_load(v), glmm_set1(minVal)),
+                         glmm_set1(maxVal)));
 #elif defined(CGLM_NEON_FP)
   glmm_store(v, glmm_min(glmm_max(vld1q_f32(v), vdupq_n_f32(minVal)),
                          vdupq_n_f32(maxVal)));