diff --git a/include/cglm-mat-simd.h b/include/cglm-mat-simd.h index f140efd..f51ccf2 100644 --- a/include/cglm-mat-simd.h +++ b/include/cglm-mat-simd.h @@ -61,80 +61,76 @@ CGLM_INLINE void glm_mat4_inv_see2(mat4 mat, mat4 dest) { - __m128 v0, v1, v2, v3, dt, t0, t1, t2, t3, t4; - float a, b, c, d, - e, f, g, h, - i, j, k, l, - m, n, o, p; + __m128 v0, v1, v2, v3, dt, t0, t1, t2, t3, t4, r0, r1, r2, r3, r4; - a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3], - e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3], - i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3], - m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3]; + r0 = _mm_load_ps(mat[0]); + r1 = _mm_load_ps(mat[1]); + r2 = _mm_load_ps(mat[2]); + r3 = _mm_load_ps(mat[3]); - t3 = _mm_sub_ps(_mm_mul_ps(_mm_set_ps(k, j, j, i), - _mm_set_ps(p, p, o, p)), - _mm_mul_ps(_mm_set_ps(o, n, n, m), - _mm_set_ps(l, l, k, l))); + t3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r2, 2, 1, 1, 0), + _mm_shuffle1_ps(r3, 3, 3, 2, 3)), + _mm_mul_ps(_mm_shuffle1_ps(r3, 2, 1, 1, 0), + _mm_shuffle1_ps(r2, 3, 3, 2, 3))); - t4 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(i), - _mm_set_ps(0.0f, 0.0f, o, n)), - _mm_mul_ps(_mm_set1_ps(m), - _mm_set_ps(0.0f, 0.0f, k, j))); + t4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r2, 0, 0, 0, 0), + _mm_shuffle1_ps(r3, 2, 1, 2, 1)), + _mm_mul_ps(_mm_shuffle1_ps(r3, 0, 0, 0, 0), + _mm_shuffle1_ps(r2, 2, 1, 2, 1))); t0 = _mm_shuffle1_ps(t3, 3, 3, 2, 1); t1 = _mm_shuffle2_ps(t3, t4, 1, 1, 2, 0, 1, 0, 0, 3); t2 = _mm_shuffle2_ps(t3, t4, 0, 1, 1, 1, 0, 2, 3, 3); - v0 = _mm_mul_ps(_mm_set_ps(f, e, e, e), t0); - v0 = _mm_sub_ps(v0, _mm_mul_ps(_mm_set_ps(g, g, f, f), t1)); - v0 = _mm_add_ps(v0, _mm_mul_ps(_mm_set_ps(h, h, h, g), t2)); + v0 = _mm_mul_ps(_mm_shuffle1_ps(r1, 1, 0, 0, 0), t0); + v0 = _mm_sub_ps(v0, _mm_mul_ps(_mm_shuffle1_ps(r1, 2, 2, 1, 1), t1)); + v0 = _mm_add_ps(v0, _mm_mul_ps(_mm_shuffle1_ps(r1, 3, 3, 3, 2), t2)); v0 = _mm_xor_ps(v0, _mm_set_ps(0.f, -0.f, 0.f, -0.f)); - v1 = _mm_mul_ps(_mm_set_ps(b, a, a, a), t0); - v1 = _mm_sub_ps(v1, _mm_mul_ps(_mm_set_ps(c, c, b, b), t1)); - v1 = _mm_add_ps(v1, _mm_mul_ps(_mm_set_ps(d, d, d, c), t2)); + v1 = _mm_mul_ps(_mm_shuffle1_ps(r0, 1, 0, 0, 0), t0); + v1 = _mm_sub_ps(v1, _mm_mul_ps(_mm_shuffle1_ps(r0, 2, 2, 1, 1), t1)); + v1 = _mm_add_ps(v1, _mm_mul_ps(_mm_shuffle1_ps(r0, 3, 3, 3, 2), t2)); v1 = _mm_xor_ps(v1, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); - t3 = _mm_sub_ps(_mm_mul_ps(_mm_set_ps(g, f, f, e), - _mm_set_ps(p, p, o, p)), - _mm_mul_ps(_mm_set_ps(o, n, n, m), - _mm_set_ps(h, h, g, h))); + t3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r1, 2, 1, 1, 0), + _mm_shuffle1_ps(r3, 3, 3, 2, 3)), + _mm_mul_ps(_mm_shuffle1_ps(r3, 2, 1, 1, 0), + _mm_shuffle1_ps(r1, 3, 3, 2, 3))); - t4 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(e), - _mm_set_ps(0.0f, 0.0f, o, n)), - _mm_mul_ps(_mm_set1_ps(m), - _mm_set_ps(0.0f, 0.0f, g, f))); + t4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r1, 0, 0, 0, 0), + _mm_shuffle1_ps(r3, 2, 1, 2, 1)), + _mm_mul_ps(_mm_shuffle1_ps(r3, 0, 0, 0, 0), + _mm_shuffle1_ps(r1, 2, 1, 2, 1))); t0 = _mm_shuffle1_ps(t3, 3, 3, 2, 1); t1 = _mm_shuffle2_ps(t3, t4, 1, 1, 2, 0, 1, 0, 0, 3); t2 = _mm_shuffle2_ps(t3, t4, 0, 1, 1, 1, 0, 2, 3, 3); - v2 = _mm_mul_ps(_mm_set_ps(b, a, a, a), t0); - v2 = _mm_sub_ps(v2, _mm_mul_ps(_mm_set_ps(c, c, b, b), t1)); - v2 = _mm_add_ps(v2, _mm_mul_ps(_mm_set_ps(d, d, d, c), t2)); + v2 = _mm_mul_ps(_mm_shuffle1_ps(r0, 1, 0, 0, 0), t0); + v2 = _mm_sub_ps(v2, _mm_mul_ps(_mm_shuffle1_ps(r0, 2, 2, 1, 1), t1)); + v2 = _mm_add_ps(v2, _mm_mul_ps(_mm_shuffle1_ps(r0, 3, 3, 3, 2), t2)); v2 = _mm_xor_ps(v2, _mm_set_ps(0.f, -0.f, 0.f, -0.f)); - t3 = _mm_sub_ps(_mm_mul_ps(_mm_set_ps(g, f, f, e), - _mm_set_ps(l, l, k, l)), - _mm_mul_ps(_mm_set_ps(k, j, j, i), - _mm_set_ps(h, h, g, h))); + t3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r1, 2, 1, 1, 0), + _mm_shuffle1_ps(r2, 3, 3, 2, 3)), + _mm_mul_ps(_mm_shuffle1_ps(r2, 2, 1, 1, 0), + _mm_shuffle1_ps(r1, 3, 3, 2, 3))); - t4 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(e), - _mm_set_ps(0.0f, 0.0f, k, j)), - _mm_mul_ps(_mm_set1_ps(i), - _mm_set_ps(0.0f, 0.0f, g, f))); + t4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r1, 0, 0, 0, 0), + _mm_shuffle1_ps(r2, 2, 1, 2, 1)), + _mm_mul_ps(_mm_shuffle1_ps(r2, 0, 0, 0, 0), + _mm_shuffle1_ps(r1, 2, 1, 2, 1))); t0 = _mm_shuffle1_ps(t3, 3, 3, 2, 1); t1 = _mm_shuffle2_ps(t3, t4, 1, 1, 2, 0, 1, 0, 0, 3); t2 = _mm_shuffle2_ps(t3, t4, 0, 1, 1, 1, 0, 2, 3, 3); - v3 = _mm_mul_ps(_mm_set_ps(b, a, a, a), t0); - v3 = _mm_sub_ps(v3, _mm_mul_ps(_mm_set_ps(c, c, b, b), t1)); - v3 = _mm_add_ps(v3, _mm_mul_ps(_mm_set_ps(d, d, d, c), t2)); + v3 = _mm_mul_ps(_mm_shuffle1_ps(r0, 1, 0, 0, 0), t0); + v3 = _mm_sub_ps(v3, _mm_mul_ps(_mm_shuffle1_ps(r0, 2, 2, 1, 1), t1)); + v3 = _mm_add_ps(v3, _mm_mul_ps(_mm_shuffle1_ps(r0, 3, 3, 3, 2), t2)); v3 = _mm_xor_ps(v3, _mm_set_ps(0.f, -0.f, 0.f, -0.f)); - dt = _mm_mul_ps(_mm_set_ps(a, b, c, d), v0); + dt = _mm_mul_ps(_mm_shuffle1_ps(r0, 0, 1, 2, 3), v0); dt = _mm_add_ps(dt, _mm_shuffle1_ps(dt, 0, 1, 2, 3)); dt = _mm_add_ps(dt, _mm_shuffle1_ps(dt, 1, 3, 3, 1)); dt = _mm_rcp_ps(dt);