mirror of https://github.com/recp/cglm.git
simd: organise SIMD functions
* optimize dot product
.gitignore (vendored) | 1

@@ -69,3 +69,4 @@ win/cglm_test_*
 win/x64
 win/x85
 win/Debug
+cglm-test-ios*
CREDITS | 3

@@ -52,3 +52,6 @@ https://gamedev.stackexchange.com/questions/28395/rotating-vector3-by-a-quaterni
 
 9. Sphere AABB intersect
 https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c
+
+10. Horizontal add
+https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
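The new "Horizontal add" credit refers to the shuffle-based horizontal sum used by the helpers added below in x86.h. As a hedged illustration (not part of the commit), here is the non-SSE3 reduction applied to a concrete register; the variable names and printed values are mine:

#include <stdio.h>
#include <xmmintrin.h>

int
main(void) {
  __m128 v, shuf, sums;

  v    = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);             /* lanes: 1 2 3 4 */
  shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));   /* lanes: 2 1 4 3 */
  sums = _mm_add_ps(v, shuf);                             /* lanes: 3 3 7 7 */
  shuf = _mm_movehl_ps(shuf, sums);                       /* low lanes: 7 7 */
  sums = _mm_add_ss(sums, shuf);                          /* lane 0: 3 + 7  */

  printf("%f\n", _mm_cvtss_f32(sums));                    /* prints 10.0    */
  return 0;
}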
include/cglm/quat.h

@@ -218,7 +218,7 @@ glm_quat_normalize_to(versor q, versor dest) {
   float dot;
 
   x0 = glmm_load(q);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot = _mm_cvtss_f32(xdot);
 
   if (dot <= 0.0f) {
include/cglm/simd/arm.h | 41 (new file)

@@ -0,0 +1,41 @@
/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_arm_h
#define cglm_simd_arm_h
#include "intrin.h"
#ifdef CGLM_SIMD_ARM

#define glmm_load(p)      vld1q_f32(p)
#define glmm_store(p, a)  vst1q_f32(p, a)

static inline
float
glmm_hadd(float32x4_t v) {
#if defined(__aarch64__)
  return vaddvq_f32(v);
#else
  v = vaddq_f32(v, vrev64q_f32(v));
  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
  return vgetq_lane_f32(v, 0);
#endif
}

static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  return glmm_hadd(vmulq_f32(a, b));
}

static inline
float
glmm_norm(float32x4_t a) {
  return sqrtf(glmm_dot(a, a));
}

#endif
#endif /* cglm_simd_arm_h */
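A hedged usage sketch (not part of the commit) of the new NEON helpers, assuming an ARM build where CGLM_SIMD_ARM ends up defined and the library's include/ directory is on the include path; the expected values in the comments are mine:

#include <stdio.h>
#include <math.h>
#include "cglm/simd/intrin.h"  /* pulls in simd/arm.h when CGLM_SIMD_ARM is set */

int
main(void) {
#ifdef CGLM_SIMD_ARM
  float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float b[4] = {4.0f, 3.0f, 2.0f, 1.0f};

  printf("dot  = %f\n", glmm_dot(glmm_load(a), glmm_load(b)));  /* 20.0     */
  printf("norm = %f\n", glmm_norm(glmm_load(a)));               /* sqrt(30) */
#else
  puts("CGLM_SIMD_ARM not defined on this target");
#endif
  return 0;
}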
include/cglm/simd/intrin.h

@@ -27,94 +27,39 @@
#if defined( __SSE__ ) || defined( __SSE2__ )
# include <xmmintrin.h>
# include <emmintrin.h>

/* OPTIONAL: You may save some instructions but latency (not sure) */
#ifdef CGLM_USE_INT_DOMAIN
# define glmm_shuff1(xmm, z, y, x, w) \
    _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \
                                       _MM_SHUFFLE(z, y, x, w)))
#else
# define glmm_shuff1(xmm, z, y, x, w) \
    _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif

#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
    glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
                z1, y1, x1, w1)

static inline
__m128
glmm_dot(__m128 a, __m128 b) {
  __m128 x0;
  x0 = _mm_mul_ps(a, b);
  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
}

static inline
__m128
glmm_norm(__m128 a) {
  return _mm_sqrt_ps(glmm_dot(a, a));
}

static inline
__m128
glmm_load3(float v[3]) {
  __m128i xy;
  __m128  z;

  xy = _mm_loadl_epi64((const __m128i *)v);
  z = _mm_load_ss(&v[2]);

  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
}

static inline
void
glmm_store3(__m128 vx, float v[3]) {
  _mm_storel_pi((__m64 *)&v[0], vx);
  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
}

#ifdef CGLM_ALL_UNALIGNED
# define glmm_load(p)      _mm_loadu_ps(p)
# define glmm_store(p, a)  _mm_storeu_ps(p, a)
#else
# define glmm_load(p)      _mm_load_ps(p)
# define glmm_store(p, a)  _mm_store_ps(p, a)
#endif

#endif

/* x86, x64 */
#if defined( __SSE__ ) || defined( __SSE2__ )
# define CGLM_SSE_FP 1
#endif

#ifdef __AVX__
# define CGLM_AVX_FP 1

#ifdef CGLM_ALL_UNALIGNED
# define glmm_load256(p)      _mm256_loadu_ps(p)
# define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
#else
# define glmm_load256(p)      _mm256_load_ps(p)
# define glmm_store256(p, a)  _mm256_store_ps(p, a)
#endif

# ifndef CGLM_SIMD_x86
#  define CGLM_SIMD_x86
# endif
#endif

#if defined(__SSE3__)
# include <x86intrin.h>
# ifndef CGLM_SIMD_x86
#  define CGLM_SIMD_x86
# endif
#endif

#if defined(__SSE4_1__)
# include <smmintrin.h>
# ifndef CGLM_SIMD_x86
#  define CGLM_SIMD_x86
# endif
#endif

#if defined(__SSE4_2__)
# include <nmmintrin.h>
# ifndef CGLM_SIMD_x86
#  define CGLM_SIMD_x86
# endif
#endif

#ifdef __AVX__
# include <immintrin.h>
# define CGLM_AVX_FP 1
# ifndef CGLM_SIMD_x86
#  define CGLM_SIMD_x86
# endif
#endif

/* ARM Neon */
@@ -122,9 +67,24 @@ glmm_store3(__m128 vx, float v[3]) {
# include <arm_neon.h>
# if defined(__ARM_NEON_FP)
#  define CGLM_NEON_FP 1
#  ifndef CGLM_SIMD_ARM
#   define CGLM_SIMD_ARM
#  endif
# endif
#else
# undef CGLM_NEON_FP
#endif

#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
# ifndef CGLM_SIMD
#  define CGLM_SIMD
# endif
#endif

#if defined(CGLM_SIMD_x86)
# include "x86.h"
#endif

#if defined(CGLM_SIMD_ARM)
# include "arm.h"
#endif

#endif /* cglm_intrin_h */
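With the reorganisation, intrin.h only detects the target and defines feature macros; the actual helpers live in x86.h and arm.h. A hedged sketch (not part of the commit) that reports which path a given build selects; the include path assumes include/ is on the compiler's search path:

#include <stdio.h>
#include "cglm/simd/intrin.h"

int
main(void) {
#ifdef CGLM_SIMD_x86
  puts("CGLM_SIMD_x86: x86.h helpers (glmm_dot, glmm_vdot, ...) active");
#endif
#ifdef CGLM_SIMD_ARM
  puts("CGLM_SIMD_ARM: arm.h helpers active");
#endif
#ifdef CGLM_AVX_FP
  puts("CGLM_AVX_FP: glmm_load256 / glmm_store256 available");
#endif
#ifndef CGLM_SIMD
  puts("no SIMD path detected: scalar fallbacks are used");
#endif
  return 0;
}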
include/cglm/simd/x86.h | 136 (new file)

@@ -0,0 +1,136 @@
/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_x86_h
#define cglm_simd_x86_h
#include "intrin.h"
#ifdef CGLM_SIMD_x86

#ifdef CGLM_ALL_UNALIGNED
# define glmm_load(p)      _mm_loadu_ps(p)
# define glmm_store(p, a)  _mm_storeu_ps(p, a)
#else
# define glmm_load(p)      _mm_load_ps(p)
# define glmm_store(p, a)  _mm_store_ps(p, a)
#endif

#ifdef CGLM_USE_INT_DOMAIN
# define glmm_shuff1(xmm, z, y, x, w) \
    _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \
                                       _MM_SHUFFLE(z, y, x, w)))
#else
# define glmm_shuff1(xmm, z, y, x, w) \
    _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif

#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
    glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
                z1, y1, x1, w1)

#ifdef __AVX__
# ifdef CGLM_ALL_UNALIGNED
#  define glmm_load256(p)      _mm256_loadu_ps(p)
#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
# else
#  define glmm_load256(p)      _mm256_load_ps(p)
#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
# endif
#endif

static inline
__m128
glmm_vhadds(__m128 v) {
#if defined(__SSE3__)
  __m128 shuf, sums;
  shuf = _mm_movehdup_ps(v);
  sums = _mm_add_ps(v, shuf);
  shuf = _mm_movehl_ps(shuf, sums);
  sums = _mm_add_ss(sums, shuf);
  return sums;
#else
  __m128 shuf, sums;
  shuf = glmm_shuff1(v, 2, 3, 0, 1);
  sums = _mm_add_ps(v, shuf);
  shuf = _mm_movehl_ps(shuf, sums);
  sums = _mm_add_ss(sums, shuf);
  return sums;
#endif
}

static inline
float
glmm_hadd(__m128 v) {
  return _mm_cvtss_f32(glmm_vhadds(v));
}

static inline
__m128
glmm_vdots(__m128 a, __m128 b) {
#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
  return _mm_dp_ps(a, b, 0xFF);
#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
  __m128 x0, x1;
  x0 = _mm_mul_ps(a, b);
  x1 = _mm_hadd_ps(x0, x0);
  return _mm_hadd_ps(x1, x1);
#else
  return glmm_vhadds(_mm_mul_ps(a, b));
#endif
}

static inline
__m128
glmm_vdot(__m128 a, __m128 b) {
#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
  return _mm_dp_ps(a, b, 0xFF);
#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
  __m128 x0, x1;
  x0 = _mm_mul_ps(a, b);
  x1 = _mm_hadd_ps(x0, x0);
  return _mm_hadd_ps(x1, x1);
#else
  __m128 x0;
  x0 = _mm_mul_ps(a, b);
  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
#endif
}

static inline
float
glmm_dot(__m128 a, __m128 b) {
  return _mm_cvtss_f32(glmm_vdots(a, b));
}

static inline
float
glmm_norm(__m128 a) {
  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a))));
}

static inline
__m128
glmm_load3(float v[3]) {
  __m128i xy;
  __m128  z;

  xy = _mm_loadl_epi64((const __m128i *)v);
  z = _mm_load_ss(&v[2]);

  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
}

static inline
void
glmm_store3(__m128 vx, float v[3]) {
  _mm_storel_pi((__m64 *)&v[0], vx);
  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
}

#endif
#endif /* cglm_simd_x86_h */
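The x86 helpers now come in two flavours: register-valued (glmm_vhadds, glmm_vdots, glmm_vdot) and scalar (glmm_hadd, glmm_dot, glmm_norm). A hedged sketch (not part of the commit) exercising both on an SSE-capable build; the input vectors and expected values are mine:

#include <stdio.h>
#include "cglm/simd/intrin.h"

int
main(void) {
#ifdef CGLM_SIMD_x86
  __m128 a, b, vd;

  a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* (1, 2, 3, 4) */
  b = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);  /* (4, 3, 2, 1) */

  vd = glmm_vdots(a, b);  /* dot product kept in an SSE register (low lane) */

  printf("dot  = %f %f\n", _mm_cvtss_f32(vd), glmm_dot(a, b));  /* 20.0 20.0 */
  printf("hadd = %f\n", glmm_hadd(a));                          /* 10.0      */
  printf("norm = %f\n", glmm_norm(a));                          /* sqrt(30)  */
#endif
  return 0;
}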
include/cglm/vec4.h

@@ -200,24 +200,8 @@ glm_vec4_one(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_dot(vec4 a, vec4 b) {
-#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
-  return _mm_cvtss_f32(_mm_dp_ps(glmm_load(a), glmm_load(b), 0xFF));
-#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
-  __m128 x0, x1;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x1 = _mm_hadd_ps(x0, x0);
-  return _mm_cvtss_f32(_mm_hadd_ps(x1, x1));
-#elif defined(__SSE__) || defined(__SSE2__)
-  __m128 x0;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_cvtss_f32(_mm_add_ss(x0, glmm_shuff1(x0, 0, 1, 0, 1)));
-#elif defined(CGLM_NEON_FP)
-  float32x4_t v0, v1, v2;
-  v0 = vmulq_f32(vld1q_f32(a), vld1q_f32(b));
-  v1 = vaddq_f32(v0, vrev64q_f32(v0));
-  v2 = vaddq_f32(v1, vcombine_f32(vget_high_f32(v1), vget_low_f32(v1)));
-  return vgetq_lane_f32(v2, 0);
+#if defined(CGLM_SIMD)
+  return glmm_dot(glmm_load(a), glmm_load(b));
 #else
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 #endif

@@ -250,10 +234,8 @@ glm_vec4_norm2(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_norm(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  __m128 x0;
-  x0 = glmm_load(v);
-  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0)));
+#if defined(CGLM_SIMD)
+  return glmm_norm(glmm_load(v));
 #else
   return sqrtf(glm_vec4_dot(v, v));
 #endif

@@ -663,7 +645,7 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
   float dot;
 
   x0 = glmm_load(v);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot = _mm_cvtss_f32(xdot);
 
   if (dot == 0.0f) {
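After this change glm_vec4_dot routes through glmm_dot whenever CGLM_SIMD is defined and otherwise uses the scalar formula, so either path must produce a0*b0 + a1*b1 + a2*b2 + a3*b3. A hedged check (not part of the commit), assuming the vec4 typedef satisfies the aligned-load default:

#include <stdio.h>
#include <cglm/cglm.h>

int
main(void) {
  vec4  a = {1.0f, 2.0f, 3.0f, 4.0f};
  vec4  b = {4.0f, 3.0f, 2.0f, 1.0f};
  float simd_dot, scalar_dot;

  simd_dot   = glm_vec4_dot(a, b);
  scalar_dot = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];

  printf("%f %f\n", simd_dot, scalar_dot);  /* both print 20.0 */
  printf("norm = %f\n", glm_vec4_norm(a));  /* sqrt(30)        */
  return 0;
}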
makefile.am | 52

@@ -34,30 +34,30 @@ test_tests_CFLAGS = $(checkCFLAGS)
 
 cglmdir=$(includedir)/cglm
 cglm_HEADERS = include/cglm/version.h \
-               include/cglm/cglm.h \
-               include/cglm/call.h \
-               include/cglm/cam.h \
-               include/cglm/io.h \
-               include/cglm/mat4.h \
-               include/cglm/mat3.h \
-               include/cglm/types.h \
-               include/cglm/common.h \
-               include/cglm/affine.h \
-               include/cglm/vec3.h \
-               include/cglm/vec3-ext.h \
-               include/cglm/vec4.h \
-               include/cglm/vec4-ext.h \
-               include/cglm/euler.h \
-               include/cglm/util.h \
-               include/cglm/quat.h \
-               include/cglm/affine-mat.h \
-               include/cglm/plane.h \
-               include/cglm/frustum.h \
-               include/cglm/box.h \
-               include/cglm/color.h \
-               include/cglm/project.h \
-               include/cglm/sphere.h \
-               include/cglm/ease.h
+               include/cglm/cglm.h \
+               include/cglm/call.h \
+               include/cglm/cam.h \
+               include/cglm/io.h \
+               include/cglm/mat4.h \
+               include/cglm/mat3.h \
+               include/cglm/types.h \
+               include/cglm/common.h \
+               include/cglm/affine.h \
+               include/cglm/vec3.h \
+               include/cglm/vec3-ext.h \
+               include/cglm/vec4.h \
+               include/cglm/vec4-ext.h \
+               include/cglm/euler.h \
+               include/cglm/util.h \
+               include/cglm/quat.h \
+               include/cglm/affine-mat.h \
+               include/cglm/plane.h \
+               include/cglm/frustum.h \
+               include/cglm/box.h \
+               include/cglm/color.h \
+               include/cglm/project.h \
+               include/cglm/sphere.h \
+               include/cglm/ease.h
 
 cglm_calldir=$(includedir)/cglm/call
 cglm_call_HEADERS = include/cglm/call/mat4.h \

@@ -77,7 +77,9 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                     include/cglm/call/ease.h
 
 cglm_simddir=$(includedir)/cglm/simd
-cglm_simd_HEADERS = include/cglm/simd/intrin.h
+cglm_simd_HEADERS = include/cglm/simd/intrin.h \
+                    include/cglm/simd/x86.h \
+                    include/cglm/simd/arm.h
 
 cglm_simd_sse2dir=$(includedir)/cglm/simd/sse2
 cglm_simd_sse2_HEADERS = include/cglm/simd/sse2/affine.h \
@@ -69,6 +69,7 @@
     <ClInclude Include="..\include\cglm\plane.h" />
     <ClInclude Include="..\include\cglm\project.h" />
     <ClInclude Include="..\include\cglm\quat.h" />
+    <ClInclude Include="..\include\cglm\simd\arm.h" />
     <ClInclude Include="..\include\cglm\simd\avx\affine.h" />
     <ClInclude Include="..\include\cglm\simd\avx\mat4.h" />
     <ClInclude Include="..\include\cglm\simd\intrin.h" />

@@ -77,6 +78,7 @@
     <ClInclude Include="..\include\cglm\simd\sse2\mat3.h" />
     <ClInclude Include="..\include\cglm\simd\sse2\mat4.h" />
     <ClInclude Include="..\include\cglm\simd\sse2\quat.h" />
+    <ClInclude Include="..\include\cglm\simd\x86.h" />
     <ClInclude Include="..\include\cglm\sphere.h" />
     <ClInclude Include="..\include\cglm\types.h" />
     <ClInclude Include="..\include\cglm\util.h" />

@@ -233,5 +233,11 @@
     <ClInclude Include="..\include\cglm\ease.h">
       <Filter>include\cglm</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\arm.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\x86.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>