simd: organise SIMD-functions

* optimize dot product
Recep Aslantas
2019-01-24 10:17:49 +03:00
parent be6aa9a89a
commit 31bb303c55
10 changed files with 259 additions and 126 deletions

.gitignore (+1)

@@ -69,3 +69,4 @@ win/cglm_test_*
 win/x64
 win/x85
 win/Debug
+cglm-test-ios*


@@ -52,3 +52,6 @@ https://gamedev.stackexchange.com/questions/28395/rotating-vector3-by-a-quaternion
 9. Sphere AABB intersect
 https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c
+10. Horizontal add
+https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86


@@ -218,7 +218,7 @@ glm_quat_normalize_to(versor q, versor dest) {
   float  dot;
 
   x0   = glmm_load(q);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot  = _mm_cvtss_f32(xdot);
 
   if (dot <= 0.0f) {
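
The rename in this hunk is the point of the reorganisation: after this commit glmm_dot returns a scalar float, while glmm_vdot keeps the dot product in an SSE register, which is what the _mm_cvtss_f32 call expects. A minimal sketch of the distinction, assuming an SSE build; the helper quat_len2 is hypothetical, not part of the commit:

#include "cglm/types.h"        /* versor */
#include "cglm/simd/intrin.h"  /* pulls in simd/x86.h on SSE targets */

/* hypothetical helper contrasting the two entry points */
static inline float quat_len2(versor q) {
  __m128 x0   = glmm_load(q);
  __m128 xdot = glmm_vdot(x0, x0); /* __m128: dot kept in the register */
  float  dot  = glmm_dot(x0, x0);  /* float:  same value as a scalar   */
  (void)xdot;                      /* _mm_cvtss_f32(xdot) == dot       */
  return dot;
}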

include/cglm/simd/arm.h (new file, +41)

@@ -0,0 +1,41 @@
/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_arm_h
#define cglm_simd_arm_h
#include "intrin.h"
#ifdef CGLM_SIMD_ARM

#define glmm_load(p)      vld1q_f32(p)
#define glmm_store(p, a)  vst1q_f32(p, a)

static inline
float
glmm_hadd(float32x4_t v) {
#if defined(__aarch64__)
  /* AArch64 has a single horizontal-add instruction */
  return vaddvq_f32(v);
#else
  /* [a b c d] + [b a d c] = [a+b a+b c+d c+d], then add the swapped halves */
  v = vaddq_f32(v, vrev64q_f32(v));
  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
  return vgetq_lane_f32(v, 0);
#endif
}

static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  return glmm_hadd(vmulq_f32(a, b));
}

static inline
float
glmm_norm(float32x4_t a) {
  return sqrtf(glmm_dot(a, a));
}

#endif
#endif /* cglm_simd_arm_h */
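
A small usage sketch for the new NEON helpers; the main() and test values are illustrative, not part of the commit, and the file must be compiled for a NEON-capable target (e.g. with -mfpu=neon on 32-bit ARM):

#include <stdio.h>
#include <math.h>
#include "cglm/simd/intrin.h"  /* defines CGLM_SIMD_ARM and includes arm.h on NEON targets */

int main(void) {
  float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float b[4] = {4.0f, 3.0f, 2.0f, 1.0f};
  float32x4_t va = glmm_load(a);            /* vld1q_f32 */
  float32x4_t vb = glmm_load(b);
  printf("dot  = %f\n", glmm_dot(va, vb));  /* 4 + 6 + 6 + 4 = 20 */
  printf("norm = %f\n", glmm_norm(va));     /* sqrtf(30), about 5.477 */
  return 0;
}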


@@ -27,94 +27,39 @@
#if defined( __SSE__ ) || defined( __SSE2__ )
#  include <xmmintrin.h>
#  include <emmintrin.h>

/* OPTIONAL: You may save some instructions but latency (not sure) */
#ifdef CGLM_USE_INT_DOMAIN
#  define glmm_shuff1(xmm, z, y, x, w)                                        \
     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
                                        _MM_SHUFFLE(z, y, x, w)))
#else
#  define glmm_shuff1(xmm, z, y, x, w)                                        \
     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif

#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
     glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),           \
                 z1, y1, x1, w1)

static inline
__m128
glmm_dot(__m128 a, __m128 b) {
  __m128 x0;
  x0 = _mm_mul_ps(a, b);
  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
}

static inline
__m128
glmm_norm(__m128 a) {
  return _mm_sqrt_ps(glmm_dot(a, a));
}

static inline
__m128
glmm_load3(float v[3]) {
  __m128i xy;
  __m128  z;

  xy = _mm_loadl_epi64((const __m128i *)v);
  z  = _mm_load_ss(&v[2]);

  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
}

static inline
void
glmm_store3(__m128 vx, float v[3]) {
  _mm_storel_pi((__m64 *)&v[0], vx);
  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
}

#ifdef CGLM_ALL_UNALIGNED
#  define glmm_load(p)      _mm_loadu_ps(p)
#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
#else
#  define glmm_load(p)      _mm_load_ps(p)
#  define glmm_store(p, a)  _mm_store_ps(p, a)
#endif
#endif

/* x86, x64 */
#if defined( __SSE__ ) || defined( __SSE2__ )
#  define CGLM_SSE_FP 1
#  ifndef CGLM_SIMD_x86
#    define CGLM_SIMD_x86
#  endif

#ifdef __AVX__
#  define CGLM_AVX_FP 1

#ifdef CGLM_ALL_UNALIGNED
#  define glmm_load256(p)      _mm256_loadu_ps(p)
#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
#else
#  define glmm_load256(p)      _mm256_load_ps(p)
#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
#endif
#endif

#if defined(__SSE3__)
#  include <x86intrin.h>
#  ifndef CGLM_SIMD_x86
#    define CGLM_SIMD_x86
#  endif
#endif

#if defined(__SSE4_1__)
#  include <smmintrin.h>
#  ifndef CGLM_SIMD_x86
#    define CGLM_SIMD_x86
#  endif
#endif

#if defined(__SSE4_2__)
#  include <nmmintrin.h>
#  ifndef CGLM_SIMD_x86
#    define CGLM_SIMD_x86
#  endif
#endif

#ifdef __AVX__
#  include <immintrin.h>
#  define CGLM_AVX_FP 1
#  ifndef CGLM_SIMD_x86
#    define CGLM_SIMD_x86
#  endif
#endif

/* ARM Neon */
@@ -122,9 +67,24 @@ glmm_store3(__m128 vx, float v[3]) {
 #if defined(__ARM_NEON)
 #  include <arm_neon.h>
 #  if defined(__ARM_NEON_FP)
 #    define CGLM_NEON_FP 1
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
 #  else
 #    undef  CGLM_NEON_FP
 #  endif
 #endif
 
+#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
+#  ifndef CGLM_SIMD
+#    define CGLM_SIMD
+#  endif
+#endif
+
+#if defined(CGLM_SIMD_x86)
+#  include "x86.h"
+#endif
+#if defined(CGLM_SIMD_ARM)
+#  include "arm.h"
+#endif
 #endif /* cglm_intrin_h */
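
The net effect of the intrin.h changes: CGLM_SIMD_x86 and CGLM_SIMD_ARM pick the backend header, and CGLM_SIMD is the single umbrella macro call sites can test. A sketch of how a consumer could observe the selection (this program is not part of the commit):

#include <stdio.h>
#include "cglm/simd/intrin.h"

int main(void) {
#if defined(CGLM_SIMD_x86)
  puts("x86.h included: glmm_* map to SSE/AVX intrinsics");
#elif defined(CGLM_SIMD_ARM)
  puts("arm.h included: glmm_* map to NEON intrinsics");
#else
  puts("no SIMD backend: cglm uses the scalar fallbacks");
#endif
#if defined(CGLM_SIMD)
  puts("CGLM_SIMD set: glmm_load, glmm_dot, glmm_norm are available");
#endif
  return 0;
}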

include/cglm/simd/x86.h (new file, +136)

@@ -0,0 +1,136 @@
/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_x86_h
#define cglm_simd_x86_h
#include "intrin.h"
#ifdef CGLM_SIMD_x86

#ifdef CGLM_ALL_UNALIGNED
#  define glmm_load(p)      _mm_loadu_ps(p)
#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
#else
#  define glmm_load(p)      _mm_load_ps(p)
#  define glmm_store(p, a)  _mm_store_ps(p, a)
#endif

#ifdef CGLM_USE_INT_DOMAIN
#  define glmm_shuff1(xmm, z, y, x, w)                                        \
     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
                                        _MM_SHUFFLE(z, y, x, w)))
#else
#  define glmm_shuff1(xmm, z, y, x, w)                                        \
     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif

#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
     glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),           \
                 z1, y1, x1, w1)

#ifdef __AVX__
#  ifdef CGLM_ALL_UNALIGNED
#    define glmm_load256(p)      _mm256_loadu_ps(p)
#    define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
#  else
#    define glmm_load256(p)      _mm256_load_ps(p)
#    define glmm_store256(p, a)  _mm256_store_ps(p, a)
#  endif
#endif

/* horizontal add: sum of all four lanes, result in lane 0 */
static inline
__m128
glmm_vhadds(__m128 v) {
#if defined(__SSE3__)
  __m128 shuf, sums;
  shuf = _mm_movehdup_ps(v);        /* [b b d d]           */
  sums = _mm_add_ps(v, shuf);       /* [a+b 2b c+d 2d]     */
  shuf = _mm_movehl_ps(shuf, sums); /* lane 0 now c+d      */
  sums = _mm_add_ss(sums, shuf);    /* lane 0: a+b+c+d     */
  return sums;
#else
  __m128 shuf, sums;
  shuf = glmm_shuff1(v, 2, 3, 0, 1); /* [b a d c]          */
  sums = _mm_add_ps(v, shuf);        /* [a+b a+b c+d c+d]  */
  shuf = _mm_movehl_ps(shuf, sums);  /* lane 0 now c+d     */
  sums = _mm_add_ss(sums, shuf);     /* lane 0: a+b+c+d    */
  return sums;
#endif
}

static inline
float
glmm_hadd(__m128 v) {
  return _mm_cvtss_f32(glmm_vhadds(v));
}

static inline
__m128
glmm_vdots(__m128 a, __m128 b) {
#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
  return _mm_dp_ps(a, b, 0xFF);
#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
  __m128 x0, x1;
  x0 = _mm_mul_ps(a, b);
  x1 = _mm_hadd_ps(x0, x0);
  return _mm_hadd_ps(x1, x1);
#else
  return glmm_vhadds(_mm_mul_ps(a, b));
#endif
}

/* like glmm_vdots, but the fallback broadcasts the result to all lanes */
static inline
__m128
glmm_vdot(__m128 a, __m128 b) {
#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
  return _mm_dp_ps(a, b, 0xFF);
#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
  __m128 x0, x1;
  x0 = _mm_mul_ps(a, b);
  x1 = _mm_hadd_ps(x0, x0);
  return _mm_hadd_ps(x1, x1);
#else
  __m128 x0;
  x0 = _mm_mul_ps(a, b);
  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
#endif
}

static inline
float
glmm_dot(__m128 a, __m128 b) {
  return _mm_cvtss_f32(glmm_vdots(a, b));
}

static inline
float
glmm_norm(__m128 a) {
  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a))));
}

static inline
__m128
glmm_load3(float v[3]) {
  __m128i xy;
  __m128  z;

  xy = _mm_loadl_epi64((const __m128i *)v);
  z  = _mm_load_ss(&v[2]);

  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
}

static inline
void
glmm_store3(__m128 vx, float v[3]) {
  _mm_storel_pi((__m64 *)&v[0], vx);
  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
}

#endif
#endif /* cglm_simd_x86_h */
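
The faster dot-product paths above are opt-in: glmm_vdot/glmm_vdots only use _mm_dp_ps or _mm_hadd_ps when the ISA is enabled and the matching CGLM_SSE4_DOT / CGLM_SSE3_DOT macro is also defined; otherwise they fall back to shuffle+add. A usage sketch under those assumptions (build lines are ordinary GCC/Clang invocations; the program is illustrative):

/* build variants (hypothetical file dot.c):
 *   cc -O2 -msse2 dot.c                    -> shuffle+add fallback
 *   cc -O2 -msse3 -DCGLM_SSE3_DOT dot.c    -> _mm_hadd_ps path
 *   cc -O2 -msse4.1 -DCGLM_SSE4_DOT dot.c  -> _mm_dp_ps path
 */
#include <stdio.h>
#include "cglm/simd/intrin.h"

int main(void) {
  /* glmm_load is an aligned load by default, so align the arrays */
  float a[4] __attribute__((aligned(16))) = {1.0f, 2.0f, 3.0f, 4.0f};
  float b[4] __attribute__((aligned(16))) = {4.0f, 3.0f, 2.0f, 1.0f};
  __m128 va = glmm_load(a), vb = glmm_load(b);
  printf("hadd = %f\n", glmm_hadd(va));     /* 1 + 2 + 3 + 4 = 10 */
  printf("dot  = %f\n", glmm_dot(va, vb));  /* 4 + 6 + 6 + 4 = 20 */
  printf("norm = %f\n", glmm_norm(va));     /* sqrt(30), about 5.477 */
  return 0;
}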


@@ -200,24 +200,8 @@ glm_vec4_one(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_dot(vec4 a, vec4 b) {
-#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
-  return _mm_cvtss_f32(_mm_dp_ps(glmm_load(a), glmm_load(b), 0xFF));
-#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
-  __m128 x0, x1;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x1 = _mm_hadd_ps(x0, x0);
-  return _mm_cvtss_f32(_mm_hadd_ps(x1, x1));
-#elif defined(__SSE__) || defined(__SSE2__)
-  __m128 x0;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_cvtss_f32(_mm_add_ss(x0, glmm_shuff1(x0, 0, 1, 0, 1)));
-#elif defined(CGLM_NEON_FP)
-  float32x4_t v0, v1, v2;
-  v0 = vmulq_f32(vld1q_f32(a), vld1q_f32(b));
-  v1 = vaddq_f32(v0, vrev64q_f32(v0));
-  v2 = vaddq_f32(v1, vcombine_f32(vget_high_f32(v1), vget_low_f32(v1)));
-  return vgetq_lane_f32(v2, 0);
+#if defined(CGLM_SIMD)
+  return glmm_dot(glmm_load(a), glmm_load(b));
 #else
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 #endif
@@ -250,10 +234,8 @@ glm_vec4_norm2(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_norm(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  __m128 x0;
-  x0 = glmm_load(v);
-  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0)));
+#if defined(CGLM_SIMD)
+  return glmm_norm(glmm_load(v));
 #else
   return sqrtf(glm_vec4_dot(v, v));
 #endif
@@ -663,7 +645,7 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
   float  dot;
 
   x0   = glmm_load(v);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot  = _mm_cvtss_f32(xdot);
 
   if (dot == 0.0f) {
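
Since glm_vec4_dot and glm_vec4_norm now route through the shared glmm_* helpers on every SIMD target, the scalar branch doubles as a reference implementation. A hypothetical regression check (not part of the commit) that any backend must satisfy:

#include <assert.h>
#include <math.h>
#include "cglm/vec4.h"

/* hypothetical check: SIMD and scalar paths must agree */
void check_vec4_paths(void) {
  vec4 a = {1.0f, 2.0f, 3.0f, 4.0f};
  vec4 b = {4.0f, 3.0f, 2.0f, 1.0f};
  assert(fabsf(glm_vec4_dot(a, b) - 20.0f)        < 1e-6f);
  assert(fabsf(glm_vec4_norm(a)   - sqrtf(30.0f)) < 1e-6f);
}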


@@ -77,7 +77,9 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                     include/cglm/call/ease.h
 
 cglm_simddir=$(includedir)/cglm/simd
-cglm_simd_HEADERS = include/cglm/simd/intrin.h
+cglm_simd_HEADERS = include/cglm/simd/intrin.h \
+                    include/cglm/simd/x86.h \
+                    include/cglm/simd/arm.h
 
 cglm_simd_sse2dir=$(includedir)/cglm/simd/sse2
 cglm_simd_sse2_HEADERS = include/cglm/simd/sse2/affine.h \


@@ -69,6 +69,7 @@
     <ClInclude Include="..\include\cglm\plane.h" />
     <ClInclude Include="..\include\cglm\project.h" />
     <ClInclude Include="..\include\cglm\quat.h" />
+    <ClInclude Include="..\include\cglm\simd\arm.h" />
     <ClInclude Include="..\include\cglm\simd\avx\affine.h" />
     <ClInclude Include="..\include\cglm\simd\avx\mat4.h" />
     <ClInclude Include="..\include\cglm\simd\intrin.h" />
@@ -77,6 +78,7 @@
     <ClInclude Include="..\include\cglm\simd\sse2\mat3.h" />
     <ClInclude Include="..\include\cglm\simd\sse2\mat4.h" />
     <ClInclude Include="..\include\cglm\simd\sse2\quat.h" />
+    <ClInclude Include="..\include\cglm\simd\x86.h" />
     <ClInclude Include="..\include\cglm\sphere.h" />
     <ClInclude Include="..\include\cglm\types.h" />
     <ClInclude Include="..\include\cglm\util.h" />


@@ -233,5 +233,11 @@
     <ClInclude Include="..\include\cglm\ease.h">
       <Filter>include\cglm</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\arm.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\x86.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>