Merge pull request #379 from recp/simd_min_max

simd: min / max helpers
This commit is contained in:
Recep Aslantas
2024-01-11 00:21:10 +03:00
committed by GitHub
4 changed files with 41 additions and 63 deletions

View File

@@ -56,11 +56,9 @@ glmm_float32x4_init(float x, float y, float z, float w) {
#define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f) #define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f)
#define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f) #define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f)
static inline static inline float32x4_t glmm_abs(float32x4_t v) { return vabsq_f32(v); }
float32x4_t static inline float32x4_t glmm_min(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); }
glmm_abs(float32x4_t v) { static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); }
return vabsq_f32(v);
}
static inline static inline
float32x4_t float32x4_t

View File

@@ -34,13 +34,11 @@
#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf) #define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf)
#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0) #define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0)
#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf) #define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf)
#define glmm_float32x4_SIGNMASK_NEG wasm_i32x4_const_splat(GLMM_NEGZEROf) #define glmm_float32x4_SIGNMASK_NEG wasm_i32x4_const_splat(GLMM_NEGZEROf)
static inline static inline glmm_128 glmm_abs(glmm_128 x) { return wasm_f32x4_abs(x); }
glmm_128 static inline glmm_128 glmm_min(glmm_128 a, glmm_128 b) { return wasm_f32x4_pmin(b, a); }
glmm_abs(glmm_128 x) { static inline glmm_128 glmm_max(glmm_128 a, glmm_128 b) { return wasm_f32x4_pmax(b, a); }
return wasm_f32x4_abs(x);
}
static inline static inline
glmm_128 glmm_128
@@ -74,7 +72,7 @@ glmm_128
glmm_vhmin(glmm_128 v) { glmm_vhmin(glmm_128 v) {
glmm_128 x0, x1, x2; glmm_128 x0, x1, x2;
x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */ x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */
x1 = wasm_f32x4_pmin(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ x1 = wasm_f32x4_pmin(x0, v); /* [0|2, 1|3, 2|2, 3|3] */
x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */
return wasm_f32x4_pmin(x1, x2); return wasm_f32x4_pmin(x1, x2);
} }
@@ -90,7 +88,7 @@ glmm_128
glmm_vhmax(glmm_128 v) { glmm_vhmax(glmm_128 v) {
glmm_128 x0, x1, x2; glmm_128 x0, x1, x2;
x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */ x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */
x1 = wasm_f32x4_pmax(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ x1 = wasm_f32x4_pmax(x0, v); /* [0|2, 1|3, 2|2, 3|3] */
x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */
/* _mm_max_ss */ /* _mm_max_ss */
return wasm_i32x4_shuffle(x1, wasm_f32x4_pmax(x1, x2), 4, 1, 2, 3); return wasm_i32x4_shuffle(x1, wasm_f32x4_pmax(x1, x2), 4, 1, 2, 3);

View File

@@ -74,6 +74,9 @@ glmm_abs(__m128 x) {
return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x); return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x);
} }
static inline __m128 glmm_min(__m128 a, __m128 b) { return _mm_min_ps(a, b); }
static inline __m128 glmm_max(__m128 a, __m128 b) { return _mm_max_ps(a, b); }
static inline static inline
__m128 __m128
glmm_vhadd(__m128 v) { glmm_vhadd(__m128 v) {

View File

@@ -653,17 +653,14 @@ CGLM_INLINE
void void
glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(__wasm__) && defined(__wasm_simd128__)
glmm_store(dest, wasm_f32x4_add( glmm_store(dest, wasm_f32x4_add(glmm_load(dest),
glmm_load(dest), glmm_max(glmm_load(a), glmm_load(b))));
wasm_f32x4_pmax(glmm_load(a), glmm_load(b))));
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, _mm_add_ps(glmm_load(dest), glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_max_ps(glmm_load(a), glmm_max(glmm_load(a), glmm_load(b))));
glmm_load(b))));
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), glmm_store(dest, vaddq_f32(glmm_load(dest),
vmaxq_f32(vld1q_f32(a), glmm_max(glmm_load(a), glmm_load(b))));
vld1q_f32(b))));
#else #else
dest[0] += glm_max(a[0], b[0]); dest[0] += glm_max(a[0], b[0]);
dest[1] += glm_max(a[1], b[1]); dest[1] += glm_max(a[1], b[1]);
@@ -685,17 +682,14 @@ CGLM_INLINE
void void
glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(__wasm__) && defined(__wasm_simd128__)
glmm_store(dest, wasm_f32x4_add( glmm_store(dest, wasm_f32x4_add(glmm_load(dest),
glmm_load(dest), glmm_min(glmm_load(a), glmm_load(b))));
wasm_f32x4_pmin(glmm_load(a), glmm_load(b))));
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, _mm_add_ps(glmm_load(dest), glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_min_ps(glmm_load(a), glmm_min(glmm_load(a), glmm_load(b))));
glmm_load(b))));
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), glmm_store(dest, vaddq_f32(glmm_load(dest),
vminq_f32(vld1q_f32(a), glmm_min(glmm_load(a), glmm_load(b))));
vld1q_f32(b))));
#else #else
dest[0] += glm_min(a[0], b[0]); dest[0] += glm_min(a[0], b[0]);
dest[1] += glm_min(a[1], b[1]); dest[1] += glm_min(a[1], b[1]);
@@ -825,17 +819,14 @@ CGLM_INLINE
void void
glm_vec4_maxsub(vec4 a, vec4 b, vec4 dest) { glm_vec4_maxsub(vec4 a, vec4 b, vec4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(__wasm__) && defined(__wasm_simd128__)
glmm_store(dest, wasm_f32x4_sub( glmm_store(dest, wasm_f32x4_sub(glmm_load(dest),
glmm_load(dest), glmm_max(glmm_load(a), glmm_load(b))));
wasm_f32x4_pmax(glmm_load(a), glmm_load(b))));
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, _mm_sub_ps(glmm_load(dest), glmm_store(dest, _mm_sub_ps(glmm_load(dest),
_mm_max_ps(glmm_load(a), glmm_max(glmm_load(a), glmm_load(b))));
glmm_load(b))));
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vsubq_f32(vld1q_f32(dest), glmm_store(dest, vsubq_f32(glmm_load(dest),
vmaxq_f32(vld1q_f32(a), glmm_max(glmm_load(a), glmm_load(b))));
vld1q_f32(b))));
#else #else
dest[0] -= glm_max(a[0], b[0]); dest[0] -= glm_max(a[0], b[0]);
dest[1] -= glm_max(a[1], b[1]); dest[1] -= glm_max(a[1], b[1]);
@@ -857,17 +848,14 @@ CGLM_INLINE
void void
glm_vec4_minsub(vec4 a, vec4 b, vec4 dest) { glm_vec4_minsub(vec4 a, vec4 b, vec4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(__wasm__) && defined(__wasm_simd128__)
glmm_store(dest, wasm_f32x4_sub( glmm_store(dest, wasm_f32x4_sub(glmm_load(dest),
glmm_load(dest), glmm_min(glmm_load(a), glmm_load(b))));
wasm_f32x4_pmin(glmm_load(a), glmm_load(b))));
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, _mm_sub_ps(glmm_load(dest), glmm_store(dest, _mm_sub_ps(glmm_load(dest),
_mm_min_ps(glmm_load(a), glmm_min(glmm_load(a), glmm_load(b))));
glmm_load(b))));
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vsubq_f32(vld1q_f32(dest), glmm_store(dest, vsubq_f32(vld1q_f32(dest),
vminq_f32(vld1q_f32(a), glmm_min(glmm_load(a), glmm_load(b))));
vld1q_f32(b))));
#else #else
dest[0] -= glm_min(a[0], b[0]); dest[0] -= glm_min(a[0], b[0]);
dest[1] -= glm_min(a[1], b[1]); dest[1] -= glm_min(a[1], b[1]);
@@ -1031,12 +1019,8 @@ glm_vec4_distance2(vec4 a, vec4 b) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(CGLM_SIMD)
glmm_store(dest, wasm_f32x4_pmax(glmm_load(a), glmm_load(b))); glmm_store(dest, glmm_max(glmm_load(a), glmm_load(b)));
#elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b)));
#elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b)));
#else #else
dest[0] = glm_max(a[0], b[0]); dest[0] = glm_max(a[0], b[0]);
dest[1] = glm_max(a[1], b[1]); dest[1] = glm_max(a[1], b[1]);
@@ -1055,12 +1039,8 @@ glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { glm_vec4_minv(vec4 a, vec4 b, vec4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(CGLM_SIMD)
glmm_store(dest, wasm_f32x4_pmin(glmm_load(a), glmm_load(b))); glmm_store(dest, glmm_min(glmm_load(a), glmm_load(b)));
#elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b)));
#elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b)));
#else #else
dest[0] = glm_min(a[0], b[0]); dest[0] = glm_min(a[0], b[0]);
dest[1] = glm_min(a[1], b[1]); dest[1] = glm_min(a[1], b[1]);
@@ -1080,14 +1060,13 @@ CGLM_INLINE
void void
glm_vec4_clamp(vec4 v, float minVal, float maxVal) { glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
#if defined(__wasm__) && defined(__wasm_simd128__) #if defined(__wasm__) && defined(__wasm_simd128__)
glmm_store(v, wasm_f32x4_pmin( glmm_store(v, glmm_min(glmm_max(glmm_load(v), wasm_f32x4_splat(minVal)),
wasm_f32x4_pmax(glmm_load(v), wasm_f32x4_splat(minVal)), wasm_f32x4_splat(maxVal)));
wasm_f32x4_splat(maxVal)));
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), glmm_store(v, glmm_min(glmm_max(glmm_load(v), _mm_set1_ps(minVal)),
_mm_set1_ps(maxVal))); _mm_set1_ps(maxVal)));
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)), glmm_store(v, glmm_min(glmm_max(vld1q_f32(v), vdupq_n_f32(minVal)),
vdupq_n_f32(maxVal))); vdupq_n_f32(maxVal)));
#else #else
v[0] = glm_clamp(v[0], minVal, maxVal); v[0] = glm_clamp(v[0], minVal, maxVal);