From 009405adcd14875c8b076eac4459f06e24f3db05 Mon Sep 17 00:00:00 2001 From: myfreeer Date: Sat, 2 Dec 2023 09:54:49 +0800 Subject: [PATCH] wasm: prefer pmin/pmax According to [emscripten](https://emscripten.org/docs/porting/simd.html) and [v8](https://github.com/v8/v8/blob/b6520eda5eafc3b007a5641b37136dfc9d92f63d/src/compiler/backend/x64/code-generator-x64.cc#L2661-L2699), `[f32x4|f64x2].[min|max]` compiles to many more instructions than `[f32x4|f64x2].[pmin|pmax]`. It is defined in the [spec](https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md#floating-point-min-and-max) that the difference between pmin/pmax and min/max is NaN-propagating behavior, and the equivalent to the x86 `_mm_min_ps`/`_mm_max_ps` is pmin/pmax in [v8](https://github.com/v8/v8/blob/b6520eda5eafc3b007a5641b37136dfc9d92f63d/src/compiler/backend/x64/code-generator-x64.cc#L2740-L2747). This should make functions with min/max faster on WebAssembly, and align with the existing behavior on x86 SSE. --- include/cglm/vec4.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 5c9ff6e..ae99ff6 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -649,7 +649,7 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_add( glmm_load(dest), - wasm_f32x4_max(glmm_load(a), glmm_load(b)))); + wasm_f32x4_pmax(glmm_load(a), glmm_load(b)))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_max_ps(glmm_load(a), @@ -681,7 +681,7 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_add( glmm_load(dest), - wasm_f32x4_min(glmm_load(a), glmm_load(b)))); + wasm_f32x4_pmin(glmm_load(a), glmm_load(b)))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(dest), _mm_min_ps(glmm_load(a), @@ -854,7
+854,7 @@ CGLM_INLINE void glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_max(glmm_load(a), glmm_load(b))); + glmm_store(dest, wasm_f32x4_pmax(glmm_load(a), glmm_load(b))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b))); #elif defined(CGLM_NEON_FP) @@ -878,7 +878,7 @@ CGLM_INLINE void glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_min(glmm_load(a), glmm_load(b))); + glmm_store(dest, wasm_f32x4_pmin(glmm_load(a), glmm_load(b))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b))); #elif defined(CGLM_NEON_FP) @@ -902,8 +902,8 @@ CGLM_INLINE void glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(v, wasm_f32x4_min( - wasm_f32x4_max(glmm_load(v), wasm_f32x4_splat(minVal)), + glmm_store(v, wasm_f32x4_pmin( + wasm_f32x4_pmax(glmm_load(v), wasm_f32x4_splat(minVal)), wasm_f32x4_splat(maxVal))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),