From 65292a94a6d3ab7b0d0b2d91668a69a588feb0c1 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Sat, 24 Apr 2021 00:00:00 +0300
Subject: [PATCH] swizzling functions for NEON

---
 CREDITS                 |  5 +++++
 include/cglm/simd/arm.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/CREDITS b/CREDITS
index daeb27e..94d9d40 100644
--- a/CREDITS
+++ b/CREDITS
@@ -74,3 +74,8 @@ Link to paper: http://webserver2.tecgraf.puc-rio.br/~mgattass/cg/trbRR/Fast%20Mi
 14. ARM NEON: Matrix Vector Multiplication
 
 https://stackoverflow.com/a/57793352/2676533
+
+15. ARM NEON Vector Swizzling and Permute
+
+https://stackoverflow.com/questions/32536265/how-to-convert-mm-shuffle-ps-sse-intrinsic-to-neon-intrinsic
+http://github.com/microsoft/DirectXMath
diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
index 4c8a3b4..1a5adc9 100644
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -21,6 +21,46 @@
 #define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0)
 #define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1)
 
+#define SWIZZLE(NAME) \
+  static inline float32x4_t NAME(glmm_128 v)
+
+/* TODO:
+ * I'm not sure if glmm_xxxx(), glmm_yyyy()... is better than glmm_0000()...
+ */
+
+SWIZZLE(glmm_0000) { return vdupq_lane_f32(vget_low_f32(v), 0); }
+SWIZZLE(glmm_1111) { return vdupq_lane_f32(vget_low_f32(v), 1); }
+SWIZZLE(glmm_2222) { return vdupq_lane_f32(vget_high_f32(v), 0); }
+SWIZZLE(glmm_3333) { return vdupq_lane_f32(vget_high_f32(v), 1); }
+
+SWIZZLE(glmm_1032) { return vrev64q_f32(v); }
+
+SWIZZLE(glmm_0101) { float32x2_t vt = vget_low_f32(v); return vcombine_f32(vt, vt); }
+SWIZZLE(glmm_2323) { float32x2_t vt = vget_high_f32(v); return vcombine_f32(vt, vt); }
+SWIZZLE(glmm_1010) { float32x2_t vt = vrev64_f32(vget_low_f32(v)); return vcombine_f32(vt, vt); }
+SWIZZLE(glmm_3232) { float32x2_t vt = vrev64_f32(vget_high_f32(v)); return vcombine_f32(vt, vt); }
+
+SWIZZLE(glmm_0132) { return vcombine_f32(vget_low_f32(v), vrev64_f32(vget_high_f32(v))); }
+SWIZZLE(glmm_1023) { return vcombine_f32(vrev64_f32(vget_low_f32(v)), vget_high_f32(v)); }
+SWIZZLE(glmm_2310) { return vcombine_f32(vget_high_f32(v), vrev64_f32(vget_low_f32(v))); }
+SWIZZLE(glmm_3201) { return vcombine_f32(vrev64_f32(vget_high_f32(v)), vget_low_f32(v)); }
+SWIZZLE(glmm_3210) { return vcombine_f32(vrev64_f32(vget_high_f32(v)), vrev64_f32(vget_low_f32(v))); }
+
+SWIZZLE(glmm_0022) { return vtrnq_f32(v, v).val[0]; }
+SWIZZLE(glmm_1133) { return vtrnq_f32(v, v).val[1]; }
+
+SWIZZLE(glmm_0011) { return vzipq_f32(v, v).val[0]; }
+SWIZZLE(glmm_2233) { return vzipq_f32(v, v).val[1]; }
+
+SWIZZLE(glmm_0202) { return vuzpq_f32(v, v).val[0]; }
+SWIZZLE(glmm_1313) { return vuzpq_f32(v, v).val[1]; }
+
+SWIZZLE(glmm_1230) { return vextq_f32(v, v, 1); }
+SWIZZLE(glmm_2301) { return vextq_f32(v, v, 2); }
+SWIZZLE(glmm_3012) { return vextq_f32(v, v, 3); }
+
+#undef SWIZZLE
+
 static inline float32x4_t
 glmm_abs(float32x4_t v) {
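
Review note (not part of the patch): in the helpers above, the four digits
name the source lanes of the result from lane 0 to lane 3, e.g.
glmm_2301(v) yields { v[2], v[3], v[0], v[1] }. The minimal standalone
sketch below demonstrates that lane semantics on an AArch64/NEON target.
The neon_2301()/neon_0022() helpers are hypothetical stand-ins that mirror
two of the patched functions so the snippet compiles without cglm (in
cglm, glmm_128 is float32x4_t on NEON, so the bodies match):

  #include <arm_neon.h>
  #include <stdio.h>

  /* stand-in for glmm_2301: vext on (v, v) rotates the lanes down by two,
     giving { v2, v3, v0, v1 } */
  static inline float32x4_t neon_2301(float32x4_t v) {
    return vextq_f32(v, v, 2);
  }

  /* stand-in for glmm_0022: the even-lane half of a self-transpose,
     giving { v0, v0, v2, v2 } */
  static inline float32x4_t neon_0022(float32x4_t v) {
    return vtrnq_f32(v, v).val[0];
  }

  int main(void) {
    float in[4] = {0.0f, 1.0f, 2.0f, 3.0f}, out[4];
    float32x4_t v = vld1q_f32(in);

    vst1q_f32(out, neon_2301(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 2 3 0 1 */

    vst1q_f32(out, neon_0022(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 0 2 2 */
    return 0;
  }

Only these fixed patterns are provided because NEON has no immediate-
controlled shuffle comparable to SSE's _mm_shuffle_ps; each pattern is
built from vdupq/vrev64q/vcombine/vtrnq/vzipq/vuzpq/vextq instead, which
is the approach described in the two references added to CREDITS.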