diff --git a/include/cglm/call/quat.h b/include/cglm/call/quat.h
index 576f43b..ee45f0c 100644
--- a/include/cglm/call/quat.h
+++ b/include/cglm/call/quat.h
@@ -39,7 +39,7 @@ glmc_quat_dot(versor q, versor r);
 
 CGLM_EXPORT
 void
-glmc_quat_mulv(versor q1, versor q2, versor dest);
+glmc_quat_mul(versor p, versor q, versor dest);
 
 CGLM_EXPORT
 void
diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index e73bf8b..e1112d5 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -11,15 +11,27 @@
    GLM_QUAT_IDENTITY
 
  Functions:
-   CGLM_INLINE void glm_quat_identity(versor q);
-   CGLM_INLINE void glm_quat(versor q, float angle, float x, float y, float z);
-   CGLM_INLINE void glm_quatv(versor q, float angle, vec3 axis);
+   CGLM_INLINE void  glm_quat_identity(versor q);
+   CGLM_INLINE void  glm_quat_init(versor q, float x, float y, float z, float w);
+   CGLM_INLINE void  glm_quat(versor q, float angle, float x, float y, float z);
+   CGLM_INLINE void  glm_quatv(versor q, float angle, vec3 axis);
    CGLM_INLINE float glm_quat_norm(versor q);
-   CGLM_INLINE void glm_quat_normalize(versor q);
-   CGLM_INLINE float glm_quat_dot(versor q, versor r);
-   CGLM_INLINE void glm_quat_mulv(versor q1, versor q2, versor dest);
-   CGLM_INLINE void glm_quat_mat4(versor q, mat4 dest);
-   CGLM_INLINE void glm_quat_slerp(versor q, versor r, float t, versor dest);
+   CGLM_INLINE void  glm_quat_normalize(versor q);
+   CGLM_INLINE float glm_quat_dot(versor q1, versor q2);
+   CGLM_INLINE void  glm_quat_conjugate(versor q, versor dest);
+   CGLM_INLINE void  glm_quat_inv(versor q, versor dest);
+   CGLM_INLINE void  glm_quat_add(versor p, versor q, versor dest);
+   CGLM_INLINE void  glm_quat_sub(versor p, versor q, versor dest);
+   CGLM_INLINE float glm_quat_real(versor q);
+   CGLM_INLINE void  glm_quat_imag(versor q, vec3 dest);
+   CGLM_INLINE void  glm_quat_imagn(versor q, vec3 dest);
+   CGLM_INLINE float glm_quat_imaglen(versor q);
+   CGLM_INLINE float glm_quat_angle(versor q);
+   CGLM_INLINE void  glm_quat_axis(versor q, versor dest);
+   CGLM_INLINE void  glm_quat_mul(versor p, versor q, versor dest);
+   CGLM_INLINE void  glm_quat_mat4(versor q, mat4 dest);
+   CGLM_INLINE void  glm_quat_mat3(versor q, mat3 dest);
+   CGLM_INLINE void  glm_quat_slerp(versor q, versor r, float t, versor dest);
  */
 
 #ifndef cglm_quat_h
@@ -164,24 +176,6 @@ glm_quat_dot(versor q1, versor q2) {
   return glm_vec4_dot(q1, q2);
 }
 
-/*!
- * @brief multiplies two quaternion and stores result in dest
- *
- * @param[in] q1 quaternion 1
- * @param[in] q2 quaternion 2
- * @param[out] dest result quaternion
- */
-CGLM_INLINE
-void
-glm_quat_mulv(versor q1, versor q2, versor dest) {
-  dest[0] = q2[0] * q1[0] - q2[1] * q1[1] - q2[2] * q1[2] - q2[3] * q1[3];
-  dest[1] = q2[0] * q1[1] + q2[1] * q1[0] - q2[2] * q1[3] + q2[3] * q1[2];
-  dest[2] = q2[0] * q1[2] + q2[1] * q1[3] + q2[2] * q1[0] - q2[3] * q1[1];
-  dest[3] = q2[0] * q1[3] - q2[1] * q1[2] + q2[2] * q1[1] + q2[3] * q1[0];
-
-  glm_quat_normalize(dest);
-}
-
 /*!
  * @brief conjugate of quaternion
  *
@@ -310,6 +304,37 @@ glm_quat_axis(versor q, versor dest) {
   glm_quat_imagn(q, dest);
 }
 
+/*!
+ * @brief multiplies two quaternions and stores result in dest
+ *        this is also called the Hamilton product
+ *
+ * According to Wikipedia:
+ * The product of two rotation quaternions will be
+ * equivalent to the rotation q followed by the rotation p
+ *
+ * @param[in]  p     quaternion 1
+ * @param[in]  q     quaternion 2
+ * @param[out] dest  result quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_mul(versor p, versor q, versor dest) {
+  /*
+    + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i
+    + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j
+    + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
+      a1 a2 − b1 b2 − c1 c2 − d1 d2
+   */
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_quat_mul_sse2(p, q, dest);
+#else
+  dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1];
+  dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0];
+  dest[2] = p[3] * q[2] + p[0] * q[1] - p[1] * q[0] + p[2] * q[3];
+  dest[3] = p[3] * q[3] - p[0] * q[0] - p[1] * q[1] - p[2] * q[2];
+#endif
+}
+
 /*!
  * @brief convert quaternion to mat4
  *
diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h
index b3420a7..59438f4 100644
--- a/include/cglm/simd/sse2/quat.h
+++ b/include/cglm/simd/sse2/quat.h
@@ -12,6 +12,35 @@
 #include "../../common.h"
 #include "../intrin.h"
 
+CGLM_INLINE
+void
+glm_quat_mul_sse2(versor p, versor q, versor dest) {
+  /*
+    + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i
+    + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j
+    + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
+      a1 a2 − b1 b2 − c1 c2 − d1 d2
+   */
+
+  __m128 xp, xq, x0, r;
+
+  xp = _mm_load_ps(p); /* 3 2 1 0 */
+  xq = _mm_load_ps(q);
+
+  r  = _mm_mul_ps(_mm_shuffle1_ps1(xp, 3), xq);
+
+  x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 0), _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
+  r  = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 0, 1, 2, 3)));
+
+  x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 1), _mm_set_ps(-0.f, -0.f, 0.f, 0.f));
+  r  = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 1, 0, 3, 2)));
+
+  x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f));
+  r  = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 2, 3, 0, 1)));
+
+  _mm_store_ps(dest, r);
+}
+
 CGLM_INLINE
 void
 glm_quat_slerp_sse2(versor q,
diff --git a/src/quat.c b/src/quat.c
index 35ec890..b1cfe90 100644
--- a/src/quat.c
+++ b/src/quat.c
@@ -46,8 +46,8 @@ glmc_quat_dot(versor q, versor r) {
 
 CGLM_EXPORT
 void
-glmc_quat_mulv(versor q1, versor q2, versor dest) {
-  glm_quat_mul(q1, q2, dest);
+glmc_quat_mul(versor p, versor q, versor dest) {
+  glm_quat_mul(p, q, dest);
 }
 
 CGLM_EXPORT
@@ -64,9 +64,6 @@ glmc_quat_mat3(versor q, mat3 dest) {
 
 CGLM_EXPORT
 void
-glmc_quat_slerp(versor q,
-                versor r,
-                float t,
-                versor dest) {
+glmc_quat_slerp(versor q, versor r, float t, versor dest) {
   glm_quat_slerp(q, r, t, dest);
 }
diff --git a/test/src/test_common.c b/test/src/test_common.c
index b824b37..bd6deaf 100644
--- a/test/src/test_common.c
+++ b/test/src/test_common.c
@@ -92,10 +92,18 @@ test_assert_vec3_eq(vec3 v1, vec3 v2) {
 }
 
 void
-test_assert_quat_eq(versor v1, versor v2) {
-  assert_true(fabsf(v1[0] - v2[0]) <= 0.0009); /* rounding errors */
-  assert_true(fabsf(v1[1] - v2[1]) <= 0.0009);
-  assert_true(fabsf(v1[2] - v2[2]) <= 0.0009);
-  assert_true(fabsf(v1[3] - v2[3]) <= 0.0009);
+test_assert_quat_eq_abs(versor v1, versor v2) {
+  assert_true(fabsf(fabsf(v1[0]) - fabsf(v2[0])) <= 0.0009); /* rounding errors */
+  assert_true(fabsf(fabsf(v1[1]) - fabsf(v2[1])) <= 0.0009);
+  assert_true(fabsf(fabsf(v1[2]) - fabsf(v2[2])) <= 0.0009);
+  assert_true(fabsf(fabsf(v1[3]) - fabsf(v2[3])) <= 0.0009);
+}
+
+void
+test_assert_quat_eq(versor v1, versor v2) {
+  assert_true(fabsf(v1[0] - v2[0]) <= 0.0000009); /* rounding errors */
+  assert_true(fabsf(v1[1] - v2[1]) <= 0.0000009);
+  assert_true(fabsf(v1[2] - v2[2]) <= 0.0000009);
+  assert_true(fabsf(v1[3] - v2[3]) <= 0.0000009);
 }
 
diff --git a/test/src/test_common.h b/test/src/test_common.h
index 477e59d..c95405e 100644
--- a/test/src/test_common.h
+++ b/test/src/test_common.h
@@ -37,6 +37,9 @@ test_assert_vec3_eq(vec3 v1, vec3 v2);
 void
 test_assert_quat_eq(versor v1, versor v2);
 
+void
+test_assert_quat_eq_abs(versor v1, versor v2);
+
 void
 test_rand_vec3(vec3 dest);
 
diff --git a/test/src/test_quat.c b/test/src/test_quat.c
index 36d2ef5..723dea1 100644
--- a/test/src/test_quat.c
+++ b/test/src/test_quat.c
@@ -7,10 +7,19 @@
 
 #include "test_common.h"
 
+CGLM_INLINE
+void
+test_quat_mul_raw(versor p, versor q, versor dest) {
+  dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1];
+  dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0];
+  dest[2] = p[3] * q[2] + p[0] * q[1] - p[1] * q[0] + p[2] * q[3];
+  dest[3] = p[3] * q[3] - p[0] * q[0] - p[1] * q[1] - p[2] * q[2];
+}
+
 void
 test_quat(void **state) {
   mat4   inRot, outRot;
-  versor inQuat, outQuat;
+  versor inQuat, outQuat, q3, q4;
   int    i;
 
   for (i = 0; i < 1000; i++) {
@@ -18,7 +27,12 @@ test_quat(void **state) {
     glmc_quat_mat4(inQuat, inRot);
     glmc_mat4_quat(inRot, outQuat);
     glmc_quat_mat4(outQuat, outRot);
-    test_assert_quat_eq(inQuat, outQuat);
+    test_assert_quat_eq_abs(inQuat, outQuat);
     test_assert_mat4_eq2(inRot, outRot, 0.000009); /* almost equal */
+
+    test_quat_mul_raw(inQuat, outQuat, q3);
+    glm_quat_mul_sse2(inQuat, outQuat, q4);
+
+    test_assert_quat_eq(q3, q4);
   }
 }
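
Usage sketch (not part of the patch above): per the Hamilton-product convention documented in glm_quat_mul, glm_quat_mul(p, q, dest) composes "q first, then p", so converting the product to a matrix should match multiplying the two rotation matrices in the same order. The test name test_quat_mul_order, the axis variables, and the angle literals below are illustrative assumptions; glm_quatv, glm_quat_mul and glm_quat_mat4 come from this patch's headers, while glm_mat4_mul and test_assert_mat4_eq2 are existing cglm / test helpers outside this diff.

/* Illustrative only, not part of the diff above */
#include "test_common.h"

void
test_quat_mul_order(void **state) {
  versor p, q, pq;
  mat4   mp, mq, expected, actual;
  vec3   zAxis = {0.0f, 0.0f, 1.0f};
  vec3   xAxis = {1.0f, 0.0f, 0.0f};

  glm_quatv(p, 0.785398f, zAxis);  /* ~45 deg around Z */
  glm_quatv(q, 1.570796f, xAxis);  /* ~90 deg around X */

  /* Hamilton product: rotate by q first, then by p */
  glm_quat_mul(p, q, pq);

  glm_quat_mat4(p,  mp);
  glm_quat_mat4(q,  mq);
  glm_mat4_mul(mp, mq, expected);  /* same composition in matrix form */
  glm_quat_mat4(pq, actual);

  test_assert_mat4_eq2(expected, actual, 0.000009);
}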
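
The component expansion used in glm_quat_mul and test_quat_mul_raw can also be sanity-checked against the quaternion basis identities i * j = k and j * i = -k, which fall straight out of the four dest[] lines with the x, y, z, w storage order used above (w in dest[3]). A minimal, hypothetical check in the style of the existing tests; test_quat_mul_basis and the versor variables are illustrative names, not functions in the repository:

/* Illustrative only, not part of the diff above */
void
test_quat_mul_basis(void **state) {
  /* x, y, z, w layout: i = (1,0,0,0), j = (0,1,0,0), k = (0,0,1,0) */
  versor qi = {1.0f, 0.0f, 0.0f, 0.0f};
  versor qj = {0.0f, 1.0f, 0.0f, 0.0f};
  versor qk = {0.0f, 0.0f, 1.0f, 0.0f};
  versor nk = {0.0f, 0.0f, -1.0f, 0.0f};
  versor r;

  glm_quat_mul(qi, qj, r);
  test_assert_quat_eq(r, qk);   /* i * j =  k */

  glm_quat_mul(qj, qi, r);
  test_assert_quat_eq(r, nk);   /* j * i = -k, the product is not commutative */
}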