diff --git a/.gitignore b/.gitignore
index 180c5d9..d500b97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,4 +59,13 @@ cglm_test_ios/*
 cglm_test_iosTests/*
 docs/build/*
 win/cglm_test_*
-* copy.*
+* copy.*
+*.o
+*.obj
+*codeanalysis.*.xml
+*codeanalysis.xml
+*.lib
+*.tlog
+win/x64
+win/x86
+win/Debug
diff --git a/.travis.yml b/.travis.yml
index af4e37f..c2c0c84 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -49,7 +49,7 @@ script:
 
 after_success:
   - if [[ "$CC" == "gcc" && "$CODE_COVERAGE" == "ON" ]]; then
-      pip install --user cpp-coveralls
+      pip install --user cpp-coveralls && 
       coveralls
         --build-root .
         --exclude lib
diff --git a/CREDITS b/CREDITS
index 810f0b4..44e55a8 100644
--- a/CREDITS
+++ b/CREDITS
@@ -43,3 +43,10 @@ https://github.com/erich666/GraphicsGems/blob/master/gems/TransBox.c
 6. Cull frustum
 http://www.txutxi.com/?p=584
 http://old.cescg.org/CESCG-2002/DSykoraJJelinek/
+
+7. Quaternions
+Initial mat4_quat is borrowed from Apple's simd library
+
+
+8. Vector Rotation using Quaternion
+https://gamedev.stackexchange.com/questions/28395/rotating-vector3-by-a-quaternion
diff --git a/README.md b/README.md
index dc9f736..ff1b4df 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,11 @@ Complete documentation: http://cglm.readthedocs.io
 - _dup (duplicate) is changed to _copy. For instance `glm_vec_dup -> glm_vec_copy`
 - OpenGL related functions are dropped to make this lib platform/third-party independent
 - make sure you have latest version and feel free to report bugs, troubles
-- **[bugfix]** euler angles was implemented in reverse order (extrinsic) it was fixed, now they are intrinsic. Make sure that you have the latest version
+- **[bugfix]** euler angles was implemented in reverse order (extrinsic) it was fixed, now they are intrinsic. Make sure that 
+you have the latest version
+- **[major change]** starting with v0.4.0, quaternions are stored as [x, y, z, w]; they were stored as [w, x, y, z] in v0.3.5 and earlier versions
+- **[api rename]** starting with v0.4.5, **glm_simd** functions are renamed to **glmm_**
+- **[new option]** starting with v0.4.5, you can disable the alignment requirement; check the options in the docs.
 
 #### Note for C++ developers:
 If you don't aware about original GLM library yet, you may also want to look at:
diff --git a/autogen.sh b/autogen.sh
index f8d13c8..b3b8d82 100644
--- a/autogen.sh
+++ b/autogen.sh
@@ -8,17 +8,14 @@
 
 cd $(dirname "$0")
 
-if [ "$(uname)" = "Darwin" ]; then
-libtoolBin=$(which glibtoolize)
-libtoolBinDir=$(dirname "${libtoolBin}")
-
-if [ ! -f "${libtoolBinDir}/libtoolize" ]; then
-ln -s $libtoolBin "${libtoolBinDir}/libtoolize"
-fi
-fi
-
 autoheader
-libtoolize
+
+if [ "$(uname)" = "Darwin" ]; then
+  glibtoolize
+else
+  libtoolize
+fi
+
 aclocal -I m4
 autoconf
 automake --add-missing --copy
diff --git a/build-deps.sh b/build-deps.sh
index 74b1cfa..20365ba 100644
--- a/build-deps.sh
+++ b/build-deps.sh
@@ -9,19 +9,8 @@
 # check if deps are pulled
 git submodule update --init --recursive
 
-# fix glibtoolize
-
 cd $(dirname "$0")
 
-if [ "$(uname)" = "Darwin" ]; then
-  libtoolBin=$(which glibtoolize)
-  libtoolBinDir=$(dirname "${libtoolBin}")
-
-  if [ ! -f "${libtoolBinDir}/libtoolize" ]; then
-    ln -s $libtoolBin "${libtoolBinDir}/libtoolize"
-  fi
-fi
-
 # general deps: gcc make autoconf automake libtool cmake
 
 # test - cmocka
diff --git a/cglm.podspec b/cglm.podspec
new file mode 100644
index 0000000..150ca86
--- /dev/null
+++ b/cglm.podspec
@@ -0,0 +1,28 @@
+Pod::Spec.new do |s|
+
+  # Identity: pod name, version, one-line summary and long description
+  s.name         = "cglm"
+  s.version      = "0.4.4"
+  s.summary      = "📽 Optimized OpenGL/Graphics Math (glm) for C"
+  s.description  = <<-DESC
+cglm is math library for graphics programming for C. It is similar to original glm but it is written for C instead of C++ (you can use here too). See the documentation or README for all features.
+                   DESC
+
+  s.documentation_url = "http://cglm.readthedocs.io"
+
+  # Project metadata: homepage, license file and author contact
+  s.homepage     = "https://github.com/recp/cglm"
+  s.license      = { :type => "MIT", :file => "LICENSE" }
+  s.author       = { "Recep Aslantas" => "recp@acm.org" }
+
+  # Sources: git location (tag is "v" + s.version), source/header globs, exclusions and header layout
+  s.source               = { :git => "https://github.com/recp/cglm.git", :tag => "v#{s.version}" }
+  s.source_files         = "src", "include/cglm/**/*.h"
+  s.public_header_files  = "include", "include/cglm/**/*.h"
+  s.exclude_files        = "src/win/*", "src/dllmain.c", "src/**/*.h"
+  s.preserve_paths       = "include", "src"
+  s.header_mappings_dir  = "include"
+
+  # Linking: link against the library named "m" (presumably the system math library, libm)
+  s.library = "m"
+end
diff --git a/configure.ac b/configure.ac
index afb694b..76c6123 100644
--- a/configure.ac
+++ b/configure.ac
@@ -7,7 +7,7 @@
 #*****************************************************************************
 
 AC_PREREQ([2.69])
-AC_INIT([cglm], [0.3.6], [info@recp.me])
+AC_INIT([cglm], [0.4.5], [info@recp.me])
 AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects])
 
 AC_CONFIG_MACRO_DIR([m4])
diff --git a/docs/source/affine-mat.rst b/docs/source/affine-mat.rst
index 0c0fcaa..43740cf 100644
--- a/docs/source/affine-mat.rst
+++ b/docs/source/affine-mat.rst
@@ -33,6 +33,7 @@ Table of contents (click func go):
 Functions:
 
 1. :c:func:`glm_mul`
+#. :c:func:`glm_mul_rot`
 #. :c:func:`glm_inv_tr`
 
 Functions documentation
@@ -59,6 +60,27 @@ Functions documentation
       | *[in]*  **m2**    affine matrix 2
       | *[out]* **dest**  result matrix
 
+.. c:function:: void  glm_mul_rot(mat4 m1, mat4 m2, mat4 dest)
+
+    | this is similar to glm_mat4_mul but specialized to rotation matrix
+
+    Right Matrix format should be (left is free):
+
+    .. code-block:: text
+
+       R  R  R  0
+       R  R  R  0
+       R  R  R  0
+       0  0  0  1
+
+    this reduces some multiplications. It should be faster than mat4_mul.
+    if you are not sure about matrix format then DON'T use this! use mat4_mul
+
+    Parameters:
+      | *[in]*  **m1**    affine matrix 1
+      | *[in]*  **m2**    affine matrix 2
+      | *[out]* **dest**  result matrix
+
 .. c:function:: void  glm_inv_tr(mat4 mat)
 
     | inverse orthonormal rotation + translation matrix (ridig-body)
diff --git a/docs/source/affine.rst b/docs/source/affine.rst
index c34921f..8ded38d 100644
--- a/docs/source/affine.rst
+++ b/docs/source/affine.rst
@@ -5,6 +5,8 @@ affine transforms
 
 Header: cglm/affine.h
 
+Initialize Transform Matrices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Functions with **_make** prefix expect you don't have a matrix and they create
 a matrix for you. You don't need to pass identity matrix.
 
@@ -15,6 +17,107 @@ before sending to transfrom functions.
 There are also functions to decompose transform matrix. These functions can't
 decompose matrix after projected.
 
+Rotation Center
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Rotation functions use the origin as the rotation center (pivot/anchor point).
+Since scale factors are stored in the rotation matrix, the same is also true for scaling.
+cglm provides some functions for rotating around a given point, e.g.
+**glm_rotate_at**, **glm_quat_rotate_at**. Use them or follow the next section for the algorithm ("Rotate or Scale around specific Point (Anchor Point)").
+
+Rotate or Scale around specific Point (Anchor Point)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you want to rotate a model around an arbitrary point, follow these steps:
+
+1. Move model from pivot point to origin: **translate(-pivot.x, -pivot.y, -pivot.z)**
+2. Apply rotation (or scaling maybe)
+3. Move model back from origin to pivot (reverse of step-1): **translate(pivot.x, pivot.y, pivot.z)**
+
+**glm_rotate_at**, **glm_quat_rotate_at** and their helper functions work that way.
+
+The implementation would be:
+
+.. code-block:: c
+  :linenos:
+
+  glm_translate(m, pivot);
+  glm_rotate(m, angle, axis);
+  glm_translate(m, pivotInv); /* pivotInv = -pivot */
+
+Transforms Order
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is important to understand this part especially if you call transform
+functions multiple times
+
+`glm_translate`, `glm_rotate`, `glm_scale` and `glm_quat_rotate` and their
+helper functions work like this (cglm may provide reverse order too as an alternative in the future):
+
+.. code-block:: c
+  :linenos:
+
+  TransformMatrix = TransformMatrix * TranslateMatrix; // glm_translate()
+  TransformMatrix = TransformMatrix * RotateMatrix;    // glm_rotate(), glm_quat_rotate()
+  TransformMatrix = TransformMatrix * ScaleMatrix;     // glm_scale()
+
+As you can see it is multiplied as right matrix. For instance what will happen if you call `glm_translate` twice?
+
+.. code-block:: c
+  :linenos:
+
+  glm_translate(transform, translate1); /* transform = transform * translate1 */
+  glm_translate(transform, translate2); /* transform = transform * translate2 */
+  glm_rotate(transform, angle, axis);   /* transform = transform * rotation   */
+
+Now lets try to understand this:
+
+1. You call translate using `translate1` and you expect it will be first transform
+because you call it first, do you?
+
+Result will be **`transform = transform * translate1`**
+
+2. Then you call translate using `translate2` and you expect it will be second transform?
+
+Result will be **`transform = transform * translate2`**. Now lets expand transform,
+it was `transform * translate1` before second call.
+
+Now it is **`transform = transform * translate1 * translate2`**, now do you understand what I say?
+
+3. After last call transform will be:
+
+**`transform = transform * translate1 * translate2 * rotation`**
+
+The order will be; **rotation will be applied first**, then **translate2** then **translate1**
+
+It is all about matrix multiplication order. It is similar to MVP matrix:
+`MVP = Projection * View * Model`, model will be applied first, then view then projection.
+
+**Confused?**
+
+In the end the last function call applied first in shaders.
+
+As alternative way, you can create transform matrices individually then combine manually,
+but don't forget that `glm_translate`, `glm_rotate`, `glm_scale`... are optimized and should be faster (and produce smaller assembly output) than manual multiplication
+
+.. code-block:: c
+  :linenos:
+
+  mat4 transform1, transform2, transform3, finalTransform;
+
+  glm_translate_make(transform1, translate1);
+  glm_translate_make(transform2, translate2);
+  glm_rotate_make(transform3, angle, axis);
+
+  /* first apply transform1, then transform2, then transform3 */
+  glm_mat4_mulN((mat4 *[]){&transform3, &transform2, &transform1}, 3, finalTransform);
+
+  /* if you don't want to use mulN, same as above */
+  glm_mat4_mul(transform3, transform2, finalTransform);
+  glm_mat4_mul(finalTransform, transform1, finalTransform);
+
+Now transform1 will be applied first, then transform2 then transform3
+
 Table of contents (click to go):
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -29,15 +132,14 @@ Functions:
 #. :c:func:`glm_scale_to`
 #. :c:func:`glm_scale_make`
 #. :c:func:`glm_scale`
-#. :c:func:`glm_scale1`
 #. :c:func:`glm_scale_uni`
 #. :c:func:`glm_rotate_x`
 #. :c:func:`glm_rotate_y`
 #. :c:func:`glm_rotate_z`
-#. :c:func:`glm_rotate_ndc_make`
 #. :c:func:`glm_rotate_make`
-#. :c:func:`glm_rotate_ndc`
 #. :c:func:`glm_rotate`
+#. :c:func:`glm_rotate_at`
+#. :c:func:`glm_rotate_atm`
 #. :c:func:`glm_decompose_scalev`
 #. :c:func:`glm_uniscaled`
 #. :c:func:`glm_decompose_rs`
@@ -122,10 +224,6 @@ Functions documentation
       | *[in, out]* **m** affine transfrom
       | *[in]*      **v** scale vector [x, y, z]
 
-.. c:function:: void  glm_scale1(mat4 m, float s)
-
-    DEPRECATED! Use glm_scale_uni
-
 .. c:function:: void  glm_scale_uni(mat4 m, float s)
 
     applies uniform scale to existing transform matrix v = [s, s, s]
@@ -165,16 +263,6 @@ Functions documentation
       | *[in]*  **angle** angle (radians)
       | *[out]* **dest**  rotated matrix
 
-.. c:function:: void  glm_rotate_ndc_make(mat4 m, float angle, vec3 axis_ndc)
-
-    creates NEW rotation matrix by angle and axis
-    this name may change in the future. axis must be is normalized
-
-    Parameters:
-      | *[out]* **m**        affine transfrom
-      | *[in]*  **angle**    angle (radians)
-      | *[in]*  **axis_ndc** normalized axis
-
 .. c:function:: void  glm_rotate_make(mat4 m, float angle, vec3 axis)
 
     creates NEW rotation matrix by angle and axis,
@@ -185,16 +273,6 @@ Functions documentation
       | *[in]*  **axis** angle (radians)
       | *[in]*  **axis** axis
 
-.. c:function:: void  glm_rotate_ndc(mat4 m, float angle, vec3 axis_ndc)
-
-    rotate existing transform matrix around Z axis by angle and axis
-    this name may change in the future, axis must be normalized.
-
-    Parameters:
-      | *[out]* **m**        affine transfrom
-      | *[in]*  **angle**    angle (radians)
-      | *[in]*  **axis_ndc** normalized axis
-
 .. c:function:: void  glm_rotate(mat4 m, float angle, vec3 axis)
 
     rotate existing transform matrix around Z axis by angle and axis
@@ -204,6 +282,29 @@ Functions documentation
       | *[in]*      **angle** angle (radians)
       | *[in]*      **axis**  axis
 
+.. c:function:: void  glm_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis)
+
+    rotate existing transform around given axis by angle at given pivot point (rotation center)
+
+    Parameters:
+      | *[in, out]* **m**     affine transform
+      | *[in]*      **pivot** pivot, anchor point, rotation center
+      | *[in]*      **angle** angle (radians)
+      | *[in]*      **axis**  axis
+
+.. c:function:: void  glm_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis)
+
+    | creates NEW rotation matrix by angle and axis at given point
+    | this creates rotation matrix, it assumes you don't have a matrix
+
+    | this should work faster than glm_rotate_at because it reduces one glm_translate.
+
+    Parameters:
+      | *[out]*     **m**     affine transform
+      | *[in]*      **pivot** pivot, anchor point, rotation center
+      | *[in]*      **angle** angle (radians)
+      | *[in]*      **axis**  axis
+
 .. c:function:: void  glm_decompose_scalev(mat4 m, vec3 s)
 
     decompose scale vector
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7ad96a1..ef9619a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -62,9 +62,9 @@ author = u'Recep Aslantas'
 # built documents.
 #
 # The short X.Y version.
-version = u'0.3.4'
+version = u'0.4.5'
 # The full version, including alpha/beta/rc tags.
-release = u'0.3.4'
+release = u'0.4.5'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 89c231c..2f8511c 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -21,17 +21,24 @@ Types:
 As you can see types don't store extra informations in favor of space.
 You can send these values e.g. matrix to OpenGL directly without casting or calling a function like *value_ptr*
 
-Aligment is Required:
+Alignment is Required:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-**vec4** and **mat4** requires 16 byte aligment because vec4 and mat4 operations are
+**vec4** and **mat4** requires 16 byte alignment because vec4 and mat4 operations are
 vectorized by SIMD instructions (SSE/AVX).
 
+**UPDATE:**
+  Starting with v0.4.5, cglm provides an option to disable the alignment requirement; the requirement is enabled by default
+
+  | Check :doc:`opt` page for more details
+
+  Also, alignment is disabled by default for older MSVC versions. Alignment is now only required in Visual Studio 2017 version 15.6+ if the CGLM_ALL_UNALIGNED macro is not defined.
+
 Allocations:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *cglm* doesn't alloc any memory on heap. So it doesn't provide any allocator.
 You must allocate memory yourself. You should alloc memory for out parameters too if you pass pointer of memory location.
-When allocating memory don't forget that **vec4** and **mat4** requires aligment.
+When allocating memory don't forget that **vec4** and **mat4** requires alignment.
 
 **NOTE:** Unaligned vec4 and unaligned mat4 operations will be supported in the future. Check todo list.
 Because you may want to multiply a CGLM matrix with external matrix.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 25e7cb3..cfdf220 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -40,6 +40,8 @@ Also currently only **float** type is supported for most operations.
    getting_started
    opengl
    api
+   opt
+   troubleshooting
 
 Indices and tables
 ==================
diff --git a/docs/source/opt.rst b/docs/source/opt.rst
new file mode 100644
index 0000000..c614e42
--- /dev/null
+++ b/docs/source/opt.rst
@@ -0,0 +1,36 @@
+.. default-domain:: C
+
+Options
+===============================================================================
+
+A few options are provided via macros.
+
+Alignment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As default, cglm requires types to be aligned. Alignment requirements:
+
+| vec3:   8 byte
+| vec4:   16 byte
+| mat4:   16 byte
+| versor: 16 byte
+
+By starting **v0.4.5** cglm provides an option to disable alignment requirement.
+To enable this option define **CGLM_ALL_UNALIGNED** macro before all headers.
+You can define it in Xcode, Visual Studio (or other IDEs) or you can also prefer
+to define it in build system. If you use pre-compiled versions then you
+have to compile cglm with **CGLM_ALL_UNALIGNED** macro.
+
+**VERY VERY IMPORTANT:** If you use cglm in multiple projects and
+ those projects depend on each other, then
+
+ | *ALWAYS* or *NEVER USE* **CGLM_ALL_UNALIGNED** macro in linked projects
+
+ if you do not know what you are doing. Because a cglm header included
+ via 'project A' may force types to be aligned and another cglm header
+ included via 'project B' may not require alignment. In this case
+ cglm functions will read from and write to **INVALID MEMORY LOCATIONs**.
+
+ ALWAYS USE SAME CONFIGURATION / OPTION for **cglm** if you have multiple projects.
+
+ For instance if you set CGLM_ALL_UNALIGNED in a project then set it in other projects too
diff --git a/docs/source/quat.rst b/docs/source/quat.rst
index 863eed5..d9b4680 100644
--- a/docs/source/quat.rst
+++ b/docs/source/quat.rst
@@ -5,17 +5,16 @@ quaternions
 
 Header: cglm/quat.h
 
- **Important:** *cglm* stores quaternion as [w, x, y, z] in memory, don't
- forget that when changing quaternion items manually. For instance *quat[3]*
- is *quat.z* and *quat[0*] is *quat.w*. This may change in the future if *cglm*
- will got enough request to do that. Probably it will not be changed in near
- future
+ **Important:** *cglm* stores quaternion as **[x, y, z, w]** in memory
+ since **v0.4.0** it was **[w, x, y, z]**
+ before v0.4.0 ( **v0.3.5 and earlier** ). w is real part.
 
-There are some TODOs for quaternions check TODO list to see them.
+What you can do with quaternions with existing functions is (Some of them):
 
-Also **versor** is identity quaternion so the type may change to **vec4** or
-something else. This will not affect existing functions for your engine because
-*versor* is alias of *vec4*
+- You can rotate transform matrix using quaternion
+- You can rotate vector using quaternion
+- You can create view matrix using quaternion
+- You can create a lookrotation (from source point to dest)
 
 Table of contents (click to go):
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -28,14 +27,38 @@ Macros:
 Functions:
 
 1. :c:func:`glm_quat_identity`
+#. :c:func:`glm_quat_init`
 #. :c:func:`glm_quat`
 #. :c:func:`glm_quatv`
+#. :c:func:`glm_quat_copy`
 #. :c:func:`glm_quat_norm`
 #. :c:func:`glm_quat_normalize`
+#. :c:func:`glm_quat_normalize_to`
 #. :c:func:`glm_quat_dot`
-#. :c:func:`glm_quat_mulv`
+#. :c:func:`glm_quat_conjugate`
+#. :c:func:`glm_quat_inv`
+#. :c:func:`glm_quat_add`
+#. :c:func:`glm_quat_sub`
+#. :c:func:`glm_quat_real`
+#. :c:func:`glm_quat_imag`
+#. :c:func:`glm_quat_imagn`
+#. :c:func:`glm_quat_imaglen`
+#. :c:func:`glm_quat_angle`
+#. :c:func:`glm_quat_axis`
+#. :c:func:`glm_quat_mul`
 #. :c:func:`glm_quat_mat4`
+#. :c:func:`glm_quat_mat4t`
+#. :c:func:`glm_quat_mat3`
+#. :c:func:`glm_quat_mat3t`
+#. :c:func:`glm_quat_lerp`
 #. :c:func:`glm_quat_slerp`
+#. :c:func:`glm_quat_look`
+#. :c:func:`glm_quat_for`
+#. :c:func:`glm_quat_forp`
+#. :c:func:`glm_quat_rotatev`
+#. :c:func:`glm_quat_rotate`
+#. :c:func:`glm_quat_rotate_at`
+#. :c:func:`glm_quat_rotate_atm`
 
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -47,10 +70,23 @@ Functions documentation
     Parameters:
       | *[in, out]*  **q**    quaternion
 
+.. c:function:: void  glm_quat_init(versor q, float x, float y, float z, float w)
+
+    | inits quaternion with given values
+
+    Parameters:
+      | *[out]* **q**      quaternion
+      | *[in]*  **x**      imag.x
+      | *[in]*  **y**      imag.y
+      | *[in]*  **z**      imag.z
+      | *[in]*  **w**      w (real part)
+
 .. c:function:: void  glm_quat(versor q, float  angle, float  x, float  y, float  z)
 
     | creates NEW quaternion with individual axis components
 
+    | given axis will be normalized
+
     Parameters:
       | *[out]* **q**      quaternion
       | *[in]*  **angle**  angle (radians)
@@ -58,14 +94,24 @@ Functions documentation
       | *[in]*  **y**      axis.y
       | *[in]*  **z**      axis.z
 
-.. c:function:: void  glm_quatv(versor q, float  angle, vec3   v)
+.. c:function:: void  glm_quatv(versor q, float  angle, vec3   axis)
 
     | creates NEW quaternion with axis vector
 
+    | given axis will be normalized
+
     Parameters:
       | *[out]* **q**      quaternion
       | *[in]*  **angle**  angle (radians)
-      | *[in]*  **v**      axis
+      | *[in]*  **axis**   axis (will be normalized)
+
+.. c:function:: void  glm_quat_copy(versor q, versor dest)
+
+    | copy quaternion to another one
+
+    Parameters:
+      | *[in]*  **q**     source quaternion
+      | *[out]* **dest**  destination quaternion
 
 .. c:function:: float  glm_quat_norm(versor q)
 
@@ -77,6 +123,14 @@ Functions documentation
     Returns:
       norm (magnitude)
 
+.. c:function:: void  glm_quat_normalize_to(versor q, versor dest)
+
+    | normalize quaternion and store result in dest, original one will not be normalized
+
+    Parameters:
+      | *[in]*  **q**    quaternion to normalize into
+      | *[out]* **dest** destination quaternion
+
 .. c:function:: void  glm_quat_normalize(versor q)
 
     | normalize quaternion
@@ -84,24 +138,118 @@ Functions documentation
     Parameters:
       | *[in, out]*  **q** quaternion
 
-.. c:function:: float  glm_quat_dot(versor q, versor r)
+.. c:function:: float  glm_quat_dot(versor p, versor q)
 
     dot product of two quaternion
 
     Parameters:
-      | *[in]*  **q1**   quaternion 1
-      | *[in]*  **q2**   quaternion 2
+      | *[in]*  **p**   quaternion 1
+      | *[in]*  **q**   quaternion 2
 
     Returns:
       dot product
 
-.. c:function:: void  glm_quat_mulv(versor q1, versor q2, versor dest)
+.. c:function:: void  glm_quat_conjugate(versor q, versor dest)
+
+    conjugate of quaternion
+
+    Parameters:
+      | *[in]*  **q**      quaternion
+      | *[out]* **dest**   conjugate
+
+.. c:function:: void  glm_quat_inv(versor q, versor dest)
+
+    inverse of non-zero quaternion
+
+    Parameters:
+      | *[in]*  **q**      quaternion
+      | *[out]* **dest**   inverse quaternion
+
+.. c:function:: void  glm_quat_add(versor p, versor q, versor dest)
+
+    add (componentwise) two quaternions and store result in dest
+
+    Parameters:
+      | *[in]*  **p**      quaternion 1
+      | *[in]*  **q**      quaternion 2
+      | *[out]* **dest**   result quaternion
+
+.. c:function:: void  glm_quat_sub(versor p, versor q, versor dest)
+
+    subtract (componentwise) two quaternions and store result in dest
+
+    Parameters:
+      | *[in]*  **p**      quaternion 1
+      | *[in]*  **q**      quaternion 2
+      | *[out]* **dest**   result quaternion
+
+.. c:function:: float  glm_quat_real(versor q)
+
+    returns real part of quaternion
+
+    Parameters:
+      | *[in]*  **q**   quaternion
+
+    Returns:
+      real part (quat.w)
+
+.. c:function:: void  glm_quat_imag(versor q, vec3 dest)
+
+    returns imaginary part of quaternion
+
+    Parameters:
+      | *[in]*   **q**      quaternion
+      | *[out]*  **dest**   imag
+
+.. c:function:: void  glm_quat_imagn(versor q, vec3 dest)
+
+    returns normalized imaginary part of quaternion
+
+    Parameters:
+      | *[in]*   **q**      quaternion
+      | *[out]*  **dest**   imag
+
+.. c:function:: float  glm_quat_imaglen(versor q)
+
+    returns length of imaginary part of quaternion
+
+    Parameters:
+      | *[in]*   **q**      quaternion
+
+    Returns:
+      norm of imaginary part
+
+.. c:function:: float  glm_quat_angle(versor q)
+
+    returns angle of quaternion
+
+    Parameters:
+      | *[in]*  **q**   quaternion
+
+    Returns:
+      angle of quat (radians)
+
+.. c:function:: void  glm_quat_axis(versor q, versor dest)
+
+    axis of quaternion
+
+    Parameters:
+      | *[in]*   **q**      quaternion
+      | *[out]*  **dest**   axis of quaternion
+
+.. c:function:: void  glm_quat_mul(versor p, versor q, versor dest)
 
     | multiplies two quaternion and stores result in dest
 
+    | this is also called Hamilton Product
+
+    | According to Wikipedia:
+    | The product of two rotation quaternions will be
+      equivalent to the rotation q followed by the rotation p
+
     Parameters:
-      | *[in]*  **q1**    quaternion 1
-      | *[in]*  **q2**    quaternion 2
+      | *[in]*  **p**     quaternion 1 (first rotation)
+      | *[in]*  **q**     quaternion 2 (second rotation)
       | *[out]* **dest**  result quaternion
 
 .. c:function:: void  glm_quat_mat4(versor q, mat4 dest)
@@ -112,13 +260,121 @@ Functions documentation
       | *[in]*  **q**     quaternion
       | *[out]* **dest**  result matrix
 
+.. c:function:: void  glm_quat_mat4t(versor q, mat4 dest)
+
+    | convert quaternion to mat4 (transposed). This is transposed version of glm_quat_mat4
+
+    Parameters:
+      | *[in]*  **q**     quaternion
+      | *[out]* **dest**  result matrix
+
+.. c:function:: void  glm_quat_mat3(versor q, mat3 dest)
+
+    | convert quaternion to mat3
+
+    Parameters:
+      | *[in]*  **q**     quaternion
+      | *[out]* **dest**  result matrix
+
+.. c:function:: void  glm_quat_mat3t(versor q, mat3 dest)
+
+    | convert quaternion to mat3 (transposed). This is transposed version of glm_quat_mat3
+
+    Parameters:
+      | *[in]*  **q**     quaternion
+      | *[out]* **dest**  result matrix
+
+.. c:function:: void  glm_quat_lerp(versor from, versor to, float t, versor dest)
+
+    | interpolates between two quaternions
+    | using linear interpolation (LERP)
+
+    Parameters:
+      | *[in]*  **from**  from
+      | *[in]*  **to**    to
+      | *[in]*  **t**     interpolant (amount) clamped between 0 and 1
+      | *[out]* **dest**  result quaternion
+
 .. c:function:: void glm_quat_slerp(versor q, versor r, float  t, versor dest)
 
     | interpolates between two quaternions
     | using spherical linear interpolation (SLERP)
 
     Parameters:
-      | *[in]*  **q**     from
-      | *[in]*  **r**     to
-      | *[in]*  **t**     amout
+      | *[in]*  **from**  from
+      | *[in]*  **to**    to
+      | *[in]*  **t**     interpolant (amount) clamped between 0 and 1
       | *[out]* **dest**  result quaternion
+
+.. c:function:: void  glm_quat_look(vec3 eye, versor ori, mat4 dest)
+
+    | creates view matrix using quaternion as camera orientation
+
+    Parameters:
+      | *[in]*  **eye**   eye
+      | *[in]*  **ori**   orientation in world space as quaternion
+      | *[out]* **dest**  result matrix
+
+.. c:function:: void  glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest)
+
+    | creates look rotation quaternion
+
+    Parameters:
+      | *[in]*  **dir**   direction to look
+      | *[in]*  **fwd**   forward vector
+      | *[in]*  **up**    up vector
+      | *[out]* **dest**  result quaternion
+
+.. c:function:: void  glm_quat_forp(vec3 from, vec3 to, vec3 fwd, vec3 up, versor dest)
+
+    | creates look rotation quaternion using source and destination positions; the p suffix stands for position
+
+    | this is similar to glm_quat_for except this computes direction for glm_quat_for for you.
+
+    Parameters:
+      | *[in]*  **from**  source point
+      | *[in]*  **to**    destination point
+      | *[in]*  **fwd**   forward vector
+      | *[in]*  **up**    up vector
+      | *[out]* **dest**  result quaternion
+
+.. c:function:: void  glm_quat_rotatev(versor q, vec3 v, vec3 dest)
+
+    | rotate vector using quaternion
+
+    Parameters:
+      | *[in]*  **q**     quaternion
+      | *[in]*  **v**     vector to rotate
+      | *[out]* **dest**  rotated vector
+
+.. c:function:: void glm_quat_rotate(mat4 m, versor q, mat4 dest)
+
+    | rotate existing transform matrix using quaternion
+
+    instead of passing identity matrix, consider to use quat_mat4 functions
+
+    Parameters:
+      | *[in]*  **m**     existing transform matrix to rotate
+      | *[in]*  **q**     quaternion
+      | *[out]* **dest**  rotated matrix/transform
+
+.. c:function:: void glm_quat_rotate_at(mat4 m, versor q, vec3 pivot)
+
+    | rotate existing transform matrix using quaternion at pivot point
+
+    Parameters:
+      | *[in, out]*  **m**      existing transform matrix to rotate
+      | *[in]*       **q**      quaternion
+      | *[in]*       **pivot**  pivot
+
+.. c:function:: void glm_quat_rotate_atm(mat4 m, versor q, vec3 pivot)
+
+    | creates NEW rotation matrix using quaternion at pivot point
+    | this creates rotation matrix, it assumes you don't have a matrix
+
+    | this should work faster than glm_quat_rotate_at because it reduces one glm_translate.
+
+    Parameters:
+      | *[out]*      **m**      resulting rotation matrix
+      | *[in]*       **q**      quaternion
+      | *[in]*       **pivot**  pivot
diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
new file mode 100644
index 0000000..c897dc2
--- /dev/null
+++ b/docs/source/troubleshooting.rst
@@ -0,0 +1,79 @@
+.. default-domain:: C
+
+Troubleshooting
+================================================================================
+
+It is possible that sometimes you may get crashes or wrong results.
+Follow these topics
+
+Memory Allocation:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Again, **cglm** doesn't alloc any memory on heap.
+cglm functions work like memcpy; they copy data from src,
+make calculations, then copy the result to dest.
+
+You are responsible for allocation of **src** and **dest** parameters.
+
+Alignment:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**vec4** and **mat4** types require 16 byte alignment.
+These types are marked with align attribute to let compiler know about this
+requirement.
+
+But since MSVC (Windows) throws the error:
+
+**"formal parameter with requested alignment of 16 won't be aligned"**
+
+The alignment attribute has been commented for MSVC
+
+.. code-block:: c
+
+   #if defined(_MSC_VER)
+   #  define CGLM_ALIGN(X) /* __declspec(align(X)) */
+   #else
+   #  define CGLM_ALIGN(X) __attribute((aligned(X)))
+   #endif
+
+So MSVC may not know about alignment requirements when creating variables.
+The interesting thing is that, if I remember correctly Visual Studio 2017
+doesn't throw the above error. So we may uncomment that line for Visual Studio 2017,
+you may do it yourself.
+
+**This MSVC issue is still in TODOs.**
+
+**UPDATE:** By starting v0.4.5 cglm provides an option to disable alignment requirement.
+Also alignment is disabled for older MSVC versions by default. Now alignment is only required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is not defined.
+
+Crashes, Invalid Memory Access:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Probably you are trying to write to invalid memory location.
+
+You may have used the wrong function for what you want to do.
+
+For instance you may have called **glm_vec4_** functions for **vec3** data type.
+It will try to write 16 bytes but since **vec3** is 12 bytes it should throw
+memory access error or exit the app without saying anything.
+
+Wrong Results:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Again, you may have used the wrong function.
+
+For instance if you use **glm_normalize()** or **glm_vec_normalize()** for **vec4**,
+it will assume that passed param is **vec3** and will normalize it for **vec3**.
+Since you need **vec4** to be normalized in your case, you will get wrong results.
+
+Accessing vec4 type with vec3 functions is valid, you will not get any error, exception or crash.
+You only get wrong results if you don't know what you are doing!
+
+So be careful, when your IDE (Xcode, Visual Studio ...) tries to autocomplete function names, READ IT :)
+
+**Also implementation may be wrong please let us know by creating an issue on Github.**
+
+Other Issues?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Please let us know by creating an issue on Github.**
diff --git a/docs/source/util.rst b/docs/source/util.rst
index a9f4066..f8dbac4 100644
--- a/docs/source/util.rst
+++ b/docs/source/util.rst
@@ -22,6 +22,7 @@ Functions:
 #. :c:func:`glm_min`
 #. :c:func:`glm_max`
 #. :c:func:`glm_clamp`
+#. :c:func:`glm_lerp`
 
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -121,3 +122,17 @@ Functions documentation
 
     Returns:
       clamped value
+
+.. c:function:: float  glm_lerp(float from, float to, float t)
+
+    linear interpolation between two numbers
+
+    | formula:  from + t * (to - from)
+
+    Parameters:
+      | *[in]*  **from**   from value
+      | *[in]*  **to**     to value
+      | *[in]*  **t**      interpolant (amount) clamped between 0 and 1
+
+    Returns:
+       interpolated value
diff --git a/docs/source/vec3-ext.rst b/docs/source/vec3-ext.rst
index e632de9..c2c0bfc 100644
--- a/docs/source/vec3-ext.rst
+++ b/docs/source/vec3-ext.rst
@@ -23,6 +23,11 @@ Functions:
 #. :c:func:`glm_vec_eqv_eps`
 #. :c:func:`glm_vec_max`
 #. :c:func:`glm_vec_min`
+#. :c:func:`glm_vec_isnan`
+#. :c:func:`glm_vec_isinf`
+#. :c:func:`glm_vec_isvalid`
+#. :c:func:`glm_vec_sign`
+#. :c:func:`glm_vec_sqrt`
 
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -96,3 +101,43 @@ Functions documentation
 
     Parameters:
       | *[in]*  **v**  vector
+
+.. c:function:: bool glm_vec_isnan(vec3 v)
+
+    | check if one of items is NaN (not a number)
+    | you should only use this in DEBUG mode or very critical asserts
+
+    Parameters:
+      | *[in]*  **v**  vector
+
+.. c:function:: bool glm_vec_isinf(vec3 v)
+
+    | check if one of items is INFINITY
+    | you should only use this in DEBUG mode or very critical asserts
+
+    Parameters:
+      | *[in]*  **v**  vector
+
+.. c:function:: bool glm_vec_isvalid(vec3 v)
+
+    | check if all items are valid number
+    | you should only use this in DEBUG mode or very critical asserts
+
+    Parameters:
+      | *[in]*  **v**  vector
+
+.. c:function:: void glm_vec_sign(vec3 v, vec3 dest)
+
+    get sign of 32 bit float as +1, -1, 0
+
+    Parameters:
+      | *[in]*   **v**     vector
+      | *[out]*  **dest**  sign vector (only keeps signs as -1, 0, +1)
+
+.. c:function:: void glm_vec_sqrt(vec3 v, vec3 dest)
+
+    square root of each vector item
+
+    Parameters:
+      | *[in]*   **v**     vector
+      | *[out]*  **dest**  destination vector (sqrt(v))
diff --git a/docs/source/vec3.rst b/docs/source/vec3.rst
index 355178d..fcbfbfb 100644
--- a/docs/source/vec3.rst
+++ b/docs/source/vec3.rst
@@ -31,15 +31,27 @@ Functions:
 
 1. :c:func:`glm_vec3`
 #. :c:func:`glm_vec_copy`
+#. :c:func:`glm_vec_zero`
+#. :c:func:`glm_vec_one`
 #. :c:func:`glm_vec_dot`
 #. :c:func:`glm_vec_cross`
 #. :c:func:`glm_vec_norm2`
 #. :c:func:`glm_vec_norm`
 #. :c:func:`glm_vec_add`
+#. :c:func:`glm_vec_adds`
 #. :c:func:`glm_vec_sub`
+#. :c:func:`glm_vec_subs`
+#. :c:func:`glm_vec_mul`
 #. :c:func:`glm_vec_scale`
 #. :c:func:`glm_vec_scale_as`
+#. :c:func:`glm_vec_div`
+#. :c:func:`glm_vec_divs`
+#. :c:func:`glm_vec_addadd`
+#. :c:func:`glm_vec_subadd`
+#. :c:func:`glm_vec_muladd`
+#. :c:func:`glm_vec_muladds`
 #. :c:func:`glm_vec_flipsign`
+#. :c:func:`glm_vec_flipsign_to`
 #. :c:func:`glm_vec_inv`
 #. :c:func:`glm_vec_inv_to`
 #. :c:func:`glm_vec_normalize`
@@ -48,12 +60,14 @@ Functions:
 #. :c:func:`glm_vec_angle`
 #. :c:func:`glm_vec_rotate`
 #. :c:func:`glm_vec_rotate_m4`
+#. :c:func:`glm_vec_rotate_m3`
 #. :c:func:`glm_vec_proj`
 #. :c:func:`glm_vec_center`
 #. :c:func:`glm_vec_maxv`
 #. :c:func:`glm_vec_minv`
 #. :c:func:`glm_vec_ortho`
 #. :c:func:`glm_vec_clamp`
+#. :c:func:`glm_vec_lerp`
 
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -74,6 +88,20 @@ Functions documentation
       | *[in]*  **a**     source
       | *[out]* **dest**  destination
 
+.. c:function:: void  glm_vec_zero(vec3 v)
+
+    makes all members 0.0f (zero)
+
+    Parameters:
+      | *[in, out]*  **v**     vector
+
+.. c:function:: void  glm_vec_one(vec3 v)
+
+    makes all members 1.0f (one)
+
+    Parameters:
+      | *[in, out]*  **v**     vector
+
 .. c:function:: float  glm_vec_dot(vec3 a, vec3 b)
 
     dot product of vec3
@@ -115,24 +143,51 @@ Functions documentation
     Parameters:
       | *[in]*  **vec**   vector
 
-.. c:function:: void  glm_vec_add(vec3 v1, vec3 v2, vec3 dest)
+.. c:function:: void  glm_vec_add(vec3 a, vec3 b, vec3 dest)
 
-    add v2 vector to v1 vector store result in dest
+    add a vector to b vector store result in dest
 
     Parameters:
-      | *[in]*  **v1**    vector1
-      | *[in]*  **v2**    vector2
+      | *[in]*  **a**     vector1
+      | *[in]*  **b**     vector2
+      | *[out]* **dest**  destination vector
+
+.. c:function:: void  glm_vec_adds(vec3 v, float s, vec3 dest)
+
+    add scalar to v vector store result in dest (d = v + vec(s))
+
+    Parameters:
+      | *[in]*  **v**     vector
+      | *[in]*  **s**     scalar
       | *[out]* **dest**  destination vector
 
 .. c:function:: void  glm_vec_sub(vec3 v1, vec3 v2, vec3 dest)
 
-    subtract v2 vector from v1 vector store result in dest
+    subtract b vector from a vector store result in dest (d = v1 - v2)
 
     Parameters:
-      | *[in]*  **v1**    vector1
-      | *[in]*  **v2**    vector2
+      | *[in]*  **a**     vector1
+      | *[in]*  **b**     vector2
       | *[out]* **dest**  destination vector
 
+.. c:function:: void  glm_vec_subs(vec3 v, float s, vec3 dest)
+
+    subtract scalar from v vector store result in dest (d = v - vec(s))
+
+    Parameters:
+      | *[in]*  **v**     vector
+      | *[in]*  **s**     scalar
+      | *[out]* **dest**  destination vector
+
+.. c:function:: void  glm_vec_mul(vec3 a, vec3 b, vec3 d)
+
+    multiply two vector (component-wise multiplication)
+
+    Parameters:
+      | *[in]*  **a**     vector
+      | *[in]*  **b**     vector
+      | *[out]* **d**     result = (a[0] * b[0], a[1] * b[1], a[2] * b[2])
+
 .. c:function:: void glm_vec_scale(vec3 v, float s, vec3 dest)
 
      multiply/scale vec3 vector with scalar: result = v * s
@@ -152,12 +207,78 @@ Functions documentation
       | *[in]*  **s**     scalar
       | *[out]* **dest**  destination vector
 
+.. c:function:: void  glm_vec_div(vec3 a, vec3 b, vec3 dest)
+
+    div vector with another component-wise division: d = a / b
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  result = (a[0] / b[0], a[1] / b[1], a[2] / b[2])
+
+.. c:function:: void  glm_vec_divs(vec3 v, float s, vec3 dest)
+
+    div vector with scalar: d = v / s
+
+    Parameters:
+      | *[in]*  **v**     vector
+      | *[in]*  **s**     scalar
+      | *[out]* **dest**  result = (v[0] / s, v[1] / s, v[2] / s)
+
+.. c:function:: void  glm_vec_addadd(vec3 a, vec3 b, vec3 dest)
+
+    | add two vectors and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  dest += (a + b)
+
+.. c:function:: void  glm_vec_subadd(vec3 a, vec3 b, vec3 dest)
+
+    | sub two vectors and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  dest += (a - b)
+
+.. c:function:: void  glm_vec_muladd(vec3 a, vec3 b, vec3 dest)
+
+    | mul two vectors and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  dest += (a * b)
+
+.. c:function:: void  glm_vec_muladds(vec3 a, float s, vec3 dest)
+
+    | mul vector with scalar and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector
+      | *[in]*  **s**     scalar
+      | *[out]* **dest**  dest += (a * s)
+
 .. c:function:: void  glm_vec_flipsign(vec3 v)
 
     flip sign of all vec3 members
 
     Parameters:
-    | *[in, out]*  **v**    vector
+      | *[in, out]*  **v**    vector
+
+.. c:function:: void  glm_vec_flipsign_to(vec3 v, vec3 dest)
+
+    flip sign of all vec3 members and store result in dest
+
+    Parameters:
+      | *[in]*  **v**       vector
+      | *[out]* **dest**    negated vector
 
 .. c:function:: void  glm_vec_inv(vec3 v)
 
@@ -206,7 +327,7 @@ Functions documentation
 
     Parameters:
       | *[in, out]*  **v**      vector
-      | *[in]*       **axis**   axis vector (must be unit vector)
+      | *[in]*       **axis**   axis vector (will be normalized)
       | *[out]*      **angle**  angle (radians)
 
 .. c:function:: void  glm_vec_rotate_m4(mat4 m, vec3 v, vec3 dest)
@@ -218,6 +339,15 @@ Functions documentation
       | *[in]*  **v**     vector
       | *[out]* **dest**  rotated vector
 
+.. c:function:: void  glm_vec_rotate_m3(mat3 m, vec3 v, vec3 dest)
+
+    apply rotation matrix to vector
+
+    Parameters:
+      | *[in]*  **m**     rotation matrix (mat3)
+      | *[in]*  **v**     vector
+      | *[out]* **dest**  rotated vector
+
 .. c:function:: void  glm_vec_proj(vec3 a, vec3 b, vec3 dest)
 
     project a vector onto b vector
@@ -281,3 +411,15 @@ Functions documentation
       | *[in, out]*  **v**       vector
       | *[in]*       **minVal**  minimum value
       | *[in]*       **maxVal**  maximum value
+
+.. c:function:: void  glm_vec_lerp(vec3 from, vec3 to, float t, vec3 dest)
+
+    linear interpolation between two vectors
+
+    | formula:  from + t * (to - from)
+
+    Parameters:
+      | *[in]*  **from**   from value
+      | *[in]*  **to**     to value
+      | *[in]*  **t**      interpolant (amount) clamped between 0 and 1
+      | *[out]* **dest**   destination
diff --git a/docs/source/vec4-ext.rst b/docs/source/vec4-ext.rst
index 11613ad..722424e 100644
--- a/docs/source/vec4-ext.rst
+++ b/docs/source/vec4-ext.rst
@@ -96,3 +96,43 @@ Functions documentation
 
     Parameters:
       | *[in]*  **v**  vector
+
+.. c:function:: bool glm_vec4_isnan(vec4 v)
+
+    | check if one of items is NaN (not a number)
+    | you should only use this in DEBUG mode or very critical asserts
+
+    Parameters:
+      | *[in]*  **v**  vector
+
+.. c:function:: bool glm_vec4_isinf(vec4 v)
+
+    | check if one of items is INFINITY
+    | you should only use this in DEBUG mode or very critical asserts
+
+    Parameters:
+      | *[in]*  **v**  vector
+
+.. c:function:: bool glm_vec4_isvalid(vec4 v)
+
+    | check if all items are valid number
+    | you should only use this in DEBUG mode or very critical asserts
+
+    Parameters:
+      | *[in]*  **v**  vector
+
+.. c:function:: void glm_vec4_sign(vec4 v, vec4 dest)
+
+    get sign of 32 bit float as +1, -1, 0
+
+    Parameters:
+      | *[in]*   **v**     vector
+      | *[out]*  **dest**  sign vector (only keeps signs as -1, 0, +1)
+
+.. c:function:: void glm_vec4_sqrt(vec4 v, vec4 dest)
+
+    square root of each vector item
+
+    Parameters:
+      | *[in]*   **v**     vector
+      | *[out]*  **dest**  destination vector (sqrt(v))
diff --git a/docs/source/vec4.rst b/docs/source/vec4.rst
index ac1b9c5..a1b7250 100644
--- a/docs/source/vec4.rst
+++ b/docs/source/vec4.rst
@@ -24,14 +24,26 @@ Functions:
 1. :c:func:`glm_vec4`
 #. :c:func:`glm_vec4_copy3`
 #. :c:func:`glm_vec4_copy`
+#. :c:func:`glm_vec4_zero`
+#. :c:func:`glm_vec4_one`
 #. :c:func:`glm_vec4_dot`
 #. :c:func:`glm_vec4_norm2`
 #. :c:func:`glm_vec4_norm`
 #. :c:func:`glm_vec4_add`
+#. :c:func:`glm_vec4_adds`
 #. :c:func:`glm_vec4_sub`
+#. :c:func:`glm_vec4_subs`
+#. :c:func:`glm_vec4_mul`
 #. :c:func:`glm_vec4_scale`
 #. :c:func:`glm_vec4_scale_as`
+#. :c:func:`glm_vec4_div`
+#. :c:func:`glm_vec4_divs`
+#. :c:func:`glm_vec4_addadd`
+#. :c:func:`glm_vec4_subadd`
+#. :c:func:`glm_vec4_muladd`
+#. :c:func:`glm_vec4_muladds`
 #. :c:func:`glm_vec4_flipsign`
+#. :c:func:`glm_vec4_flipsign_to`
 #. :c:func:`glm_vec4_inv`
 #. :c:func:`glm_vec4_inv_to`
 #. :c:func:`glm_vec4_normalize`
@@ -40,6 +52,12 @@ Functions:
 #. :c:func:`glm_vec4_maxv`
 #. :c:func:`glm_vec4_minv`
 #. :c:func:`glm_vec4_clamp`
+#. :c:func:`glm_vec4_lerp`
+#. :c:func:`glm_vec4_isnan`
+#. :c:func:`glm_vec4_isinf`
+#. :c:func:`glm_vec4_isvalid`
+#. :c:func:`glm_vec4_sign`
+#. :c:func:`glm_vec4_sqrt`
 
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -71,6 +89,13 @@ Functions documentation
       | *[in]*  **v**     source
       | *[in]*  **dest**  destination
 
+.. c:function:: void  glm_vec4_zero(vec4 v)
+
+    makes all members zero
+
+    Parameters:
+      | *[in, out]*  **v**     vector
+
 .. c:function:: float  glm_vec4_dot(vec4 a, vec4 b)
 
     dot product of vec4
@@ -103,24 +128,51 @@ Functions documentation
     Parameters:
       | *[in]*  **vec**   vector
 
-.. c:function:: void  glm_vec4_add(vec4 v1, vec4 v2, vec4 dest)
+.. c:function:: void  glm_vec4_add(vec4 a, vec4 b, vec4 dest)
 
-    add v2 vector to v1 vector store result in dest
+    add a vector to b vector store result in dest
 
     Parameters:
-      | *[in]*  **v1**    vector1
-      | *[in]*  **v2**    vector2
+      | *[in]*  **a**     vector1
+      | *[in]*  **b**     vector2
       | *[out]* **dest**  destination vector
 
-.. c:function:: void  glm_vec4_sub(vec4 v1, vec4 v2, vec4 dest)
+.. c:function:: void  glm_vec4_adds(vec4 v, float s, vec4 dest)
 
-    subtract v2 vector from v1 vector store result in dest
+    add scalar to v vector store result in dest (d = v + vec(s))
 
     Parameters:
-      | *[in]*  **v1**    vector1
-      | *[in]*  **v2**    vector2
+      | *[in]*  **v**     vector
+      | *[in]*  **s**     scalar
       | *[out]* **dest**  destination vector
 
+.. c:function:: void  glm_vec4_sub(vec4 a, vec4 b, vec4 dest)
+
+    subtract b vector from a vector store result in dest (d = a - b)
+
+    Parameters:
+      | *[in]*  **a**     vector1
+      | *[in]*  **b**     vector2
+      | *[out]* **dest**  destination vector
+
+.. c:function:: void  glm_vec4_subs(vec4 v, float s, vec4 dest)
+
+    subtract scalar from v vector store result in dest (d = v - vec(s))
+
+    Parameters:
+      | *[in]*  **v**     vector
+      | *[in]*  **s**     scalar
+      | *[out]* **dest**  destination vector
+
+.. c:function:: void  glm_vec4_mul(vec4 a, vec4 b, vec4 d)
+
+    multiply two vector (component-wise multiplication)
+
+    Parameters:
+      | *[in]*  **a**     vector1
+      | *[in]*  **b**     vector2
+      | *[out]* **d**     result = (a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3])
+
 .. c:function:: void glm_vec4_scale(vec4 v, float s, vec4 dest)
 
      multiply/scale vec4 vector with scalar: result = v * s
@@ -139,6 +191,64 @@ Functions documentation
       | *[in]*  **s**     scalar
       | *[out]* **dest**  destination vector
 
+.. c:function:: void  glm_vec4_div(vec4 a, vec4 b, vec4 dest)
+
+    div vector with another component-wise division: d = a / b
+
+    Parameters:
+      | *[in]*  **a**     vector1
+      | *[in]*  **b**     vector2
+      | *[out]* **dest**  result = (a[0] / b[0], a[1] / b[1], a[2] / b[2], a[3] / b[3])
+
+.. c:function:: void  glm_vec4_divs(vec4 v, float s, vec4 dest)
+
+    div vector with scalar: d = v / s
+
+    Parameters:
+      | *[in]*  **v**     vector
+      | *[in]*  **s**     scalar
+      | *[out]* **dest**  result = (v[0] / s, v[1] / s, v[2] / s, v[3] / s)
+
+.. c:function:: void  glm_vec4_addadd(vec4 a, vec4 b, vec4 dest)
+
+    | add two vectors and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  dest += (a + b)
+
+.. c:function:: void  glm_vec4_subadd(vec4 a, vec4 b, vec4 dest)
+
+    | sub two vectors and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  dest += (a - b)
+
+.. c:function:: void  glm_vec4_muladd(vec4 a, vec4 b, vec4 dest)
+
+    | mul two vectors and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector 1
+      | *[in]*  **b**     vector 2
+      | *[out]* **dest**  dest += (a * b)
+
+.. c:function:: void  glm_vec4_muladds(vec4 a, float s, vec4 dest)
+
+    | mul vector with scalar and add result to sum
+    | it applies += operator so dest must be initialized
+
+    Parameters:
+      | *[in]*  **a**     vector
+      | *[in]*  **s**     scalar
+      | *[out]* **dest**  dest += (a * s)
+
 .. c:function:: void  glm_vec4_flipsign(vec4 v)
 
     flip sign of all vec4 members
@@ -146,6 +256,14 @@ Functions documentation
     Parameters:
     | *[in, out]*  **v**    vector
 
+.. c:function:: void  glm_vec4_flipsign_to(vec4 v, vec4 dest)
+
+    flip sign of all vec4 members and store result in dest
+
+    Parameters:
+      | *[in]*  **v**       vector
+      | *[out]* **dest**    negated vector
+
 .. c:function:: void  glm_vec4_inv(vec4 v)
 
     make vector as inverse/opposite of itself
@@ -213,3 +331,15 @@ Functions documentation
       | *[in, out]*  **v**       vector
       | *[in]*       **minVal**  minimum value
       | *[in]*       **maxVal**  maximum value
+
+.. c:function:: void  glm_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest)
+
+    linear interpolation between two vectors
+
+    | formula:  from + t * (to - from)
+
+    Parameters:
+      | *[in]*  **from**   from value
+      | *[in]*  **to**     to value
+      | *[in]*  **t**      interpolant (amount) clamped between 0 and 1
+      | *[out]* **dest**   destination
diff --git a/include/cglm/affine-mat.h b/include/cglm/affine-mat.h
index 3fd19ef..62bfce1 100644
--- a/include/cglm/affine-mat.h
+++ b/include/cglm/affine-mat.h
@@ -16,6 +16,7 @@
 
 #include "common.h"
 #include "mat4.h"
+#include "mat3.h"
 
 #ifdef CGLM_SSE_FP
 #  include "simd/sse2/affine.h"
@@ -81,6 +82,59 @@ glm_mul(mat4 m1, mat4 m2, mat4 dest) {
 #endif
 }
 
+/*!
+ * @brief multiply m1 by rotation matrix m2 (dest = m1 * m2); this is similar
+ *        to glm_mat4_mul but specialized to a rotation-only right matrix
+ *
+ * Right Matrix (m2) format should be:
+ *   R  R  R  0
+ *   R  R  R  0
+ *   R  R  R  0
+ *   0  0  0  1
+ *
+ * scalar path reads only m2's upper-left 3x3 and copies m1[3] to dest[3].
+ * if you are not sure about matrix format then DON'T use this! use mat4_mul
+ *
+ * @param[in]   m1    left matrix (affine transform)
+ * @param[in]   m2    right matrix (rotation only, see format above)
+ * @param[out]  dest  result matrix
+ */
+CGLM_INLINE
+void
+glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mul_rot_sse2(m1, m2, dest);
+#else
+  float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3],
+        a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3],
+        a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3],
+        a30 = m1[3][0], a31 = m1[3][1], a32 = m1[3][2], a33 = m1[3][3],
+
+        b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2],
+        b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2],
+        b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2];
+
+  dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02;
+  dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02;
+  dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02;
+  dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02;
+
+  dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12;
+  dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12;
+  dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12;
+  dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12;
+
+  dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22;
+  dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22;
+  dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22;
+  dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22;
+
+  dest[3][0] = a30;
+  dest[3][1] = a31;
+  dest[3][2] = a32;
+  dest[3][3] = a33;
+#endif
+}
+
 /*!
  * @brief inverse orthonormal rotation + translation matrix (ridig-body)
  *
diff --git a/include/cglm/affine.h b/include/cglm/affine.h
index 10f190f..dd7dbd1 100644
--- a/include/cglm/affine.h
+++ b/include/cglm/affine.h
@@ -16,15 +16,14 @@
    CGLM_INLINE void glm_scale_to(mat4 m, vec3 v, mat4 dest);
    CGLM_INLINE void glm_scale_make(mat4 m, vec3 v);
    CGLM_INLINE void glm_scale(mat4 m, vec3 v);
-   CGLM_INLINE void glm_scale1(mat4 m, float s);
    CGLM_INLINE void glm_scale_uni(mat4 m, float s);
    CGLM_INLINE void glm_rotate_x(mat4 m, float angle, mat4 dest);
    CGLM_INLINE void glm_rotate_y(mat4 m, float angle, mat4 dest);
    CGLM_INLINE void glm_rotate_z(mat4 m, float angle, mat4 dest);
-   CGLM_INLINE void glm_rotate_ndc_make(mat4 m, float angle, vec3 axis_ndc);
    CGLM_INLINE void glm_rotate_make(mat4 m, float angle, vec3 axis);
-   CGLM_INLINE void glm_rotate_ndc(mat4 m, float angle, vec3 axis);
    CGLM_INLINE void glm_rotate(mat4 m, float angle, vec3 axis);
+   CGLM_INLINE void glm_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis);
+   CGLM_INLINE void glm_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis);
    CGLM_INLINE void glm_decompose_scalev(mat4 m, vec3 s);
    CGLM_INLINE bool glm_uniscaled(mat4 m);
    CGLM_INLINE void glm_decompose_rs(mat4 m, mat4 r, vec3 s);
@@ -35,9 +34,15 @@
 #define cglm_affine_h
 
 #include "common.h"
-#include "vec4.h"
-#include "affine-mat.h"
 #include "util.h"
+#include "vec3.h"
+#include "vec4.h"
+#include "mat4.h"
+#include "affine-mat.h"
+
+CGLM_INLINE
+void
+glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest);
 
 /*!
  * @brief translate existing transform matrix by v vector
@@ -53,19 +58,19 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) {
   mat4 t = GLM_MAT4_IDENTITY_INIT;
 
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(dest[3],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(t[0]),
-                                                _mm_set1_ps(v[0])),
-                                     _mm_mul_ps(_mm_load_ps(t[1]),
-                                                _mm_set1_ps(v[1]))),
-                          _mm_add_ps(_mm_mul_ps(_mm_load_ps(t[2]),
-                                                _mm_set1_ps(v[2])),
-                                     _mm_load_ps(t[3]))))
+  glmm_store(dest[3],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(t[0]),
+                                              _mm_set1_ps(v[0])),
+                                   _mm_mul_ps(glmm_load(t[1]),
+                                              _mm_set1_ps(v[1]))),
+                        _mm_add_ps(_mm_mul_ps(glmm_load(t[2]),
+                                              _mm_set1_ps(v[2])),
+                                   glmm_load(t[3]))))
   ;
 
-  _mm_store_ps(dest[0], _mm_load_ps(m[0]));
-  _mm_store_ps(dest[1], _mm_load_ps(m[1]));
-  _mm_store_ps(dest[2], _mm_load_ps(m[2]));
+  glmm_store(dest[0], glmm_load(m[0]));
+  glmm_store(dest[1], glmm_load(m[1]));
+  glmm_store(dest[2], glmm_load(m[2]));
 #else
   vec4 v1, v2, v3;
 
@@ -92,14 +97,14 @@ CGLM_INLINE
 void
 glm_translate(mat4 m, vec3 v) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(m[3],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]),
-                                                _mm_set1_ps(v[0])),
-                                     _mm_mul_ps(_mm_load_ps(m[1]),
-                                                _mm_set1_ps(v[1]))),
-                          _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]),
-                                                _mm_set1_ps(v[2])),
-                                     _mm_load_ps(m[3]))))
+  glmm_store(m[3],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
+                                              _mm_set1_ps(v[0])),
+                                   _mm_mul_ps(glmm_load(m[1]),
+                                              _mm_set1_ps(v[1]))),
+                        _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
+                                              _mm_set1_ps(v[2])),
+                                   glmm_load(m[3]))))
   ;
 #else
   vec4 v1, v2, v3;
@@ -124,10 +129,10 @@ CGLM_INLINE
 void
 glm_translate_x(mat4 m, float x) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(m[3],
-               _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]),
-                                     _mm_set1_ps(x)),
-                          _mm_load_ps(m[3])))
+  glmm_store(m[3],
+             _mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
+                                   _mm_set1_ps(x)),
+                        glmm_load(m[3])))
   ;
 #else
   vec4 v1;
@@ -146,10 +151,10 @@ CGLM_INLINE
 void
 glm_translate_y(mat4 m, float y) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(m[3],
-               _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[1]),
-                                     _mm_set1_ps(y)),
-                          _mm_load_ps(m[3])))
+  glmm_store(m[3],
+             _mm_add_ps(_mm_mul_ps(glmm_load(m[1]),
+                                   _mm_set1_ps(y)),
+                        glmm_load(m[3])))
   ;
 #else
   vec4 v1;
@@ -168,10 +173,10 @@ CGLM_INLINE
 void
 glm_translate_z(mat4 m, float z) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(m[3],
-               _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]),
-                                     _mm_set1_ps(z)),
-                          _mm_load_ps(m[3])))
+  glmm_store(m[3],
+             _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
+                                   _mm_set1_ps(z)),
+                        glmm_load(m[3])))
   ;
 #else
   vec4 v1;
@@ -237,16 +242,6 @@ glm_scale(mat4 m, vec3 v) {
   glm_scale_to(m, v, m);
 }
 
-/*!
- * @brief DEPRECATED! Use glm_scale_uni
- */
-CGLM_INLINE
-void
-glm_scale1(mat4 m, float s) {
-  vec3 v = { s, s, s };
-  glm_scale_to(m, v, m);
-}
-
 /*!
  * @brief applies uniform scale to existing transform matrix v = [s, s, s]
  *        and stores result in same matrix
@@ -272,19 +267,18 @@ glm_scale_uni(mat4 m, float s) {
 CGLM_INLINE
 void
 glm_rotate_x(mat4 m, float angle, mat4 dest) {
-  float cosVal;
-  float sinVal;
   mat4  t = GLM_MAT4_IDENTITY_INIT;
+  float c, s;
 
-  cosVal = cosf(angle);
-  sinVal = sinf(angle);
+  c = cosf(angle);
+  s = sinf(angle);
 
-  t[1][1] =  cosVal;
-  t[1][2] =  sinVal;
-  t[2][1] = -sinVal;
-  t[2][2] =  cosVal;
+  t[1][1] =  c;
+  t[1][2] =  s;
+  t[2][1] = -s;
+  t[2][2] =  c;
 
-  glm_mat4_mul(m, t, dest);
+  glm_mul_rot(m, t, dest);
 }
 
 /*!
@@ -298,19 +292,18 @@ glm_rotate_x(mat4 m, float angle, mat4 dest) {
 CGLM_INLINE
 void
 glm_rotate_y(mat4 m, float angle, mat4 dest) {
-  float cosVal;
-  float sinVal;
   mat4  t = GLM_MAT4_IDENTITY_INIT;
+  float c, s;
 
-  cosVal = cosf(angle);
-  sinVal = sinf(angle);
+  c = cosf(angle);
+  s = sinf(angle);
 
-  t[0][0] =  cosVal;
-  t[0][2] = -sinVal;
-  t[2][0] =  sinVal;
-  t[2][2] =  cosVal;
+  t[0][0] =  c;
+  t[0][2] = -s;
+  t[2][0] =  s;
+  t[2][2] =  c;
 
-  glm_mat4_mul(m, t, dest);
+  glm_mul_rot(m, t, dest);
 }
 
 /*!
@@ -324,61 +317,18 @@ glm_rotate_y(mat4 m, float angle, mat4 dest) {
 CGLM_INLINE
 void
 glm_rotate_z(mat4 m, float angle, mat4 dest) {
-  float cosVal;
-  float sinVal;
   mat4  t = GLM_MAT4_IDENTITY_INIT;
-
-  cosVal = cosf(angle);
-  sinVal = sinf(angle);
-
-  t[0][0] =  cosVal;
-  t[0][1] =  sinVal;
-  t[1][0] = -sinVal;
-  t[1][1] =  cosVal;
-
-  glm_mat4_mul(m, t, dest);
-}
-
-/*!
- * @brief creates NEW rotation matrix by angle and axis
- *
- * this name may change in the future. axis must be is normalized
- *
- * @param[out] m        affine transfrom
- * @param[in]  angle    angle (radians)
- * @param[in]  axis_ndc normalized axis
- */
-CGLM_INLINE
-void
-glm_rotate_ndc_make(mat4 m, float angle, vec3 axis_ndc) {
-  /* https://www.opengl.org/sdk/docs/man2/xhtml/glRotate.xml */
-
-  vec3 v, vs;
-  float c;
+  float c, s;
 
   c = cosf(angle);
+  s = sinf(angle);
 
-  glm_vec_scale(axis_ndc, 1.0f - c, v);
-  glm_vec_scale(axis_ndc, sinf(angle), vs);
+  t[0][0] =  c;
+  t[0][1] =  s;
+  t[1][0] = -s;
+  t[1][1] =  c;
 
-  glm_vec_scale(axis_ndc, v[0], m[0]);
-  glm_vec_scale(axis_ndc, v[1], m[1]);
-  glm_vec_scale(axis_ndc, v[2], m[2]);
-
-  m[0][0] += c;
-  m[0][1] += vs[2];
-  m[0][2] -= vs[1];
-
-  m[1][0] -= vs[2];
-  m[1][1] += c;
-  m[1][2] += vs[0];
-
-  m[2][0] += vs[1];
-  m[2][1] -= vs[0];
-  m[2][2] += c;
-
-  m[0][3] = m[1][3] = m[2][3] = m[3][0] = m[3][1] = m[3][2] = 0.0f;
-  m[3][3] = 1.0f;
+  glm_mul_rot(m, t, dest);
 }
 
 /*!
@@ -393,53 +343,29 @@ glm_rotate_ndc_make(mat4 m, float angle, vec3 axis_ndc) {
 CGLM_INLINE
 void
 glm_rotate_make(mat4 m, float angle, vec3 axis) {
-  vec3 axis_ndc;
+  vec3  axisn, v, vs;
+  float c;
 
-  glm_vec_normalize_to(axis, axis_ndc);
-  glm_rotate_ndc_make(m, angle, axis_ndc);
+  c = cosf(angle);
+
+  glm_vec_normalize_to(axis, axisn);
+  glm_vec_scale(axisn, 1.0f - c, v);
+  glm_vec_scale(axisn, sinf(angle), vs);
+
+  glm_vec_scale(axisn, v[0], m[0]);
+  glm_vec_scale(axisn, v[1], m[1]);
+  glm_vec_scale(axisn, v[2], m[2]);
+
+  m[0][0] += c;       m[1][0] -= vs[2];   m[2][0] += vs[1];
+  m[0][1] += vs[2];   m[1][1] += c;       m[2][1] -= vs[0];
+  m[0][2] -= vs[1];   m[1][2] += vs[0];   m[2][2] += c;
+
+  m[0][3] = m[1][3] = m[2][3] = m[3][0] = m[3][1] = m[3][2] = 0.0f;
+  m[3][3] = 1.0f;
 }
 
 /*!
- * @brief rotate existing transform matrix around Z axis by angle and axis
- *
- * this name may change in the future, axis must be normalized.
- *
- * @param[in, out]  m         affine transfrom
- * @param[in]       angle     angle (radians)
- * @param[in]       axis_ndc  normalized axis
- */
-CGLM_INLINE
-void
-glm_rotate_ndc(mat4 m, float angle, vec3 axis_ndc) {
-  mat4 rot, tmp;
-
-  glm_rotate_ndc_make(rot, angle, axis_ndc);
-
-  glm_vec4_scale(m[0], rot[0][0], tmp[1]);
-  glm_vec4_scale(m[1], rot[0][1], tmp[0]);
-  glm_vec4_add(tmp[1], tmp[0],    tmp[1]);
-  glm_vec4_scale(m[2], rot[0][2], tmp[0]);
-  glm_vec4_add(tmp[1], tmp[0],    tmp[1]);
-
-  glm_vec4_scale(m[0], rot[1][0], tmp[2]);
-  glm_vec4_scale(m[1], rot[1][1], tmp[0]);
-  glm_vec4_add(tmp[2], tmp[0],    tmp[2]);
-  glm_vec4_scale(m[2], rot[1][2], tmp[0]);
-  glm_vec4_add(tmp[2], tmp[0],    tmp[2]);
-
-  glm_vec4_scale(m[0], rot[2][0], tmp[3]);
-  glm_vec4_scale(m[1], rot[2][1], tmp[0]);
-  glm_vec4_add(tmp[3], tmp[0],    tmp[3]);
-  glm_vec4_scale(m[2], rot[2][2], tmp[0]);
-  glm_vec4_add(tmp[3], tmp[0],    tmp[3]);
-
-  glm_vec4_copy(tmp[1], m[0]);
-  glm_vec4_copy(tmp[2], m[1]);
-  glm_vec4_copy(tmp[3], m[2]);
-}
-
-/*!
- * @brief rotate existing transform matrix around Z axis by angle and axis
+ * @brief rotate existing transform matrix around given axis by angle
  *
  * @param[in, out]  m      affine transfrom
  * @param[in]       angle  angle (radians)
@@ -448,10 +374,56 @@ glm_rotate_ndc(mat4 m, float angle, vec3 axis_ndc) {
 CGLM_INLINE
 void
 glm_rotate(mat4 m, float angle, vec3 axis) {
-  vec3 axis_ndc;
+  mat4 rot;
+  glm_rotate_make(rot, angle, axis);
+  glm_mul_rot(m, rot, m);
+}
 
-  glm_vec_normalize_to(axis, axis_ndc);
-  glm_rotate_ndc(m, angle, axis_ndc);
+/*!
+ * @brief rotate existing transform
+ *        around given axis by angle at given pivot point (rotation center)
+ *
+ * @param[in, out]  m      affine transform
+ * @param[in]       pivot  rotation center
+ * @param[in]       angle  angle (radians)
+ * @param[in]       axis   axis
+ */
+CGLM_INLINE
+void
+glm_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis) {
+  vec3 pivotInv;
+
+  glm_vec_inv_to(pivot, pivotInv);
+
+  glm_translate(m, pivot);
+  glm_rotate(m, angle, axis);
+  glm_translate(m, pivotInv);
+}
+
+/*!
+ * @brief creates NEW rotation matrix by angle and axis at given point
+ *
+ * this creates rotation matrix, it assumes you don't have a matrix
+ *
+ * this should work faster than glm_rotate_at because it reduces
+ * one glm_translate.
+ *
+ * @param[out] m      affine transform
+ * @param[in]  pivot  rotation center
+ * @param[in]  angle  angle (radians)
+ * @param[in]  axis   axis
+ */
+CGLM_INLINE
+void
+glm_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis) {
+  vec3 pivotInv;
+
+  glm_vec_inv_to(pivot, pivotInv);
+
+  glm_mat4_identity(m);
+  glm_vec_copy(pivot, m[3]);
+  glm_rotate(m, angle, axis);
+  glm_translate(m, pivotInv);
 }
 
 /*!
@@ -469,7 +441,7 @@ glm_decompose_scalev(mat4 m, vec3 s) {
 }
 
 /*!
- * @brief returns true if matrix is uniform scaled. This is helpful for 
+ * @brief returns true if matrix is uniform scaled. This is helpful for
  *        creating normal matrix.
  *
  * @param[in] m m
diff --git a/include/cglm/box.h b/include/cglm/box.h
index 7032339..31b0ec2 100644
--- a/include/cglm/box.h
+++ b/include/cglm/box.h
@@ -11,6 +11,7 @@
 #include "common.h"
 #include "vec3.h"
 #include "vec4.h"
+#include "util.h"
 
 /*!
  * @brief apply transform to Axis-Aligned Bounding Box
diff --git a/include/cglm/call/affine.h b/include/cglm/call/affine.h
index 3d462ee..4d3834b 100644
--- a/include/cglm/call/affine.h
+++ b/include/cglm/call/affine.h
@@ -13,6 +13,10 @@ extern "C" {
 
 #include "../cglm.h"
 
+CGLM_EXPORT
+void
+glmc_translate_make(mat4 m, vec3 v);
+
 CGLM_EXPORT
 void
 glmc_translate_to(mat4 m, vec3 v, mat4 dest);
@@ -33,6 +37,10 @@ CGLM_EXPORT
 void
 glmc_translate_z(mat4 m, float to);
 
+CGLM_EXPORT
+void
+glmc_scale_make(mat4 m, vec3 v);
+
 CGLM_EXPORT
 void
 glmc_scale_to(mat4 m, vec3 v, mat4 dest);
@@ -43,7 +51,7 @@ glmc_scale(mat4 m, vec3 v);
 
 CGLM_EXPORT
 void
-glmc_scale1(mat4 m, float s);
+glmc_scale_uni(mat4 m, float s);
 
 CGLM_EXPORT
 void
@@ -57,26 +65,30 @@ CGLM_EXPORT
 void
 glmc_rotate_z(mat4 m, float rad, mat4 dest);
 
-CGLM_EXPORT
-void
-glmc_rotate_ndc_make(mat4 m, float angle, vec3 axis_ndc);
-
 CGLM_EXPORT
 void
 glmc_rotate_make(mat4 m, float angle, vec3 axis);
 
-CGLM_EXPORT
-void
-glmc_rotate_ndc(mat4 m, float angle, vec3 axis_ndc);
-
 CGLM_EXPORT
 void
 glmc_rotate(mat4 m, float angle, vec3 axis);
 
+CGLM_EXPORT
+void
+glmc_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis);
+
+CGLM_EXPORT
+void
+glmc_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis);
+
 CGLM_EXPORT
 void
 glmc_decompose_scalev(mat4 m, vec3 s);
 
+CGLM_EXPORT
+bool
+glmc_uniscaled(mat4 m);
+
 CGLM_EXPORT
 void
 glmc_decompose_rs(mat4 m, mat4 r, vec3 s);
diff --git a/include/cglm/call/mat4.h b/include/cglm/call/mat4.h
index 6ea81e4..c9ad796 100644
--- a/include/cglm/call/mat4.h
+++ b/include/cglm/call/mat4.h
@@ -47,12 +47,16 @@ glmc_mat4_mul(mat4 m1, mat4 m2, mat4 dest);
 
 CGLM_EXPORT
 void
-glmc_mat4_mulN(mat4 * __restrict matrices[], int len, mat4 dest);
+glmc_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest);
 
 CGLM_EXPORT
 void
 glmc_mat4_mulv(mat4 m, vec4 v, vec4 dest);
 
+CGLM_EXPORT
+void
+glmc_mat4_quat(mat4 m, versor dest);
+
 CGLM_EXPORT
 void
 glmc_mat4_transpose_to(mat4 m, mat4 dest);
diff --git a/include/cglm/call/quat.h b/include/cglm/call/quat.h
index 0dff506..ae4c9ef 100644
--- a/include/cglm/call/quat.h
+++ b/include/cglm/call/quat.h
@@ -19,33 +19,79 @@ glmc_quat_identity(versor q);
 
 CGLM_EXPORT
 void
-glmc_quat(versor q,
-          float angle,
-          float x,
-          float y,
-          float z);
+glmc_quat_init(versor q, float x, float y, float z, float w);
 
 CGLM_EXPORT
 void
-glmc_quatv(versor q,
-          float  angle,
-          vec3   v);
+glmc_quat(versor q, float angle, float x, float y, float z);
+
+CGLM_EXPORT
+void
+glmc_quatv(versor q, float angle, vec3 axis);
+
+CGLM_EXPORT
+void
+glmc_quat_copy(versor q, versor dest);
 
 CGLM_EXPORT
 float
 glmc_quat_norm(versor q);
 
+CGLM_EXPORT
+void
+glmc_quat_normalize_to(versor q, versor dest);
+
 CGLM_EXPORT
 void
 glmc_quat_normalize(versor q);
 
 CGLM_EXPORT
 float
-glmc_quat_dot(versor q, versor r);
+glmc_quat_dot(versor p, versor q);
 
 CGLM_EXPORT
 void
-glmc_quat_mulv(versor q1, versor q2, versor dest);
+glmc_quat_conjugate(versor q, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_inv(versor q, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_add(versor p, versor q, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_sub(versor p, versor q, versor dest);
+
+CGLM_EXPORT
+float
+glmc_quat_real(versor q);
+
+CGLM_EXPORT
+void
+glmc_quat_imag(versor q, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_imagn(versor q, vec3 dest);
+
+CGLM_EXPORT
+float
+glmc_quat_imaglen(versor q);
+
+CGLM_EXPORT
+float
+glmc_quat_angle(versor q);
+
+CGLM_EXPORT
+void
+glmc_quat_axis(versor q, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_mul(versor p, versor q, versor dest);
 
 CGLM_EXPORT
 void
@@ -53,10 +99,51 @@ glmc_quat_mat4(versor q, mat4 dest);
 
 CGLM_EXPORT
 void
-glmc_quat_slerp(versor q,
-                versor r,
-                float  t,
-                versor dest);
+glmc_quat_mat4t(versor q, mat4 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_mat3(versor q, mat3 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_mat3t(versor q, mat3 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_lerp(versor from, versor to, float t, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_slerp(versor q, versor r, float t, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_look(vec3 eye, versor ori, mat4 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_forp(vec3 from, vec3 to, vec3 fwd, vec3 up, versor dest);
+
+CGLM_EXPORT
+void
+glmc_quat_rotatev(versor from, vec3 to, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_rotate(mat4 m, versor q, mat4 dest);
+
+CGLM_EXPORT
+void
+glmc_quat_rotate_at(mat4 model, versor q, vec3 pivot);
+
+CGLM_EXPORT
+void
+glmc_quat_rotate_atm(mat4 m, versor q, vec3 pivot);
 
 #ifdef __cplusplus
 }
diff --git a/include/cglm/call/vec3.h b/include/cglm/call/vec3.h
index 461de0b..8fcee4f 100644
--- a/include/cglm/call/vec3.h
+++ b/include/cglm/call/vec3.h
@@ -16,10 +16,22 @@ extern "C" {
 /* DEPRECATED! use _copy, _ucopy versions */
 #define glmc_vec_dup(v, dest) glmc_vec_copy(v, dest)
 
+CGLM_EXPORT
+void
+glmc_vec3(vec4 v4, vec3 dest);
+
 CGLM_EXPORT
 void
 glmc_vec_copy(vec3 a, vec3 dest);
 
+CGLM_EXPORT
+void
+glmc_vec_zero(vec3 v);
+
+CGLM_EXPORT
+void
+glmc_vec_one(vec3 v);
+
 CGLM_EXPORT
 float
 glmc_vec_dot(vec3 a, vec3 b);
@@ -50,7 +62,19 @@ glmc_vec_add(vec3 v1, vec3 v2, vec3 dest);
 
 CGLM_EXPORT
 void
-glmc_vec_sub(vec3 v1, vec3 v2, vec3 dest);
+glmc_vec_adds(vec3 v, float s, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_sub(vec3 a, vec3 b, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_subs(vec3 v, float s, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_mul(vec3 a, vec3 b, vec3 d);
 
 CGLM_EXPORT
 void
@@ -60,10 +84,38 @@ CGLM_EXPORT
 void
 glmc_vec_scale_as(vec3 v, float s, vec3 dest);
 
+CGLM_EXPORT
+void
+glmc_vec_div(vec3 a, vec3 b, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_divs(vec3 a, float s, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_addadd(vec3 a, vec3 b, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_subadd(vec3 a, vec3 b, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_muladd(vec3 a, vec3 b, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_muladds(vec3 a, float s, vec3 dest);
+
 CGLM_EXPORT
 void
 glmc_vec_flipsign(vec3 v);
 
+CGLM_EXPORT
+void
+glmc_vec_flipsign_to(vec3 v, vec3 dest);
+
 CGLM_EXPORT
 void
 glmc_vec_inv(vec3 v);
@@ -108,6 +160,72 @@ CGLM_EXPORT
 void
 glmc_vec_clamp(vec3 v, float minVal, float maxVal);
 
+CGLM_EXPORT
+void
+glmc_vec_ortho(vec3 v, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_lerp(vec3 from, vec3 to, float t, vec3 dest);
+
+/* ext */
+
+CGLM_EXPORT
+void
+glmc_vec_mulv(vec3 a, vec3 b, vec3 d);
+
+CGLM_EXPORT
+void
+glmc_vec_broadcast(float val, vec3 d);
+
+CGLM_EXPORT
+bool
+glmc_vec_eq(vec3 v, float val);
+
+CGLM_EXPORT
+bool
+glmc_vec_eq_eps(vec3 v, float val);
+
+CGLM_EXPORT
+bool
+glmc_vec_eq_all(vec3 v);
+
+CGLM_EXPORT
+bool
+glmc_vec_eqv(vec3 v1, vec3 v2);
+
+CGLM_EXPORT
+bool
+glmc_vec_eqv_eps(vec3 v1, vec3 v2);
+
+CGLM_EXPORT
+float
+glmc_vec_max(vec3 v);
+
+CGLM_EXPORT
+float
+glmc_vec_min(vec3 v);
+
+CGLM_EXPORT
+bool
+glmc_vec_isnan(vec3 v);
+
+CGLM_EXPORT
+bool
+glmc_vec_isinf(vec3 v);
+
+CGLM_EXPORT
+bool
+glmc_vec_isvalid(vec3 v);
+
+CGLM_EXPORT
+void
+glmc_vec_sign(vec3 v, vec3 dest);
+
+CGLM_EXPORT
+void
+glmc_vec_sqrt(vec3 v, vec3 dest);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/cglm/call/vec4.h b/include/cglm/call/vec4.h
index b63af80..adcfc94 100644
--- a/include/cglm/call/vec4.h
+++ b/include/cglm/call/vec4.h
@@ -17,6 +17,18 @@ extern "C" {
 #define glmc_vec4_dup3(v, dest) glmc_vec4_copy3(v, dest)
 #define glmc_vec4_dup(v, dest)  glmc_vec4_copy(v, dest)
 
+CGLM_EXPORT
+void
+glmc_vec4(vec3 v3, float last, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_zero(vec4 v);
+
+CGLM_EXPORT
+void
+glmc_vec4_one(vec4 v);
+
 CGLM_EXPORT
 void
 glmc_vec4_copy3(vec4 a, vec3 dest);
@@ -47,11 +59,23 @@ glmc_vec4_normalize(vec4 v);
 
 CGLM_EXPORT
 void
-glmc_vec4_add(vec4 v1, vec4 v2, vec4 dest);
+glmc_vec4_add(vec4 a, vec4 b, vec4 dest);
 
 CGLM_EXPORT
 void
-glmc_vec4_sub(vec4 v1, vec4 v2, vec4 dest);
+glmc_vec4_adds(vec4 v, float s, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_sub(vec4 a, vec4 b, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_subs(vec4 v, float s, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_mul(vec4 a, vec4 b, vec4 d);
 
 CGLM_EXPORT
 void
@@ -61,10 +85,38 @@ CGLM_EXPORT
 void
 glmc_vec4_scale_as(vec3 v, float s, vec3 dest);
 
+CGLM_EXPORT
+void
+glmc_vec4_div(vec4 a, vec4 b, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_divs(vec4 v, float s, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_addadd(vec4 a, vec4 b, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_subadd(vec4 a, vec4 b, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_muladd(vec4 a, vec4 b, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_muladds(vec4 a, float s, vec4 dest);
+
 CGLM_EXPORT
 void
 glmc_vec4_flipsign(vec4 v);
 
+CGLM_EXPORT
+void
+glmc_vec4_flipsign_to(vec4 v, vec4 dest);
+
 CGLM_EXPORT
 void
 glmc_vec4_inv(vec4 v);
@@ -89,6 +141,68 @@ CGLM_EXPORT
 void
 glmc_vec4_clamp(vec4 v, float minVal, float maxVal);
 
+CGLM_EXPORT
+void
+glmc_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest);
+
+/* ext */
+
+CGLM_EXPORT
+void
+glmc_vec4_mulv(vec4 a, vec4 b, vec4 d);
+
+CGLM_EXPORT
+void
+glmc_vec4_broadcast(float val, vec4 d);
+
+CGLM_EXPORT
+bool
+glmc_vec4_eq(vec4 v, float val);
+
+CGLM_EXPORT
+bool
+glmc_vec4_eq_eps(vec4 v, float val);
+
+CGLM_EXPORT
+bool
+glmc_vec4_eq_all(vec4 v);
+
+CGLM_EXPORT
+bool
+glmc_vec4_eqv(vec4 v1, vec4 v2);
+
+CGLM_EXPORT
+bool
+glmc_vec4_eqv_eps(vec4 v1, vec4 v2);
+
+CGLM_EXPORT
+float
+glmc_vec4_max(vec4 v);
+
+CGLM_EXPORT
+float
+glmc_vec4_min(vec4 v);
+
+CGLM_EXPORT
+bool
+glmc_vec4_isnan(vec4 v);
+
+CGLM_EXPORT
+bool
+glmc_vec4_isinf(vec4 v);
+
+CGLM_EXPORT
+bool
+glmc_vec4_isvalid(vec4 v);
+
+CGLM_EXPORT
+void
+glmc_vec4_sign(vec4 v, vec4 dest);
+
+CGLM_EXPORT
+void
+glmc_vec4_sqrt(vec4 v, vec4 dest);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/cglm/common.h b/include/cglm/common.h
index f0eb965..199bfda 100644
--- a/include/cglm/common.h
+++ b/include/cglm/common.h
@@ -14,7 +14,7 @@
 #include <math.h>
 #include <float.h>
 
-#if defined(_WIN32)
+#if defined(_MSC_VER)
 #  ifdef CGLM_DLL
 #    define CGLM_EXPORT __declspec(dllexport)
 #  else
diff --git a/include/cglm/frustum.h b/include/cglm/frustum.h
index 4f8aff4..d4e8d47 100644
--- a/include/cglm/frustum.h
+++ b/include/cglm/frustum.h
@@ -10,6 +10,9 @@
 
 #include "common.h"
 #include "plane.h"
+#include "vec3.h"
+#include "vec4.h"
+#include "mat4.h"
 
 #define GLM_LBN 0 /* left  bottom near */
 #define GLM_LTN 1 /* left  top    near */
diff --git a/include/cglm/mat3.h b/include/cglm/mat3.h
index 61c4f3d..87bf9b1 100644
--- a/include/cglm/mat3.h
+++ b/include/cglm/mat3.h
@@ -31,6 +31,7 @@
 #define cglm_mat3_h
 
 #include "common.h"
+#include "vec3.h"
 
 #ifdef CGLM_SSE_FP
 #  include "simd/sse2/mat3.h"
@@ -186,6 +187,56 @@ glm_mat3_mulv(mat3 m, vec3 v, vec3 dest) {
   dest[2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2];
 }
 
+
+/*!
+ * @brief convert mat3 (rotation matrix) to quaternion
+ *
+ * @param[in]  m    rotation matrix
+ * @param[out] dest destination quaternion
+ */
+CGLM_INLINE
+void
+glm_mat3_quat(mat3 m, versor dest) {
+  float trace, r, rinv;
+
+  /* it seems using like m12 instead of m[1][2] causes extra instructions */
+
+  trace = m[0][0] + m[1][1] + m[2][2];
+  if (trace >= 0.0f) {
+    r       = sqrtf(1.0f + trace);
+    rinv    = 0.5f / r;
+
+    dest[0] = rinv * (m[1][2] - m[2][1]);
+    dest[1] = rinv * (m[2][0] - m[0][2]);
+    dest[2] = rinv * (m[0][1] - m[1][0]);
+    dest[3] = r    * 0.5f;
+  } else if (m[0][0] >= m[1][1] && m[0][0] >= m[2][2]) {
+    r       = sqrtf(1.0f - m[1][1] - m[2][2] + m[0][0]);
+    rinv    = 0.5f / r;
+
+    dest[0] = r    * 0.5f;
+    dest[1] = rinv * (m[0][1] + m[1][0]);
+    dest[2] = rinv * (m[0][2] + m[2][0]);
+    dest[3] = rinv * (m[1][2] - m[2][1]);
+  } else if (m[1][1] >= m[2][2]) {
+    r       = sqrtf(1.0f - m[0][0] - m[2][2] + m[1][1]);
+    rinv    = 0.5f / r;
+
+    dest[0] = rinv * (m[0][1] + m[1][0]);
+    dest[1] = r    * 0.5f;
+    dest[2] = rinv * (m[1][2] + m[2][1]);
+    dest[3] = rinv * (m[2][0] - m[0][2]);
+  } else {
+    r       = sqrtf(1.0f - m[0][0] - m[1][1] + m[2][2]);
+    rinv    = 0.5f / r;
+
+    dest[0] = rinv * (m[0][2] + m[2][0]);
+    dest[1] = rinv * (m[1][2] + m[2][1]);
+    dest[2] = r    * 0.5f;
+    dest[3] = rinv * (m[0][1] - m[1][0]);
+  }
+}
+
 /*!
  * @brief scale (multiply with scalar) matrix
  *
diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index 45f54b4..f0b6736 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -45,6 +45,8 @@
 #define cglm_mat_h
 
 #include "common.h"
+#include "vec4.h"
+#include "vec3.h"
 
 #ifdef CGLM_SSE_FP
 #  include "simd/sse2/mat4.h"
@@ -58,7 +60,9 @@
 #  include "simd/neon/mat4.h"
 #endif
 
-#include <assert.h>
+#ifdef DEBUG
+# include <assert.h>
+#endif
 
 #define GLM_MAT4_IDENTITY_INIT  {{1.0f, 0.0f, 0.0f, 0.0f},                    \
                                  {0.0f, 1.0f, 0.0f, 0.0f},                    \
@@ -106,13 +110,13 @@ CGLM_INLINE
 void
 glm_mat4_copy(mat4 mat, mat4 dest) {
 #ifdef __AVX__
-  _mm256_store_ps(dest[0], _mm256_load_ps(mat[0]));
-  _mm256_store_ps(dest[2], _mm256_load_ps(mat[2]));
+  glmm_store256(dest[0], glmm_load256(mat[0]));
+  glmm_store256(dest[2], glmm_load256(mat[2]));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(dest[0], _mm_load_ps(mat[0]));
-  _mm_store_ps(dest[1], _mm_load_ps(mat[1]));
-  _mm_store_ps(dest[2], _mm_load_ps(mat[2]));
-  _mm_store_ps(dest[3], _mm_load_ps(mat[3]));
+  glmm_store(dest[0], glmm_load(mat[0]));
+  glmm_store(dest[1], glmm_load(mat[1]));
+  glmm_store(dest[2], glmm_load(mat[2]));
+  glmm_store(dest[3], glmm_load(mat[3]));
 #else
   glm_mat4_ucopy(mat, dest);
 #endif
@@ -281,19 +285,17 @@ glm_mat4_mul(mat4 m1, mat4 m2, mat4 dest) {
  */
 CGLM_INLINE
 void
-glm_mat4_mulN(mat4 * __restrict matrices[], int len, mat4 dest) {
-  int i;
+glm_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest) {
+  uint32_t i;
 
+#ifdef DEBUG
   assert(len > 1 && "there must be least 2 matrices to go!");
+#endif
 
-  glm_mat4_mul(*matrices[0],
-               *matrices[1],
-               dest);
+  glm_mat4_mul(*matrices[0], *matrices[1], dest);
 
   for (i = 2; i < len; i++)
-    glm_mat4_mul(dest,
-                 *matrices[i],
-                 dest);
+    glm_mat4_mul(dest, *matrices[i], dest);
 }
 
 /*!
@@ -318,6 +320,55 @@ glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
 #endif
 }
 
+/*!
+ * @brief convert mat4's rotation part to quaternion
+ *
+ * @param[in]  m    affine matrix
+ * @param[out] dest destination quaternion
+ */
+CGLM_INLINE
+void
+glm_mat4_quat(mat4 m, versor dest) {
+  float trace, r, rinv;
+
+  /* it seems using like m12 instead of m[1][2] causes extra instructions */
+
+  trace = m[0][0] + m[1][1] + m[2][2];
+  if (trace >= 0.0f) {
+    r       = sqrtf(1.0f + trace);
+    rinv    = 0.5f / r;
+
+    dest[0] = rinv * (m[1][2] - m[2][1]);
+    dest[1] = rinv * (m[2][0] - m[0][2]);
+    dest[2] = rinv * (m[0][1] - m[1][0]);
+    dest[3] = r    * 0.5f;
+  } else if (m[0][0] >= m[1][1] && m[0][0] >= m[2][2]) {
+    r       = sqrtf(1.0f - m[1][1] - m[2][2] + m[0][0]);
+    rinv    = 0.5f / r;
+
+    dest[0] = r    * 0.5f;
+    dest[1] = rinv * (m[0][1] + m[1][0]);
+    dest[2] = rinv * (m[0][2] + m[2][0]);
+    dest[3] = rinv * (m[1][2] - m[2][1]);
+  } else if (m[1][1] >= m[2][2]) {
+    r       = sqrtf(1.0f - m[0][0] - m[2][2] + m[1][1]);
+    rinv    = 0.5f / r;
+
+    dest[0] = rinv * (m[0][1] + m[1][0]);
+    dest[1] = r    * 0.5f;
+    dest[2] = rinv * (m[1][2] + m[2][1]);
+    dest[3] = rinv * (m[2][0] - m[0][2]);
+  } else {
+    r       = sqrtf(1.0f - m[0][0] - m[1][1] + m[2][2]);
+    rinv    = 0.5f / r;
+
+    dest[0] = rinv * (m[0][2] + m[2][0]);
+    dest[1] = rinv * (m[1][2] + m[2][1]);
+    dest[2] = r    * 0.5f;
+    dest[3] = rinv * (m[0][1] - m[1][0]);
+  }
+}
+
 /*!
  * @brief multiply vector with mat4's mat3 part(rotation)
  *
@@ -568,5 +619,4 @@ glm_mat4_swap_row(mat4 mat, int row1, int row2) {
   mat[3][row2] = tmp[3];
 }
 
-#else
 #endif /* cglm_mat_h */
diff --git a/include/cglm/plane.h b/include/cglm/plane.h
index 9faac9c..e0faed5 100644
--- a/include/cglm/plane.h
+++ b/include/cglm/plane.h
@@ -9,9 +9,7 @@
 #define cglm_plane_h
 
 #include "common.h"
-#include "mat4.h"
 #include "vec4.h"
-#include "vec3.h"
 
 /*
  Plane equation:  Ax + By + Cz + D = 0;
diff --git a/include/cglm/project.h b/include/cglm/project.h
index fea44b0..c71e735 100644
--- a/include/cglm/project.h
+++ b/include/cglm/project.h
@@ -8,9 +8,9 @@
 #ifndef cglm_project_h
 #define cglm_project_h
 
-#include "mat4.h"
 #include "vec3.h"
 #include "vec4.h"
+#include "mat4.h"
 
 /*!
  * @brief maps the specified viewport coordinates into specified space [1]
diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index 63236b1..eac853b 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -11,41 +11,84 @@
    GLM_QUAT_IDENTITY
 
  Functions:
-   CGLM_INLINE void  glm_quat_identity(versor q);
-   CGLM_INLINE void  glm_quat(versor q, float angle, float x, float y, float z);
-   CGLM_INLINE void  glm_quatv(versor q, float angle, vec3 v);
+   CGLM_INLINE void glm_quat_identity(versor q);
+   CGLM_INLINE void glm_quat_init(versor q, float x, float y, float z, float w);
+   CGLM_INLINE void glm_quat(versor q, float angle, float x, float y, float z);
+   CGLM_INLINE void glm_quatv(versor q, float angle, vec3 axis);
+   CGLM_INLINE void glm_quat_copy(versor q, versor dest);
    CGLM_INLINE float glm_quat_norm(versor q);
-   CGLM_INLINE void  glm_quat_normalize(versor q);
-   CGLM_INLINE float glm_quat_dot(versor q, versor r);
-   CGLM_INLINE void  glm_quat_mulv(versor q1, versor q2, versor dest);
-   CGLM_INLINE void  glm_quat_mat4(versor q, mat4 dest);
-   CGLM_INLINE void  glm_quat_slerp(versor q, versor r, float t, versor dest);
+   CGLM_INLINE void glm_quat_normalize(versor q);
+   CGLM_INLINE void glm_quat_normalize_to(versor q, versor dest);
+   CGLM_INLINE float glm_quat_dot(versor q1, versor q2);
+   CGLM_INLINE void glm_quat_conjugate(versor q, versor dest);
+   CGLM_INLINE void glm_quat_inv(versor q, versor dest);
+   CGLM_INLINE void glm_quat_add(versor p, versor q, versor dest);
+   CGLM_INLINE void glm_quat_sub(versor p, versor q, versor dest);
+   CGLM_INLINE float glm_quat_real(versor q);
+   CGLM_INLINE void glm_quat_imag(versor q, vec3 dest);
+   CGLM_INLINE void glm_quat_imagn(versor q, vec3 dest);
+   CGLM_INLINE float glm_quat_imaglen(versor q);
+   CGLM_INLINE float glm_quat_angle(versor q);
+   CGLM_INLINE void glm_quat_axis(versor q, versor dest);
+   CGLM_INLINE void glm_quat_mul(versor p, versor q, versor dest);
+   CGLM_INLINE void glm_quat_mat4(versor q, mat4 dest);
+   CGLM_INLINE void glm_quat_mat4t(versor q, mat4 dest);
+   CGLM_INLINE void glm_quat_mat3(versor q, mat3 dest);
+   CGLM_INLINE void glm_quat_mat3t(versor q, mat3 dest);
+   CGLM_INLINE void glm_quat_lerp(versor from, versor to, float t, versor dest);
+   CGLM_INLINE void glm_quat_slerp(versor q, versor r, float t, versor dest);
+   CGLM_INLINE void glm_quat_look(vec3 eye, versor ori, mat4 dest);
+   CGLM_INLINE void glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest);
+   CGLM_INLINE void glm_quat_forp(vec3 from,
+                                  vec3 to,
+                                  vec3 fwd,
+                                  vec3 up,
+                                  versor dest);
+   CGLM_INLINE void glm_quat_rotatev(versor q, vec3 v, vec3 dest);
+   CGLM_INLINE void glm_quat_rotate(mat4 m, versor q, mat4 dest);
  */
 
 #ifndef cglm_quat_h
 #define cglm_quat_h
 
 #include "common.h"
+#include "vec3.h"
 #include "vec4.h"
+#include "mat4.h"
+#include "mat3.h"
+#include "affine-mat.h"
 
 #ifdef CGLM_SSE_FP
 #  include "simd/sse2/quat.h"
 #endif
 
+CGLM_INLINE
+void
+glm_mat4_identity(mat4 mat);
+
+CGLM_INLINE
+void
+glm_mat4_mulv(mat4 m, vec4 v, vec4 dest);
+
+CGLM_INLINE
+void
+glm_mul_rot(mat4 m1, mat4 m2, mat4 dest);
+
+CGLM_INLINE
+void
+glm_translate(mat4 m, vec3 v);
+
 /*
- * IMPORTANT! cglm stores quat as [w, x, y, z]
+ * IMPORTANT:
+ * ----------------------------------------------------------------------------
+ * cglm stores quat as [x, y, z, w] since v0.3.6
  *
- * Possible changes (these may be changed in the future):
- *  - versor is identity quat, we can define new type for quat.
- *    it can't be quat or quaternion becuase someone can use that name for
- *    variable name. maybe just vec4.
- *  - it stores [w, x, y, z] but it may change to [x, y, z, w] if we get enough
- *    feedback to change it.
- *  - in general we use last param as dest, but this header used first param
- *    as dest this may be changed but decided yet
+ * it was [w, x, y, z] before v0.3.6 it has been changed to [x, y, z, w]
+ * with v0.3.6 version.
+ * ----------------------------------------------------------------------------
  */
 
-#define GLM_QUAT_IDENTITY_INIT  {1.0f, 0.0f, 0.0f, 0.0f}
+#define GLM_QUAT_IDENTITY_INIT  {0.0f, 0.0f, 0.0f, 1.0f}
 #define GLM_QUAT_IDENTITY       ((versor)GLM_QUAT_IDENTITY_INIT)
 
 /*!
@@ -60,6 +103,49 @@ glm_quat_identity(versor q) {
   glm_vec4_copy(v, q);
 }
 
+/*!
+ * @brief inits quaternion with raw values
+ *
+ * @param[out]  q     quaternion
+ * @param[in]   x     x
+ * @param[in]   y     y
+ * @param[in]   z     z
+ * @param[in]   w     w (real part)
+ */
+CGLM_INLINE
+void
+glm_quat_init(versor q, float x, float y, float z, float w) {
+  q[0] = x;
+  q[1] = y;
+  q[2] = z;
+  q[3] = w;
+}
+
+/*!
+ * @brief creates NEW quaternion with axis vector
+ *
+ * @param[out]  q     quaternion
+ * @param[in]   angle angle (radians)
+ * @param[in]   axis  axis
+ */
+CGLM_INLINE
+void
+glm_quatv(versor q, float angle, vec3 axis) {
+  vec3  k;
+  float a, c, s;
+
+  a = angle * 0.5f;
+  c = cosf(a);
+  s = sinf(a);
+
+  glm_normalize_to(axis, k);
+
+  q[0] = s * k[0];
+  q[1] = s * k[1];
+  q[2] = s * k[2];
+  q[3] = c;
+}
+
 /*!
  * @brief creates NEW quaternion with individual axis components
  *
@@ -71,45 +157,21 @@ glm_quat_identity(versor q) {
  */
 CGLM_INLINE
 void
-glm_quat(versor q,
-         float  angle,
-         float  x,
-         float  y,
-         float  z) {
-  float a, c, s;
-
-  a = angle * 0.5f;
-  c = cosf(a);
-  s = sinf(a);
-
-  q[0] = c;
-  q[1] = s * x;
-  q[2] = s * y;
-  q[3] = s * z;
+glm_quat(versor q, float angle, float x, float y, float z) {
+  vec3 axis = {x, y, z};
+  glm_quatv(q, angle, axis);
 }
 
 /*!
- * @brief creates NEW quaternion with axis vector
+ * @brief copy quaternion to another one
  *
- * @param[out]  q     quaternion
- * @param[in]   angle angle (radians)
- * @param[in]   v     axis
+ * @param[in]  q     quaternion
+ * @param[out] dest  destination
  */
 CGLM_INLINE
 void
-glm_quatv(versor q,
-          float  angle,
-          vec3   v) {
-  float a, c, s;
-
-  a = angle * 0.5f;
-  c = cosf(a);
-  s = sinf(a);
-
-  q[0] = c;
-  q[1] = s * v[0];
-  q[2] = s * v[1];
-  q[3] = s * v[2];
+glm_quat_copy(versor q, versor dest) {
+  glm_vec4_copy(q, dest);
 }
 
 /*!
@@ -123,6 +185,43 @@ glm_quat_norm(versor q) {
   return glm_vec4_norm(q);
 }
 
+/*!
+ * @brief normalize quaternion and store result in dest
+ *
+ * @param[in]   q     quaternion to normalize (left untouched unless q == dest)
+ * @param[out]  dest  normalized quaternion, or identity if q has zero length
+ */
+CGLM_INLINE
+void
+glm_quat_normalize_to(versor q, versor dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float  dot;
+
+  x0   = glmm_load(q);
+  xdot = glmm_dot(x0, x0);
+  dot  = _mm_cvtss_f32(xdot);
+
+  if (dot <= 0.0f) { /* zero length: cannot normalize, fall back to identity */
+    glm_quat_identity(dest);
+    return;
+  }
+
+  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
+#else
+  float dot;
+
+  dot = glm_vec4_norm2(q); /* squared length of q */
+
+  if (dot <= 0.0f) { /* zero length: write identity to dest, not to input q */
+    glm_quat_identity(dest);
+    return;
+  }
+
+  glm_vec4_scale(q, 1.0f / sqrtf(dot), dest);
+#endif
+}
+
 /*!
  * @brief normalize quaternion
  *
@@ -131,45 +230,178 @@ glm_quat_norm(versor q) {
 CGLM_INLINE
 void
 glm_quat_normalize(versor q) {
-  float sum;
-
-  sum = q[0] * q[0] + q[1] * q[1]
-          + q[2] * q[2] + q[3] * q[3];
-
-  if (fabs(1.0f - sum) < 0.0001f)
-    return;
-
-  glm_vec4_scale(q, 1.0f / sqrtf(sum), q);
+  glm_quat_normalize_to(q, q);
 }
 
 /*!
  * @brief dot product of two quaternion
  *
- * @param[in]  q  quaternion 1
- * @param[in]  r  quaternion 2
+ * @param[in]  p  quaternion 1
+ * @param[in]  q  quaternion 2
  */
 CGLM_INLINE
 float
-glm_quat_dot(versor q, versor r) {
-  return glm_vec4_dot(q, r);
+glm_quat_dot(versor p, versor q) {
+  return glm_vec4_dot(p, q);
+}
+
+/*!
+ * @brief conjugate of quaternion
+ *
+ * @param[in]   q     quaternion
+ * @param[out]  dest  conjugate
+ */
+CGLM_INLINE
+void
+glm_quat_conjugate(versor q, versor dest) {
+  glm_vec4_flipsign_to(q, dest);
+  dest[3] = -dest[3];
+}
+
+/*!
+ * @brief inverse of non-zero quaternion
+ *
+ * @param[in]   q    quaternion
+ * @param[out]  dest inverse quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_inv(versor q, versor dest) {
+  versor conj;
+  glm_quat_conjugate(q, conj);
+  glm_vec4_scale(conj, 1.0f / glm_vec4_norm2(q), dest);
+}
+
+/*!
+ * @brief add (componentwise) two quaternions and store result in dest
+ *
+ * @param[in]   p    quaternion 1
+ * @param[in]   q    quaternion 2
+ * @param[out]  dest result quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_add(versor p, versor q, versor dest) {
+  glm_vec4_add(p, q, dest);
+}
+
+/*!
+ * @brief subtract (componentwise) two quaternions and store result in dest
+ *
+ * @param[in]   p    quaternion 1
+ * @param[in]   q    quaternion 2
+ * @param[out]  dest result quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_sub(versor p, versor q, versor dest) {
+  glm_vec4_sub(p, q, dest);
+}
+
+/*!
+ * @brief returns real part of quaternion
+ *
+ * @param[in]   q    quaternion
+ */
+CGLM_INLINE
+float
+glm_quat_real(versor q) {
+  return q[3];
+}
+
+/*!
+ * @brief returns imaginary part of quaternion
+ *
+ * @param[in]   q    quaternion
+ * @param[out]  dest imag
+ */
+CGLM_INLINE
+void
+glm_quat_imag(versor q, vec3 dest) {
+  dest[0] = q[0];
+  dest[1] = q[1];
+  dest[2] = q[2];
+}
+
+/*!
+ * @brief returns normalized imaginary part of quaternion
+ *
+ * @param[in]   q    quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_imagn(versor q, vec3 dest) {
+  glm_normalize_to(q, dest);
+}
+
+/*!
+ * @brief returns length of imaginary part of quaternion
+ *
+ * @param[in]   q    quaternion
+ */
+CGLM_INLINE
+float
+glm_quat_imaglen(versor q) {
+  return glm_vec_norm(q);
+}
+
+/*!
+ * @brief returns angle of quaternion
+ *
+ * @param[in]   q    quaternion
+ */
+CGLM_INLINE
+float
+glm_quat_angle(versor q) {
+  /*
+   sin(theta / 2) = sqrt(x*x + y*y + z*z)
+   cos(theta / 2) = w
+   theta          = 2 * atan(sin(theta / 2) / cos(theta / 2))
+   */
+  return 2.0f * atan2f(glm_quat_imaglen(q), glm_quat_real(q));
+}
+
+/*!
+ * @brief axis of quaternion (its normalized imaginary part)
+ *
+ * @param[in]   q    quaternion
+ * @param[out]  dest axis of quaternion (vec3: only three components written)
+ */
+CGLM_INLINE
+void
+glm_quat_axis(versor q, vec3 dest) {
+  glm_quat_imagn(q, dest);
+}
 
 /*!
  * @brief multiplies two quaternion and stores result in dest
+ *        this is also called Hamilton Product
  *
- * @param[in]   q1    quaternion 1
- * @param[in]   q2    quaternion 2
+ * According to Wikipedia:
+ * The product of two rotation quaternions will be
+ * equivalent to the rotation q followed by the rotation p
+ *
+ * @param[in]   p     quaternion 1
+ * @param[in]   q     quaternion 2
  * @param[out]  dest  result quaternion
  */
 CGLM_INLINE
 void
-glm_quat_mulv(versor q1, versor q2, versor dest) {
-  dest[0] = q2[0] * q1[0] - q2[1] * q1[1] - q2[2] * q1[2] - q2[3] * q1[3];
-  dest[1] = q2[0] * q1[1] + q2[1] * q1[0] - q2[2] * q1[3] + q2[3] * q1[2];
-  dest[2] = q2[0] * q1[2] + q2[1] * q1[3] + q2[2] * q1[0] - q2[3] * q1[1];
-  dest[3] = q2[0] * q1[3] - q2[1] * q1[2] + q2[2] * q1[1] + q2[3] * q1[0];
-
-  glm_quat_normalize(dest);
+glm_quat_mul(versor p, versor q, versor dest) {
+  /*
+    + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i
+    + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j
+    + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
+       a1 a2 − b1 b2 − c1 c2 − d1 d2
+   */
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_quat_mul_sse2(p, q, dest);
+#else
+  dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1];
+  dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0];
+  dest[2] = p[3] * q[2] + p[0] * q[1] - p[1] * q[0] + p[2] * q[3];
+  dest[3] = p[3] * q[3] - p[0] * q[0] - p[1] * q[1] - p[2] * q[2];
+#endif
 }
 
 /*!
@@ -181,19 +413,22 @@ glm_quat_mulv(versor q1, versor q2, versor dest) {
 CGLM_INLINE
 void
 glm_quat_mat4(versor q, mat4 dest) {
-  float w, x, y, z;
-  float xx, yy, zz;
-  float xy, yz, xz;
-  float wx, wy, wz;
+  float w, x, y, z,
+        xx, yy, zz,
+        xy, yz, xz,
+        wx, wy, wz, norm, s;
 
-  w = q[0];
-  x = q[1];
-  y = q[2];
-  z = q[3];
+  norm = glm_quat_norm(q);
+  s    = norm > 0.0f ? 2.0f / norm : 0.0f;
 
-  xx = 2.0f * x * x;   xy = 2.0f * x * y;   wx = 2.0f * w * x;
-  yy = 2.0f * y * y;   yz = 2.0f * y * z;   wy = 2.0f * w * y;
-  zz = 2.0f * z * z;   xz = 2.0f * x * z;   wz = 2.0f * w * z;
+  x = q[0];
+  y = q[1];
+  z = q[2];
+  w = q[3];
+
+  xx = s * x * x;   xy = s * x * y;   wx = s * w * x;
+  yy = s * y * y;   yz = s * y * z;   wy = s * w * y;
+  zz = s * z * z;   xz = s * x * z;   wz = s * w * z;
 
   dest[0][0] = 1.0f - yy - zz;
   dest[1][1] = 1.0f - xx - zz;
@@ -207,8 +442,8 @@ glm_quat_mat4(versor q, mat4 dest) {
   dest[2][1] = yz - wx;
   dest[0][2] = xz - wy;
 
-  dest[1][3] = 0.0f;
   dest[0][3] = 0.0f;
+  dest[1][3] = 0.0f;
   dest[2][3] = 0.0f;
   dest[3][0] = 0.0f;
   dest[3][1] = 0.0f;
@@ -216,69 +451,347 @@ glm_quat_mat4(versor q, mat4 dest) {
   dest[3][3] = 1.0f;
 }
 
+/*!
+ * @brief convert quaternion to mat4 (transposed)
+ *
+ * @param[in]   q     quaternion
+ * @param[out]  dest  result matrix as transposed
+ */
+CGLM_INLINE
+void
+glm_quat_mat4t(versor q, mat4 dest) {
+  float w, x, y, z,
+        xx, yy, zz,
+        xy, yz, xz,
+        wx, wy, wz, norm, s;
+
+  norm = glm_quat_norm(q);
+  s    = norm > 0.0f ? 2.0f / norm : 0.0f;
+
+  x = q[0];
+  y = q[1];
+  z = q[2];
+  w = q[3];
+
+  xx = s * x * x;   xy = s * x * y;   wx = s * w * x;
+  yy = s * y * y;   yz = s * y * z;   wy = s * w * y;
+  zz = s * z * z;   xz = s * x * z;   wz = s * w * z;
+
+  dest[0][0] = 1.0f - yy - zz;
+  dest[1][1] = 1.0f - xx - zz;
+  dest[2][2] = 1.0f - xx - yy;
+
+  dest[1][0] = xy + wz;
+  dest[2][1] = yz + wx;
+  dest[0][2] = xz + wy;
+
+  dest[0][1] = xy - wz;
+  dest[1][2] = yz - wx;
+  dest[2][0] = xz - wy;
+
+  dest[0][3] = 0.0f;
+  dest[1][3] = 0.0f;
+  dest[2][3] = 0.0f;
+  dest[3][0] = 0.0f;
+  dest[3][1] = 0.0f;
+  dest[3][2] = 0.0f;
+  dest[3][3] = 1.0f;
+}
+
+/*!
+ * @brief convert quaternion to mat3
+ *
+ * @param[in]   q     quaternion
+ * @param[out]  dest  result matrix
+ */
+CGLM_INLINE
+void
+glm_quat_mat3(versor q, mat3 dest) {
+  float w, x, y, z,
+        xx, yy, zz,
+        xy, yz, xz,
+        wx, wy, wz, norm, s;
+
+  norm = glm_quat_norm(q);
+  s    = norm > 0.0f ? 2.0f / norm : 0.0f;
+
+  x = q[0];
+  y = q[1];
+  z = q[2];
+  w = q[3];
+
+  xx = s * x * x;   xy = s * x * y;   wx = s * w * x;
+  yy = s * y * y;   yz = s * y * z;   wy = s * w * y;
+  zz = s * z * z;   xz = s * x * z;   wz = s * w * z;
+
+  dest[0][0] = 1.0f - yy - zz;
+  dest[1][1] = 1.0f - xx - zz;
+  dest[2][2] = 1.0f - xx - yy;
+
+  dest[0][1] = xy + wz;
+  dest[1][2] = yz + wx;
+  dest[2][0] = xz + wy;
+
+  dest[1][0] = xy - wz;
+  dest[2][1] = yz - wx;
+  dest[0][2] = xz - wy;
+}
+
+/*!
+ * @brief convert quaternion to mat3 (transposed)
+ *
+ * @param[in]   q     quaternion
+ * @param[out]  dest  result matrix
+ */
+CGLM_INLINE
+void
+glm_quat_mat3t(versor q, mat3 dest) {
+  float w, x, y, z,
+        xx, yy, zz,
+        xy, yz, xz,
+        wx, wy, wz, norm, s;
+
+  norm = glm_quat_norm(q);
+  s    = norm > 0.0f ? 2.0f / norm : 0.0f;
+
+  x = q[0];
+  y = q[1];
+  z = q[2];
+  w = q[3];
+
+  xx = s * x * x;   xy = s * x * y;   wx = s * w * x;
+  yy = s * y * y;   yz = s * y * z;   wy = s * w * y;
+  zz = s * z * z;   xz = s * x * z;   wz = s * w * z;
+
+  dest[0][0] = 1.0f - yy - zz;
+  dest[1][1] = 1.0f - xx - zz;
+  dest[2][2] = 1.0f - xx - yy;
+
+  dest[1][0] = xy + wz;
+  dest[2][1] = yz + wx;
+  dest[0][2] = xz + wy;
+
+  dest[0][1] = xy - wz;
+  dest[1][2] = yz - wx;
+  dest[2][0] = xz - wy;
+}
+
+/*!
+ * @brief interpolates between two quaternions
+ *        using linear interpolation (LERP)
+ *
+ * @param[in]   from  from
+ * @param[in]   to    to
+ * @param[in]   t     interpolant (amount) clamped between 0 and 1
+ * @param[out]  dest  result quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_lerp(versor from, versor to, float t, versor dest) {
+  glm_vec4_lerp(from, to, t, dest);
+}
+
 /*!
  * @brief interpolates between two quaternions
  *        using spherical linear interpolation (SLERP)
  *
- * @param[in]   q     from
- * @param[in]   r     to
+ * @param[in]   from  from
+ * @param[in]   to    to
  * @param[in]   t     amout
  * @param[out]  dest  result quaternion
  */
 CGLM_INLINE
 void
-glm_quat_slerp(versor q,
-               versor r,
-               float  t,
-               versor dest) {
-  /* https://en.wikipedia.org/wiki/Slerp */
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glm_quat_slerp_sse2(q, r, t, dest);
-#else
-  float cosTheta, sinTheta, angle, a, b, c;
+glm_quat_slerp(versor from, versor to, float t, versor dest) {
+  vec4  q1, q2;
+  float cosTheta, sinTheta, angle;
 
-  cosTheta = glm_quat_dot(q, r);
-  if (cosTheta < 0.0f) {
-    q[0] *= -1.0f;
-    q[1] *= -1.0f;
-    q[2] *= -1.0f;
-    q[3] *= -1.0f;
+  cosTheta = glm_quat_dot(from, to);
+  glm_quat_copy(from, q1); /* work on a copy so that from is not modified */
 
-    cosTheta = -cosTheta;
-  }
-
-  if (fabs(cosTheta) >= 1.0f) {
-    dest[0] = q[0];
-    dest[1] = q[1];
-    dest[2] = q[2];
-    dest[3] = q[3];
+  if (fabsf(cosTheta) >= 1.0f) {
+    glm_quat_copy(q1, dest);
     return;
   }
 
-  sinTheta = sqrt(1.0f - cosTheta * cosTheta);
+  if (cosTheta < 0.0f) { /* take the shorter arc: use -from instead */
+    glm_vec4_flipsign(q1);
+    cosTheta = -cosTheta;
+  }
 
-  c = 1.0f - t;
+  sinTheta = sqrtf(1.0f - cosTheta * cosTheta);
 
-  /* LERP */
-  /* TODO: FLT_EPSILON vs 0.001? */
-  if (sinTheta < 0.001f) {
-    dest[0] = c * q[0] + t * r[0];
-    dest[1] = c * q[1] + t * r[1];
-    dest[2] = c * q[2] + t * r[2];
-    dest[3] = c * q[3] + t * r[3];
+  /* LERP to avoid zero division; start from sign-corrected q1, not from */
+  if (fabsf(sinTheta) < 0.001f) {
+    glm_quat_lerp(q1, to, t, dest);
     return;
   }
 
   /* SLERP */
   angle = acosf(cosTheta);
-  a = sinf(c * angle);
-  b = sinf(t * angle);
+  glm_vec4_scale(q1, sinf((1.0f - t) * angle), q1);
+  glm_vec4_scale(to, sinf(t * angle), q2);
 
-  dest[0] = (q[0] * a + r[0] * b) / sinTheta;
-  dest[1] = (q[1] * a + r[1] * b) / sinTheta;
-  dest[2] = (q[2] * a + r[2] * b) / sinTheta;
-  dest[3] = (q[3] * a + r[3] * b) / sinTheta;
-#endif
+  glm_vec4_add(q1, q2, q1);
+  glm_vec4_scale(q1, 1.0f / sinTheta, dest);
+}
+
+/*!
+ * @brief creates view matrix using quaternion as camera orientation
+ *
+ * @param[in]   eye   eye
+ * @param[in]   ori   orientation in world space as quaternion
+ * @param[out]  dest  view matrix
+ */
+CGLM_INLINE
+void
+glm_quat_look(vec3 eye, versor ori, mat4 dest) {
+  vec4 t;
+
+  /* orientation */
+  glm_quat_mat4t(ori, dest);
+
+  /* translate */
+  glm_vec4(eye, 1.0f, t);
+  glm_mat4_mulv(dest, t, t);
+  glm_vec_flipsign_to(t, dest[3]);
+}
+
+/*!
+ * @brief creates look rotation quaternion that rotates fwd onto dir
+ *
+ * @param[in]   dir   direction to look (expected to be normalized)
+ * @param[in]   fwd   forward vector (expected to be normalized)
+ * @param[in]   up    up vector, rotation axis when dir is opposite to fwd
+ * @param[out]  dest  destination quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest) {
+  vec3  axis;
+  float dot, angle;
+
+  dot = glm_vec_dot(dir, fwd);
+  if (fabsf(dot + 1.0f) < 0.000001f) {
+    glm_quatv(dest, CGLM_PI, up); /* opposite: half turn around up */
+    return;
+  }
+
+  if (fabsf(dot - 1.0f) < 0.000001f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  angle = acosf(dot); /* dot is a cosine only for unit dir and fwd */
+  glm_cross(fwd, dir, axis);
+  glm_normalize(axis);
+
+  glm_quatv(dest, angle, axis);
+}
+
+/*!
+ * @brief creates look rotation from two positions (p stands for position)
+ *
+ * @param[in]   from  source point
+ * @param[in]   to    destination point
+ * @param[in]   fwd   forward vector
+ * @param[in]   up    up vector
+ * @param[out]  dest  destination quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_forp(vec3 from, vec3 to, vec3 fwd, vec3 up, versor dest) {
+  vec3 dir;
+  glm_vec_sub(to, from, dir);
+  glm_normalize(dir); /* glm_quat_for treats dot(dir, fwd) as a cosine */
+  glm_quat_for(dir, fwd, up, dest);
+}
+
+/*!
+ * @brief rotate vector using quaternion
+ *
+ * @param[in]   q     quaternion
+ * @param[in]   v     vector to rotate
+ * @param[out]  dest  rotated vector
+ */
+CGLM_INLINE
+void
+glm_quat_rotatev(versor q, vec3 v, vec3 dest) {
+  versor p;
+  vec3   u, v1, v2;
+  float  s;
+
+  glm_quat_normalize_to(q, p);
+  glm_quat_imag(p, u);
+  s = glm_quat_real(p);
+
+  glm_vec_scale(u, 2.0f * glm_vec_dot(u, v), v1);
+  glm_vec_scale(v, s * s - glm_vec_dot(u, u), v2);
+  glm_vec_add(v1, v2, v1);
+
+  glm_vec_cross(u, v, v2);
+  glm_vec_scale(v2, 2.0f * s, v2);
+
+  glm_vec_add(v1, v2, dest);
+}
+
+/*!
+ * @brief rotate existing transform matrix using quaternion
+ *
+ * @param[in]   m     existing transform matrix
+ * @param[in]   q     quaternion
+ * @param[out]  dest  rotated matrix/transform
+ */
+CGLM_INLINE
+void
+glm_quat_rotate(mat4 m, versor q, mat4 dest) {
+  mat4 rot;
+  glm_quat_mat4(q, rot);
+  glm_mul_rot(m, rot, dest);
+}
+
+/*!
+ * @brief rotate existing transform matrix using quaternion at pivot point
+ *
+ * @param[in, out]   m     existing transform matrix
+ * @param[in]        q     quaternion
+ * @param[in]        pivot pivot
+ */
+CGLM_INLINE
+void
+glm_quat_rotate_at(mat4 m, versor q, vec3 pivot) {
+  vec3 pivotInv;
+
+  glm_vec_inv_to(pivot, pivotInv);
+
+  glm_translate(m, pivot);
+  glm_quat_rotate(m, q, m);
+  glm_translate(m, pivotInv);
+}
+
+/*!
+ * @brief rotate NEW transform matrix using quaternion at pivot point
+ *
+ * this creates rotation matrix, it assumes you don't have a matrix
+ *
+ * this should work faster than glm_quat_rotate_at because it reduces
+ * one glm_translate.
+ *
+ * @param[out]  m     result matrix (reset to identity, then rotated)
+ * @param[in]   q     quaternion
+ * @param[in]   pivot pivot
+ */
+CGLM_INLINE
+void
+glm_quat_rotate_atm(mat4 m, versor q, vec3 pivot) {
+  vec3 pivotInv;
+
+  glm_vec_inv_to(pivot, pivotInv);
+
+  glm_mat4_identity(m);
+  glm_vec_copy(pivot, m[3]);
+  glm_quat_rotate(m, q, m);
+  glm_translate(m, pivotInv);
 }
 
 #endif /* cglm_quat_h */
diff --git a/include/cglm/simd/avx/affine.h b/include/cglm/simd/avx/affine.h
index 1b0dcea..5c7f71c 100644
--- a/include/cglm/simd/avx/affine.h
+++ b/include/cglm/simd/avx/affine.h
@@ -21,11 +21,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
 
   __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
 
-  y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */
-  y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */
+  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
+  y1 = glmm_load256(m2[2]); /* p o n m l k j i */
 
-  y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */
-  y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */
+  y2 = glmm_load256(m1[0]); /* h g f e d c b a */
+  y3 = glmm_load256(m1[2]); /* p o n m l k j i */
 
   y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */
   y5 = _mm256_permute2f128_ps(y3, y3, 0b00000000); /* l k j i l k j i */
@@ -37,10 +37,10 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
   y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
 
-  _mm256_store_ps(dest[0],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y4, y8)),
-                                _mm256_mul_ps(y5, y7)));
+  glmm_store256(dest[0],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y4, y8)),
+                              _mm256_mul_ps(y5, y7)));
 
 
   /* n n n n i i i i */
@@ -52,11 +52,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
   y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
 
-  _mm256_store_ps(dest[2],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y3, y7)),
-                                _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                              _mm256_mul_ps(y5, y9))));
+  glmm_store256(dest[2],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y3, y7)),
+                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+                                            _mm256_mul_ps(y5, y9))));
 }
 
 #endif
diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h
index e2ef9da..b5859a7 100644
--- a/include/cglm/simd/avx/mat4.h
+++ b/include/cglm/simd/avx/mat4.h
@@ -21,11 +21,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
 
   __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
 
-  y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */
-  y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */
+  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
+  y1 = glmm_load256(m2[2]); /* p o n m l k j i */
 
-  y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */
-  y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */
+  y2 = glmm_load256(m1[0]); /* h g f e d c b a */
+  y3 = glmm_load256(m1[2]); /* p o n m l k j i */
 
   y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */
   y5 = _mm256_permute2f128_ps(y3, y3, 0b00000011); /* l k j i p o n m */
@@ -39,11 +39,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
   y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
 
-  _mm256_store_ps(dest[0],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y3, y7)),
-                                _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                              _mm256_mul_ps(y5, y9))));
+  glmm_store256(dest[0],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y3, y7)),
+                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+                                            _mm256_mul_ps(y5, y9))));
 
   /* n n n n i i i i */
   /* p p p p k k k k */
@@ -54,11 +54,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
   y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
   y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
 
-  _mm256_store_ps(dest[2],
-                  _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
-                                              _mm256_mul_ps(y3, y7)),
-                                _mm256_add_ps(_mm256_mul_ps(y4, y8),
-                                              _mm256_mul_ps(y5, y9))));
+  glmm_store256(dest[2],
+                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+                                            _mm256_mul_ps(y3, y7)),
+                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+                                            _mm256_mul_ps(y5, y9))));
 }
 
 #endif
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index 2507291..a5fa455 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -8,11 +8,19 @@
 #ifndef cglm_intrin_h
 #define cglm_intrin_h
 
-#if defined( _WIN32 )
+#if defined( _MSC_VER )
 #  if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
-#    define __SSE2__
+#    ifndef __SSE2__
+#      define __SSE2__
+#    endif
 #  elif _M_IX86_FP == 1
-#    define __SSE__
+#    ifndef __SSE__
+#      define __SSE__
+#    endif
+#  endif
+/* do not use alignment for older visual studio versions */
+#  if _MSC_VER < 1913     /* Visual Studio 2017 version 15.6 */
+#    define CGLM_ALL_UNALIGNED
 #  endif
 #endif
 
@@ -36,6 +44,49 @@
 #  define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1)               \
      _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),       \
                                     z1, y1, x1, w1)
+
+static inline
+__m128
+glmm_dot(__m128 a, __m128 b) {
+  __m128 x0;
+  x0 = _mm_mul_ps(a, b);
+  x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
+  return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1));
+}
+
+static inline
+__m128
+glmm_norm(__m128 a) {
+  return _mm_sqrt_ps(glmm_dot(a, a));
+}
+
+static inline
+__m128
+glmm_load3(float v[3]) {
+  __m128i xy;
+  __m128  z;
+
+  xy = _mm_loadl_epi64((const __m128i *)v);
+  z  = _mm_load_ss(&v[2]);
+
+  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
+}
+
+static inline
+void
+glmm_store3(__m128 vx, float v[3]) {
+  _mm_storel_pi((__m64 *)&v[0], vx);
+  _mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2));
+}
+
+#ifdef CGLM_ALL_UNALIGNED
+#  define glmm_load(p)      _mm_loadu_ps(p)
+#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
+#else
+#  define glmm_load(p)      _mm_load_ps(p)
+#  define glmm_store(p, a)  _mm_store_ps(p, a)
+#endif
+
 #endif
 
 /* x86, x64 */
@@ -45,6 +96,15 @@
 
 #ifdef __AVX__
 #  define CGLM_AVX_FP 1
+
+#ifdef CGLM_ALL_UNALIGNED
+#  define glmm_load256(p)      _mm256_loadu_ps(p)
+#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
+#else
+#  define glmm_load256(p)      _mm256_load_ps(p)
+#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
+#endif
+
 #endif
 
 /* ARM Neon */
diff --git a/include/cglm/simd/sse2/affine.h b/include/cglm/simd/sse2/affine.h
index 3ec8f5f..8a644d3 100644
--- a/include/cglm/simd/sse2/affine.h
+++ b/include/cglm/simd/sse2/affine.h
@@ -18,35 +18,67 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
   /* D = R * L (Column-Major) */
   __m128 l0, l1, l2, l3, r;
 
-  l0 = _mm_load_ps(m1[0]);
-  l1 = _mm_load_ps(m1[1]);
-  l2 = _mm_load_ps(m1[2]);
-  l3 = _mm_load_ps(m1[3]);
+  l0 = glmm_load(m1[0]);
+  l1 = glmm_load(m1[1]);
+  l2 = glmm_load(m1[2]);
+  l3 = glmm_load(m1[3]);
 
-  r = _mm_load_ps(m2[0]);
-  _mm_store_ps(dest[0],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+  r = glmm_load(m2[0]);
+  glmm_store(dest[0],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
 
-  r = _mm_load_ps(m2[1]);
-  _mm_store_ps(dest[1],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+  r = glmm_load(m2[1]);
+  glmm_store(dest[1],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
 
-  r = _mm_load_ps(m2[2]);
-  _mm_store_ps(dest[2],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+  r = glmm_load(m2[2]);
+  glmm_store(dest[2],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
 
-  r = _mm_load_ps(m2[3]);
-  _mm_store_ps(dest[3],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+  r = glmm_load(m2[3]);
+  glmm_store(dest[3],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+}
+
+CGLM_INLINE
+void
+glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
+  /* D = R * L (Column-Major) */
+  __m128 l0, l1, l2, l3, r;
+
+  l0 = glmm_load(m1[0]);
+  l1 = glmm_load(m1[1]);
+  l2 = glmm_load(m1[2]);
+  l3 = glmm_load(m1[3]);
+
+  r = glmm_load(m2[0]);
+  glmm_store(dest[0],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+
+  r = glmm_load(m2[1]);
+  glmm_store(dest[1],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+
+  r = glmm_load(m2[2]);
+  glmm_store(dest[2],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+
+  glmm_store(dest[3], l3);
 }
 
 CGLM_INLINE
@@ -54,11 +86,11 @@ void
 glm_inv_tr_sse2(mat4 mat) {
   __m128 r0, r1, r2, r3, x0, x1;
 
-  r0 = _mm_load_ps(mat[0]);
-  r1 = _mm_load_ps(mat[1]);
-  r2 = _mm_load_ps(mat[2]);
-  r3 = _mm_load_ps(mat[3]);
-  x1  = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
+  r0 = glmm_load(mat[0]);
+  r1 = glmm_load(mat[1]);
+  r2 = glmm_load(mat[2]);
+  r3 = glmm_load(mat[3]);
+  x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
 
   _MM_TRANSPOSE4_PS(r0, r1, r2, x1);
 
@@ -69,10 +101,10 @@ glm_inv_tr_sse2(mat4 mat) {
 
   x0 = _mm_add_ps(x0, x1);
 
-  _mm_store_ps(mat[0], r0);
-  _mm_store_ps(mat[1], r1);
-  _mm_store_ps(mat[2], r2);
-  _mm_store_ps(mat[3], x0);
+  glmm_store(mat[0], r0);
+  glmm_store(mat[1], r1);
+  glmm_store(mat[2], r2);
+  glmm_store(mat[3], x0);
 }
 
 #endif
diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h
index 77874a8..1f82c08 100644
--- a/include/cglm/simd/sse2/mat4.h
+++ b/include/cglm/simd/sse2/mat4.h
@@ -20,10 +20,10 @@ glm_mat4_scale_sse2(mat4 m, float s){
   __m128 x0;
   x0 = _mm_set1_ps(s);
 
-  _mm_store_ps(m[0], _mm_mul_ps(_mm_load_ps(m[0]), x0));
-  _mm_store_ps(m[1], _mm_mul_ps(_mm_load_ps(m[1]), x0));
-  _mm_store_ps(m[2], _mm_mul_ps(_mm_load_ps(m[2]), x0));
-  _mm_store_ps(m[3], _mm_mul_ps(_mm_load_ps(m[3]), x0));
+  glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0));
+  glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0));
+  glmm_store(m[2], _mm_mul_ps(glmm_load(m[2]), x0));
+  glmm_store(m[3], _mm_mul_ps(glmm_load(m[3]), x0));
 }
 
 CGLM_INLINE
@@ -31,17 +31,17 @@ void
 glm_mat4_transp_sse2(mat4 m, mat4 dest){
   __m128 r0, r1, r2, r3;
 
-  r0 = _mm_load_ps(m[0]);
-  r1 = _mm_load_ps(m[1]);
-  r2 = _mm_load_ps(m[2]);
-  r3 = _mm_load_ps(m[3]);
+  r0 = glmm_load(m[0]);
+  r1 = glmm_load(m[1]);
+  r2 = glmm_load(m[2]);
+  r3 = glmm_load(m[3]);
 
   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 
-  _mm_store_ps(dest[0], r0);
-  _mm_store_ps(dest[1], r1);
-  _mm_store_ps(dest[2], r2);
-  _mm_store_ps(dest[3], r3);
+  glmm_store(dest[0], r0);
+  glmm_store(dest[1], r1);
+  glmm_store(dest[2], r2);
+  glmm_store(dest[3], r3);
 }
 
 CGLM_INLINE
@@ -51,36 +51,36 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
 
   __m128 l0, l1, l2, l3, r;
 
-  l0 = _mm_load_ps(m1[0]);
-  l1 = _mm_load_ps(m1[1]);
-  l2 = _mm_load_ps(m1[2]);
-  l3 = _mm_load_ps(m1[3]);
+  l0 = glmm_load(m1[0]);
+  l1 = glmm_load(m1[1]);
+  l2 = glmm_load(m1[2]);
+  l3 = glmm_load(m1[3]);
 
-  r = _mm_load_ps(m2[0]);
-  _mm_store_ps(dest[0],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
-  r = _mm_load_ps(m2[1]);
-  _mm_store_ps(dest[1],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
-  r = _mm_load_ps(m2[2]);
-  _mm_store_ps(dest[2],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+  r = glmm_load(m2[0]);
+  glmm_store(dest[0],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+  r = glmm_load(m2[1]);
+  glmm_store(dest[1],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+  r = glmm_load(m2[2]);
+  glmm_store(dest[2],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
 
-  r = _mm_load_ps(m2[3]);
-  _mm_store_ps(dest[3],
-               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
-                          _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
-                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+  r = glmm_load(m2[3]);
+  glmm_store(dest[3],
+             _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                        _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
+                                   _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
 }
 
 CGLM_INLINE
@@ -88,18 +88,18 @@ void
 glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) {
   __m128 x0, x1, x2;
 
-  x0 = _mm_load_ps(v);
-  x1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]),
+  x0 = glmm_load(v);
+  x1 = _mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
                              _mm_shuffle1_ps1(x0, 0)),
-                  _mm_mul_ps(_mm_load_ps(m[1]),
+                  _mm_mul_ps(glmm_load(m[1]),
                              _mm_shuffle1_ps1(x0, 1)));
 
-  x2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]),
+  x2 = _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
                              _mm_shuffle1_ps1(x0, 2)),
-                  _mm_mul_ps(_mm_load_ps(m[3]),
+                  _mm_mul_ps(glmm_load(m[3]),
                              _mm_shuffle1_ps1(x0, 3)));
 
-  _mm_store_ps(dest, _mm_add_ps(x1, x2));
+  glmm_store(dest, _mm_add_ps(x1, x2));
 }
 
 CGLM_INLINE
@@ -108,10 +108,10 @@ glm_mat4_det_sse2(mat4 mat) {
   __m128 r0, r1, r2, r3, x0, x1, x2;
 
   /* 127 <- 0, [square] det(A) = det(At) */
-  r0 = _mm_load_ps(mat[0]); /* d c b a */
-  r1 = _mm_load_ps(mat[1]); /* h g f e */
-  r2 = _mm_load_ps(mat[2]); /* l k j i */
-  r3 = _mm_load_ps(mat[3]); /* p o n m */
+  r0 = glmm_load(mat[0]); /* d c b a */
+  r1 = glmm_load(mat[1]); /* h g f e */
+  r2 = glmm_load(mat[2]); /* l k j i */
+  r3 = glmm_load(mat[3]); /* p o n m */
 
   /*
    t[1] = j * p - n * l;
@@ -166,10 +166,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
          x0, x1, x2, x3, x4, x5, x6, x7;
 
   /* 127 <- 0 */
-  r0 = _mm_load_ps(mat[0]); /* d c b a */
-  r1 = _mm_load_ps(mat[1]); /* h g f e */
-  r2 = _mm_load_ps(mat[2]); /* l k j i */
-  r3 = _mm_load_ps(mat[3]); /* p o n m */
+  r0 = glmm_load(mat[0]); /* d c b a */
+  r1 = glmm_load(mat[1]); /* h g f e */
+  r2 = glmm_load(mat[2]); /* l k j i */
+  r3 = glmm_load(mat[3]); /* p o n m */
 
   x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2));  /* p o l k */
   x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3);                  /* l p p p */
@@ -275,10 +275,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
   x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1));
   x0 = _mm_rcp_ps(x0);
 
-  _mm_store_ps(dest[0], _mm_mul_ps(v0, x0));
-  _mm_store_ps(dest[1], _mm_mul_ps(v1, x0));
-  _mm_store_ps(dest[2], _mm_mul_ps(v2, x0));
-  _mm_store_ps(dest[3], _mm_mul_ps(v3, x0));
+  glmm_store(dest[0], _mm_mul_ps(v0, x0));
+  glmm_store(dest[1], _mm_mul_ps(v1, x0));
+  glmm_store(dest[2], _mm_mul_ps(v2, x0));
+  glmm_store(dest[3], _mm_mul_ps(v3, x0));
 }
 
 CGLM_INLINE
@@ -290,10 +290,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
          x0, x1, x2, x3, x4, x5, x6, x7;
 
   /* 127 <- 0 */
-  r0 = _mm_load_ps(mat[0]); /* d c b a */
-  r1 = _mm_load_ps(mat[1]); /* h g f e */
-  r2 = _mm_load_ps(mat[2]); /* l k j i */
-  r3 = _mm_load_ps(mat[3]); /* p o n m */
+  r0 = glmm_load(mat[0]); /* d c b a */
+  r1 = glmm_load(mat[1]); /* h g f e */
+  r2 = glmm_load(mat[2]); /* l k j i */
+  r3 = glmm_load(mat[3]); /* p o n m */
 
   x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2));  /* p o l k */
   x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3);                  /* l p p p */
@@ -399,10 +399,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
   x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1));
   x0 = _mm_div_ps(_mm_set1_ps(1.0f), x0);
 
-  _mm_store_ps(dest[0], _mm_mul_ps(v0, x0));
-  _mm_store_ps(dest[1], _mm_mul_ps(v1, x0));
-  _mm_store_ps(dest[2], _mm_mul_ps(v2, x0));
-  _mm_store_ps(dest[3], _mm_mul_ps(v3, x0));
+  glmm_store(dest[0], _mm_mul_ps(v0, x0));
+  glmm_store(dest[1], _mm_mul_ps(v1, x0));
+  glmm_store(dest[2], _mm_mul_ps(v2, x0));
+  glmm_store(dest[3], _mm_mul_ps(v3, x0));
 }
 
 #endif
diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h
index b3420a7..a8b517c 100644
--- a/include/cglm/simd/sse2/quat.h
+++ b/include/cglm/simd/sse2/quat.h
@@ -14,56 +14,33 @@
 
 CGLM_INLINE
 void
-glm_quat_slerp_sse2(versor q,
-                    versor r,
-                    float  t,
-                    versor dest) {
-  /* https://en.wikipedia.org/wiki/Slerp */
-  float cosTheta, sinTheta, angle, a, b, c;
+glm_quat_mul_sse2(versor p, versor q, versor dest) {
+  /*
+   + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i
+   + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j
+   + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
+     a1 a2 − b1 b2 − c1 c2 − d1 d2
+   */
 
-  __m128 xmm_q;
+  __m128 xp, xq, x0, r;
 
-  xmm_q = _mm_load_ps(q);
+  xp = glmm_load(p); /* 3 2 1 0 */
+  xq = glmm_load(q);
 
-  cosTheta = glm_vec4_dot(q, r);
-  if (cosTheta < 0.0f) {
-    _mm_store_ps(q,
-                 _mm_xor_ps(xmm_q,
-                            _mm_set1_ps(-0.f))) ;
+  r  = _mm_mul_ps(_mm_shuffle1_ps1(xp, 3), xq);
 
-    cosTheta = -cosTheta;
-  }
+  x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 0), _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
+  r  = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 0, 1, 2, 3)));
 
-  if (cosTheta >= 1.0f) {
-    _mm_store_ps(dest, xmm_q);
-    return;
-  }
+  x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 1), _mm_set_ps(-0.f, -0.f, 0.f, 0.f));
+  r  = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 1, 0, 3, 2)));
 
-  sinTheta = sqrtf(1.0f - cosTheta * cosTheta);
+  x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f));
+  r  = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 2, 3, 0, 1)));
 
-  c = 1.0f - t;
-
-  /* LERP */
-  if (sinTheta < 0.001f) {
-    _mm_store_ps(dest, _mm_add_ps(_mm_mul_ps(_mm_set1_ps(c),
-                                             xmm_q),
-                                  _mm_mul_ps(_mm_set1_ps(t),
-                                             _mm_load_ps(r))));
-    return;
-  }
-
-  /* SLERP */
-  angle = acosf(cosTheta);
-  a = sinf(c * angle);
-  b = sinf(t * angle);
-
-  _mm_store_ps(dest,
-               _mm_div_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(a),
-                                                xmm_q),
-                                     _mm_mul_ps(_mm_set1_ps(b),
-                                                _mm_load_ps(r))),
-                          _mm_set1_ps(sinTheta)));
+  glmm_store(dest, r);
 }
 
+
 #endif
 #endif /* cglm_quat_simd_h */
diff --git a/include/cglm/types.h b/include/cglm/types.h
index c411d8b..d470e7b 100644
--- a/include/cglm/types.h
+++ b/include/cglm/types.h
@@ -9,23 +9,35 @@
 #define cglm_types_h
 
 #if defined(_MSC_VER)
-#  define CGLM_ALIGN(X) /* __declspec(align(X)) */
+/* do not use alignment for older visual studio versions */
+#if _MSC_VER < 1913 /*  Visual Studio 2017 version 15.6  */
+#  define CGLM_ALL_UNALIGNED
+#  define CGLM_ALIGN(X) /* no alignment */
+#else
+#  define CGLM_ALIGN(X) __declspec(align(X))
+#endif
 #else
 #  define CGLM_ALIGN(X) __attribute((aligned(X)))
 #endif
 
-typedef float vec2[2];
-typedef float vec3[3];
-typedef int  ivec3[3];
-typedef CGLM_ALIGN(16) float vec4[4];
+#ifndef CGLM_ALL_UNALIGNED
+#  define CGLM_ALIGN_IF(X) CGLM_ALIGN(X)
+#else
+#  define CGLM_ALIGN_IF(X) /* no alignment */
+#endif
 
-typedef vec3 mat3[3];
-typedef vec4 mat4[4];
+typedef float                   vec2[2];
+typedef CGLM_ALIGN_IF(8)  float vec3[3];
+typedef int                    ivec3[3];
+typedef CGLM_ALIGN_IF(16) float vec4[4];
 
-typedef vec4 versor;
+typedef vec3                    mat3[3];
+typedef CGLM_ALIGN_IF(16) vec4  mat4[4];
 
-#define CGLM_PI    (float)M_PI
-#define CGLM_PI_2  (float)M_PI_2
-#define CGLM_PI_4  (float)M_PI_4
+typedef vec4                    versor;
+
+#define CGLM_PI    ((float)M_PI)
+#define CGLM_PI_2  ((float)M_PI_2)
+#define CGLM_PI_4  ((float)M_PI_4)
 
 #endif /* cglm_types_h */
diff --git a/include/cglm/util.h b/include/cglm/util.h
index 85fc789..b272d44 100644
--- a/include/cglm/util.h
+++ b/include/cglm/util.h
@@ -143,4 +143,19 @@ glm_clamp(float val, float minVal, float maxVal) {
   return glm_min(glm_max(val, minVal), maxVal);
 }
 
+/*!
+ * @brief linear interpolation between two number
+ *
+ * formula:  from + t * (to - from)
+ *
+ * @param[in]   from from value
+ * @param[in]   to   to value
+ * @param[in]   t    interpolant (amount) clamped between 0 and 1
+ */
+CGLM_INLINE
+float
+glm_lerp(float from, float to, float t) {
+  return from + glm_clamp(t, 0.0f, 1.0f) * (to - from);
+}
+
 #endif /* cglm_util_h */
diff --git a/include/cglm/vec3-ext.h b/include/cglm/vec3-ext.h
index 99e778a..e12c133 100644
--- a/include/cglm/vec3-ext.h
+++ b/include/cglm/vec3-ext.h
@@ -26,12 +26,13 @@
 #define cglm_vec3_ext_h
 
 #include "common.h"
+#include "util.h"
 #include <stdbool.h>
 #include <math.h>
 #include <float.h>
 
 /*!
- * @brief multiplies individual items, just for convenient like SIMD
+ * @brief DEPRECATED! use glm_vec_mul
  *
  * @param[in]  a vec1
  * @param[in]  b vec2
@@ -160,4 +161,69 @@ glm_vec_min(vec3 v) {
   return min;
 }
 
+/*!
+ * @brief check if one of items is NaN (not a number)
+ *        you should only use this in DEBUG mode or very critical asserts
+ *
+ * @param[in] v vector
+ */
+CGLM_INLINE
+bool
+glm_vec_isnan(vec3 v) {
+  return isnan(v[0]) || isnan(v[1]) || isnan(v[2]);
+}
+
+/*!
+ * @brief check if one of items is INFINITY
+ *        you should only use this in DEBUG mode or very critical asserts
+ *
+ * @param[in] v vector
+ */
+CGLM_INLINE
+bool
+glm_vec_isinf(vec3 v) {
+  return isinf(v[0]) || isinf(v[1]) || isinf(v[2]);
+}
+
+/*!
+ * @brief check if all items are valid number
+ *        you should only use this in DEBUG mode or very critical asserts
+ *
+ * @param[in] v vector
+ */
+CGLM_INLINE
+bool
+glm_vec_isvalid(vec3 v) {
+  return !glm_vec_isnan(v) && !glm_vec_isinf(v);
+}
+
+/*!
+ * @brief get sign of 32 bit float as +1, -1, 0
+ *
+ * Important: It returns 0 for zero/NaN input
+ *
+ * @param v vector
+ */
+CGLM_INLINE
+void
+glm_vec_sign(vec3 v, vec3 dest) {
+  dest[0] = glm_signf(v[0]);
+  dest[1] = glm_signf(v[1]);
+  dest[2] = glm_signf(v[2]);
+}
+
+/*!
+ * @brief square root of each vector item
+ *
+ * @param[in]  v    vector
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec_sqrt(vec3 v, vec3 dest) {
+  dest[0] = sqrtf(v[0]);
+  dest[1] = sqrtf(v[1]);
+  dest[2] = sqrtf(v[2]);
+}
+
 #endif /* cglm_vec3_ext_h */
diff --git a/include/cglm/vec3.h b/include/cglm/vec3.h
index e2108f3..50636a1 100644
--- a/include/cglm/vec3.h
+++ b/include/cglm/vec3.h
@@ -28,10 +28,18 @@
    CGLM_INLINE void  glm_vec_cross(vec3 a, vec3 b, vec3 d);
    CGLM_INLINE float glm_vec_norm2(vec3 v);
    CGLM_INLINE float glm_vec_norm(vec3 vec);
-   CGLM_INLINE void  glm_vec_add(vec3 v1, vec3 v2, vec3 dest);
-   CGLM_INLINE void  glm_vec_sub(vec3 v1, vec3 v2, vec3 dest);
+   CGLM_INLINE void  glm_vec_add(vec3 a, vec3 b, vec3 dest);
+   CGLM_INLINE void  glm_vec_adds(vec3 a, float s, vec3 dest);
+   CGLM_INLINE void  glm_vec_sub(vec3 a, vec3 b, vec3 dest);
+   CGLM_INLINE void  glm_vec_subs(vec3 a, float s, vec3 dest);
+   CGLM_INLINE void  glm_vec_mul(vec3 a, vec3 b, vec3 dest);
    CGLM_INLINE void  glm_vec_scale(vec3 v, float s, vec3 dest);
    CGLM_INLINE void  glm_vec_scale_as(vec3 v, float s, vec3 dest);
+   CGLM_INLINE void  glm_vec_div(vec3 a, vec3 b, vec3 dest);
+   CGLM_INLINE void  glm_vec_divs(vec3 a, float s, vec3 dest);
+   CGLM_INLINE void  glm_vec_addadd(vec3 a, vec3 b, vec3 dest);
+   CGLM_INLINE void  glm_vec_subadd(vec3 a, vec3 b, vec3 dest);
+   CGLM_INLINE void  glm_vec_muladd(vec3 a, vec3 b, vec3 dest);
    CGLM_INLINE void  glm_vec_flipsign(vec3 v);
    CGLM_INLINE void  glm_vec_inv(vec3 v);
    CGLM_INLINE void  glm_vec_inv_to(vec3 v, vec3 dest);
@@ -59,6 +67,7 @@
 #define cglm_vec3_h
 
 #include "common.h"
+#include "vec4.h"
 #include "vec3-ext.h"
 #include "util.h"
 
@@ -103,6 +112,32 @@ glm_vec_copy(vec3 a, vec3 dest) {
   dest[2] = a[2];
 }
 
+/*!
+ * @brief make vector zero
+ *
+ * @param[in, out]  v vector
+ */
+CGLM_INLINE
+void
+glm_vec_zero(vec3 v) {
+  v[0] = 0.0f;
+  v[1] = 0.0f;
+  v[2] = 0.0f;
+}
+
+/*!
+ * @brief make vector one
+ *
+ * @param[in, out]  v vector
+ */
+CGLM_INLINE
+void
+glm_vec_one(vec3 v) {
+  v[0] = 1.0f;
+  v[1] = 1.0f;
+  v[2] = 1.0f;
+}
+
 /*!
  * @brief vec3 dot product
  *
@@ -147,7 +182,7 @@ glm_vec_cross(vec3 a, vec3 b, vec3 d) {
 CGLM_INLINE
 float
 glm_vec_norm2(vec3 v) {
-  return v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
+  return glm_vec_dot(v, v);
 }
 
 /*!
@@ -164,33 +199,78 @@ glm_vec_norm(vec3 vec) {
 }
 
 /*!
- * @brief add v2 vector to v1 vector store result in dest
+ * @brief add a vector to b vector store result in dest
  *
- * @param[in]  v1 vector1
- * @param[in]  v2 vector2
+ * @param[in]  a    vector1
+ * @param[in]  b    vector2
  * @param[out] dest destination vector
  */
 CGLM_INLINE
 void
-glm_vec_add(vec3 v1, vec3 v2, vec3 dest) {
-  dest[0] = v1[0] + v2[0];
-  dest[1] = v1[1] + v2[1];
-  dest[2] = v1[2] + v2[2];
+glm_vec_add(vec3 a, vec3 b, vec3 dest) {
+  dest[0] = a[0] + b[0];
+  dest[1] = a[1] + b[1];
+  dest[2] = a[2] + b[2];
+}
+
+/*!
+ * @brief add scalar to v vector store result in dest (d = v + s)
+ *
+ * @param[in]  v    vector
+ * @param[in]  s    scalar
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec_adds(vec3 v, float s, vec3 dest) {
+  dest[0] = v[0] + s;
+  dest[1] = v[1] + s;
+  dest[2] = v[2] + s;
 }
 
 /*!
  * @brief subtract v2 vector from v1 vector store result in dest
  *
- * @param[in]  v1 vector1
- * @param[in]  v2 vector2
+ * @param[in]  a    vector1
+ * @param[in]  b    vector2
  * @param[out] dest destination vector
  */
 CGLM_INLINE
 void
-glm_vec_sub(vec3 v1, vec3 v2, vec3 dest) {
-  dest[0] = v1[0] - v2[0];
-  dest[1] = v1[1] - v2[1];
-  dest[2] = v1[2] - v2[2];
+glm_vec_sub(vec3 a, vec3 b, vec3 dest) {
+  dest[0] = a[0] - b[0];
+  dest[1] = a[1] - b[1];
+  dest[2] = a[2] - b[2];
+}
+
+/*!
+ * @brief subtract scalar from v vector store result in dest (d = v - s)
+ *
+ * @param[in]  v    vector
+ * @param[in]  s    scalar
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec_subs(vec3 v, float s, vec3 dest) {
+  dest[0] = v[0] - s;
+  dest[1] = v[1] - s;
+  dest[2] = v[2] - s;
+}
+
+/*!
+ * @brief multiply two vector (component-wise multiplication)
+ *
+ * @param a v1
+ * @param b v2
+ * @param d v3 = (a[0] * b[0], a[1] * b[1], a[2] * b[2])
+ */
+CGLM_INLINE
+void
+glm_vec_mul(vec3 a, vec3 b, vec3 d) {
+  d[0] = a[0] * b[0];
+  d[1] = a[1] * b[1];
+  d[2] = a[2] * b[2];
 }
 
 /*!
@@ -221,14 +301,112 @@ glm_vec_scale_as(vec3 v, float s, vec3 dest) {
   float norm;
   norm = glm_vec_norm(v);
 
-  if (norm == 0) {
-    glm_vec_copy(v, dest);
+  if (norm == 0.0f) {
+    glm_vec_zero(dest);
     return;
   }
 
   glm_vec_scale(v, s / norm, dest);
 }
 
+/*!
+ * @brief div vector with another component-wise division: d = a / b
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest result = (a[0]/b[0], a[1]/b[1], a[2]/b[2])
+ */
+CGLM_INLINE
+void
+glm_vec_div(vec3 a, vec3 b, vec3 dest) {
+  dest[0] = a[0] / b[0];
+  dest[1] = a[1] / b[1];
+  dest[2] = a[2] / b[2];
+}
+
+/*!
+ * @brief div vector with scalar: d = v / s
+ *
+ * @param[in]  v    vector
+ * @param[in]  s    scalar
+ * @param[out] dest result = (v[0]/s, v[1]/s, v[2]/s)
+ */
+CGLM_INLINE
+void
+glm_vec_divs(vec3 v, float s, vec3 dest) {
+  dest[0] = v[0] / s;
+  dest[1] = v[1] / s;
+  dest[2] = v[2] / s;
+}
+
+/*!
+ * @brief add two vectors and add result to sum
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest dest += (a + b)
+ */
+CGLM_INLINE
+void
+glm_vec_addadd(vec3 a, vec3 b, vec3 dest) {
+  dest[0] += a[0] + b[0];
+  dest[1] += a[1] + b[1];
+  dest[2] += a[2] + b[2];
+}
+
+/*!
+ * @brief sub two vectors and add result to dest
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest dest += (a - b)
+ */
+CGLM_INLINE
+void
+glm_vec_subadd(vec3 a, vec3 b, vec3 dest) {
+  dest[0] += a[0] - b[0];
+  dest[1] += a[1] - b[1];
+  dest[2] += a[2] - b[2];
+}
+
+/*!
+ * @brief mul two vectors and add result to dest
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest dest += (a * b)
+ */
+CGLM_INLINE
+void
+glm_vec_muladd(vec3 a, vec3 b, vec3 dest) {
+  dest[0] += a[0] * b[0];
+  dest[1] += a[1] * b[1];
+  dest[2] += a[2] * b[2];
+}
+
+/*!
+ * @brief mul vector with scalar and add result to sum
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector
+ * @param[in]  s    scalar
+ * @param[out] dest dest += (a * s)
+ */
+CGLM_INLINE
+void
+glm_vec_muladds(vec3 a, float s, vec3 dest) {
+  dest[0] += a[0] * s;
+  dest[1] += a[1] * s;
+  dest[2] += a[2] * s;
+}
+
 /*!
  * @brief flip sign of all vec3 members
  *
@@ -242,6 +420,20 @@ glm_vec_flipsign(vec3 v) {
   v[2] = -v[2];
 }
 
+/*!
+ * @brief flip sign of all vec3 members and store result in dest
+ *
+ * @param[in]   v     vector
+ * @param[out]  dest  result vector
+ */
+CGLM_INLINE
+void
+glm_vec_flipsign_to(vec3 v, vec3 dest) {
+  dest[0] = -v[0];
+  dest[1] = -v[1];
+  dest[2] = -v[2];
+}
+
 /*!
  * @brief make vector as inverse/opposite of itself
  *
@@ -300,7 +492,7 @@ glm_vec_normalize_to(vec3 vec, vec3 dest) {
   norm = glm_vec_norm(vec);
 
   if (norm == 0.0f) {
-    dest[0] = dest[1] = dest[2] = 0.0f;
+    glm_vec_zero(dest);
     return;
   }
 
@@ -325,12 +517,6 @@ glm_vec_angle(vec3 v1, vec3 v2) {
   return acosf(glm_vec_dot(v1, v2) * norm);
 }
 
-CGLM_INLINE
-void
-glm_quatv(versor q,
-          float  angle,
-          vec3   v);
-
 /*!
  * @brief rotate vec3 around axis by angle using Rodrigues' rotation formula
  *
@@ -341,31 +527,55 @@ glm_quatv(versor q,
 CGLM_INLINE
 void
 glm_vec_rotate(vec3 v, float angle, vec3 axis) {
-  versor q;
-  vec3   v1, v2, v3;
+  vec3   v1, v2, k;
   float  c, s;
 
   c = cosf(angle);
   s = sinf(angle);
 
+  glm_vec_normalize_to(axis, k);
+
   /* Right Hand, Rodrigues' rotation formula:
         v = v*cos(t) + (kxv)sin(t) + k*(k.v)(1 - cos(t))
    */
-
-  /* quaternion */
-  glm_quatv(q, angle, v);
-
   glm_vec_scale(v, c, v1);
 
-  glm_vec_cross(axis, v, v2);
+  glm_vec_cross(k, v, v2);
   glm_vec_scale(v2, s, v2);
 
-  glm_vec_scale(axis,
-                glm_vec_dot(axis, v) * (1.0f - c),
-                v3);
-
   glm_vec_add(v1, v2, v1);
-  glm_vec_add(v1, v3, v);
+
+  glm_vec_scale(k, glm_vec_dot(k, v) * (1.0f - c), v2);
+  glm_vec_add(v1, v2, v);
+}
+
+/*!
+ * @brief apply rotation matrix to vector
+ *
+ *  matrix format should be (no perspective):
+ *   a  b  c  x
+ *   e  f  g  y
+ *   i  j  k  z
+ *   0  0  0  w
+ *
+ * @param[in]  m    affine matrix or rot matrix
+ * @param[in]  v    vector
+ * @param[out] dest rotated vector
+ */
+CGLM_INLINE
+void
+glm_vec_rotate_m4(mat4 m, vec3 v, vec3 dest) {
+  vec4 x, y, z, res;
+
+  glm_vec4_normalize_to(m[0], x);
+  glm_vec4_normalize_to(m[1], y);
+  glm_vec4_normalize_to(m[2], z);
+
+  glm_vec4_scale(x,   v[0], res);
+  glm_vec4_muladds(y, v[1], res);
+  glm_vec4_muladds(z, v[2], res);
+
+  glm_vec3(res, dest);
 }
 
 /*!
@@ -377,18 +587,22 @@ glm_vec_rotate(vec3 v, float angle, vec3 axis) {
  */
 CGLM_INLINE
 void
-glm_vec_rotate_m4(mat4 m, vec3 v, vec3 dest) {
-  vec3 res, x, y, z;
+glm_vec_rotate_m3(mat3 m, vec3 v, vec3 dest) {
+  vec4 res, x, y, z;
 
-  glm_vec_normalize_to(m[0], x);
-  glm_vec_normalize_to(m[1], y);
-  glm_vec_normalize_to(m[2], z);
+  glm_vec4(m[0], 0.0f, x);
+  glm_vec4(m[1], 0.0f, y);
+  glm_vec4(m[2], 0.0f, z);
 
-  res[0] = x[0] * v[0] + y[0] * v[1] + z[0] * v[2];
-  res[1] = x[1] * v[0] + y[1] * v[1] + z[1] * v[2];
-  res[2] = x[2] * v[0] + y[2] * v[1] + z[2] * v[2];
+  glm_vec4_normalize(x);
+  glm_vec4_normalize(y);
+  glm_vec4_normalize(z);
 
-  glm_vec_copy(res, dest);
+  glm_vec4_scale(x,   v[0], res);
+  glm_vec4_muladds(y, v[1], res);
+  glm_vec4_muladds(z, v[2], res);
+
+  glm_vec3(res, dest);
 }
 
 /*!
@@ -494,6 +708,28 @@ glm_vec_clamp(vec3 v, float minVal, float maxVal) {
   v[2] = glm_clamp(v[2], minVal, maxVal);
 }
 
+/*!
+ * @brief linear interpolation between two vectors
+ *
+ * formula:  from + s * (to - from), where s = clamp(t, 0, 1)
+ *
+ * @param[in]   from from value
+ * @param[in]   to   to value
+ * @param[in]   t    interpolant (amount) clamped between 0 and 1
+ * @param[out]  dest destination
+ */
+CGLM_INLINE
+void
+glm_vec_lerp(vec3 from, vec3 to, float t, vec3 dest) {
+  vec3 s, v;
+
+  /* from + s * (to - from); s holds the clamped t in every item */
+  glm_vec_broadcast(glm_clamp(t, 0.0f, 1.0f), s);
+  glm_vec_sub(to, from, v);
+  glm_vec_mul(s, v, v); /* glm_vec_mulv is deprecated in favor of this */
+  glm_vec_add(from, v, dest);
+}
+
 /*!
  * @brief vec3 cross product
  *
diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h
index ca697af..94150da 100644
--- a/include/cglm/vec4-ext.h
+++ b/include/cglm/vec4-ext.h
@@ -32,7 +32,7 @@
 #include <float.h>
 
 /*!
- * @brief multiplies individual items, just for convenient like SIMD
+ * @brief DEPRECATED! use glm_vec4_mul
  *
  * @param a v1
  * @param b v2
@@ -42,7 +42,7 @@ CGLM_INLINE
 void
 glm_vec4_mulv(vec4 a, vec4 b, vec4 d) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)));
+  glmm_store(d, _mm_mul_ps(glmm_load(a), glmm_load(b)));
 #else
   d[0] = a[0] * b[0];
   d[1] = a[1] * b[1];
@@ -61,7 +61,7 @@ CGLM_INLINE
 void
 glm_vec4_broadcast(float val, vec4 d) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(d, _mm_set1_ps(val));
+  glmm_store(d, _mm_set1_ps(val));
 #else
   d[0] = d[1] = d[2] = d[3] = val;
 #endif
@@ -174,5 +174,88 @@ glm_vec4_min(vec4 v) {
   return min;
 }
 
-#endif /* cglm_vec4_ext_h */
+/*!
+ * @brief check if one of items is NaN (not a number)
+ *        you should only use this in DEBUG mode or very critical asserts
+ *
+ * @param[in] v vector
+ */
+CGLM_INLINE
+bool
+glm_vec4_isnan(vec4 v) {
+  return isnan(v[0]) || isnan(v[1]) || isnan(v[2]) || isnan(v[3]);
+}
 
+/*!
+ * @brief check if one of items is INFINITY
+ *        you should only use this in DEBUG mode or very critical asserts
+ *
+ * @param[in] v vector
+ */
+CGLM_INLINE
+bool
+glm_vec4_isinf(vec4 v) {
+  return isinf(v[0]) || isinf(v[1]) || isinf(v[2]) || isinf(v[3]);
+}
+
+/*!
+ * @brief check if all items are valid number
+ *        you should only use this in DEBUG mode or very critical asserts
+ *
+ * @param[in] v vector
+ */
+CGLM_INLINE
+bool
+glm_vec4_isvalid(vec4 v) {
+  return !glm_vec4_isnan(v) && !glm_vec4_isinf(v);
+}
+
+/*!
+ * @brief get sign of each 32 bit float item as +1, -1, 0
+ *
+ * Important: It returns 0 for zero/NaN input
+ *
+ * @param[in]  v    vector
+ * @param[out] dest sign vector (+1, -1 or 0 for each item)
+ */
+CGLM_INLINE
+void
+glm_vec4_sign(vec4 v, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 x0, x1, x2, x3, x4;
+  x0 = glmm_load(v);
+  x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f);
+  x2 = _mm_shuffle1_ps1(x1, 2);
+  /* x2 = 0 in all lanes; mask in +1 where v > 0 and -1 where v < 0 */
+  x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), _mm_shuffle1_ps1(x1, 1));
+  x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), _mm_shuffle1_ps1(x1, 0));
+
+  glmm_store(dest, _mm_or_ps(x3, x4));
+#else
+  dest[0] = glm_signf(v[0]);
+  dest[1] = glm_signf(v[1]);
+  dest[2] = glm_signf(v[2]);
+  dest[3] = glm_signf(v[3]);
+#endif
+}
+
+/*!
+ * @brief square root of each vector item
+ *
+ * @param[in]  v    vector
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec4_sqrt(vec4 v, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sqrt_ps(glmm_load(v)));
+#else
+  dest[0] = sqrtf(v[0]);
+  dest[1] = sqrtf(v[1]);
+  dest[2] = sqrtf(v[2]);
+  dest[3] = sqrtf(v[3]);
+#endif
+}
+
+#endif /* cglm_vec4_ext_h */
diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h
index 95bab09..b98190b 100644
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -28,10 +28,18 @@
    CGLM_INLINE float glm_vec4_dot(vec4 a, vec4 b);
    CGLM_INLINE float glm_vec4_norm2(vec4 v);
    CGLM_INLINE float glm_vec4_norm(vec4 vec);
-   CGLM_INLINE void  glm_vec4_add(vec4 v1, vec4 v2, vec4 dest);
-   CGLM_INLINE void  glm_vec4_sub(vec4 v1, vec4 v2, vec4 dest);
+   CGLM_INLINE void  glm_vec4_add(vec4 a, vec4 b, vec4 dest);
+   CGLM_INLINE void  glm_vec4_adds(vec4 v, float s, vec4 dest);
+   CGLM_INLINE void  glm_vec4_sub(vec4 a, vec4 b, vec4 dest);
+   CGLM_INLINE void  glm_vec4_subs(vec4 v, float s, vec4 dest);
+   CGLM_INLINE void  glm_vec4_mul(vec4 a, vec4 b, vec4 dest);
    CGLM_INLINE void  glm_vec4_scale(vec4 v, float s, vec4 dest);
    CGLM_INLINE void  glm_vec4_scale_as(vec4 v, float s, vec4 dest);
+   CGLM_INLINE void  glm_vec4_div(vec4 a, vec4 b, vec4 dest);
+   CGLM_INLINE void  glm_vec4_divs(vec4 v, float s, vec4 dest);
+   CGLM_INLINE void  glm_vec4_addadd(vec4 a, vec4 b, vec4 dest);
+   CGLM_INLINE void  glm_vec4_subadd(vec4 a, vec4 b, vec4 dest);
+   CGLM_INLINE void  glm_vec4_muladd(vec4 a, vec4 b, vec4 dest);
    CGLM_INLINE void  glm_vec4_flipsign(vec4 v);
    CGLM_INLINE void  glm_vec4_inv(vec4 v);
    CGLM_INLINE void  glm_vec4_inv_to(vec4 v, vec4 dest);
@@ -41,6 +49,7 @@
    CGLM_INLINE void  glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest);
    CGLM_INLINE void  glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest);
    CGLM_INLINE void  glm_vec4_clamp(vec4 v, float minVal, float maxVal);
+   CGLM_INLINE void  glm_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest);
  */
 
 #ifndef cglm_vec4_h
@@ -102,7 +111,7 @@ CGLM_INLINE
 void
 glm_vec4_copy(vec4 v, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(dest, _mm_load_ps(v));
+  glmm_store(dest, glmm_load(v));
 #else
   dest[0] = v[0];
   dest[1] = v[1];
@@ -111,6 +120,42 @@ glm_vec4_copy(vec4 v, vec4 dest) {
 #endif
 }
 
+/*!
+ * @brief make vector zero
+ *
+ * @param[in, out]  v vector
+ */
+CGLM_INLINE
+void
+glm_vec4_zero(vec4 v) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_setzero_ps());
+#else
+  v[0] = 0.0f;
+  v[1] = 0.0f;
+  v[2] = 0.0f;
+  v[3] = 0.0f;
+#endif
+}
+
+/*!
+ * @brief make vector one
+ *
+ * @param[in, out]  v vector
+ */
+CGLM_INLINE
+void
+glm_vec4_one(vec4 v) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_set1_ps(1.0f));
+#else
+  v[0] = 1.0f;
+  v[1] = 1.0f;
+  v[2] = 1.0f;
+  v[3] = 1.0f;
+#endif
+}
+
 /*!
  * @brief vec4 dot product
  *
@@ -122,7 +167,14 @@ glm_vec4_copy(vec4 v, vec4 dest) {
 CGLM_INLINE
 float
 glm_vec4_dot(vec4 a, vec4 b) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 x0;
+  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
+  x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
+  return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)));
+#else
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
+#endif
 }
 
 /*!
@@ -139,7 +191,15 @@ glm_vec4_dot(vec4 a, vec4 b) {
 CGLM_INLINE
 float
 glm_vec4_norm2(vec4 v) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 x0;
+  x0 = glmm_load(v);
+  x0 = _mm_mul_ps(x0, x0);
+  x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
+  return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)));
+#else
   return v[0] * v[0] + v[1] * v[1] + v[2] * v[2] + v[3] * v[3];
+#endif
 }
 
 /*!
@@ -152,50 +212,112 @@ glm_vec4_norm2(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_norm(vec4 vec) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 x0;
+  x0 = glmm_load(vec);
+  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0)));
+#else
   return sqrtf(glm_vec4_norm2(vec));
+#endif
 }
 
 /*!
  * @brief add v2 vector to v1 vector store result in dest
  *
- * @param[in]  v1 vector1
- * @param[in]  v2 vector2
+ * @param[in]  a    vector1
+ * @param[in]  b    vector2
  * @param[out] dest destination vector
  */
 CGLM_INLINE
 void
-glm_vec4_add(vec4 v1, vec4 v2, vec4 dest) {
+glm_vec4_add(vec4 a, vec4 b, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(dest,
-               _mm_add_ps(_mm_load_ps(v1),
-                          _mm_load_ps(v2)));
+  glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b)));
 #else
-  dest[0] = v1[0] + v2[0];
-  dest[1] = v1[1] + v2[1];
-  dest[2] = v1[2] + v2[2];
-  dest[3] = v1[3] + v2[3];
+  dest[0] = a[0] + b[0];
+  dest[1] = a[1] + b[1];
+  dest[2] = a[2] + b[2];
+  dest[3] = a[3] + b[3];
 #endif
 }
 
 /*!
- * @brief subtract v2 vector from v1 vector store result in dest
+ * @brief add scalar to v vector store result in dest (d = v + vec(s))
  *
- * @param[in]  v1 vector1
- * @param[in]  v2 vector2
+ * @param[in]  v    vector
+ * @param[in]  s    scalar
  * @param[out] dest destination vector
  */
 CGLM_INLINE
 void
-glm_vec4_sub(vec4 v1, vec4 v2, vec4 dest) {
+glm_vec4_adds(vec4 v, float s, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(dest,
-               _mm_sub_ps(_mm_load_ps(v1),
-                          _mm_load_ps(v2)));
+  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
 #else
-  dest[0] = v1[0] - v2[0];
-  dest[1] = v1[1] - v2[1];
-  dest[2] = v1[2] - v2[2];
-  dest[3] = v1[3] - v2[3];
+  dest[0] = v[0] + s;
+  dest[1] = v[1] + s;
+  dest[2] = v[2] + s;
+  dest[3] = v[3] + s;
+#endif
+}
+
+/*!
+ * @brief subtract b vector from a vector store result in dest (d = a - b)
+ *
+ * @param[in]  a    vector1
+ * @param[in]  b    vector2
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec4_sub(vec4 a, vec4 b, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b)));
+#else
+  dest[0] = a[0] - b[0];
+  dest[1] = a[1] - b[1];
+  dest[2] = a[2] - b[2];
+  dest[3] = a[3] - b[3];
+#endif
+}
+
+/*!
+ * @brief subtract scalar from v vector store result in dest (d = v - vec(s))
+ *
+ * @param[in]  v    vector
+ * @param[in]  s    scalar
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec4_subs(vec4 v, float s, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
+#else
+  dest[0] = v[0] - s;
+  dest[1] = v[1] - s;
+  dest[2] = v[2] - s;
+  dest[3] = v[3] - s;
+#endif
+}
+
+/*!
+ * @brief multiply two vector (component-wise multiplication)
+ *
+ * @param a v1
+ * @param b v2
+ * @param d v3 = (a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3])
+ */
+CGLM_INLINE
+void
+glm_vec4_mul(vec4 a, vec4 b, vec4 d) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(d, _mm_mul_ps(glmm_load(a), glmm_load(b)));
+#else
+  d[0] = a[0] * b[0];
+  d[1] = a[1] * b[1];
+  d[2] = a[2] * b[2];
+  d[3] = a[3] * b[3];
 #endif
 }
 
@@ -210,9 +332,7 @@ CGLM_INLINE
 void
 glm_vec4_scale(vec4 v, float s, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(dest,
-               _mm_mul_ps(_mm_load_ps(v),
-                          _mm_set1_ps(s)));
+  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
 #else
   dest[0] = v[0] * s;
   dest[1] = v[1] * s;
@@ -234,14 +354,148 @@ glm_vec4_scale_as(vec4 v, float s, vec4 dest) {
   float norm;
   norm = glm_vec4_norm(v);
 
-  if (norm == 0) {
-    glm_vec4_copy(v, dest);
+  if (norm == 0.0f) {
+    glm_vec4_zero(dest);
     return;
   }
 
   glm_vec4_scale(v, s / norm, dest);
 }
 
+/*!
+ * @brief div vector with another component-wise division: d = a / b
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest result = (a[0]/b[0], a[1]/b[1], a[2]/b[2], a[3]/b[3])
+ */
+CGLM_INLINE
+void
+glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b)));
+#else
+  dest[0] = a[0] / b[0];
+  dest[1] = a[1] / b[1];
+  dest[2] = a[2] / b[2];
+  dest[3] = a[3] / b[3];
+#endif
+}
+
+/*!
+ * @brief div vec4 vector with scalar: d = v / s
+ *
+ * @param[in]  v    vector
+ * @param[in]  s    scalar
+ * @param[out] dest destination vector
+ */
+CGLM_INLINE
+void
+glm_vec4_divs(vec4 v, float s, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
+#else
+  glm_vec4_scale(v, 1.0f / s, dest);
+#endif
+}
+
+
+/*!
+ * @brief add two vectors and add result to sum
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest dest += (a + b)
+ */
+CGLM_INLINE
+void
+glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_add_ps(glmm_load(a),
+                                         glmm_load(b))));
+#else
+  dest[0] += a[0] + b[0];
+  dest[1] += a[1] + b[1];
+  dest[2] += a[2] + b[2];
+  dest[3] += a[3] + b[3];
+#endif
+}
+
+/*!
+ * @brief sub two vectors and add result to dest
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest dest += (a - b)
+ */
+CGLM_INLINE
+void
+glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_sub_ps(glmm_load(a),
+                                         glmm_load(b))));
+#else
+  dest[0] += a[0] - b[0];
+  dest[1] += a[1] - b[1];
+  dest[2] += a[2] - b[2];
+  dest[3] += a[3] - b[3];
+#endif
+}
+
+/*!
+ * @brief mul two vectors and add result to dest
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector 1
+ * @param[in]  b    vector 2
+ * @param[out] dest dest += (a * b)
+ */
+CGLM_INLINE
+void
+glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_mul_ps(glmm_load(a),
+                                         glmm_load(b))));
+#else
+  dest[0] += a[0] * b[0];
+  dest[1] += a[1] * b[1];
+  dest[2] += a[2] * b[2];
+  dest[3] += a[3] * b[3];
+#endif
+}
+
+/*!
+ * @brief mul vector with scalar and add result to sum
+ *
+ * it applies += operator so dest must be initialized
+ *
+ * @param[in]  a    vector
+ * @param[in]  s    scalar
+ * @param[out] dest dest += (a * s)
+ */
+CGLM_INLINE
+void
+glm_vec4_muladds(vec4 a, float s, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_add_ps(glmm_load(dest),
+                              _mm_mul_ps(glmm_load(a),
+                                         _mm_set1_ps(s))));
+#else
+  dest[0] += a[0] * s;
+  dest[1] += a[1] * s;
+  dest[2] += a[2] * s;
+  dest[3] += a[3] * s;
+#endif
+}
+
 /*!
  * @brief flip sign of all vec4 members
  *
@@ -251,8 +505,7 @@ CGLM_INLINE
 void
 glm_vec4_flipsign(vec4 v) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
-  _mm_store_ps(v, _mm_xor_ps(_mm_load_ps(v),
-                             _mm_set1_ps(-0.0f)));
+  glmm_store(v, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
 #else
   v[0] = -v[0];
   v[1] = -v[1];
@@ -261,6 +514,25 @@ glm_vec4_flipsign(vec4 v) {
 #endif
 }
 
+/*!
+ * @brief flip sign of all vec4 members and store result in dest
+ *
+ * @param[in]  v     vector
+ * @param[out] dest  vector
+ */
+CGLM_INLINE
+void
+glm_vec4_flipsign_to(vec4 v, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
+#else
+  dest[0] = -v[0];
+  dest[1] = -v[1];
+  dest[2] = -v[2];
+  dest[3] = -v[3];
+#endif
+}
+
 /*!
  * @brief make vector as inverse/opposite of itself
  *
@@ -285,26 +557,6 @@ glm_vec4_inv_to(vec4 v, vec4 dest) {
   glm_vec4_flipsign(dest);
 }
 
-/*!
- * @brief normalize vec4 and store result in same vec
- *
- * @param[in, out] v vector
- */
-CGLM_INLINE
-void
-glm_vec4_normalize(vec4 v) {
-  float norm;
-
-  norm = glm_vec4_norm(v);
-
-  if (norm == 0.0f) {
-    v[0] = v[1] = v[2] = v[3] = 0.0f;
-    return;
-  }
-
-  glm_vec4_scale(v, 1.0f / norm, v);
-}
-
 /*!
  * @brief normalize vec4 to dest
  *
@@ -314,16 +566,43 @@ glm_vec4_normalize(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_normalize_to(vec4 vec, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float  dot;
+
+  x0   = glmm_load(vec);
+  xdot = glmm_dot(x0, x0);
+  dot  = _mm_cvtss_f32(xdot);
+
+  if (dot == 0.0f) {
+    glmm_store(dest, _mm_setzero_ps());
+    return;
+  }
+
+  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
+#else
   float norm;
 
   norm = glm_vec4_norm(vec);
 
   if (norm == 0.0f) {
-    dest[0] = dest[1] = dest[2] = dest[3] = 0.0f;
+    glm_vec4_zero(dest);
     return;
   }
 
   glm_vec4_scale(vec, 1.0f / norm, dest);
+#endif
+}
+
+/*!
+ * @brief normalize vec4 and store result in same vec
+ *
+ * @param[in, out] v vector
+ */
+CGLM_INLINE
+void
+glm_vec4_normalize(vec4 v) {
+  glm_vec4_normalize_to(v, v);
 }
 
 /**
@@ -352,10 +631,14 @@ glm_vec4_distance(vec4 v1, vec4 v2) {
 CGLM_INLINE
 void
 glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_max_ps(glmm_load(v1), glmm_load(v2)));
+#else
   dest[0] = glm_max(v1[0], v2[0]);
   dest[1] = glm_max(v1[1], v2[1]);
   dest[2] = glm_max(v1[2], v2[2]);
   dest[3] = glm_max(v1[3], v2[3]);
+#endif
 }
 
 /*!
@@ -368,10 +651,14 @@ glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(dest, _mm_min_ps(glmm_load(v1), glmm_load(v2)));
+#else
   dest[0] = glm_min(v1[0], v2[0]);
   dest[1] = glm_min(v1[1], v2[1]);
   dest[2] = glm_min(v1[2], v2[2]);
   dest[3] = glm_min(v1[3], v2[3]);
+#endif
 }
 
 /*!
@@ -384,10 +671,37 @@ glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
+                           _mm_set1_ps(maxVal)));
+#else
   v[0] = glm_clamp(v[0], minVal, maxVal);
   v[1] = glm_clamp(v[1], minVal, maxVal);
   v[2] = glm_clamp(v[2], minVal, maxVal);
   v[3] = glm_clamp(v[3], minVal, maxVal);
+#endif
+}
+
+/*!
+ * @brief linear interpolation between two vectors
+ *
+ * formula:  from + s * (to - from), where s = clamp(t, 0, 1)
+ *
+ * @param[in]   from from value
+ * @param[in]   to   to value
+ * @param[in]   t    interpolant (amount) clamped between 0 and 1
+ * @param[out]  dest destination
+ */
+CGLM_INLINE
+void
+glm_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest) {
+  vec4 s, v;
+
+  /* from + s * (to - from); s holds the clamped t in every item */
+  glm_vec4_broadcast(glm_clamp(t, 0.0f, 1.0f), s);
+  glm_vec4_sub(to, from, v);
+  glm_vec4_mul(s, v, v); /* glm_vec4_mulv is deprecated in favor of this */
+  glm_vec4_add(from, v, dest);
+}
 
 #endif /* cglm_vec4_h */
diff --git a/include/cglm/version.h b/include/cglm/version.h
index c27a26e..8e405bd 100644
--- a/include/cglm/version.h
+++ b/include/cglm/version.h
@@ -9,7 +9,7 @@
 #define cglm_version_h
 
 #define CGLM_VERSION_MAJOR 0
-#define CGLM_VERSION_MINOR 3
-#define CGLM_VERSION_PATCH 6
+#define CGLM_VERSION_MINOR 4
+#define CGLM_VERSION_PATCH 5
 
 #endif /* cglm_version_h */
diff --git a/makefile.am b/makefile.am
index 217fff3..1999b3d 100644
--- a/makefile.am
+++ b/makefile.am
@@ -54,7 +54,8 @@ cglm_HEADERS = include/cglm/version.h \
                   include/cglm/plane.h \
                   include/cglm/frustum.h \
                   include/cglm/box.h \
-                  include/cglm/color.h
+                  include/cglm/color.h \
+                  include/cglm/project.h
 
 cglm_calldir=$(includedir)/cglm/call
 cglm_call_HEADERS = include/cglm/call/mat4.h \
@@ -68,7 +69,8 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                     include/cglm/call/euler.h \
                     include/cglm/call/plane.h \
                     include/cglm/call/frustum.h \
-                    include/cglm/call/box.h
+                    include/cglm/call/box.h \
+                    include/cglm/call/project.h
 
 cglm_simddir=$(includedir)/cglm/simd
 cglm_simd_HEADERS = include/cglm/simd/intrin.h
@@ -108,7 +110,12 @@ test_tests_SOURCES=\
     test/src/test_cam.c \
     test/src/test_project.c \
     test/src/test_clamp.c \
-    test/src/test_euler.c
+    test/src/test_euler.c \
+    test/src/test_quat.c \
+    test/src/test_vec4.c \
+    test/src/test_vec3.c \
+    test/src/test_mat3.c \
+    test/src/test_affine.c
 
 all-local:
 	sh ./post-build.sh
diff --git a/src/affine.c b/src/affine.c
index df591df..a271f9f 100644
--- a/src/affine.c
+++ b/src/affine.c
@@ -8,6 +8,12 @@
 #include "../include/cglm/cglm.h"
 #include "../include/cglm/call.h"
 
+CGLM_EXPORT
+void
+glmc_translate_make(mat4 m, vec3 v) {
+  glm_translate_make(m, v);
+}
+
 CGLM_EXPORT
 void
 glmc_translate_to(mat4 m, vec3 v, mat4 dest) {
@@ -38,6 +44,12 @@ glmc_translate_z(mat4 m, float to) {
   glm_translate_z(m, to);
 }
 
+CGLM_EXPORT
+void
+glmc_scale_make(mat4 m, vec3 v) {
+  glm_scale_make(m, v);
+}
+
 CGLM_EXPORT
 void
 glmc_scale_to(mat4 m, vec3 v, mat4 dest) {
@@ -52,8 +64,8 @@ glmc_scale(mat4 m, vec3 v) {
 
 CGLM_EXPORT
 void
-glmc_scale1(mat4 m, float s) {
-  glm_scale1(m, s);
+glmc_scale_uni(mat4 m, float s) {
+  glm_scale_uni(m, s);
 }
 
 CGLM_EXPORT
@@ -74,36 +86,42 @@ glmc_rotate_z(mat4 m, float rad, mat4 dest) {
   glm_rotate_z(m, rad, dest);
 }
 
-CGLM_EXPORT
-void
-glmc_rotate_ndc_make(mat4 m, float angle, vec3 axis_ndc) {
-  glm_rotate_ndc_make(m, angle, axis_ndc);
-}
-
 CGLM_EXPORT
 void
 glmc_rotate_make(mat4 m, float angle, vec3 axis) {
   glm_rotate_make(m, angle, axis);
 }
 
-CGLM_EXPORT
-void
-glmc_rotate_ndc(mat4 m, float angle, vec3 axis_ndc) {
-  glm_rotate_ndc(m, angle, axis_ndc);
-}
-
 CGLM_EXPORT
 void
 glmc_rotate(mat4 m, float angle, vec3 axis) {
   glm_rotate(m, angle, axis);
 }
 
+CGLM_EXPORT
+void
+glmc_rotate_at(mat4 m, vec3 pivot, float angle, vec3 axis) {
+  glm_rotate_at(m, pivot, angle, axis);
+}
+
+CGLM_EXPORT
+void
+glmc_rotate_atm(mat4 m, vec3 pivot, float angle, vec3 axis) {
+  glm_rotate_atm(m, pivot, angle, axis);
+}
+
 CGLM_EXPORT
 void
 glmc_decompose_scalev(mat4 m, vec3 s) {
   glm_decompose_scalev(m, s);
 }
 
+CGLM_EXPORT
+bool
+glmc_uniscaled(mat4 m) {
+  return glm_uniscaled(m);
+}
+
 CGLM_EXPORT
 void
 glmc_decompose_rs(mat4 m, mat4 r, vec3 s) {
diff --git a/src/mat4.c b/src/mat4.c
index 838b52d..9407684 100644
--- a/src/mat4.c
+++ b/src/mat4.c
@@ -52,7 +52,7 @@ glmc_mat4_mul(mat4 m1, mat4 m2, mat4 dest) {
 
 CGLM_EXPORT
 void
-glmc_mat4_mulN(mat4 * __restrict matrices[], int len, mat4 dest) {
+glmc_mat4_mulN(mat4 * __restrict matrices[], uint32_t len, mat4 dest) {
   glm_mat4_mulN(matrices, len, dest);
 }
 
@@ -62,6 +62,12 @@ glmc_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
   glm_mat4_mulv(m, v, dest);
 }
 
+CGLM_EXPORT
+void
+glmc_mat4_quat(mat4 m, versor dest) {
+  glm_mat4_quat(m, dest);
+}
+
 CGLM_EXPORT
 void
 glmc_mat4_transpose_to(mat4 m, mat4 dest) {
diff --git a/src/quat.c b/src/quat.c
index b26d112..c47761d 100644
--- a/src/quat.c
+++ b/src/quat.c
@@ -8,6 +8,7 @@
 #include "../include/cglm/cglm.h"
 #include "../include/cglm/call.h"
 
+
 CGLM_EXPORT
 void
 glmc_quat_identity(versor q) {
@@ -16,20 +17,26 @@ glmc_quat_identity(versor q) {
 
 CGLM_EXPORT
 void
-glmc_quat(versor q,
-               float angle,
-               float x,
-               float y,
-               float z) {
+glmc_quat_init(versor q, float x, float y, float z, float w) {
+  glm_quat_init(q, x, y, z, w);
+}
+
+CGLM_EXPORT
+void
+glmc_quat(versor q, float angle, float x, float y, float z) {
   glm_quat(q, angle, x, y, z);
 }
 
 CGLM_EXPORT
 void
-glmc_quatv(versor q,
-           float  angle,
-           vec3   v) {
-  glm_quatv(q, angle, v);
+glmc_quatv(versor q, float angle, vec3 axis) {
+  glm_quatv(q, angle, axis);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_copy(versor q, versor dest) {
+  glm_quat_copy(q, dest);
 }
 
 CGLM_EXPORT
@@ -40,20 +47,86 @@ glmc_quat_norm(versor q) {
 
 CGLM_EXPORT
 void
-glmc_quat_normalize(versor q) {
-  glm_quat_normalize(q);
-}
-
-CGLM_EXPORT
-float
-glmc_quat_dot(versor q, versor r) {
-  return glm_quat_dot(q, r);
+glmc_quat_normalize_to(versor q, versor dest) {
+  glm_quat_normalize_to(q, dest);
 }
 
 CGLM_EXPORT
 void
-glmc_quat_mulv(versor q1, versor q2, versor dest) {
-  glm_quat_mulv(q1, q2, dest);
+glmc_quat_normalize(versor q) {
+  glm_quat_normalize(q); /* normalize q in place (glm_quat_norm only returns the length) */
+}
+
+CGLM_EXPORT
+float
+glmc_quat_dot(versor p, versor q) {
+  return glm_quat_dot(p, q);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_conjugate(versor q, versor dest) {
+  glm_quat_conjugate(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_inv(versor q, versor dest) {
+  glm_quat_inv(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_add(versor p, versor q, versor dest) {
+  glm_quat_add(p, q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_sub(versor p, versor q, versor dest) {
+  glm_quat_sub(p, q, dest);
+}
+
+CGLM_EXPORT
+float
+glmc_quat_real(versor q) {
+  return glm_quat_real(q);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_imag(versor q, vec3 dest) {
+  glm_quat_imag(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_imagn(versor q, vec3 dest) {
+  glm_quat_imagn(q, dest);
+}
+
+CGLM_EXPORT
+float
+glmc_quat_imaglen(versor q) {
+  return glm_quat_imaglen(q);
+}
+
+CGLM_EXPORT
+float
+glmc_quat_angle(versor q) {
+  return glm_quat_angle(q);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_axis(versor q, vec3 dest) { /* dest receives the rotation axis, a vec3 */
+  glm_quat_axis(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_mul(versor p, versor q, versor dest) {
+  glm_quat_mul(p, q, dest);
 }
 
 CGLM_EXPORT
@@ -64,9 +137,72 @@ glmc_quat_mat4(versor q, mat4 dest) {
 
 CGLM_EXPORT
 void
-glmc_quat_slerp(versor q,
-                versor r,
-                float  t,
-                versor dest) {
-  glm_quat_slerp(q, r, t, dest);
+glmc_quat_mat4t(versor q, mat4 dest) {
+  glm_quat_mat4t(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_mat3(versor q, mat3 dest) {
+  glm_quat_mat3(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_mat3t(versor q, mat3 dest) {
+  glm_quat_mat3t(q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_lerp(versor from, versor to, float t, versor dest) {
+  glm_quat_lerp(from, to, t, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_slerp(versor from, versor to, float t, versor dest) {
+  glm_quat_slerp(from, to, t, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_look(vec3 eye, versor ori, mat4 dest) {
+  glm_quat_look(eye, ori, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest) {
+  glm_quat_for(dir, fwd, up, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_forp(vec3 from, vec3 to, vec3 fwd, vec3 up, versor dest) {
+  glm_quat_forp(from, to, fwd, up, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_rotatev(versor q, vec3 v, vec3 dest) {
+  glm_quat_rotatev(q, v, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_rotate(mat4 m, versor q, mat4 dest) {
+  glm_quat_rotate(m, q, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_rotate_at(mat4 model, versor q, vec3 pivot) {
+  glm_quat_rotate_at(model, q, pivot);
+}
+
+CGLM_EXPORT
+void
+glmc_quat_rotate_atm(mat4 m, versor q, vec3 pivot) {
+  glm_quat_rotate_atm(m, q, pivot);
 }
diff --git a/src/vec3.c b/src/vec3.c
index ebc677d..28fda51 100644
--- a/src/vec3.c
+++ b/src/vec3.c
@@ -8,12 +8,30 @@
 #include "../include/cglm/cglm.h"
 #include "../include/cglm/call.h"
 
+CGLM_EXPORT
+void
+glmc_vec3(vec4 v4, vec3 dest) {
+  glm_vec3(v4, dest);
+}
+
 CGLM_EXPORT
 void
 glmc_vec_copy(vec3 a, vec3 dest) {
   glm_vec_copy(a, dest);
 }
 
+CGLM_EXPORT
+void
+glmc_vec_zero(vec3 v) {
+  glm_vec_zero(v);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_one(vec3 v) {
+  glm_vec_one(v);
+}
+
 CGLM_EXPORT
 float
 glmc_vec_dot(vec3 a, vec3 b) {
@@ -58,8 +76,26 @@ glmc_vec_add(vec3 v1, vec3 v2, vec3 dest) {
 
 CGLM_EXPORT
 void
-glmc_vec_sub(vec3 v1, vec3 v2, vec3 dest) {
-  glm_vec_sub(v1, v2, dest);
+glmc_vec_adds(vec3 v, float s, vec3 dest) {
+  glm_vec_adds(v, s, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_sub(vec3 a, vec3 b, vec3 dest) {
+  glm_vec_sub(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_subs(vec3 v, float s, vec3 dest) {
+  glm_vec_subs(v, s, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_mul(vec3 a, vec3 b, vec3 d) {
+  glm_vec_mul(a, b, d);
 }
 
 CGLM_EXPORT
@@ -74,12 +110,54 @@ glmc_vec_scale_as(vec3 v, float s, vec3 dest) {
   glm_vec_scale_as(v, s, dest);
 }
 
+CGLM_EXPORT
+void
+glmc_vec_div(vec3 a, vec3 b, vec3 dest) {
+  glm_vec_div(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_divs(vec3 a, float s, vec3 dest) {
+  glm_vec_divs(a, s, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_addadd(vec3 a, vec3 b, vec3 dest) {
+  glm_vec_addadd(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_subadd(vec3 a, vec3 b, vec3 dest) {
+  glm_vec_subadd(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_muladd(vec3 a, vec3 b, vec3 dest) {
+  glm_vec_muladd(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_muladds(vec3 a, float s, vec3 dest) {
+  glm_vec_muladds(a, s, dest);
+}
+
 CGLM_EXPORT
 void
 glmc_vec_flipsign(vec3 v) {
   glm_vec_flipsign(v);
 }
 
+CGLM_EXPORT
+void
+glmc_vec_flipsign_to(vec3 v, vec3 dest) {
+  glm_vec_flipsign_to(v, dest);
+}
+
 CGLM_EXPORT
 void
 glmc_vec_inv(vec3 v) {
@@ -145,3 +223,101 @@ void
 glmc_vec_clamp(vec3 v, float minVal, float maxVal) {
   glm_vec_clamp(v, minVal, maxVal);
 }
+
+CGLM_EXPORT
+void
+glmc_vec_ortho(vec3 v, vec3 dest) {
+  glm_vec_ortho(v, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_lerp(vec3 from, vec3 to, float t, vec3 dest) {
+  glm_vec_lerp(from, to, t, dest);
+}
+
+/* ext */
+
+CGLM_EXPORT
+void
+glmc_vec_mulv(vec3 a, vec3 b, vec3 d) {
+  glm_vec_mulv(a, b, d);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_broadcast(float val, vec3 d) {
+  glm_vec_broadcast(val, d);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_eq(vec3 v, float val) {
+  return glm_vec_eq(v, val);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_eq_eps(vec3 v, float val) {
+  return glm_vec_eq_eps(v, val);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_eq_all(vec3 v) {
+  return glm_vec_eq_all(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_eqv(vec3 v1, vec3 v2) {
+  return glm_vec_eqv(v1, v2);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_eqv_eps(vec3 v1, vec3 v2) {
+  return glm_vec_eqv_eps(v1, v2);
+}
+
+CGLM_EXPORT
+float
+glmc_vec_max(vec3 v) {
+  return glm_vec_max(v);
+}
+
+CGLM_EXPORT
+float
+glmc_vec_min(vec3 v) {
+  return glm_vec_min(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_isnan(vec3 v) {
+  return glm_vec_isnan(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_isinf(vec3 v) {
+  return glm_vec_isinf(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec_isvalid(vec3 v) {
+  return glm_vec_isvalid(v);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_sign(vec3 v, vec3 dest) {
+  glm_vec_sign(v, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec_sqrt(vec3 v, vec3 dest) {
+  glm_vec_sqrt(v, dest);
+}
diff --git a/src/vec4.c b/src/vec4.c
index f5f6a06..fb22796 100644
--- a/src/vec4.c
+++ b/src/vec4.c
@@ -8,6 +8,24 @@
 #include "../include/cglm/cglm.h"
 #include "../include/cglm/call.h"
 
+CGLM_EXPORT
+void
+glmc_vec4(vec3 v3, float last, vec4 dest) {
+  glm_vec4(v3, last, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_zero(vec4 v) {
+  glm_vec4_zero(v);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_one(vec4 v) {
+  glm_vec4_one(v);
+}
+
 CGLM_EXPORT
 void
 glmc_vec4_copy3(vec4 a, vec3 dest) {
@@ -52,14 +70,32 @@ glmc_vec4_norm2(vec4 vec) {
 
 CGLM_EXPORT
 void
-glmc_vec4_add(vec4 v1, vec4 v2, vec4 dest) {
-  glm_vec4_add(v1, v2, dest);
+glmc_vec4_add(vec4 a, vec4 b, vec4 dest) {
+  glm_vec4_add(a, b, dest);
 }
 
 CGLM_EXPORT
 void
-glmc_vec4_sub(vec4 v1, vec4 v2, vec4 dest) {
-  glm_vec4_sub(v1, v2, dest);
+glmc_vec4_adds(vec4 v, float s, vec4 dest) {
+  glm_vec4_adds(v, s, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_sub(vec4 a, vec4 b, vec4 dest) {
+  glm_vec4_sub(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_subs(vec4 v, float s, vec4 dest) {
+  glm_vec4_subs(v, s, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_mul(vec4 a, vec4 b, vec4 d) {
+  glm_vec4_mul(a, b, d);
 }
 
 CGLM_EXPORT
@@ -70,16 +106,58 @@ glmc_vec4_scale(vec4 v, float s, vec4 dest) {
 
 CGLM_EXPORT
 void
-glmc_vec4_scale_as(vec3 v, float s, vec3 dest) {
+glmc_vec4_scale_as(vec4 v, float s, vec4 dest) {
   glm_vec4_scale_as(v, s, dest);
 }
 
+CGLM_EXPORT
+void
+glmc_vec4_div(vec4 a, vec4 b, vec4 dest) {
+  glm_vec4_div(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_divs(vec4 v, float s, vec4 dest) {
+  glm_vec4_divs(v, s, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
+  glm_vec4_addadd(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
+  glm_vec4_subadd(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_muladd(vec4 a, vec4 b, vec4 dest) {
+  glm_vec4_muladd(a, b, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_muladds(vec4 a, float s, vec4 dest) {
+  glm_vec4_muladds(a, s, dest);
+}
+
 CGLM_EXPORT
 void
 glmc_vec4_flipsign(vec4 v) {
   glm_vec4_flipsign(v);
 }
 
+CGLM_EXPORT
+void
+glmc_vec4_flipsign_to(vec4 v, vec4 dest) {
+  glm_vec4_flipsign_to(v, dest);
+}
+
 CGLM_EXPORT
 void
 glmc_vec4_inv(vec4 v) {
@@ -115,3 +193,95 @@ void
 glmc_vec4_clamp(vec4 v, float minVal, float maxVal) {
   glm_vec4_clamp(v, minVal, maxVal);
 }
+
+CGLM_EXPORT
+void
+glmc_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest) {
+  glm_vec4_lerp(from, to, t, dest);
+}
+
+/* ext */
+
+CGLM_EXPORT
+void
+glmc_vec4_mulv(vec4 a, vec4 b, vec4 d) {
+  glm_vec4_mulv(a, b, d);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_broadcast(float val, vec4 d) {
+  glm_vec4_broadcast(val, d);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_eq(vec4 v, float val) {
+  return glm_vec4_eq(v, val);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_eq_eps(vec4 v, float val) {
+  return glm_vec4_eq_eps(v, val);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_eq_all(vec4 v) {
+  return glm_vec4_eq_all(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_eqv(vec4 v1, vec4 v2) {
+  return glm_vec4_eqv(v1, v2);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_eqv_eps(vec4 v1, vec4 v2) {
+  return glm_vec4_eqv_eps(v1, v2);
+}
+
+CGLM_EXPORT
+float
+glmc_vec4_max(vec4 v) {
+  return glm_vec4_max(v);
+}
+
+CGLM_EXPORT
+float
+glmc_vec4_min(vec4 v) {
+  return glm_vec4_min(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_isnan(vec4 v) {
+  return glm_vec4_isnan(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_isinf(vec4 v) {
+  return glm_vec4_isinf(v);
+}
+
+CGLM_EXPORT
+bool
+glmc_vec4_isvalid(vec4 v) {
+  return glm_vec4_isvalid(v);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_sign(vec4 v, vec4 dest) {
+  glm_vec4_sign(v, dest);
+}
+
+CGLM_EXPORT
+void
+glmc_vec4_sqrt(vec4 v, vec4 dest) {
+  glm_vec4_sqrt(v, dest);
+}
diff --git a/test/src/test_affine.c b/test/src/test_affine.c
new file mode 100644
index 0000000..a625a30
--- /dev/null
+++ b/test/src/test_affine.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#include "test_common.h"
+
+void
+test_affine(void **state) {
+  mat4 t1, t2, t3, t4, t5;
+
+  /* test translate is postmultiplied */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){34, 57, 36});
+
+  glmc_mat4_mul(t1, t2, t3); /* R * T */
+
+  glm_translate(t1, (vec3){34, 57, 36});
+  test_assert_mat4_eq(t1, t3);
+
+  /* test rotate is postmultiplied */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){34, 57, 36});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+
+  glm_rotate(t2, M_PI_4, GLM_YUP);
+  test_assert_mat4_eq(t2, t3);
+
+  /* test scale is postmultiplied */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){34, 57, 36});
+  glm_scale_make(t4, (vec3){3, 5, 6});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+  glmc_mat4_mul(t3, t4, t5); /* T * R * S */
+
+  glm_scale(t3, (vec3){3, 5, 6});
+  test_assert_mat4_eq(t3, t5);
+
+  /* test translate_x */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){34, 0, 0});
+
+  glmc_mat4_mul(t1, t2, t3); /* R * T */
+  glm_translate_x(t1, 34);
+  test_assert_mat4_eq(t1, t3);
+
+  /* test translate_y */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){0, 57, 0});
+
+  glmc_mat4_mul(t1, t2, t3); /* R * T */
+  glm_translate_y(t1, 57);
+  test_assert_mat4_eq(t1, t3);
+
+  /* test translate_z */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){0, 0, 36});
+
+  glmc_mat4_mul(t1, t2, t3); /* R * T */
+  glm_translate_z(t1, 36);
+  test_assert_mat4_eq(t1, t3);
+
+  /* test rotate_x */
+  glmc_rotate_make(t1, M_PI_4, (vec3){1, 0, 0});
+  glm_translate_make(t2, (vec3){34, 57, 36});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+
+  glm_rotate_x(t2, M_PI_4, t2);
+  test_assert_mat4_eq(t2, t3);
+
+  /* test rotate_y */
+  glmc_rotate_make(t1, M_PI_4, (vec3){0, 1, 0});
+  glm_translate_make(t2, (vec3){34, 57, 36});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+
+  glm_rotate_y(t2, M_PI_4, t2);
+  test_assert_mat4_eq(t2, t3);
+
+  /* test rotate_z */
+  glmc_rotate_make(t1, M_PI_4, (vec3){0, 0, 1});
+  glm_translate_make(t2, (vec3){34, 57, 36});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+
+  glm_rotate_z(t2, M_PI_4, t2);
+  test_assert_mat4_eq(t2, t3);
+
+  /* test rotate */
+  glmc_rotate_make(t1, M_PI_4, (vec3){0, 0, 1});
+  glm_translate_make(t2, (vec3){34, 57, 36});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+  glmc_rotate(t2, M_PI_4, (vec3){0, 0, 1});
+
+  test_assert_mat4_eq(t3, t2);
+
+  /* test scale_uni */
+  glmc_rotate_make(t1, M_PI_4, GLM_YUP);
+  glm_translate_make(t2, (vec3){34, 57, 36});
+  glm_scale_make(t4, (vec3){3, 3, 3});
+
+  glmc_mat4_mul(t2, t1, t3); /* T * R */
+  glmc_mat4_mul(t3, t4, t5); /* T * R * S */
+
+  glm_scale_uni(t3, 3);
+  test_assert_mat4_eq(t3, t5);
+}
diff --git a/test/src/test_common.c b/test/src/test_common.c
index 23f2b50..405000d 100644
--- a/test/src/test_common.c
+++ b/test/src/test_common.c
@@ -27,6 +27,50 @@ test_rand_mat4(mat4 dest) {
   /* glm_scale(dest, (vec3){drand48(), drand48(), drand48()}); */
 }
 
+void
+test_rand_mat3(mat3 dest) {
+  mat4 m4;
+
+  srand((unsigned int)time(NULL)); /* NOTE(review): srand() seeds rand(), not drand48() - confirm */
+
+  /* random rotation around random axis with random angle, reduced to a mat3 */
+  glm_rotate_make(m4, drand48(), (vec3){drand48(), drand48(), drand48()});
+  glm_mat4_pick3(m4, dest);
+}
+
+void
+test_rand_vec3(vec3 dest) {
+  srand((unsigned int)time(NULL));
+
+  dest[0] = drand48();
+  dest[1] = drand48();
+  dest[2] = drand48();
+}
+
+void
+test_rand_vec4(vec4 dest) {
+  srand((unsigned int)time(NULL));
+
+  dest[0] = drand48();
+  dest[1] = drand48();
+  dest[2] = drand48();
+  dest[3] = drand48();
+}
+
+float
+test_rand_angle(void) {
+  srand((unsigned int)time(NULL));
+  return drand48();
+}
+
+void
+test_rand_quat(versor q) {
+  srand((unsigned int)time(NULL));
+
+  glm_quat(q, drand48(), drand48(), drand48(), drand48());
+  glm_quat_normalize(q);
+}
+
 void
 test_assert_mat4_eq(mat4 m1, mat4 m2) {
   int i, j, k;
@@ -52,8 +96,50 @@ test_assert_mat4_eq2(mat4 m1, mat4 m2, float eps) {
 }
 
 void
-test_assert_vec3_eq(vec3 v1, vec3 v2) {
-  assert_true(fabsf(v1[0] - v2[0]) <= 0.0000009);
-  assert_true(fabsf(v1[1] - v2[1]) <= 0.0000009);
-  assert_true(fabsf(v1[2] - v2[2]) <= 0.0000009);
+test_assert_mat3_eq(mat3 m1, mat3 m2) {
+  int i, j;
+
+  /* every element of m1 must match m2 within the rounding tolerance */
+  for (i = 0; i < 3; i++) {
+    for (j = 0; j < 3; j++) {
+      assert_true(fabsf(m1[i][j] - m2[i][j]) <= 0.0000009);
+    }
+  }
 }
+
+void
+test_assert_eqf(float a, float b) {
+  assert_true(fabsf(a - b) <= 0.000009); /* rounding errors */
+}
+
+void
+test_assert_vec3_eq(vec3 v1, vec3 v2) {
+  assert_true(fabsf(v1[0] - v2[0]) <= 0.000009); /* rounding errors */
+  assert_true(fabsf(v1[1] - v2[1]) <= 0.000009);
+  assert_true(fabsf(v1[2] - v2[2]) <= 0.000009);
+}
+
+void
+test_assert_vec4_eq(vec4 v1, vec4 v2) {
+  assert_true(fabsf(v1[0] - v2[0]) <= 0.000009); /* rounding errors */
+  assert_true(fabsf(v1[1] - v2[1]) <= 0.000009);
+  assert_true(fabsf(v1[2] - v2[2]) <= 0.000009);
+  assert_true(fabsf(v1[3] - v2[3]) <= 0.000009);
+}
+
+void
+test_assert_quat_eq_abs(versor v1, versor v2) {
+  assert_true(fabsf(fabsf(v1[0]) - fabsf(v2[0])) <= 0.0009); /* rounding errors */
+  assert_true(fabsf(fabsf(v1[1]) - fabsf(v2[1])) <= 0.0009);
+  assert_true(fabsf(fabsf(v1[2]) - fabsf(v2[2])) <= 0.0009);
+  assert_true(fabsf(fabsf(v1[3]) - fabsf(v2[3])) <= 0.0009);
+}
+
+void
+test_assert_quat_eq(versor v1, versor v2) {
+  assert_true(fabsf(v1[0] - v2[0]) <= 0.000009); /* rounding errors */
+  assert_true(fabsf(v1[1] - v2[1]) <= 0.000009);
+  assert_true(fabsf(v1[2] - v2[2]) <= 0.000009);
+  assert_true(fabsf(v1[3] - v2[3]) <= 0.000009);
+}
+
diff --git a/test/src/test_common.h b/test/src/test_common.h
index aeea4d6..7881e7a 100644
--- a/test/src/test_common.h
+++ b/test/src/test_common.h
@@ -25,13 +25,43 @@
 void
 test_rand_mat4(mat4 dest);
 
+void
+test_rand_mat3(mat3 dest);
+
+void
+test_assert_eqf(float a, float b);
+
 void
 test_assert_mat4_eq(mat4 m1, mat4 m2);
 
 void
 test_assert_mat4_eq2(mat4 m1, mat4 m2, float eps);
 
+void
+test_assert_mat3_eq(mat3 m1, mat3 m2);
+
 void
 test_assert_vec3_eq(vec3 v1, vec3 v2);
 
+void
+test_assert_vec4_eq(vec4 v1, vec4 v2);
+
+void
+test_assert_quat_eq(versor v1, versor v2);
+
+void
+test_assert_quat_eq_abs(versor v1, versor v2);
+
+void
+test_rand_vec3(vec3 dest);
+
+void
+test_rand_vec4(vec4 dest);
+
+float
+test_rand_angle(void);
+
+void
+test_rand_quat(versor q);
+
 #endif /* test_common_h */
diff --git a/test/src/test_main.c b/test/src/test_main.c
index 5c1a647..ff77b02 100644
--- a/test/src/test_main.c
+++ b/test/src/test_main.c
@@ -12,6 +12,9 @@ main(int argc, const char * argv[]) {
     /* mat4 */
     cmocka_unit_test(test_mat4),
 
+    /* mat3 */
+    cmocka_unit_test(test_mat3),
+
     /* camera */
     cmocka_unit_test(test_camera_lookat),
     cmocka_unit_test(test_camera_decomp),
@@ -23,7 +26,19 @@ main(int argc, const char * argv[]) {
     cmocka_unit_test(test_clamp),
 
     /* euler */
-    cmocka_unit_test(test_euler)
+    cmocka_unit_test(test_euler),
+
+    /* quaternion */
+    cmocka_unit_test(test_quat),
+
+    /* vec4 */
+    cmocka_unit_test(test_vec4),
+
+    /* vec3 */
+    cmocka_unit_test(test_vec3),
+
+    /* affine */
+    cmocka_unit_test(test_affine)
   };
 
   return cmocka_run_group_tests(tests, NULL, NULL);
diff --git a/test/src/test_mat3.c b/test/src/test_mat3.c
new file mode 100644
index 0000000..d97d1f5
--- /dev/null
+++ b/test/src/test_mat3.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#include "test_common.h"
+
+#define m 3
+#define n 3
+
+void
+test_mat3(void **state) {
+  mat3  m1 = GLM_MAT3_IDENTITY_INIT;
+  mat3  m2 = GLM_MAT3_IDENTITY_INIT;
+  mat3  m3;
+  mat3  m4 = GLM_MAT3_ZERO_INIT;
+  mat3  m5;
+  int   i, j, k;
+
+  /* test identity matrix multiplication */
+  glmc_mat3_mul(m1, m2, m3);
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < n; j++) {
+      if (i == j)
+        assert_true(m3[i][j] == 1.0f);
+      else
+        assert_true(m3[i][j] == 0.0f);
+    }
+  }
+
+  /* test random matrices */
+  /* random matrices */
+  test_rand_mat3(m1);
+  test_rand_mat3(m2);
+
+  glmc_mat3_mul(m1, m2, m3);
+  for (i = 0; i < m; i++) {
+    for (j = 0; j < n; j++) {
+      for (k = 0; k < m; k++)
+        /* column-major */
+        m4[i][j] += m1[k][j] * m2[i][k];
+    }
+  }
+
+  test_assert_mat3_eq(m3, m4);
+
+  for (i = 0; i < 100000; i++) {
+    test_rand_mat3(m3);
+    test_rand_mat3(m4);
+
+    /* test inverse precise */
+    glmc_mat3_inv(m3, m4);
+    glmc_mat3_inv(m4, m5);
+    test_assert_mat3_eq(m3, m5);
+  }
+}
diff --git a/test/src/test_quat.c b/test/src/test_quat.c
new file mode 100644
index 0000000..74d12b5
--- /dev/null
+++ b/test/src/test_quat.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#include "test_common.h"
+
+CGLM_INLINE
+void
+test_quat_mul_raw(versor p, versor q, versor dest) {
+  dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1];
+  dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0];
+  dest[2] = p[3] * q[2] + p[0] * q[1] - p[1] * q[0] + p[2] * q[3];
+  dest[3] = p[3] * q[3] - p[0] * q[0] - p[1] * q[1] - p[2] * q[2];
+}
+
+void
+test_quat(void **state) {
+  mat4   inRot, outRot, view1, view2, rot1, rot2;
+  versor inQuat, outQuat, q3, q4, q5;
+  vec3   eye, axis, imag, v1, v2;
+  int    i;
+
+  /* 0. test identity quat */
+  glm_quat_identity(q4);
+  assert_true(glm_quat_real(q4) == cosf(glm_rad(0.0f) * 0.5f));
+  glm_quat_mat4(q4, rot1);
+  test_assert_mat4_eq2(rot1, GLM_MAT4_IDENTITY, 0.000009);
+
+  /* 1. test quat to mat and mat to quat */
+  for (i = 0; i < 1000; i++) {
+    test_rand_quat(inQuat);
+
+    glmc_quat_mat4(inQuat, inRot);
+    glmc_mat4_quat(inRot, outQuat);
+    glmc_quat_mat4(outQuat, outRot);
+
+    /* 2. test first quat and generated one equality */
+    test_assert_quat_eq_abs(inQuat, outQuat);
+
+    /* 3. test first rot and second rotation */
+    test_assert_mat4_eq2(inRot, outRot, 0.000009); /* almost equal */
+
+    /* 4. test library mul (portable entry point) against raw mul */
+    test_quat_mul_raw(inQuat, outQuat, q3);
+    glm_quat_mul(inQuat, outQuat, q4);
+    test_assert_quat_eq(q3, q4);
+  }
+
+  /* 5. test lookat */
+  test_rand_vec3(eye);
+  glm_quatv(q3, glm_rad(-90.0f), GLM_YUP);
+
+  /* now X axis must be forward axis, Z must be right axis */
+  glm_look(eye, GLM_XUP, GLM_YUP, view1);
+
+  /* create view matrix with quaternion */
+  glm_quat_look(eye, q3, view2);
+
+  test_assert_mat4_eq2(view1, view2, 0.000009);
+
+  /* 6. test quaternion rotation matrix result */
+  test_rand_quat(q3);
+  glm_quat_mat4(q3, rot1);
+
+  /* 6.1 test axis and angle of quat */
+  glm_quat_axis(q3, axis);
+  glm_rotate_make(rot2, glm_quat_angle(q3), axis);
+
+  test_assert_mat4_eq2(rot1, rot2, 0.000009);
+
+  /* 7. test quaternion multiplication (hamilton product),
+        final rotation = first rotation + second = quat1 * quat2
+   */
+  test_rand_quat(q3);
+  test_rand_quat(q4);
+
+  glm_quat_mul(q3, q4, q5);
+
+  glm_quat_axis(q3, axis);
+  glm_rotate_make(rot1, glm_quat_angle(q3), axis);
+
+  glm_quat_axis(q4, axis);
+  glm_rotate(rot1, glm_quat_angle(q4), axis);
+
+  /* rot1 now combines both rotations; rot2 is the quaternion product result */
+  glm_quat_mat4(q5, rot2);
+
+  /* result must be same (almost) */
+  test_assert_mat4_eq2(rot1, rot2, 0.000009);
+
+  /* 8. test quaternion for look rotation */
+
+  /* 8.1 same direction */
+  /* look at from 0, 0, 1 to zero, direction = 0, 0, -1 */
+  glm_quat_for((vec3){0, 0, -1}, (vec3){0, 0, -1}, GLM_YUP, q3);
+
+  /* result must be identity */
+  glm_quat_identity(q4);
+  test_assert_quat_eq(q3, q4);
+
+  /* look at from 0, 0, 1 to zero, direction = 0, 0, -1 */
+  glm_quat_forp(GLM_ZUP, GLM_VEC3_ZERO, (vec3){0, 0, -1}, GLM_YUP, q3);
+
+  /* result must be identity */
+  glm_quat_identity(q4);
+  test_assert_quat_eq(q3, q4);
+
+  /* 8.2 perpendicular */
+  glm_quat_for(GLM_XUP, (vec3){0, 0, -1}, GLM_YUP, q3);
+
+  /* result must be -90 */
+  glm_quatv(q4, glm_rad(-90.0f), GLM_YUP);
+  test_assert_quat_eq(q3, q4);
+
+  /* 9. test imag, real */
+
+  /* 9.1 real */
+  assert_true(glm_quat_real(q4) == cosf(glm_rad(-90.0f) * 0.5f));
+
+  /* 9.2 imag */
+  glm_quat_imag(q4, imag);
+
+  /* axis = Y_UP * sinf(angle * 0.5), YUP = 0, 1, 0 */
+  axis[0] = 0.0f;
+  axis[1] = sinf(glm_rad(-90.0f) * 0.5f) * 1.0f;
+  axis[2] = 0.0f;
+
+  assert_true(glm_vec_eqv_eps(imag, axis));
+
+  /* 9.3 axis */
+  glm_quat_axis(q4, axis);
+  imag[0] =  0.0f;
+  imag[1] = -1.0f;
+  imag[2] =  0.0f;
+
+  test_assert_vec3_eq(imag, axis);
+
+  /* 10. test rotate vector using quat */
+  /* (0,0,-1) around (1,0,0) must give (0,1,0) */
+  v1[0] = 0.0f; v1[1] = 0.0f; v1[2] = -1.0f;
+  v2[0] = 0.0f; v2[1] = 0.0f; v2[2] = -1.0f;
+
+  glm_vec_rotate(v1, glm_rad(90.0f), (vec3){1.0f, 0.0f, 0.0f});
+  glm_quatv(q3, glm_rad(90.0f), (vec3){1.0f, 0.0f, 0.0f});
+
+  glm_vec4_scale(q3, 1.5, q3);
+  glm_quat_rotatev(q3, v2, v2);
+
+  /* result must be : (0,1,0) */
+  assert_true(fabsf(v1[0]) <= 0.00009f
+              && fabsf(v1[1] - 1.0f) <= 0.00009f
+              && fabsf(v1[2]) <= 0.00009f);
+
+  test_assert_vec3_eq(v1, v2);
+
+  /* 11. test rotate transform */
+  glm_translate_make(rot1, (vec3){-10.0, 45.0f, 8.0f});
+  glm_rotate(rot1, glm_rad(-90), GLM_ZUP);
+
+  glm_quatv(q3, glm_rad(-90.0f), GLM_ZUP);
+  glm_translate_make(rot2, (vec3){-10.0, 45.0f, 8.0f});
+  glm_quat_rotate(rot2, q3, rot2);
+
+  /* result must be same (almost) */
+  test_assert_mat4_eq2(rot1, rot2, 0.000009);
+
+  glm_rotate_make(rot1, glm_rad(-90), GLM_ZUP);
+  glm_translate(rot1, (vec3){-10.0, 45.0f, 8.0f});
+
+  glm_quatv(q3, glm_rad(-90.0f), GLM_ZUP);
+  glm_mat4_identity(rot2);
+  glm_quat_rotate(rot2, q3, rot2);
+  glm_translate(rot2, (vec3){-10.0, 45.0f, 8.0f});
+
+  /* result must be same (almost) */
+  test_assert_mat4_eq2(rot1, rot2, 0.000009);
+
+  /* reverse */
+  glm_rotate_make(rot1, glm_rad(-90), GLM_ZUP);
+  glm_quatv(q3, glm_rad(90.0f), GLM_ZUP);
+  glm_quat_rotate(rot1, q3, rot1);
+
+  /* result must be identity */
+  test_assert_mat4_eq2(rot1, GLM_MAT4_IDENTITY, 0.000009);
+
+  test_rand_quat(q3);
+
+  /* 12. inverse of quat, multiplication must be IDENTITY */
+  glm_quat_inv(q3, q4);
+  glm_quat_mul(q3, q4, q5);
+
+  glm_quat_identity(q3);
+  test_assert_quat_eq(q3, q5);
+
+  /* TODO: add tests for slerp, lerp */
+}
diff --git a/test/src/test_tests.h b/test/src/test_tests.h
index 7234782..7b9cf0a 100644
--- a/test/src/test_tests.h
+++ b/test/src/test_tests.h
@@ -9,6 +9,9 @@
 /* mat4 */
 void test_mat4(void **state);
 
+/* mat3 */
+void test_mat3(void **state);
+
 /* camera */
 void
 test_camera_lookat(void **state);
@@ -25,4 +28,16 @@ test_clamp(void **state);
 void
 test_euler(void **state);
 
+void
+test_quat(void **state);
+
+void
+test_vec4(void **state);
+
+void
+test_vec3(void **state);
+
+void
+test_affine(void **state);
+
 #endif /* test_tests_h */
diff --git a/test/src/test_vec3.c b/test/src/test_vec3.c
new file mode 100644
index 0000000..dd0c55d
--- /dev/null
+++ b/test/src/test_vec3.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#include "test_common.h"
+
+void
+test_vec3(void **state) {
+  mat3 rot1m3;
+  mat4 rot1;
+  vec3 v, v1, v2;
+
+  /* test zero */
+  glm_vec_zero(v);
+  test_assert_vec3_eq(GLM_VEC3_ZERO, v);
+
+  /* test one */
+  glm_vec_one(v);
+  test_assert_vec3_eq(GLM_VEC3_ONE, v);
+
+  /* adds, subs, div, divs, mul */
+  glm_vec_add(v, GLM_VEC3_ONE, v);
+  assert_true(glmc_vec_eq_eps(v, 2));
+
+  glm_vec_adds(v, 10, v);
+  assert_true(glmc_vec_eq_eps(v, 12));
+
+  glm_vec_sub(v, GLM_VEC3_ONE, v);
+  assert_true(glmc_vec_eq_eps(v, 11));
+
+  glm_vec_subs(v, 1, v);
+  assert_true(glmc_vec_eq_eps(v, 10));
+
+  glm_vec_broadcast(2, v1);
+  glm_vec_div(v, v1, v);
+  assert_true(glmc_vec_eq_eps(v, 5));
+
+  glm_vec_divs(v, 0.5, v);
+  assert_true(glmc_vec_eq_eps(v, 10));
+
+  glm_vec_mul(v, v1, v);
+  assert_true(glmc_vec_eq_eps(v, 20));
+
+  glm_vec_scale(v, 0.5, v);
+  assert_true(glmc_vec_eq_eps(v, 10));
+
+  glm_vec_normalize_to(v, v1);
+  glm_vec_scale(v1, 0.8, v1);
+  glm_vec_scale_as(v, 0.8, v);
+  test_assert_vec3_eq(v1, v);
+
+  /* addadd, subadd, muladd */
+  glm_vec_one(v);
+
+  glm_vec_addadd(GLM_VEC3_ONE, GLM_VEC3_ONE, v);
+  assert_true(glmc_vec_eq_eps(v, 3));
+
+  glm_vec_subadd(GLM_VEC3_ONE, GLM_VEC3_ZERO, v);
+  assert_true(glmc_vec_eq_eps(v, 4));
+
+  glm_vec_broadcast(2, v1);
+  glm_vec_broadcast(3, v2);
+  glm_vec_muladd(v1, v2, v);
+  assert_true(glmc_vec_eq_eps(v, 10));
+
+  /* rotate */
+  glm_vec_copy(GLM_YUP, v);
+  glm_rotate_make(rot1, glm_rad(90), GLM_XUP);
+  glm_vec_rotate_m4(rot1, v, v1);
+  glm_mat4_pick3(rot1, rot1m3);
+  glm_vec_rotate_m3(rot1m3, v, v2);
+
+  test_assert_vec3_eq(v1, v2);
+  test_assert_vec3_eq(v1, GLM_ZUP);
+}
diff --git a/test/src/test_vec4.c b/test/src/test_vec4.c
new file mode 100644
index 0000000..8e4fda5
--- /dev/null
+++ b/test/src/test_vec4.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#include "test_common.h"
+
+CGLM_INLINE
+float
+test_vec4_dot(vec4 a, vec4 b) { /* plain 4-term dot product; test_vec4 checks glm_vec4_dot against it */
+  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
+}
+
+/* reference normalize: dest = vec scaled by 1 / norm, or all zeros if norm is 0 */
+CGLM_INLINE
+void
+test_vec4_normalize_to(vec4 vec, vec4 dest) {
+  float len = glm_vec4_norm(vec);
+
+  if (len != 0.0f) {
+    glm_vec4_scale(vec, 1.0f / len, dest);
+    return;
+  }
+
+  /* zero norm: nothing to scale, emit the zero vector */
+  dest[0] = dest[1] = dest[2] = dest[3] = 0.0f;
+}
+
+float
+test_vec4_norm2(vec4 vec) { /* squared norm; test_vec4 checks glm_vec4_norm2 against it */
+  return test_vec4_dot(vec, vec);
+}
+
+float
+test_vec4_norm(vec4 vec) { /* norm via sqrtf; test_vec4 checks glm_vec4_norm against it */
+  return sqrtf(test_vec4_dot(vec, vec));
+}
+
+void
+test_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) {
+  int i;
+
+  for (i = 0; i < 4; i++) /* reference: glm_max applied per component */
+    dest[i] = glm_max(v1[i], v2[i]);
+}
+
+void
+test_vec4_minv(vec4 v1, vec4 v2, vec4 dest) {
+  int i;
+
+  for (i = 0; i < 4; i++) /* reference: glm_min applied per component */
+    dest[i] = glm_min(v1[i], v2[i]);
+}
+
+void
+test_vec4_clamp(vec4 v, float minVal, float maxVal) {
+  int i;
+
+  for (i = 0; i < 4; i++) /* reference: glm_clamp each component in place */
+    v[i] = glm_clamp(v[i], minVal, maxVal);
+}
+
+void
+test_vec4(void **state) { /* vec4 test case; `state` is never read in this body */
+  vec4  v, v1, v2, v3, v4;
+  int   i;
+  float d1, d2;
+
+
+  for (i = 0; i < 1000; i++) { /* 1000 rounds on vectors filled by test_rand_vec4 */
+    /* 1. test SSE/SIMD dot product */
+    test_rand_vec4(v);
+    d1 = glm_vec4_dot(v, v);
+    d2 = test_vec4_dot(v, v);
+
+    assert_true(fabsf(d1 - d2) <= 0.000009); /* library result vs. test_vec4_dot */
+
+    /* 2. test SIMD normalize */
+    test_vec4_normalize_to(v, v1);
+    glm_vec4_normalize_to(v, v2);
+    glm_vec4_normalize(v);
+
+    /* all must be same */
+    test_assert_vec4_eq(v1, v2);
+    test_assert_vec4_eq(v, v2);
+
+    /* 3. test SIMD norm */
+    test_rand_vec4(v);
+    test_assert_eqf(test_vec4_norm(v), glm_vec4_norm(v));
+
+    /* 4. test SIMD norm2 */
+    test_rand_vec4(v);
+    test_assert_eqf(test_vec4_norm2(v), glm_vec4_norm2(v));
+  }
+
+  /* test zero: result of glm_vec4_zero must match GLM_VEC4_ZERO */
+  glm_vec4_zero(v);
+  test_assert_vec4_eq(GLM_VEC4_ZERO, v);
+
+  /* test one: result of glm_vec4_one must match GLM_VEC4_ONE */
+  glm_vec4_one(v);
+  test_assert_vec4_eq(GLM_VEC4_ONE, v);
+
+  /* add, adds, sub, subs, div, divs, mul, scale: v carries over between steps */
+  glm_vec4_add(v, GLM_VEC4_ONE, v); /* 1 + 1 -> 2 */
+  assert_true(glmc_vec4_eq_eps(v, 2));
+
+  glm_vec4_adds(v, 10, v); /* 2 + 10 -> 12 */
+  assert_true(glmc_vec4_eq_eps(v, 12));
+
+  glm_vec4_sub(v, GLM_VEC4_ONE, v); /* 12 - 1 -> 11 */
+  assert_true(glmc_vec4_eq_eps(v, 11));
+
+  glm_vec4_subs(v, 1, v); /* 11 - 1 -> 10 */
+  assert_true(glmc_vec4_eq_eps(v, 10));
+
+  glm_vec4_broadcast(2, v1); /* v1 is reused as divisor and multiplier below */
+  glm_vec4_div(v, v1, v); /* 10 / 2 -> 5 */
+  assert_true(glmc_vec4_eq_eps(v, 5));
+
+  glm_vec4_divs(v, 0.5, v); /* 5 / 0.5 -> 10 */
+  assert_true(glmc_vec4_eq_eps(v, 10));
+
+  glm_vec4_mul(v, v1, v); /* 10 * 2 -> 20 */
+  assert_true(glmc_vec4_eq_eps(v, 20));
+
+  glm_vec4_scale(v, 0.5, v); /* 20 * 0.5 -> 10 */
+  assert_true(glmc_vec4_eq_eps(v, 10));
+
+  glm_vec4_normalize_to(v, v1); /* scale_as(v, 0.8) must equal normalize + scale by 0.8 */
+  glm_vec4_scale(v1, 0.8, v1);
+  glm_vec4_scale_as(v, 0.8, v);
+  test_assert_vec4_eq(v1, v);
+
+  /* addadd, subadd, muladd: each call accumulates into v, starting from one */
+  glm_vec4_one(v);
+
+  glm_vec4_addadd(GLM_VEC4_ONE, GLM_VEC4_ONE, v); /* 1 + (1 + 1) -> 3 */
+  assert_true(glmc_vec4_eq_eps(v, 3));
+
+  glm_vec4_subadd(GLM_VEC4_ONE, GLM_VEC4_ZERO, v); /* 3 + (1 - 0) -> 4 */
+  assert_true(glmc_vec4_eq_eps(v, 4));
+
+  glm_vec4_broadcast(2, v1);
+  glm_vec4_broadcast(3, v2);
+  glm_vec4_muladd(v1, v2, v); /* 4 + (2 * 3) -> 10 */
+  assert_true(glmc_vec4_eq_eps(v, 10));
+
+  /* min, max: library result (v3) vs. test_vec4_maxv/minv reference (v4) */
+  test_rand_vec4(v1);
+  test_rand_vec4(v2);
+
+  glm_vec4_maxv(v1, v2, v3);
+  test_vec4_maxv(v1, v2, v4);
+  test_assert_vec4_eq(v3, v4);
+
+  glm_vec4_minv(v1, v2, v3);
+  test_vec4_minv(v1, v2, v4);
+  test_assert_vec4_eq(v3, v4);
+
+  glm_vec4_print(v3, stderr); /* NOTE(review): looks like leftover debug output - confirm */
+  glm_vec4_print(v4, stderr);
+
+  /* clamp: v3 and v4 were just asserted equal, so both clamps see the same input */
+  glm_vec4_clamp(v3, 0.1, 0.8);
+  test_vec4_clamp(v4, 0.1, 0.8);
+  test_assert_vec4_eq(v3, v4);
+
+  glm_vec4_print(v3, stderr); /* NOTE(review): looks like leftover debug output - confirm */
+  glm_vec4_print(v4, stderr);
+
+  assert_true(v3[0] >= 0.0999 && v3[0] <= 0.80001); /* rounding errors */
+  assert_true(v3[1] >= 0.0999 && v3[1] <= 0.80001);
+  assert_true(v3[2] >= 0.0999 && v3[2] <= 0.80001);
+  assert_true(v3[3] >= 0.0999 && v3[3] <= 0.80001);
+}