Przeglądaj źródła

New WIP template math lib. Fix a bug in the quad drawing

Panagiotis Christopoulos Charitos 12 lat temu
rodzic
commit
6b56b6ab1f

+ 190 - 0
include/anki/math/Axisang.h

@@ -9,6 +9,196 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// Axis angles. Used for rotations
+template<typename T>
+class TAxisang
+{
+public:
+	/// @name Constructors
+	/// @{
+	explicit TAxisang()
+		: ang(0), axis(0)
+	{}
+	
+	TAxisang(const TAxisang& b)
+		: ang(b.ang), axis(b.axis)
+	{}
+
+	explicit TAxisang(const T rad, const TVec3<T>& axis_)
+		: ang(rad), axis(axis_)
+	{}
+
+	/// Build the axis-angle from a quaternion. The math assumes a unit
+	/// quaternion. When the rotation angle is (nearly) zero the axis cannot
+	/// be recovered and it is set to the zero vector
+	explicit TAxisang(const TQuat<T>& q)
+	{
+		// Rounding can push w slightly outside [-1, 1], which would make
+		// acos() and sqrt() below return NaN. Clamp it first
+		T w = q.w();
+		if(w > T(1))
+		{
+			w = T(1);
+		}
+		else if(w < T(-1))
+		{
+			w = T(-1);
+		}
+
+		ang = 2.0 * acos(w);
+		// sin(ang / 2), used to scale the vector part back to unit length
+		T length = sqrt(1.0 - w * w);
+		if(isZero<T>(length))
+		{
+			axis = TVec3<T>(0.0);
+		}
+		else
+		{
+			length = 1.0 / length;
+			axis = TVec3<T>(q.x() * length, q.y() * length, q.z() * length);
+		}
+	}
+
+	/// Build the axis-angle from a rotation matrix.
+	///
+	/// A symmetric matrix means the angle is either 0 or pi, and those two
+	/// singular cases are handled separately. Everything else takes the axis
+	/// from the antisymmetric part of the matrix and the angle from its trace
+	explicit TAxisang(const TMat3<T>& m3)
+	{
+		if((fabs(m3(0, 1) - m3(1, 0)) < getEpsilon<T>())
+			&& (fabs(m3(0, 2) - m3(2, 0)) < getEpsilon<T>())
+			&& (fabs(m3(1, 2) - m3(2, 1)) < getEpsilon<T>()))
+		{
+			// Singularity found. It is the identity only when the
+			// off-diagonal sums are near zero AND the trace is close to 3.
+			// The trace test has to be |trace - 3|: with |trace| - 3 it
+			// passes for every rotation and a 180 degree turn around a
+			// principal axis is mistaken for the identity
+			if((fabs(m3(0, 1) + m3(1, 0)) < 0.1)
+				&& (fabs(m3(0, 2) + m3(2, 0)) < 0.1)
+				&& (fabs(m3(1, 2) + m3(2, 1)) < 0.1)
+				&& (fabs(m3(0, 0) + m3(1, 1) + m3(2, 2) - 3) < 0.1))
+			{
+				// Zero angle, the axis is arbitrary
+				axis = TVec3<T>(1.0, 0.0, 0.0);
+				ang = 0.0;
+				return;
+			}
+
+			// Otherwise it is a 180 degree rotation. The diagonal gives the
+			// magnitude of every axis component
+			ang = getPi<T>();
+			axis.x() = (m3(0, 0)+1) / 2.0;
+			if(axis.x() > 0.0)
+			{
+				axis.x() = sqrt(axis.x());
+			}
+			else
+			{
+				axis.x() = 0;
+			}
+			axis.y() = (m3(1, 1)+1)/2;
+			if(axis.y() > 0)
+			{
+				axis.y() = sqrt(axis.y());
+			}
+			else
+			{
+				axis.y() = 0;
+			}
+
+			axis.z() = (m3(2, 2)+1)/2;
+			if(axis.z() > 0)
+			{
+				axis.z() = sqrt(axis.z());
+			}
+			else
+			{
+				axis.z() = 0.0;
+			}
+
+			// The square roots lose the signs. Recover the relative signs
+			// from the off-diagonal elements
+			Bool xZero = (fabs(axis.x()) < getEpsilon<T>());
+			Bool yZero = (fabs(axis.y()) < getEpsilon<T>());
+			Bool zZero = (fabs(axis.z()) < getEpsilon<T>());
+			Bool xyPositive = (m3(0, 1) > 0);
+			Bool xzPositive = (m3(0, 2) > 0);
+			Bool yzPositive = (m3(1, 2) > 0);
+			if(xZero && !yZero && !zZero)
+			{
+				if(!yzPositive)
+				{
+					axis.y() = -axis.y();
+				}
+			}
+			else if(yZero && !zZero)
+			{
+				if(!xzPositive)
+				{
+					axis.z() = -axis.z();
+				}
+			}
+			else if(zZero)
+			{
+				if(!xyPositive)
+				{
+					axis.x() = -axis.x();
+				}
+			}
+
+			return;
+		}
+
+		// General case. s is the length of the (unnormalized) axis that is
+		// built from the antisymmetric part of the matrix
+		T s = sqrt((m3(2, 1) - m3(1, 2)) * (m3(2, 1) - m3(1, 2))
+			+ (m3(0, 2) - m3(2, 0)) * (m3(0, 2) - m3(2, 0))
+			+ (m3(1, 0) - m3(0, 1)) * (m3(1, 0) - m3(0, 1)));
+
+		// Avoid the division by zero
+		if(fabs(s) < 0.001)
+		{
+			s = 1;
+		}
+
+		ang = acos((m3(0, 0) + m3(1, 1) + m3(2, 2) - 1) / 2);
+		axis.x() = (m3(2, 1) - m3(1, 2)) / s;
+		axis.y() = (m3(0, 2) - m3(2, 0)) / s;
+		axis.z() = (m3(1, 0) - m3(0, 1)) / s;
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T getAngle() const
+	{
+		return ang;
+	}
+
+	T& getAngle()
+	{
+		return ang;
+	}
+
+	void setAngle(const T a)
+	{
+		ang = a;
+	}
+
+	const TVec3<T>& getAxis() const
+	{
+		return axis;
+	}
+
+	TVec3<T>& getAxis()
+	{
+		return axis;
+	}
+
+	void setAxis(const TVec3<T>& a)
+	{
+		axis = a;
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// @{
+	TAxisang& operator=(const TAxisang& b)
+	{
+		ang = b.ang;
+		axis = b.axis;
+		return *this;
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+	std::string toString()
+	{
+		std::string s;
+		//std::string s = << "axis: " << a.getAxis() << ", angle: " << a.getAngle();
+		return s;
+	}
+	/// @}
+
+private:
+	/// @name Data
+	/// @{
+	T ang;
+	TVec3<T> axis;
+	/// @}
+};
+
 /// Axis angles. Used for rotations
 class Axisang
 {

+ 2 - 0
include/anki/math/CommonIncludes.h

@@ -1,5 +1,7 @@
 #include "anki/math/Forward.h"
+#include "anki/math/Functions.h"
 #include "anki/math/Simd.h"
 #include "anki/util/StdTypes.h"
 #include "anki/util/Array.h"
 #include <iosfwd>
+#include <string>

+ 161 - 0
include/anki/math/Euler.h

@@ -8,6 +8,167 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// Euler angles. Used for rotations. It cannot describe a rotation
+/// accurately though
+template<typename T>
+class TEuler
+{
+public:
+	/// @name Constructors
+	/// @{
+	explicit TEuler()
+	{
+		x() = y() = z() = 0.0;
+	}
+
+	explicit TEuler(const T x_, const T y_, const T z_)
+	{
+		x() = x_;
+		y() = y_;
+		z() = z_;
+	}
+
+	TEuler(const TEuler& b)
+	{
+		x() = b.x();
+		y() = b.y();
+		z() = b.z();
+	}
+
+	explicit TEuler(const TQuat<T>& q)
+	{
+		T test = q.x() * q.y() + q.z() * q.w();
+		if(test > 0.499)
+		{
+			y() = 2.0 * atan2(q.x(), q.w());
+			z() = getPi<T>() / 2.0;
+			x() = 0.0;
+			return;
+		}
+		if(test < -0.499)
+		{
+			y() = -2.0 * atan2(q.x(), q.w());
+			z() = -getPi<T>() / 2.0;
+			x() = 0.0;
+			return;
+		}
+
+		T sqx = q.x() * q.x();
+		T sqy = q.y() * q.y();
+		T sqz = q.z() * q.z();
+		y() = atan2(2.0 * q.y() * q.w() - 2.0 * q.x() * q.z(),
+			1.0 - 2.0 * sqy - 2.0 * sqz);
+		z() = asin(2.0 * test);
+		x() = atan2(2.0 * q.x() * q.w() - 2.0 * q.y() * q.z(),
+			1.0 - 2.0 * sqx - 2.0 * sqz);
+	}
+
+	explicit TEuler(const TMat3<T>& m3)
+	{
+		T cx, sx;
+		T cy, sy;
+		T cz, sz;
+
+		sy = m3(0, 2);
+		cy = sqrt(1.0 - sy * sy);
+		// normal case
+		if (!isZero<T>(cy))
+		{
+			T factor = 1.0 / cy;
+			sx = -m3(1, 2) * factor;
+			cx = m3(2, 2) * factor;
+			sz = -m3(0, 1) * factor;
+			cz = m3(0, 0) * factor;
+		}
+		// x and z axes aligned
+		else
+		{
+			sz = 0.0;
+			cz = 1.0;
+			sx = m3(2, 1);
+			cx = m3(1, 1);
+		}
+
+		z() = atan2(sz, cz);
+		y() = atan2(sy, cy);
+		x() = atan2(sx, cx);
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T& operator [](const U i)
+	{
+		return arr[i];
+	}
+
+	T operator [](const U i) const
+	{
+		return arr[i];
+	}
+
+	T& x()
+	{
+		return vec.x;
+	}
+
+	T x() const
+	{
+		return vec.x;
+	}
+
+	T& y()
+	{
+		return vec.y;
+	}
+
+	T y() const
+	{
+		return vec.y;
+	}
+
+	T& z()
+	{
+		return vec.z;
+	}
+
+	T z() const
+	{
+		return vec.z;
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// @{
+	TEuler& operator=(const TEuler& b)
+	{
+		x() = b.x();
+		y() = b.y();
+		z() = b.z();
+		return *this;
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+	std::string toString();
+	/// @}
+
+private:
+	/// @name Data
+	/// @{
+	union
+	{
+		struct
+		{
+			T x, y, z;
+		} vec;
+
+		Array<T, 3> arr;
+	};
+	/// @}
+};
+
 /// Euler angles. Used for rotations. It cannot describe a rotation
 /// accurately though
 class Euler

+ 10 - 0
include/anki/math/Forward.h

@@ -16,6 +16,16 @@ class Mat4;
 class Transform;
 class F16;
 
+template<typename T> class TVec2;
+template<typename T> class TVec3;
+template<typename T> class TVec4;
+template<typename T> class TMat3;
+template<typename T> class TMat4;
+template<typename T> class TQuat;
+template<typename T> class TTransform;
+template<typename T> class TAxisang;
+template<typename T> class TEuler;
+
 } // end namespace
 
 #endif

+ 3 - 7
include/anki/math/Functions.h

@@ -36,14 +36,10 @@ constexpr F64 getEpsilon<F64>()
 	return 1.0e-6;
 }
 
-inline Bool isZero(const F32 f)
+template<typename T>
+inline Bool isZero(const T f)
 {
-	return fabs(f) < getEpsilon<F32>();
-}
-
-inline Bool isZero(const F64 f)
-{
-	return fabs(f) < getEpsilon<F64>();
+	return fabs(f) < getEpsilon<T>();
 }
 
 inline F32 toRad(const F32 degrees)

+ 776 - 0
include/anki/math/Mat3.h

@@ -8,6 +8,782 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// 3x3 Matrix. Mainly used for rotations. It includes many helpful member
+/// functions. It's row major. The columns are the x, y, z axes
+template<typename T>
+class TMat3
+{
+public:
+	/// @name Constructors
+	/// @{
+	explicit TMat3() 
+	{}
+
+	explicit TMat3(const T f)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] = f;
+		}
+	}
+
+	explicit TMat3(const T m00, const T m01, const T m02,
+		const T m10, const T m11, const T m12,
+		const T m20, const T m21, const T m22)
+	{
+		arr2[0][0] = m00;
+		arr2[0][1] = m01;
+		arr2[0][2] = m02;
+		arr2[1][0] = m10;
+		arr2[1][1] = m11;
+		arr2[1][2] = m12;
+		arr2[2][0] = m20;
+		arr2[2][1] = m21;
+		arr2[2][2] = m22;
+	}
+
+	explicit TMat3(const T arr[])
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			(*this)[i] = arr[i];
+		}
+	}
+
+	TMat3(const TMat3& b)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] = b.arr1[i];
+		}
+	}
+
+	/// TQuat to TMat3. 12 muls, 12 adds
+	explicit TMat3(const TQuat<T>& q)
+	{
+		// If length is > 1 + 0.002 or < 1 - 0.002 then not normalized quat
+		ANKI_ASSERT(fabs(1.0 - q.getLength()) <= 0.002);
+
+		T xs, ys, zs, wx, wy, wz, xx, xy, xz, yy, yz, zz;
+
+		xs = q.x() + q.x();
+		ys = q.y() + q.y();
+		zs = q.z() + q.z();
+		wx = q.w() * xs;
+		wy = q.w() * ys;
+		wz = q.w() * zs;
+		xx = q.x() * xs;
+		xy = q.x() * ys;
+		xz = q.x() * zs;
+		yy = q.y() * ys;
+		yz = q.y() * zs;
+		zz = q.z() * zs;
+
+		arr2[0][0] = T(1) - (yy + zz);
+		arr2[0][1] = xy - wz;
+		arr2[0][2] = xz + wy;
+
+		arr2[1][0] = xy + wz;
+		arr2[1][1] = T(1) - (xx + zz);
+		arr2[1][2] = yz - wx;
+
+		arr2[2][0] = xz - wy;
+		arr2[2][1] = yz + wx;
+		arr2[2][2] = T(1) - (xx + yy);
+	}
+
+	explicit TMat3(const TEuler<T>& e)
+	{
+		T ch, sh, ca, sa, cb, sb;
+		sinCos(e.y(), sh, ch);
+		sinCos(e.z(), sa, ca);
+		sinCos(e.x(), sb, cb);
+
+		arr2[0][0] = ch * ca;
+		arr2[0][1] = sh * sb - ch * sa * cb;
+		arr2[0][2] = ch * sa * sb + sh * cb;
+		arr2[1][0] = sa;
+		arr2[1][1] = ca * cb;
+		arr2[1][2] = -ca * sb;
+		arr2[2][0] = -sh * ca;
+		arr2[2][1] = sh * sa * cb + ch * sb;
+		arr2[2][2] = -sh * sa * sb + ch * cb;
+	}
+
+	explicit TMat3(const TAxisang<T>& axisang)
+	{
+		// Not normalized axis
+		ANKI_ASSERT(isZero(1.0 - axisang.getAxis().getLength()));
+
+		T c, s;
+		sinCos(axisang.getAngle(), s, c);
+		T t = 1.0 - c;
+
+		const TVec3<T>& axis = axisang.getAxis();
+		arr2[0][0] = c + axis.x() * axis.x() * t;
+		arr2[1][1] = c + axis.y() * axis.y() * t;
+		arr2[2][2] = c + axis.z() * axis.z() * t;
+
+		T tmp1 = axis.x() * axis.y() * t;
+		T tmp2 = axis.z() * s;
+		arr2[1][0] = tmp1 + tmp2;
+		arr2[0][1] = tmp1 - tmp2;
+		tmp1 = axis.x() * axis.z() * t;
+		tmp2 = axis.y() * s;
+		arr2[2][0] = tmp1 - tmp2;
+		arr2[0][2] = tmp1 + tmp2;
+		tmp1 = axis.y() * axis.z() * t;
+		tmp2 = axis.x() * s;
+		arr2[2][1] = tmp1 + tmp2;
+		arr2[1][2] = tmp1 - tmp2;
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T& operator()(const U i, const U j)
+	{
+		return arr2[i][j];
+	}
+
+	const T& operator()(const U i, const U j) const
+	{
+		return arr2[i][j];
+	}
+
+	T& operator[](const U i)
+	{
+		return arr1[i];
+	}
+
+	const T& operator[](const U i) const
+	{
+		return arr1[i];
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// @{
+	TMat3& operator=(const TMat3& b)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] = b.arr1[i];
+		}
+		return (*this);
+	}
+
+	TMat3 operator+(const TMat3& b) const
+	{
+		TMat3<T> c;
+		for(U i = 0; i < 9; i++)
+		{
+			c.arr1[i] = arr1[i] + b.arr1[i];
+		}
+		return c;
+	}
+
+	TMat3& operator+=(const TMat3& b)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] += b.arr1[i];
+		}
+		return (*this);
+	}
+
+	TMat3 operator-(const TMat3& b) const
+	{
+		TMat3 c;
+		for(U i = 0; i < 9; i++)
+		{
+			c.arr1[i] = arr1[i] - b.arr1[i];
+		}
+		return c;
+	}
+
+	TMat3& operator-=(const TMat3& b)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] -= b.arr1[i];
+		}
+		return (*this);
+	}
+
+	/// @note 27 muls, 18 adds
+	TMat3 operator*(const TMat3& b) const
+	{
+		TMat3 c;
+		c(0, 0) = arr2[0][0] * b(0, 0) + arr2[0][1] * b(1, 0)
+			+ arr2[0][2] * b(2, 0);
+		c(0, 1) = arr2[0][0] * b(0, 1) + arr2[0][1] * b(1, 1)
+			+ arr2[0][2] * b(2, 1);
+		c(0, 2) = arr2[0][0] * b(0, 2) + arr2[0][1] * b(1, 2)
+			+ arr2[0][2] * b(2, 2);
+		c(1, 0) = arr2[1][0] * b(0, 0) + arr2[1][1] * b(1, 0)
+			+ arr2[1][2] * b(2, 0);
+		c(1, 1) = arr2[1][0] * b(0, 1) + arr2[1][1] * b(1, 1)
+			+ arr2[1][2] * b(2, 1);
+		c(1, 2) = arr2[1][0] * b(0, 2) + arr2[1][1] * b(1, 2)
+			+ arr2[1][2] * b(2, 2);
+		c(2, 0) = arr2[2][0] * b(0, 0) + arr2[2][1] * b(1, 0)
+			+ arr2[2][2] * b(2, 0);
+		c(2, 1) = arr2[2][0] * b(0, 1) + arr2[2][1] * b(1, 1)
+			+ arr2[2][2] * b(2, 1);
+		c(2, 2) = arr2[2][0] * b(0, 2) + arr2[2][1] * b(1, 2)
+			+ arr2[2][2] * b(2, 2);
+		
+		return c;
+	}
+
+	TMat3& operator*=(const TMat3& b)
+	{
+		(*this) = (*this) * b;
+		return (*this);
+	}
+
+	TMat3 operator/(const TMat3& b) const;
+	TMat3& operator/=(const TMat3& b);
+
+	Bool operator==(const TMat3& b) const
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			if(!isZero<T>(arr1[i] - b.arr1[i]))
+			{
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/// Negation of operator==: true when at least one pair of elements
+	/// differs by more than the isZero tolerance
+	Bool operator!=(const TMat3& b) const
+	{
+		return !((*this) == b);
+	}
+	/// @}
+
+	/// @name Operators with T
+	/// @{
+	TMat3 operator+(const T f) const
+	{
+		TMat3 c;
+		for(U i = 0; i < 9; i++)
+		{
+			c.arr1[i] = arr1[i] + f;
+		}
+		return c;
+	}
+
+	TMat3& operator+=(const T f)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] += f;
+		}
+		return (*this);
+	}
+
+	TMat3 operator-(const T f) const
+	{
+		TMat3 c;
+		for(U i = 0; i < 9; i++)
+		{
+			c.arr1[i] = arr1[i] - f;
+		}
+		return c;
+	}
+
+	TMat3& operator-=(const T f)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] -= f;
+		}
+		return (*this);
+	}
+
+	TMat3 operator*(const T f) const
+	{
+		TMat3 c;
+		for(U i = 0; i < 9; i++)
+		{
+			c.arr1[i] = arr1[i] * f;
+		}
+		return c;
+	}
+
+	TMat3& operator*=(const T f)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] *= f;
+		}
+		return (*this);
+	}
+
+	TMat3 operator/(const T f) const
+	{
+		TMat3 c;
+		for(U i = 0; i < 9; i++)
+		{
+			c.arr1[i] = arr1[i] / f;
+		}
+		return c;
+	}
+
+	TMat3& operator/=(const T f)
+	{
+		for(U i = 0; i < 9; i++)
+		{
+			arr1[i] /= f;
+		}
+		return (*this);
+	}
+	/// @}
+
+	/// @name Operators with others
+	/// @{
+
+	/// TVec3<T>(dot(row0 * b), dot(row1 * b), dot(row2 * b)). 9 muls, 6 adds
+	TVec3<T> operator*(const TVec3<T>& b) const
+	{
+		return TVec3<T>(
+			arr2[0][0] * b.x() + arr2[0][1] * b.y() + arr2[0][2] * b.z(),
+			arr2[1][0] * b.x() + arr2[1][1] * b.y() + arr2[1][2] * b.z(),
+			arr2[2][0] * b.x() + arr2[2][1] * b.y() + arr2[2][2] * b.z());
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+	void setRows(const TVec3<T>& a, const TVec3<T>& b, const TVec3<T>& c)
+	{
+		arr2[0][0] = a.x();
+		arr2[0][1] = a.y();
+		arr2[0][2] = a.z();
+		arr2[1][0] = b.x();
+		arr2[1][1] = b.y();
+		arr2[1][2] = b.z();
+		arr2[2][0] = c.x();
+		arr2[2][1] = c.y();
+		arr2[2][2] = c.z();
+	}
+
+	void setRow(const U i, const TVec3<T>& v)
+	{
+		arr2[i][0] = v.x();
+		arr2[i][1] = v.y();
+		arr2[i][2] = v.z();
+	}
+
+	void getRows(TVec3<T>& a, TVec3<T>& b, TVec3<T>& c) const
+	{
+		a.x() = arr2[0][0];
+		a.y() = arr2[0][1];
+		a.z() = arr2[0][2];
+		b.x() = arr2[1][0];
+		b.y() = arr2[1][1];
+		b.z() = arr2[1][2];
+		c.x() = arr2[2][0];
+		c.y() = arr2[2][1];
+		c.z() = arr2[2][2];
+	}
+
+	TVec3<T> getRow(const U i) const
+	{
+		return TVec3<T>(arr2[i][0], arr2[i][1], arr2[i][2]);
+	}
+
+	void setColumns(const TVec3<T>& a, const TVec3<T>& b, const TVec3<T>& c)
+	{
+		arr2[0][0] = a.x();
+		arr2[1][0] = a.y();
+		arr2[2][0] = a.z();
+		arr2[0][1] = b.x();
+		arr2[1][1] = b.y();
+		arr2[2][1] = b.z();
+		arr2[0][2] = c.x();
+		arr2[1][2] = c.y();
+		arr2[2][2] = c.z();
+	}
+
+	void setColumn(const U i, const TVec3<T>& v)
+	{
+		arr2[0][i] = v.x();
+		arr2[1][i] = v.y();
+		arr2[2][i] = v.z();
+	}
+
+	void getColumns(TVec3<T>& a, TVec3<T>& b, TVec3<T>& c) const
+	{
+		a.x() = arr2[0][0];
+		a.y() = arr2[1][0];
+		a.z() = arr2[2][0];
+		b.x() = arr2[0][1];
+		b.y() = arr2[1][1];
+		b.z() = arr2[2][1];
+		c.x() = arr2[0][2];
+		c.y() = arr2[1][2];
+		c.z() = arr2[2][2];
+	}
+
+	TVec3<T> getColumn(const U i) const
+	{
+		return TVec3<T>(arr2[0][i], arr2[1][i], arr2[2][i]);
+	}
+
+	/// Get 1st column
+	TVec3<T> getXAxis() const
+	{
+		return getColumn(0);
+	}
+
+	/// Get 2nd column
+	TVec3<T> getYAxis() const
+	{
+		return getColumn(1);
+	}
+
+	/// Get 3rd column
+	TVec3<T> getZAxis() const
+	{
+		return getColumn(2);
+	}
+
+	/// Set 1st column
+	void setXAxis(const TVec3<T>& v3)
+	{
+		setColumn(0, v3);
+	}
+
+	/// Set 2nd column
+	void setYAxis(const TVec3<T>& v3)
+	{
+		setColumn(1, v3);
+	}
+
+	/// Set 3rd column
+	void setZAxis(const TVec3<T>& v3)
+	{
+		setColumn(2, v3);
+	}
+
+	void setRotationX(const T rad)
+	{
+		T sintheta, costheta;
+		sinCos(rad, sintheta, costheta);
+
+		arr2[0][0] = 1.0;
+		arr2[0][1] = 0.0;
+		arr2[0][2] = 0.0;
+		arr2[1][0] = 0.0;
+		arr2[1][1] = costheta;
+		arr2[1][2] = -sintheta;
+		arr2[2][0] = 0.0;
+		arr2[2][1] = sintheta;
+		arr2[2][2] = costheta;
+	}
+
+	void setRotationY(const T rad)
+	{
+		T sintheta, costheta;
+		sinCos(rad, sintheta, costheta);
+
+		arr2[0][0] = costheta;
+		arr2[0][1] = 0.0;
+		arr2[0][2] = sintheta;
+		arr2[1][0] = 0.0;
+		arr2[1][1] = 1.0;
+		arr2[1][2] = 0.0;
+		arr2[2][0] = -sintheta;
+		arr2[2][1] = 0.0;
+		arr2[2][2] = costheta;
+	}
+
+	void setRotationZ(const T rad)
+	{
+		T sintheta, costheta;
+		sinCos(rad, sintheta, costheta);
+
+		arr2[0][0] = costheta;
+		arr2[0][1] = -sintheta;
+		arr2[0][2] = 0.0;
+		arr2[1][0] = sintheta;
+		arr2[1][1] = costheta;
+		arr2[1][2] = 0.0;
+		arr2[2][0] = 0.0;
+		arr2[2][1] = 0.0;
+		arr2[2][2] = 1.0;
+	}
+
+	/// It rotates "this" around the axis defined by the rotation itself AND
+	/// not around the world axis
+	void rotateXAxis(const T rad)
+	{
+		// If we analyze the mat3 we can extract the 3 unit vectors rotated by
+		// the mat3. The 3 rotated vectors are in mat's columns. This means
+		// that column 0 is the rotated i vector. rotateXAxis() rotates by rad
+		// not around the i vector (aka x axis) but around the vector from
+		// column 0
+		// NOTE: See the clean code from < r664
+
+		T sina, cosa;
+		sinCos(rad, sina, cosa);
+
+		// zAxis = zAxis*cosa - yAxis*sina;
+		arr2[0][2] = arr2[0][2] * cosa - arr2[0][1] * sina;
+		arr2[1][2] = arr2[1][2] * cosa - arr2[1][1] * sina;
+		arr2[2][2] = arr2[2][2] * cosa - arr2[2][1] * sina;
+
+		// zAxis.normalize();
+		// The length is kept in T. A hard-coded F32 would truncate it when
+		// the matrix holds a wider type
+		T len = sqrt(arr2[0][2] * arr2[0][2]
+			+ arr2[1][2] * arr2[1][2] + arr2[2][2] * arr2[2][2]);
+		arr2[0][2] /= len;
+		arr2[1][2] /= len;
+		arr2[2][2] /= len;
+
+		// yAxis = zAxis * xAxis;
+		arr2[0][1] = arr2[1][2] * arr2[2][0]
+			- arr2[2][2] * arr2[1][0];
+		arr2[1][1] = arr2[2][2] * arr2[0][0]
+			- arr2[0][2] * arr2[2][0];
+		arr2[2][1] = arr2[0][2] * arr2[1][0]
+			- arr2[1][2] * arr2[0][0];
+
+		// yAxis.normalize();
+	}
+
+	/// @copybrief rotateXAxis
+	void rotateYAxis(const T rad)
+	{
+		// NOTE: See the clean code from < r664
+		T sina, cosa;
+		sinCos(rad, sina, cosa);
+
+		// zAxis = zAxis*cosa + xAxis*sina;
+		arr2[0][2] = arr2[0][2] * cosa + arr2[0][0] * sina;
+		arr2[1][2] = arr2[1][2] * cosa + arr2[1][0] * sina;
+		arr2[2][2] = arr2[2][2] * cosa + arr2[2][0] * sina;
+
+		// zAxis.normalize();
+		// The length is kept in T. A hard-coded F32 would truncate it when
+		// the matrix holds a wider type
+		T len = sqrt(arr2[0][2] * arr2[0][2]
+			+ arr2[1][2] * arr2[1][2] + arr2[2][2] * arr2[2][2]);
+		arr2[0][2] /= len;
+		arr2[1][2] /= len;
+		arr2[2][2] /= len;
+
+		// xAxis = (zAxis*yAxis) * -1.0f;
+		arr2[0][0] = arr2[2][2] * arr2[1][1]
+			- arr2[1][2] * arr2[2][1];
+		arr2[1][0] = arr2[0][2] * arr2[2][1]
+			- arr2[2][2] * arr2[0][1];
+		arr2[2][0] = arr2[1][2] * arr2[0][1]
+			- arr2[0][2] * arr2[1][1];
+	}
+
+	/// @copybrief rotateXAxis
+	void rotateZAxis(const T rad)
+	{
+		// NOTE: See the clean code from < r664
+		T sina, cosa;
+		sinCos(rad, sina, cosa);
+
+		// xAxis = xAxis*cosa + yAxis*sina;
+		arr2[0][0] = arr2[0][0] * cosa + arr2[0][1] * sina;
+		arr2[1][0] = arr2[1][0] * cosa + arr2[1][1] * sina;
+		arr2[2][0] = arr2[2][0] * cosa + arr2[2][1] * sina;
+
+		// xAxis.normalize();
+		T len = sqrt(arr2[0][0] * arr2[0][0]
+			+ arr2[1][0] * arr2[1][0] + arr2[2][0] * arr2[2][0]);
+		arr2[0][0] /= len;
+		arr2[1][0] /= len;
+		arr2[2][0] /= len;
+
+		// yAxis = zAxis*xAxis;
+		arr2[0][1] = arr2[1][2] * arr2[2][0] - arr2[2][2] * arr2[1][0];
+		arr2[1][1] = arr2[2][2] * arr2[0][0] - arr2[0][2] * arr2[2][0];
+		arr2[2][1] = arr2[0][2] * arr2[1][0] - arr2[1][2] * arr2[0][0];
+	}
+
+	void transpose()
+	{
+		T temp = arr2[0][1];
+		arr2[0][1] = arr2[1][0];
+		arr2[1][0] = temp;
+		temp = arr2[0][2];
+		arr2[0][2] = arr2[2][0];
+		arr2[2][0] = temp;
+		temp = arr2[1][2];
+		arr2[1][2] = arr2[2][1];
+		arr2[2][1] = temp;
+		
+	}
+
+	/// @return A transposed copy of the matrix. "this" is left untouched
+	TMat3 getTransposed() const
+	{
+		TMat3 m3;
+		for(U i = 0; i < 3; i++)
+		{
+			for(U j = 0; j < 3; j++)
+			{
+				// operator[] is the flat 9 element accessor and returns a
+				// scalar, so the 2D accessor has to be used here
+				m3(i, j) = arr2[j][i];
+			}
+		}
+		return m3;
+	}
+
+	void reorthogonalize()
+	{
+		// There are 2 methods, the standard and the Gram-Schmidt method with a 
+		// twist for zAxis. This uses the 2nd. For the first see < r664
+		TVec3<T> xAxis, yAxis, zAxis;
+		getColumns(xAxis, yAxis, zAxis);
+
+		xAxis.normalize();
+
+		yAxis = yAxis - (xAxis * xAxis.dot(yAxis));
+		yAxis.normalize();
+
+		zAxis = xAxis.cross(yAxis);
+
+		setColumns(xAxis, yAxis, zAxis);
+	}
+
+	/// @return The determinant, expanded along the first row
+	T getDet() const
+	{
+		// For the accurate method see < r664
+		return arr2[0][0] * (arr2[1][1] * arr2[2][2] - arr2[1][2] * arr2[2][1])
+			- arr2[0][1] * (arr2[1][0] * arr2[2][2] - arr2[1][2] * arr2[2][0])
+			+ arr2[0][2] * (arr2[1][0] * arr2[2][1] - arr2[1][1] * arr2[2][0]);
+	}
+
+	/// @return The inverse of the matrix. Asserts that the determinant is
+	/// not zero
+	TMat3 getInverse() const
+	{
+		// Cramer's rule: Inv(A) = (1 / getDet(A)) * Adj(A)
+
+		// The first row of the adjoint. It is computed up front because the
+		// determinant is built from the same 3 values
+		T adj00 = arr2[1][1] * arr2[2][2] - arr2[1][2] * arr2[2][1];
+		T adj01 = arr2[0][2] * arr2[2][1] - arr2[0][1] * arr2[2][2];
+		T adj02 = arr2[0][1] * arr2[1][2] - arr2[0][2] * arr2[1][1];
+
+		// Expansion along the first column
+		T det = arr2[0][0] * adj00 + arr2[1][0] * adj01
+			+ arr2[2][0] * adj02;
+
+		ANKI_ASSERT(!isZero<T>(det)); // Cannot invert det == 0
+
+		// Scale every element of the adjoint by 1/det
+		T invDet = 1.0 / det;
+
+		TMat3 out;
+		out(0, 0) = invDet * adj00;
+		out(0, 1) = invDet * adj01;
+		out(0, 2) = invDet * adj02;
+
+		out(1, 0) = invDet * (arr2[1][2] * arr2[2][0] - arr2[1][0] * arr2[2][2]);
+		out(1, 1) = invDet * (arr2[0][0] * arr2[2][2] - arr2[0][2] * arr2[2][0]);
+		out(1, 2) = invDet * (arr2[0][2] * arr2[1][0] - arr2[0][0] * arr2[1][2]);
+
+		out(2, 0) = invDet * (arr2[1][0] * arr2[2][1] - arr2[1][1] * arr2[2][0]);
+		out(2, 1) = invDet * (arr2[0][1] * arr2[2][0] - arr2[0][0] * arr2[2][1]);
+		out(2, 2) = invDet * (arr2[0][0] * arr2[1][1] - arr2[0][1] * arr2[1][0]);
+
+		return out;
+	}
+
+	void invert()
+	{
+		(*this) = getInverse();
+	}
+
+	void setIdentity()
+	{
+		(*this) = getIdentity();
+	}
+
+	static const TMat3& getZero()
+	{
+		static const TMat3 zero(0.0);
+		return zero;
+	}
+
+	static const TMat3& getIdentity()
+	{
+		static const TMat3 ident(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0);
+		return ident;
+	}
+	/// @}
+
+	/// @name Friends
+	/// @{
+	friend TMat3 operator+(T f, const TMat3& m3)
+	{
+		return m3 + f;
+	}
+
+	friend TMat3 operator-(T f, const TMat3& m3)
+	{
+		TMat3 out;
+		for(U i = 0; i < 9; i++)
+		{
+			out[i] = f - m3[i];
+		}
+		return out;
+	}
+
+	friend TMat3 operator*(T f, const TMat3& m3)
+	{
+		return m3 * f;
+	}
+
+	friend TMat3 operator/(T f, const TMat3& m3)
+	{
+		TMat3 out;
+		for(U i = 0; i < 9; i++)
+		{
+			out[i] = f / m3[i];
+		}
+		return out;
+	}
+
+	friend std::ostream& operator<<(std::ostream& s, const TMat3& m)
+	{
+		for(U i = 0; i < 3; i++)
+		{
+			for(U j = 0; j < 3; j++)
+			{
+				s << m(i, j) << ' ';
+			}
+
+			if(i != 2)
+			{
+				//s << "\n";
+			}
+		}
+		return s;
+	}
+	/// @}
+
+private:
+	/// @name Data members
+	/// @{
+	union
+	{
+		Array<T, 9> arr1;
+		Array<Array<T, 3>, 3> arr2;
+		T carr1[9]; ///< For easy debugging with gdb
+		T carr2[3][3]; ///< For easy debugging with gdb
+	};
+	/// @}
+};
+
 /// 3x3 Matrix. Mainly used for rotations. It includes many helpful member
 /// functions. Its row major. The columns are the x,y,z axis
 class Mat3

+ 845 - 0
include/anki/math/Mat4.h

@@ -8,6 +8,851 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// Template struct that gives the type of the TMat4 SIMD
+template<typename T>
+struct TMat4Simd
+{
+	typedef Array<T, 16> Type;
+};
+
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+// Specialize for F32
+template<>
+struct TMat4Simd<F32>
+{
+	typedef Array<__m128, 4> Type;
+};
+#endif
+
+/// 4x4 Matrix. Used mainly for transformations but not necessarily. It's
+/// row major. SSE optimized
+template<typename T>
+ANKI_ATTRIBUTE_ALIGNED(class, 16) TMat4
+{
+public:
+	typedef typename TMat4Simd<T>::Type Simd;
+
+	/// @name Constructors
+	/// @{
+	explicit TMat4()
+	{}
+
+	explicit TMat4(const T f)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] = f;
+		}
+	}
+
+	explicit TMat4(const T m00, const T m01, const T m02,
+		const T m03, const T m10, const T m11,
+		const T m12, const T m13, const T m20,
+		const T m21, const T m22, const T m23,
+		const T m30, const T m31, const T m32,
+		const T m33)
+	{
+		arr2[0][0] = m00;
+		arr2[0][1] = m01;
+		arr2[0][2] = m02;
+		arr2[0][3] = m03;
+		arr2[1][0] = m10;
+		arr2[1][1] = m11;
+		arr2[1][2] = m12;
+		arr2[1][3] = m13;
+		arr2[2][0] = m20;
+		arr2[2][1] = m21;
+		arr2[2][2] = m22;
+		arr2[2][3] = m23;
+		arr2[3][0] = m30;
+		arr2[3][1] = m31;
+		arr2[3][2] = m32;
+		arr2[3][3] = m33;
+	}
+
+	explicit TMat4(const T arr_[])
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] = arr_[i];
+		}	
+	}
+
+	TMat4(const TMat4& b)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] = b.arr1[i];
+		}
+	}
+
+	explicit TMat4(const TMat3<T>& m3)
+	{
+		arr2[0][0] = m3(0, 0);
+		arr2[0][1] = m3(0, 1);
+		arr2[0][2] = m3(0, 2);
+		arr2[0][3] = 0.0;
+		arr2[1][0] = m3(1, 0);
+		arr2[1][1] = m3(1, 1);
+		arr2[1][2] = m3(1, 2);
+		arr2[1][3] = 0.0;
+		arr2[2][0] = m3(2, 0);
+		arr2[2][1] = m3(2, 1);
+		arr2[2][2] = m3(2, 2);
+		arr2[2][3] = 0.0;
+		arr2[3][0] = 0.0;
+		arr2[3][1] = 0.0;
+		arr2[3][2] = 0.0;
+		arr2[3][3] = 1.0;
+	}
+
+	explicit TMat4(const TVec3<T>& v)
+	{
+		arr2[0][0] = 1.0;
+		arr2[0][1] = 0.0;
+		arr2[0][2] = 0.0;
+		arr2[0][3] = v.x();
+		arr2[1][0] = 0.0;
+		arr2[1][1] = 1.0;
+		arr2[1][2] = 0.0;
+		arr2[1][3] = v.y();
+		arr2[2][0] = 0.0;
+		arr2[2][1] = 0.0;
+		arr2[2][2] = 1.0;
+		arr2[2][3] = v.z();
+		arr2[3][0] = 0.0;
+		arr2[3][1] = 0.0;
+		arr2[3][2] = 0.0;
+		arr2[3][3] = 1.0;
+	}
+
+	explicit TMat4(const TVec4<T>& v)
+	{
+		arr2[0][0] = 1.0;
+		arr2[0][1] = 0.0;
+		arr2[0][2] = 0.0;
+		arr2[0][3] = v.x();
+		arr2[1][0] = 0.0;
+		arr2[1][1] = 1.0;
+		arr2[1][2] = 0.0;
+		arr2[1][3] = v.y();
+		arr2[2][0] = 0.0;
+		arr2[2][1] = 0.0;
+		arr2[2][2] = 1.0;
+		arr2[2][3] = v.z();
+		arr2[3][0] = 0.0;
+		arr2[3][1] = 0.0;
+		arr2[3][2] = 0.0;
+		arr2[3][3] = v.w();
+	}
+
+	explicit TMat4(const TVec3<T>& transl, const TMat3<T>& rot)
+	{
+		setRotationPart(rot);
+		setTranslationPart(transl);
+		arr2[3][0] = arr2[3][1] = arr2[3][2] = 0.0;
+		arr2[3][3] = 1.0;
+	}
+
+	explicit TMat4(const TVec3<T>& transl, const TMat3<T>& rot, const T scale)
+	{
+		if(isZero<T>(scale - 1.0))
+		{
+			setRotationPart(rot);
+		}
+		else
+		{
+			setRotationPart(rot * scale);
+		}
+
+		setTranslationPart(transl);
+
+		arr2[3][0] = arr2[3][1] = arr2[3][2] = 0.0;
+		arr2[3][3] = 1.0;
+	}
+
+	explicit TMat4(const TTransform<T>& t)
+	{
+		(*this) = TMat4(t.getOrigin(), t.getRotation(), t.getScale());
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T& operator()(const U i, const U j)
+	{
+		return arr2[i][j];
+	}
+
+	const T& operator()(const U i, const U j) const
+	{
+		return arr2[i][j];
+	}
+
+	T& operator[](const U i)
+	{
+		return arr1[i];
+	}
+
+	const T& operator[](const U i) const
+	{
+		return arr1[i];
+	}
+
+	Simd& getSimd(const U i)
+	{
+		return simd[i];
+	}
+
+	const Simd& getSimd(const U i) const
+	{
+		return simd[i];
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// @{
+	/// Copy all 16 elements of b into "this"
+	/// @return A reference to "this" so that assignments can be chained
+	TMat4& operator=(const TMat4& b)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] = b.arr1[i];
+		}
+		// Flowing off the end of a non-void function is undefined behavior
+		return (*this);
+	}
+
+	TMat4 operator+(const TMat4& b) const
+	{
+		TMat4<T> c;
+		for(U i = 0; i < 16; i++)
+		{
+			c.arr1[i] = arr1[i] + b.arr1[i];
+		}
+		return c;
+	}
+
+	TMat4& operator+=(const TMat4& b)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] += b.arr1[i];
+		}
+		return (*this);
+	}
+
+	TMat4 operator-(const TMat4& b) const
+	{
+		TMat4<T> c;
+		for(U i = 0; i < 16; i++)
+		{
+			c.arr1[i] = arr1[i] - b.arr1[i];
+		}
+		return c;
+	}
+
+	TMat4& operator-=(const TMat4& b)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] -= b.arr1[i];
+		}
+		return (*this);
+	}
+
+	/// @note 64 muls, 48 adds
+	TMat4 operator*(const TMat4& b) const
+	{
+		TMat4<T> c;
+		for(U i = 0; i < 4; i++)
+		{
+			for(U j = 0; j < 4; j++)
+			{
+				c(i, j) = arr2[i][0] * b(0, j) + arr2[i][1] * b(1, j) 
+					+ arr2[i][2] * b(2, j) + arr2[i][3] * b(3, j);
+			}
+		}
+		return c;
+	}
+
+	TMat4& operator*=(const TMat4& b)
+	{
+		(*this) = (*this) * b;
+		return (*this);
+	}
+
+	Bool operator==(const TMat4& b) const
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			if(!isZero<T>(arr1[i] - b[i]))
+			{
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/// Negation of operator==: true when at least one pair of elements
+	/// differs by more than the isZero tolerance
+	Bool operator!=(const TMat4& b) const
+	{
+		return !((*this) == b);
+	}
+	/// @}
+
+	/// @name Operators with T
+	/// @{
+	TMat4 operator+(const T f) const
+	{
+		TMat4 c;
+		for(U i = 0; i < 16; i++)
+		{
+			c[i] = arr1[i] + f;
+		}
+		return c;
+	}
+
+	/// Add f to every element
+	/// @return A reference to "this"
+	TMat4& operator+=(const T f)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] += f;
+		}
+		return (*this);
+	}
+
+	TMat4 operator-(const T f) const
+	{
+		TMat4 c;
+		for(U i = 0; i < 16; i++)
+		{
+			c[i] = arr1[i] - f;
+		}
+		return c;
+	}
+
+	/// Subtract f from every element
+	/// @return A reference to "this"
+	TMat4& operator-=(const T f)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] -= f;
+		}
+		return (*this);
+	}
+
+	TMat4 operator*(const T f) const
+	{
+		TMat4 c;
+		for(U i = 0; i < 16; i++)
+		{
+			c[i] = arr1[i] * f;
+		}
+		return c;
+	}
+
+	/// Multiply every element by f
+	/// @return A reference to "this"
+	TMat4& operator*=(const T f)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] *= f;
+		}
+		return (*this);
+	}
+
+	TMat4 operator/(const T f) const
+	{
+		TMat4 c;
+		for(U i = 0; i < 16; i++)
+		{
+			c[i] = arr1[i] / f;
+		}
+		return c;
+	}
+
+	/// Divide every element by f
+	/// @return A reference to "this"
+	TMat4& operator/=(const T f)
+	{
+		for(U i = 0; i < 16; i++)
+		{
+			arr1[i] /= f;
+		}
+		return (*this);
+	}
+	/// @}
+
+	/// @name Operators with other types
+	/// @{
+
+	/// 16 muls, 12 adds
+	TVec4<T> operator*(const TVec4<T>& v4) const
+	{
+		TVec4<T> out;
+
+		out.x() = arr2[0][0] * v4.x() + arr2[0][1] * v4.y() 
+			+ arr2[0][2] * v4.z() + arr2[0][3] * v4.w();
+
+		out.y() = arr2[1][0] * v4.x() + arr2[1][1] * v4.y() 
+			+ arr2[1][2] * v4.z() + arr2[1][3] * v4.w();
+
+		out.z() = arr2[2][0] * v4.x() + arr2[2][1] * v4.y() 
+			+ arr2[2][2] * v4.z() + arr2[2][3] * v4.w();
+
+		out.w() = arr2[3][0] * v4.x() + arr2[3][1] * v4.y() 
+			+ arr2[3][2] * v4.z() + arr2[3][3] * v4.w();
+
+		return out;
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+	void setRows(const TVec4<T>& a, const TVec4<T>& b, const TVec4<T>& c,
+		const TVec4<T>& d)
+	{
+		arr2[0][0] = a.x();
+		arr2[0][1] = a.y();
+		arr2[0][2] = a.z();
+		arr2[0][3] = a.w();
+		arr2[1][0] = b.x();
+		arr2[1][1] = b.y();
+		arr2[1][2] = b.z();
+		arr2[1][3] = b.w();
+		arr2[2][0] = c.x();
+		arr2[2][1] = c.y();
+		arr2[2][2] = c.z();
+		arr2[2][3] = c.w();
+		arr2[3][0] = d.x();
+		arr2[3][1] = d.y();
+		arr2[3][2] = d.z();
+		arr2[3][3] = d.w();
+	}
+
+	void setRow(const U i, const TVec4<T>& v)
+	{
+		arr2[i][0] = v.x();
+		arr2[i][1] = v.y();
+		arr2[i][2] = v.z();
+		arr2[i][3] = v.w();
+	}
+
+	TVec4<T> getRow(const U i) const
+	{
+		return TVec4<T>(arr2[i][0], arr2[i][1], arr2[i][2], arr2[i][3]);
+	}
+
+	void setColumns(const TVec4<T>& a, const TVec4<T>& b, const TVec4<T>& c,
+		const TVec4<T>& d)
+	{
+		arr2[0][0] = a.x();
+		arr2[1][0] = a.y();
+		arr2[2][0] = a.z();
+		arr2[3][0] = a.w();
+		arr2[0][1] = b.x();
+		arr2[1][1] = b.y();
+		arr2[2][1] = b.z();
+		arr2[3][1] = b.w();
+		arr2[0][2] = c.x();
+		arr2[1][2] = c.y();
+		arr2[2][2] = c.z();
+		arr2[3][2] = c.w();
+		arr2[0][3] = d.x();
+		arr2[1][3] = d.y();
+		arr2[2][3] = d.z();
+		arr2[3][3] = d.w();
+	}
+
+	void setColumn(const U i, const TVec4<T>& v)
+	{
+		arr2[0][i] = v.x();
+		arr2[1][i] = v.y();
+		arr2[2][i] = v.z();
+		arr2[3][i] = v.w();
+	}
+
+	/// @return A copy of column i as a 4D vector
+	TVec4<T> getColumn(const U i) const
+	{
+		const T c0 = arr2[0][i];
+		const T c1 = arr2[1][i];
+		const T c2 = arr2[2][i];
+		const T c3 = arr2[3][i];
+		return TVec4<T>(c0, c1, c2, c3);
+	}
+
+	/// Copy m3 into the upper-left 3x3 block. The last row and the last
+	/// column are not touched
+	void setRotationPart(const TMat3<T>& m3)
+	{
+		for(U row = 0; row < 3; row++)
+		{
+			for(U col = 0; col < 3; col++)
+			{
+				arr2[row][col] = m3(row, col);
+			}
+		}
+	}
+
+	/// @return The upper-left 3x3 block as a TMat3
+	TMat3<T> getRotationPart() const
+	{
+		TMat3<T> rot;
+		for(U row = 0; row < 3; row++)
+		{
+			for(U col = 0; col < 3; col++)
+			{
+				rot(row, col) = arr2[row][col];
+			}
+		}
+		return rot;
+	}
+
+	/// Overwrite the whole last column (all four rows) with v
+	void setTranslationPart(const TVec4<T>& v)
+	{
+		// The translation lives in column 3
+		setColumn(3, v);
+	}
+
+	/// Set the translation from a 3D vector. Only rows 0-2 of the last
+	/// column are written; element (3, 3) keeps its current value because
+	/// a TVec3 has no w component to take it from
+	void setTranslationPart(const TVec3<T>& v)
+	{
+		arr2[0][3] = v.x();
+		arr2[1][3] = v.y();
+		arr2[2][3] = v.z();
+	}
+
+	/// @return Rows 0-2 of the last column as a 3D vector
+	TVec3<T> getTranslationPart() const
+	{
+		const T tx = arr2[0][3];
+		const T ty = arr2[1][3];
+		const T tz = arr2[2][3];
+		return TVec3<T>(tx, ty, tz);
+	}
+
+	/// Transpose in place by swapping every element above the diagonal with
+	/// its mirror below the diagonal
+	void transpose()
+	{
+		for(U row = 0; row < 3; row++)
+		{
+			for(U col = row + 1; col < 4; col++)
+			{
+				const T upper = arr2[row][col];
+				arr2[row][col] = arr2[col][row];
+				arr2[col][row] = upper;
+			}
+		}
+	}
+
+	/// @return A transposed copy. This matrix is not modified
+	TMat4 getTransposed() const
+	{
+		TMat4 transposed;
+		for(U row = 0; row < 4; row++)
+		{
+			for(U col = 0; col < 4; col++)
+			{
+				// Element (row, col) of this lands on (col, row) of the copy
+				transposed(col, row) = arr2[row][col];
+			}
+		}
+		return transposed;
+	}
+
+	T getDet() const // Determinant written out as a sum of 24 signed products
+	{
+		const TMat4& t = *this; // Alias so the elements are read as t(i, j)
+		return t(0, 3) * t(1, 2) * t(2, 1) * t(3, 0) 
+			- t(0, 2) * t(1, 3) * t(2, 1) * t(3, 0) 
+			- t(0, 3) * t(1, 1) * t(2, 2) * t(3, 0) 
+			+ t(0, 1) * t(1, 3) * t(2, 2) * t(3, 0)
+			+ t(0, 2) * t(1, 1) * t(2, 3) * t(3, 0) 
+			- t(0, 1) * t(1, 2) * t(2, 3) * t(3, 0) 
+			- t(0, 3) * t(1, 2) * t(2, 0) * t(3, 1) 
+			+ t(0, 2) * t(1, 3) * t(2, 0) * t(3, 1) 
+			+ t(0, 3) * t(1, 0) * t(2, 2) * t(3, 1) 
+			- t(0, 0) * t(1, 3) * t(2, 2) * t(3, 1)
+			- t(0, 2) * t(1, 0) * t(2, 3) * t(3, 1) 
+			+ t(0, 0) * t(1, 2) * t(2, 3) * t(3, 1) 
+			+ t(0, 3) * t(1, 1) * t(2, 0) * t(3, 2) 
+			- t(0, 1) * t(1, 3) * t(2, 0) * t(3, 2)
+			- t(0, 3) * t(1, 0) * t(2, 1) * t(3, 2)
+			+ t(0, 0) * t(1, 3) * t(2, 1) * t(3, 2) 
+			+ t(0, 1) * t(1, 0) * t(2, 3) * t(3, 2)
+			- t(0, 0) * t(1, 1) * t(2, 3) * t(3, 2)
+			- t(0, 2) * t(1, 1) * t(2, 0) * t(3, 3)
+			+ t(0, 1) * t(1, 2) * t(2, 0) * t(3, 3)
+			+ t(0, 2) * t(1, 0) * t(2, 1) * t(3, 3)
+			- t(0, 0) * t(1, 2) * t(2, 1) * t(3, 3) 
+			- t(0, 1) * t(1, 0) * t(2, 2) * t(3, 3) 
+			+ t(0, 0) * t(1, 1) * t(2, 2) * t(3, 3);
+	}
+
+	/// Invert using Cramer's rule. Returns the inverse as a new matrix; asserts that the determinant is not zero
+	TMat4 getInverse() const
+	{
+		Array<T, 12> tmp; // Scratch storage for products of element pairs
+		const TMat4& in = (*this);
+		TMat4 m4; // Filled with unscaled values first, scaled by 1/det last
+
+		tmp[0] = in(2, 2) * in(3, 3); // First batch: pairs from columns 2 and 3
+		tmp[1] = in(3, 2) * in(2, 3);
+		tmp[2] = in(1, 2) * in(3, 3);
+		tmp[3] = in(3, 2) * in(1, 3);
+		tmp[4] = in(1, 2) * in(2, 3);
+		tmp[5] = in(2, 2) * in(1, 3);
+		tmp[6] = in(0, 2) * in(3, 3);
+		tmp[7] = in(3, 2) * in(0, 3);
+		tmp[8] = in(0, 2) * in(2, 3);
+		tmp[9] = in(2, 2) * in(0, 3);
+		tmp[10] = in(0, 2) * in(1, 3);
+		tmp[11] = in(1, 2) * in(0, 3);
+
+		m4(0, 0) =  tmp[0] * in(1, 1) + tmp[3] * in(2, 1) + tmp[4] * in(3, 1);
+		m4(0, 0) -= tmp[1] * in(1, 1) + tmp[2] * in(2, 1) + tmp[5] * in(3, 1);
+		m4(0, 1) =  tmp[1] * in(0, 1) + tmp[6] * in(2, 1) + tmp[9] * in(3, 1);
+		m4(0, 1) -= tmp[0] * in(0, 1) + tmp[7] * in(2, 1) + tmp[8] * in(3, 1);
+		m4(0, 2) =  tmp[2] * in(0, 1) + tmp[7] * in(1, 1) + tmp[10] * in(3, 1);
+		m4(0, 2) -= tmp[3] * in(0, 1) + tmp[6] * in(1, 1) + tmp[11] * in(3, 1);
+		m4(0, 3) =  tmp[5] * in(0, 1) + tmp[8] * in(1, 1) + tmp[11] * in(2, 1);
+		m4(0, 3) -= tmp[4] * in(0, 1) + tmp[9] * in(1, 1) + tmp[10] * in(2, 1);
+		m4(1, 0) =  tmp[1] * in(1, 0) + tmp[2] * in(2, 0) + tmp[5] * in(3, 0);
+		m4(1, 0) -= tmp[0] * in(1, 0) + tmp[3] * in(2, 0) + tmp[4] * in(3, 0);
+		m4(1, 1) =  tmp[0] * in(0, 0) + tmp[7] * in(2, 0) + tmp[8] * in(3, 0);
+		m4(1, 1) -= tmp[1] * in(0, 0) + tmp[6] * in(2, 0) + tmp[9] * in(3, 0);
+		m4(1, 2) =  tmp[3] * in(0, 0) + tmp[6] * in(1, 0) + tmp[11] * in(3, 0);
+		m4(1, 2) -= tmp[2] * in(0, 0) + tmp[7] * in(1, 0) + tmp[10] * in(3, 0);
+		m4(1, 3) =  tmp[4] * in(0, 0) + tmp[9] * in(1, 0) + tmp[10] * in(2, 0);
+		m4(1, 3) -= tmp[5] * in(0, 0) + tmp[8] * in(1, 0) + tmp[11] * in(2, 0);
+
+		tmp[0] = in(2, 0) * in(3, 1); // Second batch: pairs from columns 0 and 1
+		tmp[1] = in(3, 0) * in(2, 1);
+		tmp[2] = in(1, 0) * in(3, 1);
+		tmp[3] = in(3, 0) * in(1, 1);
+		tmp[4] = in(1, 0) * in(2, 1);
+		tmp[5] = in(2, 0) * in(1, 1);
+		tmp[6] = in(0, 0) * in(3, 1);
+		tmp[7] = in(3, 0) * in(0, 1);
+		tmp[8] = in(0, 0) * in(2, 1);
+		tmp[9] = in(2, 0) * in(0, 1);
+		tmp[10] = in(0, 0) * in(1, 1);
+		tmp[11] = in(1, 0) * in(0, 1);
+
+		m4(2, 0) =  tmp[0] * in(1, 3) + tmp[3] * in(2, 3) + tmp[4] * in(3, 3);
+		m4(2, 0) -= tmp[1] * in(1, 3) + tmp[2] * in(2, 3) + tmp[5] * in(3, 3);
+		m4(2, 1) =  tmp[1] * in(0, 3) + tmp[6] * in(2, 3) + tmp[9] * in(3, 3);
+		m4(2, 1) -= tmp[0] * in(0, 3) + tmp[7] * in(2, 3) + tmp[8] * in(3, 3);
+		m4(2, 2) =  tmp[2] * in(0, 3) + tmp[7] * in(1, 3) + tmp[10] * in(3, 3);
+		m4(2, 2) -= tmp[3] * in(0, 3) + tmp[6] * in(1, 3) + tmp[11] * in(3, 3);
+		m4(2, 3) =  tmp[5] * in(0, 3) + tmp[8] * in(1, 3) + tmp[11] * in(2, 3);
+		m4(2, 3) -= tmp[4] * in(0, 3) + tmp[9] * in(1, 3) + tmp[10] * in(2, 3);
+		m4(3, 0) =  tmp[2] * in(2, 2) + tmp[5] * in(3, 2) + tmp[1] * in(1, 2);
+		m4(3, 0) -= tmp[4] * in(3, 2) + tmp[0] * in(1, 2) + tmp[3] * in(2, 2);
+		m4(3, 1) =  tmp[8] * in(3, 2) + tmp[0] * in(0, 2) + tmp[7] * in(2, 2);
+		m4(3, 1) -= tmp[6] * in(2, 2) + tmp[9] * in(3, 2) + tmp[1] * in(0, 2);
+		m4(3, 2) =  tmp[6] * in(1, 2) + tmp[11] * in(3, 2) + tmp[3] * in(0, 2);
+		m4(3, 2) -= tmp[10] * in(3, 2) + tmp[2] * in(0, 2) + tmp[7] * in(1, 2);
+		m4(3, 3) =  tmp[10] * in(2, 2) + tmp[4] * in(0, 2) + tmp[9] * in(1, 2);
+		m4(3, 3) -= tmp[8] * in(1, 2) + tmp[11] * in(2, 2) + tmp[5] * in(0, 2);
+
+		T det = in(0, 0) * m4(0, 0) + in(1, 0) * m4(0, 1) 
+			+ in(2, 0) * m4(0, 2) + in(3, 0) * m4(0, 3); // Column 0 of in dotted with row 0 of m4
+
+		ANKI_ASSERT(!isZero<T>(det)); // Cannot invert, det == 0
+		det = 1.0 / det; // From here on det holds the reciprocal
+		m4 *= det;
+		return m4;
+	}
+
+	/// Replace this matrix with its inverse. See getInverse
+	void invert()
+	{
+		const TMat4 inverse = getInverse();
+		(*this) = inverse;
+	}
+
+	/// Assuming this matrix represents a transformation, build the inverted
+	/// transformation: the rotation part is transposed and the translation
+	/// is rotated by that transposed rotation and then negated
+	TMat4 getInverseTransformation() const
+	{
+		const TMat3<T> rotInv = getRotationPart().getTransposed();
+		const TVec3<T> tslInv = -(rotInv * getTranslationPart());
+		return TMat4(tslInv, rotInv);
+	}
+
+	/// Element-wise linear interpolation between this matrix and b
+	/// @param b The matrix that is weighted by t
+	/// @param t The weight of b; this matrix is weighted by (1 - t)
+	TMat4 lerp(const TMat4& b, T t) const
+	{
+		const TMat4 thisPart = (*this) * (1.0 - t);
+		const TMat4 otherPart = b * t;
+		return thisPart + otherPart;
+	}
+
+	/// Overwrite this matrix with the identity matrix
+	void setIdentity()
+	{
+		const TMat4& ident = getIdentity();
+		(*this) = ident;
+	}
+
+	/// @return A reference to a function-local static identity matrix
+	static const TMat4& getIdentity()
+	{
+		static const TMat4 identity(
+			1.0, 0.0, 0.0, 0.0,
+			0.0, 1.0, 0.0, 0.0,
+			0.0, 0.0, 1.0, 0.0,
+			0.0, 0.0, 0.0, 1.0);
+		return identity;
+	}
+
+	/// @return A reference to a function-local static matrix that is
+	///         constructed from the scalar 0.0
+	static const TMat4& getZero()
+	{
+		static const TMat4 allZeros(0.0);
+		return allZeros;
+	}
+
+	/// 36 muls, 27 adds. Something like m4 = m0 * m1, but the 4th row is
+	/// not computed: it is set to (0, 0, 0, 1) directly, which makes this a
+	/// lot cheaper than the full product
+	/// @param m0 The left transformation
+	/// @param m1 The right transformation
+	static TMat4 combineTransformations(const TMat4& m0, const TMat4& m1)
+	{
+		// See the clean code in < r664
+
+		// one of the 2 mat4 doesnt represent transformation
+		ANKI_ASSERT(isZero<T>(m0(3, 0) + m0(3, 1) + m0(3, 2) + m0(3, 3) - 1.0)
+			&& isZero<T>(m1(3, 0) + m1(3, 1) + m1(3, 2) + m1(3, 3) - 1.0));
+
+		TMat4 m4;
+
+		for(U row = 0; row < 3; row++)
+		{
+			// Upper-left 3x3 block: plain 3x3 matrix product
+			for(U col = 0; col < 3; col++)
+			{
+				m4(row, col) = m0(row, 0) * m1(0, col) 
+					+ m0(row, 1) * m1(1, col) + m0(row, 2) * m1(2, col);
+			}
+
+			// Last column: m1's last column through m0's 3x3 block, plus
+			// m0's own last column
+			m4(row, 3) = m0(row, 0) * m1(0, 3) + m0(row, 1) * m1(1, 3) 
+				+ m0(row, 2) * m1(2, 3) + m0(row, 3);
+		}
+
+		m4(3, 0) = m4(3, 1) = m4(3, 2) = 0.0;
+		m4(3, 3) = 1.0;
+
+		return m4;
+	}
+
+	std::string toString() const;
+	/// @}
+
+	/// @name Friends
+	/// @{
+	template<typename Y>
+	friend TMat4<Y> operator+(const Y f, const TMat4<Y>& m4);
+
+	template<typename Y>
+	friend TMat4<Y> operator-(const Y f, const TMat4<Y>& m4);
+
+	template<typename Y>
+	friend TMat4<Y> operator*(const Y f, const TMat4<Y>& m4);
+
+	template<typename Y>
+	friend TMat4<Y> operator/(const Y f, const TMat4<Y>& m4);
+	/// @}
+
+private:
+	/// @name Data
+	/// @{
+	union
+	{
+		Array<T, 16> arr1; ///< Flat view of the 16 elements
+		Array<Array<T, 4>, 4> arr2; ///< 4x4 view, indexed as arr2[i][j]
+		T carr1[16]; ///< For gdb
+		T carr2[4][4]; ///< For gdb
+		Simd simd; ///< SIMD view, indexed as simd[0..3] by the F32 SSE specializations
+	};
+	/// @}
+};
+
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+
+// Forward declarations of the TMat4<F32> explicit specializations used with SSE; they are defined in Mat4.inl.h
+
+template<>
+TMat4<F32>::TMat4(const TMat4<F32>& b);
+
+template<>
+TMat4<F32>::TMat4(const F32 f);
+
+template<>
+TMat4<F32>& TMat4<F32>::operator=(const TMat4<F32>& b);
+
+template<>
+TMat4<F32> TMat4<F32>::operator+(const TMat4<F32>& b) const;
+
+template<>
+TMat4<F32>& TMat4<F32>::operator+=(const TMat4<F32>& b);
+
+template<>
+TMat4<F32> TMat4<F32>::operator-(const TMat4<F32>& b) const;
+
+template<>
+TMat4<F32>& TMat4<F32>::operator-=(const TMat4<F32>& b);
+
+template<>
+TMat4<F32> TMat4<F32>::operator*(const TMat4<F32>& b) const;
+
+template<>
+TMat4<F32> TMat4<F32>::operator+(const F32 f) const;
+
+template<>
+TMat4<F32>& TMat4<F32>::operator+=(const F32 f);
+
+template<>
+TMat4<F32> TMat4<F32>::operator-(const F32 f) const;
+
+template<>
+TMat4<F32>& TMat4<F32>::operator-=(const F32 f);
+
+template<>
+TMat4<F32> TMat4<F32>::operator*(const F32 f) const;
+
+template<>
+TMat4<F32>& TMat4<F32>::operator*=(const F32 f);
+
+template<>
+TMat4<F32> TMat4<F32>::operator/(const F32 f) const;
+
+template<>
+TMat4<F32>& TMat4<F32>::operator/=(const F32 f);
+
+template<>
+TVec4<F32> TMat4<F32>::operator*(const TVec4<F32>& b) const;
+
+template<>
+void TMat4<F32>::setRows(const TVec4<F32>& a, const TVec4<F32>& b, 
+	const TVec4<F32>& c, const TVec4<F32>& d);
+
+template<>
+void TMat4<F32>::setRow(const U i, const TVec4<F32>& v);
+
+template<>
+void TMat4<F32>::transpose();
+
+template<>
+TMat4<F32> operator-(const F32 f, const TMat4<F32>& m4);
+
+template<>
+TMat4<F32> operator/(const F32 f, const TMat4<F32>& m4);
+
+#endif
+
+
 /// 4x4 Matrix. Used mainly for transformations but not necessarily. It's
 /// row major. SSE optimized
 ANKI_ATTRIBUTE_ALIGNED(class, 16) Mat4

+ 356 - 0
include/anki/math/Mat4.inl.h

@@ -2,6 +2,362 @@
 
 namespace anki {
 
+//==============================================================================
+// Friends                                                                     =
+//==============================================================================
+
+//==============================================================================
+/// Scalar + matrix. Forwards to the matrix + scalar member operator
+template<typename T>
+TMat4<T> operator+(const T f, const TMat4<T>& m4)
+{
+	const TMat4<T> sum = m4 + f;
+	return sum;
+}
+
+//==============================================================================
+/// Scalar - matrix: every element of the result is f minus the matching
+/// element of m4
+template<typename T>
+TMat4<T> operator-(const T f, const TMat4<T>& m4)
+{
+	TMat4<T> result;
+	U idx = 0;
+	while(idx < 16)
+	{
+		result[idx] = f - m4[idx];
+		++idx;
+	}
+	return result;
+}
+
+//==============================================================================
+/// Scalar * matrix. Forwards to the matrix * scalar member operator
+template<typename T>
+TMat4<T> operator*(const T f, const TMat4<T>& m4)
+{
+	const TMat4<T> scaled = m4 * f;
+	return scaled;
+}
+
+//==============================================================================
+/// Scalar / matrix: every element of the result is f divided by the
+/// matching element of m4
+template<typename T>
+TMat4<T> operator/(const T f, const TMat4<T>& m4)
+{
+	TMat4<T> result;
+	U idx = 0;
+	while(idx < 16)
+	{
+		result[idx] = f / m4[idx];
+		++idx;
+	}
+	return result;
+}
+
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+
+//==============================================================================
+// SSE specializations                                                         =
+//==============================================================================
+
+//==============================================================================
+// Constructors                                                                =
+//==============================================================================
+
+//==============================================================================
+/// SSE copy constructor: copies the four SIMD rows
+template<>
+inline TMat4<F32>::TMat4(const TMat4<F32>& b)
+{
+	simd[0] = b.simd[0];
+	simd[1] = b.simd[1];
+	simd[2] = b.simd[2];
+	simd[3] = b.simd[3];
+}
+
+//==============================================================================
+/// SSE constructor that sets every element to f
+template<>
+inline TMat4<F32>::TMat4(const F32 f)
+{
+	// Broadcast once and reuse the register for all four rows
+	const __m128 splat = _mm_set1_ps(f);
+	simd[0] = splat;
+	simd[1] = splat;
+	simd[2] = splat;
+	simd[3] = splat;
+}
+
+//==============================================================================
+// Operators with same                                                         =
+//==============================================================================
+
+//==============================================================================
+/// SSE copy assignment: copies the four SIMD rows
+template<>
+inline TMat4<F32>& TMat4<F32>::operator=(const TMat4<F32>& b)
+{
+	simd[0] = b.simd[0];
+	simd[1] = b.simd[1];
+	simd[2] = b.simd[2];
+	simd[3] = b.simd[3];
+	return (*this);
+}
+
+//==============================================================================
+/// SSE element-wise matrix addition, one _mm_add_ps per row
+template<>
+inline TMat4<F32> TMat4<F32>::operator+(const TMat4<F32>& b) const
+{
+	TMat4<F32> sum;
+	sum.simd[0] = _mm_add_ps(simd[0], b.simd[0]);
+	sum.simd[1] = _mm_add_ps(simd[1], b.simd[1]);
+	sum.simd[2] = _mm_add_ps(simd[2], b.simd[2]);
+	sum.simd[3] = _mm_add_ps(simd[3], b.simd[3]);
+	return sum;
+}
+
+//==============================================================================
+/// SSE in-place element-wise matrix addition
+template<>
+inline TMat4<F32>& TMat4<F32>::operator+=(const TMat4<F32>& b)
+{
+	simd[0] = _mm_add_ps(simd[0], b.simd[0]);
+	simd[1] = _mm_add_ps(simd[1], b.simd[1]);
+	simd[2] = _mm_add_ps(simd[2], b.simd[2]);
+	simd[3] = _mm_add_ps(simd[3], b.simd[3]);
+	return (*this);
+}
+
+//==============================================================================
+/// SSE element-wise matrix subtraction, one _mm_sub_ps per row
+template<>
+inline TMat4<F32> TMat4<F32>::operator-(const TMat4<F32>& b) const
+{
+	TMat4<F32> diff;
+	diff.simd[0] = _mm_sub_ps(simd[0], b.simd[0]);
+	diff.simd[1] = _mm_sub_ps(simd[1], b.simd[1]);
+	diff.simd[2] = _mm_sub_ps(simd[2], b.simd[2]);
+	diff.simd[3] = _mm_sub_ps(simd[3], b.simd[3]);
+	return diff;
+}
+
+//==============================================================================
+/// SSE in-place element-wise matrix subtraction
+template<>
+inline TMat4<F32>& TMat4<F32>::operator-=(const TMat4<F32>& b)
+{
+	simd[0] = _mm_sub_ps(simd[0], b.simd[0]);
+	simd[1] = _mm_sub_ps(simd[1], b.simd[1]);
+	simd[2] = _mm_sub_ps(simd[2], b.simd[2]);
+	simd[3] = _mm_sub_ps(simd[3], b.simd[3]);
+	return (*this);
+}
+
+//==============================================================================
+/// SSE matrix * matrix. Each result element is the dot product of a row
+/// of this matrix with a row of the transposed right operand
+template<>
+inline TMat4<F32> TMat4<F32>::operator*(const TMat4<F32>& b) const
+{
+	// Transposed copy of b, so b's columns can be read as SIMD rows
+	TMat4<F32> bT(b);
+	bT.transpose();
+
+	TMat4<F32> product;
+
+	// XXX See if this is optimal
+	for(U row = 0; row < 4; row++)
+	{
+		for(U col = 0; col < 4; col++)
+		{
+			const __m128 dot = _mm_dp_ps(simd[row], bT.simd[col], 0xF1);
+			_mm_store_ss(&product(row, col), dot);
+		}
+	}
+
+	return product;
+}
+
+//==============================================================================
+// Operators with F32                                                          =
+//==============================================================================
+
+//==============================================================================
+/// SSE matrix + scalar: adds f to every element
+template<>
+inline TMat4<F32> TMat4<F32>::operator+(const F32 f) const
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	TMat4<F32> out;
+	out.simd[0] = _mm_add_ps(simd[0], splat);
+	out.simd[1] = _mm_add_ps(simd[1], splat);
+	out.simd[2] = _mm_add_ps(simd[2], splat);
+	out.simd[3] = _mm_add_ps(simd[3], splat);
+	return out;
+}
+
+//==============================================================================
+/// SSE in-place matrix + scalar
+template<>
+inline TMat4<F32>& TMat4<F32>::operator+=(const F32 f)
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	simd[0] = _mm_add_ps(simd[0], splat);
+	simd[1] = _mm_add_ps(simd[1], splat);
+	simd[2] = _mm_add_ps(simd[2], splat);
+	simd[3] = _mm_add_ps(simd[3], splat);
+	return (*this);
+}
+
+//==============================================================================
+/// SSE matrix - scalar: subtracts f from every element
+template<>
+inline TMat4<F32> TMat4<F32>::operator-(const F32 f) const
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	TMat4<F32> out;
+	out.simd[0] = _mm_sub_ps(simd[0], splat);
+	out.simd[1] = _mm_sub_ps(simd[1], splat);
+	out.simd[2] = _mm_sub_ps(simd[2], splat);
+	out.simd[3] = _mm_sub_ps(simd[3], splat);
+	return out;
+}
+
+//==============================================================================
+/// SSE in-place matrix - scalar
+template<>
+inline TMat4<F32>& TMat4<F32>::operator-=(const F32 f)
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	simd[0] = _mm_sub_ps(simd[0], splat);
+	simd[1] = _mm_sub_ps(simd[1], splat);
+	simd[2] = _mm_sub_ps(simd[2], splat);
+	simd[3] = _mm_sub_ps(simd[3], splat);
+	return *this;
+}
+
+//==============================================================================
+/// SSE matrix * scalar: multiplies every element by f
+template<>
+inline TMat4<F32> TMat4<F32>::operator*(const F32 f) const
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	TMat4<F32> out;
+	out.simd[0] = _mm_mul_ps(simd[0], splat);
+	out.simd[1] = _mm_mul_ps(simd[1], splat);
+	out.simd[2] = _mm_mul_ps(simd[2], splat);
+	out.simd[3] = _mm_mul_ps(simd[3], splat);
+	return out;
+}
+
+//==============================================================================
+/// SSE in-place matrix * scalar
+template<>
+inline TMat4<F32>& TMat4<F32>::operator*=(const F32 f)
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	simd[0] = _mm_mul_ps(simd[0], splat);
+	simd[1] = _mm_mul_ps(simd[1], splat);
+	simd[2] = _mm_mul_ps(simd[2], splat);
+	simd[3] = _mm_mul_ps(simd[3], splat);
+	return (*this);
+}
+
+//==============================================================================
+/// SSE matrix / scalar: divides every element by f
+template<>
+inline TMat4<F32> TMat4<F32>::operator/(const F32 f) const
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	TMat4<F32> out;
+	out.simd[0] = _mm_div_ps(simd[0], splat);
+	out.simd[1] = _mm_div_ps(simd[1], splat);
+	out.simd[2] = _mm_div_ps(simd[2], splat);
+	out.simd[3] = _mm_div_ps(simd[3], splat);
+	return out;
+}
+
+//==============================================================================
+/// SSE in-place matrix / scalar
+template<>
+inline TMat4<F32>& TMat4<F32>::operator/=(const F32 f)
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	simd[0] = _mm_div_ps(simd[0], splat);
+	simd[1] = _mm_div_ps(simd[1], splat);
+	simd[2] = _mm_div_ps(simd[2], splat);
+	simd[3] = _mm_div_ps(simd[3], splat);
+	return (*this);
+}
+
+//==============================================================================
+// Operators with other                                                        =
+//==============================================================================
+
+//==============================================================================
+/// SSE matrix * vector: component i of the result is the dot product of
+/// SIMD row i with b
+template<>
+inline TVec4<F32> TMat4<F32>::operator*(const TVec4<F32>& b) const
+{
+	TVec4<F32> out;
+
+	U row = 0;
+	while(row < 4)
+	{
+		const __m128 dot = _mm_dp_ps(simd[row], b.getSimd(), 0xF1);
+		_mm_store_ss(&out[row], dot);
+		++row;
+	}
+
+	return out;
+}
+
+//==============================================================================
+// Other                                                                       =
+//==============================================================================
+
+//==============================================================================
+/// SSE version of setRows: every row is taken from the SIMD value of the
+/// matching vector
+template<>
+inline void TMat4<F32>::setRows(const TVec4<F32>& a, const TVec4<F32>& b, 
+	const TVec4<F32>& c, const TVec4<F32>& d)
+{
+	const TVec4<F32>* const rows[4] = {&a, &b, &c, &d};
+	for(U i = 0; i < 4; i++)
+	{
+		simd[i] = rows[i]->getSimd();
+	}
+}
+
+//==============================================================================
+/// SSE version of setRow: row i is taken from the SIMD value of v
+template<>
+inline void TMat4<F32>::setRow(const U i, const TVec4<F32>& v)
+{
+	const __m128 row = v.getSimd();
+	simd[i] = row;
+}
+
+//==============================================================================
+template<>
+inline void TMat4<F32>::transpose()
+{
+	_MM_TRANSPOSE4_PS(simd[0], simd[1], simd[2], simd[3]); // In-place 4x4 transpose of the four SIMD rows
+}
+
+//==============================================================================
+// Friends                                                                     =
+//==============================================================================
+
+//==============================================================================
+/// SSE scalar - matrix: every element of the result is f minus the
+/// matching element of m4
+template<>
+inline TMat4<F32> operator-(const F32 f, const TMat4<F32>& m4)
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	TMat4<F32> out;
+	out.simd[0] = _mm_sub_ps(splat, m4.simd[0]);
+	out.simd[1] = _mm_sub_ps(splat, m4.simd[1]);
+	out.simd[2] = _mm_sub_ps(splat, m4.simd[2]);
+	out.simd[3] = _mm_sub_ps(splat, m4.simd[3]);
+	return out;
+}
+
+//==============================================================================
+/// SSE scalar / matrix: every element of the result is f divided by the
+/// matching element of m4
+template<>
+inline TMat4<F32> operator/(const F32 f, const TMat4<F32>& m4)
+{
+	const __m128 splat = _mm_set1_ps(f);
+
+	TMat4<F32> out;
+	out.simd[0] = _mm_div_ps(splat, m4.simd[0]);
+	out.simd[1] = _mm_div_ps(splat, m4.simd[1]);
+	out.simd[2] = _mm_div_ps(splat, m4.simd[2]);
+	out.simd[3] = _mm_div_ps(splat, m4.simd[3]);
+	return out;
+}
+
+#endif
+
 //==============================================================================
 // Constructors                                                                =
 //==============================================================================

+ 290 - 0
include/anki/math/Vec2.h

@@ -8,6 +8,296 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// 2D vector template
+template<typename T>
+class TVec2
+{
+public:
+	/// @name Constructors
+	/// @{
+
+	/// Does not assign the components
+	explicit TVec2()
+	{}
+
+	explicit TVec2(const T x_, const T y_)
+	{
+		vec.x = x_;
+		vec.y = y_;
+	}
+	
+	/// Set both components to f
+	explicit TVec2(const T f)
+	{
+		vec.x = f;
+		vec.y = f;
+	}
+
+	/// Read the components from the first two elements of arr
+	explicit TVec2(const T arr[])
+	{
+		vec.x = arr[0];
+		vec.y = arr[1];
+	}
+
+	TVec2(const TVec2& b)
+	{
+		vec.x = b.vec.x;
+		vec.y = b.vec.y;
+	}
+
+	/// Keep x and y of the 3D vector, drop z
+	explicit TVec2(const TVec3<T>& v3)
+	{
+		vec.x = v3.x();
+		vec.y = v3.y();
+	}
+
+	/// Keep x and y of the 4D vector, drop z and w
+	explicit TVec2(const TVec4<T>& v4)
+	{
+		vec.x = v4.x();
+		vec.y = v4.y();
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T& x()
+	{
+		return vec.x;
+	}
+
+	T x() const
+	{
+		return vec.x;
+	}
+
+	T& y()
+	{
+		return vec.y;
+	}
+
+	T y() const
+	{
+		return vec.y;
+	}
+
+	T& operator[](const U i)
+	{
+		return arr[i];
+	}
+
+	T operator[](const U i) const
+	{
+		return arr[i];
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// All arithmetic and comparison operators work per component
+	/// @{
+	TVec2& operator=(const TVec2& b)
+	{
+		vec.x = b.vec.x;
+		vec.y = b.vec.y;
+		return (*this);
+	}
+
+	TVec2 operator+(const TVec2& b) const
+	{
+		TVec2 out(*this);
+		out += b;
+		return out;
+	}
+
+	TVec2& operator+=(const TVec2& b)
+	{
+		vec.x += b.vec.x;
+		vec.y += b.vec.y;
+		return (*this);
+	}
+
+	TVec2 operator-(const TVec2& b) const
+	{
+		TVec2 out(*this);
+		out -= b;
+		return out;
+	}
+
+	TVec2& operator-=(const TVec2& b)
+	{
+		vec.x -= b.vec.x;
+		vec.y -= b.vec.y;
+		return (*this);
+	}
+
+	TVec2 operator*(const TVec2& b) const
+	{
+		TVec2 out(*this);
+		out *= b;
+		return out;
+	}
+
+	TVec2& operator*=(const TVec2& b)
+	{
+		vec.x *= b.vec.x;
+		vec.y *= b.vec.y;
+		return (*this);
+	}
+
+	TVec2 operator/(const TVec2& b) const
+	{
+		TVec2 out(*this);
+		out /= b;
+		return out;
+	}
+
+	TVec2& operator/=(const TVec2& b)
+	{
+		vec.x /= b.vec.x;
+		vec.y /= b.vec.y;
+		return (*this);
+	}
+
+	TVec2 operator-() const
+	{
+		return TVec2(-vec.x, -vec.y);
+	}
+
+	/// Equality within the tolerance of isZero<T>, not bit exact
+	Bool operator==(const TVec2& b) const
+	{
+		const T dx = x() - b.x();
+		const T dy = y() - b.y();
+		return isZero<T>(dx) && isZero<T>(dy);
+	}
+
+	Bool operator!=(const TVec2& b) const
+	{
+		return !operator==(b);
+	}
+
+	/// True only when both components are smaller
+	Bool operator<(const TVec2& b) const
+	{
+		return x() < b.x() && y() < b.y();
+	}
+
+	/// True only when both components are smaller or equal
+	Bool operator<=(const TVec2& b) const
+	{
+		return x() <= b.x() && y() <= b.y();
+	}
+
+	/// True only when both components are greater
+	Bool operator>(const TVec2& b) const
+	{
+		return x() > b.x() && y() > b.y();
+	}
+
+	/// True only when both components are greater or equal
+	Bool operator>=(const TVec2& b) const
+	{
+		return x() >= b.x() && y() >= b.y();
+	}
+	/// @}
+
+	/// @name Operators with T
+	/// The scalar is applied to both components
+	/// @{
+	TVec2 operator+(const T f) const
+	{
+		return TVec2(vec.x + f, vec.y + f);
+	}
+
+	TVec2& operator+=(const T f)
+	{
+		vec.x += f;
+		vec.y += f;
+		return (*this);
+	}
+
+	TVec2 operator-(const T f) const
+	{
+		return TVec2(vec.x - f, vec.y - f);
+	}
+
+	TVec2& operator-=(const T f)
+	{
+		vec.x -= f;
+		vec.y -= f;
+		return (*this);
+	}
+
+	TVec2 operator*(const T f) const
+	{
+		return TVec2(vec.x * f, vec.y * f);
+	}
+
+	TVec2& operator*=(const T f)
+	{
+		vec.x *= f;
+		vec.y *= f;
+		return (*this);
+	}
+
+	TVec2 operator/(const T f) const
+	{
+		return TVec2(vec.x / f, vec.y / f);
+	}
+
+	TVec2& operator/=(const T f)
+	{
+		vec.x /= f;
+		vec.y /= f;
+		return (*this);
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+	T getLengthSquared() const
+	{
+		return vec.x * vec.x + vec.y * vec.y;
+	}
+
+	T getLength() const
+	{
+		const T lengthSquared = getLengthSquared();
+		return sqrt(lengthSquared);
+	}
+
+	/// @return A copy divided by its length. The length is not checked
+	///         against zero
+	TVec2 getNormalized() const
+	{
+		const T length = getLength();
+		return TVec2(vec.x / length, vec.y / length);
+	}
+
+	/// Divide this vector by its length. The length is not checked against
+	/// zero
+	void normalize()
+	{
+		const T length = getLength();
+		vec.x /= length;
+		vec.y /= length;
+	}
+
+	T dot(const TVec2& b) const
+	{
+		return vec.x * b.vec.x + vec.y * b.vec.y;
+	}
+	/// @}
+
+	/// @name Friends
+	/// Operators that have the scalar on the left hand side, plus stream
+	/// output
+	/// @{
+	friend TVec2 operator+(const T f, const TVec2& v2)
+	{
+		return TVec2(v2.x() + f, v2.y() + f);
+	}
+
+	friend TVec2 operator-(const T f, const TVec2& v2)
+	{
+		const T dx = f - v2.x();
+		const T dy = f - v2.y();
+		return TVec2(dx, dy);
+	}
+
+	friend TVec2 operator*(const T f, const TVec2& v2)
+	{
+		return TVec2(v2.x() * f, v2.y() * f);
+	}
+
+	friend TVec2 operator/(const T f, const TVec2& v2)
+	{
+		const T qx = f / v2.x();
+		const T qy = f / v2.y();
+		return TVec2(qx, qy);
+	}
+
+	/// Print the two components separated by a single space
+	friend std::ostream& operator<<(std::ostream& s, const TVec2& v)
+	{
+		s << v.x();
+		s << ' ';
+		s << v.y();
+		return s;
+	}
+	/// @}
+
+private:
+	/// @name Data members
+	/// @{
+	union
+	{
+		struct
+		{
+			T x, y;
+		} vec; ///< Named access, used by x() and y()
+
+		Array<T, 2> arr; ///< Indexed access, used by operator[]
+	};
+	/// @}
+};
+
 /// 2D vector
 class Vec2
 {

+ 451 - 0
include/anki/math/Vec3.h

@@ -8,6 +8,457 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// 3D vector template. One of the most used classes
+template<typename T>
+class TVec3
+{
+public:
+	/// @name Constructors
+	/// @{
+
+	/// Does not assign the components
+	explicit TVec3()
+	{}
+
+	explicit TVec3(const T x_, const T y_, const T z_)
+	{
+		x() = x_;
+		y() = y_;
+		z() = z_;
+	}
+
+	/// Set all three components to f
+	explicit TVec3(const T f)
+	{
+		arr[0] = arr[1] = arr[2] = f;
+	}
+
+	/// Read the components from the first three elements of arr_
+	explicit TVec3(const T arr_[])
+	{
+		arr[0] = arr_[0];
+		arr[1] = arr_[1];
+		arr[2] = arr_[2];
+	}
+
+	/// x and y come from v2, z is given explicitly
+	explicit TVec3(const TVec2<T>& v2, const T z_)
+	{
+		x() = v2.x();
+		y() = v2.y();
+		z() = z_;
+	}
+
+	TVec3(const TVec3& b)
+	{
+		arr[0] = b.arr[0];
+		arr[1] = b.arr[1];
+		arr[2] = b.arr[2];
+	}
+
+	/// Keep the first three components of the 4D vector, drop the fourth
+	explicit TVec3(const TVec4<T>& v4)
+	{
+		arr[0] = v4[0];
+		arr[1] = v4[1];
+		arr[2] = v4[2];
+	}
+
+	/// Keep x, y and z of the quaternion, drop w
+	explicit TVec3(const TQuat<T>& q)
+	{
+		x() = q.x();
+		y() = q.y();
+		z() = q.z();
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T& x()
+	{
+		return vec.x;
+	}
+
+	T x() const
+	{
+		return vec.x;
+	}
+
+	T& y()
+	{
+		return vec.y;
+	}
+
+	T y() const
+	{
+		return vec.y;
+	}
+
+	T& z()
+	{
+		return vec.z;
+	}
+
+	T z() const
+	{
+		return vec.z;
+	}
+
+	T& operator[](const U i)
+	{
+		return arr[i];
+	}
+
+	T operator[](const U i) const
+	{
+		return arr[i];
+	}
+
+	/// @return A 2D vector made of x and y
+	TVec2<T> xy() const
+	{
+		return TVec2<T>(x(), y());
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// All arithmetic and comparison operators work per component
+	/// @{
+	TVec3& operator=(const TVec3& b)
+	{
+		arr[0] = b.arr[0];
+		arr[1] = b.arr[1];
+		arr[2] = b.arr[2];
+		return (*this);
+	}
+
+	TVec3 operator+(const TVec3& b) const
+	{
+		return TVec3(x() + b.x(), y() + b.y(), z() + b.z());
+	}
+
+	TVec3& operator+=(const TVec3& b)
+	{
+		x() += b.x();
+		y() += b.y();
+		z() += b.z();
+		return (*this);
+	}
+
+	TVec3 operator-(const TVec3& b) const
+	{
+		return TVec3(x() - b.x(), y() - b.y(), z() - b.z());
+	}
+
+	TVec3& operator-=(const TVec3& b)
+	{
+		x() -= b.x();
+		y() -= b.y();
+		z() -= b.z();
+		return (*this);
+	}
+
+	TVec3 operator*(const TVec3& b) const
+	{
+		return TVec3(x() * b.x(), y() * b.y(), z() * b.z());
+	}
+
+	TVec3& operator*=(const TVec3& b)
+	{
+		x() *= b.x();
+		y() *= b.y();
+		z() *= b.z();
+		return (*this);
+	}
+
+	TVec3 operator/(const TVec3& b) const
+	{
+		return TVec3(x() / b.x(), y() / b.y(), z() / b.z());
+	}
+
+	TVec3& operator/=(const TVec3& b)
+	{
+		x() /= b.x();
+		y() /= b.y();
+		z() /= b.z();
+		return (*this);
+	}
+
+	TVec3 operator-() const
+	{
+		return TVec3(-x(), -y(), -z());
+	}
+
+	/// Equality within the tolerance of isZero<T>, not bit exact
+	Bool operator==(const TVec3& b) const
+	{
+		return isZero<T>(x() - b.x()) 
+			&& isZero<T>(y() - b.y()) 
+			&& isZero<T>(z() - b.z());
+	}
+
+	Bool operator!=(const TVec3& b) const
+	{
+		return !operator==(b);
+	}
+
+	/// True only when all three components are smaller
+	Bool operator<(const TVec3& b) const
+	{
+		return x() < b.x() && y() < b.y() && z() < b.z();
+	}
+
+	/// True only when all three components are smaller or equal
+	Bool operator<=(const TVec3& b) const
+	{
+		return x() <= b.x() && y() <= b.y() && z() <= b.z();
+	}
+
+	/// True only when all three components are greater
+	Bool operator>(const TVec3& b) const
+	{
+		return x() > b.x() && y() > b.y() && z() > b.z();
+	}
+
+	/// True only when all three components are greater or equal
+	Bool operator>=(const TVec3& b) const
+	{
+		return x() >= b.x() && y() >= b.y() && z() >= b.z();
+	}
+	/// @}
+
+	/// @name Operators with T
+	/// The scalar is applied to all three components
+	/// @{
+	TVec3 operator+(const T f) const
+	{
+		return (*this) + TVec3(f);
+	}
+
+	TVec3& operator+=(const T f)
+	{
+		(*this) += TVec3(f);
+		return (*this);
+	}
+
+	TVec3 operator-(const T f) const
+	{
+		return (*this) - TVec3(f);
+	}
+
+	TVec3& operator-=(const T f)
+	{
+		(*this) -= TVec3(f);
+		return (*this);
+	}
+
+	TVec3 operator*(const T f) const
+	{
+		return (*this) * TVec3(f);
+	}
+
+	TVec3& operator*=(const T f)
+	{
+		(*this) *= TVec3(f);
+		return (*this);
+	}
+
+	TVec3 operator/(const T f) const
+	{
+		return (*this) / TVec3(f);
+	}
+
+	TVec3& operator/=(const T f)
+	{
+		(*this) /= TVec3(f);
+		return (*this);
+	}
+	/// @}
+
+	/// @name Operators with other types
+	/// @{
+
+	/// Not implemented yet: asserts and returns a vector built from 0.0
+	TVec3 operator*(const TMat3<T>& m3) const
+	{
+		ANKI_ASSERT(0 && "TODO");
+		return TVec3(0.0);
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+
+	/// 3 muls, 2 adds
+	T dot(const TVec3& b) const
+	{
+		return x() * b.x() + y() * b.y() + z() * b.z();
+	}
+
+	/// 6 muls, 3 adds
+	TVec3 cross(const TVec3& b) const
+	{
+		return TVec3(y() * b.z() - z() * b.y(),
+			z() * b.x() - x() * b.z(),
+			x() * b.y() - y() * b.x());
+	}
+
+	T getLengthSquared() const
+	{
+		return x() * x() + y() * y() + z() * z();
+	}
+
+	T getLength() const
+	{
+		return sqrt(getLengthSquared());
+	}
+
+	/// @return The squared length of (this - b)
+	T getDistanceSquared(const TVec3& b) const
+	{
+		return ((*this) - b).getLengthSquared();
+	}
+
+	/// Divide this vector by its length. The length is not checked against
+	/// zero
+	void normalize()
+	{
+		(*this) /= getLength();
+	}
+
+	/// @return A copy divided by its length. The length is not checked
+	///         against zero
+	TVec3 getNormalized() const
+	{
+		return (*this) / getLength();
+	}
+
+	/// @return The projection of this vector onto toThis
+	TVec3 getProjection(const TVec3& toThis) const
+	{
+		return toThis * ((*this).dot(toThis) / (toThis.dot(toThis)));
+	}
+
+	/// Returns q * this * q.conjugated(), i.e. this vector rotated by q.
+	/// 18 muls, 12 adds
+	/// @param q The rotation. It is asserted to have unit length
+	TVec3 getRotated(const TQuat<T>& q) const
+	{
+		// Not normalized quat. The explicit <T> matches the other isZero
+		// call sites; without it the argument type would be deduced from
+		// the expression, which is double because of the 1.0 literal
+		ANKI_ASSERT(isZero<T>(1.0 - q.getLength()));
+		TVec3 qXyz(q);
+		return 
+			(*this) + qXyz.cross(qXyz.cross((*this)) + (*this) * q.w()) * 2.0;
+	}
+
+	/// In-place version of getRotated
+	void rotate(const TQuat<T>& q)
+	{
+		(*this) = getRotated(q);
+	}
+
+	/// Return lerp(this, v1, t)
+	/// @param v1 The vector that is weighted by t
+	/// @param t The weight of v1; this vector is weighted by (1 - t)
+	TVec3 lerp(const TVec3& v1, T t) const
+	{
+		return ((*this) * (1.0 - t)) + (v1 * t);
+	}
+	/// @}
+
+	/// @name Transformations
+	/// The faster way is by far the TMat4 * TVec3 or the
+	/// getTransformed(const TVec3&, const TMat3&)
+	/// @{
+
+	/// Scale, then rotate, then translate
+	TVec3 getTransformed(const TVec3& translate, const TMat3<T>& rotate,
+		T scale) const
+	{
+		return (rotate * ((*this) * scale)) + translate;
+	}
+
+	void transform(const TVec3& translate, const TMat3<T>& rotate, T scale)
+	{
+		(*this) = getTransformed(translate, rotate, scale);
+	}
+
+	/// Rotate, then translate
+	TVec3 getTransformed(const TVec3& translate, const TMat3<T>& rotate) const
+	{
+		return (rotate * (*this)) + translate;
+	}
+
+	void transform(const TVec3& translate, const TMat3<T>& rotate)
+	{
+		(*this) = getTransformed(translate, rotate);
+	}
+
+	/// Scale, then rotate by the quaternion, then translate
+	TVec3 getTransformed(const TVec3& translate, const TQuat<T>& rotate,
+		T scale) const
+	{
+		return ((*this) * scale).getRotated(rotate) + translate;
+	}
+
+	void transform(const TVec3& translate, const TQuat<T>& rotate, T scale)
+	{
+		(*this) = getTransformed(translate, rotate, scale);
+	}
+
+	/// Rotate by the quaternion, then translate
+	TVec3 getTransformed(const TVec3& translate, const TQuat<T>& rotate) const
+	{
+		return getRotated(rotate) + translate;
+	}
+
+	void transform(const TVec3& translate, const TQuat<T>& rotate)
+	{
+		(*this) = getTransformed(translate, rotate);
+	}
+
+	/// 9 muls, 9 adds. Uses only the first three rows of the matrix and
+	/// adds the last column as it is
+	TVec3 getTransformed(const TMat4<T>& transform) const
+	{
+		return TVec3(
+			transform(0, 0) * x() + transform(0, 1) * y() 
+			+ transform(0, 2) * z() + transform(0, 3),
+			transform(1, 0) * x() + transform(1, 1) * y() 
+			+ transform(1, 2) * z() + transform(1, 3),
+			transform(2, 0) * x() + transform(2, 1) * y() 
+			+ transform(2, 2) * z() + transform(2, 3));
+	}
+
+	void transform(const TMat4<T>& transform)
+	{
+		(*this) = getTransformed(transform);
+	}
+
+	/// 12 muls, 9 adds. Scale, then rotate, then add the origin of the
+	/// transform
+	TVec3 getTransformed(const TTransform<T>& transform) const
+	{
+		return (transform.getRotation() * ((*this) * transform.getScale())) 
+			+ transform.getOrigin();
+	}
+
+	void transform(const TTransform<T>& transform)
+	{
+		(*this) = getTransformed(transform);
+	}
+	/// @}
+
+	/// @name Friends
+	/// Operators that have the scalar on the left hand side, plus stream
+	/// output
+	/// @{
+	friend TVec3 operator+(const T f, const TVec3& v)
+	{
+		return v + f;
+	}
+
+	friend TVec3 operator-(const T f, const TVec3& v)
+	{
+		return TVec3(f) - v;
+	}
+
+	friend TVec3 operator*(const T f, const TVec3& v)
+	{
+		return v * f;
+	}
+
+	friend TVec3 operator/(const T f, const TVec3& v)
+	{
+		return TVec3(f) / v;
+	}
+
+	/// Print the three components separated by single spaces
+	friend std::ostream& operator<<(std::ostream& s, const TVec3& v)
+	{
+		s << v.x() << ' ' << v.y() << ' ' << v.z();
+		return s;
+	}
+	/// @}
+
+private:
+	/// @name Data
+	/// @{
+	union
+	{
+		struct
+		{
+			T x, y, z;
+		} vec; ///< Named access, used by x(), y() and z()
+
+		Array<T, 3> arr; ///< Indexed access, used by operator[]
+	};
+	/// @}
+};
+
 /// 3D vector. One of the most used classes
 class Vec3
 {

+ 15 - 0
include/anki/math/Vec3.inl.h

@@ -2,6 +2,21 @@
 
 namespace anki {
 
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+/*template<>
+TVec3<F32> TVec3<F32>::getTransformed(const TMat4<F32>& transform) const
+{
+	TVec3<F32> out;
+	TVec4<F32> v4((*this), 1.0);
+	for(U i = 0; i < 3; i++)
+	{
+		_mm_store_ss(
+			&out[i], _mm_dp_ps(transform.getSimd(i), v4.getSimd(), 0xF1));
+	}
+	return out;
+}*/
+#endif
+
 //==============================================================================
 // Constructors                                                                =
 //==============================================================================

+ 444 - 0
include/anki/math/Vec4.h

@@ -8,6 +8,450 @@ namespace anki {
 /// @addtogroup Math
 /// @{
 
+/// Trait that maps a component type to the type TVec4 uses for its SIMD view.
+/// The primary template falls back to a plain array of four components
+template<typename T>
+struct TVec4Simd
+{
+	using Type = Array<T, 4>;
+};
+
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+// F32 specialization: the SIMD view is the SSE __m128 register type
+template<>
+struct TVec4Simd<F32>
+{
+	using Type = __m128;
+};
+#endif
+
+/// 4D vector. SIMD optimized
+///
+/// The four components live in a union that can be accessed as named fields
+/// (x, y, z, w), as an Array<T, 4> or as the TVec4Simd<T>::Type SIMD type.
+/// The member functions defined in this class body are the generic
+/// per-component implementations
+template<typename T>
+ANKI_ATTRIBUTE_ALIGNED(class, 16) TVec4
+{
+public:
+	/// The SIMD view type selected by TVec4Simd for this component type
+	typedef typename TVec4Simd<T>::Type Simd;
+
+	/// @name Constructors
+	/// @{
+
+	/// Does not initialize the components
+	explicit TVec4()
+	{}
+
+	/// Set each component individually
+	explicit TVec4(const T x_, const T y_, const T z_, const T w_)
+	{
+		x() = x_;
+		y() = y_;
+		z() = z_;
+		w() = w_;
+	}
+
+	/// Set all four components to @a f
+	explicit TVec4(const T f)
+	{
+		arr[0] = arr[1] = arr[2] = arr[3] = f;
+	}
+
+	/// Copy the first four elements of @a arr_
+	explicit TVec4(const T arr_[])
+	{
+		arr[0] = arr_[0];
+		arr[1] = arr_[1];
+		arr[2] = arr_[2];
+		arr[3] = arr_[3];
+	}
+
+	/// x and y come from @a v2, z and w are given explicitly
+	explicit TVec4(const TVec2<T>& v2, const T z_, const T w_)
+	{
+		x() = v2.x();
+		y() = v2.y();
+		z() = z_;
+		w() = w_;
+	}
+
+	/// x and y come from @a av2, z and w come from @a bv2
+	explicit TVec4(const TVec2<T>& av2, const TVec2<T>& bv2)
+	{
+		x() = av2.x();
+		y() = av2.y();
+		z() = bv2.x();
+		w() = bv2.y();
+	}
+
+	/// x, y and z come from @a v3, w is given explicitly
+	explicit TVec4(const TVec3<T>& v3, const T w_)
+	{
+		x() = v3.x();
+		y() = v3.y();
+		z() = v3.z();
+		w() = w_;
+	}
+
+	/// Copy
+	TVec4(const TVec4& b)
+	{
+		x() = b.x();
+		y() = b.y();
+		z() = b.z();
+		w() = b.w();
+	}
+
+	/// Copy the four components of a quaternion in x, y, z, w order
+	explicit TVec4(const TQuat<T>& q)
+	{
+		x() = q.x();
+		y() = q.y();
+		z() = q.z();
+		w() = q.w();
+	}
+
+	/// Build directly from a value of the SIMD view type
+	explicit TVec4(const Simd& simd_)
+	{
+		simd = simd_;
+	}
+	/// @}
+
+	/// @name Accessors
+	/// @{
+	T& x()
+	{
+		return vec.x;
+	}
+
+	T x() const
+	{
+		return vec.x;
+	}
+
+	T& y()
+	{
+		return vec.y;
+	}
+
+	T y() const
+	{
+		return vec.y;
+	}
+
+	T& z()
+	{
+		return vec.z;
+	}
+
+	T z() const
+	{
+		return vec.z;
+	}
+
+	T& w()
+	{
+		return vec.w;
+	}
+
+	T w() const
+	{
+		return vec.w;
+	}
+
+	/// Indexed access through the array view. @a i is not range checked here
+	T& operator[](const U i)
+	{
+		return arr[i];
+	}
+
+	T operator[](const U i) const
+	{
+		return arr[i];
+	}
+
+	/// Access the SIMD view of the components
+	Simd& getSimd()
+	{
+		return simd;
+	}
+
+	const Simd& getSimd() const
+	{
+		return simd;
+	}
+
+	/// Copy of the first two components
+	TVec2<T> xy() const
+	{
+		return TVec2<T>(x(), y());
+	}
+
+	/// Copy of the first three components
+	TVec3<T> xyz() const
+	{
+		return TVec3<T>(x(), y(), z());
+	}
+	/// @}
+
+	/// @name Operators with same type
+	/// All arithmetic operators work per component
+	/// @{
+	TVec4& operator=(const TVec4& b)
+	{
+		x() = b.x();
+		y() = b.y();
+		z() = b.z();
+		w() = b.w();
+		return (*this);
+	}
+
+	TVec4 operator+(const TVec4& b) const
+	{
+		return TVec4(x() + b.x(), y() + b.y(), z() + b.z(), w() + b.w());
+	}
+
+	TVec4& operator+=(const TVec4& b)
+	{
+		x() += b.x();
+		y() += b.y();
+		z() += b.z();
+		w() += b.w();
+		return (*this);
+	}
+
+	TVec4 operator-(const TVec4& b) const
+	{
+		return TVec4(x() - b.x(), y() - b.y(), z() - b.z(), w() - b.w());
+	}
+
+	TVec4& operator-=(const TVec4& b)
+	{
+		x() -= b.x();
+		y() -= b.y();
+		z() -= b.z();
+		w() -= b.w();
+		return (*this);
+	}
+
+	TVec4 operator*(const TVec4& b) const
+	{
+		return TVec4(x() * b.x(), y() * b.y(), z() * b.z(), w() * b.w());
+	}
+
+	TVec4& operator*=(const TVec4& b)
+	{
+		x() *= b.x();
+		y() *= b.y();
+		z() *= b.z();
+		w() *= b.w();
+		return (*this);
+	}
+
+	TVec4 operator/(const TVec4& b) const
+	{
+		return TVec4(x() / b.x(), y() / b.y(), z() / b.z(), w() / b.w());
+	}
+
+	TVec4& operator/=(const TVec4& b)
+	{
+		x() /= b.x();
+		y() /= b.y();
+		z() /= b.z();
+		w() /= b.w();
+		return (*this);
+	}
+
+	TVec4 operator-() const
+	{
+		return TVec4(-x(), -y(), -z(), -w());
+	}
+
+	/// Tolerance based equality: true when every component of the
+	/// difference passes isZero<T>()
+	Bool operator==(const TVec4& b) const
+	{
+		TVec4 sub = (*this) - b;
+		return isZero<T>(sub.x()) 
+			&& isZero<T>(sub.y()) 
+			&& isZero<T>(sub.z()) 
+			&& isZero<T>(sub.w());
+	}
+
+	Bool operator!=(const TVec4& b) const
+	{
+		return !operator==(b);
+	}
+
+	/// The ordering operators are true only when the relation holds for all
+	/// four components, so two vectors may be neither <, > nor ==
+	Bool operator<(const TVec4& b) const
+	{
+		return x() < b.x() && y() < b.y() && z() < b.z() && w() < b.w();
+	}
+
+	Bool operator<=(const TVec4& b) const
+	{
+		return x() <= b.x() && y() <= b.y() && z() <= b.z() && w() <= b.w();
+	}
+
+	Bool operator>(const TVec4& b) const
+	{
+		return x() > b.x() && y() > b.y() && z() > b.z() && w() > b.w();
+	}
+
+	Bool operator>=(const TVec4& b) const
+	{
+		return x() >= b.x() && y() >= b.y() && z() >= b.z() && w() >= b.w();
+	}
+	/// @}
+
+	/// @name Operators with T
+	/// The scalar is expanded with TVec4(f) and the same-type operator is
+	/// used
+	/// @{
+	TVec4 operator+(const T f) const
+	{
+		return (*this) + TVec4(f);
+	}
+
+	TVec4& operator+=(const T f)
+	{
+		(*this) += TVec4(f);
+		return (*this);
+	}
+
+	TVec4 operator-(const T f) const
+	{
+		return (*this) - TVec4(f);
+	}
+
+	TVec4& operator-=(const T f)
+	{
+		(*this) -= TVec4(f);
+		return (*this);
+	}
+
+	TVec4 operator*(const T f) const
+	{
+		return (*this) * TVec4(f);
+	}
+
+	TVec4& operator*=(const T f)
+	{
+		(*this) *= TVec4(f);
+		return (*this);
+	}
+
+	TVec4 operator/(const T f) const
+	{
+		return (*this) / TVec4(f);
+	}
+
+	TVec4& operator/=(const T f)
+	{
+		(*this) /= TVec4(f);
+		return (*this);
+	}
+	/// @}
+
+	/// @name Operators with other
+	/// @{
+
+	/// Vector on the left of a matrix: component j of the result is the sum
+	/// over i of (*this)[i] * m4(i, j)
+	TVec4 operator*(const TMat4<T>& m4) const
+	{
+		return TVec4(
+			x() * m4(0, 0) + y() * m4(1, 0) + z() * m4(2, 0) + w() * m4(3, 0),
+			x() * m4(0, 1) + y() * m4(1, 1) + z() * m4(2, 1) + w() * m4(3, 1),
+			x() * m4(0, 2) + y() * m4(1, 2) + z() * m4(2, 2) + w() * m4(3, 2),
+			x() * m4(0, 3) + y() * m4(1, 3) + z() * m4(2, 3) + w() * m4(3, 3));
+	}
+	/// @}
+
+	/// @name Other
+	/// @{
+
+	/// Square root of the dot product of the vector with itself
+	T getLength() const
+	{
+		return sqrt(dot((*this)));
+	}
+
+	/// Copy divided by getLength(). A zero length is not checked for
+	TVec4 getNormalized() const
+	{
+		return (*this) / getLength();
+	}
+
+	/// In-place division by getLength(). A zero length is not checked for
+	void normalize()
+	{
+		(*this) /= getLength();
+	}
+
+	/// Four component dot product
+	T dot(const TVec4& b) const
+	{
+		return x() * b.x() + y() * b.y() + z() * b.z() + w() * b.w();
+	}
+	/// @}
+
+	/// @name Friends
+	/// Scalar-on-the-left operators. The return type is spelled TVec4<Y>
+	/// because a bare TVec4 inside this class means TVec4<T>, which would
+	/// declare templates different from the ones defined at namespace scope
+	/// @{
+	template<typename Y>
+	friend TVec4<Y> operator+(const Y f, const TVec4<Y>& v4);
+
+	template<typename Y>
+	friend TVec4<Y> operator-(const Y f, const TVec4<Y>& v4);
+
+	template<typename Y>
+	friend TVec4<Y> operator*(const Y f, const TVec4<Y>& v4);
+
+	template<typename Y>
+	friend TVec4<Y> operator/(const Y f, const TVec4<Y>& v4);
+	/// @}
+
+private:
+	/// @name Data
+	/// Three views of the same four components
+	/// @{
+	union
+	{
+		struct
+		{
+			T x, y, z, w;
+		} vec;
+
+		Array<T, 4> arr;
+
+		Simd simd;
+	};
+	/// @}
+};
+
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+
+// Declare the TVec4<F32> member specializations (only in the SSE build) so the generic versions are not instantiated for F32
+
+template<>
+TVec4<F32>::TVec4(F32 f);
+
+template<>
+TVec4<F32>::TVec4(const F32 arr_[]);
+
+template<>
+TVec4<F32>::TVec4(const F32 x_, const F32 y_, const F32 z_, const F32 w_);
+
+template<>
+TVec4<F32>::TVec4(const TVec4<F32>& b);
+
+template<>
+TVec4<F32>& TVec4<F32>::operator=(const TVec4<F32>& b);
+
+template<>
+TVec4<F32> TVec4<F32>::operator+(const TVec4<F32>& b) const;
+
+template<>
+TVec4<F32>& TVec4<F32>::operator+=(const TVec4<F32>& b);
+
+template<>
+TVec4<F32> TVec4<F32>::operator-(const TVec4<F32>& b) const;
+
+template<>
+TVec4<F32>& TVec4<F32>::operator-=(const TVec4<F32>& b);
+
+template<>
+TVec4<F32> TVec4<F32>::operator*(const TVec4<F32>& b) const;
+
+template<>
+TVec4<F32>& TVec4<F32>::operator*=(const TVec4<F32>& b);
+
+template<>
+TVec4<F32> TVec4<F32>::operator/(const TVec4<F32>& b) const;
+
+template<>
+TVec4<F32>& TVec4<F32>::operator/=(const TVec4<F32>& b);
+
+template<>
+F32 TVec4<F32>::dot(const TVec4<F32>& b) const;
+
+template<>
+TVec4<F32> TVec4<F32>::getNormalized() const;
+
+template<>
+void TVec4<F32>::normalize();
+
+#endif
+
+
 /// 4D vector. SIMD optimized
 ANKI_ATTRIBUTE_ALIGNED(class, 16) Vec4
 {

+ 173 - 0
include/anki/math/Vec4.inl.h

@@ -2,6 +2,179 @@
 
 namespace anki {
 
+//==============================================================================
+// Friends                                                                     =
+//==============================================================================
+
+//==============================================================================
+/// Scalar on the left of '+'. Forwards to the vector + scalar member
+template<typename T>
+TVec4<T> operator+(const T f, const TVec4<T>& v4)
+{
+	return v4.operator+(f);
+}
+
+//==============================================================================
+/// Scalar on the left of '-'. The scalar is expanded to a vector first
+template<typename T>
+TVec4<T> operator-(const T f, const TVec4<T>& v4)
+{
+	TVec4<T> minuend(f);
+	return minuend - v4;
+}
+
+//==============================================================================
+/// Scalar on the left of '*'. Forwards to the vector * scalar member
+template<typename T>
+TVec4<T> operator*(const T f, const TVec4<T>& v4)
+{
+	return v4.operator*(f);
+}
+
+//==============================================================================
+/// Scalar on the left of '/'. The scalar is expanded to a vector first
+template<typename T>
+TVec4<T> operator/(const T f, const TVec4<T>& v4)
+{
+	TVec4<T> numerator(f);
+	return numerator / v4;
+}
+
+#if ANKI_MATH_SIMD == ANKI_MATH_SIMD_SSE
+
+//==============================================================================
+// SSE specializations                                                         =
+//==============================================================================
+
+//==============================================================================
+// Constructors                                                                =
+//==============================================================================
+
+//==============================================================================
+/// SSE version: broadcast @a f to all four lanes
+template<>
+inline TVec4<F32>::TVec4(F32 f)
+	: simd(_mm_set1_ps(f))
+{}
+
+//==============================================================================
+/// SSE version: load four consecutive floats starting at @a arr_.
+/// The unaligned load is used because, like the generic constructor, this
+/// one accepts any F32 pointer; _mm_load_ps would require @a arr_ to be
+/// 16 byte aligned
+template<>
+inline TVec4<F32>::TVec4(const F32 arr_[])
+{
+	simd = _mm_loadu_ps(arr_);
+}
+
+//==============================================================================
+/// SSE version. _mm_set_ps takes its arguments from the highest lane to the
+/// lowest, hence the reversed order
+template<>
+inline TVec4<F32>::TVec4(const F32 x_, const F32 y_, const F32 z_, const F32 w_)
+	: simd(_mm_set_ps(w_, z_, y_, x_))
+{}
+
+//==============================================================================
+/// SSE version: copy the whole register at once
+template<>
+inline TVec4<F32>::TVec4(const TVec4<F32>& b)
+	: simd(b.simd)
+{}
+
+//==============================================================================
+// Operators with same                                                         =
+//==============================================================================
+
+//==============================================================================
+/// SSE version: assign the whole register at once
+template<>
+inline TVec4<F32>& TVec4<F32>::operator=(const TVec4<F32>& b)
+{
+	this->simd = b.simd;
+	return *this;
+}
+
+//==============================================================================
+/// SSE version: four lane addition
+template<>
+inline TVec4<F32> TVec4<F32>::operator+(const TVec4<F32>& b) const
+{
+	const __m128 sum = _mm_add_ps(simd, b.simd);
+	return TVec4<F32>(sum);
+}
+
+//==============================================================================
+/// SSE version: four lane addition in place
+template<>
+inline TVec4<F32>& TVec4<F32>::operator+=(const TVec4<F32>& b)
+{
+	const __m128 sum = _mm_add_ps(simd, b.simd);
+	simd = sum;
+	return *this;
+}
+
+//==============================================================================
+/// SSE version: four lane subtraction
+template<>
+inline TVec4<F32> TVec4<F32>::operator-(const TVec4<F32>& b) const
+{
+	const __m128 difference = _mm_sub_ps(simd, b.simd);
+	return TVec4<F32>(difference);
+}
+
+//==============================================================================
+/// SSE version: four lane subtraction in place
+template<>
+inline TVec4<F32>& TVec4<F32>::operator-=(const TVec4<F32>& b)
+{
+	const __m128 difference = _mm_sub_ps(simd, b.simd);
+	simd = difference;
+	return *this;
+}
+
+//==============================================================================
+/// SSE version: four lane multiplication
+template<>
+inline TVec4<F32> TVec4<F32>::operator*(const TVec4<F32>& b) const
+{
+	const __m128 product = _mm_mul_ps(simd, b.simd);
+	return TVec4<F32>(product);
+}
+
+//==============================================================================
+/// SSE version: four lane multiplication in place
+template<>
+inline TVec4<F32>& TVec4<F32>::operator*=(const TVec4<F32>& b)
+{
+	const __m128 product = _mm_mul_ps(simd, b.simd);
+	simd = product;
+	return *this;
+}
+
+//==============================================================================
+/// SSE version: four lane division
+template<>
+inline TVec4<F32> TVec4<F32>::operator/(const TVec4<F32>& b) const
+{
+	const __m128 quotient = _mm_div_ps(simd, b.simd);
+	return TVec4<F32>(quotient);
+}
+
+//==============================================================================
+/// SSE version: four lane division in place
+template<>
+inline TVec4<F32>& TVec4<F32>::operator/=(const TVec4<F32>& b)
+{
+	const __m128 quotient = _mm_div_ps(simd, b.simd);
+	simd = quotient;
+	return *this;
+}
+
+//==============================================================================
+// Other                                                                       =
+//==============================================================================
+
+//==============================================================================
+/// SSE version. Mask 0xF1: all four products take part in the sum and the
+/// sum is written to the lowest lane, which is then returned
+template<>
+inline F32 TVec4<F32>::dot(const TVec4<F32>& b) const
+{
+	const __m128 dp = _mm_dp_ps(simd, b.simd, 0xF1);
+	return _mm_cvtss_f32(dp);
+}
+
+//==============================================================================
+/// SSE version. Mask 0xFF puts the squared length in every lane. The scale
+/// factor comes from _mm_rsqrt_ps, which is an approximate reciprocal
+/// square root, so the result is not bit-identical to the generic version
+template<>
+inline TVec4<F32> TVec4<F32>::getNormalized() const
+{
+	const __m128 lengthSquared = _mm_dp_ps(simd, simd, 0xFF);
+	const __m128 inverseLength = _mm_rsqrt_ps(lengthSquared);
+	return TVec4<F32>(_mm_mul_ps(simd, inverseLength));
+}
+
+//==============================================================================
+/// SSE version, in place. Same computation as getNormalized(): squared
+/// length in every lane, approximate reciprocal square root, multiply
+template<>
+inline void TVec4<F32>::normalize()
+{
+	const __m128 lengthSquared = _mm_dp_ps(simd, simd, 0xFF);
+	const __m128 inverseLength = _mm_rsqrt_ps(lengthSquared);
+	simd = _mm_mul_ps(simd, inverseLength);
+}
+
+#endif
+
 //==============================================================================
 // Constructors                                                                =
 //==============================================================================

+ 24 - 0
include/anki/scene/Light.h

@@ -10,6 +10,30 @@
 
 namespace anki {
 
+/// Data holder for a group of flares: one texture resource, per-flare sizes
+/// and stretch multipliers, and a single alpha value.
+/// NOTE(review): the class has no member functions yet, so how these members
+/// are filled and consumed cannot be told from here
+class FlareBatch
+{
+public:
+	/// Length of the per-flare arrays
+	static constexpr U MAX_FLARES = 10;
+
+	/// Flag bits, one bit per enumerator
+	enum FlareFlag
+	{
+		POSITION_LIGHT = 1 << 0,
+		POSITION_FLOATING = 1 << 1
+	};
+
+private:
+	/// A 2D array texture with the flare textures
+	TextureResourcePointer flaresTex;
+
+	/// The size of each flare
+	Array<Vec2, MAX_FLARES> size;
+
+	/// One multiplier per flare.
+	/// NOTE(review): presumably scales the flare size, confirm when used
+	Array<Vec2, MAX_FLARES> stretchMultiplier;
+
+	/// Defaults to fully opaque (1.0)
+	F32 flaresAlpha = 1.0f;
+};
+
 /// Light scene node. It can be spot or point
 ///
 /// Explaining the lighting model:

+ 1 - 1
src/renderer/Is.cpp

@@ -855,7 +855,7 @@ void Is::lightPass()
 	lightPassProg->findUniformVariable("shadowMapArr").set(sm.sm2DArrayTex);
 
 	quadVao.bind();
-	glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 6, TILES_COUNT);
+	glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, TILES_COUNT);
 }
 
 //==============================================================================

+ 2 - 2
src/renderer/Renderer.cpp

@@ -104,14 +104,14 @@ void Renderer::render(SceneGraph& scene_)
 void Renderer::drawQuad()
 {
 	quadVao.bind();
-	glDrawArrays(GL_TRIANGLE_STRIP, 0, 6);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
 }
 
 //==============================================================================
 void Renderer::drawQuadInstanced(U32 primitiveCount)
 {
 	quadVao.bind();
-	glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 6, primitiveCount);
+	glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, primitiveCount);
 }
 
 //==============================================================================