浏览代码

SSE optimize Urho3D/Math/Matrix3x4.h and Urho3D/Math/Matrix4.h. These yield a ~3% improvement in 06_SkeletalAnimation sample with 2000 Jacks in it.

Jukka Jylänki 10 年之前
父节点
当前提交
0107a0bf75
共有 2 个文件被更改,包括 259 次插入16 次删除
  1. 124 8
      Source/Urho3D/Math/Matrix3x4.h
  2. 135 8
      Source/Urho3D/Math/Matrix4.h

+ 124 - 8
Source/Urho3D/Math/Matrix3x4.h

@@ -36,8 +36,9 @@ class URHO3D_API Matrix3x4
 {
 public:
     /// Construct an identity matrix.
-    Matrix3x4() :
-        m00_(1.0f),
+    Matrix3x4()
+#ifndef URHO3D_SSE
+       :m00_(1.0f),
         m01_(0.0f),
         m02_(0.0f),
         m03_(0.0f),
@@ -49,12 +50,19 @@ public:
         m21_(0.0f),
         m22_(1.0f),
         m23_(0.0f)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_set_ps(0.f, 0.f, 0.f, 1.f));
+        _mm_storeu_ps(&m10_, _mm_set_ps(0.f, 0.f, 1.f, 0.f));
+        _mm_storeu_ps(&m20_, _mm_set_ps(0.f, 1.f, 0.f, 0.f));
+#endif
     }
 
     /// Copy-construct from another matrix.
-    Matrix3x4(const Matrix3x4& matrix) :
-        m00_(matrix.m00_),
+    Matrix3x4(const Matrix3x4& matrix)
+#ifndef URHO3D_SSE
+       :m00_(matrix.m00_),
         m01_(matrix.m01_),
         m02_(matrix.m02_),
         m03_(matrix.m03_),
@@ -66,7 +74,13 @@ public:
         m21_(matrix.m21_),
         m22_(matrix.m22_),
         m23_(matrix.m23_)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&matrix.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&matrix.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&matrix.m20_));
+#endif
     }
 
     /// Copy-construct from a 3x3 matrix and set the extra elements to identity.
@@ -87,8 +101,9 @@ public:
     }
 
     /// Copy-construct from a 4x4 matrix which is assumed to contain no projection.
-    Matrix3x4(const Matrix4& matrix) :
-        m00_(matrix.m00_),
+    Matrix3x4(const Matrix4& matrix)
+#ifndef URHO3D_SSE
+       :m00_(matrix.m00_),
         m01_(matrix.m01_),
         m02_(matrix.m02_),
         m03_(matrix.m03_),
@@ -100,7 +115,13 @@ public:
         m21_(matrix.m21_),
         m22_(matrix.m22_),
         m23_(matrix.m23_)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&matrix.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&matrix.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&matrix.m20_));
+#endif
     }
 
     // Construct from values.
@@ -123,8 +144,9 @@ public:
     }
 
     /// Construct from a float array.
-    Matrix3x4(const float* data) :
-        m00_(data[0]),
+    Matrix3x4(const float* data)
+#ifndef URHO3D_SSE
+       :m00_(data[0]),
         m01_(data[1]),
         m02_(data[2]),
         m03_(data[3]),
@@ -136,7 +158,13 @@ public:
         m21_(data[9]),
         m22_(data[10]),
         m23_(data[11])
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(data));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(data + 4));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(data + 8));
+#endif
     }
 
     /// Construct from translation, rotation and uniform scale.
@@ -147,6 +175,11 @@ public:
     /// Assign from another matrix.
     Matrix3x4& operator =(const Matrix3x4& rhs)
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&rhs.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&rhs.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&rhs.m20_));
+#else
         m00_ = rhs.m00_;
         m01_ = rhs.m01_;
         m02_ = rhs.m02_;
@@ -159,6 +192,7 @@ public:
         m21_ = rhs.m21_;
         m22_ = rhs.m22_;
         m23_ = rhs.m23_;
+#endif
         return *this;
     }
 
@@ -183,6 +217,11 @@ public:
     /// Assign from a 4x4 matrix which is assumed to contain no projection.
     Matrix3x4& operator =(const Matrix4& rhs)
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&rhs.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&rhs.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&rhs.m20_));
+#else
         m00_ = rhs.m00_;
         m01_ = rhs.m01_;
         m02_ = rhs.m02_;
@@ -195,12 +234,23 @@ public:
         m21_ = rhs.m21_;
         m22_ = rhs.m22_;
         m23_ = rhs.m23_;
+#endif
         return *this;
     }
 
     /// Test for equality with another matrix without epsilon.
     bool operator ==(const Matrix3x4& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 c0 = _mm_cmpeq_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_));
+        __m128 c1 = _mm_cmpeq_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_));
+        c0 = _mm_and_ps(c0, c1);
+        __m128 c2 = _mm_cmpeq_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_));
+        c0 = _mm_and_ps(c0, c2);
+        __m128 hi = _mm_movehl_ps(c0, c0);
+        c0 = _mm_and_ps(c0, hi);
+        return _mm_cvtsi128_si64(_mm_castps_si128(c0)) == -1LL;
+#else
         const float* leftData = Data();
         const float* rightData = rhs.Data();
 
@@ -211,6 +261,7 @@ public:
         }
 
         return true;
+#endif
     }
 
     /// Test for inequality with another matrix without epsilon.
@@ -219,26 +270,63 @@ public:
     /// Multiply a Vector3 which is assumed to represent position.
     Vector3 operator *(const Vector3& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
+        __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
+        __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
+        __m128 r3 = _mm_setzero_ps();
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+
+        return Vector3(
+            _mm_cvtss_f32(vec),
+            _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))),
+            _mm_cvtss_f32(_mm_movehl_ps(vec, vec)));
+#else
         return Vector3(
             (m00_ * rhs.x_ + m01_ * rhs.y_ + m02_ * rhs.z_ + m03_),
             (m10_ * rhs.x_ + m11_ * rhs.y_ + m12_ * rhs.z_ + m13_),
             (m20_ * rhs.x_ + m21_ * rhs.y_ + m22_ * rhs.z_ + m23_)
         );
+#endif
     }
 
     /// Multiply a Vector4.
     Vector3 operator *(const Vector4& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 vec = _mm_loadu_ps(&rhs.x_);
+        __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
+        __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
+        __m128 r3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f);
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+
+        return Vector3(
+            _mm_cvtss_f32(vec),
+            _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))),
+            _mm_cvtss_f32(_mm_movehl_ps(vec, vec)));
+#else
         return Vector3(
             (m00_ * rhs.x_ + m01_ * rhs.y_ + m02_ * rhs.z_ + m03_ * rhs.w_),
             (m10_ * rhs.x_ + m11_ * rhs.y_ + m12_ * rhs.z_ + m13_ * rhs.w_),
             (m20_ * rhs.x_ + m21_ * rhs.y_ + m22_ * rhs.z_ + m23_ * rhs.w_)
         );
+#endif
     }
 
     /// Add a matrix.
     Matrix3x4 operator +(const Matrix3x4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix3x4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_add_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_)));
+        _mm_storeu_ps(&ret.m10_, _mm_add_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_)));
+        _mm_storeu_ps(&ret.m20_, _mm_add_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_)));
+        return ret;
+#else
         return Matrix3x4(
             m00_ + rhs.m00_,
             m01_ + rhs.m01_,
@@ -253,11 +341,19 @@ public:
             m22_ + rhs.m22_,
             m23_ + rhs.m23_
         );
+#endif
     }
 
     /// Subtract a matrix.
     Matrix3x4 operator -(const Matrix3x4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix3x4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_sub_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_)));
+        _mm_storeu_ps(&ret.m10_, _mm_sub_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_)));
+        _mm_storeu_ps(&ret.m20_, _mm_sub_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_)));
+        return ret;
+#else
         return Matrix3x4(
             m00_ - rhs.m00_,
             m01_ - rhs.m01_,
@@ -272,11 +368,20 @@ public:
             m22_ - rhs.m22_,
             m23_ - rhs.m23_
         );
+#endif
     }
 
     /// Multiply with a scalar.
     Matrix3x4 operator *(float rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix3x4 ret;
+        const __m128 mul = _mm_set1_ps(rhs);
+        _mm_storeu_ps(&ret.m00_, _mm_mul_ps(_mm_loadu_ps(&m00_), mul));
+        _mm_storeu_ps(&ret.m10_, _mm_mul_ps(_mm_loadu_ps(&m10_), mul));
+        _mm_storeu_ps(&ret.m20_, _mm_mul_ps(_mm_loadu_ps(&m20_), mul));
+        return ret;
+#else
         return Matrix3x4(
             m00_ * rhs,
             m01_ * rhs,
@@ -291,6 +396,7 @@ public:
             m22_ * rhs,
             m23_ * rhs
         );
+#endif
     }
 
     /// Multiply a matrix.
@@ -458,6 +564,15 @@ public:
     /// Convert to a 4x4 matrix by filling in an identity last row.
     Matrix4 ToMatrix4() const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_loadu_ps(&m00_));
+        _mm_storeu_ps(&ret.m10_, _mm_loadu_ps(&m10_));
+        _mm_storeu_ps(&ret.m20_, _mm_loadu_ps(&m20_));
+        _mm_storeu_ps(&ret.m30_, _mm_set_ps(1.f, 0.f, 0.f, 0.f));
+        return ret;
+#else
+
         return Matrix4(
             m00_,
             m01_,
@@ -476,6 +591,7 @@ public:
             0.0f,
             1.0f
         );
+#endif
     }
 
     /// Return the rotation matrix with scaling removed.

+ 135 - 8
Source/Urho3D/Math/Matrix4.h

@@ -26,7 +26,7 @@
 #include "../Math/Vector4.h"
 
 #ifdef URHO3D_SSE
-#include <xmmintrin.h>
+#include <emmintrin.h>
 #endif
 
 namespace Urho3D
@@ -39,8 +39,9 @@ class URHO3D_API Matrix4
 {
 public:
     /// Construct an identity matrix.
-    Matrix4() :
-        m00_(1.0f),
+    Matrix4()
+#ifndef URHO3D_SSE
+       :m00_(1.0f),
         m01_(0.0f),
         m02_(0.0f),
         m03_(0.0f),
@@ -56,12 +57,20 @@ public:
         m31_(0.0f),
         m32_(0.0f),
         m33_(1.0f)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_set_ps(0.f, 0.f, 0.f, 1.f));
+        _mm_storeu_ps(&m10_, _mm_set_ps(0.f, 0.f, 1.f, 0.f));
+        _mm_storeu_ps(&m20_, _mm_set_ps(0.f, 1.f, 0.f, 0.f));
+        _mm_storeu_ps(&m30_, _mm_set_ps(1.f, 0.f, 0.f, 0.f));
+#endif
     }
 
     /// Copy-construct from another matrix.
-    Matrix4(const Matrix4& matrix) :
-        m00_(matrix.m00_),
+    Matrix4(const Matrix4& matrix)
+#ifndef URHO3D_SSE
+       :m00_(matrix.m00_),
         m01_(matrix.m01_),
         m02_(matrix.m02_),
         m03_(matrix.m03_),
@@ -77,7 +86,14 @@ public:
         m31_(matrix.m31_),
         m32_(matrix.m32_),
         m33_(matrix.m33_)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&matrix.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&matrix.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&matrix.m20_));
+        _mm_storeu_ps(&m30_, _mm_loadu_ps(&matrix.m30_));
+#endif
     }
 
     /// Copy-cnstruct from a 3x3 matrix and set the extra elements to identity.
@@ -126,8 +142,9 @@ public:
     }
 
     /// Construct from a float array.
-    Matrix4(const float* data) :
-        m00_(data[0]),
+    Matrix4(const float* data)
+#ifndef URHO3D_SSE
+       :m00_(data[0]),
         m01_(data[1]),
         m02_(data[2]),
         m03_(data[3]),
@@ -143,12 +160,25 @@ public:
         m31_(data[13]),
         m32_(data[14]),
         m33_(data[15])
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(data));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(data + 4));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(data + 8));
+        _mm_storeu_ps(&m30_, _mm_loadu_ps(data + 12));
+#endif
     }
 
     /// Assign from another matrix.
     Matrix4& operator =(const Matrix4& rhs)
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&rhs.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&rhs.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&rhs.m20_));
+        _mm_storeu_ps(&m30_, _mm_loadu_ps(&rhs.m30_));
+#else
         m00_ = rhs.m00_;
         m01_ = rhs.m01_;
         m02_ = rhs.m02_;
@@ -165,6 +195,7 @@ public:
         m31_ = rhs.m31_;
         m32_ = rhs.m32_;
         m33_ = rhs.m33_;
+#endif
         return *this;
     }
 
@@ -193,6 +224,18 @@ public:
     /// Test for equality with another matrix without epsilon.
     bool operator ==(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 c0 = _mm_cmpeq_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_));
+        __m128 c1 = _mm_cmpeq_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_));
+        c0 = _mm_and_ps(c0, c1);
+        __m128 c2 = _mm_cmpeq_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_));
+        __m128 c3 = _mm_cmpeq_ps(_mm_loadu_ps(&m30_), _mm_loadu_ps(&rhs.m30_));
+        c2 = _mm_and_ps(c2, c3);
+        c0 = _mm_and_ps(c0, c2);
+        __m128 hi = _mm_movehl_ps(c0, c0);
+        c0 = _mm_and_ps(c0, hi);
+        return _mm_cvtsi128_si64(_mm_castps_si128(c0)) == -1LL;
+#else
         const float* leftData = Data();
         const float* rightData = rhs.Data();
 
@@ -203,6 +246,7 @@ public:
         }
 
         return true;
+#endif
     }
 
     /// Test for inequality with another matrix without epsilon.
@@ -211,6 +255,21 @@ public:
     /// Multiply a Vector3 which is assumed to represent position.
     Vector3 operator *(const Vector3& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
+        __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
+        __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
+        __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+        vec = _mm_div_ps(vec, _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3)));
+
+        return Vector3(
+            _mm_cvtss_f32(vec),
+            _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))),
+            _mm_cvtss_f32(_mm_movehl_ps(vec, vec)));
+#else
         float invW = 1.0f / (m30_ * rhs.x_ + m31_ * rhs.y_ + m32_ * rhs.z_ + m33_);
 
         return Vector3(
@@ -218,22 +277,45 @@ public:
             (m10_ * rhs.x_ + m11_ * rhs.y_ + m12_ * rhs.z_ + m13_) * invW,
             (m20_ * rhs.x_ + m21_ * rhs.y_ + m22_ * rhs.z_ + m23_) * invW
         );
+#endif
     }
 
     /// Multiply a Vector4.
     Vector4 operator *(const Vector4& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 vec = _mm_loadu_ps(&rhs.x_);
+        __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
+        __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
+        __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+
+        Vector4 ret;
+        _mm_storeu_ps(&ret.x_, vec);
+        return ret;
+#else
         return Vector4(
             m00_ * rhs.x_ + m01_ * rhs.y_ + m02_ * rhs.z_ + m03_ * rhs.w_,
             m10_ * rhs.x_ + m11_ * rhs.y_ + m12_ * rhs.z_ + m13_ * rhs.w_,
             m20_ * rhs.x_ + m21_ * rhs.y_ + m22_ * rhs.z_ + m23_ * rhs.w_,
             m30_ * rhs.x_ + m31_ * rhs.y_ + m32_ * rhs.z_ + m33_ * rhs.w_
         );
+#endif
     }
 
     /// Add a matrix.
     Matrix4 operator +(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_add_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_)));
+        _mm_storeu_ps(&ret.m10_, _mm_add_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_)));
+        _mm_storeu_ps(&ret.m20_, _mm_add_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_)));
+        _mm_storeu_ps(&ret.m30_, _mm_add_ps(_mm_loadu_ps(&m30_), _mm_loadu_ps(&rhs.m30_)));
+        return ret;
+#else
         return Matrix4(
             m00_ + rhs.m00_,
             m01_ + rhs.m01_,
@@ -252,11 +334,20 @@ public:
             m32_ + rhs.m32_,
             m33_ + rhs.m33_
         );
+#endif
     }
 
     /// Subtract a matrix.
     Matrix4 operator -(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_sub_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_)));
+        _mm_storeu_ps(&ret.m10_, _mm_sub_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_)));
+        _mm_storeu_ps(&ret.m20_, _mm_sub_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_)));
+        _mm_storeu_ps(&ret.m30_, _mm_sub_ps(_mm_loadu_ps(&m30_), _mm_loadu_ps(&rhs.m30_)));
+        return ret;
+#else
         return Matrix4(
             m00_ - rhs.m00_,
             m01_ - rhs.m01_,
@@ -275,11 +366,21 @@ public:
             m32_ - rhs.m32_,
             m33_ - rhs.m33_
         );
+#endif
     }
 
     /// Multiply with a scalar.
     Matrix4 operator *(float rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        const __m128 mul = _mm_set1_ps(rhs);
+        _mm_storeu_ps(&ret.m00_, _mm_mul_ps(_mm_loadu_ps(&m00_), mul));
+        _mm_storeu_ps(&ret.m10_, _mm_mul_ps(_mm_loadu_ps(&m10_), mul));
+        _mm_storeu_ps(&ret.m20_, _mm_mul_ps(_mm_loadu_ps(&m20_), mul));
+        _mm_storeu_ps(&ret.m30_, _mm_mul_ps(_mm_loadu_ps(&m30_), mul));
+        return ret;
+#else
         return Matrix4(
             m00_ * rhs,
             m01_ * rhs,
@@ -298,6 +399,7 @@ public:
             m32_ * rhs,
             m33_ * rhs
         );
+#endif
     }
 
     /// Multiply a matrix.
@@ -457,6 +559,19 @@ public:
     /// Return transpose
     Matrix4 Transpose() const
     {
+#ifdef URHO3D_SSE
+        __m128 m0 = _mm_loadu_ps(&m00_);
+        __m128 m1 = _mm_loadu_ps(&m10_);
+        __m128 m2 = _mm_loadu_ps(&m20_);
+        __m128 m3 = _mm_loadu_ps(&m30_);
+        _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
+        Matrix4 out;
+        _mm_storeu_ps(&out.m00_, m0);
+        _mm_storeu_ps(&out.m10_, m1);
+        _mm_storeu_ps(&out.m20_, m2);
+        _mm_storeu_ps(&out.m30_, m3);
+        return out;
+#else
         return Matrix4(
             m00_,
             m10_,
@@ -475,6 +590,7 @@ public:
             m23_,
             m33_
         );
+#endif
     }
 
     /// Test for equality with another matrix with epsilon.
@@ -525,6 +641,17 @@ public:
     {
         for (unsigned i = 0; i < count; ++i)
         {
+#ifdef URHO3D_SSE
+            __m128 m0 = _mm_loadu_ps(src);
+            __m128 m1 = _mm_loadu_ps(src + 4);
+            __m128 m2 = _mm_loadu_ps(src + 8);
+            __m128 m3 = _mm_loadu_ps(src + 12);
+            _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
+            _mm_storeu_ps(dest, m0);
+            _mm_storeu_ps(dest + 4, m1);
+            _mm_storeu_ps(dest + 8, m2);
+            _mm_storeu_ps(dest + 12, m3);
+#else
             dest[0] = src[0];
             dest[1] = src[4];
             dest[2] = src[8];
@@ -541,7 +668,7 @@ public:
             dest[13] = src[7];
             dest[14] = src[11];
             dest[15] = src[15];
-
+#endif
             dest += 16;
             src += 16;
         }