
SSE-optimize matrix-matrix multiplications when URHO3D_SSE is enabled. These multiplications account for 7.82% of the total time in the 06_SkeletalAnimation sample with 2000 Jacks in it. With the SSE optimizations applied, matrix multiplication takes only 2.13% of the total time in an AMD CodeXL profile. This change also seems to have a side effect: with the scalar version, VS2015 did not dare to inline the Matrix3x4 multiplication in AnimatedModel::UpdateSkinning(), but with SSE, the instruction count is small enough that it happily does so.
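
The SSE path computes each output row as a linear combination of the rows of rhs: each element of the corresponding lhs row is broadcast across a register with _mm_shuffle_ps and multiplied against a full rhs row, so a 4x4 output row costs four multiplies and three adds instead of sixteen scalar multiplies and twelve adds. All loads and stores use the unaligned _mm_loadu_ps/_mm_storeu_ps variants, so the matrix classes need no special alignment. A minimal standalone sketch of the per-row pattern (the MulRow helper name is hypothetical, not part of the commit):

    #include <xmmintrin.h>

    // One output row: out = l.x*r0 + l.y*r1 + l.z*r2 + l.w*r3, where l is a row
    // of the left-hand matrix and r0..r3 are the four rows of the right-hand matrix.
    static inline __m128 MulRow(__m128 l, __m128 r0, __m128 r1, __m128 r2, __m128 r3)
    {
        __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0); // broadcast l.x
        __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1); // broadcast l.y
        __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2); // broadcast l.z
        __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3); // broadcast l.w
        return _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
    }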

Jukka Jylänki, 10 years ago
Parent commit: 3ad9ba306d
2 changed files with 113 additions and 0 deletions:
  1. Source/Urho3D/Math/Matrix3x4.h (+70 -0)
  2. Source/Urho3D/Math/Matrix4.h (+43 -0)

Source/Urho3D/Math/Matrix3x4.h (+70 -0)

@@ -24,6 +24,10 @@
 
 #include "../Math/Matrix4.h"
 
+#ifdef URHO3D_SSE
+#include <xmmintrin.h>
+#endif
+
 namespace Urho3D
 {
 
@@ -292,6 +296,37 @@ public:
     /// Multiply a matrix.
     Matrix3x4 operator *(const Matrix3x4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix3x4 out;
+
+        __m128 r0 = _mm_loadu_ps(&rhs.m00_);
+        __m128 r1 = _mm_loadu_ps(&rhs.m10_);
+        __m128 r2 = _mm_loadu_ps(&rhs.m20_);
+        __m128 r3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f);
+
+        __m128 l = _mm_loadu_ps(&m00_);
+        __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        __m128 t3 = _mm_mul_ps(l, r3);
+        _mm_storeu_ps(&out.m00_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m10_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(l, r3);
+        _mm_storeu_ps(&out.m10_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m20_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(l, r3);
+        _mm_storeu_ps(&out.m20_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        return out;
+#else
         return Matrix3x4(
             m00_ * rhs.m00_ + m01_ * rhs.m10_ + m02_ * rhs.m20_,
             m00_ * rhs.m01_ + m01_ * rhs.m11_ + m02_ * rhs.m21_,
@@ -306,11 +341,45 @@ public:
             m20_ * rhs.m02_ + m21_ * rhs.m12_ + m22_ * rhs.m22_,
             m20_ * rhs.m03_ + m21_ * rhs.m13_ + m22_ * rhs.m23_ + m23_
         );
+#endif
     }
 
     /// Multiply a 4x4 matrix.
     Matrix4 operator *(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 out;
+
+        __m128 r0 = _mm_loadu_ps(&rhs.m00_);
+        __m128 r1 = _mm_loadu_ps(&rhs.m10_);
+        __m128 r2 = _mm_loadu_ps(&rhs.m20_);
+        __m128 r3 = _mm_loadu_ps(&rhs.m30_);
+
+        __m128 l = _mm_loadu_ps(&m00_);
+        __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m00_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m10_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m10_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m20_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m20_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        _mm_storeu_ps(&out.m30_, r3);
+
+        return out;
+#else
         return Matrix4(
             m00_ * rhs.m00_ + m01_ * rhs.m10_ + m02_ * rhs.m20_ + m03_ * rhs.m30_,
             m00_ * rhs.m01_ + m01_ * rhs.m11_ + m02_ * rhs.m21_ + m03_ * rhs.m31_,
@@ -329,6 +398,7 @@ public:
             rhs.m32_,
             rhs.m33_
         );
+#endif
     }
 
     /// Set translation elements.
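
In the Matrix3x4 paths above, the fourth row of the affine matrix is never stored, so r3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f) materializes the implicit (0, 0, 0, 1) row; note that _mm_set_ps fills lanes from the highest down. _mm_mul_ps(l, r3) therefore zeroes everything except the lhs translation element (m03_, m13_ or m23_) in the highest lane, which supplies the trailing translation term of each scalar row (e.g. the lone + m23_ above). A standalone check of that lane trick (hypothetical, not part of the commit):

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        // A left-hand row (m00, m01, m02, m03); _mm_set_ps fills lanes high-to-low.
        __m128 l  = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // lanes low-to-high: 1, 2, 3, 4
        __m128 r3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f);  // the implicit row (0, 0, 0, 1)

        float out[4];
        _mm_storeu_ps(out, _mm_mul_ps(l, r3));
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 0 0 0 4
        return 0;
    }

The same reasoning gives the _mm_storeu_ps(&out.m30_, r3) shortcut in operator *(const Matrix4&): with an implicit (0, 0, 0, 1) bottom row on the left, the bottom row of the product is simply rhs's bottom row, matching the scalar version's pass-through of rhs.m30_..rhs.m33_.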

Source/Urho3D/Math/Matrix4.h (+43 -0)

@@ -25,6 +25,10 @@
 #include "../Math/Quaternion.h"
 #include "../Math/Vector4.h"
 
+#ifdef URHO3D_SSE
+#include <xmmintrin.h>
+#endif
+
 namespace Urho3D
 {
 
@@ -299,6 +303,44 @@ public:
     /// Multiply a matrix.
     Matrix4 operator *(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 out;
+
+        __m128 r0 = _mm_loadu_ps(&rhs.m00_);
+        __m128 r1 = _mm_loadu_ps(&rhs.m10_);
+        __m128 r2 = _mm_loadu_ps(&rhs.m20_);
+        __m128 r3 = _mm_loadu_ps(&rhs.m30_);
+
+        __m128 l = _mm_loadu_ps(&m00_);
+        __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m00_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m10_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m10_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m20_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m20_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        l = _mm_loadu_ps(&m30_);
+        t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
+        t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
+        t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
+        t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
+        _mm_storeu_ps(&out.m30_, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+
+        return out;
+#else
         return Matrix4(
             m00_ * rhs.m00_ + m01_ * rhs.m10_ + m02_ * rhs.m20_ + m03_ * rhs.m30_,
             m00_ * rhs.m01_ + m01_ * rhs.m11_ + m02_ * rhs.m21_ + m03_ * rhs.m31_,
@@ -317,6 +359,7 @@ public:
             m30_ * rhs.m02_ + m31_ * rhs.m12_ + m32_ * rhs.m22_ + m33_ * rhs.m32_,
             m30_ * rhs.m03_ + m31_ * rhs.m13_ + m32_ * rhs.m23_ + m33_ * rhs.m33_
         );
+#endif
     }
 
     /// Multiply with a 3x4 matrix.
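
Both headers keep the scalar implementation as the #else branch of the same URHO3D_SSE check, so the two paths can be validated against each other. A hypothetical standalone harness (not part of the commit; MulScalar and MulSSE are illustration names) that multiplies two row-major 4x4 matrices both ways and compares the results:

    #include <xmmintrin.h>
    #include <cmath>
    #include <cstdio>

    // Straightforward scalar reference: out[i][j] = sum over k of a[i][k] * b[k][j].
    static void MulScalar(const float a[16], const float b[16], float out[16])
    {
        for (int i = 0; i < 4; ++i)
            for (int j = 0; j < 4; ++j)
            {
                float sum = 0.f;
                for (int k = 0; k < 4; ++k)
                    sum += a[i * 4 + k] * b[k * 4 + j];
                out[i * 4 + j] = sum;
            }
    }

    // The row-linear-combination scheme from the diff, looped over the four rows.
    static void MulSSE(const float a[16], const float b[16], float out[16])
    {
        __m128 r0 = _mm_loadu_ps(b + 0), r1 = _mm_loadu_ps(b + 4);
        __m128 r2 = _mm_loadu_ps(b + 8), r3 = _mm_loadu_ps(b + 12);
        for (int i = 0; i < 4; ++i)
        {
            __m128 l = _mm_loadu_ps(a + i * 4);
            __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(0, 0, 0, 0)), r0);
            __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(1, 1, 1, 1)), r1);
            __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(2, 2, 2, 2)), r2);
            __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(l, l, _MM_SHUFFLE(3, 3, 3, 3)), r3);
            _mm_storeu_ps(out + i * 4, _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
        }
    }

    int main()
    {
        float a[16], b[16], s[16], v[16];
        for (int i = 0; i < 16; ++i) { a[i] = float(i + 1); b[i] = float(16 - i); }
        MulScalar(a, b, s);
        MulSSE(a, b, v);
        for (int i = 0; i < 16; ++i)
            if (std::fabs(s[i] - v[i]) > 1e-4f) { std::puts("mismatch"); return 1; }
        std::puts("scalar and SSE agree");
        return 0;
    }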