Browse Source

SSE optimize Quaternion multiplication.

Jukka Jylänki 10 years ago
parent
commit
fd6a6e9336
1 changed files with 20 additions and 0 deletions
  1. 20 0
      Source/Urho3D/Math/Quaternion.h

+ 20 - 0
Source/Urho3D/Math/Quaternion.h

@@ -24,6 +24,10 @@
 
 
 #include "../Math/Matrix3.h"
 #include "../Math/Matrix3.h"
 
 
+#ifdef URHO3D_SSE
+#include <emmintrin.h>
+#endif
+
 namespace Urho3D
 namespace Urho3D
 {
 {
 
 
@@ -154,12 +158,28 @@ public:
     /// Multiply a quaternion.
     /// Multiply a quaternion.
     Quaternion operator *(const Quaternion& rhs) const
     Quaternion operator *(const Quaternion& rhs) const
     {
     {
+#ifdef URHO3D_SSE
+        __m128 q1 = _mm_loadu_ps(&w_); // Unaligned load of 4 floats starting at w_; assumes members are contiguous as w_, x_, y_, z_ (TODO confirm layout) -> q1 = [w1, x1, y1, z1]
+        __m128 q2 = _mm_loadu_ps(&rhs.w_); // Same load for rhs -> [w2, x2, y2, z2]
+        q2 = _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(0, 3, 2, 1)); // Rotate lanes so q2 = [x2, y2, z2, w2]
+        const __m128 signy = _mm_castsi128_ps(_mm_set_epi32((int)0x80000000UL, (int)0x80000000UL, 0, 0)); // Sign-bit mask, lanes [0, 0, -, -]; XOR with it negates the two upper lanes
+        const __m128 signx = _mm_shuffle_ps(signy, signy, _MM_SHUFFLE(2, 0, 2, 0)); // Sign-bit mask, lanes [0, -, 0, -]
+        const __m128 signz = _mm_shuffle_ps(signy, signy, _MM_SHUFFLE(3, 0, 0, 3)); // Sign-bit mask, lanes [-, 0, 0, -]
+        __m128 out = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(0, 1, 2, 3))); // out = x1 * [w2, z2, y2, x2] (signs applied on the next line via signx)
+        out = _mm_add_ps(_mm_mul_ps(_mm_xor_ps(signy, _mm_shuffle_ps(q1, q1, _MM_SHUFFLE(2, 2, 2, 2))), _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(1, 0, 3, 2))), _mm_xor_ps(signx, out)); // out = [y1, y1, -y1, -y1] * [z2, w2, x2, y2] + [x1*w2, -x1*z2, x1*y2, -x1*x2]
+        out = _mm_add_ps(_mm_mul_ps(_mm_xor_ps(signz, _mm_shuffle_ps(q1, q1, _MM_SHUFFLE(3, 3, 3, 3))), _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(2, 3, 0, 1))), out); // out += [-z1, z1, z1, -z1] * [y2, x2, w2, z2]
+        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(0, 0, 0, 0)), q2), out); // out += w1 * [x2, y2, z2, w2]; lanes now hold the product as [x, y, z, w]
+        Quaternion q;
+        _mm_storeu_ps(&q.w_, _mm_shuffle_ps(out, out, _MM_SHUFFLE(2, 1, 0, 3))); // Rotate lanes back to [w, x, y, z] and store unaligned starting at q.w_
+        return q;
+#else // Scalar path: same product terms as the SSE path above, summed in a different order
         return Quaternion(
         return Quaternion(
             w_ * rhs.w_ - x_ * rhs.x_ - y_ * rhs.y_ - z_ * rhs.z_,
             w_ * rhs.w_ - x_ * rhs.x_ - y_ * rhs.y_ - z_ * rhs.z_,
             w_ * rhs.x_ + x_ * rhs.w_ + y_ * rhs.z_ - z_ * rhs.y_,
             w_ * rhs.x_ + x_ * rhs.w_ + y_ * rhs.z_ - z_ * rhs.y_,
             w_ * rhs.y_ + y_ * rhs.w_ + z_ * rhs.x_ - x_ * rhs.z_,
             w_ * rhs.y_ + y_ * rhs.w_ + z_ * rhs.x_ - x_ * rhs.z_,
             w_ * rhs.z_ + z_ * rhs.w_ + x_ * rhs.y_ - y_ * rhs.x_
             w_ * rhs.z_ + z_ * rhs.w_ + x_ * rhs.y_ - y_ * rhs.x_
         );
         );
+#endif // URHO3D_SSE
     }
     }
 
 
     /// Multiply a Vector3.
     /// Multiply a Vector3.