Browse Source

Optimize away some uses of _MM_TRANSPOSE4_PS(), to remove two shuffles, two movs and replace shufps with the lighter movehl/lh and unpacks. Also do vector loads and stores manually in BoundingBox::Transformed() to avoid VS2015 from generating dirty asm code for the function. This has a few percent improvement on the heavy BoundingBox::Transformed() function call.

Jukka Jylänki 10 years ago
parent
commit
e28a678914

+ 12 - 19
Source/Urho3D/Math/BoundingBox.cpp

@@ -131,8 +131,9 @@ BoundingBox BoundingBox::Transformed(const Matrix3& transform) const
 BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
 BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
 {
 {
 #ifdef URHO3D_SSE
 #ifdef URHO3D_SSE
-    __m128 minPt = _mm_set_ps(1.f, min_.z_, min_.y_, min_.x_);
-    __m128 maxPt = _mm_set_ps(1.f, max_.z_, max_.y_, max_.x_);
+    const __m128 one = _mm_set_ss(1.f);
+    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
+    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
     __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
     __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
     __m128 halfSize = _mm_sub_ps(centerPoint, minPt);
     __m128 halfSize = _mm_sub_ps(centerPoint, minPt);
     __m128 m0 = _mm_loadu_ps(&transform.m00_);
     __m128 m0 = _mm_loadu_ps(&transform.m00_);
@@ -140,27 +141,19 @@ BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
     __m128 m2 = _mm_loadu_ps(&transform.m20_);
     __m128 m2 = _mm_loadu_ps(&transform.m20_);
     __m128 r0 = _mm_mul_ps(m0, centerPoint);
     __m128 r0 = _mm_mul_ps(m0, centerPoint);
     __m128 r1 = _mm_mul_ps(m1, centerPoint);
     __m128 r1 = _mm_mul_ps(m1, centerPoint);
+    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
     __m128 r2 = _mm_mul_ps(m2, centerPoint);
     __m128 r2 = _mm_mul_ps(m2, centerPoint);
-    __m128 r3 = _mm_setzero_ps();
-    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
-    __m128 newCenter = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
-    __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    const __m128 zero = _mm_setzero_ps();
+    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
+    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
+    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
     __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
     __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
     __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
     __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
     __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
-    __m128 w = _mm_setzero_ps();
-    _MM_TRANSPOSE4_PS(x, y, z, w);
-    __m128 newDir = _mm_add_ps(_mm_add_ps(x, y), _mm_add_ps(z, w));
-    __m128 boxMin = _mm_sub_ps(newCenter, newDir);
-    __m128 boxMax = _mm_add_ps(newCenter, newDir);
-
-    return BoundingBox(
-        Vector3(_mm_cvtss_f32(boxMin),
-                _mm_cvtss_f32(_mm_shuffle_ps(boxMin, boxMin, _MM_SHUFFLE(1, 1, 1, 1))),
-                _mm_cvtss_f32(_mm_movehl_ps(boxMin, boxMin))),
-        Vector3(_mm_cvtss_f32(boxMax),
-                _mm_cvtss_f32(_mm_shuffle_ps(boxMax, boxMax, _MM_SHUFFLE(1, 1, 1, 1))),
-                _mm_cvtss_f32(_mm_movehl_ps(boxMax, boxMax))));
+    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
+    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
+    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
+    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
 #else
 #else
     Vector3 newCenter = transform * Center();
     Vector3 newCenter = transform * Center();
     Vector3 oldEdge = Size() * 0.5f;
     Vector3 oldEdge = Size() * 0.5f;

+ 8 - 0
Source/Urho3D/Math/BoundingBox.h

@@ -78,6 +78,14 @@ public:
     {
     {
     }
     }
 
 
+#ifdef URHO3D_SSE
+    BoundingBox(__m128 min, __m128 max)
+    {
+        _mm_storeu_ps(&min_.x_, min);
+        _mm_storeu_ps(&max_.x_, max);
+    }
+#endif
+
     /// Construct from an array of vertices.
     /// Construct from an array of vertices.
     BoundingBox(const Vector3* vertices, unsigned count) :
     BoundingBox(const Vector3* vertices, unsigned count) :
         min_(M_INFINITY, M_INFINITY, M_INFINITY),
         min_(M_INFINITY, M_INFINITY, M_INFINITY),

+ 15 - 5
Source/Urho3D/Math/Matrix3x4.h

@@ -299,10 +299,15 @@ public:
         __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
         __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 t0 = _mm_unpacklo_ps(r0, r1);
+        __m128 t1 = _mm_unpackhi_ps(r0, r1);
+        t0 = _mm_add_ps(t0, t1);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r3 = _mm_setzero_ps();
         __m128 r3 = _mm_setzero_ps();
-        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
-        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+        __m128 t2 = _mm_unpacklo_ps(r2, r3);
+        __m128 t3 = _mm_unpackhi_ps(r2, r3);
+        t2 = _mm_add_ps(t2, t3);
+        vec = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
 
 
         return Vector3(
         return Vector3(
             _mm_cvtss_f32(vec),
             _mm_cvtss_f32(vec),
@@ -324,10 +329,15 @@ public:
         __m128 vec = _mm_loadu_ps(&rhs.x_);
         __m128 vec = _mm_loadu_ps(&rhs.x_);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 t0 = _mm_unpacklo_ps(r0, r1);
+        __m128 t1 = _mm_unpackhi_ps(r0, r1);
+        t0 = _mm_add_ps(t0, t1);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
-        __m128 r3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f);
-        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
-        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+        __m128 r3 = _mm_setzero_ps();
+        __m128 t2 = _mm_unpacklo_ps(r2, r3);
+        __m128 t3 = _mm_unpackhi_ps(r2, r3);
+        t2 = _mm_add_ps(t2, t3);
+        vec = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
 
 
         return Vector3(
         return Vector3(
             _mm_cvtss_f32(vec),
             _mm_cvtss_f32(vec),

+ 14 - 5
Source/Urho3D/Math/Matrix4.h

@@ -261,12 +261,16 @@ public:
         __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
         __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 t0 = _mm_unpacklo_ps(r0, r1);
+        __m128 t1 = _mm_unpackhi_ps(r0, r1);
+        t0 = _mm_add_ps(t0, t1);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
         __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
-        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
-        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+        __m128 t2 = _mm_unpacklo_ps(r2, r3);
+        __m128 t3 = _mm_unpackhi_ps(r2, r3);
+        t2 = _mm_add_ps(t2, t3);
+        vec = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
         vec = _mm_div_ps(vec, _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3)));
         vec = _mm_div_ps(vec, _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3)));
-
         return Vector3(
         return Vector3(
             _mm_cvtss_f32(vec),
             _mm_cvtss_f32(vec),
             _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))),
             _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))),
@@ -289,10 +293,15 @@ public:
         __m128 vec = _mm_loadu_ps(&rhs.x_);
         __m128 vec = _mm_loadu_ps(&rhs.x_);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
         __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 t0 = _mm_unpacklo_ps(r0, r1);
+        __m128 t1 = _mm_unpackhi_ps(r0, r1);
+        t0 = _mm_add_ps(t0, t1);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
         __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
         __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
-        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
-        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+        __m128 t2 = _mm_unpacklo_ps(r2, r3);
+        __m128 t3 = _mm_unpackhi_ps(r2, r3);
+        t2 = _mm_add_ps(t2, t3);
+        vec = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
 
 
         Vector4 ret;
         Vector4 ret;
         _mm_storeu_ps(&ret.x_, vec);
         _mm_storeu_ps(&ret.x_, vec);