@@ -26,7 +26,7 @@
 #include "../Math/Vector4.h"

 #ifdef URHO3D_SSE
-#include <xmmintrin.h>
+#include <emmintrin.h>
 #endif

 namespace Urho3D
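The header switch is what lets the equality test further down compile: _mm_castps_si128 and _mm_cvtsi128_si64 are SSE2 intrinsics declared in <emmintrin.h>, while <xmmintrin.h> only covers the SSE1 float set (which emmintrin.h pulls in transitively). A standalone sketch of that dependency, not part of the patch:

    #include <emmintrin.h>   // SSE2; transitively provides the SSE1 declarations too

    // Illustrative helper: the two intrinsics below are the SSE2 pieces that
    // <xmmintrin.h> does not declare. Requires a 64-bit target.
    inline long long LowHalfAsInt64(__m128 v)
    {
        return _mm_cvtsi128_si64(_mm_castps_si128(v));
    }
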
@@ -39,8 +39,9 @@ class URHO3D_API Matrix4
 {
 public:
     /// Construct an identity matrix.
-    Matrix4() :
-        m00_(1.0f),
+    Matrix4()
+#ifndef URHO3D_SSE
+        :m00_(1.0f),
         m01_(0.0f),
         m02_(0.0f),
         m03_(0.0f),
@@ -56,12 +57,20 @@ public:
         m31_(0.0f),
         m32_(0.0f),
         m33_(1.0f)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_set_ps(0.f, 0.f, 0.f, 1.f));
+        _mm_storeu_ps(&m10_, _mm_set_ps(0.f, 0.f, 1.f, 0.f));
+        _mm_storeu_ps(&m20_, _mm_set_ps(0.f, 1.f, 0.f, 0.f));
+        _mm_storeu_ps(&m30_, _mm_set_ps(1.f, 0.f, 0.f, 0.f));
+#endif
     }

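Note on the identity constructor above: _mm_set_ps takes its arguments highest lane first, so the value stored to memory is reversed relative to the argument list. Each store therefore writes one identity row (m00_..m03_, m10_..m13_, and so on), matching the removed initializer list. A minimal standalone check:

    #include <cassert>
    #include <emmintrin.h>

    int main()
    {
        // _mm_set_ps(w, z, y, x): highest lane first, so memory order is x, y, z, w.
        float row[4];
        _mm_storeu_ps(row, _mm_set_ps(0.f, 0.f, 0.f, 1.f));
        assert(row[0] == 1.f && row[1] == 0.f && row[2] == 0.f && row[3] == 0.f);
        return 0;
    }
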
     /// Copy-construct from another matrix.
-    Matrix4(const Matrix4& matrix) :
-        m00_(matrix.m00_),
+    Matrix4(const Matrix4& matrix)
+#ifndef URHO3D_SSE
+        :m00_(matrix.m00_),
         m01_(matrix.m01_),
         m02_(matrix.m02_),
         m03_(matrix.m03_),
@@ -77,7 +86,14 @@ public:
         m31_(matrix.m31_),
         m32_(matrix.m32_),
         m33_(matrix.m33_)
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&matrix.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&matrix.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&matrix.m20_));
+        _mm_storeu_ps(&m30_, _mm_loadu_ps(&matrix.m30_));
+#endif
     }

     /// Copy-construct from a 3x3 matrix and set the extra elements to identity.
@@ -126,8 +142,9 @@ public:
     }

     /// Construct from a float array.
-    Matrix4(const float* data) :
-        m00_(data[0]),
+    Matrix4(const float* data)
+#ifndef URHO3D_SSE
+        :m00_(data[0]),
         m01_(data[1]),
         m02_(data[2]),
         m03_(data[3]),
@@ -143,12 +160,25 @@ public:
         m31_(data[13]),
         m32_(data[14]),
         m33_(data[15])
+#endif
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(data));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(data + 4));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(data + 8));
+        _mm_storeu_ps(&m30_, _mm_loadu_ps(data + 12));
+#endif
     }

     /// Assign from another matrix.
     Matrix4& operator =(const Matrix4& rhs)
     {
+#ifdef URHO3D_SSE
+        _mm_storeu_ps(&m00_, _mm_loadu_ps(&rhs.m00_));
+        _mm_storeu_ps(&m10_, _mm_loadu_ps(&rhs.m10_));
+        _mm_storeu_ps(&m20_, _mm_loadu_ps(&rhs.m20_));
+        _mm_storeu_ps(&m30_, _mm_loadu_ps(&rhs.m30_));
+#else
         m00_ = rhs.m00_;
         m01_ = rhs.m01_;
         m02_ = rhs.m02_;
@@ -165,6 +195,7 @@ public:
         m31_ = rhs.m31_;
         m32_ = rhs.m32_;
         m33_ = rhs.m33_;
+#endif
         return *this;
     }

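The constructors and operator= above all go through _mm_loadu_ps/_mm_storeu_ps because Matrix4 is a plain struct of floats with no declared 16-byte alignment, so the aligned forms could fault on an arbitrarily placed instance. A small sketch of the distinction, using an illustrative helper name:

    #include <emmintrin.h>

    // Illustrative helper (not in the patch): copy one row of four floats.
    inline void CopyRow(float* dest, const float* src)
    {
        _mm_storeu_ps(dest, _mm_loadu_ps(src));    // unaligned: always safe
        // _mm_store_ps(dest, _mm_load_ps(src));   // would require 16-byte alignment
    }
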
@@ -193,6 +224,18 @@ public:
     /// Test for equality with another matrix without epsilon.
     bool operator ==(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 c0 = _mm_cmpeq_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_));
+        __m128 c1 = _mm_cmpeq_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_));
+        c0 = _mm_and_ps(c0, c1);
+        __m128 c2 = _mm_cmpeq_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_));
+        __m128 c3 = _mm_cmpeq_ps(_mm_loadu_ps(&m30_), _mm_loadu_ps(&rhs.m30_));
+        c2 = _mm_and_ps(c2, c3);
+        c0 = _mm_and_ps(c0, c2);
+        __m128 hi = _mm_movehl_ps(c0, c0);
+        c0 = _mm_and_ps(c0, hi);
+        return _mm_cvtsi128_si64(_mm_castps_si128(c0)) == -1LL;
+#else
         const float* leftData = Data();
         const float* rightData = rhs.Data();

@@ -203,6 +246,7 @@ public:
         }

         return true;
+#endif
     }

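The reduction in operator== ANDs the four per-row comparison masks, folds the upper two lanes onto the lower two with _mm_movehl_ps, then reads the low 64 bits back as an integer; -1LL means every bit, and therefore every one of the sixteen element comparisons, was true. _mm_cvtsi128_si64 exists only on 64-bit targets; an equivalent SSE1-only reduction, shown purely as a sketch and not as the patch's code, is:

    #include <xmmintrin.h>

    // Sketch of an equivalent reduction: _mm_movemask_ps packs each lane's sign
    // bit into a 4-bit integer, so "all lanes true" becomes a compare against 0xF.
    // SSE1 only, and it also works on 32-bit targets.
    inline bool AllLanesTrue(__m128 mask)
    {
        return _mm_movemask_ps(mask) == 0xF;
    }
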
     /// Test for inequality with another matrix without epsilon.
@@ -211,6 +255,21 @@ public:
     /// Multiply a Vector3 which is assumed to represent position.
     Vector3 operator *(const Vector3& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 vec = _mm_set_ps(1.f, rhs.z_, rhs.y_, rhs.x_);
+        __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
+        __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
+        __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+        vec = _mm_div_ps(vec, _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3)));
+
+        return Vector3(
+            _mm_cvtss_f32(vec),
+            _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1))),
+            _mm_cvtss_f32(_mm_movehl_ps(vec, vec)));
+#else
         float invW = 1.0f / (m30_ * rhs.x_ + m31_ * rhs.y_ + m32_ * rhs.z_ + m33_);

         return Vector3(
@@ -218,22 +277,45 @@ public:
             (m10_ * rhs.x_ + m11_ * rhs.y_ + m12_ * rhs.z_ + m13_) * invW,
             (m20_ * rhs.x_ + m21_ * rhs.y_ + m22_ * rhs.z_ + m23_) * invW
         );
+#endif
     }

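In the Vector3 multiply above, the transpose turns the four row-wise products into columns so that one pair of vertical adds yields row·(x, y, z, 1) in each lane, and the lane-3 shuffle broadcasts w for the homogeneous divide, mirroring the scalar branch's invW. A short usage fragment, assuming the class's usual public members and IDENTITY constant (neither is shown in this hunk):

    // Usage fragment: a pure translation leaves the bottom row at (0, 0, 0, 1),
    // so the divide is by 1 and the result is simply the translated point.
    Matrix4 t = Matrix4::IDENTITY;               // assumed identity constant
    t.m03_ = 1.0f; t.m13_ = 2.0f; t.m23_ = 3.0f; // translate by (1, 2, 3)
    Vector3 p = t * Vector3(1.0f, 1.0f, 1.0f);   // expected (2, 3, 4)
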
     /// Multiply a Vector4.
     Vector4 operator *(const Vector4& rhs) const
     {
+#ifdef URHO3D_SSE
+        __m128 vec = _mm_loadu_ps(&rhs.x_);
+        __m128 r0 = _mm_mul_ps(_mm_loadu_ps(&m00_), vec);
+        __m128 r1 = _mm_mul_ps(_mm_loadu_ps(&m10_), vec);
+        __m128 r2 = _mm_mul_ps(_mm_loadu_ps(&m20_), vec);
+        __m128 r3 = _mm_mul_ps(_mm_loadu_ps(&m30_), vec);
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        vec = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+
+        Vector4 ret;
+        _mm_storeu_ps(&ret.x_, vec);
+        return ret;
+#else
         return Vector4(
             m00_ * rhs.x_ + m01_ * rhs.y_ + m02_ * rhs.z_ + m03_ * rhs.w_,
             m10_ * rhs.x_ + m11_ * rhs.y_ + m12_ * rhs.z_ + m13_ * rhs.w_,
             m20_ * rhs.x_ + m21_ * rhs.y_ + m22_ * rhs.z_ + m23_ * rhs.w_,
             m30_ * rhs.x_ + m31_ * rhs.y_ + m32_ * rhs.z_ + m33_ * rhs.w_
         );
+#endif
     }

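The Vector4 multiply loads the operand with _mm_loadu_ps(&rhs.x_) and stores the result with _mm_storeu_ps(&ret.x_, ...), so it relies on x_, y_, z_, w_ being four contiguous floats with no padding. The kind of compile-time checks that assumption amounts to, shown only as an illustration:

    #include <cstddef>

    // Illustrative checks; they spell out the layout assumption, not the patch.
    static_assert(sizeof(Vector4) == 4 * sizeof(float), "Vector4 must have no padding");
    static_assert(offsetof(Vector4, w_) == 3 * sizeof(float), "x_, y_, z_, w_ must be contiguous");
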
     /// Add a matrix.
     Matrix4 operator +(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_add_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_)));
+        _mm_storeu_ps(&ret.m10_, _mm_add_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_)));
+        _mm_storeu_ps(&ret.m20_, _mm_add_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_)));
+        _mm_storeu_ps(&ret.m30_, _mm_add_ps(_mm_loadu_ps(&m30_), _mm_loadu_ps(&rhs.m30_)));
+        return ret;
+#else
         return Matrix4(
             m00_ + rhs.m00_,
             m01_ + rhs.m01_,
@@ -252,11 +334,20 @@ public:
             m32_ + rhs.m32_,
             m33_ + rhs.m33_
         );
+#endif
     }

     /// Subtract a matrix.
     Matrix4 operator -(const Matrix4& rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        _mm_storeu_ps(&ret.m00_, _mm_sub_ps(_mm_loadu_ps(&m00_), _mm_loadu_ps(&rhs.m00_)));
+        _mm_storeu_ps(&ret.m10_, _mm_sub_ps(_mm_loadu_ps(&m10_), _mm_loadu_ps(&rhs.m10_)));
+        _mm_storeu_ps(&ret.m20_, _mm_sub_ps(_mm_loadu_ps(&m20_), _mm_loadu_ps(&rhs.m20_)));
+        _mm_storeu_ps(&ret.m30_, _mm_sub_ps(_mm_loadu_ps(&m30_), _mm_loadu_ps(&rhs.m30_)));
+        return ret;
+#else
         return Matrix4(
             m00_ - rhs.m00_,
             m01_ - rhs.m01_,
@@ -275,11 +366,21 @@ public:
             m32_ - rhs.m32_,
             m33_ - rhs.m33_
         );
+#endif
     }

     /// Multiply with a scalar.
     Matrix4 operator *(float rhs) const
     {
+#ifdef URHO3D_SSE
+        Matrix4 ret;
+        const __m128 mul = _mm_set1_ps(rhs);
+        _mm_storeu_ps(&ret.m00_, _mm_mul_ps(_mm_loadu_ps(&m00_), mul));
+        _mm_storeu_ps(&ret.m10_, _mm_mul_ps(_mm_loadu_ps(&m10_), mul));
+        _mm_storeu_ps(&ret.m20_, _mm_mul_ps(_mm_loadu_ps(&m20_), mul));
+        _mm_storeu_ps(&ret.m30_, _mm_mul_ps(_mm_loadu_ps(&m30_), mul));
+        return ret;
+#else
         return Matrix4(
             m00_ * rhs,
             m01_ * rhs,
@@ -298,6 +399,7 @@ public:
             m32_ * rhs,
             m33_ * rhs
         );
+#endif
     }

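The add, subtract and scalar-multiply overloads above are plain lane-wise operations: one packed instruction per row, written straight into a temporary that is returned. A brief usage fragment; the SSE and scalar branches must produce identical results:

    Matrix4 a, b;                  // assume both are filled in elsewhere
    Matrix4 sum  = a + b;          // four _mm_add_ps row operations
    Matrix4 diff = a - b;          // four _mm_sub_ps row operations
    Matrix4 half = a * 0.5f;       // _mm_set1_ps broadcast + four _mm_mul_ps
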
     /// Multiply a matrix.
@@ -457,6 +559,19 @@ public:
     /// Return transpose
     Matrix4 Transpose() const
     {
+#ifdef URHO3D_SSE
+        __m128 m0 = _mm_loadu_ps(&m00_);
+        __m128 m1 = _mm_loadu_ps(&m10_);
+        __m128 m2 = _mm_loadu_ps(&m20_);
+        __m128 m3 = _mm_loadu_ps(&m30_);
+        _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
+        Matrix4 out;
+        _mm_storeu_ps(&out.m00_, m0);
+        _mm_storeu_ps(&out.m10_, m1);
+        _mm_storeu_ps(&out.m20_, m2);
+        _mm_storeu_ps(&out.m30_, m3);
+        return out;
+#else
         return Matrix4(
             m00_,
             m10_,
@@ -475,6 +590,7 @@ public:
             m23_,
             m33_
         );
+#endif
     }

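_MM_TRANSPOSE4_PS is a macro that rewrites its four __m128 arguments in place with shuffles, which is why Transpose() first copies the rows into m0..m3 and only then stores them into a fresh Matrix4. A self-contained check of the same load/transpose/store pattern:

    #include <cassert>
    #include <emmintrin.h>

    int main()
    {
        // Row-major 4x4 block: element (r, c) = 10 * r + c.
        float m[16], t[16];
        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 4; ++c)
                m[4 * r + c] = float(10 * r + c);

        __m128 r0 = _mm_loadu_ps(m);
        __m128 r1 = _mm_loadu_ps(m + 4);
        __m128 r2 = _mm_loadu_ps(m + 8);
        __m128 r3 = _mm_loadu_ps(m + 12);
        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // macro: modifies r0..r3 in place
        _mm_storeu_ps(t, r0);
        _mm_storeu_ps(t + 4, r1);
        _mm_storeu_ps(t + 8, r2);
        _mm_storeu_ps(t + 12, r3);

        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 4; ++c)
                assert(t[4 * r + c] == m[4 * c + r]);
        return 0;
    }
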
     /// Test for equality with another matrix with epsilon.
@@ -525,6 +641,17 @@ public:
     {
         for (unsigned i = 0; i < count; ++i)
         {
+#ifdef URHO3D_SSE
+            __m128 m0 = _mm_loadu_ps(src);
+            __m128 m1 = _mm_loadu_ps(src + 4);
+            __m128 m2 = _mm_loadu_ps(src + 8);
+            __m128 m3 = _mm_loadu_ps(src + 12);
+            _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
+            _mm_storeu_ps(dest, m0);
+            _mm_storeu_ps(dest + 4, m1);
+            _mm_storeu_ps(dest + 8, m2);
+            _mm_storeu_ps(dest + 12, m3);
+#else
             dest[0] = src[0];
             dest[1] = src[4];
             dest[2] = src[8];
@@ -541,7 +668,7 @@ public:
             dest[13] = src[7];
             dest[14] = src[11];
             dest[15] = src[15];
-
+#endif
             dest += 16;
             src += 16;
         }
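The BulkTranspose loop handles one row-major 4x4 block per iteration: load four rows, transpose in registers, store, then advance both pointers by 16 floats. A usage sketch, assuming the existing static signature of destination pointer, source pointer and matrix count (the declaration itself is outside this hunk):

    float src[2 * 16];             // two row-major 4x4 blocks back to back
    float dest[2 * 16];
    // ... fill src ...
    Matrix4::BulkTranspose(dest, src, 2);   // assumed signature: (dest, src, count)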