Browse Source

Use SIMD SSE functionality when available for 4x4 matrix multiplies. Slightly improves performance of large numbers of love.graphics.draw(texture) calls.

--HG--
branch : minor
Alex Szpakowski 8 years ago
parent
commit
4ef91ed79f
2 changed files with 53 additions and 3 deletions
  1. 37 3
      src/common/Matrix.cpp
  2. 16 0
      src/common/config.h

+ 37 - 3
src/common/Matrix.cpp

@@ -19,11 +19,16 @@
  **/
  **/
 
 
 #include "Matrix.h"
 #include "Matrix.h"
+#include "common/config.h"
 
 
 // STD
 // STD
 #include <cstring> // memcpy
 #include <cstring> // memcpy
 #include <cmath>
 #include <cmath>
 
 
+#if defined(LOVE_SIMD_SSE)
+#include <xmmintrin.h>
+#endif
+
 namespace love
 namespace love
 {
 {
 
 
@@ -38,6 +43,35 @@ namespace love
 
 
 void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, float t[16])
 void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, float t[16])
 {
 {
+	// NOTE: in my testing with ARM NEON instructions (on an iPhone 6 with arm64)
+	// it performed slightly worse than the regular add/multiply code. Further
+	// investigation would be useful.
+#if defined(LOVE_SIMD_SSE)
+
+	// We can't guarantee 16-byte alignment (e.g. for heap-allocated Matrix4
+	// objects) so we use unaligned loads and stores.
+	__m128 col1 = _mm_loadu_ps(&a.e[0]);
+	__m128 col2 = _mm_loadu_ps(&a.e[4]);
+	__m128 col3 = _mm_loadu_ps(&a.e[8]);
+	__m128 col4 = _mm_loadu_ps(&a.e[12]);
+
+	for (int i = 0; i < 4; i++)
+	{
+		__m128 brod1 = _mm_set1_ps(b.e[4*i + 0]);
+		__m128 brod2 = _mm_set1_ps(b.e[4*i + 1]);
+		__m128 brod3 = _mm_set1_ps(b.e[4*i + 2]);
+		__m128 brod4 = _mm_set1_ps(b.e[4*i + 3]);
+
+		__m128 col = _mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(brod1, col1), _mm_mul_ps(brod2, col2)),
+			_mm_add_ps(_mm_mul_ps(brod3, col3), _mm_mul_ps(brod4, col4))
+		);
+
+		_mm_storeu_ps(&t[4*i], col);
+	}
+
+#else
+
 	t[0]  = (a.e[0]*b.e[0])  + (a.e[4]*b.e[1])  + (a.e[8]*b.e[2])  + (a.e[12]*b.e[3]);
 	t[0]  = (a.e[0]*b.e[0])  + (a.e[4]*b.e[1])  + (a.e[8]*b.e[2])  + (a.e[12]*b.e[3]);
 	t[4]  = (a.e[0]*b.e[4])  + (a.e[4]*b.e[5])  + (a.e[8]*b.e[6])  + (a.e[12]*b.e[7]);
 	t[4]  = (a.e[0]*b.e[4])  + (a.e[4]*b.e[5])  + (a.e[8]*b.e[6])  + (a.e[12]*b.e[7]);
 	t[8]  = (a.e[0]*b.e[8])  + (a.e[4]*b.e[9])  + (a.e[8]*b.e[10]) + (a.e[12]*b.e[11]);
 	t[8]  = (a.e[0]*b.e[8])  + (a.e[4]*b.e[9])  + (a.e[8]*b.e[10]) + (a.e[12]*b.e[11]);
@@ -57,6 +91,8 @@ void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, float t[16])
 	t[7]  = (a.e[3]*b.e[4])  + (a.e[7]*b.e[5])  + (a.e[11]*b.e[6])  + (a.e[15]*b.e[7]);
 	t[7]  = (a.e[3]*b.e[4])  + (a.e[7]*b.e[5])  + (a.e[11]*b.e[6])  + (a.e[15]*b.e[7]);
 	t[11] = (a.e[3]*b.e[8])  + (a.e[7]*b.e[9])  + (a.e[11]*b.e[10]) + (a.e[15]*b.e[11]);
 	t[11] = (a.e[3]*b.e[8])  + (a.e[7]*b.e[9])  + (a.e[11]*b.e[10]) + (a.e[15]*b.e[11]);
 	t[15] = (a.e[3]*b.e[12]) + (a.e[7]*b.e[13]) + (a.e[11]*b.e[14]) + (a.e[15]*b.e[15]);
 	t[15] = (a.e[3]*b.e[12]) + (a.e[7]*b.e[13]) + (a.e[11]*b.e[14]) + (a.e[15]*b.e[15]);
+
+#endif
 }
 }
 
 
 void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, Matrix4 &t)
 void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, Matrix4 &t)
@@ -97,9 +133,7 @@ Matrix4::Matrix4(float x, float y, float angle, float sx, float sy, float ox, fl
 
 
 Matrix4 Matrix4::operator * (const Matrix4 &m) const
 Matrix4 Matrix4::operator * (const Matrix4 &m) const
 {
 {
-	float t[16];
-	multiply(*this, m, t);
-	return Matrix4(t);
+	return Matrix4(*this, m);
 }
 }
 
 
 void Matrix4::operator *= (const Matrix4 &m)
 void Matrix4::operator *= (const Matrix4 &m)

+ 16 - 0
src/common/config.h

@@ -60,6 +60,22 @@
 #	define LOVE_LITTLE_ENDIAN 1
 #	define LOVE_LITTLE_ENDIAN 1
 #endif
 #endif
 
 
+// SSE instructions.
+#if defined(__SSE__)
+#	define LOVE_SIMD_SSE
+#elif defined(_MSC_VER)
+#	if defined(_M_AMD64) || defined(_M_X64)
+#		define LOVE_SIMD_SSE
+#	elif _M_IX86_FP
+#		define LOVE_SIMD_SSE
+#	endif
+#endif
+
+// NEON instructions.
+#if defined(__ARM_NEON)
+#	define LOVE_SIMD_NEON
+#endif
+
 // Warnings.
 // Warnings.
 #ifndef _CRT_SECURE_NO_WARNINGS
 #ifndef _CRT_SECURE_NO_WARNINGS
 #	define _CRT_SECURE_NO_WARNINGS
 #	define _CRT_SECURE_NO_WARNINGS