7 years ago · e1ec98ad64
--- a/src/common/Matrix.cpp
+++ b/src/common/Matrix.cpp
@@ -29,6 +29,10 @@
 
				 #include <xmmintrin.h>
			
 
				 #endif
			
 
				 
			
 
				+#if defined(LOVE_SIMD_NEON)
			
 
				+#include <arm_neon.h>
			
 
				+#endif
			
 
				+
			
 
				 namespace love
			
 
				 {
			
 
				 
			
@@ -43,9 +47,6 @@ namespace love
 
				 
			
 
				 void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, float t[16])
			
 
				 {
			
 
				-	// NOTE: in my testing with ARM NEON instructions (on an iPhone 6 with arm64)
			
 
				-	// it performed slightly worse than the regular add/multiply code. Further
			
 
				-	// investigation would be useful.
			
 
				 #if defined(LOVE_SIMD_SSE)
			
 
				 
			
 
				 	// We can't guarantee 16-byte alignment (e.g. for heap-allocated Matrix4
			
@@ -70,6 +71,38 @@ void Matrix4::multiply(const Matrix4 &a, const Matrix4 &b, float t[16])
 
				 		_mm_storeu_ps(&t[4*i], col);
			
 
				 	}
			
 
				 
			
 
				+#elif defined(LOVE_SIMD_NEON)
			
 
				+
			
 
				+	float32x4_t cola1 = vld1q_f32(&a.e[0]);
			
 
				+	float32x4_t cola2 = vld1q_f32(&a.e[4]);
			
 
				+	float32x4_t cola3 = vld1q_f32(&a.e[8]);
			
 
				+	float32x4_t cola4 = vld1q_f32(&a.e[12]);
			
 
				+
			
 
				+	float32x4_t col1 = vmulq_n_f32(cola1, b.e[0]);
			
 
				+	col1 = vmlaq_n_f32(col1, cola2, b.e[1]);
			
 
				+	col1 = vmlaq_n_f32(col1, cola3, b.e[2]);
			
 
				+	col1 = vmlaq_n_f32(col1, cola4, b.e[3]);
			
 
				+
			
 
				+	float32x4_t col2 = vmulq_n_f32(cola1, b.e[4]);
			
 
				+	col2 = vmlaq_n_f32(col2, cola2, b.e[5]);
			
 
				+	col2 = vmlaq_n_f32(col2, cola3, b.e[6]);
			
 
				+	col2 = vmlaq_n_f32(col2, cola4, b.e[7]);
			
 
				+
			
 
				+	float32x4_t col3 = vmulq_n_f32(cola1, b.e[8]);
			
 
				+	col3 = vmlaq_n_f32(col3, cola2, b.e[9]);
			
 
				+	col3 = vmlaq_n_f32(col3, cola3, b.e[10]);
			
 
				+	col3 = vmlaq_n_f32(col3, cola4, b.e[11]);
			
 
				+
			
 
				+	float32x4_t col4 = vmulq_n_f32(cola1, b.e[12]);
			
 
				+	col4 = vmlaq_n_f32(col4, cola2, b.e[13]);
			
 
				+	col4 = vmlaq_n_f32(col4, cola3, b.e[14]);
			
 
				+	col4 = vmlaq_n_f32(col4, cola4, b.e[15]);
			
 
				+
			
 
				+	vst1q_f32(&t[0], col1);
			
 
				+	vst1q_f32(&t[4], col2);
			
 
				+	vst1q_f32(&t[8], col3);
			
 
				+	vst1q_f32(&t[12], col4);
			
 
				+
			
 
				 #else
			
 
				 
			
 
				 	t[0]  = (a.e[0]*b.e[0])  + (a.e[4]*b.e[1])  + (a.e[8]*b.e[2])  + (a.e[12]*b.e[3]);