Browse Source

Added embedded SSE optimizations in mat4 code

Christophe Riccio 12 years ago
parent
commit
929b521381
4 changed files with 56 additions and 14 deletions
  1. 4 6
      glm/core/dummy.cpp
  2. 41 8
      glm/core/type_mat4x4.inl
  3. 3 0
      glm/core/type_vec4.hpp
  4. 8 0
      glm/core/type_vec4.inl

+ 4 - 6
glm/core/dummy.cpp

@@ -32,13 +32,11 @@
 #define GLM_MESSAGES
 #define GLM_MESSAGES
 #include "../glm.hpp"
 #include "../glm.hpp"
 
 
-/*
-
-*/
-
 int main()
 int main()
 {
 {
-	//auto d = 90.0_deg;
+	glm::mat4 A(1.0f);
+	glm::vec4 B(1.0f);
+	glm::vec4 C = A * B;
 
 
-	//glm::vec3 v{0, 1, 2};
+	return 0;
 }
 }

+ 41 - 8
glm/core/type_mat4x4.inl

@@ -708,11 +708,44 @@ namespace detail
 		typename tmat4x4<T, P>::row_type const & v
 		typename tmat4x4<T, P>::row_type const & v
 	)
 	)
 	{
 	{
+		__m128 v0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(0, 0, 0, 0));
+		__m128 v1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(1, 1, 1, 1));
+		__m128 v2 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(2, 2, 2, 2));
+		__m128 v3 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 3, 3, 3));
+
+		__m128 m0 = _mm_mul_ps(m[0].data, v0);
+		__m128 m1 = _mm_mul_ps(m[1].data, v1);
+		__m128 a0 = _mm_add_ps(m0, m1);
+
+		__m128 m2 = _mm_mul_ps(m[2].data, v2);
+		__m128 m3 = _mm_mul_ps(m[3].data, v3);
+		__m128 a1 = _mm_add_ps(m2, m3);
+
+		__m128 a2 = _mm_add_ps(a0, a1);
+
+		return typename tmat4x4<T, P>::col_type(a2);
+/*
+		tmat4x4<T, P>::col_type const Mov0(v[0]);
+		tmat4x4<T, P>::col_type const Mov1(v[1]);
+		tmat4x4<T, P>::col_type const Mul0 = m[0] * Mov0;
+		tmat4x4<T, P>::col_type const Mul1 = m[1] * Mov1;
+		tmat4x4<T, P>::col_type const Add0 = Mul0 * Mul1;
+		tmat4x4<T, P>::col_type const Mov2(v[2]);
+		tmat4x4<T, P>::col_type const Mov3(v[3]);
+		tmat4x4<T, P>::col_type const Mul2 = m[2] * Mov2;
+		tmat4x4<T, P>::col_type const Mul3 = m[3] * Mov3;
+		tmat4x4<T, P>::col_type const Add1 = Mul2 * Mul3;
+		tmat4x4<T, P>::col_type const Add2 = Add0 * Add1;
+		return Add2;
+*/
+
+/*
 		return typename tmat4x4<T, P>::col_type(
 		return typename tmat4x4<T, P>::col_type(
-			m[0][0] * v.x + m[1][0] * v.y + m[2][0] * v.z + m[3][0] * v.w,
-			m[0][1] * v.x + m[1][1] * v.y + m[2][1] * v.z + m[3][1] * v.w,
-			m[0][2] * v.x + m[1][2] * v.y + m[2][2] * v.z + m[3][2] * v.w,
-			m[0][3] * v.x + m[1][3] * v.y + m[2][3] * v.z + m[3][3] * v.w);
+			m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0] * v[3],
+			m[0][1] * v[0] + m[1][1] * v[1] + m[2][1] * v[2] + m[3][1] * v[3],
+			m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2] * v[3],
+			m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3] * v[3]);
+*/
 	}
 	}
 
 
 	template <typename T, precision P>
 	template <typename T, precision P>
@@ -723,10 +756,10 @@ namespace detail
 	)
 	)
 	{
 	{
 		return typename tmat4x4<T, P>::row_type(
 		return typename tmat4x4<T, P>::row_type(
-			m[0][0] * v.x + m[0][1] * v.y + m[0][2] * v.z + m[0][3] * v.w,
-			m[1][0] * v.x + m[1][1] * v.y + m[1][2] * v.z + m[1][3] * v.w,
-			m[2][0] * v.x + m[2][1] * v.y + m[2][2] * v.z + m[2][3] * v.w,
-			m[3][0] * v.x + m[3][1] * v.y + m[3][2] * v.z + m[3][3] * v.w);
+			m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3],
+			m[1][0] * v[0] + m[1][1] * v[1] + m[1][2] * v[2] + m[1][3] * v[3],
+			m[2][0] * v[0] + m[2][1] * v[1] + m[2][2] * v[2] + m[2][3] * v[3],
+			m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]);
 	}
 	}
 
 
 	template <typename T, precision P>
 	template <typename T, precision P>

+ 3 - 0
glm/core/type_vec4.hpp

@@ -69,6 +69,7 @@ namespace detail
 			struct {value_type r, g, b, a;};
 			struct {value_type r, g, b, a;};
 			struct {value_type s, t, p, q;};
 			struct {value_type s, t, p, q;};
 			struct {value_type x, y, z, w;};
 			struct {value_type x, y, z, w;};
+			__m128 data;
 		};
 		};
 #	elif(GLM_COMPONENT == GLM_COMPONENT_CXX98)
 #	elif(GLM_COMPONENT == GLM_COMPONENT_CXX98)
 		union {value_type x, r, s;};
 		union {value_type x, r, s;};
@@ -115,6 +116,8 @@ namespace detail
 			value_type const & s1,
 			value_type const & s1,
 			value_type const & s2,
 			value_type const & s2,
 			value_type const & s3);
 			value_type const & s3);
+		GLM_FUNC_DECL explicit tvec4(
+			__m128 const & v);
 
 
 		//////////////////////////////////////
 		//////////////////////////////////////
 		// Convertion scalar constructors
 		// Convertion scalar constructors

+ 8 - 0
glm/core/type_vec4.inl

@@ -115,6 +115,14 @@ namespace detail
 		w(s4)
 		w(s4)
 	{}
 	{}
 
 
+	template <typename T, precision P>
+	GLM_FUNC_QUALIFIER tvec4<T, P>::tvec4
+	(
+		__m128 const & v
+	) :
+		data(v)
+	{}
+
 	//////////////////////////////////////
 	//////////////////////////////////////
 	// Swizzle constructors
 	// Swizzle constructors