Explorar o código

Added AVX2 bitwise optimization

Christophe Riccio hai 9 anos
pai
achega
ae6082db5e
Modificáronse 3 ficheiros con 115 adicións e 48 borrados
  1. 0 1
      glm/detail/type_vec4.hpp
  2. 37 25
      glm/detail/type_vec4.inl
  3. 78 22
      glm/detail/type_vec4_simd.inl

+ 0 - 1
glm/detail/type_vec4.hpp

@@ -70,7 +70,6 @@ namespace detail
 			typedef __m256i type;
 		};
 #	endif
-
 }//namespace detail
 
 	template <typename T, precision P = defaultp>

+ 37 - 25
glm/detail/type_vec4.inl

@@ -5,19 +5,31 @@ namespace glm{
 namespace detail
 {
 	template <typename T>
-	struct is_int32
+	struct is_int
 	{
 		enum test {value = 0};
 	};
 
 	template <>
-	struct is_int32<uint32>
+	struct is_int<uint32>
 	{
 		enum test {value = ~0};
 	};
 
 	template <>
-	struct is_int32<int32>
+	struct is_int<int32>
+	{
+		enum test {value = ~0};
+	};
+
+	// Flags uint64 as an integer type: value is ~0 here, versus 0 in the primary template.
+	// NOTE(review): ~0 is -1 as an int, while the SIMD partial specializations in
+	// type_vec4_simd.inl are written against `true` (which converts to 1) for the IsInt
+	// parameter -- confirm those specializations are actually selected.
+	template <>
+	struct is_int<uint64>
+	{
+		enum test
+		{
+			value = ~0
+		};
+	};
+
+	// Flags int64 as an integer type: value is ~0 here, versus 0 in the primary template.
+	template <>
+	struct is_int<int64>
 	{
 		enum test {value = ~0};
 	};
@@ -67,7 +79,7 @@ namespace detail
 		}
 	};
 
-	template <typename T, precision P>
+	template <typename T, precision P, int IsInt, std::size_t Size>
 	struct compute_vec4_and
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & a, tvec4<T, P> const & b)
@@ -76,7 +88,7 @@ namespace detail
 		}
 	};
 
-	template <typename T, precision P>
+	template <typename T, precision P, int IsInt, std::size_t Size>
 	struct compute_vec4_or
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & a, tvec4<T, P> const & b)
@@ -85,7 +97,7 @@ namespace detail
 		}
 	};
 
-	template <typename T, precision P, int IsInt32>
+	template <typename T, precision P, int IsInt, std::size_t Size>
 	struct compute_vec4_xor
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & a, tvec4<T, P> const & b)
@@ -94,7 +106,7 @@ namespace detail
 		}
 	};
 
-	template <typename T, precision P, int IsInt32>
+	template <typename T, precision P, int IsInt, std::size_t Size>
 	struct compute_vec4_shift_left
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & a, tvec4<T, P> const & b)
@@ -103,7 +115,7 @@ namespace detail
 		}
 	};
 
-	template <typename T, precision P, int IsInt32>
+	template <typename T, precision P, int IsInt, std::size_t Size>
 	struct compute_vec4_shift_right
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & a, tvec4<T, P> const & b)
@@ -112,7 +124,7 @@ namespace detail
 		}
 	};
 
-	template <typename T, precision P, int IsInt32>
+	template <typename T, precision P, int IsInt, std::size_t Size>
 	struct compute_vec4_logical_not
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & v)
@@ -488,105 +500,105 @@ namespace detail
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator&=(U scalar)
 	{
-		return (*this = detail::compute_vec4_and<T, P>::call(*this, tvec4<T, P>(scalar)));
+		return (*this = detail::compute_vec4_and<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(scalar)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator&=(tvec1<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_and<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_and<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator&=(tvec4<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_and<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_and<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator|=(U scalar)
 	{
-		return (*this = detail::compute_vec4_or<T, P>::call(*this, tvec4<T, P>(scalar)));
+		return (*this = detail::compute_vec4_or<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(scalar)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator|=(tvec1<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_or<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_or<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator|=(tvec4<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_or<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_or<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator^=(U scalar)
 	{
-		return (*this = detail::compute_vec4_xor<T, P>::call(*this, tvec4<T, P>(scalar)));
+		return (*this = detail::compute_vec4_xor<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(scalar)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator^=(tvec1<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_xor<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_xor<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator^=(tvec4<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_xor<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_xor<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator<<=(U scalar)
 	{
-		return (*this = detail::compute_vec4_shift_left<T, P>::call(*this, tvec4<T, P>(scalar)));
+		return (*this = detail::compute_vec4_shift_left<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(scalar)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator<<=(tvec1<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_shift_left<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_shift_left<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator<<=(tvec4<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_shift_left<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_shift_left<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator>>=(U scalar)
 	{
-		return (*this = detail::compute_vec4_shift_right<T, P>::call(*this, tvec4<T, P>(scalar)));
+		return (*this = detail::compute_vec4_shift_right<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(scalar)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator>>=(tvec1<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_shift_right<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_shift_right<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	template <typename T, precision P>
 	template <typename U> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> & tvec4<T, P>::operator>>=(tvec4<U, P> const & v)
 	{
-		return (*this = detail::compute_vec4_shift_right<T, P>::call(*this, tvec4<T, P>(v)));
+		return (*this = detail::compute_vec4_shift_right<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(*this, tvec4<T, P>(v)));
 	}
 
 	// -- Unary constant operators --
@@ -910,7 +922,7 @@ namespace detail
 	template <typename T, precision P> 
 	GLM_FUNC_QUALIFIER tvec4<T, P> operator~(tvec4<T, P> const & v)
 	{
-		return detail::compute_vec4_logical_not<T, P, detail::is_int32<T>::value>::call(v);
+		return detail::compute_vec4_logical_not<T, P, detail::is_int<T>::value, sizeof(T) * 8>::call(v);
 	}
 
 	// -- Boolean operators --

+ 78 - 22
glm/detail/type_vec4_simd.inl

@@ -61,52 +61,56 @@ namespace detail
 		}
 	};
 
-	template <precision P>
-	struct compute_vec4_and<int32, P>
+	template <typename T, precision P>
+	struct compute_vec4_and<T, P, true, 32>
 	{
-		static tvec4<int32, P> call(tvec4<int32, P> const& a, tvec4<int32, P> const& b)
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
-			tvec4<int32, P> Result(uninitialize);
+			tvec4<T, P> Result(uninitialize);
 			Result.data = _mm_and_si128(a.data, b.data);
 			return Result;
 		}
 	};
 
-	template <precision P>
-	struct compute_vec4_and<uint32, P>
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	// AVX2 path: bitwise AND of two vec4 with 64-bit integer components, done as one
+	// 256-bit operation on the vectors' `data` members.
+	// Assumes tvec4<T, P>::data is an __m256i for 64-bit T -- TODO confirm against type_vec4.hpp.
+	template <typename T, precision P>
+	struct compute_vec4_and<T, P, true, 64>
 	{
-		static tvec4<uint32, P> call(tvec4<uint32, P> const& a, tvec4<uint32, P> const& b)
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
-			tvec4<uint32, P> Result(uninitialize);
-			Result.data = _mm_and_si128(a.data, b.data);
+			tvec4<T, P> Result(uninitialize);
+			// The 256-bit integer AND intrinsic is _mm256_and_si256; "_mm_and_si256" does not exist.
+			Result.data = _mm256_and_si256(a.data, b.data);
 			return Result;
 		}
 	};
+#	endif
 
-	template <precision P>
-	struct compute_vec4_or<int32, P>
+	template <typename T, precision P>
+	struct compute_vec4_or<T, P, true, 32>
 	{
-		static tvec4<int32, P> call(tvec4<int32, P> const& a, tvec4<int32, P> const& b)
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
-			tvec4<int32, P> Result(uninitialize);
+			tvec4<T, P> Result(uninitialize);
 			Result.data = _mm_or_si128(a.data, b.data);
 			return Result;
 		}
 	};
 
-	template <precision P>
-	struct compute_vec4_or<uint32, P>
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	// AVX2 path: bitwise OR of two vec4 with 64-bit integer components, done as one
+	// 256-bit operation on the vectors' `data` members.
+	// Assumes tvec4<T, P>::data is an __m256i for 64-bit T -- TODO confirm against type_vec4.hpp.
+	template <typename T, precision P>
+	struct compute_vec4_or<T, P, true, 64>
 	{
-		static tvec4<uint32, P> call(tvec4<uint32, P> const& a, tvec4<uint32, P> const& b)
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
-			tvec4<uint32, P> Result(uninitialize);
-			Result.data = _mm_or_si128(a.data, b.data);
+			tvec4<T, P> Result(uninitialize);
+			// The 256-bit integer OR intrinsic is _mm256_or_si256; "_mm_or_si256" does not exist.
+			Result.data = _mm256_or_si256(a.data, b.data);
 			return Result;
 		}
 	};
+#	endif
 
 	template <typename T, precision P>
-	struct compute_vec4_xor<T, P, true>
+	struct compute_vec4_xor<T, P, true, 32>
 	{
 		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
@@ -116,8 +120,21 @@ namespace detail
 		}
 	};
 
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	// AVX2 path: bitwise XOR of two vec4 with 64-bit integer components, performed on
+	// the vectors' `data` members with a single 256-bit intrinsic.
+	template <typename T, precision P>
+	struct compute_vec4_xor<T, P, true, 64>
+	{
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
+		{
+			tvec4<T, P> Xored(uninitialize);
+			Xored.data = _mm256_xor_si256(a.data, b.data);
+			return Xored;
+		}
+	};
+#	endif
+
 	template <typename T, precision P>
-	struct compute_vec4_shift_left<T, P, true>
+	struct compute_vec4_shift_left<T, P, true, 32>
 	{
 		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
@@ -127,8 +144,21 @@ namespace detail
 		}
 	};
 
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	// AVX2 path: component-wise left shift of a vec4 with 64-bit integer components,
+	// where each component of `a` is shifted by the matching component of `b`.
+	// Assumes tvec4<T, P>::data is an __m256i for 64-bit T -- TODO confirm against type_vec4.hpp.
 	template <typename T, precision P>
-	struct compute_vec4_shift_right<T, P, true>
+	struct compute_vec4_shift_left<T, P, true, 64>
+	{
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
+		{
+			tvec4<T, P> Result(uninitialize);
+			// _mm256_sll_epi64 expects a single scalar count in an __m128i and applies it to
+			// every lane; the variable-shift form takes one count per 64-bit lane from b.
+			Result.data = _mm256_sllv_epi64(a.data, b.data);
+			return Result;
+		}
+	};
+#	endif
+
+	template <typename T, precision P>
+	struct compute_vec4_shift_right<T, P, true, 32>
 	{
 		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
 		{
@@ -138,8 +168,21 @@ namespace detail
 		}
 	};
 
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	// AVX2 path: component-wise right shift of a vec4 with 64-bit integer components,
+	// where each component of `a` is shifted by the matching component of `b`.
+	// Assumes tvec4<T, P>::data is an __m256i for 64-bit T -- TODO confirm against type_vec4.hpp.
+	// NOTE(review): this is a logical (zero-filling) shift; for signed T with negative
+	// values it will not match an arithmetic scalar >> -- confirm this is acceptable.
 	template <typename T, precision P>
-	struct compute_vec4_logical_not<T, P, true>
+	struct compute_vec4_shift_right<T, P, true, 64>
+	{
+		static tvec4<T, P> call(tvec4<T, P> const& a, tvec4<T, P> const& b)
+		{
+			tvec4<T, P> Result(uninitialize);
+			// _mm256_srl_epi64 expects a single scalar count in an __m128i and applies it to
+			// every lane; the variable-shift form takes one count per 64-bit lane from b.
+			Result.data = _mm256_srlv_epi64(a.data, b.data);
+			return Result;
+		}
+	};
+#	endif
+
+	template <typename T, precision P>
+	struct compute_vec4_logical_not<T, P, true, 32>
 	{
 		static tvec4<T, P> call(tvec4<T, P> const & v)
 		{
@@ -148,6 +191,19 @@ namespace detail
 			return Result;
 		}
 	};
+
+#	if GLM_ARCH & GLM_ARCH_AVX2
+	// AVX2 path: bitwise complement of a vec4 with 64-bit integer components, computed
+	// as an XOR of the vector's `data` member with an all-ones 256-bit mask.
+	// Assumes tvec4<T, P>::data is an __m256i for 64-bit T -- TODO confirm against type_vec4.hpp.
+	template <typename T, precision P>
+	struct compute_vec4_logical_not<T, P, true, 64>
+	{
+		static tvec4<T, P> call(tvec4<T, P> const & v)
+		{
+			tvec4<T, P> Result(uninitialize);
+			// The mask must be 256 bits wide: _mm_set1_epi32 yields an __m128i, which is
+			// not a valid operand for _mm256_xor_si256.
+			Result.data = _mm256_xor_si256(v.data, _mm256_set1_epi32(-1));
+			return Result;
+		}
+	};
+#	endif
 }//namespace detail
 
 #	if !GLM_HAS_DEFAULTED_FUNCTIONS