
Add Neon to glm

A few simple functions that use NEON directly, since the compiler does not exploit the full potential of NEON on its own.
For now, -DGLM_FORCE_NEON is required until it becomes the default.
Amaury Le Leyzour, 6 years ago
commit cd3cc166b4
3 changed files with 378 additions and 5 deletions:
  1. glm/detail/qualifier.hpp       (+20 -0)
  2. glm/detail/type_vec4_simd.inl  (+334 -0)
  3. glm/simd/platform.h            (+24 -5)
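
A minimal usage sketch of what this commit enables (hypothetical file name and build flags; note the SIMD specializations apply to the aligned qualifiers, so an explicit aligned type or GLM_FORCE_DEFAULT_ALIGNED_GENTYPES may also be needed):

	// main.cpp -- build with e.g.: g++ -O2 -DGLM_FORCE_NEON main.cpp
	#include <glm/glm.hpp>
	#include <cstdio>

	int main()
	{
		// the aligned qualifier selects the SIMD specializations added below
		glm::vec<4, float, glm::aligned_highp> a(1.0f, 2.0f, 3.0f, 4.0f);
		glm::vec<4, float, glm::aligned_highp> b(5.0f, 6.0f, 7.0f, 8.0f);
		glm::vec<4, float, glm::aligned_highp> c = a + b; // dispatches to vaddq_f32
		std::printf("%f %f %f %f\n", c.x, c.y, c.z, c.w);
		return 0;
	}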

+ 20 - 0
glm/detail/qualifier.hpp

@@ -167,6 +167,26 @@ namespace detail
 	};
 #	endif
 
+#	if GLM_ARCH & GLM_ARCH_NEON_BIT
+	template<>
+	struct storage<4, float, true>
+	{
+		typedef glm_f32vec4 type;
+	};
+
+	template<>
+	struct storage<4, int, true>
+	{
+		typedef glm_i32vec4 type;
+	};
+
+	template<>
+	struct storage<4, unsigned int, true>
+	{
+		typedef glm_u32vec4 type;
+	};
+#	endif
+
 	enum genTypeEnum
 	{
 		GENTYPE_VEC,
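
With these specializations in place, aligned vec4 storage maps directly onto the NEON register types. A quick compile-time check, assuming arm_neon.h has been pulled in via glm/simd/platform.h:

	#include <glm/glm.hpp>
	#include <type_traits>

	// storage<L, T, is_aligned> picks the SIMD-backed type when alignment is requested
	static_assert(std::is_same<glm::detail::storage<4, float, true>::type, float32x4_t>::value,
		"aligned vec4<float> is backed by float32x4_t");
	static_assert(std::is_same<glm::detail::storage<4, int, true>::type, int32x4_t>::value,
		"aligned vec4<int> is backed by int32x4_t");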

+ 334 - 0
glm/detail/type_vec4_simd.inl

@@ -461,3 +461,337 @@ namespace detail
 }//namespace glm
 
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
+
+#if GLM_ARCH & GLM_ARCH_NEON_BIT
+namespace glm {
+namespace detail {
+
+	template<qualifier Q>
+	struct compute_vec4_add<float, Q, true>
+	{
+		static
+		vec<4, float, Q>
+		call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+			Result.data = vaddq_f32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_add<uint, Q, true>
+	{
+		static
+		vec<4, uint, Q>
+		call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			vec<4, uint, Q> Result;
+			Result.data = vaddq_u32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_add<int, Q, true>
+	{
+		static
+		vec<4, int, Q>
+		call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			vec<4, int, Q> Result;
+			Result.data = vaddq_s32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_sub<float, Q, true>
+	{
+		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+			Result.data = vsubq_f32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_sub<uint, Q, true>
+	{
+		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			vec<4, uint, Q> Result;
+			Result.data = vsubq_u32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_sub<int, Q, true>
+	{
+		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			vec<4, int, Q> Result;
+			Result.data = vsubq_s32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_mul<float, Q, true>
+	{
+		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+			Result.data = vmulq_f32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_mul<uint, Q, true>
+	{
+		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			vec<4, uint, Q> Result;
+			Result.data = vmulq_u32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_mul<int, Q, true>
+	{
+		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			vec<4, int, Q> Result;
+			Result.data = vmulq_s32(a.data, b.data);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_div<float, Q, true>
+	{
+		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
+		{
+			vec<4, float, Q> Result;
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			Result.data = vdivq_f32(a.data, b.data);
+#else
+			// vdivq_f32 exists only on AArch64; on ARMv7 NEON, approximate with
+			// a reciprocal estimate refined by two Newton-Raphson steps
+			float32x4_t inv = vrecpeq_f32(b.data);
+			inv = vmulq_f32(vrecpsq_f32(b.data, inv), inv);
+			inv = vmulq_f32(vrecpsq_f32(b.data, inv), inv);
+			Result.data = vmulq_f32(a.data, inv);
+#endif
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_div<uint, Q, true>
+	{
+		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
+		{
+			// NEON has no integer division intrinsic; divide lane by lane
+			uint32_t va[4], vb[4];
+			vst1q_u32(va, a.data);
+			vst1q_u32(vb, b.data);
+			for(int i = 0; i < 4; ++i)
+				va[i] /= vb[i];
+			vec<4, uint, Q> Result;
+			Result.data = vld1q_u32(va);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_div<int, Q, true>
+	{
+		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
+		{
+			// NEON has no integer division intrinsic; divide lane by lane
+			int32_t va[4], vb[4];
+			vst1q_s32(va, a.data);
+			vst1q_s32(vb, b.data);
+			for(int i = 0; i < 4; ++i)
+				va[i] /= vb[i];
+			vec<4, int, Q> Result;
+			Result.data = vld1q_s32(va);
+			return Result;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_equal<float, Q, false, 32, true>
+	{
+		static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
+		{
+			uint32x4_t cmp = vceqq_f32(v1.data, v2.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			cmp = vpminq_u32(cmp, cmp);
+			cmp = vpminq_u32(cmp, cmp);
+			uint32_t r = vgetq_lane_u32(cmp, 0);
+#else
+			uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+			cmpx2 = vpmin_u32(cmpx2, cmpx2);
+			uint32_t r = vget_lane_u32(cmpx2, 0);
+#endif
+			return r == ~0u;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_equal<uint, Q, false, 32, true>
+	{
+		static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
+		{
+			uint32x4_t cmp = vceqq_u32(v1.data, v2.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			cmp = vpminq_u32(cmp, cmp);
+			cmp = vpminq_u32(cmp, cmp);
+			uint32_t r = vgetq_lane_u32(cmp, 0);
+#else
+			uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+			cmpx2 = vpmin_u32(cmpx2, cmpx2);
+			uint32_t r = vget_lane_u32(cmpx2, 0);
+#endif
+			return r == ~0u;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_equal<int, Q, false, 32, true>
+	{
+		static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
+		{
+			uint32x4_t cmp = vceqq_s32(v1.data, v2.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			cmp = vpminq_u32(cmp, cmp);
+			cmp = vpminq_u32(cmp, cmp);
+			uint32_t r = vgetq_lane_u32(cmp, 0);
+#else
+			uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp));
+			cmpx2 = vpmin_u32(cmpx2, cmpx2);
+			uint32_t r = vget_lane_u32(cmpx2, 0);
+#endif
+			return r == ~0u;
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_nequal<float, Q, false, 32, true>
+	{
+		static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
+		{
+			return !compute_vec4_equal<float, Q, false, 32, true>::call(v1, v2);
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_nequal<uint, Q, false, 32, true>
+	{
+		static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
+		{
+			return !compute_vec4_equal<uint, Q, false, 32, true>::call(v1, v2);
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_vec4_nequal<int, Q, false, 32, true>
+	{
+		static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
+		{
+			return !compute_vec4_equal<int, Q, false, 32, true>::call(v1, v2);
+		}
+	};
+
+}//namespace detail
+
+#if !GLM_CONFIG_XYZW_ONLY
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) :
+		data(vdupq_n_f32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) :
+		data(vdupq_n_f32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) :
+		data(vdupq_n_f32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) :
+		data(vdupq_n_s32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) :
+		data(vdupq_n_s32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) :
+		data(vdupq_n_s32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_lowp>::vec(uint _s) :
+		data(vdupq_n_u32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_mediump>::vec(uint _s) :
+		data(vdupq_n_u32(_s))
+	{}
+
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_highp>::vec(uint _s) :
+		data(vdupq_n_u32(_s))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& rhs) :
+		data(rhs.data)
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, int, aligned_highp>& rhs) :
+		data(vcvtq_f32_s32(rhs.data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, uint, aligned_highp>& rhs) :
+		data(vcvtq_f32_u32(rhs.data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) :
+		data(vcvtq_f32_s32(vec<4, int, aligned_lowp>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) :
+		data(vcvtq_f32_s32(vec<4, int, aligned_mediump>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) :
+		data(vcvtq_f32_s32(vec<4, int, aligned_highp>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(uint _x, uint _y, uint _z, uint _w) :
+		data(vcvtq_f32_u32(vec<4, uint, aligned_lowp>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(uint _x, uint _y, uint _z, uint _w) :
+		data(vcvtq_f32_u32(vec<4, uint, aligned_mediump>(_x, _y, _z, _w).data))
+	{}
+
+	template<>
+	template<>
+	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(uint _x, uint _y, uint _z, uint _w) :
+		data(vcvtq_f32_u32(vec<4, uint, aligned_highp>(_x, _y, _z, _w).data))
+	{}
+
+#endif
+}//namespace glm
+
+#endif
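
The equality specializations above reduce a per-lane compare mask to a single boolean via pairwise minimums. The same trick in isolation (all_lanes_set is a hypothetical helper name, not part of the commit):

	#include <arm_neon.h>

	// true iff every 32-bit lane of m is 0xFFFFFFFF (i.e. all lane compares passed)
	static inline bool all_lanes_set(uint32x4_t m)
	{
	#if defined(__aarch64__)
		return vminvq_u32(m) == ~0u;        // ARMv8 horizontal minimum
	#else
		uint32x2_t r = vpmin_u32(vget_low_u32(m), vget_high_u32(m));
		r = vpmin_u32(r, r);                // min of all four lanes ends up in lane 0
		return vget_lane_u32(r, 0) == ~0u;
	#endif
	}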

+ 24 - 5
glm/simd/platform.h

@@ -235,10 +235,11 @@
 
 // User defines: GLM_FORCE_PURE GLM_FORCE_INTRINSICS GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2
 
-#define GLM_ARCH_MIPS_BIT	(0x10000000)
-#define GLM_ARCH_PPC_BIT	(0x20000000)
-#define GLM_ARCH_ARM_BIT	(0x40000000)
-#define GLM_ARCH_X86_BIT	(0x80000000)
+#define GLM_ARCH_MIPS_BIT	  (0x10000000)
+#define GLM_ARCH_PPC_BIT	  (0x20000000)
+#define GLM_ARCH_ARM_BIT	  (0x40000000)
+#define GLM_ARCH_ARMV8_BIT  (0x01000000)
+#define GLM_ARCH_X86_BIT	  (0x80000000)
 
 #define GLM_ARCH_SIMD_BIT	(0x00001000)
 
@@ -263,6 +264,7 @@
 #define GLM_ARCH_AVX		(GLM_ARCH_AVX_BIT | GLM_ARCH_SSE42)
 #define GLM_ARCH_AVX2		(GLM_ARCH_AVX2_BIT | GLM_ARCH_AVX)
 #define GLM_ARCH_ARM		(GLM_ARCH_ARM_BIT)
+#define GLM_ARCH_ARMV8		(GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM | GLM_ARCH_ARMV8_BIT)
 #define GLM_ARCH_NEON		(GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM)
 #define GLM_ARCH_MIPS		(GLM_ARCH_MIPS_BIT)
 #define GLM_ARCH_PPC		(GLM_ARCH_PPC_BIT)
@@ -270,7 +272,11 @@
 #if defined(GLM_FORCE_ARCH_UNKNOWN) || defined(GLM_FORCE_PURE)
 #	define GLM_ARCH GLM_ARCH_UNKNOWN
 #elif defined(GLM_FORCE_NEON)
-#	define GLM_ARCH (GLM_ARCH_NEON)
+#	if __ARM_ARCH >= 8
+#		define GLM_ARCH (GLM_ARCH_ARMV8)
+#	else
+#		define GLM_ARCH (GLM_ARCH_NEON)
+#	endif
 #	define GLM_FORCE_INTRINSICS
 #elif defined(GLM_FORCE_AVX2)
 #	define GLM_ARCH (GLM_ARCH_AVX2)
@@ -313,9 +319,14 @@
 #		define GLM_ARCH (GLM_ARCH_SSE2)
 #	elif defined(__i386__)
 #		define GLM_ARCH (GLM_ARCH_X86)
+#	elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
+#		define GLM_ARCH (GLM_ARCH_ARMV8)
+#warning "ARM v8"
 #	elif defined(__ARM_NEON)
+#warning "ARM NEON"
 #		define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
 #	elif defined(__arm__ ) || defined(_M_ARM)
+#warning "ARM v6"
 #		define GLM_ARCH (GLM_ARCH_ARM)
 #	elif defined(__mips__ )
 #		define GLM_ARCH (GLM_ARCH_MIPS)
@@ -355,6 +366,8 @@
 #	include <pmmintrin.h>
 #elif GLM_ARCH & GLM_ARCH_SSE2_BIT
 #	include <emmintrin.h>
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+#	include <arm_neon.h>
 #endif//GLM_ARCH
 
 #if GLM_ARCH & GLM_ARCH_SSE2_BIT
@@ -380,3 +393,9 @@
 	typedef __m256i			glm_i64vec4;
 	typedef __m256i			glm_u64vec4;
 #endif
+
+#if GLM_ARCH & GLM_ARCH_NEON_BIT
+	typedef float32x4_t			glm_f32vec4;
+	typedef int32x4_t			glm_i32vec4;
+	typedef uint32x4_t			glm_u32vec4;
+#endif
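
Since GLM_ARCH_ARMV8 is defined as a superset of the NEON and ARM bits, client code can test capabilities with simple bit masks. A small sanity-check sketch of how the flags compose:

	#include <glm/glm.hpp>

	// GLM_ARCH_ARMV8 includes the NEON, SIMD and ARM bits plus its own flag,
	// so an ARMv8 build also takes every GLM_ARCH_NEON_BIT code path
	static_assert((GLM_ARCH_ARMV8 & GLM_ARCH_NEON_BIT) != 0, "ARMv8 implies NEON");
	static_assert((GLM_ARCH_ARMV8 & GLM_ARCH_ARM_BIT) != 0, "ARMv8 implies ARM");
	static_assert((GLM_ARCH_NEON & GLM_ARCH_ARMV8_BIT) == 0, "plain NEON does not imply ARMv8");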