Browse Source

Clarify support for SSSE3, SSE4.1 and SSE4.2

Christophe Riccio 9 years ago
parent
commit
71e6b537cc
2 changed files with 49 additions and 24 deletions
  1. 36 24
      glm/detail/setup.hpp
  2. 13 0
      glm/simd/common.h

+ 36 - 24
glm/detail/setup.hpp

@@ -70,14 +70,16 @@
 
 #define GLM_ARCH_PURE		0x00000000
 #define GLM_ARCH_X86		0x00000001
-#define GLM_ARCH_SSE2		0x00000002
-#define GLM_ARCH_SSE3		0x00000004
-#define GLM_ARCH_SSE4		0x00000008
-#define GLM_ARCH_AVX		0x00000010
-#define GLM_ARCH_AVX2		0x00000020
-#define GLM_ARCH_AVX512		0x00000040 // Skylake subset
+// Each SIMD level is a cumulative mask: its own bit OR'ed with the mask of
+// the level it implies. Every composite value is parenthesized so that it
+// behaves as a single operand inside expressions such as `GLM_ARCH & X`
+// (`&` binds tighter than `|`, so a bare `a | b` expansion would be split).
+// To test for a level, compare against the full mask:
+//     (GLM_ARCH & GLM_ARCH_SSSE3) == GLM_ARCH_SSSE3
+#define GLM_ARCH_SSE2		(0x00000002 | GLM_ARCH_X86)
+#define GLM_ARCH_SSE3		(0x00000004 | GLM_ARCH_SSE2)
+#define GLM_ARCH_SSSE3		(0x00000008 | GLM_ARCH_SSE3)
+#define GLM_ARCH_SSE41		(0x00000010 | GLM_ARCH_SSSE3)
+#define GLM_ARCH_SSE42		(0x00000020 | GLM_ARCH_SSE41)
+#define GLM_ARCH_AVX		(0x00000040 | GLM_ARCH_SSE42)
+#define GLM_ARCH_AVX2		(0x00000080 | GLM_ARCH_AVX)
+// AVX512 uses bit 0x400: bit 0x100 is already taken by GLM_ARCH_ARM and
+// bit 0x200 by GLM_ARCH_NEON, so reusing either would alias x86 and ARM.
+#define GLM_ARCH_AVX512		(0x00000400 | GLM_ARCH_AVX2) // Skylake subset
 #define GLM_ARCH_ARM		0x00000100
-#define GLM_ARCH_NEON		0x00000200
+#define GLM_ARCH_NEON		(0x00000200 | GLM_ARCH_ARM)
 #define GLM_ARCH_MIPS		0x00010000
 #define GLM_ARCH_PPC		0x01000000
 
@@ -88,31 +90,41 @@
 #elif defined(GLM_FORCE_PPC)
 #	define GLM_ARCH (GLM_ARCH_PPC)
 #elif defined(GLM_FORCE_NEON)
-#	define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
+#	define GLM_ARCH (GLM_ARCH_NEON)
 #elif defined(GLM_FORCE_AVX512)
-#	define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX512 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#	define GLM_ARCH (GLM_ARCH_AVX512)
 #elif defined(GLM_FORCE_AVX2)
-#	define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#	define GLM_ARCH (GLM_ARCH_AVX2)
 #elif defined(GLM_FORCE_AVX)
-#	define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
-#elif defined(GLM_FORCE_SSE4)
-#	define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#	define GLM_ARCH (GLM_ARCH_AVX)
+#elif defined(GLM_FORCE_SSE42)
+#	define GLM_ARCH (GLM_ARCH_SSE42)
+#elif defined(GLM_FORCE_SSE41)
+#	define GLM_ARCH (GLM_ARCH_SSE41)
+#elif defined(GLM_FORCE_SSSE3)
+#	define GLM_ARCH (GLM_ARCH_SSSE3)
 #elif defined(GLM_FORCE_SSE3)
-#	define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#	define GLM_ARCH (GLM_ARCH_SSE3)
 #elif defined(GLM_FORCE_SSE2)
-#	define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+#	define GLM_ARCH (GLM_ARCH_SSE2)
 #elif (GLM_COMPILER & (GLM_COMPILER_LLVM | GLM_COMPILER_GCC)) || ((GLM_COMPILER & GLM_COMPILER_INTEL) && (GLM_PLATFORM & GLM_PLATFORM_LINUX))
 //	This is the Skylake subset of the AVX-512 instruction set
 #	if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX512 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_AVX512)
 #	elif defined(__AVX2__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_AVX2)
 #	elif defined(__AVX__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_AVX)
+#	elif defined(__SSE4_2__)
+#		define GLM_ARCH (GLM_ARCH_SSE42)
+#	elif defined(__SSE4_1__)
+#		define GLM_ARCH (GLM_ARCH_SSE41)
+#	elif defined(__SSSE3__)
+#		define GLM_ARCH (GLM_ARCH_SSSE3)
 #	elif defined(__SSE3__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_SSE3)
 #	elif defined(__SSE2__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_SSE2)
 #	elif defined(__i386__) || defined(__x86_64__)
 #		define GLM_ARCH (GLM_ARCH_X86)
 #	elif defined(__ARM_NEON)
@@ -130,14 +142,14 @@
 #	if defined(_M_ARM)
 #		define GLM_ARCH (GLM_ARCH_ARM)
 #	elif defined(__AVX2__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_AVX2)
 #	elif defined(__AVX__)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_AVX)
 #	elif defined(_M_X64)
-#		define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+#		define GLM_ARCH (GLM_ARCH_SSE2)
 #	elif defined(_M_IX86_FP)
 #		if _M_IX86_FP >= 2
-#			define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+#			define GLM_ARCH (GLM_ARCH_SSE2)
 #		else
 #			define GLM_ARCH (GLM_ARCH_PURE)
 #		endif

+ 13 - 0
glm/simd/common.h

@@ -12,11 +12,24 @@ static const __m128 GLM_VAR_USED glm_three = _mm_set_ps1(3.0f);
 
 static const __m128 GLM_VAR_USED glm_ps_2pow23 = _mm_set_ps1(8388608.0f);
 
+//abs
 // Per-lane absolute value of four packed single-precision floats.
 // Each 32-bit lane is ANDed with 0x7FFFFFFF, which clears bit 31 (the
 // sign bit) and leaves all other bits of the lane unchanged.
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_abs(__m128 x)
 {
 	return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
 }
 
+// Per-lane absolute value of four packed signed 32-bit integers.
+// Both code paths use wrapping two's-complement arithmetic, so a lane
+// holding INT32_MIN is returned unchanged.
+GLM_FUNC_QUALIFIER __m128i glm_i32v4_abs(__m128i x)
+{
+	// GLM_ARCH_SSSE3 is a cumulative mask (it also carries the SSE3, SSE2 and
+	// X86 bits), so a plain `GLM_ARCH & GLM_ARCH_SSSE3` is non-zero on any x86
+	// target, including SSE2-only builds. Require every SSSE3 bit instead.
+	// Both operands are parenthesized so the test does not depend on how the
+	// macros themselves are written.
+#	if ((GLM_ARCH) & (GLM_ARCH_SSSE3)) == (GLM_ARCH_SSSE3)
+		// SSSE3 path: negate each lane of x whose own value is negative.
+		return _mm_sign_epi32(x, x);
+#	else
+		// SSE2 fallback: (x ^ s) - s, where s = x >> 31 is all ones for a
+		// negative lane and zero otherwise.
+		__m128i const sgn0 = _mm_srai_epi32(x, 31);
+		__m128i const inv0 = _mm_xor_si128(x, sgn0);
+		__m128i const sub0 = _mm_sub_epi32(inv0, sgn0);
+		return sub0;
+#	endif
+}
+
 //sign
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_sgn(__m128 x)
 {