
Add SSE2 codepath (#93)

Add an SSE2 codepath to Jolt by providing a C++ implementation for all
functions that use SSE4.1 instructions. This enables Jolt to run on
all x64 processors.

The SSE2 codepath is ~10% slower on average than the SSE4.2 codepath
in the PerformanceTest suite.
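
The change follows the dispatch pattern Jolt already uses for NEON: each SIMD helper gains a scalar #else branch where the previous code emitted "#error Unsupported CPU architecture". A minimal sketch of the idiom, modeled on the Vec4::Dot change in this commit:

	float Vec4::Dot(Vec4Arg inV2) const
	{
	#if defined(JPH_USE_SSE4_1)
		// SSE4.1 path: a single dot-product instruction
		return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
	#elif defined(JPH_USE_NEON)
		// NEON path: multiply, then horizontal add
		return vaddvq_f32(vmulq_f32(mValue, inV2.mValue));
	#else
		// Scalar fallback: compiles on any x64 (SSE2) target
		float dot = 0.0f;
		for (int i = 0; i < 4; i++)
			dot += mF32[i] * inV2.mF32[i];
		return dot;
	#endif
	}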
John Poole, 3 years ago · commit 28f363856a
9 changed files with 202 additions and 59 deletions
  1. Build/CMakeLists.txt (+11, -3)
  2. Build/README.md (+1, -0)
  3. Jolt/Core/Core.h (+3, -0)
  4. Jolt/Math/Mat44.inl (+8, -5)
  5. Jolt/Math/Quat.inl (+1, -1)
  6. Jolt/Math/UVec4.inl (+63, -18)
  7. Jolt/Math/Vec3.inl (+65, -17)
  8. Jolt/Math/Vec4.inl (+49, -14)
  9. README.md (+1, -1)

Build/CMakeLists.txt (+11, -3)

@@ -7,7 +7,8 @@ option(TARGET_UNIT_TESTS "Build Unit Tests" ON)
 option(TARGET_HELLO_WORLD "Build Hello World" ON)
 option(TARGET_PERFORMANCE_TEST "Build Performance Test" ON)
 
-# Select X86 processor features to use (if everything is off it will be SSE4.1 compatible)
+# Select X86 processor features to use (if everything is off it will be SSE2 compatible)
+option(USE_SSE4_1 "Enable SSE4.1" ON)
 option(USE_SSE4_2 "Enable SSE4.2" ON)
 option(USE_AVX "Enable AVX" ON)
 option(USE_AVX2 "Enable AVX2" ON)
@@ -52,6 +53,9 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
 		elseif (USE_AVX)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
 		endif()
+		if (USE_SSE4_1)
+			add_compile_definitions(JPH_USE_SSE4_1)
+		endif()
 		if (USE_SSE4_2)
 			add_compile_definitions(JPH_USE_SSE4_2)
 		endif()
@@ -79,8 +83,10 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mpopcnt")
 		elseif (USE_SSE4_2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpopcnt")
-		else()
+		elseif (USE_SSE4_1)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+		else()
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
 		endif()
 		if (USE_LZCNT)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlzcnt")
@@ -117,8 +123,10 @@ elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQU
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mpopcnt")
 		elseif (USE_SSE4_2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpopcnt")
-		else()
+		elseif (USE_SSE4_1)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+		else()
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
 		endif()
 		if (USE_LZCNT)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlzcnt")

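With these changes the x86 flags cascade: AVX2 over AVX over SSE4.2 over SSE4.1, and with every option off the build targets plain SSE2. A quick way to verify which codepath a build selected (hypothetical snippet, assuming Jolt/Core/Core.h is on the include path; not part of this commit):

	#include <cstdio>
	#include <Jolt/Core/Core.h> // defines the JPH_USE_* feature macros

	int main()
	{
	#if defined(JPH_USE_SSE4_2)
		printf("SSE4.2 codepath\n");
	#elif defined(JPH_USE_SSE4_1)
		printf("SSE4.1 codepath\n");
	#else
		printf("SSE2 fallback codepath\n");
	#endif
		return 0;
	}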
Build/README.md (+1, -0)

@@ -25,6 +25,7 @@ There are a number of user configurable defines that turn on/off certain feature
 - JPH_DEBUG_RENDERER - Adds support to draw lines and triangles, used to be able to debug draw the state of the world.
 - JPH_DISABLE_TEMP_ALLOCATOR - Disables the temporary memory allocator, used mainly to allow ASAN to do its job.
 - JPH_FLOATING_POINT_EXCEPTIONS_ENABLED - Turns on division by zero and invalid floating point exception support in order to detect bugs (Windows only).
+- JPH_USE_SSE4_1 - Enable SSE4.1 CPU instructions (x64 only)
 - JPH_USE_SSE4_2 - Enable SSE4.2 CPU instructions (x64 only)
 - JPH_USE_F16C - Enable half float CPU instructions (x64 only)
 - JPH_USE_LZCNT - Enable the lzcnt CPU instruction (x64 only)

Jolt/Core/Core.h (+3, -0)

@@ -93,6 +93,9 @@
 	#if (defined(__BMI__) || defined(__AVX2__)) && !defined(JPH_USE_TZCNT)
 		#define JPH_USE_TZCNT
 	#endif
+	#if (defined(__SSE4_1__) || defined(__AVX__)) && !defined(JPH_USE_SSE4_1)
+		#define JPH_USE_SSE4_1
+	#endif
 	#if (defined(__SSE4_2__) || defined(__AVX__)) && !defined(JPH_USE_SSE4_2)
 		#define JPH_USE_SSE4_2
 	#endif

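Two detection routes feed these macros: MSVC does not define __SSE4_1__, so on Windows the CMake option injects JPH_USE_SSE4_1 directly, while GCC and Clang define __SSE4_1__ (and __AVX__ where applicable) from -msse4.1/-mavx and the block above derives it. A compile-time sanity check one could add (illustrative sketch, not in this commit):

	#include <Jolt/Core/Core.h>

	// SSE4.2 hardware always supports SSE4.1, so flag inconsistent configs early
	#if defined(JPH_USE_SSE4_2) && !defined(JPH_USE_SSE4_1)
		#error "JPH_USE_SSE4_2 is set without JPH_USE_SSE4_1; check the CMake options"
	#endif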
Jolt/Math/Mat44.inl (+8, -5)

@@ -76,7 +76,7 @@ Mat44 Mat44::sRotation(QuatArg inQuat)
 	JPH_ASSERT(inQuat.IsNormalized());
 
 	// See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
-#ifdef JPH_USE_SSE
+#ifdef JPH_USE_SSE4_1
 	__m128 xyzw = inQuat.mValue.mValue;
 	__m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
 	__m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
@@ -167,7 +167,7 @@ Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
 
 Mat44 Mat44::sCrossProduct(Vec3Arg inV)
 {
-#ifdef JPH_USE_SSE
+#ifdef JPH_USE_SSE4_1
 	// Zero out the W component
 	__m128 zero = _mm_setzero_ps();
 	__m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
@@ -295,7 +295,7 @@ Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
 
 Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	__m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
 	__m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
 	__m128 xy = _mm_blend_ps(x, y, 0b0010);
@@ -1026,7 +1026,7 @@ Mat44 Mat44::GetRotation() const
 
 Mat44 Mat44::GetRotationSafe() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	__m128 zero = _mm_setzero_ps();
 	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
 				 _mm_blend_ps(mCol[1].mValue, zero, 8),
@@ -1038,7 +1038,10 @@ Mat44 Mat44::GetRotationSafe() const
 				 vsetq_lane_f32(0, mCol[2].mValue, 3),
 				 Vec4(0, 0, 0, 1));
 #else
-	#error Unsupported CPU architecture
+	return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
+				 Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
+				 Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
+				 Vec4(0, 0, 0, 1));
 #endif
 }
 

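GetRotationSafe now has a scalar branch that rebuilds the matrix column by column: the W component of the first three columns is zeroed and the last column is replaced with (0, 0, 0, 1), matching the SSE4.1 blend and the NEON lane-set versions. Illustrative usage (API names from Jolt's Mat44/Quat):

	Mat44 transform = Mat44::sRotationTranslation(Quat::sIdentity(), Vec3(1, 2, 3));
	Mat44 rot = transform.GetRotationSafe(); // translation column becomes (0, 0, 0, 1)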
Jolt/Math/Quat.inl (+1, -1)

@@ -5,7 +5,7 @@ namespace JPH {
 
 Quat Quat::operator * (QuatArg inRHS) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	// Taken from: http://momchil-velikov.blogspot.nl/2013/10/fast-sse-quternion-multiplication.html
 	__m128 abcd = mValue.mValue;
 	__m128 xyzw = inRHS.mValue.mValue;

Jolt/Math/UVec4.inl (+63, -18)

@@ -105,23 +105,33 @@ UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
 
 UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_min_epu32(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_NEON)
 	return vminq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_max_epu32(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_NEON)
 	return vmaxq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
+	}
+	return result;
 #endif
 }
 
@@ -138,12 +148,17 @@ UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
 
 UVec4 UVec4::sSelect(UVec4Arg inV1, UVec4Arg inV2, UVec4Arg inControl)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inV1.mValue), _mm_castsi128_ps(inV2.mValue), _mm_castsi128_ps(inControl.mValue)));
 #elif defined(JPH_USE_NEON)
 	return vbslq_u32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = inControl.mU32[i] ? inV2.mU32[i] : inV1.mU32[i];
+	}
+	return result;
 #endif
 }
 
@@ -208,12 +223,17 @@ UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
 
 UVec4 UVec4::operator * (UVec4Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_mullo_epi32(mValue, inV2.mValue);
 #elif defined(JPH_USE_NEON)
 	return vmulq_u32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = mU32[i] * inV2.mU32[i];
+	}
+	return result;
 #endif
 }
 
@@ -441,61 +461,86 @@ UVec4 UVec4::Expand4Uint16Hi() const
 
 UVec4 UVec4::Expand4Byte0() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::Expand4Byte4() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::Expand4Byte8() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::Expand4Byte12() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::ShiftComponents4Minus(int inCount) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, sFourMinusXShuffle[inCount].mValue);
 #elif defined(JPH_USE_NEON)
 	uint8x16_t idx = vreinterpretq_u8_u32(sFourMinusXShuffle[inCount].mValue);
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result(0, 0, 0, 0);
+	for (int i = 0; i < inCount; i++)
+	{
+		result.mU32[i] = mU32[i + 4 - inCount];
+	}
+	return result;
 #endif
 }
 

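The scalar Expand4ByteN fallbacks lean on x64 being little endian: byte i of the selected 32-bit lane becomes lane i of the result, which is exactly what the _mm_shuffle_epi8 index vectors encode. A worked example (values illustrative):

	UVec4 v(0xDDCCBBAAu, 0, 0, 0);
	UVec4 e = v.Expand4Byte0();
	// e == (0xAA, 0xBB, 0xCC, 0xDD) on the SSE4.1, NEON and scalar paths alike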
Jolt/Math/Vec3.inl (+65, -17)

@@ -223,14 +223,21 @@ Vec3 Vec3::sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
 
 Vec3 Vec3::sSelect(Vec3Arg inV1, Vec3Arg inV2, UVec4Arg inControl)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	Type v = _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
+	return sFixW(v);
 #elif defined(JPH_USE_NEON)
 	Type v = vbslq_f32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
+	return sFixW(v);
 #else
-	#error Unsupported CPU architecture
+	Vec3 result;
+	for (int i = 0; i < 3; i++)
+	{
+		result.mF32[i] = inControl.mU32[i] ? inV2.mF32[i] : inV1.mF32[i];
+	}
+	result.mF32[3] = result.mF32[2];
+	return result;
 #endif
-	return sFixW(v);
 }
 
 Vec3 Vec3::sOr(Vec3Arg inV1, Vec3Arg inV2)
@@ -527,59 +534,79 @@ Vec3 Vec3::Cross(Vec3Arg inV2) const
 
 Vec3 Vec3::DotV(Vec3Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vdupq_n_f32(vaddvq_f32(mul));
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return Vec3(dot, dot, dot);
 #endif
 }
 
 Vec4 Vec3::DotV4(Vec3Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vdupq_n_f32(vaddvq_f32(mul));
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return Vec4(dot, dot, dot, dot);
 #endif
 }
 
 float Vec3::Dot(Vec3Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0x7f));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return dot;
 #endif
 }
 
 float Vec3::LengthSq() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0x7f));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return len_sq;
 #endif
 }
 
 float Vec3::Length() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0x7f)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
@@ -587,7 +614,12 @@ float Vec3::Length() const
 	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
 	return vget_lane_f32(vsqrt_f32(sum), 0);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return sqrt(len_sq);
 #endif
 }
 
@@ -604,7 +636,7 @@ Vec3 Vec3::Sqrt() const
 
 Vec3 Vec3::Normalized() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0x7f)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
@@ -612,13 +644,19 @@ Vec3 Vec3::Normalized() const
 	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
 	return vdivq_f32(mValue, vsqrtq_f32(sum));
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	float len = sqrt(len_sq);
+	return Vec3(mF32[0] / len, mF32[1] / len, mF32[2] / len);
 #endif
 }
 
 Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	Type len_sq = _mm_dp_ps(mValue, mValue, 0x7f);
 	Type is_zero = _mm_cmpeq_ps(len_sq, _mm_setzero_ps());
 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
@@ -637,7 +675,17 @@ Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
 	float32x4_t is_zero = vceqq_f32(len, vdupq_n_f32(0));
 	return vbslq_f32(is_zero, inZeroValue.mValue, vdivq_f32(mValue, len));
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	float len = sqrt(len_sq);
+	if (len == 0.0f) {
+		return inZeroValue;
+	} else {
+		return Vec3(mF32[0] / len, mF32[1] / len, mF32[2] / len);
+	}
 #endif
 }
 

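The scalar NormalizedOr branch makes the zero-length guard explicit: compute the squared length, and only divide when it is nonzero. Usage (illustrative):

	Vec3 v = Vec3::sZero();
	Vec3 n = v.NormalizedOr(Vec3::sAxisX()); // yields (1, 0, 0) instead of NaN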
Jolt/Math/Vec4.inl (+49, -14)

@@ -14,12 +14,16 @@ Vec4::Vec4(Vec3Arg inRHS) :
 
 Vec4::Vec4(Vec3Arg inRHS, float inW)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
 #elif defined(JPH_USE_NEON)
 	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
 #else
-	#error Undefined CPU architecture
+	for (int i = 0; i < 3; i++)
+	{
+		mF32[i] = inRHS.mF32[i];
+	}
+	mF32[3] = inW;
 #endif
 }
 
@@ -222,12 +226,17 @@ Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
 
 Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
 #elif defined(JPH_USE_NEON)
 	return vbslq_f32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
 #else
-	#error Unsupported CPU architecture
+	Vec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mF32[i] = inControl.mU32[i] ? inV2.mF32[i] : inV1.mF32[i];
+	}
+	return result;
 #endif
 }
 
@@ -550,50 +559,70 @@ Vec4 Vec4::Reciprocal() const
 
 Vec4 Vec4::DotV(Vec4Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	return vdupq_n_f32(vaddvq_f32(mul));
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return Vec4(dot, dot, dot, dot);
 #endif
 }
 
 float Vec4::Dot(Vec4Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return dot;
 #endif
 }
 
 float Vec4::LengthSq() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return len_sq;
 #endif
 }
 
 float Vec4::Length() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
 	return vget_lane_f32(vsqrt_f32(sum), 0);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return sqrt(len_sq);
 #endif
 }
 
@@ -626,14 +655,20 @@ Vec4 Vec4::GetSign() const
 
 Vec4 Vec4::Normalized() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
 	return vdivq_f32(mValue, vsqrtq_f32(sum));
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	float len = sqrt(len_sq);
+	return Vec4(mF32[0] / len, mF32[1] / len, mF32[2] / len, mF32[3] / len);
 #endif
 }
 

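One subtlety in the new scalar sSelect fallbacks (UVec4, Vec3 and Vec4 alike): the SSE _mm_blendv_ps path selects on the sign bit of each control lane, and the NEON path shifts the sign bit down, while the C++ branch treats any nonzero lane as true. Jolt's comparison results are all-zeros or all-ones per lane, so the paths agree in practice; a stricter scalar variant would test the high bit explicitly (sketch, not from this commit):

	// Select on the sign bit only, mirroring _mm_blendv_ps exactly
	result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inV2.mF32[i] : inV1.mF32[i];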
README.md (+1, -1)

@@ -84,7 +84,7 @@ For more information see the [Architecture and API documentation](https://jrouwe
 
 ## Required CPU features
 
-* On x86 the minimal requirements are SSE4.1 but the library can be compiled using SSE4.2, AVX or AVX2.
+* On x86 the minimal requirement is SSE2, but the library can be compiled using SSE4.1, SSE4.2, AVX or AVX2.
 * On ARM64 the library requires NEON with FP16 support.
 
 ## Compiling