
Add SSE2 codepath (#93)

Add an SSE2 codepath to Jolt by providing a C++ implementation for all
functions that use SSE4.1 instructions. This enables Jolt to run on
all x64 processors.

The SSE2 codepath is ~10% slower on average than the SSE4.2 codepath
in the PerformanceTest suite.
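
The change follows the dispatch pattern Jolt already uses for NEON: each SIMD helper gains a scalar #else branch where the previous code emitted "#error Unsupported CPU architecture". A minimal sketch of the idiom, modeled on the Vec4::Dot change in this commit:

	float Vec4::Dot(Vec4Arg inV2) const
	{
	#if defined(JPH_USE_SSE4_1)
		// SSE4.1 path: a single dot-product instruction
		return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
	#elif defined(JPH_USE_NEON)
		// NEON path: multiply, then horizontal add
		return vaddvq_f32(vmulq_f32(mValue, inV2.mValue));
	#else
		// Scalar fallback: compiles on any x64 (SSE2) target
		float dot = 0.0f;
		for (int i = 0; i < 4; i++)
			dot += mF32[i] * inV2.mF32[i];
		return dot;
	#endif
	}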
John Poole, 3 years ago · commit 28f363856a
9 changed files with 202 additions and 59 deletions
  1. Build/CMakeLists.txt (+11, -3)
  2. Build/README.md (+1, -0)
  3. Jolt/Core/Core.h (+3, -0)
  4. Jolt/Math/Mat44.inl (+8, -5)
  5. Jolt/Math/Quat.inl (+1, -1)
  6. Jolt/Math/UVec4.inl (+63, -18)
  7. Jolt/Math/Vec3.inl (+65, -17)
  8. Jolt/Math/Vec4.inl (+49, -14)
  9. README.md (+1, -1)

Build/CMakeLists.txt (+11, -3)

@@ -7,7 +7,8 @@ option(TARGET_UNIT_TESTS "Build Unit Tests" ON)
 option(TARGET_HELLO_WORLD "Build Hello World" ON)
 option(TARGET_PERFORMANCE_TEST "Build Performance Test" ON)
 
-# Select X86 processor features to use (if everything is off it will be SSE4.1 compatible)
+# Select X86 processor features to use (if everything is off it will be SSE2 compatible)
+option(USE_SSE4_1 "Enable SSE4.1" ON)
 option(USE_SSE4_2 "Enable SSE4.2" ON)
 option(USE_AVX "Enable AVX" ON)
 option(USE_AVX2 "Enable AVX2" ON)
@@ -52,6 +53,9 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
 		elseif (USE_AVX)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
 		endif()
+		if (USE_SSE4_1)
+			add_compile_definitions(JPH_USE_SSE4_1)
+		endif()
 		if (USE_SSE4_2)
 			add_compile_definitions(JPH_USE_SSE4_2)
 		endif()
@@ -79,8 +83,10 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mpopcnt")
 		elseif (USE_SSE4_2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpopcnt")
-		else()
+		elseif (USE_SSE4_1)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+		else()
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
 		endif()
 		if (USE_LZCNT)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlzcnt")
@@ -117,8 +123,10 @@ elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQU
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mpopcnt")
 		elseif (USE_SSE4_2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpopcnt")
-		else()
+		elseif (USE_SSE4_1)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+		else()
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
 		endif()
 		if (USE_LZCNT)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlzcnt")

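With these changes the x86 flags cascade: AVX2 over AVX over SSE4.2 over SSE4.1, and with every option off the build targets plain SSE2. A quick way to verify which codepath a build selected (hypothetical snippet, assuming Jolt/Core/Core.h is on the include path; not part of this commit):

	#include <cstdio>
	#include <Jolt/Core/Core.h> // defines the JPH_USE_* feature macros

	int main()
	{
	#if defined(JPH_USE_SSE4_2)
		printf("SSE4.2 codepath\n");
	#elif defined(JPH_USE_SSE4_1)
		printf("SSE4.1 codepath\n");
	#else
		printf("SSE2 fallback codepath\n");
	#endif
		return 0;
	}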
Build/README.md (+1, -0)

@@ -25,6 +25,7 @@ There are a number of user configurable defines that turn on/off certain feature
 - JPH_DEBUG_RENDERER - Adds support to draw lines and triangles, used to be able to debug draw the state of the world.
 - JPH_DISABLE_TEMP_ALLOCATOR - Disables the temporary memory allocator, used mainly to allow ASAN to do its job.
 - JPH_FLOATING_POINT_EXCEPTIONS_ENABLED - Turns on division by zero and invalid floating point exception support in order to detect bugs (Windows only).
+- JPH_USE_SSE4_1 - Enable SSE4.1 CPU instructions (x64 only)
 - JPH_USE_SSE4_2 - Enable SSE4.2 CPU instructions (x64 only)
 - JPH_USE_F16C - Enable half float CPU instructions (x64 only)
 - JPH_USE_LZCNT - Enable the lzcnt CPU instruction (x64 only)

Jolt/Core/Core.h (+3, -0)

@@ -93,6 +93,9 @@
 	#if (defined(__BMI__) || defined(__AVX2__)) && !defined(JPH_USE_TZCNT)
 		#define JPH_USE_TZCNT
 	#endif
+	#if (defined(__SSE4_1__) || defined(__AVX__)) && !defined(JPH_USE_SSE4_1)
+		#define JPH_USE_SSE4_1
+	#endif
 	#if (defined(__SSE4_2__) || defined(__AVX__)) && !defined(JPH_USE_SSE4_2)
 		#define JPH_USE_SSE4_2
 	#endif

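Two detection routes feed these macros: MSVC does not define __SSE4_1__, so on Windows the CMake option injects JPH_USE_SSE4_1 directly, while GCC and Clang define __SSE4_1__ (and __AVX__ where applicable) from -msse4.1/-mavx and the block above derives it. A compile-time sanity check one could add (illustrative sketch, not in this commit):

	#include <Jolt/Core/Core.h>

	// SSE4.2 hardware always supports SSE4.1, so flag inconsistent configs early
	#if defined(JPH_USE_SSE4_2) && !defined(JPH_USE_SSE4_1)
		#error "JPH_USE_SSE4_2 is set without JPH_USE_SSE4_1; check the CMake options"
	#endif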
Jolt/Math/Mat44.inl (+8, -5)

@@ -76,7 +76,7 @@ Mat44 Mat44::sRotation(QuatArg inQuat)
 	JPH_ASSERT(inQuat.IsNormalized());
 
 	// See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
-#ifdef JPH_USE_SSE
+#ifdef JPH_USE_SSE4_1
 	__m128 xyzw = inQuat.mValue.mValue;
 	__m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
 	__m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
@@ -167,7 +167,7 @@ Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
 
 Mat44 Mat44::sCrossProduct(Vec3Arg inV)
 {
-#ifdef JPH_USE_SSE
+#ifdef JPH_USE_SSE4_1
 	// Zero out the W component
 	__m128 zero = _mm_setzero_ps();
 	__m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
@@ -295,7 +295,7 @@ Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
 
 Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	__m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
 	__m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
 	__m128 xy = _mm_blend_ps(x, y, 0b0010);
@@ -1026,7 +1026,7 @@ Mat44 Mat44::GetRotation() const
 
 Mat44 Mat44::GetRotationSafe() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	__m128 zero = _mm_setzero_ps();
 	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
 				 _mm_blend_ps(mCol[1].mValue, zero, 8),
@@ -1038,7 +1038,10 @@ Mat44 Mat44::GetRotationSafe() const
 				 vsetq_lane_f32(0, mCol[2].mValue, 3),
 				 Vec4(0, 0, 0, 1));
 #else
-	#error Unsupported CPU architecture
+	return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
+				 Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
+				 Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
+				 Vec4(0, 0, 0, 1));
 #endif
 }
 

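GetRotationSafe now has a scalar branch that rebuilds the matrix column by column: the W component of the first three columns is zeroed and the last column is replaced with (0, 0, 0, 1), matching the SSE4.1 blend and the NEON lane-set versions. Illustrative usage (API names from Jolt's Mat44/Quat):

	Mat44 transform = Mat44::sRotationTranslation(Quat::sIdentity(), Vec3(1, 2, 3));
	Mat44 rot = transform.GetRotationSafe(); // translation column becomes (0, 0, 0, 1)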
Jolt/Math/Quat.inl (+1, -1)

@@ -5,7 +5,7 @@ namespace JPH {
 
 Quat Quat::operator * (QuatArg inRHS) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	// Taken from: http://momchil-velikov.blogspot.nl/2013/10/fast-sse-quternion-multiplication.html
 	__m128 abcd = mValue.mValue;
 	__m128 xyzw = inRHS.mValue.mValue;

Jolt/Math/UVec4.inl (+63, -18)

@@ -105,23 +105,33 @@ UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
 
 UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_min_epu32(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_NEON)
 	return vminq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_max_epu32(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_NEON)
 	return vmaxq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
+	}
+	return result;
 #endif
 }
 
@@ -138,12 +148,17 @@ UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
 
 UVec4 UVec4::sSelect(UVec4Arg inV1, UVec4Arg inV2, UVec4Arg inControl)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inV1.mValue), _mm_castsi128_ps(inV2.mValue), _mm_castsi128_ps(inControl.mValue)));
 #elif defined(JPH_USE_NEON)
 	return vbslq_u32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = inControl.mU32[i] ? inV2.mU32[i] : inV1.mU32[i];
+	}
+	return result;
 #endif
 }
 
@@ -208,12 +223,17 @@ UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
 
 UVec4 UVec4::operator * (UVec4Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_mullo_epi32(mValue, inV2.mValue);
 #elif defined(JPH_USE_NEON)
 	return vmulq_u32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = mU32[i] * inV2.mU32[i];
+	}
+	return result;
 #endif
 }
 
@@ -441,61 +461,86 @@ UVec4 UVec4::Expand4Uint16Hi() const
 
 UVec4 UVec4::Expand4Byte0() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::Expand4Byte4() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::Expand4Byte8() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::Expand4Byte12() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
 #elif defined(JPH_USE_NEON)
 	int8x16_t idx = { 0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f };
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
+	}
+	return result;
 #endif
 }
 
 UVec4 UVec4::ShiftComponents4Minus(int inCount) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, sFourMinusXShuffle[inCount].mValue);
 #elif defined(JPH_USE_NEON)
 	uint8x16_t idx = vreinterpretq_u8_u32(sFourMinusXShuffle[inCount].mValue);
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
-	#error Unsupported CPU architecture
+	UVec4 result(0, 0, 0, 0);
+	for (int i = 0; i < inCount; i++)
+	{
+		result.mU32[i] = mU32[i + 4 - inCount];
+	}
+	return result;
 #endif
 }
 

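The scalar Expand4ByteN fallbacks lean on x64 being little endian: byte i of the selected 32-bit lane becomes lane i of the result, which is exactly what the _mm_shuffle_epi8 index vectors encode. A worked example (values illustrative):

	UVec4 v(0xDDCCBBAAu, 0, 0, 0);
	UVec4 e = v.Expand4Byte0();
	// e == (0xAA, 0xBB, 0xCC, 0xDD) on the SSE4.1, NEON and scalar paths alike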
Jolt/Math/Vec3.inl (+65, -17)

@@ -223,14 +223,21 @@ Vec3 Vec3::sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
 
 Vec3 Vec3::sSelect(Vec3Arg inV1, Vec3Arg inV2, UVec4Arg inControl)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	Type v = _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
+	return sFixW(v);
 #elif defined(JPH_USE_NEON)
 	Type v = vbslq_f32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
+	return sFixW(v);
 #else
-	#error Unsupported CPU architecture
+	Vec3 result;
+	for (int i = 0; i < 3; i++)
+	{
+		result.mF32[i] = inControl.mU32[i] ? inV2.mF32[i] : inV1.mF32[i];
+	}
+	result.mF32[3] = result.mF32[2];
+	return result;
 #endif
-	return sFixW(v);
 }
 
 Vec3 Vec3::sOr(Vec3Arg inV1, Vec3Arg inV2)
@@ -527,59 +534,79 @@ Vec3 Vec3::Cross(Vec3Arg inV2) const
 
 Vec3 Vec3::DotV(Vec3Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vdupq_n_f32(vaddvq_f32(mul));
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return Vec3(dot, dot, dot);
 #endif
 }
 
 Vec4 Vec3::DotV4(Vec3Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vdupq_n_f32(vaddvq_f32(mul));
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return Vec4(dot, dot, dot, dot);
 #endif
 }
 
 float Vec3::Dot(Vec3Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0x7f));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return dot;
 #endif
 }
 
 float Vec3::LengthSq() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0x7f));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	mul = vsetq_lane_f32(0, mul, 3);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return len_sq;
 #endif
 }
 
 float Vec3::Length() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0x7f)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
@@ -587,7 +614,12 @@ float Vec3::Length() const
 	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
 	return vget_lane_f32(vsqrt_f32(sum), 0);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return sqrt(len_sq);
 #endif
 }
 
@@ -604,7 +636,7 @@ Vec3 Vec3::Sqrt() const
 
 Vec3 Vec3::Normalized() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0x7f)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
@@ -612,13 +644,19 @@ Vec3 Vec3::Normalized() const
 	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
 	return vdivq_f32(mValue, vsqrtq_f32(sum));
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	float len = sqrt(len_sq);
+	return Vec3(mF32[0] / len, mF32[1] / len, mF32[2] / len);
 #endif
 }
 
 Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	Type len_sq = _mm_dp_ps(mValue, mValue, 0x7f);
 	Type is_zero = _mm_cmpeq_ps(len_sq, _mm_setzero_ps());
 #ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
@@ -637,7 +675,17 @@ Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
 	float32x4_t is_zero = vceqq_f32(len, vdupq_n_f32(0));
 	return vbslq_f32(is_zero, inZeroValue.mValue, vdivq_f32(mValue, len));
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 3; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	float len = sqrt(len_sq);
+	if (len == 0.0f) {
+		return inZeroValue;
+	} else {
+		return Vec3(mF32[0] / len, mF32[1] / len, mF32[2] / len);
+	}
 #endif
 }
 

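The scalar NormalizedOr branch makes the zero-length guard explicit: compute the squared length, and only divide when it is nonzero. Usage (illustrative):

	Vec3 v = Vec3::sZero();
	Vec3 n = v.NormalizedOr(Vec3::sAxisX()); // yields (1, 0, 0) instead of NaN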
Jolt/Math/Vec4.inl (+49, -14)

@@ -14,12 +14,16 @@ Vec4::Vec4(Vec3Arg inRHS) :
 
 Vec4::Vec4(Vec3Arg inRHS, float inW)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
 #elif defined(JPH_USE_NEON)
 	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
 #else
-	#error Undefined CPU architecture
+	for (int i = 0; i < 3; i++)
+	{
+		mF32[i] = inRHS.mF32[i];
+	}
+	mF32[3] = inW;
 #endif
 }
 
@@ -222,12 +226,17 @@ Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
 
 Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
 #elif defined(JPH_USE_NEON)
 	return vbslq_f32(vshrq_n_s32(inControl.mValue, 31), inV2.mValue, inV1.mValue);
 #else
-	#error Unsupported CPU architecture
+	Vec4 result;
+	for (int i = 0; i < 4; i++)
+	{
+		result.mF32[i] = inControl.mU32[i] ? inV2.mF32[i] : inV1.mF32[i];
+	}
+	return result;
 #endif
 }
 
@@ -550,50 +559,70 @@ Vec4 Vec4::Reciprocal() const
 
 Vec4 Vec4::DotV(Vec4Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	return vdupq_n_f32(vaddvq_f32(mul));
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return Vec4(dot, dot, dot, dot);
 #endif
 }
 
 float Vec4::Dot(Vec4Arg inV2) const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float dot = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		dot += mF32[i] * inV2.mF32[i];
+	}
+	return dot;
 #endif
 }
 
 float Vec4::LengthSq() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	return vaddvq_f32(mul);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return len_sq;
 #endif
 }
 
 float Vec4::Length() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
 	return vget_lane_f32(vsqrt_f32(sum), 0);
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	return sqrt(len_sq);
 #endif
 }
 
@@ -626,14 +655,20 @@ Vec4 Vec4::GetSign() const
 
 Vec4 Vec4::Normalized() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_SSE4_1)
 	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
 #elif defined(JPH_USE_NEON)
 	float32x4_t mul = vmulq_f32(mValue, mValue);
 	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
 	return vdivq_f32(mValue, vsqrtq_f32(sum));
 #else
-	#error Unsupported CPU architecture
+	float len_sq = 0.0f;
+	for (int i = 0; i < 4; i++)
+	{
+		len_sq += mF32[i] * mF32[i];
+	}
+	float len = sqrt(len_sq);
+	return Vec4(mF32[0] / len, mF32[1] / len, mF32[2] / len, mF32[3] / len);
 #endif
 }
 

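One subtlety in the new scalar sSelect fallbacks (UVec4, Vec3 and Vec4 alike): the SSE _mm_blendv_ps path selects on the sign bit of each control lane, and the NEON path shifts the sign bit down, while the C++ branch treats any nonzero lane as true. Jolt's comparison results are all-zeros or all-ones per lane, so the paths agree in practice; a stricter scalar variant would test the high bit explicitly (sketch, not from this commit):

	// Select on the sign bit only, mirroring _mm_blendv_ps exactly
	result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inV2.mF32[i] : inV1.mF32[i];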
README.md (+1, -1)

@@ -84,7 +84,7 @@ For more information see the [Architecture and API documentation](https://jrouwe
 
 ## Required CPU features
 
-* On x86 the minimal requirements are SSE4.1 but the library can be compiled using SSE4.2, AVX or AVX2.
+* On x86 the minimal requirement is SSE2, but the library can be compiled using SSE4.1, SSE4.2, AVX or AVX2.
 * On ARM64 the library requires NEON with FP16 support.
 
 ## Compiling