Ver código fonte

Created DVec3 fallback path for when AVX is not available (#336)

* Added unit tests
Jorrit Rouwe 2 anos atrás
pai
commit
3d2b4b8b4f

+ 24 - 18
Jolt/Math/DVec3.h

@@ -5,8 +5,6 @@
 
 #include <Jolt/Math/Swizzle.h>
 
-#ifdef JPH_USE_AVX2 // DVec3 currently uses AVX2 intrinsics but the class is currently unused so we can leave it out (it will be used in the future to support objects at a large distance from the origin)
-
 JPH_NAMESPACE_BEGIN
 
 /// 3 component vector of doubles (stored as 4 doubles).
@@ -16,29 +14,25 @@ class [[nodiscard]] DVec3
 public:
 	JPH_OVERRIDE_NEW_DELETE
 
-#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
-	/// Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate div by 0
-	JPH_INLINE void				CheckW() const									{ JPH_ASSERT(reinterpret_cast<const uint64 *>(mD32)[2] == reinterpret_cast<const uint64 *>(mD32)[3]); } // Avoid asserts when both components are NaN
-	
-	/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
-	static JPH_INLINE __m256d	sFixW(__m256d inValue)							{ return _mm256_shuffle_pd(inValue, inValue, 2); }
+	// Underlying vector type
+#if defined(JPH_USE_AVX)
+	using Type = __m256d;
 #else
-	/// Stub function
-	JPH_INLINE void				CheckW() const									{ }
-	
-	/// Stub function
-	static JPH_INLINE __m256d	sFixW(__m256d inValue)							{ return inValue; }
-#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+	using Type = struct { double mData[4]; };
+#endif
 
 	/// Constructor
 								DVec3() = default; ///< Intentionally not initialized for performance reasons
 								DVec3(const DVec3 &inRHS) = default;
 	JPH_INLINE explicit			DVec3(Vec3Arg inRHS);
-	JPH_INLINE					DVec3(__m256d inRHS) : mValue(inRHS)			{ CheckW(); }
+	JPH_INLINE					DVec3(Type inRHS) : mValue(inRHS)				{ CheckW(); }
 
 	/// Create a vector from 3 components
 	JPH_INLINE					DVec3(double inX, double inY, double inZ);
 
+	/// Load 3 doubles from memory
+	explicit JPH_INLINE			DVec3(const double *inV);
+
 	/// Vector with all zeros
 	static JPH_INLINE DVec3		sZero();
 
@@ -105,7 +99,11 @@ public:
 	JPH_INLINE bool				TestAllTrue() const;
 
 	/// Get individual components
+#ifdef JPH_USE_AVX
 	JPH_INLINE double			GetX() const									{ return _mm_cvtsd_f64(_mm256_castpd256_pd128(mValue)); }
+#else
+	JPH_INLINE double			GetX() const									{ return mD32[0]; }
+#endif // JPH_USE_AVX
 	JPH_INLINE double			GetY() const									{ return mD32[1]; }
 	JPH_INLINE double			GetZ() const									{ return mD32[2]; }
 	
@@ -206,10 +204,20 @@ public:
 		return inStream;
 	}
 
+	/// Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate div by 0
+	JPH_INLINE void				CheckW() const;
+	
+	/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
+	static JPH_INLINE Type		sFixW(Type inValue);
+
+	/// Representations of true and false for boolean operations (true = all 64 bits set, false = all bits clear)
+	inline static const double	cTrue = BitCast<double>(~uint64(0));
+	inline static const double	cFalse = 0.0; // Double literal to match the declared type
+
 private:
 	union
 	{
-		__m256d					mValue;
+		Type					mValue;
 		double					mD32[4];
 	};
 };
@@ -219,5 +227,3 @@ static_assert(is_trivial<DVec3>(), "Is supposed to be a trivial type!");
 JPH_NAMESPACE_END
 
 #include "DVec3.inl"
-
-#endif // JPH_USE_AVX2

+ 269 - 19
Jolt/Math/DVec3.inl

@@ -3,8 +3,6 @@
 
 #pragma once
 
-#ifdef JPH_USE_AVX2
-
 #include <Jolt/Core/HashCombine.h>
 
 // Create a std::hash for DVec3
@@ -12,45 +10,136 @@ JPH_MAKE_HASHABLE(JPH::DVec3, t.GetX(), t.GetY(), t.GetZ())
 
 JPH_NAMESPACE_BEGIN
 
-DVec3::DVec3(Vec3Arg inRHS) : 
-	mValue(_mm256_cvtps_pd(inRHS.mValue))
+DVec3::DVec3(Vec3Arg inRHS)
+{
+#if defined(JPH_USE_AVX)
+	mValue = _mm256_cvtps_pd(inRHS.mValue);
+#else
+	mD32[0] = (double)inRHS.GetX();
+	mD32[1] = (double)inRHS.GetY();
+	mD32[2] = (double)inRHS.GetZ();
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
+}
+
+DVec3::DVec3(double inX, double inY, double inZ)
 {
+#if defined(JPH_USE_AVX)
+	mValue = _mm256_set_pd(inZ, inZ, inY, inX); // Assure Z and W are the same
+#else
+	mD32[0] = inX;
+	mD32[1] = inY;
+	mD32[2] = inZ;
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
 }
 
-DVec3::DVec3(double inX, double inY, double inZ) : 
-	mValue(_mm256_set_pd(inZ, inZ, inY, inX)) // Assure Z and W are the same
+DVec3::DVec3(const double *inV) // Builds the vector from inV[0], inV[1] and inV[2]; no element beyond inV[2] is read
 {
+#if defined(JPH_USE_AVX)
+	Type x = _mm256_castpd128_pd256(_mm_load_sd(inV)); // Lowest element = inV[0]
+	Type y = _mm256_castpd128_pd256(_mm_load_sd(inV + 1)); // Lowest element = inV[1]
+	Type z = _mm256_broadcast_sd(inV + 2); // All 4 elements = inV[2]
+	Type xy = _mm256_unpacklo_pd(x, y); // Elements 0 and 1 = X and Y, the upper two elements are replaced by the blend below
+	mValue = _mm256_blend_pd(xy, z, 0b1100); // Mask 0b1100 takes elements 2 and 3 from z to assure Z and W are the same
+#else
+	mD32[0] = inV[0];
+	mD32[1] = inV[1];
+	mD32[2] = inV[2];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = inV[2]; // Replicate Z into W, this is only done when floating point exceptions are enabled
+	#endif
+#endif
+}
+
+void DVec3::CheckW() const // Asserts that W holds the same bits as Z; the body is empty unless JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is defined
+{
+#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+	// Compare the raw 64 bit patterns rather than the double values to avoid asserts when both components are NaN (NaN never compares equal to NaN)
+	JPH_ASSERT(reinterpret_cast<const uint64 *>(mD32)[2] == reinterpret_cast<const uint64 *>(mD32)[3]); 
+#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+} 
+	
+/// Internal helper function that returns inValue with the Z component replicated to the W component to prevent divisions by zero (returns inValue unchanged when floating point exceptions are disabled)
+DVec3::Type DVec3::sFixW(Type inValue)
+{
+#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+	#if defined(JPH_USE_AVX)
+		return _mm256_shuffle_pd(inValue, inValue, 2); // Immediate 0b0010 produces (X, Y, Z, Z)
+	#else
+		Type value; // Scalar path: copy X, Y, Z and write Z into the 4th slot
+		value.mData[0] = inValue.mData[0];
+		value.mData[1] = inValue.mData[1];
+		value.mData[2] = inValue.mData[2];
+		value.mData[3] = inValue.mData[2];
+		return value;
+	#endif
+#else
+	return inValue; // W is left as is in this configuration
+#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
 }
 
 DVec3 DVec3::sZero()
 {
-	return _mm256_setzero_pd();
+	#if defined(JPH_USE_AVX)
+		return _mm256_setzero_pd();
+	#else
+		return DVec3(0, 0, 0);
+	#endif
 }
 
 DVec3 DVec3::sReplicate(double inV)
 {
-	return _mm256_set1_pd(inV);
+	#if defined(JPH_USE_AVX)
+		return _mm256_set1_pd(inV);
+	#else
+		return DVec3(inV, inV, inV);
+	#endif
 }
 
 DVec3 DVec3::sLoadDouble3Unsafe(const double *inV)
 {
-	__m256d v = _mm256_loadu_pd(inV);
+	#if defined(JPH_USE_AVX)
+		Type v = _mm256_loadu_pd(inV);
+	#else
+		Type v = { inV[0], inV[1], inV[2] };
+	#endif
 	return sFixW(v);
 }
 
 Vec3 DVec3::ToVec3() const
 {
-	return _mm256_cvtpd_ps(mValue);
+	#if defined(JPH_USE_AVX)
+		return _mm256_cvtpd_ps(mValue);
+	#else
+		return Vec3((float)GetX(), (float)GetY(), (float)GetZ());
+	#endif
 }
 
 DVec3 DVec3::sMin(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_min_pd(inV1.mValue, inV2.mValue);
+#else
+	return DVec3(min(inV1.mD32[0], inV2.mD32[0]), 
+				 min(inV1.mD32[1], inV2.mD32[1]), 
+				 min(inV1.mD32[2], inV2.mD32[2]));
+#endif
 }
 
 DVec3 DVec3::sMax(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_max_pd(inV1.mValue, inV2.mValue);
+#else
+	return DVec3(max(inV1.mD32[0], inV2.mD32[0]), 
+				 max(inV1.mD32[1], inV2.mD32[1]), 
+				 max(inV1.mD32[2], inV2.mD32[2]));
+#endif
 }
 
 DVec3 DVec3::sClamp(DVec3Arg inV, DVec3Arg inMin, DVec3Arg inMax)
@@ -60,71 +149,137 @@ DVec3 DVec3::sClamp(DVec3Arg inV, DVec3Arg inMin, DVec3Arg inMax)
 
 DVec3 DVec3::sEquals(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_EQ_OQ);
+#else
+	return DVec3(inV1.mD32[0] == inV2.mD32[0]? cTrue : cFalse, 
+				 inV1.mD32[1] == inV2.mD32[1]? cTrue : cFalse, 
+				 inV1.mD32[2] == inV2.mD32[2]? cTrue : cFalse);
+#endif
 }
 
 DVec3 DVec3::sLess(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LT_OQ);
+#else
+	return DVec3(inV1.mD32[0] < inV2.mD32[0]? cTrue : cFalse, 
+				 inV1.mD32[1] < inV2.mD32[1]? cTrue : cFalse, 
+				 inV1.mD32[2] < inV2.mD32[2]? cTrue : cFalse);
+#endif
 }
 
 DVec3 DVec3::sLessOrEqual(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LE_OQ);
+#else
+	return DVec3(inV1.mD32[0] <= inV2.mD32[0]? cTrue : cFalse, 
+				 inV1.mD32[1] <= inV2.mD32[1]? cTrue : cFalse, 
+				 inV1.mD32[2] <= inV2.mD32[2]? cTrue : cFalse);
+#endif
 }
 
 DVec3 DVec3::sGreater(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GT_OQ);
+#else
+	return DVec3(inV1.mD32[0] > inV2.mD32[0]? cTrue : cFalse, 
+				 inV1.mD32[1] > inV2.mD32[1]? cTrue : cFalse, 
+				 inV1.mD32[2] > inV2.mD32[2]? cTrue : cFalse);
+#endif
 }
 
 DVec3 DVec3::sGreaterOrEqual(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GE_OQ);
+#else
+	return DVec3(inV1.mD32[0] >= inV2.mD32[0]? cTrue : cFalse, 
+				 inV1.mD32[1] >= inV2.mD32[1]? cTrue : cFalse, 
+				 inV1.mD32[2] >= inV2.mD32[2]? cTrue : cFalse);
+#endif
 }
 
 DVec3 DVec3::sFusedMultiplyAdd(DVec3Arg inMul1, DVec3Arg inMul2, DVec3Arg inAdd)
 {
-#ifdef JPH_USE_FMADD
-	return _mm256_fmadd_pd(inMul1.mValue, inMul2.mValue, inAdd.mValue);
+#if defined(JPH_USE_AVX)
+	#ifdef JPH_USE_FMADD
+		return _mm256_fmadd_pd(inMul1.mValue, inMul2.mValue, inAdd.mValue);
+	#else
+		return _mm256_add_pd(_mm256_mul_pd(inMul1.mValue, inMul2.mValue), inAdd.mValue);
+	#endif
 #else
-	return _mm256_add_pd(_mm256_mul_pd(inMul1.mValue, inMul2.mValue), inAdd.mValue);
+	return inMul1 * inMul2 + inAdd;
 #endif
 }
 
 DVec3 DVec3::sSelect(DVec3Arg inV1, DVec3Arg inV2, DVec3Arg inControl) // Per component: returns inV2 when the highest bit of inControl is set, inV1 otherwise
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_blendv_pd(inV1.mValue, inV2.mValue, inControl.mValue);
+#else
+	DVec3 result; // Scalar path, must make the same choice as _mm256_blendv_pd which only looks at bit 63 of each control component
+	for (int i = 0; i < 3; i++)
+		result.mD32[i] = (BitCast<uint64>(inControl.mD32[i]) >> 63) != 0? inV2.mD32[i] : inV1.mD32[i];
+#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+	result.mD32[3] = result.mD32[2]; // Keep W equal to Z so that later divisions do not trigger exceptions
+#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+	return result;
+#endif
 }
 
 DVec3 DVec3::sOr(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_or_pd(inV1.mValue, inV2.mValue);
+#else
+	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mD32[0]) | BitCast<uint64>(inV2.mD32[0])),
+				 BitCast<double>(BitCast<uint64>(inV1.mD32[1]) | BitCast<uint64>(inV2.mD32[1])),
+				 BitCast<double>(BitCast<uint64>(inV1.mD32[2]) | BitCast<uint64>(inV2.mD32[2])));
+#endif
 }
 
 DVec3 DVec3::sXor(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_xor_pd(inV1.mValue, inV2.mValue);
+#else
+	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mD32[0]) ^ BitCast<uint64>(inV2.mD32[0])),
+				 BitCast<double>(BitCast<uint64>(inV1.mD32[1]) ^ BitCast<uint64>(inV2.mD32[1])),
+				 BitCast<double>(BitCast<uint64>(inV1.mD32[2]) ^ BitCast<uint64>(inV2.mD32[2])));
+#endif
 }
 
 DVec3 DVec3::sAnd(DVec3Arg inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_and_pd(inV1.mValue, inV2.mValue);
+#else
+	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mD32[0]) & BitCast<uint64>(inV2.mD32[0])),
+				 BitCast<double>(BitCast<uint64>(inV1.mD32[1]) & BitCast<uint64>(inV2.mD32[1])),
+				 BitCast<double>(BitCast<uint64>(inV1.mD32[2]) & BitCast<uint64>(inV2.mD32[2])));
+#endif
 }
 
 int DVec3::GetTrues() const // Returns a 3 bit mask: bit 0 = sign bit of X, bit 1 = sign bit of Y, bit 2 = sign bit of Z
 {
-	return _mm256_movemask_pd(mValue);
+#if defined(JPH_USE_AVX)
+	return _mm256_movemask_pd(mValue) & 0x7; // Mask off bit 3 so that the W component never contributes
+#else
+	return int((BitCast<uint64>(mD32[0]) >> 63) | ((BitCast<uint64>(mD32[1]) >> 63) << 1) | ((BitCast<uint64>(mD32[2]) >> 63) << 2)); // Shift bit 63 of each component down and pack X, Y, Z into bits 0, 1, 2
+#endif
 }
 
 bool DVec3::TestAnyTrue() const
 {
-	return (_mm256_movemask_pd(mValue) & 0x7) != 0;
+	return GetTrues() != 0;
 }
 
 bool DVec3::TestAllTrue() const
 {
-	return (_mm256_movemask_pd(mValue) & 0x7) == 0x7;
+	return GetTrues() == 0x7;
 }
 
 bool DVec3::operator == (DVec3Arg inV2) const 
@@ -144,81 +299,155 @@ bool DVec3::IsNearZero(double inMaxDistSq) const
 
 DVec3 DVec3::operator * (DVec3Arg inV2) const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_mul_pd(mValue, inV2.mValue);
+#else
+	return DVec3(mD32[0] * inV2.mD32[0], mD32[1] * inV2.mD32[1], mD32[2] * inV2.mD32[2]);
+#endif
 }
 
 DVec3 DVec3::operator * (double inV2) const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_mul_pd(mValue, _mm256_set1_pd(inV2));
+#else
+	return DVec3(mD32[0] * inV2, mD32[1] * inV2, mD32[2] * inV2);
+#endif
 }
 
 DVec3 operator * (double inV1, DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_mul_pd(_mm256_set1_pd(inV1), inV2.mValue);
+#else
+	return DVec3(inV1 * inV2.mD32[0], inV1 * inV2.mD32[1], inV1 * inV2.mD32[2]);
+#endif
 }
 
 DVec3 DVec3::operator / (double inV2) const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_div_pd(mValue, _mm256_set1_pd(inV2));
+#else
+	return DVec3(mD32[0] / inV2, mD32[1] / inV2, mD32[2] / inV2);
+#endif
 }
 
 DVec3 &DVec3::operator *= (double inV2)
 {
+#if defined(JPH_USE_AVX)
 	mValue = _mm256_mul_pd(mValue, _mm256_set1_pd(inV2));
+#else
+	for (int i = 0; i < 3; ++i)
+		mD32[i] *= inV2;
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
 	return *this;
 }
 
 DVec3 &DVec3::operator *= (DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	mValue = _mm256_mul_pd(mValue, inV2.mValue);
+#else
+	for (int i = 0; i < 3; ++i)
+		mD32[i] *= inV2.mD32[i];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
 	return *this;
 }
 
 DVec3 &DVec3::operator /= (double inV2)
 {
+#if defined(JPH_USE_AVX)
 	mValue = _mm256_div_pd(mValue, _mm256_set1_pd(inV2));
+#else
+	for (int i = 0; i < 3; ++i)
+		mD32[i] /= inV2;
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
 	return *this;
 }
 
 DVec3 DVec3::operator + (DVec3Arg inV2) const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_add_pd(mValue, inV2.mValue);
+#else
+	return DVec3(mD32[0] + inV2.mD32[0], mD32[1] + inV2.mD32[1], mD32[2] + inV2.mD32[2]);
+#endif
 }
 
 DVec3 &DVec3::operator += (DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	mValue = _mm256_add_pd(mValue, inV2.mValue);
+#else
+	for (int i = 0; i < 3; ++i)
+		mD32[i] += inV2.mD32[i];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
 	return *this;
 }
 
 DVec3 DVec3::operator - () const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_sub_pd(_mm256_setzero_pd(), mValue);
+#else
+	return DVec3(-mD32[0], -mD32[1], -mD32[2]);
+#endif
 }
 
 DVec3 DVec3::operator - (DVec3Arg inV2) const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_sub_pd(mValue, inV2.mValue);
+#else
+	return DVec3(mD32[0] - inV2.mD32[0], mD32[1] - inV2.mD32[1], mD32[2] - inV2.mD32[2]);
+#endif
 }
 
 DVec3 &DVec3::operator -= (DVec3Arg inV2)
 {
+#if defined(JPH_USE_AVX)
 	mValue = _mm256_sub_pd(mValue, inV2.mValue);
+#else
+	for (int i = 0; i < 3; ++i)
+		mD32[i] -= inV2.mD32[i];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mD32[3] = mD32[2];
+	#endif
+#endif
 	return *this;
 }
 
 DVec3 DVec3::operator / (DVec3Arg inV2) const
 {
 	inV2.CheckW();
+#if defined(JPH_USE_AVX)
 	return _mm256_div_pd(mValue, inV2.mValue);
+#else
+	return DVec3(mD32[0] / inV2.mD32[0], mD32[1] / inV2.mD32[1], mD32[2] / inV2.mD32[2]);
+#endif
 }
 
 DVec3 DVec3::Abs() const
 {
 #if defined(JPH_USE_AVX512)
 	return _mm256_range_pd(mValue, mValue, 0b1000);
-#else
+#elif defined(JPH_USE_AVX)
 	return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), mValue), mValue);
+#else
+	return DVec3(abs(mD32[0]), abs(mD32[1]), abs(mD32[2]));
 #endif
 }
 
@@ -229,16 +458,23 @@ DVec3 DVec3::Reciprocal() const
 
 DVec3 DVec3::Cross(DVec3Arg inV2) const
 {
+#if defined(JPH_USE_AVX2)
 	__m256d t1 = _mm256_permute4x64_pd(inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
     t1 = _mm256_mul_pd(t1, mValue);
     __m256d t2 = _mm256_permute4x64_pd(mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
     t2 = _mm256_mul_pd(t2, inV2.mValue);
     __m256d t3 = _mm256_sub_pd(t1, t2);
     return _mm256_permute4x64_pd(t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
+#else
+	return DVec3(mD32[1] * inV2.mD32[2] - mD32[2] * inV2.mD32[1],
+				 mD32[2] * inV2.mD32[0] - mD32[0] * inV2.mD32[2],
+				 mD32[0] * inV2.mD32[1] - mD32[1] * inV2.mD32[0]);
+#endif
 }
 
 double DVec3::Dot(DVec3Arg inV2) const
 {
+#if defined(JPH_USE_AVX)
 	__m256d mul = _mm256_mul_pd(mValue, inV2.mValue);
     __m128d xy = _mm256_castpd256_pd128(mul);
 	__m128d yx = _mm_shuffle_pd(xy, xy, 1);
@@ -246,6 +482,12 @@ double DVec3::Dot(DVec3Arg inV2) const
     __m128d zw = _mm256_extractf128_pd(mul, 1);
 	sum = _mm_add_pd(sum, zw);
 	return _mm_cvtsd_f64(sum);
+#else
+	double dot = 0.0;
+	for (int i = 0; i < 3; i++)
+		dot += mD32[i] * inV2.mD32[i];
+	return dot;
+#endif
 }
 
 double DVec3::LengthSq() const
@@ -255,7 +497,11 @@ double DVec3::LengthSq() const
 
 DVec3 DVec3::Sqrt() const
 {
+#if defined(JPH_USE_AVX)
 	return _mm256_sqrt_pd(mValue);
+#else
+	return DVec3(sqrt(mD32[0]), sqrt(mD32[1]), sqrt(mD32[2]));
+#endif
 }
 
 double DVec3::Length() const
@@ -275,11 +521,15 @@ bool DVec3::IsNormalized(double inTolerance) const
 
 DVec3 DVec3::GetSign() const
 {
+#if defined(JPH_USE_AVX)
 	__m256d minus_one = _mm256_set1_pd(-1.0);
 	__m256d one = _mm256_set1_pd(1.0);
 	return _mm256_or_pd(_mm256_and_pd(mValue, minus_one), one);
+#else
+	return DVec3(std::signbit(mD32[0])? -1.0 : 1.0, 
+				 std::signbit(mD32[1])? -1.0 : 1.0, 
+				 std::signbit(mD32[2])? -1.0 : 1.0);
+#endif
 }
 
 JPH_NAMESPACE_END
-
-#endif // JPH_USE_AVX2

+ 1 - 9
Jolt/Math/HalfFloat.h

@@ -52,15 +52,7 @@ template <int RoundingMode>
 inline HalfFloat FromFloatFallback(float inV)
 {
 	// Reinterpret the float as an uint32
-	static_assert(sizeof(float) == sizeof(uint32));
-	union FloatToInt
-	{
-		float	f;
-		uint32	i;
-	};
-	FloatToInt f_to_i;
-	f_to_i.f = inV;
-	uint32 value = f_to_i.i;
+	uint32 value = BitCast<uint32>(inV);
 
 	// Extract exponent
 	uint32 exponent = (value >> FLOAT_EXPONENT_POS) & FLOAT_EXPONENT_MASK;

+ 18 - 0
Jolt/Math/Math.h

@@ -177,4 +177,22 @@ inline uint32 GetNextPowerOf2(uint32 inValue)
 	return inValue <= 1? uint32(1) : uint32(1) << (32 - CountLeadingZeros(inValue - 1));
 }
 
+// Simple implementation of C++20 std::bit_cast (unfortunately not constexpr): reinterprets the bytes of inValue as a value of type To, both types must have the same size
+template <class To, class From>
+JPH_INLINE To BitCast(const From &inValue)
+{
+	static_assert(std::is_trivially_constructible_v<To>); // To must be constructible without running user code
+	static_assert(sizeof(From) == sizeof(To)); // No bytes may be dropped or invented
+
+	union FromTo // Both members share the same storage
+	{
+		To			mTo;
+		From		mFrom;
+	};
+
+	FromTo convert;
+	convert.mFrom = inValue; // Write through the source type...
+	return convert.mTo; // ...and read back through the destination type. NOTE(review): ISO C++ does not define reading the member that was not written last, confirm that all target compilers support this
+}
+
 JPH_NAMESPACE_END

+ 181 - 4
UnitTests/Math/DVec3Tests.cpp

@@ -4,10 +4,178 @@
 #include "UnitTestFramework.h"
 #include <Jolt/Math/DVec3.h>
 
-#ifdef JPH_USE_AVX2
-
 TEST_SUITE("DVec3Tests")
 {
+	TEST_CASE("TestDVec3Zero")
+	{
+		DVec3 v = DVec3::sZero();
+
+		CHECK(v.GetX() == 0);
+		CHECK(v.GetY() == 0);
+		CHECK(v.GetZ() == 0);
+	}
+
+	TEST_CASE("TestDVec3ConstructComponents")
+	{
+		DVec3 v(1, 2, 3);
+
+		// Test component access
+		CHECK(v.GetX() == 1);
+		CHECK(v.GetY() == 2);
+		CHECK(v.GetZ() == 3);
+
+		// Test component access by [] operators
+		CHECK(v[0] == 1);
+		CHECK(v[1] == 2);
+		CHECK(v[2] == 3);
+
+		// Test == and != operators
+		CHECK(v == DVec3(1, 2, 3));
+		CHECK(v != DVec3(1, 2, 4));
+
+		// Set the components
+		v.SetComponent(0, 4);
+		v.SetComponent(1, 5);
+		v.SetComponent(2, 6);
+		CHECK(v == DVec3(4, 5, 6));
+	}
+
+	TEST_CASE("TestDVec3Replicate")
+	{
+		CHECK(DVec3::sReplicate(2) == DVec3(2, 2, 2));
+	}
+
+	TEST_CASE("TestDVec3ToVec3")
+	{
+		CHECK(DVec3(1, 3, 5).ToVec3() == Vec3(1, 3, 5));
+	}
+
+	TEST_CASE("TestDVec3MinMax") // Component wise minimum and maximum of two DVec3s
+	{
+		DVec3 v1(1, 5, 3);
+		DVec3 v2(4, 2, 6);
+
+		CHECK(DVec3::sMin(v1, v2) == DVec3(1, 2, 3)); // Smallest value per component
+		CHECK(DVec3::sMax(v1, v2) == DVec3(4, 5, 6)); // Largest value per component
+	}
+
+	TEST_CASE("TestDVec3Clamp")
+	{
+		DVec3 v1(1, 2, 3);
+		DVec3 v2(4, 5, 6);
+		DVec3 v(-1, 3, 7);
+
+		CHECK(DVec3::sClamp(v, v1, v2) == DVec3(1, 3, 6));
+	}
+
+	TEST_CASE("TestDVec3Trues")
+	{
+		CHECK(DVec3(DVec3::cFalse, DVec3::cFalse, DVec3::cFalse).GetTrues() == 0b0000);
+		CHECK(DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cFalse).GetTrues() == 0b0001);
+		CHECK(DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cFalse).GetTrues() == 0b0010);
+		CHECK(DVec3(DVec3::cTrue, DVec3::cTrue, DVec3::cFalse).GetTrues() == 0b0011);
+		CHECK(DVec3(DVec3::cFalse, DVec3::cFalse, DVec3::cTrue).GetTrues() == 0b0100);
+		CHECK(DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cTrue).GetTrues() == 0b0101);
+		CHECK(DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cTrue).GetTrues() == 0b0110);
+		CHECK(DVec3(DVec3::cTrue, DVec3::cTrue, DVec3::cTrue).GetTrues() == 0b0111);
+
+		CHECK(!DVec3(DVec3::cFalse, DVec3::cFalse, DVec3::cFalse).TestAnyTrue());
+		CHECK(DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cFalse).TestAnyTrue());
+		CHECK(DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cFalse).TestAnyTrue());
+		CHECK(DVec3(DVec3::cTrue, DVec3::cTrue, DVec3::cFalse).TestAnyTrue());
+		CHECK(DVec3(DVec3::cFalse, DVec3::cFalse, DVec3::cTrue).TestAnyTrue());
+		CHECK(DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cTrue).TestAnyTrue());
+		CHECK(DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cTrue).TestAnyTrue());
+		CHECK(DVec3(DVec3::cTrue, DVec3::cTrue, DVec3::cTrue).TestAnyTrue());
+
+		CHECK(!DVec3(DVec3::cFalse, DVec3::cFalse, DVec3::cFalse).TestAllTrue());
+		CHECK(!DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cFalse).TestAllTrue());
+		CHECK(!DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cFalse).TestAllTrue());
+		CHECK(!DVec3(DVec3::cTrue, DVec3::cTrue, DVec3::cFalse).TestAllTrue());
+		CHECK(!DVec3(DVec3::cFalse, DVec3::cFalse, DVec3::cTrue).TestAllTrue());
+		CHECK(!DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cTrue).TestAllTrue());
+		CHECK(!DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cTrue).TestAllTrue());
+		CHECK(DVec3(DVec3::cTrue, DVec3::cTrue, DVec3::cTrue).TestAllTrue());
+	}
+
+	TEST_CASE("TestDVec3Comparisons")
+	{
+		CHECK(DVec3::sEquals(DVec3(1, 2, 3), DVec3(1, 4, 3)).GetTrues() == 0b101); // Can't directly check if equal to (true, false, true) because true = -NaN and -NaN != -NaN
+		CHECK(DVec3::sLess(DVec3(1, 2, 4), DVec3(1, 4, 3)).GetTrues() == 0b010);
+		CHECK(DVec3::sLessOrEqual(DVec3(1, 2, 4), DVec3(1, 4, 3)).GetTrues() == 0b011);
+		CHECK(DVec3::sGreater(DVec3(1, 2, 4), DVec3(1, 4, 3)).GetTrues() == 0b100);
+		CHECK(DVec3::sGreaterOrEqual(DVec3(1, 2, 4), DVec3(1, 4, 3)).GetTrues() == 0b101);
+	}
+
+	TEST_CASE("TestDVec3FMA")
+	{
+		CHECK(DVec3::sFusedMultiplyAdd(DVec3(1, 2, 3), DVec3(4, 5, 6), DVec3(7, 8, 9)) == DVec3(1 * 4 + 7, 2 * 5 + 8, 3 * 6 + 9));
+	}
+
+	TEST_CASE("TestDVec3Select")
+	{
+		CHECK(DVec3::sSelect(DVec3(1, 2, 3), DVec3(4, 5, 6), DVec3(DVec3::cTrue, DVec3::cFalse, DVec3::cTrue)) == DVec3(4, 2, 6));
+		CHECK(DVec3::sSelect(DVec3(1, 2, 3), DVec3(4, 5, 6), DVec3(DVec3::cFalse, DVec3::cTrue, DVec3::cFalse)) == DVec3(1, 5, 3));
+	}
+
+	TEST_CASE("TestDVec3BitOps")
+	{
+		// Test all bit permutations
+		DVec3 v1(BitCast<double, uint64>(0b0011), BitCast<double, uint64>(0b00110), BitCast<double, uint64>(0b001100));
+		DVec3 v2(BitCast<double, uint64>(0b0101), BitCast<double, uint64>(0b01010), BitCast<double, uint64>(0b010100));
+
+		CHECK(DVec3::sOr(v1, v2) == DVec3(BitCast<double, uint64>(0b0111), BitCast<double, uint64>(0b01110), BitCast<double, uint64>(0b011100)));
+		CHECK(DVec3::sXor(v1, v2) == DVec3(BitCast<double, uint64>(0b0110), BitCast<double, uint64>(0b01100), BitCast<double, uint64>(0b011000)));
+		CHECK(DVec3::sAnd(v1, v2) == DVec3(BitCast<double, uint64>(0b0001), BitCast<double, uint64>(0b00010), BitCast<double, uint64>(0b000100)));
+	}
+
+	TEST_CASE("TestDVec3Close")
+	{
+		CHECK(DVec3(1, 2, 3).IsClose(DVec3(1.001, 2.001, 3.001), 1.0e-4));
+		CHECK(!DVec3(1, 2, 3).IsClose(DVec3(1.001, 2.001, 3.001), 1.0e-6));
+
+		CHECK(DVec3(1.001, 0, 0).IsNormalized(1.0e-2));
+		CHECK(!DVec3(0, 1.001, 0).IsNormalized(1.0e-4));
+
+		CHECK(DVec3(-1.0e-7, 1.0e-7, 1.0e-8).IsNearZero(1.0e-12));
+		CHECK(!DVec3(-1.0e-7, 1.0e-7, -1.0e-5).IsNearZero(1.0e-12));
+	}
+
+	TEST_CASE("TestDVec3Operators")
+	{
+		CHECK(-DVec3(1, 2, 3) == DVec3(-1, -2, -3));
+
+		CHECK(DVec3(1, 2, 3) + DVec3(4, 5, 6) == DVec3(5, 7, 9));
+		CHECK(DVec3(1, 2, 3) - DVec3(6, 5, 4) == DVec3(-5, -3, -1));
+
+		CHECK(DVec3(1, 2, 3) * DVec3(4, 5, 6) == DVec3(4, 10, 18));
+		CHECK(DVec3(1, 2, 3) * 2 == DVec3(2, 4, 6));
+		CHECK(4 * DVec3(1, 2, 3) == DVec3(4, 8, 12));
+
+		CHECK(DVec3(1, 2, 3) / 2 == DVec3(0.5, 1.0, 1.5));
+		CHECK(DVec3(1, 2, 3) / DVec3(2, 8, 24) == DVec3(0.5, 0.25, 0.125));
+
+		DVec3 v = DVec3(1, 2, 3);
+		v *= DVec3(4, 5, 6);
+		CHECK(v == DVec3(4, 10, 18));
+		v *= 2;
+		CHECK(v == DVec3(8, 20, 36));
+		v /= 2;
+		CHECK(v == DVec3(4, 10, 18));
+		v += DVec3(1, 2, 3);
+		CHECK(v == DVec3(5, 12, 21));
+		v -= DVec3(1, 2, 3);
+		CHECK(v == DVec3(4, 10, 18));
+
+		CHECK(DVec3(2, 4, 8).Reciprocal() == DVec3(0.5, 0.25, 0.125));
+	}
+
+	TEST_CASE("TestDVec3Abs")
+	{
+		CHECK(DVec3(1, -2, 3).Abs() == DVec3(1, 2, 3));
+		CHECK(DVec3(-1, 2, -3).Abs() == DVec3(1, 2, 3));
+	}
+
 	TEST_CASE("TestDVec3Dot")
 	{
 		CHECK(DVec3(2, 3, 4).Dot(DVec3(5, 6, 7)) == double(2 * 5 + 3 * 6 + 4 * 7));
@@ -48,6 +216,15 @@ TEST_SUITE("DVec3Tests")
 		CHECK(DVec3(0, 0, 1).Cross(DVec3(1, 0, 0)) == DVec3(0, 1, 0));
 		CHECK(DVec3(1, 0, 0).Cross(DVec3(0, 0, 1)) == DVec3(0, -1, 0));
 	}
-}
 
-#endif // JPH_USE_AVX2
+	TEST_CASE("TestDVec3Normalize")
+	{
+		CHECK(DVec3(3, 2, 1).Normalized() == DVec3(3, 2, 1) / sqrt(9.0 + 4.0 + 1.0));
+	}
+
+	TEST_CASE("TestDVec3Sign")
+	{
+		CHECK(DVec3(1.2345, -6.7891, 0).GetSign() == DVec3(1, -1, 1));
+		CHECK(DVec3(0, 2.3456, -7.8912).GetSign() == DVec3(1, 1, -1));
+	}
+}

+ 5 - 19
UnitTests/Math/HalfFloatTests.cpp

@@ -7,20 +7,6 @@
 
 TEST_SUITE("HalfFloatTests")
 {
-	// Helper function to construct a float with a specific bit pattern
-	static inline float ReinterpretAsFloat(uint32 inValue)
-	{
-		static_assert(sizeof(float) == sizeof(uint32));
-		union IntToFloat
-		{
-			uint32	i;
-			float	f;
-		};
-		IntToFloat i_to_f;
-		i_to_f.i = inValue;
-		return i_to_f.f;
-	}
-
 #if defined(JPH_USE_F16C) || defined(JPH_USE_NEON)
 	TEST_CASE("TestHalfFloatToFloat")
 	{
@@ -44,7 +30,7 @@ TEST_SUITE("HalfFloatTests")
 	// Helper function to compare the intrinsics version with the fallback version
 	static inline void CheckFloatToHalfFloat(uint32 inValue, uint32 inSign)
 	{
-		const float fvalue = ReinterpretAsFloat(inValue + inSign * 0x80000000U);
+		const float fvalue = BitCast<float>(inValue + inSign * 0x80000000U);
 
 		HalfFloat hf1 = HalfFloatConversion::FromFloat<HalfFloatConversion::ROUND_TO_NEAREST>(fvalue);
 		HalfFloat hf2 = HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(fvalue);
@@ -89,8 +75,8 @@ TEST_SUITE("HalfFloatTests")
 	TEST_CASE("TestHalfFloatINF")
 	{
 		// Float -> half float
-		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(ReinterpretAsFloat(0x7f800000U)) == HALF_FLT_INF);
-		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(ReinterpretAsFloat(0xff800000U)) == HALF_FLT_INF_NEGATIVE);
+		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(BitCast<float>(0x7f800000U)) == HALF_FLT_INF);
+		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(BitCast<float>(0xff800000U)) == HALF_FLT_INF_NEGATIVE);
 
 		// Half float -> float
 		UVec4 half_float(uint32(HALF_FLT_INF) | (uint32(HALF_FLT_INF_NEGATIVE) << 16), 0, 0, 0);
@@ -101,8 +87,8 @@ TEST_SUITE("HalfFloatTests")
 	TEST_CASE("TestHalfFloatNaN")
 	{
 		// Float -> half float
-		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(ReinterpretAsFloat(0x7fc00000U)) == HALF_FLT_NANQ);
-		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(ReinterpretAsFloat(0xffc00000U)) == HALF_FLT_NANQ_NEGATIVE);
+		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(BitCast<float>(0x7fc00000U)) == HALF_FLT_NANQ);
+		CHECK(HalfFloatConversion::FromFloatFallback<HalfFloatConversion::ROUND_TO_NEAREST>(BitCast<float>(0xffc00000U)) == HALF_FLT_NANQ_NEGATIVE);
 
 		// Half float -> float
 		UVec4 half_float(uint32(HALF_FLT_NANQ) | (uint32(HALF_FLT_NANQ_NEGATIVE) << 16), 0, 0, 0);

+ 2 - 0
UnitTests/Math/Vec3Tests.cpp

@@ -173,6 +173,8 @@ TEST_SUITE("Vec3Tests")
 		CHECK(v == Vec3(4, 10, 18));
 		v += Vec3(1, 2, 3);
 		CHECK(v == Vec3(5, 12, 21));
+		v -= Vec3(1, 2, 3);
+		CHECK(v == Vec3(4, 10, 18));
 
 		CHECK(Vec3(2, 4, 8).Reciprocal() == Vec3(0.5f, 0.25f, 0.125f));
 	}

+ 0 - 4
UnitTests/UnitTestFramework.h

@@ -42,15 +42,11 @@ inline void CHECK_APPROX_EQUAL(QuatArg inLHS, QuatArg inRHS, float inTolerance =
 	CHECK(close);
 }
 
-#ifdef JPH_USE_AVX2
-
 inline void CHECK_APPROX_EQUAL(DVec3Arg inLHS, DVec3Arg inRHS, double inTolerance = 1.0e-6)
 {
 	CHECK(inLHS.IsClose(inRHS, inTolerance * inTolerance));
 }
 
-#endif // JPH_USE_AVX2
-
 inline void CHECK_APPROX_EQUAL(const Float2 &inLHS, const Float2 &inRHS, float inTolerance = 1.0e-6f)
 {
 	Float2 diff(inLHS.x - inRHS.x, inLHS.y - inRHS.y);