Jelajahi Sumber

Implemented WASM target using Emscripten (#222)

* Implemented fallback math functions without using SIMD
Jorrit Rouwe 2 tahun lalu
induk
melakukan
99f086f8b5

+ 1 - 1
Build/CMakeLists.txt

@@ -127,7 +127,7 @@ if (("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUA
 		set(CMAKE_EXE_LINKER_FLAGS_RELEASEUBSAN "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LIBPATH:${CLANG_LIB_PATH}")
 		set(CMAKE_EXE_LINKER_FLAGS_RELEASECOVERAGE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LIBPATH:${CLANG_LIB_PATH}")
 	endif()
-elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin" OR "${CMAKE_SYSTEM_NAME}" STREQUAL "iOS" OR MINGW)
+elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin" OR "${CMAKE_SYSTEM_NAME}" STREQUAL "iOS" OR MINGW OR EMSCRIPTEN)
 	# Set general compiler flags
 	set(CMAKE_CXX_FLAGS "-g -std=c++17 -I. -Wall -Werror")
 

+ 9 - 0
Jolt/Core/Core.h

@@ -23,6 +23,8 @@
     #else
         #define JPH_PLATFORM_IOS
     #endif
+#elif defined(__EMSCRIPTEN__)
+	#define JPH_PLATFORM_WASM
 #endif
 
 // Platform helper macros
@@ -98,6 +100,11 @@
 	#define JPH_CPU_ARM64
 	#define JPH_USE_NEON
 	#define JPH_CPU_ADDRESS_BITS 64
+#elif defined(JPH_PLATFORM_WASM)
+	// WebAssembly CPU architecture
+	#define JPH_CPU_WASM
+	#define JPH_CPU_ADDRESS_BITS 32
+	#define JPH_DISABLE_CUSTOM_ALLOCATOR
 #else
 	#error Unsupported CPU architecture
 #endif
@@ -197,6 +204,8 @@
 	#elif defined(JPH_CPU_ARM64)
 		#define JPH_BREAKPOINT		__builtin_trap()
 	#endif
+#elif defined(JPH_PLATFORM_WASM)
+	#define JPH_BREAKPOINT		do { } while (false) // Not supported
 #else
 	#error Unknown platform
 #endif

+ 4 - 0
Jolt/Core/FPControlWord.h

@@ -61,6 +61,10 @@ private:
 	uint64		mPrevState;
 };
 
+#elif defined(JPH_CPU_WASM)
+
+// Not supported
+
 #else
 
 #error Unsupported CPU architecture

+ 7 - 0
Jolt/Core/FPException.h

@@ -37,6 +37,13 @@ class FPExceptionDisableInvalid : public FPControlWord<0, FP_IOE> { };
 /// Disable division by zero floating point exceptions
 class FPExceptionDisableDivByZero : public FPControlWord<0, FP_DZE> { };
 
+#elif defined(JPH_CPU_WASM)
+
+// Not supported
+class FPExceptionsEnable { };
+class FPExceptionDisableInvalid { };
+class FPExceptionDisableDivByZero { };
+
 #else
 
 #error Unsupported CPU architecture

+ 5 - 0
Jolt/Core/FPFlushDenormals.h

@@ -22,6 +22,11 @@ static constexpr uint64 FP_FZ = 1 << 24;
 /// This can make floating point operations much faster when working with very small numbers
 class FPFlushDenormals : public FPControlWord<FP_FZ, FP_FZ> { };
 
+#elif defined(JPH_CPU_WASM)
+
+// Not supported
+class FPFlushDenormals { };
+
 #else
 
 #error Unsupported CPU architecture

+ 1 - 1
Jolt/Core/Profiler.inl

@@ -78,7 +78,7 @@ ProfileMeasurement::~ProfileMeasurement()
 		val = vld1q_s32(src + 4);
 		vst1q_s32(dst + 4, val);
 	#else
-		#error Unsupported CPU architecture
+		memcpy(mSample, &mTemp, sizeof(ProfileSample));
 	#endif
 		mSample = nullptr;
 	}

+ 2 - 0
Jolt/Core/TickCounter.cpp

@@ -104,6 +104,8 @@ static const uint64 sProcessorTicksPerSecond = []() {
     size_t len = sizeof(freq);
     sysctl(mib, 2, &freq, &len, nullptr, 0);
 	return freq;
+#elif defined(JPH_PLATFORM_WASM)
+	return 1; // Not supported
 #else
 	#error Undefined
 #endif

+ 2 - 0
Jolt/Core/TickCounter.h

@@ -30,6 +30,8 @@ JPH_INLINE uint64 GetProcessorTickCount()
 	uint64 val;
     asm volatile("mrs %0, cntvct_el0" : "=r" (val));
 	return val;
+#elif defined(JPH_CPU_WASM)
+	return 0; // Not supported
 #else
 	#error Undefined
 #endif

+ 1 - 0
Jolt/Math/EigenValueSymmetric.h

@@ -29,6 +29,7 @@ bool EigenValueSymmetric(const Matrix &inMatrix, Matrix &outEigVec, Vector &outE
 {
 	// This algorithm works with very small numbers and can trigger invalid float exceptions when not flushing denormals
 	FPFlushDenormals flush_denormals;
+	(void)flush_denormals;
 
 	// Maximum number of sweeps to make
 	const int cMaxSweeps = 50;

+ 95 - 10
Jolt/Math/Mat44.inl

@@ -9,6 +9,8 @@
 
 JPH_NAMESPACE_BEGIN
 
+#define JPH_EL(r, c) mCol[c].mF32[r]
+
 Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) : 
 	mCol { inC1, inC2, inC3, inC4 } 
 { 
@@ -245,7 +247,8 @@ Mat44 Mat44::operator * (Mat44Arg inM) const
 		result.mCol[i].mValue = t;
 	}
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2] + mCol[3] * inM.mCol[i].mF32[3];
 #endif
 	return result;
 }
@@ -265,7 +268,10 @@ Vec3 Mat44::operator * (Vec3Arg inV) const
 	t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, causes precision issues
 	return Vec3::sFixW(t);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(
+		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0], 
+		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1], 
+		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2]);
 #endif
 }
 
@@ -284,7 +290,11 @@ Vec4 Mat44::operator * (Vec4Arg inV) const
 	t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
 	return t;
 #else
-	#error Unsupported CPU architecture
+	return Vec4(
+		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0] * inV.mF32[3], 
+		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1] * inV.mF32[3], 
+		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2] * inV.mF32[3], 
+		mCol[0].mF32[3] * inV.mF32[0] + mCol[1].mF32[3] * inV.mF32[1] + mCol[2].mF32[3] * inV.mF32[2] + mCol[3].mF32[3] * inV.mF32[3]);
 #endif
 }
 
@@ -301,7 +311,10 @@ Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
 	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
 	return Vec3::sFixW(t);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(
+		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2], 
+		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2], 
+		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]);
 #endif
 }
 
@@ -345,7 +358,8 @@ Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
 		result.mCol[i].mValue = t;
 	}
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2];
 #endif
 	result.mCol[3] = Vec4(0, 0, 0, 1);
 	return result;
@@ -462,7 +476,11 @@ Mat44 Mat44::Transposed() const
 	result.mCol[3].mValue = tmp4.val[1];
 	return result;
 #else
-	#error Unsupported CPU architecture
+	Mat44 result;
+	for (int c = 0; c < 4; ++c)
+		for (int r = 0; r < 4; ++r)
+			result.mCol[r].mF32[c] = mCol[c].mF32[r];
+	return result;
 #endif
 }
 
@@ -490,7 +508,13 @@ Mat44 Mat44::Transposed3x3() const
 	result.mCol[1].mValue = tmp3.val[1];
 	result.mCol[2].mValue = tmp4.val[0];
 #else
-	#error Unsupported CPU architecture
+	Mat44 result;
+	for (int c = 0; c < 3; ++c)
+	{
+		for (int r = 0; r < 3; ++r)
+			result.mCol[c].mF32[r] = mCol[r].mF32[c];
+		result.mCol[c].mF32[3] = 0;
+	}
 #endif
 	result.mCol[3] = Vec4(0, 0, 0, 1);
 	return result;
@@ -651,7 +675,38 @@ Mat44 Mat44::Inversed() const
 	result.mCol[3].mValue = vmulq_f32(det, minor3);
 	return result;
 #else
-	#error Undefined CPU architecture
+	float m00 = JPH_EL(0, 0), m10 = JPH_EL(1, 0), m20 = JPH_EL(2, 0), m30 = JPH_EL(3, 0);
+	float m01 = JPH_EL(0, 1), m11 = JPH_EL(1, 1), m21 = JPH_EL(2, 1), m31 = JPH_EL(3, 1);
+	float m02 = JPH_EL(0, 2), m12 = JPH_EL(1, 2), m22 = JPH_EL(2, 2), m32 = JPH_EL(3, 2);
+	float m03 = JPH_EL(0, 3), m13 = JPH_EL(1, 3), m23 = JPH_EL(2, 3), m33 = JPH_EL(3, 3);
+	
+	float m10211120 = m10 * m21 - m11 * m20;
+	float m10221220 = m10 * m22 - m12 * m20;
+	float m10231320 = m10 * m23 - m13 * m20;
+	float m10311130 = m10 * m31 - m11 * m30;
+	float m10321230 = m10 * m32 - m12 * m30;
+	float m10331330 = m10 * m33 - m13 * m30;
+	float m11221221 = m11 * m22 - m12 * m21;
+	float m11231321 = m11 * m23 - m13 * m21;
+	float m11321231 = m11 * m32 - m12 * m31;
+	float m11331331 = m11 * m33 - m13 * m31;
+	float m12231322 = m12 * m23 - m13 * m22;
+	float m12331332 = m12 * m33 - m13 * m32;
+	float m20312130 = m20 * m31 - m21 * m30;
+	float m20322230 = m20 * m32 - m22 * m30;
+	float m20332330 = m20 * m33 - m23 * m30;
+	float m21322231 = m21 * m32 - m22 * m31;
+	float m21332331 = m21 * m33 - m23 * m31;
+	float m22332332 = m22 * m33 - m23 * m32;
+
+	Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231,		-m10 * m22332332 + m12 * m20332330 - m13 * m20322230,		m10 * m21332331 - m11 * m20332330 + m13 * m20312130,		-m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
+	Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231,		m00 * m22332332 - m02 * m20332330 + m03 * m20322230,		-m00 * m21332331 + m01 * m20332330 - m03 * m20312130,		m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
+	Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231,		-m00 * m12331332 + m02 * m10331330 - m03 * m10321230,		m00 * m11331331 - m01 * m10331330 + m03 * m10311130,		-m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
+	Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221,		m00 * m12231322 - m02 * m10231320 + m03 * m10221220,		-m00 * m11231321 + m01 * m10231320 - m03 * m10211120,		m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
+
+	float det = m00 * col0.mF32[0] + m01 * col0.mF32[1] + m02 * col0.mF32[2] + m03 * col0.mF32[3];
+
+	return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
 #endif
 }
 
@@ -800,7 +855,20 @@ Mat44 Mat44::Adjointed3x3() const
 	result.mCol[3].mValue = v0001;
 	return result;
 #else
-	#error Undefined CPU architecture
+	return Mat44(
+		Vec4(JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1),
+			JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2),
+			JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0),
+			0),
+		Vec4(JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2),
+			JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0),
+			JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1),
+			0),
+		Vec4(JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1),
+			JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2),
+			JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0),
+			0),
+		Vec4(0, 0, 0, 1));
 #endif
 }
 
@@ -947,7 +1015,22 @@ Mat44 Mat44::Inversed3x3() const
 	result.mCol[3].mValue = v0001;
 	return result;
 #else
-	#error Undefined CPU architecture
+	float det = GetDeterminant3x3();
+
+	return Mat44(
+		Vec4((JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1)) / det,
+			(JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2)) / det,
+			(JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0)) / det,
+			0),
+		Vec4((JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2)) / det,
+			(JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0)) / det,
+			(JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1)) / det,
+			0),
+		Vec4((JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1)) / det,
+			(JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2)) / det,
+			(JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0)) / det,
+			0),
+		Vec4(0, 0, 0, 1));
 #endif
 }
 
@@ -1115,4 +1198,6 @@ Mat44 Mat44::Decompose(Vec3 &outScale) const
 	return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
 }
 
+#undef JPH_EL
+
 JPH_NAMESPACE_END

+ 2 - 2
Jolt/Math/Math.h

@@ -93,7 +93,7 @@ inline bool IsAligned(T inV, uint64 inAlignment)
 /// Compute number of trailing zero bits (how many low bits are zero)
 inline uint CountTrailingZeros(uint32 inValue)
 {
-#if defined(JPH_CPU_X86)
+#if defined(JPH_CPU_X86) || defined(JPH_CPU_WASM)
 	#if defined(JPH_USE_TZCNT)
 		return _tzcnt_u32(inValue);
 	#elif defined(JPH_COMPILER_MSVC)
@@ -117,7 +117,7 @@ inline uint CountTrailingZeros(uint32 inValue)
 /// Compute the number of leading zero bits (how many high bits are zero)
 inline uint CountLeadingZeros(uint32 inValue)
 {
-#if defined(JPH_CPU_X86)
+#if defined(JPH_CPU_X86) || defined(JPH_CPU_WASM)
 	#if defined(JPH_USE_LZCNT)
 		return _lzcnt_u32(inValue);
 	#elif defined(JPH_COMPILER_MSVC)

+ 5 - 2
Jolt/Math/UVec4.h

@@ -18,7 +18,7 @@ public:
 #elif defined(JPH_USE_NEON)
 	using Type = uint32x4_t;
 #else
-	#error Undefined
+	using Type = struct { uint32 mData[4]; };
 #endif
 
 	/// Constructor
@@ -97,7 +97,10 @@ public:
 	JPH_INLINE uint32			GetZ() const										{ return vgetq_lane_u32(mValue, 2); }
 	JPH_INLINE uint32			GetW() const										{ return vgetq_lane_u32(mValue, 3); }
 #else
-	#error Undefined
+	JPH_INLINE uint32			GetX() const										{ return mU32[0]; }
+	JPH_INLINE uint32			GetY() const										{ return mU32[1]; }
+	JPH_INLINE uint32			GetZ() const										{ return mU32[2]; }
+	JPH_INLINE uint32			GetW() const										{ return mU32[3]; }
 #endif
 
 	/// Set individual components

+ 59 - 29
Jolt/Math/UVec4.inl

@@ -12,7 +12,10 @@ UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
 	uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
 	mValue = vcombine_u32(xy, zw);
 #else
-	#error Undefined CPU architecture
+	mU32[0] = inX;
+	mU32[1] = inY;
+	mU32[2] = inZ;
+	mU32[3] = inW;
 #endif
 }
 
@@ -34,7 +37,7 @@ UVec4 UVec4::Swizzle() const
 #elif defined(JPH_USE_NEON)
 	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
 #endif
 }
 
@@ -45,7 +48,7 @@ UVec4 UVec4::sZero()
 #elif defined(JPH_USE_NEON)
 	return vdupq_n_u32(0);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(0, 0, 0, 0);
 #endif
 }
 
@@ -56,7 +59,7 @@ UVec4 UVec4::sReplicate(uint32 inV)
 #elif defined(JPH_USE_NEON)
 	return vdupq_n_u32(inV);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV, inV, inV, inV);
 #endif
 }
 
@@ -67,7 +70,7 @@ UVec4 UVec4::sLoadInt(const uint32 *inV)
 #elif defined(JPH_USE_NEON)
 	return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(*inV, 0, 0, 0);
 #endif
 }
 
@@ -78,7 +81,7 @@ UVec4 UVec4::sLoadInt4(const uint32 *inV)
 #elif defined(JPH_USE_NEON)
 	return vld1q_u32(inV);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV[0], inV[1], inV[2], inV[3]);
 #endif
 }
 
@@ -89,7 +92,7 @@ UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
 #elif defined(JPH_USE_NEON)
 	return vld1q_u32(inV); // ARM doesn't make distinction between aligned or not
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV[0], inV[1], inV[2], inV[3]);
 #endif
 }
 
@@ -138,7 +141,10 @@ UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vceqq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0, 
+				 inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0, 
+				 inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0, 
+				 inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
 #endif
 }
 
@@ -163,7 +169,10 @@ UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vorrq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mU32[0] | inV2.mU32[0], 
+				 inV1.mU32[1] | inV2.mU32[1], 
+				 inV1.mU32[2] | inV2.mU32[2], 
+				 inV1.mU32[3] | inV2.mU32[3]);
 #endif
 }
 
@@ -174,7 +183,10 @@ UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return veorq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mU32[0] ^ inV2.mU32[0], 
+				 inV1.mU32[1] ^ inV2.mU32[1], 
+				 inV1.mU32[2] ^ inV2.mU32[2], 
+				 inV1.mU32[3] ^ inV2.mU32[3]);
 #endif
 }
 
@@ -185,7 +197,10 @@ UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vandq_u32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mU32[0] & inV2.mU32[0], 
+				 inV1.mU32[1] & inV2.mU32[1], 
+				 inV1.mU32[2] & inV2.mU32[2], 
+				 inV1.mU32[3] & inV2.mU32[3]);
 #endif
 }
 
@@ -197,7 +212,7 @@ UVec4 UVec4::sNot(UVec4Arg inV1)
 #elif defined(JPH_USE_NEON)
 	return vmvnq_u32(inV1.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
 #endif
 }
 
@@ -236,7 +251,10 @@ UVec4 UVec4::operator + (UVec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vaddq_u32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[0] + inV2.mU32[0], 
+				 mU32[1] + inV2.mU32[1], 
+				 mU32[2] + inV2.mU32[2], 
+				 mU32[3] + inV2.mU32[3]);
 #endif
 }
 
@@ -247,7 +265,8 @@ UVec4 &UVec4::operator += (UVec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vaddq_u32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		mU32[i] += inV2.mU32[i];
 #endif
 	return *this;
 }
@@ -259,7 +278,7 @@ UVec4 UVec4::SplatX() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_u32(mValue, 0);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
 #endif
 }
 
@@ -270,7 +289,7 @@ UVec4 UVec4::SplatY() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_u32(mValue, 1);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
 #endif
 }
 
@@ -281,7 +300,7 @@ UVec4 UVec4::SplatZ() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_u32(mValue, 2);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
 #endif
 }
 
@@ -292,7 +311,7 @@ UVec4 UVec4::SplatW() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_u32(mValue, 3);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
 #endif
 }
 
@@ -303,7 +322,7 @@ Vec4 UVec4::ToFloat() const
 #elif defined(JPH_USE_NEON)
 	return vcvtq_f32_s32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4((float)mU32[0], (float)mU32[1], (float)mU32[2], (float)mU32[3]);
 #endif
 }
 
@@ -314,7 +333,7 @@ Vec4 UVec4::ReinterpretAsFloat() const
 #elif defined(JPH_USE_NEON)
 	return vreinterpretq_f32_s32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return *reinterpret_cast<const Vec4 *>(this);
 #endif
 }
 
@@ -325,7 +344,8 @@ void UVec4::StoreInt4(uint32 *outV) const
 #elif defined(JPH_USE_NEON)
 	vst1q_u32(outV, mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		outV[i] = mU32[i];
 #endif
 }
 
@@ -336,7 +356,8 @@ void UVec4::StoreInt4Aligned(uint32 *outV) const
 #elif defined(JPH_USE_NEON)
 	vst1q_u32(outV, mValue); // ARM doesn't make distinction between aligned or not
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		outV[i] = mU32[i];
 #endif
 }
 
@@ -347,7 +368,7 @@ int UVec4::CountTrues() const
 #elif defined(JPH_USE_NEON)
     return vaddvq_u32(vshrq_n_u32(mValue, 31));
 #else
-	#error Unsupported CPU architecture
+	return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
 #endif
 }
 
@@ -359,7 +380,7 @@ int UVec4::GetTrues() const
     int32x4_t shift = { 0, 1, 2, 3 };
     return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
 #else
-	#error Unsupported CPU architecture
+	return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
 #endif
 }
 
@@ -393,7 +414,7 @@ UVec4 UVec4::LogicalShiftLeft() const
 #elif defined(JPH_USE_NEON)
 	return vshlq_n_u32(mValue, Count);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[0] << Count, mU32[1] << Count, mU32[2] << Count, mU32[3] << Count);
 #endif
 }
 
@@ -407,7 +428,7 @@ UVec4 UVec4::LogicalShiftRight() const
 #elif defined(JPH_USE_NEON)
 	return vshrq_n_u32(mValue, Count);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[0] >> Count, mU32[1] >> Count, mU32[2] >> Count, mU32[3] >> Count);
 #endif
 }
 
@@ -421,7 +442,10 @@ UVec4 UVec4::ArithmeticShiftRight() const
 #elif defined(JPH_USE_NEON)
 	return vshrq_n_s32(mValue, Count);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(uint32(int32_t(mU32[0]) >> Count), 
+				 uint32(int32_t(mU32[1]) >> Count), 
+				 uint32(int32_t(mU32[2]) >> Count), 
+				 uint32(int32_t(mU32[3]) >> Count));
 #endif
 }
 
@@ -434,7 +458,10 @@ UVec4 UVec4::Expand4Uint16Lo() const
 	int16x4_t zero = vdup_n_s16(0);
 	return vcombine_s16(vzip1_s16(value, zero), vzip2_s16(value, zero));
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[0] & 0xffff, 
+				 (mU32[0] >> 16) & 0xffff, 
+				 mU32[1] & 0xffff, 
+				 (mU32[1] >> 16) & 0xffff);
 #endif
 }
 
@@ -447,7 +474,10 @@ UVec4 UVec4::Expand4Uint16Hi() const
 	int16x4_t zero = vdup_n_s16(0);
 	return vcombine_s16(vzip1_s16(value, zero), vzip2_s16(value, zero));
 #else
-	#error Unsupported CPU architecture
+	return UVec4(mU32[2] & 0xffff, 
+				 (mU32[2] >> 16) & 0xffff, 
+				 mU32[3] & 0xffff, 
+				 (mU32[3] >> 16) & 0xffff);
 #endif
 }
 

+ 4 - 2
Jolt/Math/Vec3.h

@@ -22,7 +22,7 @@ public:
 #elif defined(JPH_USE_NEON)
 	using Type = float32x4_t;
 #else
-	#error Undefined
+	using Type = Vec4::Type;
 #endif
 
 	/// Constructor
@@ -115,7 +115,9 @@ public:
 	JPH_INLINE float			GetY() const									{ return vgetq_lane_f32(mValue, 1); }
 	JPH_INLINE float			GetZ() const									{ return vgetq_lane_f32(mValue, 2); }
 #else
-	#error Undefined
+	JPH_INLINE float			GetX() const									{ return mF32[0]; }
+	JPH_INLINE float			GetY() const									{ return mF32[1]; }
+	JPH_INLINE float			GetZ() const									{ return mF32[2]; }
 #endif
 	
 	/// Set individual components

+ 108 - 42
Jolt/Math/Vec3.inl

@@ -30,7 +30,12 @@ JPH_INLINE Vec3::Type Vec3::sFixW(Type inValue)
 	#elif defined(JPH_USE_NEON)
 		return __builtin_shufflevector(inValue, inValue, 0, 1, 2, 2);
 	#else
-		#error Unsupported CPU architecture
+		Type value;
+		value.mData[0] = inValue.mData[0];
+		value.mData[1] = inValue.mData[1];
+		value.mData[2] = inValue.mData[2];
+		value.mData[3] = inValue.mData[2];
+		return value;
 	#endif
 #else
 	return inValue;
@@ -55,7 +60,12 @@ Vec3::Vec3(const Float3 &inV)
     float32x2_t zz = vdup_n_f32(inV.z); // Assure Z and W are the same
     mValue = vcombine_f32(xy, zz);
 #else
-	#error Undefined CPU architecture
+	mF32[0] = inV[0];
+	mF32[1] = inV[1];
+	mF32[2] = inV[2];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = inV[2];
+	#endif
 #endif
 }
 
@@ -68,7 +78,12 @@ Vec3::Vec3(float inX, float inY, float inZ)
 	uint32x2_t zz = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32* >(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) << 32));
 	mValue = vcombine_f32(xy, zz);
 #else
-	#error Undefined CPU architecture
+	mF32[0] = inX;
+	mF32[1] = inY;
+	mF32[2] = inZ;
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = inZ;
+	#endif
 #endif
 }
 
@@ -84,7 +99,7 @@ Vec3 Vec3::Swizzle() const
 #elif defined(JPH_USE_NEON)
 	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ]);
 #endif
 }
 
@@ -95,7 +110,7 @@ Vec3 Vec3::sZero()
 #elif defined(JPH_USE_NEON)
 	return vdupq_n_f32(0);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(0, 0, 0);
 #endif
 }
 
@@ -106,7 +121,7 @@ Vec3 Vec3::sReplicate(float inV)
 #elif defined(JPH_USE_NEON)
 	return vdupq_n_f32(inV);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(inV, inV, inV);
 #endif
 }
 
@@ -122,7 +137,7 @@ Vec3 Vec3::sLoadFloat3Unsafe(const Float3 &inV)
 #elif defined(JPH_USE_NEON)
 	Type v = vld1q_f32(&inV.x);
 #else
-	#error Unsupported CPU architecture
+	Type v = { inV.x, inV.y, inV.z };
 #endif
 	return sFixW(v);
 }
@@ -134,7 +149,9 @@ Vec3 Vec3::sMin(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vminq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(min(inV1.mF32[0], inV2.mF32[0]), 
+				min(inV1.mF32[1], inV2.mF32[1]), 
+				min(inV1.mF32[2], inV2.mF32[2]));
 #endif
 }
 
@@ -145,7 +162,9 @@ Vec3 Vec3::sMax(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vmaxq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(max(inV1.mF32[0], inV2.mF32[0]), 
+				max(inV1.mF32[1], inV2.mF32[1]), 
+				max(inV1.mF32[2], inV2.mF32[2]));
 #endif
 }
 
@@ -161,7 +180,11 @@ UVec4 Vec3::sEquals(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vceqq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	uint32 z = inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0;
+	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0, 
+				 z, 
+				 z);
 #endif
 }
 
@@ -172,7 +195,11 @@ UVec4 Vec3::sLess(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcltq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	uint32 z = inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0;
+	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0, 
+				 z, 
+				 z);
 #endif
 }
 
@@ -183,7 +210,11 @@ UVec4 Vec3::sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcleq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	uint32 z = inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0;
+	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0, 
+				 z, 
+				 z);
 #endif
 }
 
@@ -194,7 +225,11 @@ UVec4 Vec3::sGreater(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcgtq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	uint32 z = inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0;
+	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0, 
+				 z, 
+				 z);
 #endif
 }
 
@@ -205,7 +240,11 @@ UVec4 Vec3::sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcgeq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	uint32 z = inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0;
+	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0, 
+				 z, 
+				 z);
 #endif
 }
 
@@ -220,7 +259,9 @@ Vec3 Vec3::sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
 #elif defined(JPH_USE_NEON)
 	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
+				inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
+				inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2]);
 #endif
 }
 
@@ -250,7 +291,7 @@ Vec3 Vec3::sOr(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vorrq_s32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
 #endif
 }
 
@@ -261,7 +302,7 @@ Vec3 Vec3::sXor(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return veorq_s32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
 #endif
 }
 
@@ -272,7 +313,7 @@ Vec3 Vec3::sAnd(Vec3Arg inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vandq_s32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
 #endif
 }
 
@@ -314,7 +355,7 @@ Vec3 Vec3::operator * (Vec3Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vmulq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[0] * inV2.mF32[0], mF32[1] * inV2.mF32[1], mF32[2] * inV2.mF32[2]);
 #endif
 }
 
@@ -325,7 +366,7 @@ Vec3 Vec3::operator * (float inV2) const
 #elif defined(JPH_USE_NEON)
 	return vmulq_n_f32(mValue, inV2);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2);
 #endif
 }
 
@@ -336,7 +377,7 @@ Vec3 operator * (float inV1, Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vmulq_n_f32(inV2.mValue, inV1);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(inV1 * inV2.mF32[0], inV1 * inV2.mF32[1], inV1 * inV2.mF32[2]);
 #endif
 }
 
@@ -347,7 +388,7 @@ Vec3 Vec3::operator / (float inV2) const
 #elif defined(JPH_USE_NEON)
 	return vdivq_f32(mValue, vdupq_n_f32(inV2));
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2);
 #endif
 }
 
@@ -358,7 +399,11 @@ Vec3 &Vec3::operator *= (float inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vmulq_n_f32(mValue, inV2);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		mF32[i] *= inV2;
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = mF32[2];
+	#endif
 #endif
 	return *this;
 }
@@ -370,7 +415,11 @@ Vec3 &Vec3::operator *= (Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vmulq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		mF32[i] *= inV2.mF32[i];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = mF32[2];
+	#endif
 #endif
 	return *this;
 }
@@ -382,7 +431,11 @@ Vec3 &Vec3::operator /= (float inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		mF32[i] /= inV2;
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = mF32[2];
+	#endif
 #endif
 	return *this;
 }
@@ -394,7 +447,7 @@ Vec3 Vec3::operator + (Vec3Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vaddq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[0] + inV2.mF32[0], mF32[1] + inV2.mF32[1], mF32[2] + inV2.mF32[2]);
 #endif
 }
 
@@ -405,7 +458,11 @@ Vec3 &Vec3::operator += (Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vaddq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		mF32[i] += inV2.mF32[i];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = mF32[2];
+	#endif
 #endif
 	return *this;
 }
@@ -417,7 +474,7 @@ Vec3 Vec3::operator - () const
 #elif defined(JPH_USE_NEON)
 	return vnegq_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(-mF32[0], -mF32[1], -mF32[2]);
 #endif
 }
 
@@ -428,7 +485,7 @@ Vec3 Vec3::operator - (Vec3Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vsubq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[0] - inV2.mF32[0], mF32[1] - inV2.mF32[1], mF32[2] - inV2.mF32[2]);
 #endif
 }
 
@@ -439,7 +496,11 @@ Vec3 &Vec3::operator -= (Vec3Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vsubq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		mF32[i] -= inV2.mF32[i];
+	#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
+		mF32[3] = mF32[2];
+	#endif
 #endif
 	return *this;
 }
@@ -452,7 +513,7 @@ Vec3 Vec3::operator / (Vec3Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vdivq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[0] / inV2.mF32[0], mF32[1] / inV2.mF32[1], mF32[2] / inV2.mF32[2]);
 #endif
 }
 
@@ -463,7 +524,7 @@ Vec4 Vec3::SplatX() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 0);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
 #endif
 }
 
@@ -474,7 +535,7 @@ Vec4 Vec3::SplatY() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 1);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
 #endif
 }
 
@@ -485,7 +546,7 @@ Vec4 Vec3::SplatZ() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 2);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
 #endif
 }
 
@@ -508,7 +569,7 @@ Vec3 Vec3::Abs() const
 #elif defined(JPH_USE_NEON)
 	return vabsq_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]));
 #endif
 }
 
@@ -534,7 +595,9 @@ Vec3 Vec3::Cross(Vec3Arg inV2) const
     Type t3 = vsubq_f32(t1, t2);
     return __builtin_shufflevector(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
 #else
-	#error Unsupported CPU architecture
+	return Vec3(mF32[1] * inV2.mF32[2] - mF32[2] * inV2.mF32[1],
+				mF32[2] * inV2.mF32[0] - mF32[0] * inV2.mF32[2],
+				mF32[0] * inV2.mF32[1] - mF32[1] * inV2.mF32[0]);
 #endif
 }
 
@@ -623,7 +686,7 @@ Vec3 Vec3::Sqrt() const
 #elif defined(JPH_USE_NEON)
 	return vsqrtq_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]));
 #endif
 }
 
@@ -684,7 +747,7 @@ bool Vec3::IsNaN() const
 	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
 	return vaddvq_u32(vandq_u32(is_equal, mask)) != 3;
 #else
-	#error Unsupported CPU architecture
+	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]);
 #endif
 }
 
@@ -701,7 +764,8 @@ void Vec3::StoreFloat3(Float3 *outV) const
     vst1_f32(&outV->x, xy);
     vst1q_lane_f32(&outV->z, mValue, 2);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 3; ++i)
+		(&outV->x)[i] = mF32[i];
 #endif
 }
 
@@ -712,7 +776,7 @@ UVec4 Vec3::ToInt() const
 #elif defined(JPH_USE_NEON)
 	return vcvtq_u32_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
 #endif
 }
 
@@ -723,7 +787,7 @@ UVec4 Vec3::ReinterpretAsInt() const
 #elif defined(JPH_USE_NEON)
 	return vreinterpretq_u32_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return *reinterpret_cast<const UVec4 *>(this);
 #endif
 }
 
@@ -768,7 +832,9 @@ Vec3 Vec3::GetSign() const
 	Type one = vdupq_n_f32(1.0f);
 	return vorrq_s32(vandq_s32(mValue, minus_one), one);
 #else
-	#error Unsupported CPU architecture
+	return Vec3(signbit(mF32[0])? -1.0f : 1.0f, 
+				signbit(mF32[1])? -1.0f : 1.0f, 
+				signbit(mF32[2])? -1.0f : 1.0f);
 #endif
 }
 

+ 5 - 2
Jolt/Math/Vec4.h

@@ -20,7 +20,7 @@ public:
 #elif defined(JPH_USE_NEON)
 	using Type = float32x4_t;
 #else
-	#error Undefined
+	using Type = struct { float mData[4]; };
 #endif
 
 	/// Constructor
@@ -108,7 +108,10 @@ public:
 	JPH_INLINE float			GetZ() const									{ return vgetq_lane_f32(mValue, 2); }
 	JPH_INLINE float			GetW() const									{ return vgetq_lane_f32(mValue, 3); }
 #else
-	#error Undefined
+	JPH_INLINE float			GetX() const									{ return mF32[0]; }
+	JPH_INLINE float			GetY() const									{ return mF32[1]; }
+	JPH_INLINE float			GetZ() const									{ return mF32[2]; }
+	JPH_INLINE float			GetW() const									{ return mF32[3]; }
 #endif
 
 	/// Set individual components

+ 93 - 42
Jolt/Math/Vec4.inl

@@ -35,7 +35,10 @@ Vec4::Vec4(float inX, float inY, float inZ, float inW)
 	uint32x2_t zw = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32* >(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
 	mValue = vcombine_f32(xy, zw);
 #else
-	#error Undefined CPU architecture
+	mF32[0] = inX;
+	mF32[1] = inY;
+	mF32[2] = inZ;
+	mF32[3] = inW;
 #endif
 }
 
@@ -52,7 +55,7 @@ Vec4 Vec4::Swizzle() const
 #elif defined(JPH_USE_NEON)
 	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
 #endif
 }
 
@@ -63,7 +66,7 @@ Vec4 Vec4::sZero()
 #elif defined(JPH_USE_NEON)
 	return vdupq_n_f32(0);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(0, 0, 0, 0);
 #endif
 }
 
@@ -74,7 +77,7 @@ Vec4 Vec4::sReplicate(float inV)
 #elif defined(JPH_USE_NEON)
 	return vdupq_n_f32(inV);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(inV, inV, inV, inV);
 #endif
 }
 
@@ -90,7 +93,7 @@ Vec4 Vec4::sLoadFloat4(const Float4 *inV)
 #elif defined(JPH_USE_NEON)
 	return vld1q_f32(&inV->x);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(inV->x, inV->y, inV->z, inV->w);
 #endif
 }
 
@@ -101,7 +104,7 @@ Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
 #elif defined(JPH_USE_NEON)
 	return vld1q_f32(&inV->x);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(inV->x, inV->y, inV->z, inV->w);
 #endif
 }
 
@@ -138,7 +141,10 @@ Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vminq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(min(inV1.mF32[0], inV2.mF32[0]), 
+				min(inV1.mF32[1], inV2.mF32[1]), 
+				min(inV1.mF32[2], inV2.mF32[2]), 
+				min(inV1.mF32[3], inV2.mF32[3]));
 #endif
 }
 
@@ -149,7 +155,10 @@ Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vmaxq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(max(inV1.mF32[0], inV2.mF32[0]), 
+				max(inV1.mF32[1], inV2.mF32[1]), 
+				max(inV1.mF32[2], inV2.mF32[2]), 
+				max(inV1.mF32[3], inV2.mF32[3]));
 #endif
 }
 
@@ -160,7 +169,10 @@ UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vceqq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0, 
+				 inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0, 
+				 inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
 #endif
 }
 
@@ -171,7 +183,10 @@ UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcltq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0, 
+				 inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0, 
+				 inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
 #endif
 }
 
@@ -182,7 +197,10 @@ UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcleq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0, 
+				 inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0, 
+				 inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
 #endif
 }
 
@@ -193,7 +211,10 @@ UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcgtq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0, 
+				 inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0, 
+				 inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
 #endif
 }
 
@@ -204,7 +225,10 @@ UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vcgeq_f32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0, 
+				 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0, 
+				 inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0, 
+				 inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
 #endif
 }
 
@@ -219,7 +243,10 @@ Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
 #elif defined(JPH_USE_NEON)
 	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
+				inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
+				inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2],
+				inMul1.mF32[3] * inMul2.mF32[3] + inAdd.mF32[3]);
 #endif
 }
 
@@ -244,7 +271,7 @@ Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vorrq_s32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
 #endif
 }
 
@@ -255,7 +282,7 @@ Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return veorq_s32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
 #endif
 }
 
@@ -266,7 +293,7 @@ Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vandq_s32(inV1.mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
 #endif
 }
 
@@ -341,7 +368,7 @@ bool Vec4::IsNaN() const
 	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
 	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
 #else
-	#error Unsupported CPU architecture
+	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
 #endif
 }
 
@@ -352,7 +379,10 @@ Vec4 Vec4::operator * (Vec4Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vmulq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0] * inV2.mF32[0], 
+				mF32[1] * inV2.mF32[1], 
+				mF32[2] * inV2.mF32[2], 
+				mF32[3] * inV2.mF32[3]);
 #endif
 }
 
@@ -363,7 +393,7 @@ Vec4 Vec4::operator * (float inV2) const
 #elif defined(JPH_USE_NEON)
 	return vmulq_n_f32(mValue, inV2);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
 #endif
 }
 
@@ -375,7 +405,10 @@ Vec4 operator * (float inV1, Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	return vmulq_n_f32(inV2.mValue, inV1);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(inV1 * inV2.mF32[0], 
+				inV1 * inV2.mF32[1], 
+				inV1 * inV2.mF32[2], 
+				inV1 * inV2.mF32[3]);
 #endif
 }
 
@@ -386,7 +419,7 @@ Vec4 Vec4::operator / (float inV2) const
 #elif defined(JPH_USE_NEON)
 	return vdivq_f32(mValue, vdupq_n_f32(inV2));
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
 #endif
 }
 
@@ -397,7 +430,8 @@ Vec4 &Vec4::operator *= (float inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vmulq_n_f32(mValue, inV2);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		mF32[i] *= inV2;
 #endif
 	return *this;
 }
@@ -409,7 +443,8 @@ Vec4 &Vec4::operator *= (Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vmulq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		mF32[i] *= inV2.mF32[i];
 #endif
 	return *this;
 }
@@ -421,7 +456,8 @@ Vec4 &Vec4::operator /= (float inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		mF32[i] /= inV2;
 #endif
 	return *this;
 }
@@ -433,7 +469,10 @@ Vec4 Vec4::operator + (Vec4Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vaddq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0] + inV2.mF32[0], 
+				mF32[1] + inV2.mF32[1], 
+				mF32[2] + inV2.mF32[2], 
+				mF32[3] + inV2.mF32[3]);
 #endif
 }
 
@@ -444,7 +483,8 @@ Vec4 &Vec4::operator += (Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vaddq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		mF32[i] += inV2.mF32[i];
 #endif
 	return *this;
 }
@@ -456,7 +496,7 @@ Vec4 Vec4::operator - () const
 #elif defined(JPH_USE_NEON)
 	return vnegq_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
 #endif
 }
 
@@ -467,7 +507,10 @@ Vec4 Vec4::operator - (Vec4Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vsubq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0] - inV2.mF32[0], 
+				mF32[1] - inV2.mF32[1], 
+				mF32[2] - inV2.mF32[2], 
+				mF32[3] - inV2.mF32[3]);
 #endif
 }
 
@@ -478,7 +521,8 @@ Vec4 &Vec4::operator -= (Vec4Arg inV2)
 #elif defined(JPH_USE_NEON)
 	mValue = vsubq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		mF32[i] -= inV2.mF32[i];
 #endif
 	return *this;
 }
@@ -490,7 +534,10 @@ Vec4 Vec4::operator / (Vec4Arg inV2) const
 #elif defined(JPH_USE_NEON)
 	return vdivq_f32(mValue, inV2.mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0] / inV2.mF32[0], 
+				mF32[1] / inV2.mF32[1], 
+				mF32[2] / inV2.mF32[2], 
+				mF32[3] / inV2.mF32[3]);
 #endif
 }
 
@@ -501,7 +548,7 @@ Vec4 Vec4::SplatX() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 0);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
 #endif
 }
 
@@ -512,7 +559,7 @@ Vec4 Vec4::SplatY() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 1);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
 #endif
 }
 
@@ -523,7 +570,7 @@ Vec4 Vec4::SplatZ() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 2);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
 #endif
 }
 
@@ -534,7 +581,7 @@ Vec4 Vec4::SplatW() const
 #elif defined(JPH_USE_NEON)
 	return vdupq_laneq_f32(mValue, 3);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
 #endif
 }
 
@@ -547,7 +594,7 @@ Vec4 Vec4::Abs() const
 #elif defined(JPH_USE_NEON)
 	return vabsq_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
 #endif
 }
 
@@ -621,7 +668,7 @@ Vec4 Vec4::Sqrt() const
 #elif defined(JPH_USE_NEON)
 	return vsqrtq_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]), sqrt(mF32[3]));
 #endif
 }
 
@@ -639,7 +686,10 @@ Vec4 Vec4::GetSign() const
 	Type one = vdupq_n_f32(1.0f);
 	return vorrq_s32(vandq_s32(mValue, minus_one), one);
 #else
-	#error Unsupported CPU architecture
+	return Vec4(signbit(mF32[0])? -1.0f : 1.0f, 
+				signbit(mF32[1])? -1.0f : 1.0f, 
+				signbit(mF32[2])? -1.0f : 1.0f, 
+				signbit(mF32[3])? -1.0f : 1.0f);
 #endif
 }
 
@@ -663,7 +713,8 @@ void Vec4::StoreFloat4(Float4 *outV) const
 #elif defined(JPH_USE_NEON)
     vst1q_f32(&outV->x, mValue);
 #else
-	#error Unsupported CPU architecture
+	for (int i = 0; i < 4; ++i)
+		(&outV->x)[i] = mF32[i];
 #endif
 }
 
@@ -674,7 +725,7 @@ UVec4 Vec4::ToInt() const
 #elif defined(JPH_USE_NEON)
 	return vcvtq_u32_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
 #endif
 }
 
@@ -685,7 +736,7 @@ UVec4 Vec4::ReinterpretAsInt() const
 #elif defined(JPH_USE_NEON)
 	return vreinterpretq_u32_f32(mValue);
 #else
-	#error Unsupported CPU architecture
+	return *reinterpret_cast<const UVec4 *>(this);
 #endif
 }
 
@@ -697,7 +748,7 @@ int Vec4::GetSignBits() const
     int32x4_t shift = { 0, 1, 2, 3 };
     return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
 #else
-	#error Unsupported CPU architecture
+	return (signbit(mF32[0])? 1 : 0) | (signbit(mF32[1])? 2 : 0) | (signbit(mF32[2])? 4 : 0) | (signbit(mF32[3])? 8 : 0);
 #endif
 }