Browse files

Windows on ARM64 support (#317)

Jorrit Rouwe 2 years ago
parent
commit
75b5180deb

+ 18 - 0
.github/workflows/build.yml

@@ -123,6 +123,24 @@ jobs:
       working-directory: ${{github.workspace}}/Build/VS2022_CL_32_BIT/${{matrix.build_type}}
       run: ./UnitTests.exe
 
+  msvc_cl_arm:
+    runs-on: windows-latest
+    name: Visual Studio CL ARM
+    strategy:
+        fail-fast: false
+        matrix:
+            build_type: [Debug, Release]
+
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v3
+    - name: Add msbuild to PATH
+      uses: microsoft/setup-msbuild@v1.1
+    - name: Configure CMake
+      run: cmake -B ${{github.workspace}}/Build/VS2022_CL_ARM -G "Visual Studio 17 2022" -A ARM64 Build
+    - name: Build
+      run: msbuild Build\VS2022_CL_ARM\JoltPhysics.sln /property:Configuration=${{matrix.build_type}}
+
   macos:
     runs-on: macos-latest
     name: MacOS

+ 36 - 30
Build/CMakeLists.txt

@@ -57,12 +57,18 @@ if (("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUA
 
 	# Set compiler flags for various configurations
 	set(CMAKE_CXX_FLAGS_DEBUG "/GS /Od /Ob0 /RTC1")
-	set(CMAKE_CXX_FLAGS_RELEASE "/GS- /GL /Gy /O2 /Oi /Ot")
-	set(CMAKE_CXX_FLAGS_DISTRIBUTION "/GS- /GL /Gy /O2 /Oi /Ot")
+	set(CMAKE_CXX_FLAGS_RELEASE "/GS- /Gy /O2 /Oi /Ot")
+	set(CMAKE_CXX_FLAGS_DISTRIBUTION "/GS- /Gy /O2 /Oi /Ot")
 	set(CMAKE_CXX_FLAGS_RELEASEASAN "-fsanitize=address /Od")
 	set(CMAKE_CXX_FLAGS_RELEASEUBSAN "-fsanitize=undefined,implicit-conversion,float-divide-by-zero,local-bounds -fno-sanitize-recover=all")
 	set(CMAKE_CXX_FLAGS_RELEASECOVERAGE "-fprofile-instr-generate -fcoverage-mapping")
 
+	if (NOT ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "ARM64"))
+		# On ARM64, whole program optimization triggers an internal compiler error during code gen, so we don't turn it on
+		set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
+		set(CMAKE_CXX_FLAGS_DISTRIBUTION "${CMAKE_CXX_FLAGS_DISTRIBUTION} /GL")
+	endif()
+
 	# Set linker flags
 	set(CMAKE_EXE_LINKER_FLAGS "/SUBSYSTEM:WINDOWS /ignore:4221 /DEBUG:FASTLINK")
 	if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
@@ -71,30 +77,32 @@ if (("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUA
 		else()
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") # Clang doesn't use fast math because it cannot be turned off inside a single compilation unit
 		endif()
-		if (USE_AVX512)
-			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
-		elseif (USE_AVX2)
-			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-		elseif (USE_AVX)
-			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
-		endif()	
-		if (USE_SSE4_1)
-			add_compile_definitions(JPH_USE_SSE4_1)
-		endif()
-		if (USE_SSE4_2)
-			add_compile_definitions(JPH_USE_SSE4_2)
-		endif()
-		if (USE_LZCNT)
-			add_compile_definitions(JPH_USE_LZCNT)
-		endif()
-		if (USE_TZCNT)
-			add_compile_definitions(JPH_USE_TZCNT)
-		endif()
-		if (USE_F16C)
-			add_compile_definitions(JPH_USE_F16C)
-		endif()
-		if (USE_FMADD AND NOT CROSS_PLATFORM_DETERMINISTIC)
-			add_compile_definitions(JPH_USE_FMADD)
+		if ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "x86" OR "${CMAKE_VS_PLATFORM_NAME}" STREQUAL "x64")
+			if (USE_AVX512)
+				set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
+			elseif (USE_AVX2)
+				set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+			elseif (USE_AVX)
+				set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+			endif()	
+			if (USE_SSE4_1)
+				add_compile_definitions(JPH_USE_SSE4_1)
+			endif()
+			if (USE_SSE4_2)
+				add_compile_definitions(JPH_USE_SSE4_2)
+			endif()
+			if (USE_LZCNT)
+				add_compile_definitions(JPH_USE_LZCNT)
+			endif()
+			if (USE_TZCNT)
+				add_compile_definitions(JPH_USE_TZCNT)
+			endif()
+			if (USE_F16C)
+				add_compile_definitions(JPH_USE_F16C)
+			endif()
+			if (USE_FMADD AND NOT CROSS_PLATFORM_DETERMINISTIC)
+				add_compile_definitions(JPH_USE_FMADD)
+			endif()
 		endif()
 		set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /DJPH_FLOATING_POINT_EXCEPTIONS_ENABLED") # Clang turns Float2 into a vector sometimes causing floating point exceptions
 		set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /DJPH_FLOATING_POINT_EXCEPTIONS_ENABLED")
@@ -148,7 +156,8 @@ elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQU
 	# Platform specific compiler flags
 	if (CROSS_COMPILE_ARM)
 		set(CMAKE_CXX_FLAGS "--target=aarch64-linux-gnu ${CMAKE_CXX_FLAGS}")
-	elseif (CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+	elseif (CMAKE_OSX_ARCHITECTURES MATCHES "arm64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
+		# ARM64
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 	elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64")
 		# X64
@@ -177,9 +186,6 @@ elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQU
 		if (USE_FMADD AND NOT CROSS_PLATFORM_DETERMINISTIC)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
 		endif()
-	elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-		# ARM64
-		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 	endif()
 
 	# Set compiler flags for various configurations

+ 3 - 0
Build/cmake_vs2022_cl_arm.bat

@@ -0,0 +1,3 @@
+@echo off
+cmake -S . -B VS2022_CL_ARM -G "Visual Studio 17 2022" -A ARM64 %*
+echo Open VS2022_CL_ARM\JoltPhysics.sln to build the project.

+ 87 - 0
Jolt/Core/ARMNeon.h

@@ -0,0 +1,87 @@
+// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#ifdef JPH_USE_NEON
+
+#ifdef JPH_COMPILER_MSVC
+	JPH_NAMESPACE_BEGIN
+
+	// Constructing NEON values
+	#define JPH_NEON_INT32x4(v1, v2, v3, v4) { int64_t(v1) + (int64_t(v2) << 32), int64_t(v3) + (int64_t(v4) << 32) }
+	#define JPH_NEON_UINT32x4(v1, v2, v3, v4) { uint64_t(v1) + (uint64_t(v2) << 32), uint64_t(v3) + (uint64_t(v4) << 32) }
+	#define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { int64_t(v1) + (int64_t(v2) << 8) + (int64_t(v3) << 16) + (int64_t(v4) << 24) + (int64_t(v5) << 32) + (int64_t(v6) << 40) + (int64_t(v7) << 48) + (int64_t(v8) << 56), int64_t(v9) + (int64_t(v10) << 8) + (int64_t(v11) << 16) + (int64_t(v12) << 24) + (int64_t(v13) << 32) + (int64_t(v14) << 40) + (int64_t(v15) << 48) + (int64_t(v16) << 56) }
+
+	// Generic shuffle vector template
+	template <unsigned I1, unsigned I2, unsigned I3, unsigned I4>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4(float32x4_t inV1, float32x4_t inV2)
+	{
+		float32x4_t ret;
+		ret = vmovq_n_f32(vgetq_lane_f32(I1 >= 4? inV2 : inV1, I1 & 0b11));
+		ret = vsetq_lane_f32(vgetq_lane_f32(I2 >= 4? inV2 : inV1, I2 & 0b11), ret, 1);
+		ret = vsetq_lane_f32(vgetq_lane_f32(I3 >= 4? inV2 : inV1, I3 & 0b11), ret, 2);
+		ret = vsetq_lane_f32(vgetq_lane_f32(I4 >= 4? inV2 : inV1, I4 & 0b11), ret, 3);
+		return ret;
+	}
+
+	// Specializations
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 2>(float32x4_t inV1, float32x4_t inV2)
+	{
+		return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_high_f32(inV1), 0));
+	}
+
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 3, 3>(float32x4_t inV1, float32x4_t inV2)
+	{
+		return vcombine_f32(vget_low_f32(inV1), vdup_lane_f32(vget_high_f32(inV1), 1));
+	}
+
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<0, 1, 2, 3>(float32x4_t inV1, float32x4_t inV2)
+	{
+		return inV1;
+	}
+
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 0, 3, 2>(float32x4_t inV1, float32x4_t inV2)
+	{
+		return vcombine_f32(vrev64_f32(vget_low_f32(inV1)), vrev64_f32(vget_high_f32(inV1)));
+	}
+
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 2, 1, 0>(float32x4_t inV1, float32x4_t inV2)
+	{
+		return vcombine_f32(vdup_lane_f32(vget_high_f32(inV1), 0), vrev64_f32(vget_low_f32(inV1)));
+	}
+
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<2, 3, 0, 1>(float32x4_t inV1, float32x4_t inV2)
+	{
+		return vcombine_f32(vget_high_f32(inV1), vget_low_f32(inV1));
+	}
+
+	// Used extensively by cross product
+	template <>
+	JPH_INLINE float32x4_t NeonShuffleFloat32x4<1, 2, 0, 0>(float32x4_t inV1, float32x4_t inV2)
+	{
+		static int8x16_t table = JPH_NEON_INT8x16(0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03);
+		return vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(inV1), table));
+	}
+
+	// Shuffle a vector
+	#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) NeonShuffleFloat32x4<index1, index2, index3, index4>(vec1, vec2)
+
+	JPH_NAMESPACE_END
+#else
+	// Constructing NEON values
+	#define JPH_NEON_INT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
+	#define JPH_NEON_UINT32x4(v1, v2, v3, v4) { v1, v2, v3, v4 }
+	#define JPH_NEON_INT8x16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) { v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 }
+
+	// Shuffle a vector
+	#define JPH_NEON_SHUFFLE_F32x4(vec1, vec2, index1, index2, index3, index4) __builtin_shufflevector(vec1, vec2, index1, index2, index3, index4)
+#endif
+
+#endif // JPH_USE_NEON
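
The MSVC branch above exists because MSVC lacks __builtin_shufflevector and initializes its NEON vector types from 64-bit halves: the JPH_NEON_*x4/x16 macros therefore pack the lanes into two 64-bit constants, and shuffle indices 0-3 select lanes from the first vector while 4-7 select from the second. A minimal, compiler-independent sketch of both conventions (illustrative only, not part of this commit):

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Lane packing used by JPH_NEON_UINT32x4 on MSVC: lane i lives in bits
// [32 * (i & 1), 32 * (i & 1) + 31] of 64-bit half i / 2
static void CheckPacking()
{
	uint32_t v1 = 1, v2 = 2, v3 = 3, v4 = 4;
	uint64_t lo = uint64_t(v1) + (uint64_t(v2) << 32); // lanes 0 and 1
	uint64_t hi = uint64_t(v3) + (uint64_t(v4) << 32); // lanes 2 and 3
	assert(uint32_t(lo) == v1 && uint32_t(lo >> 32) == v2);
	assert(uint32_t(hi) == v3 && uint32_t(hi >> 32) == v4);
}

// Scalar reference for JPH_NEON_SHUFFLE_F32x4: indices 0-3 pick lanes from
// the first vector, 4-7 from the second
template <unsigned I1, unsigned I2, unsigned I3, unsigned I4>
static std::array<float, 4> ShuffleRef(const std::array<float, 4> &inV1, const std::array<float, 4> &inV2)
{
	auto lane = [&](unsigned inIndex) { return inIndex >= 4? inV2[inIndex & 0b11] : inV1[inIndex]; };
	return { lane(I1), lane(I2), lane(I3), lane(I4) };
}

int main()
{
	CheckPacking();
	std::array<float, 4> a { 0, 1, 2, 3 }, b { 4, 5, 6, 7 };
	assert((ShuffleRef<1, 0, 3, 2>(a, b) == std::array<float, 4> { 1, 0, 3, 2 }));
	assert((ShuffleRef<0, 1, 4, 5>(a, b) == std::array<float, 4> { 0, 1, 4, 5 }));
	return 0;
}
```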

+ 6 - 1
Jolt/Core/Core.h

@@ -246,7 +246,12 @@ JPH_SUPPRESS_WARNINGS_STD_END
 #if defined(JPH_USE_SSE)
 	#include <immintrin.h>
 #elif defined(JPH_USE_NEON)
-	#include <arm_neon.h>
+	#ifdef JPH_COMPILER_MSVC
+		#include <intrin.h>
+		#include <arm64_neon.h>
+	#else
+		#include <arm_neon.h>
+	#endif
 #endif
 
 JPH_NAMESPACE_BEGIN

+ 29 - 0
Jolt/Core/FPControlWord.h

@@ -30,6 +30,35 @@ private:
 	uint		mPrevState;	
 };
 
+#elif defined(JPH_USE_NEON) && defined(JPH_COMPILER_MSVC)
+
+/// Helper class that needs to be put on the stack to update the state of the floating point control word.
+/// This state is kept per thread.
+template <unsigned int Value, unsigned int Mask>
+class FPControlWord : public NonCopyable
+{
+public:
+				FPControlWord()
+	{
+		// Read state before change
+		_controlfp_s(&mPrevState, 0, 0);
+
+		// Update the state
+		unsigned int dummy;
+		_controlfp_s(&dummy, Value, Mask);
+	}
+
+				~FPControlWord()
+	{
+		// Restore state
+		unsigned int dummy;
+		_controlfp_s(&dummy, mPrevState, Mask);
+	}
+
+private:
+	unsigned int mPrevState;
+};
+
 #elif defined(JPH_USE_NEON)
 
 /// Helper class that needs to be put on the stack to update the state of the floating point control word.
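
The new branch goes through the CRT's _controlfp_s because ARM64 has no MXCSR register and MSVC does not accept the GCC/Clang-style inline assembly that the existing NEON branch uses on FPCR. A standalone sketch of the same save/apply/restore pattern, using the denormal-control bits as the example (MSVC-only; mirrors FPControlWord<_DN_FLUSH, _MCW_DN> from FPFlushDenormals.h below):

```cpp
#include <cfloat> // _controlfp_s, _DN_FLUSH, _MCW_DN (MSVC CRT)

// RAII guard following the same pattern as FPControlWord above
class FlushDenormalsScope
{
public:
	FlushDenormalsScope()
	{
		// Read the state before the change so it can be restored later
		_controlfp_s(&mPrevState, 0, 0);

		// Apply the new value, touching only the bits selected by the mask
		unsigned int dummy;
		_controlfp_s(&dummy, _DN_FLUSH, _MCW_DN);
	}

	~FlushDenormalsScope()
	{
		// Restore the masked bits to their previous state
		unsigned int dummy;
		_controlfp_s(&dummy, mPrevState, _MCW_DN);
	}

private:
	unsigned int mPrevState;
};
```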

+ 11 - 0
Jolt/Core/FPException.h

@@ -20,6 +20,17 @@ class FPExceptionDisableInvalid : public FPControlWord<_MM_MASK_INVALID, _MM_MAS
 /// Disable division by zero floating point exceptions
 class FPExceptionDisableDivByZero : public FPControlWord<_MM_MASK_DIV_ZERO, _MM_MASK_DIV_ZERO> { };
 
+#elif defined(JPH_USE_NEON) && defined(JPH_COMPILER_MSVC)
+
+/// Enable floating point divide by zero exception and exceptions on invalid numbers
+class FPExceptionsEnable : public FPControlWord<0, _EM_INVALID | _EM_ZERODIVIDE> { };
+
+/// Disable invalid floating point value exceptions
+class FPExceptionDisableInvalid : public FPControlWord<_EM_INVALID, _EM_INVALID> { };
+
+/// Disable division by zero floating point exceptions
+class FPExceptionDisableDivByZero : public FPControlWord<_EM_ZERODIVIDE, _EM_ZERODIVIDE> { };
+
 #elif defined(JPH_USE_NEON)
 
 /// Invalid operation exception bit
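
These instantiations mirror the SSE ones above, swapping the _MM_MASK_* bits for the CRT's _EM_* masks. Usage is unchanged on all platforms: the classes are scope guards, so masking an exception is a matter of putting one on the stack. A hypothetical usage sketch (the function is illustrative, not from the commit):

```cpp
#include <Jolt/Jolt.h>
#include <Jolt/Core/FPException.h>

float SafeInverse(float inValue)
{
	// Division by zero is masked for the rest of this scope; the destructor
	// restores the previous control word
	JPH::FPExceptionDisableDivByZero no_div_by_zero;
	return 1.0f / inValue; // Yields +/-INF instead of trapping when inValue == 0
}
```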

+ 4 - 0
Jolt/Core/FPFlushDenormals.h

@@ -13,6 +13,10 @@ JPH_NAMESPACE_BEGIN
 /// This can make floating point operations much faster when working with very small numbers
 class FPFlushDenormals : public FPControlWord<_MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_MASK> { };
 
+#elif defined(JPH_USE_NEON) && defined(JPH_COMPILER_MSVC)
+
+class FPFlushDenormals : public FPControlWord<_DN_FLUSH, _MCW_DN> { };
+
 #elif defined(JPH_USE_NEON)
 
 /// Flush denormals to zero bit

+ 3 - 3
Jolt/Core/TickCounter.cpp

@@ -24,7 +24,7 @@
 
 JPH_NAMESPACE_BEGIN
 
-#ifdef JPH_PLATFORM_WINDOWS_UWP
+#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM64))
 
 uint64 GetProcessorTickCount()
 {
@@ -33,10 +33,10 @@ uint64 GetProcessorTickCount()
 	return uint64(count.QuadPart);
 }
 
-#endif // JPH_PLATFORM_WINDOWS_UWP
+#endif // JPH_PLATFORM_WINDOWS_UWP || (JPH_PLATFORM_WINDOWS && JPH_CPU_ARM64)
 
 static const uint64 sProcessorTicksPerSecond = []() {
-#if defined(JPH_PLATFORM_WINDOWS_UWP)
+#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM64))
 	LARGE_INTEGER frequency { };
 	QueryPerformanceFrequency(&frequency);
 	return uint64(frequency.QuadPart);

+ 3 - 3
Jolt/Core/TickCounter.h

@@ -12,7 +12,7 @@
 
 JPH_NAMESPACE_BEGIN
 
-#ifdef JPH_PLATFORM_WINDOWS_UWP
+#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM64))
 
 /// Functionality to get the processor's cycle counter
 uint64 GetProcessorTickCount(); // Not inline to avoid having to include Windows.h
@@ -28,7 +28,7 @@ JPH_INLINE uint64 GetProcessorTickCount()
 	return __rdtsc();
 #elif defined(JPH_CPU_ARM64)
 	uint64 val;
-    asm volatile("mrs %0, cntvct_el0" : "=r" (val));
+	asm volatile("mrs %0, cntvct_el0" : "=r" (val));
 	return val;
 #elif defined(JPH_CPU_WASM)
 	return 0; // Not supported
@@ -37,7 +37,7 @@ JPH_INLINE uint64 GetProcessorTickCount()
 #endif
 }
 
-#endif // JPH_PLATFORM_WINDOWS_UWP
+#endif // JPH_PLATFORM_WINDOWS_UWP || (JPH_PLATFORM_WINDOWS && JPH_CPU_ARM64)
 
 /// Get the amount of ticks per second; note that this number will never be fully accurate as the amount of ticks per second may vary with CPU load, so it should only be used to give an indication of time for profiling purposes
 uint64 GetProcessorTicksPerSecond();
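
Windows ARM64 now takes the out-of-line QueryPerformanceCounter path declared above, like UWP, because __rdtsc is x86-only and the mrs read of cntvct_el0 below is GCC/Clang inline assembly that MSVC does not compile. A minimal sketch of that path (assumes Windows):

```cpp
#include <windows.h>
#include <cstdio>

int main()
{
	LARGE_INTEGER frequency { }, count { };
	QueryPerformanceFrequency(&frequency); // Tick rate, fixed at boot
	QueryPerformanceCounter(&count); // Current tick count
	printf("%.9f s\n", double(count.QuadPart) / double(frequency.QuadPart));
	return 0;
}
```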

+ 1 - 0
Jolt/Jolt.cmake

@@ -8,6 +8,7 @@ set(JOLT_PHYSICS_SRC_FILES
 	${JOLT_PHYSICS_ROOT}/AABBTree/AABBTreeToBuffer.h
 	${JOLT_PHYSICS_ROOT}/AABBTree/NodeCodec/NodeCodecQuadTreeHalfFloat.h
 	${JOLT_PHYSICS_ROOT}/AABBTree/TriangleCodec/TriangleCodecIndexed8BitPackSOA4Flags.h
+	${JOLT_PHYSICS_ROOT}/Core/ARMNeon.h
 	${JOLT_PHYSICS_ROOT}/Core/Atomics.h
 	${JOLT_PHYSICS_ROOT}/Core/ByteBuffer.h
 	${JOLT_PHYSICS_ROOT}/Core/Color.cpp

+ 1 - 0
Jolt/Jolt.h

@@ -5,6 +5,7 @@
 
 // Project includes
 #include <Jolt/Core/Core.h>
+#include <Jolt/Core/ARMNeon.h>
 #include <Jolt/Core/Memory.h>
 #include <Jolt/Core/STLAllocator.h>
 #include <Jolt/Core/IssueReporting.h>

+ 76 - 73
Jolt/Math/Mat44.inl

@@ -603,64 +603,65 @@ Mat44 Mat44::Inversed() const
 	return result;
 #elif defined(JPH_USE_NEON)
 	// Adapted from the SSE version; there are surprisingly few articles on the internet about efficient ways of calculating an inverse on ARM
-	Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = __builtin_shufflevector(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
-	Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
-	row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = __builtin_shufflevector(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
-	Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
-	row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
+	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
+	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
+	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
+	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
+	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
+	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
+	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
 
 	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	Type minor0 = vmulq_f32(row1, tmp1);
 	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
 	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
+	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
 
 	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
 	Type minor3 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
 	minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
-	minor3 = __builtin_shufflevector(minor3, minor3, 2, 3, 0, 1);
+	minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
 
-	tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
+	tmp1 = vmulq_f32(tmp1, row3);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
+	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
 	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
 	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
 	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
+	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
 
 	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
 	minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
 	minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
 
 	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
 	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
 	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
 
 	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
 	minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
 	minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
 
@@ -794,58 +795,59 @@ Mat44 Mat44::Adjointed3x3() const
 	return result;
 #elif defined(JPH_USE_NEON)
 	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
-	Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = __builtin_shufflevector(mCol[2].mValue, v0001, 0, 1, 4, 5);
-	Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
-	row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = __builtin_shufflevector(mCol[2].mValue, v0001, 2, 3, 6, 7);
-	Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
-	row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
+	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
+	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
+	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
+	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
+	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
+	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
+	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
 
 	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	Type minor0 = vmulq_f32(row1, tmp1);
 	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
 	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
+	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
 
 	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
 
-	tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
+	tmp1 = vmulq_f32(tmp1, row3);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
+	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
 	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
 	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
 	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
+	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
 
 	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
 
 	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
 	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
 	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
 
 	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
 	
 	Mat44 result;
@@ -950,58 +952,59 @@ Mat44 Mat44::Inversed3x3() const
 	return result;
 #elif defined(JPH_USE_NEON)
 	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
-	Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = __builtin_shufflevector(mCol[2].mValue, v0001, 0, 1, 4, 5);
-	Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
-	row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = __builtin_shufflevector(mCol[2].mValue, v0001, 2, 3, 6, 7);
-	Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
-	row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
+	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
+	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
+	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
+	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
+	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
+	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
+	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
 
 	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	Type minor0 = vmulq_f32(row1, tmp1);
 	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
 	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
+	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
 
 	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
 
-	tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
+	tmp1 = vmulq_f32(tmp1, row3);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
+	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
 	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
 	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
 	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
+	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
 
 	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
 
 	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
 	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
 	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
 
 	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
 	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
-	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
+	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
 	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
 
 	Type det = vmulq_f32(row0, minor0);
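
A quick way to sanity-check the rewritten NEON paths is to verify that M * M^-1 stays close to the identity. A hypothetical sketch against the public Mat44 API (assumes a Jolt build; IsClose uses its default tolerance):

```cpp
#include <Jolt/Jolt.h>
#include <Jolt/Math/Mat44.h>

using namespace JPH;

bool CheckInverse()
{
	// An arbitrary invertible rotation/translation matrix
	Mat44 m = Mat44::sRotationTranslation(Quat::sRotation(Vec3::sAxisY(), 0.5f), Vec3(1, 2, 3));

	// M * M^-1 should be (close to) the identity
	return (m * m.Inversed()).IsClose(Mat44::sIdentity());
}
```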

+ 16 - 2
Jolt/Math/Math.h

@@ -108,7 +108,15 @@ inline uint CountTrailingZeros(uint32 inValue)
 		return __builtin_ctz(inValue);
 	#endif
 #elif defined(JPH_CPU_ARM64)
-	return __builtin_clz(__builtin_bitreverse32(inValue));
+	#if defined(JPH_COMPILER_MSVC)
+		if (inValue == 0)
+			return 32;
+		unsigned long result;
+		_BitScanForward(&result, inValue);
+		return result;
+	#else
+		return __builtin_clz(__builtin_bitreverse32(inValue));
+	#endif
 #else
 	#error Undefined
 #endif
@@ -132,7 +140,11 @@ inline uint CountLeadingZeros(uint32 inValue)
 		return __builtin_clz(inValue);
 	#endif
 #elif defined(JPH_CPU_ARM64)
-	return __builtin_clz(inValue);
+	#if defined(JPH_COMPILER_MSVC)
+		return _CountLeadingZeros(inValue);
+	#else
+		return __builtin_clz(inValue);
+	#endif
 #else
 	#error Undefined
 #endif
@@ -146,6 +158,8 @@ inline uint CountBits(uint32 inValue)
 #elif defined(JPH_COMPILER_MSVC)
 	#if defined(JPH_USE_SSE4_2)
 		return _mm_popcnt_u32(inValue);
+	#elif defined(JPH_USE_NEON)
+		return _CountOneBits(inValue);
 	#else
 		inValue = inValue - ((inValue >> 1) & 0x55555555);
 		inValue = (inValue & 0x33333333) + ((inValue >> 2) & 0x33333333);
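
The new JPH_USE_NEON branch maps CountBits to MSVC's _CountOneBits intrinsic; the portable branch is the classic SWAR popcount. A compiler-independent sketch of that algorithm, with the standard final byte-summing step filled in (an assumption, since the hunk cuts off before it):

```cpp
#include <cassert>
#include <cstdint>

// Classic SWAR popcount: build 2-bit, then 4-bit partial sums, then add the
// four byte sums together via a multiply
static uint32_t CountBitsRef(uint32_t inValue)
{
	inValue = inValue - ((inValue >> 1) & 0x55555555);
	inValue = (inValue & 0x33333333) + ((inValue >> 2) & 0x33333333);
	inValue = (inValue + (inValue >> 4)) & 0x0f0f0f0f;
	return (inValue * 0x01010101) >> 24;
}

int main()
{
	assert(CountBitsRef(0) == 0);
	assert(CountBitsRef(0x80000001u) == 2);
	assert(CountBitsRef(0xffffffffu) == 32);
	return 0;
}
```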

+ 6 - 6
Jolt/Math/UVec4.inl

@@ -35,7 +35,7 @@ UVec4 UVec4::Swizzle() const
 #if defined(JPH_USE_SSE)
 	return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
 #elif defined(JPH_USE_NEON)
-	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
+	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
 #else
 	return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
 #endif
@@ -379,7 +379,7 @@ int UVec4::GetTrues() const
 #if defined(JPH_USE_SSE)
 	return _mm_movemask_ps(_mm_castsi128_ps(mValue));
 #elif defined(JPH_USE_NEON)
-    int32x4_t shift = { 0, 1, 2, 3 };
+    int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
     return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
 #else
 	return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
@@ -488,7 +488,7 @@ UVec4 UVec4::Expand4Byte0() const
 #if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
 #elif defined(JPH_USE_NEON)
-	int8x16_t idx = { 0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f };
+	int8x16_t idx = JPH_NEON_INT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
 	UVec4 result;
@@ -503,7 +503,7 @@ UVec4 UVec4::Expand4Byte4() const
 #if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
 #elif defined(JPH_USE_NEON)
-	int8x16_t idx = { 0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f };
+	int8x16_t idx = JPH_NEON_INT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
 	UVec4 result;
@@ -518,7 +518,7 @@ UVec4 UVec4::Expand4Byte8() const
 #if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
 #elif defined(JPH_USE_NEON)
-	int8x16_t idx = { 0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f };
+	int8x16_t idx = JPH_NEON_INT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
 	UVec4 result;
@@ -533,7 +533,7 @@ UVec4 UVec4::Expand4Byte12() const
 #if defined(JPH_USE_SSE4_1)
 	return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
 #elif defined(JPH_USE_NEON)
-	int8x16_t idx = { 0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f };
+	int8x16_t idx = JPH_NEON_INT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
 	return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
 #else
 	UVec4 result;
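
GetTrues above emulates SSE's _mm_movemask_ps: each 32-bit lane is shifted right by 31 to isolate its sign bit, lane i is shifted left by i, and a horizontal add packs lane i's sign into bit i of the result. A scalar sketch of the same packing:

```cpp
#include <cassert>
#include <cstdint>

// Scalar equivalent of vaddvq_u32(vshlq_u32(vshrq_n_u32(v, 31), { 0, 1, 2, 3 }))
static int GetTruesRef(const uint32_t inV[4])
{
	int result = 0;
	for (int i = 0; i < 4; ++i)
		result += int(inV[i] >> 31) << i; // Top bit of lane i becomes bit i
	return result;
}

int main()
{
	uint32_t v[] = { 0xffffffffu, 0, 0xffffffffu, 0 };
	assert(GetTruesRef(v) == 0b0101);
	return 0;
}
```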

+ 6 - 6
Jolt/Math/Vec3.inl

@@ -28,7 +28,7 @@ JPH_INLINE Vec3::Type Vec3::sFixW(Type inValue)
 	#if defined(JPH_USE_SSE)
 		return _mm_shuffle_ps(inValue, inValue, _MM_SHUFFLE(2, 2, 1, 0)); 
 	#elif defined(JPH_USE_NEON)
-		return __builtin_shufflevector(inValue, inValue, 0, 1, 2, 2);
+		return JPH_NEON_SHUFFLE_F32x4(inValue, inValue, 0, 1, 2, 2);
 	#else
 		Type value;
 		value.mData[0] = inValue.mData[0];
@@ -97,7 +97,7 @@ Vec3 Vec3::Swizzle() const
 #if defined(JPH_USE_SSE)
 	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleZ, SwizzleZ, SwizzleY, SwizzleX)); // Assure Z and W are the same
 #elif defined(JPH_USE_NEON)
-	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
+	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
 #else
 	return Vec3(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ]);
 #endif
@@ -588,12 +588,12 @@ Vec3 Vec3::Cross(Vec3Arg inV2) const
     Type t3 = _mm_sub_ps(t1, t2);
     return _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
 #elif defined(JPH_USE_NEON)
-	Type t1 = __builtin_shufflevector(inV2.mValue, inV2.mValue, 1, 2, 0, 0); // Assure Z and W are the same
+	Type t1 = JPH_NEON_SHUFFLE_F32x4(inV2.mValue, inV2.mValue, 1, 2, 0, 0); // Assure Z and W are the same
     t1 = vmulq_f32(t1, mValue);
-    Type t2 = __builtin_shufflevector(mValue, mValue, 1, 2, 0, 0); // Assure Z and W are the same
+    Type t2 = JPH_NEON_SHUFFLE_F32x4(mValue, mValue, 1, 2, 0, 0); // Assure Z and W are the same
     t2 = vmulq_f32(t2, inV2.mValue);
     Type t3 = vsubq_f32(t1, t2);
-    return __builtin_shufflevector(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
+    return JPH_NEON_SHUFFLE_F32x4(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
 #else
 	return Vec3(mF32[1] * inV2.mF32[2] - mF32[2] * inV2.mF32[1],
 				mF32[2] * inV2.mF32[0] - mF32[0] * inV2.mF32[2],
@@ -745,7 +745,7 @@ bool Vec3::IsNaN() const
 #elif defined(JPH_USE_SSE)
 	return (_mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) & 0x7) != 0;
 #elif defined(JPH_USE_NEON)
-	uint32x4_t mask = { 1, 1, 1, 0 };
+	uint32x4_t mask = JPH_NEON_UINT32x4(1, 1, 1, 0);
 	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
 	return vaddvq_u32(vandq_u32(is_equal, mask)) != 3;
 #else
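
The shuffle-based NEON cross product above evaluates the same expression as the scalar fallback. A scalar reference with a quick check that X cross Y gives Z:

```cpp
#include <array>
#include <cassert>

static std::array<float, 3> CrossRef(const std::array<float, 3> &inV1, const std::array<float, 3> &inV2)
{
	return { inV1[1] * inV2[2] - inV1[2] * inV2[1],
			 inV1[2] * inV2[0] - inV1[0] * inV2[2],
			 inV1[0] * inV2[1] - inV1[1] * inV2[0] };
}

int main()
{
	// X cross Y = Z
	assert((CrossRef({ 1, 0, 0 }, { 0, 1, 0 }) == std::array<float, 3> { 0, 0, 1 }));
	return 0;
}
```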

+ 2 - 2
Jolt/Math/Vec4.inl

@@ -53,7 +53,7 @@ Vec4 Vec4::Swizzle() const
 #if defined(JPH_USE_SSE)
 	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
 #elif defined(JPH_USE_NEON)
-	return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
+	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
 #else
 	return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
 #endif
@@ -747,7 +747,7 @@ int Vec4::GetSignBits() const
 #if defined(JPH_USE_SSE)
 	return _mm_movemask_ps(mValue);
 #elif defined(JPH_USE_NEON)
-    int32x4_t shift = { 0, 1, 2, 3 };
+    int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
     return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
 #else
 	return (signbit(mF32[0])? 1 : 0) | (signbit(mF32[1])? 2 : 0) | (signbit(mF32[2])? 4 : 0) | (signbit(mF32[3])? 8 : 0);

+ 1 - 1
README.md

@@ -81,7 +81,7 @@ For more information see the [Architecture and API documentation](https://jrouwe
 
 ## Supported Platforms
 
-* Windows (VS2019, VS2022) x64/x86 (Desktop/UWP)
+* Windows (VS2019, VS2022) x64/x86/ARM64 (Desktop/UWP)
 * Linux (tested on Ubuntu 20.04) x64/ARM64
 * Android (tested on Android 10) x64/ARM64
 * Platform Blue (a popular game console) x64

+ 4 - 3
UnitTests/Core/FPFlushDenormalsTest.cpp

@@ -3,10 +3,11 @@
 
 #include "UnitTestFramework.h"
 #include <Jolt/Core/FPFlushDenormals.h>
+#include <atomic>
 
-// Implemented as a global so the compiler can't optimize it to a constant
-extern volatile float TestFltMin;
-volatile float TestFltMin = FLT_MIN;
+// Implemented as a global atomic so the compiler can't optimize it to a constant
+extern atomic<float> TestFltMin;
+atomic<float> TestFltMin = FLT_MIN;
 
 TEST_SUITE("FlushDenormalsTests")
 {

+ 1 - 1
UnitTests/Physics/PhysicsTests.cpp

@@ -290,7 +290,7 @@ TEST_SUITE("PhysicsTests")
 		CHECK_APPROX_EQUAL(body->GetWorldTransform(), body_transform);
 		CHECK_APPROX_EQUAL(body->GetCenterOfMassPosition(), com_transform.GetTranslation());
 		CHECK_APPROX_EQUAL(body->GetCenterOfMassTransform(), com_transform);
-		CHECK_APPROX_EQUAL(body->GetInverseCenterOfMassTransform(), com_transform.InversedRotationTranslation());
+		CHECK_APPROX_EQUAL(body->GetInverseCenterOfMassTransform(), com_transform.InversedRotationTranslation(), 1.0e-5f);
 	}
 
 	TEST_CASE("TestPhysicsOverrideMassAndInertia")

+ 4 - 4
UnitTests/Physics/RayShapeTests.cpp

@@ -290,7 +290,7 @@ TEST_SUITE("RayShapeTests")
 			if (inExpectedFraction1 != FLT_MAX)
 			{
 				CHECK(system.GetNarrowPhaseQuery().CastRay(ray, hit));
-				CHECK_APPROX_EQUAL(hit.mFraction, inExpectedFraction1, 1.0e-5f);
+				CHECK_APPROX_EQUAL(hit.mFraction, inExpectedFraction1, 2.0e-5f);
 			}
 			else
 			{
@@ -324,7 +324,7 @@ TEST_SUITE("RayShapeTests")
 			if (inExpectedFraction1 != FLT_MAX)
 			{
 				CHECK(collector.mHits.size() == 1);
-				CHECK_APPROX_EQUAL(collector.mHits[0].mFraction, inExpectedFraction1, 1.0e-5f);
+				CHECK_APPROX_EQUAL(collector.mHits[0].mFraction, inExpectedFraction1, 2.0e-5f);
 			}
 			else
 			{
@@ -359,7 +359,7 @@ TEST_SUITE("RayShapeTests")
 			if (inExpectedFraction1 != FLT_MAX)
 			{
 				CHECK(collector.mHits.size() >= 1);
-				CHECK_APPROX_EQUAL(collector.mHits[0].mFraction, inExpectedFraction1, 1.0e-5f);
+				CHECK_APPROX_EQUAL(collector.mHits[0].mFraction, inExpectedFraction1, 2.0e-5f);
 			}
 			else
 			{
@@ -370,7 +370,7 @@ TEST_SUITE("RayShapeTests")
 			if (inExpectedFraction2 != FLT_MAX)
 			{
 				CHECK(collector.mHits.size() >= 2);
-				CHECK_APPROX_EQUAL(collector.mHits[1].mFraction, inExpectedFraction2, 1.0e-5f);
+				CHECK_APPROX_EQUAL(collector.mHits[1].mFraction, inExpectedFraction2, 2.0e-5f);
 			}
 			else
 			{

+ 1 - 1
UnitTests/Physics/WheeledVehicleTests.cpp

@@ -194,7 +194,7 @@ TEST_SUITE("WheeledVehicleTests")
 		CheckOnGround(constraint, settings, floor_id);
 		CHECK(!body->IsActive()); // Car should have gone sleeping
 		Vec3 pos3 = body->GetPosition();
-		CHECK_APPROX_EQUAL(pos3.GetX(), 0, 1.0e-3f); // Not moving left/right
+		CHECK_APPROX_EQUAL(pos3.GetX(), 0, 2.0e-3f); // Not moving left/right
 		CHECK(pos3.GetZ() > pos2.GetZ() + 1.0f); // Moving in Z direction while braking
 		vel = body->GetLinearVelocity();
 		CHECK_APPROX_EQUAL(vel, Vec3::sZero(), 1.0e-3f); // Not moving