Explorar o código

Add initial support for AVX512 (#185)

* Add support for AVX512 target

Off by default

* - Add AVX512 implementation for `Vec{3,4}::GetSign`
- Add AVX512 implementation for `Vec{3,4}::Abs`
Wunk hai 3 anos
pai
achega
a830f47833

+ 2 - 2
.github/workflows/build.yml

@@ -90,7 +90,7 @@ jobs:
     - name: Add msbuild to PATH
       uses: microsoft/setup-msbuild@v1.0.2
     - name: Configure CMake
-      run: cmake -B ${{github.workspace}}/Build/VS2022_CL_32_BIT -G "Visual Studio 17 2022" -A Win32 -DUSE_SSE4_1=OFF -DUSE_SSE4_2=OFF -DUSE_AVX=OFF -DUSE_AVX2=OFF -DUSE_LZCNT=OFF -DUSE_TZCNT=OFF -DUSE_F16C=OFF -DUSE_FMADD=OFF Build
+      run: cmake -B ${{github.workspace}}/Build/VS2022_CL_32_BIT -G "Visual Studio 17 2022" -A Win32 -DUSE_SSE4_1=OFF -DUSE_SSE4_2=OFF -DUSE_AVX=OFF -DUSE_AVX2=OFF -DUSE_AVX512=OFF -DUSE_LZCNT=OFF -DUSE_TZCNT=OFF -DUSE_F16C=OFF -DUSE_FMADD=OFF Build
     - name: Build
       run: msbuild Build\VS2022_CL_32_BIT\JoltPhysics.sln /property:Configuration=${{matrix.build_type}}
     - name: Test
@@ -111,7 +111,7 @@ jobs:
       uses: actions/checkout@v2
     - name: Configure CMake
       # github macos-latest runs on a 2013 Ivy Bridge CPU so doesn't have AVX2, LZCNT, TZCNT or FMADD
-      run: cmake -B ${{github.workspace}}/Build/MacOS_${{matrix.build_type}}_${{matrix.clang_version}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DCMAKE_CXX_COMPILER=${{matrix.clang_version}} -DUSE_AVX2=OFF -DUSE_LZCNT=OFF -DUSE_TZCNT=OFF -DUSE_FMADD=OFF Build
+      run: cmake -B ${{github.workspace}}/Build/MacOS_${{matrix.build_type}}_${{matrix.clang_version}} -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DCMAKE_CXX_COMPILER=${{matrix.clang_version}} -DUSE_AVX2=OFF -DUSE_AVX512=OFF -DUSE_LZCNT=OFF -DUSE_TZCNT=OFF -DUSE_FMADD=OFF Build
     - name: Build
       run: cmake --build ${{github.workspace}}/Build/MacOS_${{matrix.build_type}}_${{matrix.clang_version}} --config ${{matrix.build_type}}
     - name: Test

+ 10 - 3
Build/CMakeLists.txt

@@ -17,6 +17,7 @@ option(USE_SSE4_1 "Enable SSE4.1" ON)
 option(USE_SSE4_2 "Enable SSE4.2" ON)
 option(USE_AVX "Enable AVX" ON)
 option(USE_AVX2 "Enable AVX2" ON)
+option(USE_AVX512 "Enable AVX512" OFF)
 option(USE_LZCNT "Enable LZCNT" ON)
 option(USE_TZCNT "Enable TZCNT" ON)
 option(USE_F16C "Enable F16C" ON)
@@ -63,7 +64,9 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUAL
 		else()
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") # Clang doesn't use fast math because it cannot be turned off inside a single compilation unit
 		endif()
-		if (USE_AVX2)
+		if (USE_AVX512)
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
+		elseif (USE_AVX2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
 		elseif (USE_AVX)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
@@ -92,7 +95,9 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUAL
 		set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "/LTCG")
 	elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /showFilenames")
-		if (USE_AVX2)
+		if (USE_AVX512)
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vl -mavx512dq -mavx2 -mbmi -mpopcnt -mlzcnt -mf16c")
+		elseif (USE_AVX2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mpopcnt -mlzcnt -mf16c")
 		elseif (USE_AVX)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mpopcnt")
@@ -135,7 +140,9 @@ elseif ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR "${CMAKE_SYSTEM_NAME}" STREQU
 	# Platform specific compiler flags
 	if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64")
 		# X64
-		if (USE_AVX2)
+		if (USE_AVX512)
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vl -mavx512dq -mavx2 -mbmi -mpopcnt -mlzcnt -mf16c")
+		elseif (USE_AVX2)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mpopcnt -mlzcnt -mf16c")
 		elseif (USE_AVX)
 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mpopcnt")

+ 1 - 0
Build/README.md

@@ -34,6 +34,7 @@ There are a number of user configurable defines that turn on/off certain feature
 - JPH_USE_TZCNT - Enable the tzcnt CPU instruction (x86/x64 only)
 - JPH_USE_AVX - Enable AVX CPU instructions (x86/x64 only)
 - JPH_USE_AVX2 - Enable AVX2 CPU instructions (x86/x64 only)
+- JPH_USE_AVX512 - Enable AVX512F+AVX512VL+AVX512DQ CPU instructions (x86/x64 only)
 - JPH_USE_FMADD - Enable fused multiply add CPU instructions (x86/x64 only)
 
 ## Logging & Asserting

+ 1 - 1
Build/cmake_vs2022_cl_32bit.bat

@@ -1,3 +1,3 @@
 @echo off
-cmake -S . -B VS2022_CL_32BIT -G "Visual Studio 17 2022" -A Win32 -DUSE_SSE4_1=OFF -DUSE_SSE4_2=OFF -DUSE_AVX=OFF -DUSE_AVX2=OFF -DUSE_LZCNT=OFF -DUSE_TZCNT=OFF -DUSE_F16C=OFF -DUSE_FMADD=OFF
+cmake -S . -B VS2022_CL_32BIT -G "Visual Studio 17 2022" -A Win32 -DUSE_SSE4_1=OFF -DUSE_SSE4_2=OFF -DUSE_AVX=OFF -DUSE_AVX2=OFF -DUSE_AVX512=OFF -DUSE_LZCNT=OFF -DUSE_TZCNT=OFF -DUSE_F16C=OFF -DUSE_FMADD=OFF
 echo Open VS2022_CL_32BIT\JoltPhysics.sln to build the project.

+ 3 - 0
Jolt/Core/Core.h

@@ -74,6 +74,9 @@
 	#if defined(__AVX2__) && !defined(JPH_USE_AVX2)
 		#define JPH_USE_AVX2
 	#endif
+	#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && !defined(JPH_USE_AVX512)
+		#define JPH_USE_AVX512
+	#endif
 	#ifndef JPH_CROSS_PLATFORM_DETERMINISTIC // FMA is not compatible with cross platform determinism
 		#if defined(JPH_COMPILER_CLANG) || defined(JPH_COMPILER_GCC)
 			#if defined(__FMA__) && !defined(JPH_USE_FMADD)

+ 6 - 2
Jolt/Math/Vec3.inl

@@ -501,7 +501,9 @@ int Vec3::GetHighestComponentIndex() const
 
 Vec3 Vec3::Abs() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_AVX512)
+	return _mm_range_ps(mValue, mValue, 0b1000);
+#elif defined(JPH_USE_SSE)
 	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
 #elif defined(JPH_USE_NEON)
 	return vabsq_f32(mValue);
@@ -755,7 +757,9 @@ Vec3 Vec3::GetNormalizedPerpendicular() const
 
 Vec3 Vec3::GetSign() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_AVX512)
+	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
+#elif defined(JPH_USE_SSE)
 	Type minus_one = _mm_set1_ps(-1.0f);
 	Type one = _mm_set1_ps(1.0f);
 	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);

+ 6 - 2
Jolt/Math/Vec4.inl

@@ -540,7 +540,9 @@ Vec4 Vec4::SplatW() const
 
 Vec4 Vec4::Abs() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_AVX512)
+	return _mm_range_ps(mValue, mValue, 0b1000);
+#elif defined(JPH_USE_SSE)
 	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
 #elif defined(JPH_USE_NEON)
 	return vabsq_f32(mValue);
@@ -626,7 +628,9 @@ Vec4 Vec4::Sqrt() const
 
 Vec4 Vec4::GetSign() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_AVX512)
+	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
+#elif defined(JPH_USE_SSE)
 	Type minus_one = _mm_set1_ps(-1.0f);
 	Type one = _mm_set1_ps(1.0f);
 	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);

+ 1 - 1
README.md

@@ -88,7 +88,7 @@ For more information see the [Architecture and API documentation](https://jrouwe
 
 ## Required CPU features
 
-* On x86 the minimal requirements are SSE2 but the library can be compiled using SSE4.1, SSE4.2, AVX or AVX2.
+* On x86 the minimal requirements are SSE2 but the library can be compiled using SSE4.1, SSE4.2, AVX, AVX2, or AVX512.
 * On ARM64 the library requires NEON with FP16 support.
 
 ## Compiling