Explorar el Código

AVX512 Tweaks (#294)

* Add `UVec4::sNot` unit test

* Add AVX512 acceleration for UVec4::sNot

Single-instruction `not` operation using the `vpternlog{d,q}`
instruction.

* Add AVX512 acceleration for Vec{3,4}::IsNaN

Utilize `vfpclassps` which can classify each floating point element
directly into a mask register. In this case, most compilers will run
'kortestb' and `setne al` immediately after this instruction. The SSE
implementation needs an additional temporary-register.

* Add AVX512 acceleration for Mat44::GetRotationSafe

Single-instruction `vmovaps` with no dependency on a temporary
zero-register.

* Add AVX512 acceleration for Vec8::Abs

* Add AVX512 acceleration for DVec3::Abs
Wunk hace 2 años
padre
commit
af863a8835

+ 4 - 0
Jolt/Math/DVec3.inl

@@ -215,7 +215,11 @@ DVec3 DVec3::operator / (DVec3Arg inV2) const
 
 DVec3 DVec3::Abs() const
 {
+#if defined(JPH_USE_AVX512)
+	return _mm256_range_pd(mValue, mValue, 0b1000);
+#else
 	return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), mValue), mValue);
+#endif
 }
 
 DVec3 DVec3::Reciprocal() const

+ 6 - 1
Jolt/Math/Mat44.inl

@@ -1120,7 +1120,12 @@ Mat44 Mat44::GetRotation() const
 
 Mat44 Mat44::GetRotationSafe() const
 { 
-#if defined(JPH_USE_SSE4_1)
+#if defined(JPH_USE_AVX512)
+	return Mat44(_mm_maskz_mov_ps(0b0111, mCol[0].mValue),
+				 _mm_maskz_mov_ps(0b0111, mCol[1].mValue),
+				 _mm_maskz_mov_ps(0b0111, mCol[2].mValue),
+				 Vec4(0, 0, 0, 1)); 
+#elif defined(JPH_USE_SSE4_1)
 	__m128 zero = _mm_setzero_ps(); 
 	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
 				 _mm_blend_ps(mCol[1].mValue, zero, 8),

+ 3 - 1
Jolt/Math/UVec4.inl

@@ -207,7 +207,9 @@ UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
 
 UVec4 UVec4::sNot(UVec4Arg inV1)
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_AVX512)
+	return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
+#elif defined(JPH_USE_SSE)
 	return sXor(inV1, sReplicate(0xffffffff));
 #elif defined(JPH_USE_NEON)
 	return vmvnq_u32(inV1.mValue);

+ 4 - 2
Jolt/Math/Vec3.inl

@@ -739,8 +739,10 @@ bool Vec3::IsNormalized(float inTolerance) const
 }
 
 bool Vec3::IsNaN() const
-{	
-#if defined(JPH_USE_SSE)
+{
+#if defined(JPH_USE_AVX512)
+	return (_mm_fpclass_ps_mask(mValue, 0b10000001) & 0x7) != 0;
+#elif defined(JPH_USE_SSE)
 	return (_mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) & 0x7) != 0;
 #elif defined(JPH_USE_NEON)
 	uint32x4_t mask = { 1, 1, 1, 0 };

+ 3 - 1
Jolt/Math/Vec4.inl

@@ -362,7 +362,9 @@ bool Vec4::IsNormalized(float inTolerance) const
 
 bool Vec4::IsNaN() const
 {
-#if defined(JPH_USE_SSE)
+#if defined(JPH_USE_AVX512)
+	return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
+#elif defined(JPH_USE_SSE)
 	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
 #elif defined(JPH_USE_NEON)
 	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN

+ 4 - 0
Jolt/Math/Vec8.inl

@@ -122,7 +122,11 @@ Vec8 Vec8::Swizzle() const
 
 Vec8 Vec8::Abs() const
 {
+#if defined(JPH_USE_AVX512)
+	return _mm256_range_ps(mValue, mValue, 0b1000);
+#else
 	return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), mValue), mValue);
+#endif
 }
 	
 Vec4 Vec8::LowerVec4() const

+ 3 - 0
UnitTests/Math/UVec4Tests.cpp

@@ -128,6 +128,9 @@ TEST_SUITE("UVec4Tests")
 		CHECK(UVec4::sXor(v1, v2) == UVec4(0b0110, 0b01100, 0b011000, 0b0110000));
 		CHECK(UVec4::sAnd(v1, v2) == UVec4(0b0001, 0b00010, 0b000100, 0b0001000));
 
+		CHECK(UVec4::sNot(v1) == UVec4(0xfffffffcU, 0xfffffff9U, 0xfffffff3U, 0xffffffe7U));
+		CHECK(UVec4::sNot(v2) == UVec4(0xfffffffaU, 0xfffffff5U, 0xffffffebU, 0xffffffd7U));
+
 		CHECK(UVec4(0x80000000U, 0x40000000U, 0x20000000U, 0x10000000U).LogicalShiftRight<1>() == UVec4(0x40000000U, 0x20000000U, 0x10000000U, 0x08000000U));
 		CHECK(UVec4(0x80000000U, 0x40000000U, 0x20000000U, 0x10000000U).ArithmeticShiftRight<1>() == UVec4(0xC0000000U, 0x20000000U, 0x10000000U, 0x08000000U));
 		CHECK(UVec4(0x40000000U, 0x20000000U, 0x10000000U, 0x08000001U).LogicalShiftLeft<1>() == UVec4(0x80000000U, 0x40000000U, 0x20000000U, 0x10000002U));