2
0
View source

Optimized Adjointed3x3 and Inversed3x3 (#618)

The SIMD code based on the 4x4 inverse was slower than a more naive implementation
Jorrit Rouwe 2 years ago
parent
commit
b23f1c2b63

+ 1 - 1
.github/workflows/determinism_check.yml

@@ -2,7 +2,7 @@ name: Determinism Check
 
 env:
     CONVEX_VS_MESH_HASH: '0x412693f5fd7ee9f6'
-    RAGDOLL_HASH: '0x62f3cf349a172dd4'
+    RAGDOLL_HASH: '0xf6bc510ce5a03e4b'
 
 on:
   push:

+ 1 - 1
Docs/Architecture.md

@@ -380,7 +380,7 @@ If you want cross platform determinism then please turn on the CROSS_PLATFORM_DE
 * Compiler used to compile the library (tested MSVC2022 vs clang)
 * Configuration (Debug, Release or Distribution)
 * OS (tested Windows vs Linux)
-* Architecture (x86 or ARM)
+* Architecture (x86 or ARM). Note that 32-bit architectures are currently not compatible with 64-bit architectures.
 
 Note that the same source code must be used to compile the library on all platforms. Also note that it is quite difficult to verify cross platform determinism, so this feature is less tested than other features.
 

+ 12 - 300
Jolt/Math/Mat44.inl

@@ -731,316 +731,28 @@ float Mat44::GetDeterminant3x3() const
 
 Mat44 Mat44::Adjointed3x3() const
 {
-	// Adapted from Inversed() to remove 4th column and the division by the determinant
-	// Note: This can be optimized.
-
-	JPH_ASSERT(mCol[0][3] == 0.0f);
-	JPH_ASSERT(mCol[1][3] == 0.0f);
-	JPH_ASSERT(mCol[2][3] == 0.0f);
-
-#if defined(JPH_USE_SSE)
-	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
-	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
-	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-
-	tmp1 = _mm_mul_ps(row2, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	__m128 minor0 = _mm_mul_ps(row1, tmp1);
-	__m128 minor1 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
-	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
-	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row1, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
-
-	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
-	__m128 minor2 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
-	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row0, row1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
-
-	tmp1 = _mm_mul_ps(row0, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
-	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
-
-	tmp1 = _mm_mul_ps(row0, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
-		
-	Mat44 result;
-	result.mCol[0].mValue = minor0;
-	result.mCol[1].mValue = minor1;
-	result.mCol[2].mValue = minor2;
-	result.mCol[3] = Vec4(0, 0, 0, 1);
-	return result;
-#elif defined(JPH_USE_NEON)
-	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
-	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
-	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
-	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
-	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
-	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
-
-	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	Type minor0 = vmulq_f32(row1, tmp1);
-	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
-	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
-
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
-	tmp1 = vmulq_f32(tmp1, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
-	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
-	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
-	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
-
-	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
-	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
-	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
-
-	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
-	
-	Mat44 result;
-	result.mCol[0].mValue = minor0;
-	result.mCol[1].mValue = minor1;
-	result.mCol[2].mValue = minor2;
-	result.mCol[3].mValue = v0001;
-	return result;
-#else
 	return Mat44(
-		Vec4(JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1),
-			JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2),
-			JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0),
-			0),
-		Vec4(JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2),
-			JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0),
-			JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1),
-			0),
-		Vec4(JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1),
-			JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2),
-			JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0),
-			0),
+		Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
+			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0),
+		Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
+			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0),
+		Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
+			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0),
 		Vec4(0, 0, 0, 1));
-#endif
 }
 
 Mat44 Mat44::Inversed3x3() const
 {
-	// Adapted from Inversed() to remove 4th column
-	// Note: This can be optimized.
-
-	JPH_ASSERT(mCol[0][3] == 0.0f);
-	JPH_ASSERT(mCol[1][3] == 0.0f);
-	JPH_ASSERT(mCol[2][3] == 0.0f);
-
-#if defined(JPH_USE_SSE)
-	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
-	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
-	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-
-	tmp1 = _mm_mul_ps(row2, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	__m128 minor0 = _mm_mul_ps(row1, tmp1);
-	__m128 minor1 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
-	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
-	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row1, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
-
-	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
-	__m128 minor2 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
-	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row0, row1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
-
-	tmp1 = _mm_mul_ps(row0, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
-	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
-
-	tmp1 = _mm_mul_ps(row0, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
-
-	__m128 det = _mm_mul_ps(row0, minor0);
-	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
-	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
-	det = _mm_div_ss(_mm_set_ss(1.0f), det);
-	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
-	
-	Mat44 result;
-	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
-	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
-	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
-	result.mCol[3] = Vec4(0, 0, 0, 1);
-	return result;
-#elif defined(JPH_USE_NEON)
-	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
-	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
-	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
-	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
-	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
-	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
-
-	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	Type minor0 = vmulq_f32(row1, tmp1);
-	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
-	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
-
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
-	tmp1 = vmulq_f32(tmp1, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
-	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
-	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
-	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
-
-	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
-	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
-	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
-
-	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
-
-	Type det = vmulq_f32(row0, minor0);
-	det = vdupq_n_f32(vaddvq_f32(det));
-	det = vdivq_f32(vdupq_n_f32(1.0f), det);
-	
-	Mat44 result;
-	result.mCol[0].mValue = vmulq_f32(det, minor0);
-	result.mCol[1].mValue = vmulq_f32(det, minor1);
-	result.mCol[2].mValue = vmulq_f32(det, minor2);
-	result.mCol[3].mValue = v0001;
-	return result;
-#else
 	float det = GetDeterminant3x3();
 
 	return Mat44(
-		Vec4((JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1)) / det,
-			(JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2)) / det,
-			(JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0)) / det,
-			0),
-		Vec4((JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2)) / det,
-			(JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0)) / det,
-			(JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1)) / det,
-			0),
-		Vec4((JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1)) / det,
-			(JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2)) / det,
-			(JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0)) / det,
-			0),
+		(Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
+			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)) / det,
+		(Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
+			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)) / det,
+		(Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
+			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0)) / det,
 		Vec4(0, 0, 0, 1));
-#endif
 }
 
 Quat Mat44::GetQuaternion() const