
Optimized Adjointed3x3 and Inversed3x3 (#618)

The SIMD code based on the 4x4 inverse was slower than a more naive implementation
Jorrit Rouwe 2 years ago
commit b23f1c2b63
3 changed files with 14 additions and 302 deletions
  1. .github/workflows/determinism_check.yml (+1 -1)
  2. Docs/Architecture.md (+1 -1)
  3. Jolt/Math/Mat44.inl (+12 -300)

+ 1 - 1
.github/workflows/determinism_check.yml

@@ -2,7 +2,7 @@ name: Determinism Check

 env:
     CONVEX_VS_MESH_HASH: '0x412693f5fd7ee9f6'
-    RAGDOLL_HASH: '0x62f3cf349a172dd4'
+    RAGDOLL_HASH: '0xf6bc510ce5a03e4b'

 on:
   push:

+ 1 - 1
Docs/Architecture.md

@@ -380,7 +380,7 @@ If you want cross platform determinism then please turn on the CROSS_PLATFORM_DE
 * Compiler used to compile the library (tested MSVC2022 vs clang)
 * Configuration (Debug, Release or Distribution)
 * OS (tested Windows vs Linux)
-* Architecture (x86 or ARM)
+* Architecture (x86 or ARM). Note that 32-bit architectures are currently not compatible with 64-bit architectures.

 Note that the same source code must be used to compile the library on all platforms. Also note that it is quite difficult to verify cross platform determinism, so this feature is less tested than other features.

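For reference before the Mat44.inl diff: the adjugate of a 3x3 matrix is the transpose of its cofactor matrix, and column i of the adjugate is the cross product of input rows i+1 and i+2 (indices mod 3). The Vec4 expressions added below evaluate exactly these cross products, three lanes at a time. A minimal standalone scalar sketch of that naive formulation; the Mat3/Vec3 types and function names here are illustrative only, not Jolt's actual API:

	// Illustrative scalar sketch, not Jolt's Mat44 API
	#include <array>

	using Vec3 = std::array<float, 3>;
	using Mat3 = std::array<Vec3, 3>;	// m[row][col], row-major for this sketch

	static Vec3 Cross(const Vec3 &a, const Vec3 &b)
	{
		return { a[1] * b[2] - a[2] * b[1],
				 a[2] * b[0] - a[0] * b[2],
				 a[0] * b[1] - a[1] * b[0] };
	}

	Mat3 Adjoint3x3(const Mat3 &m)
	{
		Mat3 adj;
		for (int i = 0; i < 3; ++i)
		{
			// Column i of the adjugate is cross(row i+1, row i+2), indices mod 3
			Vec3 c = Cross(m[(i + 1) % 3], m[(i + 2) % 3]);
			for (int j = 0; j < 3; ++j)
				adj[j][i] = c[j];
		}
		return adj;
	}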
+ 12 - 300
Jolt/Math/Mat44.inl

@@ -731,316 +731,28 @@ float Mat44::GetDeterminant3x3() const

 Mat44 Mat44::Adjointed3x3() const
 {
-	// Adapted from Inversed() to remove 4th column and the division by the determinant
-	// Note: This can be optimized.
-
-	JPH_ASSERT(mCol[0][3] == 0.0f);
-	JPH_ASSERT(mCol[1][3] == 0.0f);
-	JPH_ASSERT(mCol[2][3] == 0.0f);
-
-#if defined(JPH_USE_SSE)
-	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
-	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
-	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-
-	tmp1 = _mm_mul_ps(row2, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	__m128 minor0 = _mm_mul_ps(row1, tmp1);
-	__m128 minor1 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
-	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
-	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row1, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
-
-	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
-	__m128 minor2 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
-	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row0, row1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
-
-	tmp1 = _mm_mul_ps(row0, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
-	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
-
-	tmp1 = _mm_mul_ps(row0, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
-		
-	Mat44 result;
-	result.mCol[0].mValue = minor0;
-	result.mCol[1].mValue = minor1;
-	result.mCol[2].mValue = minor2;
-	result.mCol[3] = Vec4(0, 0, 0, 1);
-	return result;
-#elif defined(JPH_USE_NEON)
-	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
-	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
-	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
-	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
-	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
-	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
-
-	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	Type minor0 = vmulq_f32(row1, tmp1);
-	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
-	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
-
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
-	tmp1 = vmulq_f32(tmp1, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
-	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
-	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
-	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
-
-	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
-	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
-	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
-
-	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
-	
-	Mat44 result;
-	result.mCol[0].mValue = minor0;
-	result.mCol[1].mValue = minor1;
-	result.mCol[2].mValue = minor2;
-	result.mCol[3].mValue = v0001;
-	return result;
-#else
 	return Mat44(
-		Vec4(JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1),
-			JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2),
-			JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0),
-			0),
-		Vec4(JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2),
-			JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0),
-			JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1),
-			0),
-		Vec4(JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1),
-			JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2),
-			JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0),
-			0),
+		Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
+			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0),
+		Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
+			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0),
+		Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
+			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0),
 		Vec4(0, 0, 0, 1));
-#endif
 }

 Mat44 Mat44::Inversed3x3() const
 {
-	// Adapted from Inversed() to remove 4th column
-	// Note: This can be optimized.
-
-	JPH_ASSERT(mCol[0][3] == 0.0f);
-	JPH_ASSERT(mCol[1][3] == 0.0f);
-	JPH_ASSERT(mCol[2][3] == 0.0f);
-
-#if defined(JPH_USE_SSE)
-	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
-	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
-	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
-	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
-	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
-
-	tmp1 = _mm_mul_ps(row2, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	__m128 minor0 = _mm_mul_ps(row1, tmp1);
-	__m128 minor1 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
-	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
-	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row1, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
-
-	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
-	__m128 minor2 = _mm_mul_ps(row0, tmp1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
-	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
-
-	tmp1 = _mm_mul_ps(row0, row1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
-
-	tmp1 = _mm_mul_ps(row0, row3);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
-	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
-	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
-
-	tmp1 = _mm_mul_ps(row0, row2);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
-	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
-	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
-	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
-
-	__m128 det = _mm_mul_ps(row0, minor0);
-	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
-	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
-	det = _mm_div_ss(_mm_set_ss(1.0f), det);
-	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
-	
-	Mat44 result;
-	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
-	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
-	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
-	result.mCol[3] = Vec4(0, 0, 0, 1);
-	return result;
-#elif defined(JPH_USE_NEON)
-	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
-	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
-	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
-	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
-	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
-	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
-	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
-	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
-
-	tmp1 = vmulq_f32(row2, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	Type minor0 = vmulq_f32(row1, tmp1);
-	Type minor1 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
-	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
-	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row1, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
-
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
-	tmp1 = vmulq_f32(tmp1, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
-	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
-	Type minor2 = vmulq_f32(row0, tmp1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
-	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
-	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
-
-	tmp1 = vmulq_f32(row0, row1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
-
-	tmp1 = vmulq_f32(row0, row3);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
-	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
-	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
-
-	tmp1 = vmulq_f32(row0, row2);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
-	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
-	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
-	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
-
-	Type det = vmulq_f32(row0, minor0);
-	det = vdupq_n_f32(vaddvq_f32(det));
-	det = vdivq_f32(vdupq_n_f32(1.0f), det);
-	
-	Mat44 result;
-	result.mCol[0].mValue = vmulq_f32(det, minor0);
-	result.mCol[1].mValue = vmulq_f32(det, minor1);
-	result.mCol[2].mValue = vmulq_f32(det, minor2);
-	result.mCol[3].mValue = v0001;
-	return result;
-#else
 	float det = GetDeterminant3x3();

 	return Mat44(
-		Vec4((JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1)) / det,
-			(JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2)) / det,
-			(JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0)) / det,
-			0),
-		Vec4((JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2)) / det,
-			(JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0)) / det,
-			(JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1)) / det,
-			0),
-		Vec4((JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1)) / det,
-			(JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2)) / det,
-			(JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0)) / det,
-			0),
+		(Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
+			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)) / det,
+		(Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
+			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)) / det,
+		(Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
+			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0)) / det,
 		Vec4(0, 0, 0, 1));
-#endif
 }

 Quat Mat44::GetQuaternion() const
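
The inverse follows the same structure: the new Inversed3x3 divides the identical adjugate columns by the 3x3 determinant from GetDeterminant3x3(), i.e. inverse = adjugate / det, with det(M) = row0 . (row1 x row2). Continuing the illustrative scalar sketch from above (same hypothetical Mat3/Vec3/Cross/Adjoint3x3 helpers, not Jolt's API):

	// Sketch only: inverse = adjugate / determinant; assumes det != 0
	Mat3 Inverse3x3(const Mat3 &m)
	{
		// det(M) = row0 . (row1 x row2), the scalar triple product of the rows
		Vec3 c = Cross(m[1], m[2]);
		float det = m[0][0] * c[0] + m[0][1] * c[1] + m[0][2] * c[2];

		// Divide every element of the adjugate by the determinant
		Mat3 inv = Adjoint3x3(m);
		for (Vec3 &row : inv)
			for (float &v : row)
				v /= det;
		return inv;
	}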