
Implemented NEON versions of DVec3 and DMat44 (#394)

Gives approximately a 1% performance boost in double-precision mode on ARM
Jorrit Rouwe 2 years ago
parent commit 348f9e4263
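For orientation, here is a minimal, self-contained sketch (not part of the commit) of the storage scheme the NEON path adopts: a double-precision 3-vector held in a float64x2x2_t, with x and y in val[0] and z duplicated into val[1]. The type DVec3Sketch, its members, and the main() driver are invented for illustration only; the add and dot-product bodies mirror the intrinsics used in the diff (vaddq_f64, vmulq_f64, vaddvq_f64 plus vgetq_lane_f64). It assumes an AArch64 toolchain with <arm_neon.h>; the f64 intrinsics used here are A64-only.

#include <arm_neon.h>
#include <cstdio>

// Hypothetical stand-in for Jolt's DVec3: two 128-bit registers hold { x, y } and { z, z }
struct DVec3Sketch
{
	float64x2x2_t	mValue;

	DVec3Sketch(double inX, double inY, double inZ)
	{
		double xy[2] = { inX, inY };
		mValue.val[0] = vld1q_f64(xy);		// low half: x, y
		mValue.val[1] = vdupq_n_f64(inZ);	// high half: z duplicated so the unused lane stays well-defined
	}

	// Element-wise add, following the pattern of DVec3::operator+ (DVec3Arg) in the diff
	DVec3Sketch		operator + (const DVec3Sketch &inRHS) const
	{
		DVec3Sketch result(*this);
		result.mValue.val[0] = vaddq_f64(mValue.val[0], inRHS.mValue.val[0]);
		result.mValue.val[1] = vaddq_f64(mValue.val[1], inRHS.mValue.val[1]);
		return result;
	}

	// 3-component dot product using the same reduction as DVec3::Dot in the diff:
	// horizontal add of the low product pair plus lane 0 of the high product pair
	double			Dot(const DVec3Sketch &inRHS) const
	{
		float64x2_t mul_low = vmulq_f64(mValue.val[0], inRHS.mValue.val[0]);
		float64x2_t mul_high = vmulq_f64(mValue.val[1], inRHS.mValue.val[1]);
		return vaddvq_f64(mul_low) + vgetq_lane_f64(mul_high, 0);
	}
};

int main()
{
	DVec3Sketch a(1.0, 2.0, 3.0), b(4.0, 5.0, 6.0);
	DVec3Sketch sum = a + b;
	std::printf("sum = (%g, %g, %g), dot = %g\n",
		vgetq_lane_f64(sum.mValue.val[0], 0), vgetq_lane_f64(sum.mValue.val[0], 1), vgetq_lane_f64(sum.mValue.val[1], 0),
		a.Dot(b));	// prints sum = (5, 7, 9), dot = 32
	return 0;
}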

+ 54 - 1
Jolt/Math/DMat44.inl

@@ -79,6 +79,13 @@ DVec3 DMat44::operator * (Vec3Arg inV) const
 	__m128d low = _mm_add_pd(mCol3.mValue.mLow, _mm_cvtps_pd(t));
 	__m128d high = _mm_add_pd(mCol3.mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 2, 2, 2))));
 	return DVec3({ low, high });
+#elif defined(JPH_USE_NEON)
+	float32x4_t t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
+	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
+	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
+	float64x2_t low = vaddq_f64(mCol3.mValue.val[0], vcvt_f64_f32(vget_low_f32(t)));
+	float64x2_t high = vaddq_f64(mCol3.mValue.val[1], vcvt_high_f64_f32(t));
+	return DVec3::sFixW({ low, high });
 #else
 	return DVec3(
 		mCol3.mF64[0] + double(mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2]), 
@@ -108,6 +115,20 @@ DVec3 DMat44::operator * (DVec3Arg inV) const
 	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
 	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
 	return DVec3({ t_low, t_high });
+#elif defined(JPH_USE_NEON)
+	float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
+	float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
+	float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
+	float32x4_t col0 = mCol[0].mValue;
+	float32x4_t col1 = mCol[1].mValue;
+	float32x4_t col2 = mCol[2].mValue;
+	float64x2_t t_low = vaddq_f64(mCol3.mValue.val[0], vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx));
+	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
+	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
+	float64x2_t t_high = vaddq_f64(mCol3.mValue.val[1], vmulq_f64(vcvt_high_f64_f32(col0), xxxx));
+	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
+	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
+	return DVec3::sFixW({ t_low, t_high });
 #else
 	return DVec3(
 		mCol3.mF64[0] + double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2], 
@@ -137,6 +158,20 @@ DVec3 DMat44::Multiply3x3(DVec3Arg inV) const
 	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
 	t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
 	return DVec3({ t_low, t_high });
+#elif defined(JPH_USE_NEON)
+	float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
+	float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
+	float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
+	float32x4_t col0 = mCol[0].mValue;
+	float32x4_t col1 = mCol[1].mValue;
+	float32x4_t col2 = mCol[2].mValue;
+	float64x2_t t_low = vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx);
+	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
+	t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
+	float64x2_t t_high = vmulq_f64(vcvt_high_f64_f32(col0), xxxx);
+	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
+	t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
+	return DVec3::sFixW({ t_low, t_high });
 #else
 	return DVec3(
 		double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2], 
@@ -159,6 +194,15 @@ DMat44 DMat44::operator * (Mat44Arg inM) const
 		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
 		result.mCol[i].mValue = t;
 	}
+#elif defined(JPH_USE_NEON)
+	for (int i = 0; i < 3; ++i)
+	{
+		Type c = inM.GetColumn4(i).mValue;
+		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
+		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
+		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
+		result.mCol[i].mValue = t;
+	}
 #else
 	for (int i = 0; i < 3; ++i)
 	{
@@ -187,13 +231,22 @@ DMat44 DMat44::operator * (DMat44Arg inM) const
 		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
 		result.mCol[i].mValue = t;
 	}
+#elif defined(JPH_USE_NEON)
+	for (int i = 0; i < 3; ++i)
+	{
+		Type c = inM.GetColumn4(i).mValue;
+		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
+		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
+		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
+		result.mCol[i].mValue = t;
+	}
 #else
 	for (int i = 0; i < 3; ++i)
 	{
 		Vec4 coli = inM.mCol[i];
 		result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
 	}
-#endif // JPH_USE_SSE
+#endif
 
 	// Translation part
 	result.mCol3 = *this * inM.GetTranslation();

+ 12 - 1
Jolt/Math/DVec3.h

@@ -21,6 +21,9 @@ public:
 #elif defined(JPH_USE_SSE)
 	using Type = struct { __m128d mLow, mHigh; };
 	using TypeArg = const Type &;
+#elif defined(JPH_USE_NEON)
+	using Type = float64x2x2_t;
+	using TypeArg = const Type &;
 #else
 	using Type = struct { double mData[4]; };
 	using TypeArg = const Type &;
@@ -128,13 +131,21 @@ public:
 	/// Get individual components
 #if defined(JPH_USE_AVX)
 	JPH_INLINE double			GetX() const									{ return _mm_cvtsd_f64(_mm256_castpd256_pd128(mValue)); }
+	JPH_INLINE double			GetY() const									{ return mF64[1]; }
+	JPH_INLINE double			GetZ() const									{ return mF64[2]; }
 #elif defined(JPH_USE_SSE)
 	JPH_INLINE double			GetX() const									{ return _mm_cvtsd_f64(mValue.mLow); }
+	JPH_INLINE double			GetY() const									{ return mF64[1]; }
+	JPH_INLINE double			GetZ() const									{ return _mm_cvtsd_f64(mValue.mHigh); }
+#elif defined(JPH_USE_NEON)
+	JPH_INLINE double			GetX() const									{ return vgetq_lane_f64(mValue.val[0], 0); }
+	JPH_INLINE double			GetY() const									{ return vgetq_lane_f64(mValue.val[0], 1); }
+	JPH_INLINE double			GetZ() const									{ return vgetq_lane_f64(mValue.val[1], 0); }
 #else
 	JPH_INLINE double			GetX() const									{ return mF64[0]; }
-#endif
 	JPH_INLINE double			GetY() const									{ return mF64[1]; }
 	JPH_INLINE double			GetZ() const									{ return mF64[2]; }
+#endif
 	
 	/// Set individual components
 	JPH_INLINE void				SetX(double inX)								{ mF64[0] = inX; }

+ 119 - 0
Jolt/Math/DVec3.inl

@@ -17,6 +17,9 @@ DVec3::DVec3(Vec3Arg inRHS)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_cvtps_pd(inRHS.mValue);
 	mValue.mHigh = _mm_cvtps_pd(_mm_shuffle_ps(inRHS.mValue, inRHS.mValue, _MM_SHUFFLE(2, 2, 2, 2)));
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vcvt_f64_f32(vget_low_f32(inRHS.mValue));
+	mValue.val[1] = vcvt_high_f64_f32(inRHS.mValue);
 #else
 	mF64[0] = (double)inRHS.GetX();
 	mF64[1] = (double)inRHS.GetY();
@@ -39,6 +42,9 @@ DVec3::DVec3(double inX, double inY, double inZ)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_set_pd(inY, inX);
 	mValue.mHigh = _mm_set_pd1(inZ);
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vcombine_f64(vcreate_f64(*reinterpret_cast<uint64 *>(&inX)), vcreate_f64(*reinterpret_cast<uint64 *>(&inY)));
+	mValue.val[1] = vdupq_n_f64(inZ);
 #else
 	mF64[0] = inX;
 	mF64[1] = inY;
@@ -60,6 +66,9 @@ DVec3::DVec3(const Double3 &inV)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_load_pd(&inV.x);
 	mValue.mHigh = _mm_set_pd1(inV.z);
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vld1q_f64(&inV.x);
+	mValue.val[1] = vdupq_n_f64(inV.z);
 #else
 	mF64[0] = inV.x;
 	mF64[1] = inV.y;
@@ -89,6 +98,11 @@ DVec3::Type DVec3::sFixW(TypeArg inValue)
 		value.mLow = inValue.mLow;
 		value.mHigh = _mm_shuffle_pd(inValue.mHigh, inValue.mHigh, 0);
 		return value;
+	#elif defined(JPH_USE_NEON)
+		Type value;
+		value.val[0] = inValue.val[0];
+		value.val[1] = vdupq_laneq_f64(inValue.val[1], 0);
+		return value;
 	#else
 		Type value;
 		value.mData[0] = inValue.mData[0];
@@ -109,6 +123,9 @@ DVec3 DVec3::sZero()
 #elif defined(JPH_USE_SSE)
 	__m128d zero = _mm_setzero_pd();
 	return DVec3({ zero, zero });
+#elif defined(JPH_USE_NEON)
+	float64x2_t zero = vdupq_n_f64(0.0);
+	return DVec3({ zero, zero });
 #else
 	return DVec3(0, 0, 0);
 #endif
@@ -121,6 +138,9 @@ DVec3 DVec3::sReplicate(double inV)
 #elif defined(JPH_USE_SSE)
 	__m128d value = _mm_set1_pd(inV);
 	return DVec3({ value, value });
+#elif defined(JPH_USE_NEON)
+	float64x2_t value = vdupq_n_f64(inV);
+	return DVec3({ value, value });
 #else
 	return DVec3(inV, inV, inV);
 #endif
@@ -139,6 +159,8 @@ DVec3 DVec3::sLoadDouble3Unsafe(const Double3 &inV)
 	Type v;
 	v.mLow = _mm_loadu_pd(&inV.x);
 	v.mHigh = _mm_set1_pd(inV.z);
+#elif defined(JPH_USE_NEON)
+	Type v = vld1q_f64_x2(&inV.x);
 #else
 	Type v = { inV.x, inV.y, inV.z };
 #endif
@@ -160,6 +182,8 @@ DVec3::operator Vec3() const
 	__m128 low = _mm_cvtpd_ps(mValue.mLow);
 	__m128 high = _mm_cvtpd_ps(mValue.mHigh);
 	return _mm_shuffle_ps(low, high, _MM_SHUFFLE(1, 0, 1, 0));
+#elif defined(JPH_USE_NEON)
+	return vcvt_high_f32_f64(vcvtx_f32_f64(mValue.val[0]), mValue.val[1]);
 #else
 	return Vec3((float)GetX(), (float)GetY(), (float)GetZ());
 #endif
@@ -171,6 +195,8 @@ DVec3 DVec3::sMin(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_min_pd(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_min_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_min_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vminq_f64(inV1.mValue.val[0], inV2.mValue.val[0]), vminq_f64(inV1.mValue.val[1], inV2.mValue.val[1]) });
 #else
 	return DVec3(min(inV1.mF64[0], inV2.mF64[0]), 
 				 min(inV1.mF64[1], inV2.mF64[1]), 
@@ -184,6 +210,8 @@ DVec3 DVec3::sMax(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_max_pd(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_max_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_max_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vmaxq_f64(inV1.mValue.val[0], inV2.mValue.val[0]), vmaxq_f64(inV1.mValue.val[1], inV2.mValue.val[1]) });
 #else
 	return DVec3(max(inV1.mF64[0], inV2.mF64[0]), 
 				 max(inV1.mF64[1], inV2.mF64[1]), 
@@ -202,6 +230,8 @@ DVec3 DVec3::sEquals(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_EQ_OQ);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_cmpeq_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpeq_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vceqq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vceqq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
 #else
 	return DVec3(inV1.mF64[0] == inV2.mF64[0]? cTrue : cFalse, 
 				 inV1.mF64[1] == inV2.mF64[1]? cTrue : cFalse, 
@@ -215,6 +245,8 @@ DVec3 DVec3::sLess(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LT_OQ);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_cmplt_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmplt_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vcltq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcltq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
 #else
 	return DVec3(inV1.mF64[0] < inV2.mF64[0]? cTrue : cFalse, 
 				 inV1.mF64[1] < inV2.mF64[1]? cTrue : cFalse, 
@@ -228,6 +260,8 @@ DVec3 DVec3::sLessOrEqual(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LE_OQ);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_cmple_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmple_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vcleq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcleq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
 #else
 	return DVec3(inV1.mF64[0] <= inV2.mF64[0]? cTrue : cFalse, 
 				 inV1.mF64[1] <= inV2.mF64[1]? cTrue : cFalse, 
@@ -241,6 +275,8 @@ DVec3 DVec3::sGreater(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GT_OQ);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_cmpgt_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpgt_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vcgtq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcgtq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
 #else
 	return DVec3(inV1.mF64[0] > inV2.mF64[0]? cTrue : cFalse, 
 				 inV1.mF64[1] > inV2.mF64[1]? cTrue : cFalse, 
@@ -254,6 +290,8 @@ DVec3 DVec3::sGreaterOrEqual(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GE_OQ);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_cmpge_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpge_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vcgeq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcgeq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
 #else
 	return DVec3(inV1.mF64[0] >= inV2.mF64[0]? cTrue : cFalse, 
 				 inV1.mF64[1] >= inV2.mF64[1]? cTrue : cFalse, 
@@ -269,6 +307,8 @@ DVec3 DVec3::sFusedMultiplyAdd(DVec3Arg inMul1, DVec3Arg inMul2, DVec3Arg inAdd)
 	#else
 		return _mm256_add_pd(_mm256_mul_pd(inMul1.mValue, inMul2.mValue), inAdd.mValue);
 	#endif
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vmlaq_f64(inAdd.mValue.val[0], inMul1.mValue.val[0], inMul2.mValue.val[0]), vmlaq_f64(inAdd.mValue.val[1], inMul1.mValue.val[1], inMul2.mValue.val[1]) });
 #else
 	return inMul1 * inMul2 + inAdd;
 #endif
@@ -281,6 +321,9 @@ DVec3 DVec3::sSelect(DVec3Arg inV1, DVec3Arg inV2, DVec3Arg inControl)
 #elif defined(JPH_USE_SSE4_1)
 	Type v = { _mm_blendv_pd(inV1.mValue.mLow, inV2.mValue.mLow, inControl.mValue.mLow), _mm_blendv_pd(inV1.mValue.mHigh, inV2.mValue.mHigh, inControl.mValue.mHigh) };
 	return sFixW(v);
+#elif defined(JPH_USE_NEON)
+	Type v = { vbslq_f64(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_f64(inControl.mValue.val[0]), 63)), inV2.mValue.val[0], inV1.mValue.val[0]), vbslq_f64(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_f64(inControl.mValue.val[1]), 63)), inV2.mValue.val[1], inV1.mValue.val[1]) };
+	return sFixW(v);
 #else
 	DVec3 result;
 	for (int i = 0; i < 3; i++)
@@ -298,6 +341,8 @@ DVec3 DVec3::sOr(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_or_pd(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_or_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_or_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))), vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
 #else
 	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) | BitCast<uint64>(inV2.mF64[0])),
 				 BitCast<double>(BitCast<uint64>(inV1.mF64[1]) | BitCast<uint64>(inV2.mF64[1])),
@@ -311,6 +356,8 @@ DVec3 DVec3::sXor(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_xor_pd(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_xor_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_xor_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))), vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
 #else
 	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) ^ BitCast<uint64>(inV2.mF64[0])),
 				 BitCast<double>(BitCast<uint64>(inV1.mF64[1]) ^ BitCast<uint64>(inV2.mF64[1])),
@@ -324,6 +371,8 @@ DVec3 DVec3::sAnd(DVec3Arg inV1, DVec3Arg inV2)
 	return _mm256_and_pd(inV1.mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_and_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_and_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))), vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
 #else
 	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) & BitCast<uint64>(inV2.mF64[0])),
 				 BitCast<double>(BitCast<uint64>(inV1.mF64[1]) & BitCast<uint64>(inV2.mF64[1])),
@@ -373,6 +422,8 @@ DVec3 DVec3::operator * (DVec3Arg inV2) const
 	return _mm256_mul_pd(mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_mul_pd(mValue.mLow, inV2.mValue.mLow), _mm_mul_pd(mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vmulq_f64(mValue.val[0], inV2.mValue.val[0]), vmulq_f64(mValue.val[1], inV2.mValue.val[1]) });
 #else
 	return DVec3(mF64[0] * inV2.mF64[0], mF64[1] * inV2.mF64[1], mF64[2] * inV2.mF64[2]);
 #endif
@@ -385,6 +436,8 @@ DVec3 DVec3::operator * (double inV2) const
 #elif defined(JPH_USE_SSE)
 	__m128d v = _mm_set1_pd(inV2);
 	return DVec3({ _mm_mul_pd(mValue.mLow, v), _mm_mul_pd(mValue.mHigh, v) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vmulq_n_f64(mValue.val[0], inV2), vmulq_n_f64(mValue.val[1], inV2) });
 #else
 	return DVec3(mF64[0] * inV2, mF64[1] * inV2, mF64[2] * inV2);
 #endif
@@ -397,6 +450,8 @@ DVec3 operator * (double inV1, DVec3Arg inV2)
 #elif defined(JPH_USE_SSE)
 	__m128d v = _mm_set1_pd(inV1);
 	return DVec3({ _mm_mul_pd(v, inV2.mValue.mLow), _mm_mul_pd(v, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vmulq_n_f64(inV2.mValue.val[0], inV1), vmulq_n_f64(inV2.mValue.val[1], inV1) });
 #else
 	return DVec3(inV1 * inV2.mF64[0], inV1 * inV2.mF64[1], inV1 * inV2.mF64[2]);
 #endif
@@ -409,6 +464,9 @@ DVec3 DVec3::operator / (double inV2) const
 #elif defined(JPH_USE_SSE)
 	__m128d v = _mm_set1_pd(inV2);
 	return DVec3({ _mm_div_pd(mValue.mLow, v), _mm_div_pd(mValue.mHigh, v) });
+#elif defined(JPH_USE_NEON)
+	float64x2_t v = vdupq_n_f64(inV2);
+	return DVec3({ vdivq_f64(mValue.val[0], v), vdivq_f64(mValue.val[1], v) });
 #else
 	return DVec3(mF64[0] / inV2, mF64[1] / inV2, mF64[2] / inV2);
 #endif
@@ -422,6 +480,9 @@ DVec3 &DVec3::operator *= (double inV2)
 	__m128d v = _mm_set1_pd(inV2);
 	mValue.mLow = _mm_mul_pd(mValue.mLow, v);
 	mValue.mHigh = _mm_mul_pd(mValue.mHigh, v);
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vmulq_n_f64(mValue.val[0], inV2);
+	mValue.val[1] = vmulq_n_f64(mValue.val[1], inV2);
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] *= inV2;
@@ -439,6 +500,9 @@ DVec3 &DVec3::operator *= (DVec3Arg inV2)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_mul_pd(mValue.mLow, inV2.mValue.mLow);
 	mValue.mHigh = _mm_mul_pd(mValue.mHigh, inV2.mValue.mHigh);
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vmulq_f64(mValue.val[0], inV2.mValue.val[0]);
+	mValue.val[1] = vmulq_f64(mValue.val[1], inV2.mValue.val[1]);
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] *= inV2.mF64[i];
@@ -457,6 +521,10 @@ DVec3 &DVec3::operator /= (double inV2)
 	__m128d v = _mm_set1_pd(inV2);
 	mValue.mLow = _mm_div_pd(mValue.mLow, v);
 	mValue.mHigh = _mm_div_pd(mValue.mHigh, v);
+#elif defined(JPH_USE_NEON)
+	float64x2_t v = vdupq_n_f64(inV2);
+	mValue.val[0] = vdivq_f64(mValue.val[0], v);
+	mValue.val[1] = vdivq_f64(mValue.val[1], v);
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] /= inV2;
@@ -473,6 +541,8 @@ DVec3 DVec3::operator + (Vec3Arg inV2) const
 	return _mm256_add_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_add_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue)), _mm_add_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2)))) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vaddq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue))), vaddq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue)) });
 #else
 	return DVec3(mF64[0] + inV2.mF32[0], mF64[1] + inV2.mF32[1], mF64[2] + inV2.mF32[2]);
 #endif
@@ -484,6 +554,8 @@ DVec3 DVec3::operator + (DVec3Arg inV2) const
 	return _mm256_add_pd(mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_add_pd(mValue.mLow, inV2.mValue.mLow), _mm_add_pd(mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vaddq_f64(mValue.val[0], inV2.mValue.val[0]), vaddq_f64(mValue.val[1], inV2.mValue.val[1]) });
 #else
 	return DVec3(mF64[0] + inV2.mF64[0], mF64[1] + inV2.mF64[1], mF64[2] + inV2.mF64[2]);
 #endif
@@ -496,6 +568,9 @@ DVec3 &DVec3::operator += (Vec3Arg inV2)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_add_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue));
 	mValue.mHigh = _mm_add_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vaddq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue)));
+	mValue.val[1] = vaddq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue));
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] += inV2.mF32[i];
@@ -513,6 +588,9 @@ DVec3 &DVec3::operator += (DVec3Arg inV2)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_add_pd(mValue.mLow, inV2.mValue.mLow);
 	mValue.mHigh = _mm_add_pd(mValue.mHigh, inV2.mValue.mHigh);
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vaddq_f64(mValue.val[0], inV2.mValue.val[0]);
+	mValue.val[1] = vaddq_f64(mValue.val[1], inV2.mValue.val[1]);
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] += inV2.mF64[i];
@@ -530,6 +608,8 @@ DVec3 DVec3::operator - () const
 #elif defined(JPH_USE_SSE)
 	__m128d zero = _mm_setzero_pd();
 	return DVec3({ _mm_sub_pd(zero, mValue.mLow), _mm_sub_pd(zero, mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vnegq_f64(mValue.val[0]), vnegq_f64(mValue.val[1]) });
 #else
 	return DVec3(-mF64[0], -mF64[1], -mF64[2]);
 #endif
@@ -541,6 +621,8 @@ DVec3 DVec3::operator - (Vec3Arg inV2) const
 	return _mm256_sub_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_sub_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue)), _mm_sub_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2)))) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vsubq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue))), vsubq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue)) });
 #else
 	return DVec3(mF64[0] - inV2.mF32[0], mF64[1] - inV2.mF32[1], mF64[2] - inV2.mF32[2]);
 #endif
@@ -552,6 +634,8 @@ DVec3 DVec3::operator - (DVec3Arg inV2) const
 	return _mm256_sub_pd(mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_sub_pd(mValue.mLow, inV2.mValue.mLow), _mm_sub_pd(mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vsubq_f64(mValue.val[0], inV2.mValue.val[0]), vsubq_f64(mValue.val[1], inV2.mValue.val[1]) });
 #else
 	return DVec3(mF64[0] - inV2.mF64[0], mF64[1] - inV2.mF64[1], mF64[2] - inV2.mF64[2]);
 #endif
@@ -564,6 +648,9 @@ DVec3 &DVec3::operator -= (Vec3Arg inV2)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_sub_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue));
 	mValue.mHigh = _mm_sub_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vsubq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue)));
+	mValue.val[1] = vsubq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue));
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] -= inV2.mF32[i];
@@ -581,6 +668,9 @@ DVec3 &DVec3::operator -= (DVec3Arg inV2)
 #elif defined(JPH_USE_SSE)
 	mValue.mLow = _mm_sub_pd(mValue.mLow, inV2.mValue.mLow);
 	mValue.mHigh = _mm_sub_pd(mValue.mHigh, inV2.mValue.mHigh);
+#elif defined(JPH_USE_NEON)
+	mValue.val[0] = vsubq_f64(mValue.val[0], inV2.mValue.val[0]);
+	mValue.val[1] = vsubq_f64(mValue.val[1], inV2.mValue.val[1]);
 #else
 	for (int i = 0; i < 3; ++i)
 		mF64[i] -= inV2.mF64[i];
@@ -598,6 +688,8 @@ DVec3 DVec3::operator / (DVec3Arg inV2) const
 	return _mm256_div_pd(mValue, inV2.mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_div_pd(mValue.mLow, inV2.mValue.mLow), _mm_div_pd(mValue.mHigh, inV2.mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vdivq_f64(mValue.val[0], inV2.mValue.val[0]), vdivq_f64(mValue.val[1], inV2.mValue.val[1]) });
 #else
 	return DVec3(mF64[0] / inV2.mF64[0], mF64[1] / inV2.mF64[1], mF64[2] / inV2.mF64[2]);
 #endif
@@ -612,6 +704,8 @@ DVec3 DVec3::Abs() const
 #elif defined(JPH_USE_SSE)
 	__m128d zero = _mm_setzero_pd();
 	return DVec3({ _mm_max_pd(_mm_sub_pd(zero, mValue.mLow), mValue.mLow), _mm_max_pd(_mm_sub_pd(zero, mValue.mHigh), mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vabsq_f64(mValue.val[0]), vabsq_f64(mValue.val[1]) });
 #else
 	return DVec3(abs(mF64[0]), abs(mF64[1]), abs(mF64[2]));
 #endif
@@ -655,6 +749,10 @@ double DVec3::Dot(DVec3Arg inV2) const
 	__m128d z = _mm_mul_sd(mValue.mHigh, inV2.mValue.mHigh);
 	sum = _mm_add_pd(sum, z);
 	return _mm_cvtsd_f64(sum);
+#elif defined(JPH_USE_NEON)
+	float64x2_t mul_low = vmulq_f64(mValue.val[0], inV2.mValue.val[0]);
+	float64x2_t mul_high = vmulq_f64(mValue.val[1], inV2.mValue.val[1]);
+	return vaddvq_f64(mul_low) + vgetq_lane_f64(mul_high, 0);
 #else
 	double dot = 0.0;
 	for (int i = 0; i < 3; i++)
@@ -674,6 +772,8 @@ DVec3 DVec3::Sqrt() const
 	return _mm256_sqrt_pd(mValue);
 #elif defined(JPH_USE_SSE)
 	return DVec3({ _mm_sqrt_pd(mValue.mLow), _mm_sqrt_pd(mValue.mHigh) });
+#elif defined(JPH_USE_NEON)
+	return DVec3({ vsqrtq_f64(mValue.val[0]), vsqrtq_f64(mValue.val[1]) });
 #else
 	return DVec3(sqrt(mF64[0]), sqrt(mF64[1]), sqrt(mF64[2]));
 #endif
@@ -715,6 +815,10 @@ DVec3 DVec3::GetSign() const
 	__m128d minus_one = _mm_set1_pd(-1.0);
 	__m128d one = _mm_set1_pd(1.0);
 	return DVec3({ _mm_or_pd(_mm_and_pd(mValue.mLow, minus_one), one), _mm_or_pd(_mm_and_pd(mValue.mHigh, minus_one), one) });
+#elif defined(JPH_USE_NEON)
+	uint64x2_t minus_one = vreinterpretq_u64_f64(vdupq_n_f64(-1.0));
+	uint64x2_t one = vreinterpretq_u64_f64(vdupq_n_f64(1.0));
+	return DVec3({ vreinterpretq_f64_u64(vorrq_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), minus_one), one)), vreinterpretq_f64_u64(vorrq_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), minus_one), one)) });
 #else
 	return DVec3(std::signbit(mF64[0])? -1.0 : 1.0, 
 				 std::signbit(mF64[1])? -1.0 : 1.0, 
@@ -732,6 +836,9 @@ DVec3 DVec3::PrepareRoundToZero() const
 #elif defined(JPH_USE_SSE)
 	__m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(int64_t(~cDoubleToFloatMantissaLoss)));
 	return DVec3({ _mm_and_pd(mValue.mLow, mask), _mm_and_pd(mValue.mHigh, mask) });
+#elif defined(JPH_USE_NEON)
+	uint64x2_t mask = vdupq_n_u64(~cDoubleToFloatMantissaLoss);
+	return DVec3({ vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), mask)), vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), mask)) });
 #else
 	double x = BitCast<double>(BitCast<uint64>(mF64[0]) & ~cDoubleToFloatMantissaLoss);
 	double y = BitCast<double>(BitCast<uint64>(mF64[1]) & ~cDoubleToFloatMantissaLoss);
@@ -762,6 +869,18 @@ DVec3 DVec3::PrepareRoundToInf() const
 	__m128d is_zero_high = _mm_cmpeq_pd(value_and_mantissa_loss_high, zero);
 	__m128d value_or_mantissa_loss_high = _mm_or_pd(mValue.mHigh, _mm_castsi128_pd(mantissa_loss));
 	return DVec3({ _mm_blendv_pd(value_or_mantissa_loss_low, mValue.mLow, is_zero_low), _mm_blendv_pd(value_or_mantissa_loss_high, mValue.mHigh, is_zero_high) });
+#elif defined(JPH_USE_NEON)
+	uint64x2_t mantissa_loss = vdupq_n_u64(cDoubleToFloatMantissaLoss);
+	float64x2_t zero = vdupq_n_f64(0.0);
+	float64x2_t value_and_mantissa_loss_low = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), mantissa_loss));
+	uint64x2_t is_zero_low = vceqq_f64(value_and_mantissa_loss_low, zero);
+	float64x2_t value_or_mantissa_loss_low = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(mValue.val[0]), mantissa_loss));
+	float64x2_t value_and_mantissa_loss_high = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), mantissa_loss));
+	float64x2_t value_low = vbslq_f64(is_zero_low, mValue.val[0], value_or_mantissa_loss_low);
+	uint64x2_t is_zero_high = vceqq_f64(value_and_mantissa_loss_high, zero);
+	float64x2_t value_or_mantissa_loss_high = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(mValue.val[1]), mantissa_loss));
+	float64x2_t value_high = vbslq_f64(is_zero_high, mValue.val[1], value_or_mantissa_loss_high);
+	return DVec3({ value_low, value_high });
 #else
 	uint64 ux = BitCast<uint64>(mF64[0]);
 	uint64 uy = BitCast<uint64>(mF64[1]);

+ 1 - 1
UnitTests/Physics/ActiveEdgesTests.cpp

@@ -226,7 +226,7 @@ TEST_SUITE("ActiveEdgesTest")
 		{
 			// Box should have slid frictionlessly over the plane without encountering any collisions
 			CHECK_APPROX_EQUAL(box.GetPosition(), expected_position, 1.0e-3f);
-			CHECK_APPROX_EQUAL(box.GetLinearVelocity(), initial_velocity, 1.0e-3f);
+			CHECK_APPROX_EQUAL(box.GetLinearVelocity(), initial_velocity, 2.0e-3f);
 		}
 		else
 		{