// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>

JPH_NAMESPACE_BEGIN

// Constructor
Vec4::Vec4(Vec3Arg inRHS) :
    mValue(inRHS.mValue)
{
}
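
// Note: the constructor above copies the underlying SIMD register wholesale, so the W lane receives
// whatever the Vec3 happened to carry in its fourth lane; treat W as unspecified until it is set
// explicitly, e.g. via the Vec4(Vec3Arg, float) constructor below.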

Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
    mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
    mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
    for (int i = 0; i < 3; i++)
        mF32[i] = inRHS.mF32[i];
    mF32[3] = inW;
#endif
}

Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
    // Pack the bit patterns of the four floats pairwise into 64-bit halves, then combine
    float32x2_t xy = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inX)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inY)) << 32));
    float32x2_t zw = vcreate_f32(static_cast<uint64>(*reinterpret_cast<uint32 *>(&inZ)) | (static_cast<uint64>(*reinterpret_cast<uint32 *>(&inW)) << 32));
    mValue = vcombine_f32(xy, zw);
#else
    #error Unsupported CPU architecture
#endif
}

template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
    return __builtin_shufflevector(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
    #error Unsupported CPU architecture
#endif
}
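
// Usage sketch: the template arguments are the SWIZZLE_* lane constants (values 0..3), e.g.
//   Vec4 v(1.0f, 2.0f, 3.0f, 4.0f);
//   Vec4 yxwz = v.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(); // (2, 1, 4, 3)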

Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
    return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
    return vdupq_n_f32(0);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
    return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
    return vdupq_n_f32(inV);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sNaN()
{
    return sReplicate(numeric_limits<float>::quiet_NaN());
}

Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
    return vld1q_f32(&inV->x);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
    return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
    return vld1q_f32(&inV->x);
#else
    #error Unsupported CPU architecture
#endif
}

template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_AVX2
    return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
    Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
    Type xy = _mm_unpacklo_ps(x, y);
    Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
    Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
    Type zw = _mm_unpacklo_ps(z, w);
    return _mm_movelh_ps(xy, zw);
#endif
#else
    const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
    float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
    float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
    float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
    float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
    return Vec4(x, y, z, w);
#endif
}
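
// Usage sketch: each offset is multiplied by Scale (in bytes) before being added to inBase, so
// gathering four floats by element index could look like this (data and indices are placeholder
// names for a float array and a UVec4 of indices into it):
//   Vec4 v = Vec4::sGatherFloat4<sizeof(float)>(data, indices); // v[i] = data[indices[i]]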

Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vminq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vceqq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcltq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcleq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
    return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
    return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
    return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
    return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}
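
// Note: _mm_fmadd_ps rounds once while the mul + add fallback rounds twice, so the last bit of
// the result can differ between platforms; callers should not rely on bit-exact equality here.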

Vec4 Vec4::sSelect(Vec4Arg inV1, Vec4Arg inV2, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1)
    return _mm_blendv_ps(inV1.mValue, inV2.mValue, _mm_castsi128_ps(inControl.mValue));
#elif defined(JPH_USE_NEON)
    // Arithmetic shift broadcasts the sign bit of each control lane into a full select mask
    return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inV2.mValue, inV1.mValue);
#else
    Vec4 result;
    for (int i = 0; i < 4; i++)
        result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inV2.mF32[i] : inV1.mF32[i];
    return result;
#endif
}
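
// Note: selection is based on the highest bit of each lane of inControl (the sign bit); the
// comparison functions above produce all-ones or all-zero lanes, for which this is equivalent.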

Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
    #error Unsupported CPU architecture
#endif
}

void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
    // Pass 1, test 1st vs 3rd, 2nd vs 4th
    Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v1, c1);
    ioIndex = UVec4::sSelect(ioIndex, i1, c1);

    // Pass 2, test 1st vs 2nd, 3rd vs 4th
    Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v2, c2);
    ioIndex = UVec4::sSelect(ioIndex, i2, c2);

    // Pass 3, test 2nd vs 3rd component
    Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v3, c3);
    ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}

void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
    // Pass 1, test 1st vs 3rd, 2nd vs 4th
    Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
    UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v1, c1);
    ioIndex = UVec4::sSelect(ioIndex, i1, c1);

    // Pass 2, test 1st vs 2nd, 3rd vs 4th
    Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
    UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v2, c2);
    ioIndex = UVec4::sSelect(ioIndex, i2, c2);

    // Pass 3, test 2nd vs 3rd component
    Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
    UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
    ioValue = sSelect(ioValue, v3, c3);
    ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
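
// Usage sketch for the two sorting routines above: ioIndex travels with ioValue through the same
// compare-and-swap network (5 comparisons in 3 vectorized passes), so it can carry the original
// lane positions along:
//   Vec4 values = ...;
//   UVec4 index(0, 1, 2, 3);
//   Vec4::sSort4(values, index); // values ascending, index[i] = original position of values[i]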

bool Vec4::operator == (Vec4Arg inV2) const
{
    return sEquals(*this, inV2).TestAllTrue();
}

bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
    return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

bool Vec4::IsNormalized(float inTolerance) const
{
    return abs(LengthSq() - 1.0f) <= inTolerance;
}

bool Vec4::IsNaN() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
    uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
    return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    return vmulq_n_f32(mValue, inV2);
#else
    #error Unsupported CPU architecture
#endif
}

/// Multiply vector with float
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vmulq_n_f32(inV2.mValue, inV1);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    mValue = vmulq_n_f32(mValue, inV2);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vmulq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
    mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vaddq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vaddq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
    return vnegq_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vsubq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
    mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    mValue = vsubq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
    return *this;
}

Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
    return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
    return vdivq_f32(mValue, inV2.mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 0);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 1);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 2);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
    return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
    return vdupq_laneq_f32(mValue, 3);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
    return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
    return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
    return vabsq_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}
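
// Note on the AVX512 path of Abs(): _mm_range_ps with imm8 = 0b1000 selects the source value
// (bits 1:0 = 00, min of the two identical operands) with the sign bit forced to zero
// (bits 3:2 = 10), i.e. |x| per lane; the SSE fallback computes max(-x, x) instead.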

Vec4 Vec4::Reciprocal() const
{
    return sReplicate(1.0f) / mValue;
}
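
// Note: Reciprocal() performs a true division, not the low-precision _mm_rcp_ps / vrecpeq_f32
// hardware estimate, so it is accurate to full float precision at the cost of a division.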

Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
    return vdupq_n_f32(vaddvq_f32(mul));
#else
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += mF32[i] * inV2.mF32[i];
    return Vec4::sReplicate(dot);
#endif
}

float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
    return vaddvq_f32(mul);
#else
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += mF32[i] * inV2.mF32[i];
    return dot;
#endif
}
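
// Note: in the SSE4.1 paths above, the 0xff mask for _mm_dp_ps means "multiply all four lanes
// (high nibble) and broadcast the sum to all four output lanes (low nibble)"; DotV returns that
// broadcast, while Dot just extracts the lowest lane.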

float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    return vaddvq_f32(mul);
#else
    float len_sq = 0.0f;
    for (int i = 0; i < 4; i++)
        len_sq += mF32[i] * mF32[i];
    return len_sq;
#endif
}

float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
    return vget_lane_f32(vsqrt_f32(sum), 0);
#else
    return sqrt(LengthSq());
#endif
}

Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
    return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
    return vsqrtq_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
    return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
    Type minus_one = _mm_set1_ps(-1.0f);
    Type one = _mm_set1_ps(1.0f);
    return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
    Type minus_one = vdupq_n_f32(-1.0f);
    Type one = vdupq_n_f32(1.0f);
    return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
    #error Unsupported CPU architecture
#endif
}
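
// How the SSE / NEON bit trick above works: the bit pattern of -1.0f is the sign bit plus the bit
// pattern of 1.0f, so (x & -1.0f) keeps x's sign bit (and at most the bits of 1.0f), and OR-ing
// with 1.0f then yields exactly +1.0f or -1.0f depending on the sign of x (including for -0.0f).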

Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
    return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
    float32x4_t mul = vmulq_f32(mValue, mValue);
    float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
    return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
    return *this / Length();
#endif
}

void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
    _mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
    vst1q_f32(&outV->x, mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
    return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
    return vcvtq_u32_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
    return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
    return vreinterpretq_u32_f32(mValue);
#else
    #error Unsupported CPU architecture
#endif
}

int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
    return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
    int32x4_t shift = { 0, 1, 2, 3 };
    return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
    #error Unsupported CPU architecture
#endif
}
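
// Usage sketch: GetSignBits() packs the sign bit of each lane into bits 0..3 of the result, e.g.
//   Vec4(-1.0f, 2.0f, -3.0f, 4.0f).GetSignBits() == 0b0101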

float Vec4::ReduceMin() const
{
    Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
    v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
    return v.GetX();
}

float Vec4::ReduceMax() const
{
    Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
    v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
    return v.GetX();
}
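
// Note: both reductions above work in two steps: the first swizzled min/max leaves min(x, y) in
// lane X and min(z, w) in lane Z, the second combines those two, so only the X lane of the final
// result is meaningful.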

void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
    // Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function and vectorizes it
    // Original implementation by Stephen L. Moshier (See: http://www.netlib.org/cephes/)

    // Make argument positive and remember sign (highest bit set is negative)
    UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
    Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());

    // Integer part of x / (PI / 4)
    UVec4 int_val = (1.27323954473516f * x).ToInt();
    Vec4 y = int_val.ToFloat();

    // Integer and fractional part modulo one octant, map zeros to origin
    // if (int_val & 1) int_val++, y += 1;
    UVec4 and_1 = int_val.LogicalShiftLeft<31>().ArithmeticShiftRight<31>();
    int_val += UVec4::sAnd(and_1, UVec4::sReplicate(1));
    y += Vec4::sAnd(and_1.ReinterpretAsFloat(), Vec4::sReplicate(1.0f));

    // Extended precision modular arithmetic
    x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f) - y * 3.77489497744594108e-8f;

    // Calculate both results
    Vec4 z = x * x;
    Vec4 y1 = ((2.443315711809948e-5f * z - Vec4::sReplicate(1.388731625493765e-3f)) * z + Vec4::sReplicate(4.166664568298827e-2f)) * z * z - 0.5f * z + Vec4::sReplicate(1.0f);
    Vec4 y2 = ((-1.9515295891e-4f * z + Vec4::sReplicate(8.3321608736e-3f)) * z - Vec4::sReplicate(1.6666654611e-1f)) * z * x + x;

    // From here we deviate from the original cephes code, we would have to write:
    //
    // j &= 7;
    //
    // if (j > 3)
    // {
    //     j -= 4;
    //     sin_sign = -sin_sign;
    //     cos_sign = -cos_sign;
    // }
    //
    // if (j > 1)
    //     cos_sign = -cos_sign;
    //
    // ...
    //
    // if (j == 1 || j == 2) // condition
    //     ...
    //
    // j     sin_sign  cos_sign  condition
    // 000b     1         1         0
    // 001b     1         1         1
    // 010b     1        -1         1
    // 011b     1        -1         0
    // 100b    -1        -1         0
    // 101b    -1        -1         1
    // 110b    -1         1         1
    // 111b    -1         1         0
    //
    // So: sin_sign = bit3, cos_sign = bit2 ^ bit3, condition = bit1 ^ bit2
    UVec4 bit1 = int_val.LogicalShiftLeft<31>();
    UVec4 bit2 = UVec4::sAnd(int_val.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));
    UVec4 bit3 = UVec4::sAnd(int_val.LogicalShiftLeft<29>(), UVec4::sReplicate(0x80000000U));

    // Select which one of the results is sin and which one is cos
    UVec4 xor_1_2 = UVec4::sXor(bit1, bit2);
    Vec4 s = Vec4::sSelect(y2, y1, xor_1_2);
    Vec4 c = Vec4::sSelect(y1, y2, xor_1_2);

    // Update the signs
    sin_sign = UVec4::sXor(sin_sign, bit3);
    UVec4 cos_sign = UVec4::sXor(bit2, bit3);

    // Correct the signs
    outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
    outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
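
// Usage sketch, computing sine and cosine of four angles at once (assuming the library's JPH_PI
// constant; any float angles in radians work):
//   Vec4 angles(0.0f, 0.25f * JPH_PI, 0.5f * JPH_PI, JPH_PI);
//   Vec4 s, c;
//   angles.SinCos(s, c); // s = sin(angles), c = cos(angles) per lane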

JPH_NAMESPACE_END