// zlib open source license
//
// Copyright (c) 2017 to 2022 David Forsgren Piuva
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgment in the product documentation would be
//    appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source
//    distribution.

// Hardware abstraction layer for portable SIMD math.
// Covers a small intersection of SSE2 and NEON in order to reduce the number
// of bugs from multiple implementations when nothing advanced is required.

#ifndef DFPSR_SIMD
#define DFPSR_SIMD

#include <cstdint>
#include <cassert>
#include "SafePointer.h"
#include "../math/FVector.h"
#include "../math/IVector.h"
#include "../math/UVector.h"

#define ALIGN16 __attribute__((aligned(16)))

// To allow turning off SIMD intrinsics for testing
#ifdef __SSE2__
	// Comment out this line to test without SSE2
	#define USE_SSE2
#elif __ARM_NEON
	// Comment out this line to test without NEON
	#define USE_NEON
#endif

// Everything declared in here handles things specific for SSE.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_SSE2
	#define USE_BASIC_SIMD
	#define USE_DIRECT_SIMD_MEMORY_ACCESS
	#include <emmintrin.h> // SSE2
	#ifdef __SSSE3__
		#include <tmmintrin.h> // SSSE3
		// Comment out this line to test without SSSE3
		#define USE_SSSE3
	#endif
	#ifdef __AVX2__
		#include <immintrin.h> // AVX2
		// Comment out this line to test without AVX2
		#define USE_AVX2
	#endif
	// Vector types
	#define SIMD_F32x4 __m128
	#define SIMD_U8x16 __m128i
	#define SIMD_U16x8 __m128i
	#define SIMD_U32x4 __m128i
	#define SIMD_I32x4 __m128i
	// Vector uploads in address order
	#define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
	#define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
	#define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
	#define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
	#define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
	#define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
	#define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
	#define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
	#define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
	#define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
	// Conversions
	#define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
	#define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
	#define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
	#define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
	// Unpacking conversions
	#define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
	#define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
	#define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
	#define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
	// Saturated packing
	// Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
	inline SIMD_U8x16 PACK_SAT_U16_TO_U8(const SIMD_U16x8& a, const SIMD_U16x8& b) {
		SIMD_U16x8 mask, a2, b2;
		mask = _mm_set1_epi16(0x7fff);
		a2 = _mm_and_si128(a, mask);
		a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
		b2 = _mm_and_si128(b, mask);
		b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
		return _mm_packus_epi16(a2, b2);
	}
	// Reinterpret casting
	#define REINTERPRET_U32_TO_U8_SIMD(A) (A)
	#define REINTERPRET_U32_TO_U16_SIMD(A) (A)
	#define REINTERPRET_U8_TO_U32_SIMD(A) (A)
	#define REINTERPRET_U16_TO_U32_SIMD(A) (A)
	#define REINTERPRET_U32_TO_I32_SIMD(A) (A)
	#define REINTERPRET_I32_TO_U32_SIMD(A) (A)
	// Vector float operations returning SIMD_F32x4
	#define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
	#define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
	#define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
	// Vector integer operations returning SIMD_I32x4
	#define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
	#define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
	// 32-bit integer multiplications are not available on SSE2.
	// Vector integer operations returning SIMD_U32x4
	#define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
	#define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
	// 32-bit integer multiplications are not available on SSE2.
	// Vector integer operations returning SIMD_U16x8
	#define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
	#define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
	#define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
	// Vector integer operations returning SIMD_U8x16
	#define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
	#define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
	#define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
	// No 8-bit multiplications
	// Statistics
	#define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
	#define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
	// Bitwise
	#define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
	#define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
#endif
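// PACK_SAT_U16_TO_U8 clamps each lane to 0x7fff before calling _mm_packus_epi16,
// because the pack instruction treats its input as signed 16-bit lanes; an
// unsigned input of 0x8000 or above would otherwise be read as negative and
// clamp to zero instead of 255. A scalar sketch of the intended per-lane result
// (illustrative only, not part of this header's API):
//
//   inline uint8_t packSatLane(uint16_t x) {
//       return x > 255 ? 255 : (uint8_t)x;
//   }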
// Everything declared in here handles things specific for NEON.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_NEON
	#define USE_BASIC_SIMD
	#include <arm_neon.h> // NEON
	// Vector types
	#define SIMD_F32x4 float32x4_t
	#define SIMD_U8x16 uint8x16_t
	#define SIMD_U16x8 uint16x8_t
	#define SIMD_U32x4 uint32x4_t
	#define SIMD_I32x4 int32x4_t
	// Vector uploads in address order
	inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
		float data[4] ALIGN16 = {a, b, c, d};
		return vld1q_f32(data);
	}
	inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
		return vdupq_n_f32(a);
	}
	inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
		uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
		return vld1q_u8(data);
	}
	inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
		return vdupq_n_u8(a);
	}
	inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
		uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
		return vld1q_u16(data);
	}
	inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
		return vdupq_n_u16(a);
	}
	inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
		uint32_t data[4] ALIGN16 = {a, b, c, d};
		return vld1q_u32(data);
	}
	inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
		return vdupq_n_u32(a);
	}
	inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
		int32_t data[4] ALIGN16 = {a, b, c, d};
		return vld1q_s32(data);
	}
	inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
		return vdupq_n_s32(a);
	}
	// Conversions
	#define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
	#define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
	#define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
	#define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
	// Unpacking conversions
	#define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
	#define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
	#define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
	#define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
	// Saturated packing
	#define PACK_SAT_U16_TO_U8(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
	// Reinterpret casting
	#define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
	#define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
	#define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
	#define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
	#define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
	#define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
	// Vector float operations returning SIMD_F32x4
	#define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
	#define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
	#define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
	// Vector integer operations returning SIMD_I32x4
	#define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
	#define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
	#define MUL_I32_NEON(A, B) vmulq_s32(A, B)
	// Vector integer operations returning SIMD_U32x4
	#define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
	#define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
	#define MUL_U32_NEON(A, B) vmulq_u32(A, B)
	// Vector integer operations returning SIMD_U16x8
	#define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
	#define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
	#define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
	// Vector integer operations returning SIMD_U8x16
	#define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
	#define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
	#define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
	// No 8-bit multiplications
	// Statistics
	#define MIN_F32_SIMD(A, B) vminq_f32(A, B)
	#define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
	// Bitwise
	#define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
	#define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
#endif

/*
The vector types (F32x4, I32x4, U32x4, U16x8) below are supposed to be portable across different CPU architectures.

When this abstraction layer is mixed with handwritten SIMD intrinsics:
	Use "USE_SSE2" instead of "__SSE2__"
	Use "USE_AVX2" instead of "__AVX2__"
	Use "USE_NEON" instead of "__ARM_NEON"

Portability exceptions:
	* The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
	  Only use when USE_BASIC_SIMD is defined. Will not work on scalar emulation.
	* The "shared_memory" array is only defined for targets with direct access to SIMD registers. (SSE)
	  Only use when USE_DIRECT_SIMD_MEMORY_ACCESS is defined. Will not work on NEON or scalar emulation.
	* The "emulated" array is only defined when SIMD is turned off.
	  Cannot be used when USE_BASIC_SIMD is defined. Will not work when either SSE or NEON is enabled.
*/
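// A minimal sketch (illustrative only) of how the USE_* macros are meant to be
// mixed with handwritten intrinsics, using a hypothetical scaleAndAdd helper
// that takes a native shortcut where one exists and falls back to the portable
// operators otherwise:
//
//   inline F32x4 scaleAndAdd(const F32x4 &a, const F32x4 &b, const F32x4 &c) {
//       #ifdef USE_SSE2
//           return F32x4(_mm_add_ps(_mm_mul_ps(a.v, b.v), c.v));
//       #elif defined(USE_NEON)
//           return F32x4(vmlaq_f32(c.v, a.v, b.v)); // c + a * b in one instruction
//       #else
//           return a * b + c; // Scalar emulation
//       #endif
//   }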
union F32x4 {
	#ifdef USE_BASIC_SIMD
	public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			float shared_memory[4];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_F32x4 v;
		// Construct a portable vector from a native SIMD vector
		explicit F32x4(const SIMD_F32x4& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		float emulated[4];
		// Construct a portable vector from a set of scalars
		F32x4(float a1, float a2, float a3, float a4) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit F32x4(float scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
		}
	#endif
	// Construct a portable SIMD vector from a pointer to aligned data
	// data must be aligned to 16 bytes
	static inline F32x4 readAlignedUnsafe(const float* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return F32x4(_mm_load_ps(data));
			#elif defined(USE_NEON)
				return F32x4(vld1q_f32(data));
			#endif
		#else
			return F32x4(data[0], data[1], data[2], data[3]);
		#endif
	}
	// Write to aligned memory from the existing vector
	// data must be aligned to 16 bytes
	inline void writeAlignedUnsafe(float* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_ps(data, this->v);
			#elif defined(USE_NEON)
				vst1q_f32(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
		#endif
	}
	#ifdef DFPSR_GEOMETRY_FVECTOR
		dsr::FVector4D get() const {
			float data[4] ALIGN16;
			this->writeAlignedUnsafe(data);
			return dsr::FVector4D(data[0], data[1], data[2], data[3]);
		}
	#endif
	// Bound and alignment checked reading
	static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
		const float* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return F32x4::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
		float* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
	// 1 / x
	// Useful for multiple divisions with the same denominator
	// Useless if the denominator is a constant
	F32x4 reciprocal() const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				// Approximate
				SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
				// Refine with one Newton-Raphson iteration: r' = 2r - x * r * r
				return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
			#elif defined(USE_NEON)
				// Approximate
				SIMD_F32x4 result = vrecpeq_f32(this->v);
				// Refine
				result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
				return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
			#else
				assert(false);
				return F32x4(0);
			#endif
		#else
			return F32x4(1.0f / this->emulated[0], 1.0f / this->emulated[1], 1.0f / this->emulated[2], 1.0f / this->emulated[3]);
		#endif
	}
	// 1 / sqrt(x)
	// Useful for normalizing vectors
	F32x4 reciprocalSquareRoot() const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				// Approximate
				SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
				// Refine with one Newton-Raphson iteration: r' = 0.5r * (3 - x * r * r)
				SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
				reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
				return F32x4(reciRoot);
			#elif defined(USE_NEON)
				// TODO: Test on ARM
				// Approximate
				SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
				// Refine
				reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
				return F32x4(reciRoot);
			#else
				assert(false);
				return F32x4(0);
			#endif
		#else
			return F32x4(1.0f / sqrt(this->emulated[0]), 1.0f / sqrt(this->emulated[1]), 1.0f / sqrt(this->emulated[2]), 1.0f / sqrt(this->emulated[3]));
		#endif
	}
	// sqrt(x)
	// Useful for getting lengths of vectors
	F32x4 squareRoot() const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
				// Approximate
				SIMD_F32x4 root = _mm_sqrt_ps(this->v);
				// Refine
				root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
				return F32x4(root);
			#elif defined(USE_NEON)
				// TODO: Test on ARM
				return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
			#else
				assert(false);
				return F32x4(0);
			#endif
		#else
			return F32x4(sqrt(this->emulated[0]), sqrt(this->emulated[1]), sqrt(this->emulated[2]), sqrt(this->emulated[3]));
		#endif
	}
	F32x4 clamp(float min, float max) const {
		#ifdef USE_BASIC_SIMD
			return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)), LOAD_SCALAR_F32_SIMD(max)));
		#else
			float val0 = this->emulated[0];
			float val1 = this->emulated[1];
			float val2 = this->emulated[2];
			float val3 = this->emulated[3];
			if (min > val0) { val0 = min; }
			if (max < val0) { val0 = max; }
			if (min > val1) { val1 = min; }
			if (max < val1) { val1 = max; }
			if (min > val2) { val2 = min; }
			if (max < val2) { val2 = max; }
			if (min > val3) { val3 = min; }
			if (max < val3) { val3 = max; }
			return F32x4(val0, val1, val2, val3);
		#endif
	}
	F32x4 clampLower(float min) const {
		#ifdef USE_BASIC_SIMD
			return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)));
		#else
			float val0 = this->emulated[0];
			float val1 = this->emulated[1];
			float val2 = this->emulated[2];
			float val3 = this->emulated[3];
			if (min > val0) { val0 = min; }
			if (min > val1) { val1 = min; }
			if (min > val2) { val2 = min; }
			if (min > val3) { val3 = min; }
			return F32x4(val0, val1, val2, val3);
		#endif
	}
	F32x4 clampUpper(float max) const {
		#ifdef USE_BASIC_SIMD
			return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(max)));
		#else
			float val0 = this->emulated[0];
			float val1 = this->emulated[1];
			float val2 = this->emulated[2];
			float val3 = this->emulated[3];
			if (max < val0) { val0 = max; }
			if (max < val1) { val1 = max; }
			if (max < val2) { val2 = max; }
			if (max < val3) { val3 = max; }
			return F32x4(val0, val1, val2, val3);
		#endif
	}
};

inline dsr::String& string_toStreamIndented(dsr::String& target, const F32x4& source, const dsr::ReadableString& indentation) {
	string_append(target, indentation, source.get());
	return target;
}

inline bool operator==(const F32x4& left, const F32x4& right) {
	float a[4] ALIGN16;
	float b[4] ALIGN16;
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return fabs(a[0] - b[0]) < 0.0001f && fabs(a[1] - b[1]) < 0.0001f && fabs(a[2] - b[2]) < 0.0001f && fabs(a[3] - b[3]) < 0.0001f;
}

inline bool operator!=(const F32x4& left, const F32x4& right) {
	return !(left == right);
}

inline F32x4 operator+(const F32x4& left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(ADD_F32_SIMD(left.v, right.v));
	#else
		return F32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
	#endif
}

inline F32x4 operator+(float left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(ADD_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
	#else
		return F32x4(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3]);
	#endif
}

inline F32x4 operator+(const F32x4& left, float right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(ADD_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
	#else
		return F32x4(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right);
	#endif
}
inline F32x4 operator-(const F32x4& left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(SUB_F32_SIMD(left.v, right.v));
	#else
		return F32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
	#endif
}

inline F32x4 operator-(float left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
	#else
		return F32x4(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3]);
	#endif
}

inline F32x4 operator-(const F32x4& left, float right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(SUB_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
	#else
		return F32x4(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right);
	#endif
}

inline F32x4 operator*(const F32x4& left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(MUL_F32_SIMD(left.v, right.v));
	#else
		return F32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
	#endif
}

inline F32x4 operator*(float left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
	#else
		return F32x4(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3]);
	#endif
}

inline F32x4 operator*(const F32x4& left, float right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(MUL_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
	#else
		return F32x4(left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right);
	#endif
}

inline F32x4 min(const F32x4& left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(MIN_F32_SIMD(left.v, right.v));
	#else
		float v0 = left.emulated[0];
		float v1 = left.emulated[1];
		float v2 = left.emulated[2];
		float v3 = left.emulated[3];
		float r0 = right.emulated[0];
		float r1 = right.emulated[1];
		float r2 = right.emulated[2];
		float r3 = right.emulated[3];
		if (r0 < v0) { v0 = r0; }
		if (r1 < v1) { v1 = r1; }
		if (r2 < v2) { v2 = r2; }
		if (r3 < v3) { v3 = r3; }
		return F32x4(v0, v1, v2, v3);
	#endif
}

inline F32x4 max(const F32x4& left, const F32x4& right) {
	#ifdef USE_BASIC_SIMD
		return F32x4(MAX_F32_SIMD(left.v, right.v));
	#else
		float v0 = left.emulated[0];
		float v1 = left.emulated[1];
		float v2 = left.emulated[2];
		float v3 = left.emulated[3];
		float r0 = right.emulated[0];
		float r1 = right.emulated[1];
		float r2 = right.emulated[2];
		float r3 = right.emulated[3];
		if (r0 > v0) { v0 = r0; }
		if (r1 > v1) { v1 = r1; }
		if (r2 > v2) { v2 = r2; }
		if (r3 > v3) { v3 = r3; }
		return F32x4(v0, v1, v2, v3);
	#endif
}
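// Usage sketch (illustrative only): normalizing four 3D vectors at once in
// structure-of-arrays form with the approximate reciprocal square root.
// The variables x, y and z are hypothetical F32x4 coordinate vectors.
//
//   F32x4 invLength = (x * x + y * y + z * z).reciprocalSquareRoot();
//   x = x * invLength;
//   y = y * invLength;
//   z = z * invLength;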
union I32x4 {
	#ifdef USE_BASIC_SIMD
	public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			int32_t shared_memory[4];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_I32x4 v;
		// Construct a portable vector from a native SIMD vector
		explicit I32x4(const SIMD_I32x4& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		int32_t emulated[4];
		// Construct a portable vector from a set of scalars
		I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit I32x4(int32_t scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
		}
	#endif
	// Construct a portable SIMD vector from a pointer to aligned data
	// data must be aligned to 16 bytes
	static inline I32x4 readAlignedUnsafe(const int32_t* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return I32x4(_mm_load_si128((const __m128i*)data));
			#elif defined(USE_NEON)
				return I32x4(vld1q_s32(data));
			#endif
		#else
			return I32x4(data[0], data[1], data[2], data[3]);
		#endif
	}
	// Write to aligned memory from the existing vector
	// data must be aligned to 16 bytes
	inline void writeAlignedUnsafe(int32_t* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_si128((__m128i*)data, this->v);
			#elif defined(USE_NEON)
				vst1q_s32(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
		#endif
	}
	#ifdef DFPSR_GEOMETRY_IVECTOR
		dsr::IVector4D get() const {
			int32_t data[4] ALIGN16;
			this->writeAlignedUnsafe(data);
			return dsr::IVector4D(data[0], data[1], data[2], data[3]);
		}
	#endif
	// Bound and alignment checked reading
	static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
		const int32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return I32x4::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
		int32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};

inline dsr::String& string_toStreamIndented(dsr::String& target, const I32x4& source, const dsr::ReadableString& indentation) {
	string_append(target, indentation, source.get());
	return target;
}

inline bool operator==(const I32x4& left, const I32x4& right) {
	int32_t a[4] ALIGN16;
	int32_t b[4] ALIGN16;
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}

inline bool operator!=(const I32x4& left, const I32x4& right) {
	return !(left == right);
}

inline I32x4 operator+(const I32x4& left, const I32x4& right) {
	#ifdef USE_BASIC_SIMD
		return I32x4(ADD_I32_SIMD(left.v, right.v));
	#else
		return I32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
	#endif
}

inline I32x4 operator+(int32_t left, const I32x4& right) {
	return I32x4(left) + right;
}
inline I32x4 operator+(const I32x4& left, int32_t right) {
	return left + I32x4(right);
}

inline I32x4 operator-(const I32x4& left, const I32x4& right) {
	#ifdef USE_BASIC_SIMD
		return I32x4(SUB_I32_SIMD(left.v, right.v));
	#else
		return I32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
	#endif
}

inline I32x4 operator-(int32_t left, const I32x4& right) {
	return I32x4(left) - right;
}

inline I32x4 operator-(const I32x4& left, int32_t right) {
	return left - I32x4(right);
}

inline I32x4 operator*(const I32x4& left, const I32x4& right) {
	#ifdef USE_BASIC_SIMD
		#ifdef USE_SSE2
			// Emulate a NEON instruction
			return I32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
		#elif defined(USE_NEON)
			return I32x4(MUL_I32_NEON(left.v, right.v));
		#endif
	#else
		return I32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
	#endif
}

inline I32x4 operator*(int32_t left, const I32x4& right) {
	return I32x4(left) * right;
}

inline I32x4 operator*(const I32x4& left, int32_t right) {
	return left * I32x4(right);
}
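// _mm_mullo_epi32 only arrived with SSE4.1, which is why the SSE2 build of
// operator* above takes the scalar detour through shared_memory. A known
// register-only alternative (an illustrative sketch, not what this header uses)
// combines two widening multiplies with shuffles:
//
//   inline __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
//       __m128i even = _mm_mul_epu32(a, b); // 64-bit products of lanes 0 and 2
//       __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // lanes 1 and 3
//       return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
//                                 _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0)));
//   }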
union U32x4 {
	#ifdef USE_BASIC_SIMD
	public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			uint32_t shared_memory[4];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_U32x4 v;
		// Construct a portable vector from a native SIMD vector
		explicit U32x4(const SIMD_U32x4& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		uint32_t emulated[4];
		// Construct a portable vector from a set of scalars
		U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit U32x4(uint32_t scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
		}
	#endif
	// Construct a portable SIMD vector from a pointer to aligned data
	// data must be aligned to 16 bytes
	static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return U32x4(_mm_load_si128((const __m128i*)data));
			#elif defined(USE_NEON)
				return U32x4(vld1q_u32(data));
			#endif
		#else
			return U32x4(data[0], data[1], data[2], data[3]);
		#endif
	}
	// Write to aligned memory from the existing vector
	// data must be aligned to 16 bytes
	inline void writeAlignedUnsafe(uint32_t* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_si128((__m128i*)data, this->v);
			#elif defined(USE_NEON)
				vst1q_u32(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
		#endif
	}
	#ifdef DFPSR_GEOMETRY_UVECTOR
		dsr::UVector4D get() const {
			uint32_t data[4] ALIGN16;
			this->writeAlignedUnsafe(data);
			return dsr::UVector4D(data[0], data[1], data[2], data[3]);
		}
	#endif
	// Bound and alignment checked reading
	static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
		const uint32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return U32x4::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
		uint32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};

inline dsr::String& string_toStreamIndented(dsr::String& target, const U32x4& source, const dsr::ReadableString& indentation) {
	string_append(target, indentation, source.get());
	return target;
}

inline bool operator==(const U32x4& left, const U32x4& right) {
	uint32_t a[4] ALIGN16;
	uint32_t b[4] ALIGN16;
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}

inline bool operator!=(const U32x4& left, const U32x4& right) {
	return !(left == right);
}

inline U32x4 operator+(const U32x4& left, const U32x4& right) {
	#ifdef USE_BASIC_SIMD
		return U32x4(ADD_U32_SIMD(left.v, right.v));
	#else
		return U32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
	#endif
}

inline U32x4 operator+(uint32_t left, const U32x4& right) {
	return U32x4(left) + right;
}

inline U32x4 operator+(const U32x4& left, uint32_t right) {
	return left + U32x4(right);
}

inline U32x4 operator-(const U32x4& left, const U32x4& right) {
	#ifdef USE_BASIC_SIMD
		return U32x4(SUB_U32_SIMD(left.v, right.v));
	#else
		return U32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
	#endif
}

inline U32x4 operator-(uint32_t left, const U32x4& right) {
	return U32x4(left) - right;
}
inline U32x4 operator-(const U32x4& left, uint32_t right) {
	return left - U32x4(right);
}

inline U32x4 operator*(const U32x4& left, const U32x4& right) {
	#ifdef USE_BASIC_SIMD
		#ifdef USE_SSE2
			// Emulate a NEON instruction on SSE2 registers
			return U32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
		#else // NEON
			return U32x4(MUL_U32_NEON(left.v, right.v));
		#endif
	#else
		return U32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
	#endif
}

inline U32x4 operator*(uint32_t left, const U32x4& right) {
	return U32x4(left) * right;
}

inline U32x4 operator*(const U32x4& left, uint32_t right) {
	return left * U32x4(right);
}

inline U32x4 operator&(const U32x4& left, const U32x4& right) {
	#ifdef USE_BASIC_SIMD
		return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
	#else
		return U32x4(left.emulated[0] & right.emulated[0], left.emulated[1] & right.emulated[1], left.emulated[2] & right.emulated[2], left.emulated[3] & right.emulated[3]);
	#endif
}

inline U32x4 operator&(const U32x4& left, uint32_t mask) {
	#ifdef USE_BASIC_SIMD
		return U32x4(BITWISE_AND_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
	#else
		return U32x4(left.emulated[0] & mask, left.emulated[1] & mask, left.emulated[2] & mask, left.emulated[3] & mask);
	#endif
}

inline U32x4 operator|(const U32x4& left, const U32x4& right) {
	#ifdef USE_BASIC_SIMD
		return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
	#else
		return U32x4(left.emulated[0] | right.emulated[0], left.emulated[1] | right.emulated[1], left.emulated[2] | right.emulated[2], left.emulated[3] | right.emulated[3]);
	#endif
}

inline U32x4 operator|(const U32x4& left, uint32_t mask) {
	#ifdef USE_BASIC_SIMD
		return U32x4(BITWISE_OR_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
	#else
		return U32x4(left.emulated[0] | mask, left.emulated[1] | mask, left.emulated[2] | mask, left.emulated[3] | mask);
	#endif
}

inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
	#ifdef USE_SSE2
		return U32x4(_mm_slli_epi32(left.v, bitOffset));
	#else
		#ifdef USE_NEON
			return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
		#else
			return U32x4(left.emulated[0] << bitOffset, left.emulated[1] << bitOffset, left.emulated[2] << bitOffset, left.emulated[3] << bitOffset);
		#endif
	#endif
}

inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
	#ifdef USE_SSE2
		return U32x4(_mm_srli_epi32(left.v, bitOffset));
	#else
		#ifdef USE_NEON
			// Negative shift counts shift to the right on NEON
			return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
		#else
			return U32x4(left.emulated[0] >> bitOffset, left.emulated[1] >> bitOffset, left.emulated[2] >> bitOffset, left.emulated[3] >> bitOffset);
		#endif
	#endif
}
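// Usage sketch (illustrative only): extracting the second byte of four packed
// 32-bit pixels with shifting and masking, assuming a hypothetical "pixels"
// vector where that byte holds the green channel:
//
//   U32x4 green = (pixels >> 8) & 255u;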
union U16x8 {
	#ifdef USE_BASIC_SIMD
	public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			uint16_t shared_memory[8];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_U16x8 v;
		// Construct a portable vector from a native SIMD vector
		explicit U16x8(const SIMD_U16x8& v) : v(v) {}
		// Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
		// Reinterpret casting is used
		explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
		// Construct a portable vector from a set of scalars
		U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8)
		: v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
		// Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
		// Reinterpret casting is used
		// TODO: Remove all reinterprets from constructors to improve readability
		explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
		// Construct a portable vector from a single duplicated scalar
		explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
		// Reinterpret cast to a vector of 4 x 32-bit unsigned integers
		U32x4 get_U32() const {
			return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
		}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		uint16_t emulated[8];
		// Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
		// Reinterpret casting is used
		explicit U16x8(const U32x4& vector) {
			uint64_t *target = (uint64_t*)this->emulated;
			uint64_t *source = (uint64_t*)vector.emulated;
			target[0] = source[0];
			target[1] = source[1];
		}
		// Construct a portable vector from a set of scalars
		U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
			this->emulated[4] = a5;
			this->emulated[5] = a6;
			this->emulated[6] = a7;
			this->emulated[7] = a8;
		}
		// Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
		// Reinterpret casting is used
		explicit U16x8(uint32_t scalar) {
			uint32_t *target = (uint32_t*)this->emulated;
			target[0] = scalar;
			target[1] = scalar;
			target[2] = scalar;
			target[3] = scalar;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit U16x8(uint16_t scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
			this->emulated[4] = scalar;
			this->emulated[5] = scalar;
			this->emulated[6] = scalar;
			this->emulated[7] = scalar;
		}
		// Reinterpret cast to a vector of 4 x 32-bit unsigned integers
		U32x4 get_U32() const {
			U32x4 result(0);
			uint64_t *target = (uint64_t*)result.emulated;
			uint64_t *source = (uint64_t*)this->emulated;
			target[0] = source[0];
			target[1] = source[1];
			return result;
		}
	#endif
	// data must be aligned to 16 bytes
	//static inline U16x8 readSlow(uint16_t* data) {
	//	return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
	//}
	static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return U16x8(_mm_load_si128((const __m128i*)data));
			#elif defined(USE_NEON)
				return U16x8(vld1q_u16(data));
			#endif
		#else
			return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
		#endif
	}
	// data must be aligned to 16 bytes
	inline void writeAlignedUnsafe(uint16_t* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_si128((__m128i*)data, this->v);
			#elif defined(USE_NEON)
				vst1q_u16(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
			data[4] = this->emulated[4];
			data[5] = this->emulated[5];
			data[6] = this->emulated[6];
			data[7] = this->emulated[7];
		#endif
	}
	// Bound and alignment checked reading
	static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
		const uint16_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return U16x8::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
		uint16_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};

inline dsr::String& string_toStreamIndented(dsr::String& target, const U16x8& source, const dsr::ReadableString& indentation) {
	ALIGN16 uint16_t data[8];
	source.writeAlignedUnsafe(data);
	string_append(target, indentation, "(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7], ")");
	return target;
}

inline bool operator==(const U16x8& left, const U16x8& right) {
	ALIGN16 uint16_t a[8];
	ALIGN16 uint16_t b[8];
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7];
}

inline bool operator!=(const U16x8& left, const U16x8& right) {
	return !(left == right);
}

inline U16x8 operator+(const U16x8& left, const U16x8& right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(ADD_U16_SIMD(left.v, right.v));
	#else
		return U16x8(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3], left.emulated[4] + right.emulated[4], left.emulated[5] + right.emulated[5], left.emulated[6] + right.emulated[6], left.emulated[7] + right.emulated[7]);
	#endif
}

inline U16x8 operator+(uint16_t left, const U16x8& right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(ADD_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
	#else
		return U16x8(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3], left + right.emulated[4], left + right.emulated[5], left + right.emulated[6], left + right.emulated[7]);
	#endif
}

inline U16x8 operator+(const U16x8& left, uint16_t right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(ADD_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
	#else
		return U16x8(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right, left.emulated[4] + right, left.emulated[5] + right, left.emulated[6] + right, left.emulated[7] + right);
	#endif
}

inline U16x8 operator-(const U16x8& left, const U16x8& right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(SUB_U16_SIMD(left.v, right.v));
	#else
		return U16x8(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3], left.emulated[4] - right.emulated[4], left.emulated[5] - right.emulated[5], left.emulated[6] - right.emulated[6], left.emulated[7] - right.emulated[7]);
	#endif
}

inline U16x8 operator-(uint16_t left, const U16x8& right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(SUB_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
	#else
		return U16x8(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3], left - right.emulated[4], left - right.emulated[5], left - right.emulated[6], left - right.emulated[7]);
	#endif
}

inline U16x8 operator-(const U16x8& left, uint16_t right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(SUB_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
	#else
		return U16x8(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right, left.emulated[4] - right, left.emulated[5] - right, left.emulated[6] - right, left.emulated[7] - right);
	#endif
}

inline U16x8 operator*(const U16x8& left, const U16x8& right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(MUL_U16_SIMD(left.v, right.v));
	#else
		return U16x8(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3], left.emulated[4] * right.emulated[4], left.emulated[5] * right.emulated[5], left.emulated[6] * right.emulated[6], left.emulated[7] * right.emulated[7]);
	#endif
}

inline U16x8 operator*(uint16_t left, const U16x8& right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(MUL_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
	#else
		return U16x8(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3], left * right.emulated[4], left * right.emulated[5], left * right.emulated[6], left * right.emulated[7]);
	#endif
}

inline U16x8 operator*(const U16x8& left, uint16_t right) {
	#ifdef USE_BASIC_SIMD
		return U16x8(MUL_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
	#else
		return U16x8(
			left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right,
			left.emulated[4] * right, left.emulated[5] * right, left.emulated[6] * right, left.emulated[7] * right
		);
	#endif
}
union U8x16 {
	#ifdef USE_BASIC_SIMD
	public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			uint8_t shared_memory[16];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_U8x16 v;
		// Construct a portable vector from a native SIMD vector
		explicit U8x16(const SIMD_U8x16& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
		: v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		uint8_t emulated[16];
		// Construct a portable vector from a set of scalars
		U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
			this->emulated[4] = a5;
			this->emulated[5] = a6;
			this->emulated[6] = a7;
			this->emulated[7] = a8;
			this->emulated[8] = a9;
			this->emulated[9] = a10;
			this->emulated[10] = a11;
			this->emulated[11] = a12;
			this->emulated[12] = a13;
			this->emulated[13] = a14;
			this->emulated[14] = a15;
			this->emulated[15] = a16;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit U8x16(uint8_t scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
			this->emulated[4] = scalar;
			this->emulated[5] = scalar;
			this->emulated[6] = scalar;
			this->emulated[7] = scalar;
			this->emulated[8] = scalar;
			this->emulated[9] = scalar;
			this->emulated[10] = scalar;
			this->emulated[11] = scalar;
			this->emulated[12] = scalar;
			this->emulated[13] = scalar;
			this->emulated[14] = scalar;
			this->emulated[15] = scalar;
		}
	#endif
	static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return U8x16(_mm_load_si128((const __m128i*)data));
			#elif defined(USE_NEON)
				return U8x16(vld1q_u8(data));
			#endif
		#else
			return U8x16(
				data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
				data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
			);
		#endif
	}
	// data must be aligned to 16 bytes
	inline void writeAlignedUnsafe(uint8_t* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_si128((__m128i*)data, this->v);
			#elif defined(USE_NEON)
				vst1q_u8(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
			data[4] = this->emulated[4];
			data[5] = this->emulated[5];
			data[6] = this->emulated[6];
			data[7] = this->emulated[7];
			data[8] = this->emulated[8];
			data[9] = this->emulated[9];
			data[10] = this->emulated[10];
			data[11] = this->emulated[11];
			data[12] = this->emulated[12];
			data[13] = this->emulated[13];
			data[14] = this->emulated[14];
			data[15] = this->emulated[15];
		#endif
	}
	// Bound and alignment checked reading
	static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
		const uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return U8x16::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
		uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};

inline dsr::String& string_toStreamIndented(dsr::String& target, const U8x16& source, const dsr::ReadableString& indentation) {
	ALIGN16 uint8_t data[16];
	source.writeAlignedUnsafe(data);
	string_append(target, indentation, "(",
		data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ",
		data[4], ", ", data[5], ", ", data[6], ", ", data[7], ", ",
		data[8], ", ", data[9], ", ", data[10], ", ", data[11], ", ",
		data[12], ", ", data[13], ", ", data[14], ", ", data[15], ")"
	);
	return target;
}
inline bool operator==(const U8x16& left, const U8x16& right) {
	ALIGN16 uint8_t a[16];
	ALIGN16 uint8_t b[16];
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3]
	    && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7]
	    && a[8] == b[8] && a[9] == b[9] && a[10] == b[10] && a[11] == b[11]
	    && a[12] == b[12] && a[13] == b[13] && a[14] == b[14] && a[15] == b[15];
}

inline bool operator!=(const U8x16& left, const U8x16& right) {
	return !(left == right);
}

inline U8x16 operator+(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3],
			left.emulated[4] + right.emulated[4], left.emulated[5] + right.emulated[5], left.emulated[6] + right.emulated[6], left.emulated[7] + right.emulated[7],
			left.emulated[8] + right.emulated[8], left.emulated[9] + right.emulated[9], left.emulated[10] + right.emulated[10], left.emulated[11] + right.emulated[11],
			left.emulated[12] + right.emulated[12], left.emulated[13] + right.emulated[13], left.emulated[14] + right.emulated[14], left.emulated[15] + right.emulated[15]
		);
	#endif
}

inline U8x16 operator+(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
		return U8x16(
			left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3],
			left + right.emulated[4], left + right.emulated[5], left + right.emulated[6], left + right.emulated[7],
			left + right.emulated[8], left + right.emulated[9], left + right.emulated[10], left + right.emulated[11],
			left + right.emulated[12], left + right.emulated[13], left + right.emulated[14], left + right.emulated[15]
		);
	#endif
}

inline U8x16 operator+(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
		return U8x16(
			left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right,
			left.emulated[4] + right, left.emulated[5] + right, left.emulated[6] + right, left.emulated[7] + right,
			left.emulated[8] + right, left.emulated[9] + right, left.emulated[10] + right, left.emulated[11] + right,
			left.emulated[12] + right, left.emulated[13] + right, left.emulated[14] + right, left.emulated[15] + right
		);
	#endif
}

inline U8x16 operator-(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3],
			left.emulated[4] - right.emulated[4], left.emulated[5] - right.emulated[5], left.emulated[6] - right.emulated[6], left.emulated[7] - right.emulated[7],
			left.emulated[8] - right.emulated[8], left.emulated[9] - right.emulated[9], left.emulated[10] - right.emulated[10], left.emulated[11] - right.emulated[11],
			left.emulated[12] - right.emulated[12], left.emulated[13] - right.emulated[13], left.emulated[14] - right.emulated[14], left.emulated[15] - right.emulated[15]
		);
	#endif
}

inline U8x16 operator-(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
		return U8x16(
			left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3],
			left - right.emulated[4], left - right.emulated[5], left - right.emulated[6], left - right.emulated[7],
			left - right.emulated[8], left - right.emulated[9], left - right.emulated[10], left - right.emulated[11],
			left - right.emulated[12], left - right.emulated[13], left - right.emulated[14], left - right.emulated[15]
		);
	#endif
}
inline U8x16 operator-(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
		return U8x16(
			left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right,
			left.emulated[4] - right, left.emulated[5] - right, left.emulated[6] - right, left.emulated[7] - right,
			left.emulated[8] - right, left.emulated[9] - right, left.emulated[10] - right, left.emulated[11] - right,
			left.emulated[12] - right, left.emulated[13] - right, left.emulated[14] - right, left.emulated[15] - right
		);
	#endif
}

inline uint8_t saturateToU8(uint32_t x) {
	// No need to check lower bound for unsigned input
	return x > 255 ? 255 : x;
}

inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			saturateToU8((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
			saturateToU8((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
			saturateToU8((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
			saturateToU8((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
			saturateToU8((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
			saturateToU8((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
			saturateToU8((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
			saturateToU8((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
			saturateToU8((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
			saturateToU8((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
			saturateToU8((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
			saturateToU8((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
			saturateToU8((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
			saturateToU8((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
			saturateToU8((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
			saturateToU8((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
		);
	#endif
}

inline I32x4 truncateToI32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return I32x4(F32_TO_I32_SIMD(vector.v));
	#else
		return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}

inline U32x4 truncateToU32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(F32_TO_U32_SIMD(vector.v));
	#else
		return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}

inline F32x4 floatFromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return F32x4(I32_TO_F32_SIMD(vector.v));
	#else
		return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}

inline F32x4 floatFromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return F32x4(U32_TO_F32_SIMD(vector.v));
	#else
		return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}

inline I32x4 I32FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
	#else
		return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 U32FromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
	#else
		return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}

// Warning! Behaviour depends on endianness.
inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
	#else
		const uint8_t *source = (const uint8_t*)vector.emulated;
		return U8x16(
			source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
			source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
		);
	#endif
}

// Warning! Behaviour depends on endianness.
inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
	#else
		const uint32_t *source = (const uint32_t*)vector.emulated;
		return U32x4(source[0], source[1], source[2], source[3]);
	#endif
}

// Unpacking to larger integers
inline U32x4 lowerToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
	#else
		return U32x4(vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3]);
	#endif
}

inline U32x4 higherToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
	#else
		return U32x4(vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]);
	#endif
}

inline U16x8 lowerToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
	#else
		return U16x8(
			vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3],
			vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]
		);
	#endif
}

inline U16x8 higherToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
	#else
		return U16x8(
			vector.emulated[8], vector.emulated[9], vector.emulated[10], vector.emulated[11],
			vector.emulated[12], vector.emulated[13], vector.emulated[14], vector.emulated[15]
		);
	#endif
}

// Saturated packing
inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
	#ifdef USE_BASIC_SIMD
		return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
	#else
		return U8x16(
			saturateToU8(lower.emulated[0]), saturateToU8(lower.emulated[1]), saturateToU8(lower.emulated[2]), saturateToU8(lower.emulated[3]),
			saturateToU8(lower.emulated[4]), saturateToU8(lower.emulated[5]), saturateToU8(lower.emulated[6]), saturateToU8(lower.emulated[7]),
			saturateToU8(upper.emulated[0]), saturateToU8(upper.emulated[1]), saturateToU8(upper.emulated[2]), saturateToU8(upper.emulated[3]),
			saturateToU8(upper.emulated[4]), saturateToU8(upper.emulated[5]), saturateToU8(upper.emulated[6]), saturateToU8(upper.emulated[7])
		);
	#endif
}
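// Usage sketch (illustrative only): doubling the brightness of 16 packed 8-bit
// channels by widening to two U16x8 halves, multiplying in 16 bits where the
// products still fit, and packing back down with saturation:
//
//   U16x8 low = lowerToU16(pixels) * (uint16_t)2;
//   U16x8 high = higherToU16(pixels) * (uint16_t)2;
//   U8x16 brightened = saturateToU8(low, high);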
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#ifdef USE_BASIC_SIMD
	#ifdef USE_SSE2
		#ifdef USE_SSSE3
			#define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
		#else
			// If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
			static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
				ALIGN16 uint8_t vectorBuffer[32];
				_mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
				_mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
				return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
			}
		#endif
		#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
		#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
		#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
		// Cast explicitly between the float and integer vector types, so that the expression compiles on all SSE2 compilers.
		#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(_mm_castsi128_ps(_MM_ALIGNR_EPI8(_mm_castps_si128(b.v), _mm_castps_si128(a.v), OFFSET * 4)));
	#elif defined(USE_NEON)
		#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
	#endif
#else
	#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#endif
// Vector extraction concatenates two input vectors and reads one whole vector from the concatenation at the given element offset.
// The first and last offsets simply return one of the inputs; they exist for readability and are inlined away by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
inline U8x16 vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
inline U8x16 vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0])) }
inline U8x16 vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1])) }
inline U8x16 vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2])) }
inline U8x16 vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
inline U8x16 vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
inline U8x16 vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
inline U8x16 vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
inline U8x16 vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7])) }
inline U8x16 vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8])) }
inline U8x16 vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9])) }
inline U8x16 vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10])) }
inline U8x16 vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11])) }
inline U8x16 vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12])) }
inline U8x16 vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13])) }
inline U8x16 vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13], b.emulated[14])) }
inline U8x16 vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
inline U16x8 vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
inline U16x8 vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0])) }
inline U16x8 vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1])) }
inline U16x8 vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2])) }
inline U16x8 vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
inline U16x8 vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
inline U16x8 vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
inline U16x8 vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
inline U16x8 vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
inline U32x4 vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
inline U32x4 vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
inline U32x4 vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
inline U32x4 vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
inline U32x4 vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
inline I32x4 vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
inline I32x4 vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
inline I32x4 vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
inline I32x4 vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
inline I32x4 vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
inline F32x4 vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
inline F32x4 vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
inline F32x4 vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
inline F32x4 vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
inline F32x4 vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
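// Usage sketch for vector extraction: a three-tap sum over a row of 8-bit pixels,
// where leftBlock, centerBlock, and rightBlock are three adjacent U8x16 blocks
// from the same row. The function and argument names are illustrative only.
inline U8x16 exampleThreeTapSum(const U8x16 &leftBlock, const U8x16 &centerBlock, const U8x16 &rightBlock) {
	// For each lane of centerBlock, fetch the neighbor one element to the left and to the right.
	U8x16 shiftedLeft = vectorExtract_15(leftBlock, centerBlock);
	U8x16 shiftedRight = vectorExtract_1(centerBlock, rightBlock);
	// Sum with saturation so bright regions clamp at 255 instead of wrapping around.
	return saturatedAddition(saturatedAddition(shiftedLeft, centerBlock), shiftedRight);
}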
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
#ifdef USE_AVX2
	#define GATHER_I32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
	#define GATHER_U32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
	#define GATHER_F32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
#endif
static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
		return U32x4(GATHER_U32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
		dsr::UVector4D elementOffsetS = elementOffset.get();
		return U32x4(*(data + elementOffsetS.x), *(data + elementOffsetS.y), *(data + elementOffsetS.z), *(data + elementOffsetS.w));
	#endif
}
static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
		return I32x4(GATHER_I32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
		dsr::UVector4D elementOffsetS = elementOffset.get();
		return I32x4(*(data + elementOffsetS.x), *(data + elementOffsetS.y), *(data + elementOffsetS.z), *(data + elementOffsetS.w));
	#endif
}
static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
		return F32x4(GATHER_F32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
		dsr::UVector4D elementOffsetS = elementOffset.get();
		return F32x4(*(data + elementOffsetS.x), *(data + elementOffsetS.y), *(data + elementOffsetS.z), *(data + elementOffsetS.w));
	#endif
}
#endif
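// Usage sketch for gather (illustrative only; lookupTable and indices are
// hypothetical names, and the buffer is assumed to hold at least seven
// elements). With AVX2 enabled this should compile into a single gather
// instruction, and otherwise into four scalar loads:
//     dsr::SafePointer<uint32_t> lookupTable = ...; // at least 4-byte aligned
//     U32x4 indices(0u, 2u, 4u, 6u);                // element offsets, not byte offsets
//     U32x4 values = gather(lookupTable, indices);  // lane i = lookupTable[indices[i]]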