2 years ago · d54ee7027e
--- a/Source/DFPSR/base/simd3D.h
+++ b/Source/DFPSR/base/simd3D.h
@@ -1,6 +1,6 @@
 
				 // zlib open source license

			
 
				 //

			
 
				-// Copyright (c) 2017 to 2022 David Forsgren Piuva

			
 
				+// Copyright (c) 2017 to 2023 David Forsgren Piuva

			
 
				 // 

			
 
				 // This software is provided 'as-is', without any express or implied

			
 
				 // warranty. In no event will the authors be held liable for any damages

			
@@ -31,157 +31,247 @@
 
				 #ifndef DFPSR_SIMD_3D

			
 
				 #define DFPSR_SIMD_3D

			
 
				 

			
 
				-// 3D vector in xxxxyyyyzzzz format

			
 
				-struct F32x4x3 {

			
 
				-	F32x4 v1, v2, v3;

			
 
				-	// Direct constructor given 3 rows of length 4

			
 
				-	F32x4x3(const F32x4& v1, const F32x4& v2, const F32x4& v3)

			
 
				-	: v1(v1), v2(v2), v3(v3) {}

			
 
				-	// Transposed constructor given 4 columns of length 3

			
 
				-	F32x4x3(const dsr::FVector3D& vx, const dsr::FVector3D& vy, const dsr::FVector3D& vz, const dsr::FVector3D& vw)

			
 
				-	: v1(F32x4(vx.x, vy.x, vz.x, vw.x)),

			
 
				-	  v2(F32x4(vx.y, vy.y, vz.y, vw.y)),

			
 
				-	  v3(F32x4(vx.z, vy.z, vz.z, vw.z)) {}

			
 
				-	// Transposed constructor given a single repeated column

			
 
				-	F32x4x3(const dsr::FVector3D& v)

			
 
				-	: v1(F32x4(v.x, v.x, v.x, v.x)),

			
 
				-	  v2(F32x4(v.y, v.y, v.y, v.y)),

			
 
				-	  v3(F32x4(v.z, v.z, v.z, v.z)) {}

			
 
				-	// In-place math operations

			
 
				-	inline F32x4x3& operator+=(const F32x4x3& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; this->v3 = this->v3 + offset.v3; return *this; }

			
 
				-	inline F32x4x3& operator-=(const F32x4x3& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; this->v3 = this->v3 - offset.v3; return *this; }

			
 
				-	inline F32x4x3& operator*=(const F32x4x3& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; this->v3 = this->v3 * scale.v3; return *this; }

			
 
				-	inline F32x4x3& operator+=(const F32x4& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; }

			
 
				-	inline F32x4x3& operator-=(const F32x4& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; }

			
 
				-	inline F32x4x3& operator*=(const F32x4& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }

			
 
				-	inline F32x4x3& operator+=(const float& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; }

			
 
				-	inline F32x4x3& operator-=(const float& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; }

			
 
				-	inline F32x4x3& operator*=(const float& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }

			
 
				-};

			
 
				-

			
 
				-inline F32x4x3 operator+(const F32x4x3 &left, const F32x4x3 &right) {

			
 
				-	return F32x4x3(left.v1 + right.v1, left.v2 + right.v2, left.v3 + right.v3);

			
 
				-}

			
 
				-inline F32x4x3 operator+(const F32x4x3 &left, const F32x4 &right) {

			
 
				-	return F32x4x3(left.v1 + right, left.v2 + right, left.v3 + right);

			
 
				-}

			
 
				-inline F32x4x3 operator+(const F32x4x3 &left, const float &right) {

			
 
				-	return F32x4x3(left.v1 + right, left.v2 + right, left.v3 + right);

			
 
				+// These are the infix operations for 2D SIMD vectors F32x4x2, F32x8x2...

			
 
				+#define SIMD_VECTOR_INFIX_OPERATORS_2D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

			
 
				+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 + right.v1, left.v2 + right.v2); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 + right, left.v2 + right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 + right, left.v2 + right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 - right.v1, left.v2 - right.v2); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 - right, left.v2 - right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 - right, left.v2 - right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE& value) { \

			
 
				+	return VECTOR_TYPE(-value.v1, -value.v2); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 * right.v1, left.v2 * right.v2); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 * right, left.v2 * right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 * right, left.v2 * right); \

			
 
				+} \

			
 
				+inline SIMD_TYPE dotProduct(const VECTOR_TYPE &a, const VECTOR_TYPE &b) { \

			
 
				+	return (a.v1 * b.v1) + (a.v2 * b.v2); \

			
 
				+} \

			
 
				+inline SIMD_TYPE squareLength(const VECTOR_TYPE &v) { \

			
 
				+	return dotProduct(v, v); \

			
 
				+} \

			
 
				+inline SIMD_TYPE length(const VECTOR_TYPE &v) { \

			
 
				+	return squareLength(v).squareRoot(); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE normalize(const VECTOR_TYPE &v) { \

			
 
				+	return v * squareLength(v).reciprocalSquareRoot(); \

			
 
				 }

			
 
				 

			
 
				-inline F32x4x3 operator-(const F32x4x3 &left, const F32x4x3 &right) {

			
 
				-	return F32x4x3(left.v1 - right.v1, left.v2 - right.v2, left.v3 - right.v3);

			
 
				-}

			
 
				-inline F32x4x3 operator-(const F32x4x3 &left, const F32x4 &right) {

			
 
				-	return F32x4x3(left.v1 - right, left.v2 - right, left.v3 - right);

			
 
				-}

			
 
				-inline F32x4x3 operator-(const F32x4x3 &left, const float &right) {

			
 
				-	return F32x4x3(left.v1 - right, left.v2 - right, left.v3 - right);

			
 
				-}

			
 
				-inline F32x4x3 operator-(const F32x4x3& value) {

			
 
				-	return F32x4x3(-value.v1, -value.v2, -value.v3);

			
 
				+// These are the infix operations for 3D SIMD vectors F32x4x3, F32x8x3...

			
 
				+#define SIMD_VECTOR_INFIX_OPERATORS_3D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

			
 
				+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 + right.v1, left.v2 + right.v2, left.v3 + right.v3); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 + right, left.v2 + right, left.v3 + right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 + right, left.v2 + right, left.v3 + right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 - right.v1, left.v2 - right.v2, left.v3 - right.v3); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 - right, left.v2 - right, left.v3 - right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 - right, left.v2 - right, left.v3 - right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator-(const VECTOR_TYPE& value) { \

			
 
				+	return VECTOR_TYPE(-value.v1, -value.v2, -value.v3); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 * right.v1, left.v2 * right.v2, left.v3 * right.v3); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 * right, left.v2 * right, left.v3 * right); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

			
 
				+	return VECTOR_TYPE(left.v1 * right, left.v2 * right, left.v3 * right); \

			
 
				+} \

			
 
				+inline SIMD_TYPE dotProduct(const VECTOR_TYPE &a, const VECTOR_TYPE &b) { \

			
 
				+	return (a.v1 * b.v1) + (a.v2 * b.v2) + (a.v3 * b.v3); \

			
 
				+} \

			
 
				+inline SIMD_TYPE squareLength(const VECTOR_TYPE &v) { \

			
 
				+	return dotProduct(v, v); \

			
 
				+} \

			
 
				+inline SIMD_TYPE length(const VECTOR_TYPE &v) { \

			
 
				+	return squareLength(v).squareRoot(); \

			
 
				+} \

			
 
				+inline VECTOR_TYPE normalize(const VECTOR_TYPE &v) { \

			
 
				+	return v * squareLength(v).reciprocalSquareRoot(); \

			
 
				 }

			
 
				 

			
 
				-inline F32x4x3 operator*(const F32x4x3 &left, const F32x4x3 &right) {

			
 
				-	return F32x4x3(left.v1 * right.v1, left.v2 * right.v2, left.v3 * right.v3);

			
 
				-}

			
 
				-inline F32x4x3 operator*(const F32x4x3 &left, const F32x4 &right) {

			
 
				-	return F32x4x3(left.v1 * right, left.v2 * right, left.v3 * right);

			
 
				-}

			
 
				-inline F32x4x3 operator*(const F32x4x3 &left, const float &right) {

			
 
				-	return F32x4x3(left.v1 * right, left.v2 * right, left.v3 * right);

			
 
				-}

			
 
				-

			
 
				-inline F32x4 dotProduct(const F32x4x3 &a, const F32x4x3 &b) {

			
 
				-	return (a.v1 * b.v1) + (a.v2 * b.v2) + (a.v3 * b.v3);

			
 
				-}

			
 
				-

			
 
				-inline F32x4 squareLength(const F32x4x3 &v) {

			
 
				-	return dotProduct(v, v);

			
 
				-}

			
 
				-

			
 
				-inline F32x4 length(const F32x4x3 &v) {

			
 
				-	return squareLength(v).squareRoot();

			
 
				-}

			
 
				+// These are the available in-place operations for 2D SIMD vectors F32x4x2, F32x8x2...

			
 
				+#define SIMD_VECTOR_MEMBER_OPERATORS_2D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

			
 
				+	inline VECTOR_TYPE& operator+=(const VECTOR_TYPE& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator-=(const VECTOR_TYPE& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator*=(const VECTOR_TYPE& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator+=(const SIMD_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator-=(const SIMD_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator*=(const SIMD_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator+=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator-=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator*=(const ELEMENT_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }

			
 
				 

			
 
				-inline F32x4x3 normalize(const F32x4x3 &v) {

			
 
				-	return v * squareLength(v).reciprocalSquareRoot();

			
 
				-}

			
 
				+// These are the available in-place operations for 3D SIMD vectors F32x4x3, F32x8x3...

			
 
				+#define SIMD_VECTOR_MEMBER_OPERATORS_3D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

			
 
				+	inline VECTOR_TYPE& operator+=(const VECTOR_TYPE& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; this->v3 = this->v3 + offset.v3; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator-=(const VECTOR_TYPE& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; this->v3 = this->v3 - offset.v3; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator*=(const VECTOR_TYPE& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; this->v3 = this->v3 * scale.v3; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator+=(const SIMD_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator-=(const SIMD_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator*=(const SIMD_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator+=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator-=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; } \

			
 
				+	inline VECTOR_TYPE& operator*=(const ELEMENT_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }

			
 
				 

			
 
				-// 2D vector in xxxxyyyy format

			
 
				+// 128x2-bit SIMD vectorized 2D math vector stored in xxxxyyyy format (one planar SIMD vector per dimension).

			
 
				 struct F32x4x2 {

			
 
				 	F32x4 v1, v2;

			
 
				 	// Direct constructor given 3 rows of length 4

			
 
				 	F32x4x2(const F32x4& v1, const F32x4& v2)

			
 
				 	: v1(v1), v2(v2) {}

			
 
				-	// Transposed constructor given 4 columns of length 3

			
 
				-	F32x4x2(const dsr::FVector2D& vx, const dsr::FVector2D& vy, const dsr::FVector2D& vz, const dsr::FVector2D& vw)

			
 
				-	: v1(F32x4(vx.x, vy.x, vz.x, vw.x)),

			
 
				-	  v2(F32x4(vx.y, vy.y, vz.y, vw.y)) {}

			
 
				+	// Gradient constructor from an initial vector and the increment for each element.

			
 
				+	static F32x4x2 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

			
 
				+		return F32x4x2(

			
 
				+		  F32x4::createGradient(start.x, increment.x),

			
 
				+		  F32x4::createGradient(start.y, increment.y)

			
 
				+		);

			
 
				+	}

			
 
				+	// Transposed constructor given 4 columns of length 2 (Only allowed for fixed size SIMD, not X or F vector lengths)

			
 
				+	F32x4x2(const dsr::FVector2D& a, const dsr::FVector2D& b, const dsr::FVector2D& c, const dsr::FVector2D& d)

			
 
				+	: v1(a.x, b.x, c.x, d.x),

			
 
				+	  v2(a.y, b.y, c.y, d.y) {}

			
 
				 	// Transposed constructor given a single repeated column

			
 
				 	F32x4x2(const dsr::FVector2D& v)

			
 
				-	: v1(F32x4(v.x, v.x, v.x, v.x)),

			
 
				-	  v2(F32x4(v.y, v.y, v.y, v.y)) {}

			
 
				+	: v1(F32x4(v.x)),

			
 
				+	  v2(F32x4(v.y)) {}

			
 
				 	// In-place math operations

			
 
				-	inline F32x4x2& operator+=(const F32x4x2& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; return *this; }

			
 
				-	inline F32x4x2& operator-=(const F32x4x2& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; return *this; }

			
 
				-	inline F32x4x2& operator*=(const F32x4x2& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; return *this; }

			
 
				-	inline F32x4x2& operator+=(const F32x4& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; }

			
 
				-	inline F32x4x2& operator-=(const F32x4& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; }

			
 
				-	inline F32x4x2& operator*=(const F32x4& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }

			
 
				-	inline F32x4x2& operator+=(const float& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; }

			
 
				-	inline F32x4x2& operator-=(const float& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; }

			
 
				-	inline F32x4x2& operator*=(const float& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }

			
 
				+	SIMD_VECTOR_MEMBER_OPERATORS_2D(F32x4x2, F32x4, float)

			
 
				 };

			
 
				+SIMD_VECTOR_INFIX_OPERATORS_2D(F32x4x2, F32x4, float)

			
 
				 

			
 
				-inline F32x4x2 operator+(const F32x4x2 &left, const F32x4x2 &right) {

			
 
				-	return F32x4x2(left.v1 + right.v1, left.v2 + right.v2);

			
 
				-}

			
 
				-inline F32x4x2 operator+(const F32x4x2 &left, const F32x4 &right) {

			
 
				-	return F32x4x2(left.v1 + right, left.v2 + right);

			
 
				-}

			
 
				-inline F32x4x2 operator+(const F32x4x2 &left, const float &right) {

			
 
				-	return F32x4x2(left.v1 + right, left.v2 + right);

			
 
				-}

			
 
				-

			
 
				-inline F32x4x2 operator-(const F32x4x2 &left, const F32x4x2 &right) {

			
 
				-	return F32x4x2(left.v1 - right.v1, left.v2 - right.v2);

			
 
				-}

			
 
				-inline F32x4x2 operator-(const F32x4x2 &left, const F32x4 &right) {

			
 
				-	return F32x4x2(left.v1 - right, left.v2 - right);

			
 
				-}

			
 
				-inline F32x4x2 operator-(const F32x4x2 &left, const float &right) {

			
 
				-	return F32x4x2(left.v1 - right, left.v2 - right);

			
 
				-}

			
 
				-inline F32x4x2 operator-(const F32x4x2& value) {

			
 
				-	return F32x4x2(-value.v1, -value.v2);

			
 
				-}

			
 
				-

			
 
				-inline F32x4x2 operator*(const F32x4x2 &left, const F32x4x2 &right) {

			
 
				-	return F32x4x2(left.v1 * right.v1, left.v2 * right.v2);

			
 
				-}

			
 
				-inline F32x4x2 operator*(const F32x4x2 &left, const F32x4 &right) {

			
 
				-	return F32x4x2(left.v1 * right, left.v2 * right);

			
 
				-}

			
 
				-inline F32x4x2 operator*(const F32x4x2 &left, const float &right) {

			
 
				-	return F32x4x2(left.v1 * right, left.v2 * right);

			
 
				-}

			
 
				+// 256x2-bit SIMD vectorized 2D math vector stored in xxxxxxxxyyyyyyyy format (one planar SIMD vector per dimension).

			
 
				+struct F32x8x2 {

			
 
				+	F32x8 v1, v2;

			
 
				+	// Direct constructor given 2 rows of length 8

			
 
				+	F32x8x2(const F32x8& v1, const F32x8& v2)

			
 
				+	: v1(v1), v2(v2) {}

			
 
				+	// Gradient constructor from an initial vector and the increment for each element.

			
 
				+	static F32x8x2 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

			
 
				+		return F32x8x2(

			
 
				+		  F32x8::createGradient(start.x, increment.x),

			
 
				+		  F32x8::createGradient(start.y, increment.y)

			
 
				+		);

			
 
				+	}

			
 
				+	// Transposed constructor given 8 columns of length 2 (Only allowed for fixed size SIMD, not X or F vector lengths)

			
 
				+	F32x8x2(const dsr::FVector2D& a, const dsr::FVector2D& b, const dsr::FVector2D& c, const dsr::FVector2D& d, const dsr::FVector2D& e, const dsr::FVector2D& f, const dsr::FVector2D& g, const dsr::FVector2D& h)

			
 
				+	: v1(a.x, b.x, c.x, d.x, e.x, f.x, g.x, h.x),

			
 
				+	  v2(a.y, b.y, c.y, d.y, e.y, f.y, g.y, h.y) {}

			
 
				+	// Transposed constructor given a single repeated column

			
 
				+	F32x8x2(const dsr::FVector2D& v)

			
 
				+	: v1(F32x8(v.x)),

			
 
				+	  v2(F32x8(v.y)) {}

			
 
				+	// In-place math operations

			
 
				+	SIMD_VECTOR_MEMBER_OPERATORS_2D(F32x8x2, F32x8, float)

			
 
				+};

			
 
				+SIMD_VECTOR_INFIX_OPERATORS_2D(F32x8x2, F32x8, float)

			
 
				 

			
 
				-inline F32x4 dotProduct(const F32x4x2 &a, const F32x4x2 &b) {

			
 
				-	return (a.v1 * b.v1) + (a.v2 * b.v2);

			
 
				-}

			
 
				+// 128x3-bit SIMD vectorized 3D math vector stored in xxxxyyyyzzzz format (one planar SIMD vector per dimension).

			
 
				+struct F32x4x3 {

			
 
				+	F32x4 v1, v2, v3;

			
 
				+	// Direct constructor given 3 rows of length 4

			
 
				+	F32x4x3(const F32x4& v1, const F32x4& v2, const F32x4& v3)

			
 
				+	: v1(v1), v2(v2), v3(v3) {}

			
 
				+	// Gradient constructor from an initial vector and the increment for each element.

			
 
				+	static F32x4x3 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

			
 
				+		return F32x4x3(

			
 
				+		  F32x4::createGradient(start.x, increment.x),

			
 
				+		  F32x4::createGradient(start.y, increment.y),

			
 
				+		  F32x4::createGradient(start.z, increment.z)

			
 
				+		);

			
 
				+	}

			
 
				+	// Transposed constructor given 4 columns of length 3

			
 
				+	F32x4x3(const dsr::FVector3D& a, const dsr::FVector3D& b, const dsr::FVector3D& c, const dsr::FVector3D& d)

			
 
				+	: v1(a.x, b.x, c.x, d.x),

			
 
				+	  v2(a.y, b.y, c.y, d.y),

			
 
				+	  v3(a.z, b.z, c.z, d.z) {}

			
 
				+	// Transposed constructor given a single repeated column

			
 
				+	F32x4x3(const dsr::FVector3D& v)

			
 
				+	: v1(F32x4(v.x)),

			
 
				+	  v2(F32x4(v.y)),

			
 
				+	  v3(F32x4(v.z)) {}

			
 
				+	// In-place math operations

			
 
				+	SIMD_VECTOR_MEMBER_OPERATORS_3D(F32x4x3, F32x4, float)

			
 
				+};

			
 
				+SIMD_VECTOR_INFIX_OPERATORS_3D(F32x4x3, F32x4, float)

			
 
				 

			
 
				-inline F32x4 squareLength(const F32x4x2 &v) {

			
 
				-	return dotProduct(v, v);

			
 
				-}

			
 
				+// 256x3-bit SIMD vectorized 3D math vector stored in xxxxxxxxyyyyyyyyzzzzzzzz format (one planar SIMD vector per dimension).

			
 
				+struct F32x8x3 {

			
 
				+	F32x8 v1, v2, v3;

			
 
				+	// Direct constructor given 3 rows of length 8

			
 
				+	F32x8x3(const F32x8& v1, const F32x8& v2, const F32x8& v3)

			
 
				+	: v1(v1), v2(v2), v3(v3) {}

			
 
				+	// Gradient constructor from an initial vector and the increment for each element.

			
 
				+	static F32x8x3 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

			
 
				+		return F32x8x3(

			
 
				+		  F32x8::createGradient(start.x, increment.x),

			
 
				+		  F32x8::createGradient(start.y, increment.y),

			
 
				+		  F32x8::createGradient(start.z, increment.z)

			
 
				+		);

			
 
				+	}

			
 
				+	// Transposed constructor given 8 columns of length 3

			
 
				+	F32x8x3(const dsr::FVector3D& a, const dsr::FVector3D& b, const dsr::FVector3D& c, const dsr::FVector3D& d, const dsr::FVector3D& e, const dsr::FVector3D& f, const dsr::FVector3D& g, const dsr::FVector3D& h)

			
 
				+	: v1(a.x, b.x, c.x, d.x, e.x, f.x, g.x, h.x),

			
 
				+	  v2(a.y, b.y, c.y, d.y, e.y, f.y, g.y, h.y),

			
 
				+	  v3(a.z, b.z, c.z, d.z, e.z, f.z, g.z, h.z) {}

			
 
				+	// Transposed constructor given a single repeated column

			
 
				+	F32x8x3(const dsr::FVector3D& v)

			
 
				+	: v1(F32x8(v.x)),

			
 
				+	  v2(F32x8(v.y)),

			
 
				+	  v3(F32x8(v.z)) {}

			
 
				+	// In-place math operations

			
 
				+	SIMD_VECTOR_MEMBER_OPERATORS_3D(F32x8x3, F32x8, float)

			
 
				+};

			
 
				+SIMD_VECTOR_INFIX_OPERATORS_3D(F32x8x3, F32x8, float)

			
 
				 

			
 
				-inline F32x4 length(const F32x4x2 &v) {

			
 
				-	return squareLength(v).squareRoot();

			
 
				-}

			
 
				+// X vector aliases

			
 
				+#if DSR_DEFAULT_VECTOR_SIZE == 16

			
 
				+	using F32xXx3 = F32x4x3;

			
 
				+	using F32xXx2 = F32x4x2;

			
 
				+#elif DSR_DEFAULT_VECTOR_SIZE == 32

			
 
				+	using F32xXx3 = F32x8x3;

			
 
				+	using F32xXx2 = F32x8x2;

			
 
				+#endif

			
 
				 

			
 
				-inline F32x4x2 normalize(const F32x4x2 &v) {

			
 
				-	return v * squareLength(v).reciprocalSquareRoot();

			
 
				-}

			
 
				+// F vector aliases

			
 
				+#if DSR_FLOAT_VECTOR_SIZE == 16

			
 
				+	using F32xFx3 = F32x4x3;

			
 
				+	using F32xFx2 = F32x4x2;

			
 
				+#elif DSR_FLOAT_VECTOR_SIZE == 32

			
 
				+	using F32xFx3 = F32x8x3;

			
 
				+	using F32xFx2 = F32x8x2;

			
 
				+#endif

			
 
				 

			
 
				 #endif

			
 
				 

			
--- a/Source/DFPSR/image/PackOrder.h
+++ b/Source/DFPSR/image/PackOrder.h
@@ -1,6 +1,6 @@
 
				 // zlib open source license

			
 
				 //

			
 
				-// Copyright (c) 2017 to 2019 David Forsgren Piuva

			
 
				+// Copyright (c) 2017 to 2023 David Forsgren Piuva

			
 
				 // 

			
 
				 // This software is provided 'as-is', without any express or implied

			
 
				 // warranty. In no event will the authors be held liable for any damages

			
@@ -93,22 +93,26 @@ inline bool operator==(const PackOrder &left, const PackOrder &right) {
 
				 }

			
 
				 

			
 
				 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.

			
 
				-inline static U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+T packBytes(const T &s0, const T &s1, const T &s2) {

			
 
				 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16);

			
 
				 }

			
 
				 // Using a specified packing order

			
 
				-inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const PackOrder &order) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+T packBytes(const T &s0, const T &s1, const T &s2, const PackOrder &order) {

			
 
				 	return ENDIAN_POS_ADDR(s0, order.redOffset)

			
 
				 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)

			
 
				 	     | ENDIAN_POS_ADDR(s2, order.blueOffset);

			
 
				 }

			
 
				 

			
 
				 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.

			
 
				-inline static U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const U32x4 &s3) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+T packBytes(const T &s0, const T &s1, const T &s2, const T &s3) {

			
 
				 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16) | ENDIAN_POS_ADDR(s3, 24);

			
 
				 }

			
 
				 // Using a specified packing order

			
 
				-inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const U32x4 &s3, const PackOrder &order) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+T packBytes(const T &s0, const T &s1, const T &s2, const T &s3, const PackOrder &order) {

			
 
				 	return ENDIAN_POS_ADDR(s0, order.redOffset)

			
 
				 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)

			
 
				 	     | ENDIAN_POS_ADDR(s2, order.blueOffset)

			
@@ -116,7 +120,15 @@ inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const
 
				 }

			
 
				 

			
 
				 // Pack separate floats into saturated bytes

			
 
				-inline static U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4 &s2, const F32x4 &s3) {

			
 
				+inline U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4 &s2, const F32x4 &s3) {

			
 
				+	return packBytes(

			
 
				+	  truncateToU32(s0.clamp(0.1f, 255.1f)),

			
 
				+	  truncateToU32(s1.clamp(0.1f, 255.1f)),

			
 
				+	  truncateToU32(s2.clamp(0.1f, 255.1f)),

			
 
				+	  truncateToU32(s3.clamp(0.1f, 255.1f))

			
 
				+	);

			
 
				+}

			
 
				+inline U32x8 floatToSaturatedByte(const F32x8 &s0, const F32x8 &s1, const F32x8 &s2, const F32x8 &s3) {

			
 
				 	return packBytes(

			
 
				 	  truncateToU32(s0.clamp(0.1f, 255.1f)),

			
 
				 	  truncateToU32(s1.clamp(0.1f, 255.1f)),

			
@@ -134,54 +146,46 @@ inline U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4
 
				 	  order

			
 
				 	);

			
 
				 }

			
 
				-

			
 
				-inline uint32_t getRed(uint32_t color) {

			
 
				-	return color & ENDIAN32_BYTE_0;

			
 
				-}

			
 
				-inline uint32_t getRed(uint32_t color, const PackOrder &order) {

			
 
				-	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);

			
 
				-}

			
 
				-inline uint32_t getGreen(uint32_t color) {

			
 
				-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);

			
 
				-}

			
 
				-inline uint32_t getGreen(uint32_t color, const PackOrder &order) {

			
 
				-	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);

			
 
				-}

			
 
				-inline uint32_t getBlue(uint32_t color) {

			
 
				-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);

			
 
				-}

			
 
				-inline uint32_t getBlue(uint32_t color, const PackOrder &order) {

			
 
				-	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);

			
 
				-}

			
 
				-inline uint32_t getAlpha(uint32_t color) {

			
 
				-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);

			
 
				-}

			
 
				-inline uint32_t getAlpha(uint32_t color, const PackOrder &order) {

			
 
				-	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);

			
 
				+inline U32x8 floatToSaturatedByte(const F32x8 &s0, const F32x8 &s1, const F32x8 &s2, const F32x8 &s3, const PackOrder &order) {

			
 
				+	return packBytes(

			
 
				+	  truncateToU32(s0.clamp(0.1f, 255.1f)),

			
 
				+	  truncateToU32(s1.clamp(0.1f, 255.1f)),

			
 
				+	  truncateToU32(s2.clamp(0.1f, 255.1f)),

			
 
				+	  truncateToU32(s3.clamp(0.1f, 255.1f)),

			
 
				+	  order

			
 
				+	);

			
 
				 }

			
 
				 

			
 
				-inline U32x4 getRed(const U32x4 &color) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getRed(T color) {

			
 
				 	return color & ENDIAN32_BYTE_0;

			
 
				 }

			
 
				-inline U32x4 getRed(const U32x4 &color, const PackOrder &order) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getRed(T color, const PackOrder &order) {

			
 
				 	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);

			
 
				 }

			
 
				-inline U32x4 getGreen(const U32x4 &color) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getGreen(T color) {

			
 
				 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);

			
 
				 }

			
 
				-inline U32x4 getGreen(const U32x4 &color, const PackOrder &order) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getGreen(T color, const PackOrder &order) {

			
 
				 	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);

			
 
				 }

			
 
				-inline U32x4 getBlue(const U32x4 &color) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getBlue(T color) {

			
 
				 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);

			
 
				 }

			
 
				-inline U32x4 getBlue(const U32x4 &color, const PackOrder &order) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getBlue(T color, const PackOrder &order) {

			
 
				 	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);

			
 
				 }

			
 
				-inline U32x4 getAlpha(const U32x4 &color) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getAlpha(T color) {

			
 
				 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);

			
 
				 }

			
 
				-inline U32x4 getAlpha(const U32x4 &color, const PackOrder &order) {

			
 
				+template<typename T> // Accepting uint32_t, U32x4, U32x8...

			
 
				+inline T getAlpha(T color, const PackOrder &order) {

			
 
				 	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);

			
 
				 }

			
 
				 

			
--- a/Source/SDK/SpriteEngine/lightAPI.cpp
+++ b/Source/SDK/SpriteEngine/lightAPI.cpp
@@ -6,24 +6,24 @@
 
				 namespace dsr {

			
 
				 

			
 
				 // Precondition: The packed color must be in the standard RGBA order, meaning no native packing

			
 
				-inline F32x4x3 unpackRgb_U32x4_to_F32x4x3(const U32x4& color) {

			
 
				-	return F32x4x3(floatFromU32(getRed(color)), floatFromU32(getGreen(color)), floatFromU32(getBlue(color)));

			
 
				 // Converts the red, green and blue channels of every packed color in the vector to floats.

			
 
				 // The result is one F32xXx3 constructed from the red, green and blue vectors in that order; alpha is not read.

			
 
				+inline F32xXx3 unpackRgb_U32xX_to_F32xXx3(const U32xX& color) {

			
 
				+	return F32xXx3(floatFromU32(getRed(color)), floatFromU32(getGreen(color)), floatFromU32(getBlue(color)));

			
 
				 }

			
 
				 

			
 
				-static inline void setLight(SafePointer<uint8_t> lightPixel, U8x16 newlight) {

			
 
				 // Replaces the light stored at lightPixel with newlight using a single aligned vector write.

			
 
				 // The string is passed along to writeAligned, presumably to identify the access in error reports - confirm.

			
 
				+static inline void setLight(SafePointer<uint8_t> lightPixel, U8xX newlight) {

			
 
				 	newlight.writeAligned(lightPixel, "setLight: writing light");

			
 
				 }

			
 
				 

			
 
				-static inline void addLight(SafePointer<uint8_t> lightPixel, U8x16 addedlight) {

			
 
				-	U8x16 oldLight = U8x16::readAligned(lightPixel, "addLight: reading light");

			
 
				-	U8x16 newlight = saturatedAddition(oldLight, addedlight);

			
 
				 // Accumulates addedlight into the light stored at lightPixel.

			
 
				 // One aligned vector is read, combined with addedlight using saturatedAddition and written back to the same location.

			
 
				+static inline void addLight(SafePointer<uint8_t> lightPixel, U8xX addedlight) {

			
 
				+	U8xX oldLight = U8xX::readAligned(lightPixel, "addLight: reading light");

			
 
				+	U8xX newlight = saturatedAddition(oldLight, addedlight);

			
 
				 	newlight.writeAligned(lightPixel, "addLight: writing light");

			
 
				 }

			
 
				 

			
 
				 template <bool ADD_LIGHT>

			
 
				 void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lightBuffer, const OrderedImageRgbaU8& normalBuffer, const FVector3D& lightDirection, float lightIntensity, const ColorRgbI32& lightColor) {

			
 
				 	// Normals in range 0..255 - 128 have lengths of 127 and 128, so if we double the reverse light direction we'll end up near 0..255 again for colors

			
 
				-	F32x4x3 reverseLightDirection = F32x4x3(-normalize(normalToWorldSpace.transformTransposed(lightDirection)) * lightIntensity * 2.0f);

			
 
				+	F32xXx3 reverseLightDirection = F32xXx3(-normalize(normalToWorldSpace.transformTransposed(lightDirection)) * lightIntensity * 2.0f);

			
 
				 	IRect rectangleBound = image_getBound(lightBuffer);

			
 
				 	float colorR = std::max(0.0f, (float)lightColor.red / 255.0f);

			
 
				 	float colorG = std::max(0.0f, (float)lightColor.green / 255.0f);

			
@@ -37,27 +37,29 @@ void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lig
 
				 		for (int y = bound.top(); y < bound.bottom(); y++) {

			
 
				 			SafePointer<uint8_t> lightPixel = lightRow;

			
 
				 			SafePointer<uint32_t> normalPixel = normalRow;

			
 
				-			for (int x4 = bound.left(); x4 < bound.right(); x4+=4) {

			
 
				+			for (int x = bound.left(); x < bound.right(); x += laneCountX_32Bit) {

			
 
				 				// Read surface normals

			
 
				-				U32x4 normalColor = U32x4::readAligned(normalPixel, "directedLight: reading normal");

			
 
				-				F32x4x3 negativeSurfaceNormal = unpackRgb_U32x4_to_F32x4x3(normalColor) - 128.0f;

			
 
				+				U32xX normalColor = U32xX::readAligned(normalPixel, "directedLight: reading normal");

			
 
				+				// TODO: Port SIMD3D to handle arbitrary vector lengths.

			
 
				+				F32xXx3 negativeSurfaceNormal = unpackRgb_U32xX_to_F32xXx3(normalColor) - 128.0f;

			
 
				 				// Calculate light intensity

			
 
				 				//   Normalization and negation is already pre-multiplied into reverseLightDirection

			
 
				-				F32x4 intensity = dotProduct(negativeSurfaceNormal, reverseLightDirection).clampLower(0.0f);

			
 
				-				F32x4 red = intensity * colorR;

			
 
				-				F32x4 green = intensity * colorG;

			
 
				-				F32x4 blue = intensity * colorB;

			
 
				+				F32xX intensity = dotProduct(negativeSurfaceNormal, reverseLightDirection).clampLower(0.0f);

			
 
				+				F32xX red = intensity * colorR;

			
 
				+				F32xX green = intensity * colorG;

			
 
				+				F32xX blue = intensity * colorB;

			
 
				 				red = red.clampUpper(255.1f);

			
 
				 				green = green.clampUpper(255.1f);

			
 
				 				blue = blue.clampUpper(255.1f);

			
 
				-				U8x16 light = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

			
 
				+				// TODO: Let color packing handle arbitrary vector lengths.

			
 
				+				U8xX light = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

			
 
				 				if (ADD_LIGHT) {

			
 
				 					addLight(lightPixel, light);

			
 
				 				} else {

			
 
				 					setLight(lightPixel, light);

			
 
				 				}

			
 
				-				lightPixel += 16;

			
 
				-				normalPixel += 4;

			
 
				+				lightPixel += laneCountX_8Bit;

			
 
				+				normalPixel += laneCountX_32Bit;

			
 
				 			}

			
 
				 			lightRow.increaseBytes(lightStride);

			
 
				 			normalRow.increaseBytes(normalStride);

			
@@ -136,16 +138,33 @@ static float getShadowTransparency(SafePointer<float> pixelData, int32_t width,
 
				 	return reciDepth * 1.02f > shadowReciDepth ? 1.0f : 0.0f;

			
 
				 }

			
 
				 

			
 
				-static inline F32x4 getShadowTransparency(SafePointer<float> pixelData, int32_t width, float halfWidth, const F32x4x3& lightOffset) {

			
 
				-	FVector4D offsetX = lightOffset.v1.get();

			
 
				-	FVector4D offsetY = lightOffset.v2.get();

			
 
				-	FVector4D offsetZ = lightOffset.v3.get();

			
 
				-	return F32x4(

			
 
				-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.x, offsetY.x, offsetZ.x)),

			
 
				-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.y, offsetY.y, offsetZ.y)),

			
 
				-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.z, offsetY.z, offsetZ.z)),

			
 
				-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.w, offsetY.w, offsetZ.w))

			
 
				-	);

			
 
				+static inline F32xX getShadowTransparency(SafePointer<float> pixelData, int32_t width, float halfWidth, const F32xXx3& lightOffset) {

			
 
				+	// TODO: Create a way to quickly iterate over elements in a SIMD vector for interfacing with scalar operations.

			
 
				+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetX[DSR_DEFAULT_VECTOR_SIZE];

			
 
				+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetY[DSR_DEFAULT_VECTOR_SIZE];

			
 
				+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetZ[DSR_DEFAULT_VECTOR_SIZE];

			
 
				+	lightOffset.v1.writeAlignedUnsafe(offsetX);

			
 
				+	lightOffset.v2.writeAlignedUnsafe(offsetY);

			
 
				+	lightOffset.v3.writeAlignedUnsafe(offsetZ);

			
 
				+	#if DSR_DEFAULT_VECTOR_SIZE == 16

			
 
				+		return F32x4(

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[0], offsetY[0], offsetZ[0])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[1], offsetY[1], offsetZ[1])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[2], offsetY[2], offsetZ[2])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[3], offsetY[3], offsetZ[3]))

			
 
				+		);

			
 
				+	#elif DSR_DEFAULT_VECTOR_SIZE == 32

			
 
				+		return F32x8(

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[0], offsetY[0], offsetZ[0])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[1], offsetY[1], offsetZ[1])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[2], offsetY[2], offsetZ[2])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[3], offsetY[3], offsetZ[3])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[4], offsetY[4], offsetZ[4])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[5], offsetY[5], offsetZ[5])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[6], offsetY[6], offsetZ[6])),

			
 
				+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[7], offsetY[7], offsetZ[7]))

			
 
				+		);

			
 
				+	#endif

			
 
				 }

			
 
				 

			
 
				 template <bool SHADOW_CASTING>

			
@@ -154,11 +173,11 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 
				 	//   Normal-space defines the rotation for light-space

			
 
				 	FVector3D lightSpaceSourcePosition = camera.normalToWorldSpace.transformTransposed(lightPosition);

			
 
				 	// Align the rectangle with laneCountX_32Bit pixels, because whole SIMD vectors of that many pixels are read from the height, normal and light buffers

			
 
				-	IRect rectangleBound = calculateBound(camera, worldCenter, lightBuffer, lightSpaceSourcePosition, lightRadius, 4);

			
 
				+	IRect rectangleBound = calculateBound(camera, worldCenter, lightBuffer, lightSpaceSourcePosition, lightRadius, laneCountX_32Bit);

			
 
				 	if (rectangleBound.hasArea()) {

			
 
				 		// Uniform values

			
 
				 		// How much closer to your face in light-space does the pixel go per depth unit

			
 
				-		F32x4x3 inYourFaceAxis = F32x4x3(camera.screenDepthToLightSpace.zAxis);

			
 
				+		F32xXx3 inYourFaceAxis = F32xXx3(camera.screenDepthToLightSpace.zAxis);

			
 
				 		// Light color

			
 
				 		float colorR = std::max(0.0f, (float)lightColor.red * lightIntensity);

			
 
				 		float colorG = std::max(0.0f, (float)lightColor.green * lightIntensity);

			
@@ -173,14 +192,14 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 
				 			FVector3D dx = camera.screenDepthToLightSpace.xAxis;

			
 
				 			FVector3D dy = camera.screenDepthToLightSpace.yAxis;

			
 
				 			// Create the light-space offsets for the first laneCountX_32Bit pixels of the row, starting at lightBaseRow and stepping by dx per pixel

			
 
				-			F32x4x3 lightBaseRowX4 = F32x4x3(lightBaseRow, lightBaseRow + dx, lightBaseRow + dx * 2.0f, lightBaseRow + dx * 3.0f);

			
 
				+			F32xXx3 lightBaseRowX = F32xXx3::createGradient(lightBaseRow, dx);

			
 
				 			// Derivatives for moving one whole vector of laneCountX_32Bit pixels to the right in parallel

			
 
				 			//    Example with 4 lanes: (n+0, y0), (n+1, y0), (n+2, y0), (n+3, y0) -> (n+4, y0), (n+5, y0), (n+6, y0), (n+7, y0)

			
 
				-			F32x4x3 dx4 = F32x4x3(dx * 4.0f);

			
 
				+			F32xXx3 dxX = F32xXx3(dx * (float)laneCountX_32Bit);

			
 
				 			// Derivatives for moving one pixel down in parallel

			
 
				 			//    (x0, n+0), (x1, n+0), (x2, n+0), (x3, n+0)

			
 
				 			// -> (x0, n+1), (x1, n+1), (x2, n+1), (x3, n+1)

			
 
				-			F32x4x3 dy1 = F32x4x3(dy);

			
 
				+			F32xXx3 dy1 = F32xXx3(dy);

			
 
				 			// Get strides

			
 
				 			int lightStride = image_getStride(lightBuffer);

			
 
				 			int normalStride = image_getStride(normalBuffer);

			
@@ -194,56 +213,56 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 
				 			SafePointer<float> shadowCubeData;

			
 
				 			float shadowCubeCenter;

			
 
				 			if (SHADOW_CASTING) {

			
 
				-				shadowCubeWidth = image_getWidth(shadowCubeMap); assert(shadowCubeWidth % 4 == 0);

			
 
				+				shadowCubeWidth = image_getWidth(shadowCubeMap); assert(shadowCubeWidth % laneCountX_32Bit == 0);

			
 
				 				shadowCubeData = image_getSafePointer(shadowCubeMap);

			
 
				 				shadowCubeCenter = (float)shadowCubeWidth * 0.5f;

			
 
				 			}

			
 
				 			// Loop over the pixels to add light

			
 
				 			for (int y = bound.top(); y < bound.bottom(); y++) {

			
 
				 				// Initiate the leftmost pixels before iterating to the right

			
 
				-				F32x4x3 lightBasePixelx4 = lightBaseRowX4;

			
 
				+				F32xXx3 lightBasePixelxX = lightBaseRowX;

			
 
				 				SafePointer<uint8_t> lightPixel = lightRow;

			
 
				 				SafePointer<uint32_t> normalPixel = normalRow;

			
 
				 				SafePointer<float> heightPixel = heightRow;

			
 
				 				// Iterate over the pixels one SIMD vector (laneCountX_32Bit pixels) at a time

			
 
				-				for (int x4 = bound.left(); x4 < bound.right(); x4+=4) {

			
 
				+				for (int x = bound.left(); x < bound.right(); x += laneCountX_32Bit) {

			
 
				 					// Read pixel height

			
 
				-					F32x4 depthOffset = F32x4::readAligned(heightPixel, "addPointLight: reading height");

			
 
				+					F32xX depthOffset = F32xX::readAligned(heightPixel, "addPointLight: reading height");

			
 
				 					// Extrude the pixel using positive values towards the camera to represent another height

			
 
				 					//   This will solve X and Z positions based on the height Y

			
 
				-					F32x4x3 lightOffset = lightBasePixelx4 + (inYourFaceAxis * depthOffset);

			
 
				+					F32xXx3 lightOffset = lightBasePixelxX + (inYourFaceAxis * depthOffset);

			
 
				 					// Get the linear distance, divide by sphere radius and limit to length 1 at intensity 0

			
 
				-					F32x4 lightRatio = min(F32x4(1.0f), length(lightOffset) * reciprocalRadius);

			
 
				+					F32xX lightRatio = min(F32xX(1.0f), length(lightOffset) * reciprocalRadius);

			
 
				 					// Read surface normal

			
 
				-					U32x4 normalColor = U32x4::readAligned(normalPixel, "addPointLight: reading normal");

			
 
				+					U32xX normalColor = U32xX::readAligned(normalPixel, "addPointLight: reading normal");

			
 
				 					// normalScale is used to negate the normals in advance so that opposing directions get positive values

			
 
				-					F32x4x3 negativeSurfaceNormal = (unpackRgb_U32x4_to_F32x4x3(normalColor) - 128.0f) * (-1.0f / 128.0f);

			
 
				+					F32xXx3 negativeSurfaceNormal = (unpackRgb_U32xX_to_F32xXx3(normalColor) - 128.0f) * (-1.0f / 128.0f);

			
 
				 					// Fade from 0 to 1 using 1 - 2x + x²

			
 
				-					F32x4 distanceIntensity = 1.0f - 2.0f * lightRatio + lightRatio * lightRatio;

			
 
				-					F32x4 angleIntensity = max(F32x4(0.0f), dotProduct(normalize(lightOffset), negativeSurfaceNormal));

			
 
				-					F32x4 intensity = angleIntensity * distanceIntensity;

			
 
				+					F32xX distanceIntensity = 1.0f - 2.0f * lightRatio + lightRatio * lightRatio;

			
 
				+					F32xX angleIntensity = max(F32xX(0.0f), dotProduct(normalize(lightOffset), negativeSurfaceNormal));

			
 
				+					F32xX intensity = angleIntensity * distanceIntensity;

			
 
				 					if (SHADOW_CASTING) {

			
 
				 						intensity = intensity * getShadowTransparency(shadowCubeData, shadowCubeWidth, shadowCubeCenter, lightOffset);

			
 
				 					}

			
 
				 					// TODO: Make an optimized version for white light replacing red, green and blue with a single LUMA

			
 
				-					F32x4 red = intensity * colorR;

			
 
				-					F32x4 green = intensity * colorG;

			
 
				-					F32x4 blue = intensity * colorB;

			
 
				+					F32xX red = intensity * colorR;

			
 
				+					F32xX green = intensity * colorG;

			
 
				+					F32xX blue = intensity * colorB;

			
 
				 					red = red.clampUpper(255.1f);

			
 
				 					green = green.clampUpper(255.1f);

			
 
				 					blue = blue.clampUpper(255.1f);

			
 
				 					// Add light to the image

			
 
				-					U8x16 morelight = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

			
 
				+					U8xX morelight = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

			
 
				 					addLight(lightPixel, morelight);

			
 
				 					// Go to the next laneCountX_32Bit pixels in light-space

			
 
				-					lightBasePixelx4 += dx4;

			
 
				+					lightBasePixelxX += dxX;

			
 
				 					// Go to the next laneCountX_32Bit pixels of image data

			
 
				-					lightPixel += 16;

			
 
				-					normalPixel += 4;

			
 
				-					heightPixel += 4;

			
 
				+					lightPixel += laneCountX_8Bit;

			
 
				+					normalPixel += laneCountX_32Bit;

			
 
				+					heightPixel += laneCountX_32Bit;

			
 
				 				}

			
 
				 				// Go to the next row in light-space

			
 
				-				lightBaseRowX4 += dy1;

			
 
				+				lightBaseRowX += dy1;

			
 
				 				// Go to the next row of image data

			
 
				 				lightRow.increaseBytes(lightStride);

			
 
				 				normalRow.increaseBytes(normalStride);

			
@@ -276,25 +295,25 @@ void blendLight(AlignedImageRgbaU8& colorBuffer, const OrderedImageRgbaU8& diffu
 
				 		int targetStride = image_getStride(colorBuffer);

			
 
				 		int diffuseStride = image_getStride(diffuseBuffer);

			
 
				 		int lightStride = image_getStride(lightBuffer);

			
 
				-		F32x4 scale = F32x4(1.0 / 128.0f);

			
 
				+		F32xX scale = F32xX(1.0 / 128.0f);

			
 
				 		for (int y = startIndex; y < stopIndex; y++) {

			
 
				 			SafePointer<uint32_t> targetPixel = targetRow;

			
 
				 			SafePointer<uint32_t> diffusePixel = diffuseRow;

			
 
				 			SafePointer<uint32_t> lightPixel = lightRow;

			
 
				-			for (int x4 = 0; x4 < width; x4 += 4) {

			
 
				-				U32x4 diffuse = U32x4::readAligned(diffusePixel, "blendLight: reading diffuse");

			
 
				-				U32x4 light = U32x4::readAligned(lightPixel, "blendLight: reading light");

			
 
				-				F32x4 red = (floatFromU32(getRed(diffuse)) * floatFromU32(getRed(light))) * scale;

			
 
				-				F32x4 green = (floatFromU32(getGreen(diffuse)) * floatFromU32(getGreen(light))) * scale;

			
 
				-				F32x4 blue = (floatFromU32(getBlue(diffuse)) * floatFromU32(getBlue(light))) * scale;

			
 
				+			for (int x = 0; x < width; x += laneCountX_32Bit) {

			
 
				+				U32xX diffuse = U32xX::readAligned(diffusePixel, "blendLight: reading diffuse");

			
 
				+				U32xX light = U32xX::readAligned(lightPixel, "blendLight: reading light");

			
 
				+				F32xX red = (floatFromU32(getRed(diffuse)) * floatFromU32(getRed(light))) * scale;

			
 
				+				F32xX green = (floatFromU32(getGreen(diffuse)) * floatFromU32(getGreen(light))) * scale;

			
 
				+				F32xX blue = (floatFromU32(getBlue(diffuse)) * floatFromU32(getBlue(light))) * scale;

			
 
				 				red = red.clampUpper(255.1f);

			
 
				 				green = green.clampUpper(255.1f);

			
 
				 				blue = blue.clampUpper(255.1f);

			
 
				-				U32x4 color = packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue), targetOrder);

			
 
				+				U32xX color = packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue), targetOrder);

			
 
				 				color.writeAligned(targetPixel, "blendLight: writing color");

			
 
				-				targetPixel += 4;

			
 
				-				diffusePixel += 4;

			
 
				-				lightPixel += 4;

			
 
				+				targetPixel += laneCountX_32Bit;

			
 
				+				diffusePixel += laneCountX_32Bit;

			
 
				+				lightPixel += laneCountX_32Bit;

			
 
				 			}

			
 
				 			targetRow.increaseBytes(targetStride);

			
 
				 			diffuseRow.increaseBytes(diffuseStride);