2 years ago · d54ee7027e
--- a/Source/DFPSR/base/simd3D.h
+++ b/Source/DFPSR/base/simd3D.h
@@ -1,6 +1,6 @@
 
															 // zlib open source license

														
 
															 //

														
 
															-// Copyright (c) 2017 to 2022 David Forsgren Piuva

														
 
															+// Copyright (c) 2017 to 2023 David Forsgren Piuva

														
 
															 // 

														
 
															 // This software is provided 'as-is', without any express or implied

														
 
															 // warranty. In no event will the authors be held liable for any damages

														
@@ -31,157 +31,247 @@
 
															 #ifndef DFPSR_SIMD_3D

														
 
															 #define DFPSR_SIMD_3D

														
 
															-// 3D vector in xxxxyyyyzzzz format

														
 
															-struct F32x4x3 {

														
 
															-	F32x4 v1, v2, v3;

														
 
															-	// Direct constructor given 3 rows of length 4

														
 
															-	F32x4x3(const F32x4& v1, const F32x4& v2, const F32x4& v3)

														
 
															-	: v1(v1), v2(v2), v3(v3) {}

														
 
															-	// Transposed constructor given 4 columns of length 3

														
 
															-	F32x4x3(const dsr::FVector3D& vx, const dsr::FVector3D& vy, const dsr::FVector3D& vz, const dsr::FVector3D& vw)

														
 
															-	: v1(F32x4(vx.x, vy.x, vz.x, vw.x)),

														
 
															-	  v2(F32x4(vx.y, vy.y, vz.y, vw.y)),

														
 
															-	  v3(F32x4(vx.z, vy.z, vz.z, vw.z)) {}

														
 
															-	// Transposed constructor given a single repeated column

														
 
															-	F32x4x3(const dsr::FVector3D& v)

														
 
															-	: v1(F32x4(v.x, v.x, v.x, v.x)),

														
 
															-	  v2(F32x4(v.y, v.y, v.y, v.y)),

														
 
															-	  v3(F32x4(v.z, v.z, v.z, v.z)) {}

														
 
															-	// In-place math operations

														
 
															-	inline F32x4x3& operator+=(const F32x4x3& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; this->v3 = this->v3 + offset.v3; return *this; }

														
 
															-	inline F32x4x3& operator-=(const F32x4x3& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; this->v3 = this->v3 - offset.v3; return *this; }

														
 
															-	inline F32x4x3& operator*=(const F32x4x3& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; this->v3 = this->v3 * scale.v3; return *this; }

														
 
															-	inline F32x4x3& operator+=(const F32x4& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; }

														
 
															-	inline F32x4x3& operator-=(const F32x4& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; }

														
 
															-	inline F32x4x3& operator*=(const F32x4& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }

														
 
															-	inline F32x4x3& operator+=(const float& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; }

														
 
															-	inline F32x4x3& operator-=(const float& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; }

														
 
															-	inline F32x4x3& operator*=(const float& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }

														
 
															-};

														
 
															-

														
 
															-inline F32x4x3 operator+(const F32x4x3 &left, const F32x4x3 &right) {

														
 
															-	return F32x4x3(left.v1 + right.v1, left.v2 + right.v2, left.v3 + right.v3);

														
 
															-}

														
 
															-inline F32x4x3 operator+(const F32x4x3 &left, const F32x4 &right) {

														
 
															-	return F32x4x3(left.v1 + right, left.v2 + right, left.v3 + right);

														
 
															-}

														
 
															-inline F32x4x3 operator+(const F32x4x3 &left, const float &right) {

														
 
															-	return F32x4x3(left.v1 + right, left.v2 + right, left.v3 + right);

														
 
															+// These are the infix operations for 2D SIMD vectors F32x4x2, F32x8x2...

														
 
															+#define SIMD_VECTOR_INFIX_OPERATORS_2D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

														
 
															+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 + right.v1, left.v2 + right.v2); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 + right, left.v2 + right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 + right, left.v2 + right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 - right.v1, left.v2 - right.v2); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 - right, left.v2 - right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 - right, left.v2 - right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE& value) { \

														
 
															+	return VECTOR_TYPE(-value.v1, -value.v2); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 * right.v1, left.v2 * right.v2); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 * right, left.v2 * right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 * right, left.v2 * right); \

														
 
															+} \

														
 
															+inline SIMD_TYPE dotProduct(const VECTOR_TYPE &a, const VECTOR_TYPE &b) { \

														
 
															+	return (a.v1 * b.v1) + (a.v2 * b.v2); \

														
 
															+} \

														
 
															+inline SIMD_TYPE squareLength(const VECTOR_TYPE &v) { \

														
 
															+	return dotProduct(v, v); \

														
 
															+} \

														
 
															+inline SIMD_TYPE length(const VECTOR_TYPE &v) { \

														
 
															+	return squareLength(v).squareRoot(); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE normalize(const VECTOR_TYPE &v) { \

														
 
															+	return v * squareLength(v).reciprocalSquareRoot(); \

														
 
															 }

														
 
															-inline F32x4x3 operator-(const F32x4x3 &left, const F32x4x3 &right) {

														
 
															-	return F32x4x3(left.v1 - right.v1, left.v2 - right.v2, left.v3 - right.v3);

														
 
															-}

														
 
															-inline F32x4x3 operator-(const F32x4x3 &left, const F32x4 &right) {

														
 
															-	return F32x4x3(left.v1 - right, left.v2 - right, left.v3 - right);

														
 
															-}

														
 
															-inline F32x4x3 operator-(const F32x4x3 &left, const float &right) {

														
 
															-	return F32x4x3(left.v1 - right, left.v2 - right, left.v3 - right);

														
 
															-}

														
 
															-inline F32x4x3 operator-(const F32x4x3& value) {

														
 
															-	return F32x4x3(-value.v1, -value.v2, -value.v3);

														
 
															+// These are the infix operations for 3D SIMD vectors F32x4x3, F32x8x3...

														
 
															+#define SIMD_VECTOR_INFIX_OPERATORS_3D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

														
 
															+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 + right.v1, left.v2 + right.v2, left.v3 + right.v3); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 + right, left.v2 + right, left.v3 + right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 + right, left.v2 + right, left.v3 + right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 - right.v1, left.v2 - right.v2, left.v3 - right.v3); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 - right, left.v2 - right, left.v3 - right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 - right, left.v2 - right, left.v3 - right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator-(const VECTOR_TYPE& value) { \

														
 
															+	return VECTOR_TYPE(-value.v1, -value.v2, -value.v3); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 * right.v1, left.v2 * right.v2, left.v3 * right.v3); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 * right, left.v2 * right, left.v3 * right); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \

														
 
															+	return VECTOR_TYPE(left.v1 * right, left.v2 * right, left.v3 * right); \

														
 
															+} \

														
 
															+inline SIMD_TYPE dotProduct(const VECTOR_TYPE &a, const VECTOR_TYPE &b) { \

														
 
															+	return (a.v1 * b.v1) + (a.v2 * b.v2) + (a.v3 * b.v3); \

														
 
															+} \

														
 
															+inline SIMD_TYPE squareLength(const VECTOR_TYPE &v) { \

														
 
															+	return dotProduct(v, v); \

														
 
															+} \

														
 
															+inline SIMD_TYPE length(const VECTOR_TYPE &v) { \

														
 
															+	return squareLength(v).squareRoot(); \

														
 
															+} \

														
 
															+inline VECTOR_TYPE normalize(const VECTOR_TYPE &v) { \

														
 
															+	return v * squareLength(v).reciprocalSquareRoot(); \

														
 
															 }

														
 
															-inline F32x4x3 operator*(const F32x4x3 &left, const F32x4x3 &right) {

														
 
															-	return F32x4x3(left.v1 * right.v1, left.v2 * right.v2, left.v3 * right.v3);

														
 
															-}

														
 
															-inline F32x4x3 operator*(const F32x4x3 &left, const F32x4 &right) {

														
 
															-	return F32x4x3(left.v1 * right, left.v2 * right, left.v3 * right);

														
 
															-}

														
 
															-inline F32x4x3 operator*(const F32x4x3 &left, const float &right) {

														
 
															-	return F32x4x3(left.v1 * right, left.v2 * right, left.v3 * right);

														
 
															-}

														
 
															-

														
 
															-inline F32x4 dotProduct(const F32x4x3 &a, const F32x4x3 &b) {

														
 
															-	return (a.v1 * b.v1) + (a.v2 * b.v2) + (a.v3 * b.v3);

														
 
															-}

														
 
															-

														
 
															-inline F32x4 squareLength(const F32x4x3 &v) {

														
 
															-	return dotProduct(v, v);

														
 
															-}

														
 
															-

														
 
															-inline F32x4 length(const F32x4x3 &v) {

														
 
															-	return squareLength(v).squareRoot();

														
 
															-}

														
 
															+// These are the available in-place operations for 2D SIMD vectors F32x4x2, F32x8x2...

														
 
															+#define SIMD_VECTOR_MEMBER_OPERATORS_2D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

														
 
															+	inline VECTOR_TYPE& operator+=(const VECTOR_TYPE& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator-=(const VECTOR_TYPE& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator*=(const VECTOR_TYPE& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator+=(const SIMD_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator-=(const SIMD_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator*=(const SIMD_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator+=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator-=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator*=(const ELEMENT_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }

														
 
															-inline F32x4x3 normalize(const F32x4x3 &v) {

														
 
															-	return v * squareLength(v).reciprocalSquareRoot();

														
 
															-}

														
 
															+// These are the available in-place operations for 3D SIMD vectors F32x4x3, F32x8x3...

														
 
															+#define SIMD_VECTOR_MEMBER_OPERATORS_3D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \

														
 
															+	inline VECTOR_TYPE& operator+=(const VECTOR_TYPE& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; this->v3 = this->v3 + offset.v3; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator-=(const VECTOR_TYPE& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; this->v3 = this->v3 - offset.v3; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator*=(const VECTOR_TYPE& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; this->v3 = this->v3 * scale.v3; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator+=(const SIMD_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator-=(const SIMD_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator*=(const SIMD_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator+=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator-=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; } \

														
 
															+	inline VECTOR_TYPE& operator*=(const ELEMENT_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }

														
 
															-// 2D vector in xxxxyyyy format

														
 
															+// 128x2-bit SIMD vectorized 2D math vector stored in xxxxyyyy format (one planar SIMD vector per dimension).

														
 
															 struct F32x4x2 {

														
 
															 	F32x4 v1, v2;

														
 
															 	// Direct constructor given 3 rows of length 4

														
 
															 	F32x4x2(const F32x4& v1, const F32x4& v2)

														
 
															 	: v1(v1), v2(v2) {}

														
 
															-	// Transposed constructor given 4 columns of length 3

														
 
															-	F32x4x2(const dsr::FVector2D& vx, const dsr::FVector2D& vy, const dsr::FVector2D& vz, const dsr::FVector2D& vw)

														
 
															-	: v1(F32x4(vx.x, vy.x, vz.x, vw.x)),

														
 
															-	  v2(F32x4(vx.y, vy.y, vz.y, vw.y)) {}

														
 
															+	// Gradient constructor from an initial vector and the increment for each element.

														
 
															+	static F32x4x2 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

														
 
															+		return F32x4x2(

														
 
															+		  F32x4::createGradient(start.x, increment.x),

														
 
															+		  F32x4::createGradient(start.y, increment.y)

														
 
															+		);

														
 
															+	}

														
 
															+	// Transposed constructor given 4 columns of length 2 (Only allowed for fixed size SIMD, not X or F vector lengths)

														
 
															+	F32x4x2(const dsr::FVector2D& a, const dsr::FVector2D& b, const dsr::FVector2D& c, const dsr::FVector2D& d)

														
 
															+	: v1(a.x, b.x, c.x, d.x),

														
 
															+	  v2(a.y, b.y, c.y, d.y) {}

														
 
															 	// Transposed constructor given a single repeated column

														
 
															 	F32x4x2(const dsr::FVector2D& v)

														
 
															-	: v1(F32x4(v.x, v.x, v.x, v.x)),

														
 
															-	  v2(F32x4(v.y, v.y, v.y, v.y)) {}

														
 
															+	: v1(F32x4(v.x)),

														
 
															+	  v2(F32x4(v.y)) {}

														
 
															 	// In-place math operations

														
 
															-	inline F32x4x2& operator+=(const F32x4x2& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; return *this; }

														
 
															-	inline F32x4x2& operator-=(const F32x4x2& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; return *this; }

														
 
															-	inline F32x4x2& operator*=(const F32x4x2& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; return *this; }

														
 
															-	inline F32x4x2& operator+=(const F32x4& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; }

														
 
															-	inline F32x4x2& operator-=(const F32x4& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; }

														
 
															-	inline F32x4x2& operator*=(const F32x4& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }

														
 
															-	inline F32x4x2& operator+=(const float& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; }

														
 
															-	inline F32x4x2& operator-=(const float& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; }

														
 
															-	inline F32x4x2& operator*=(const float& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }

														
 
															+	SIMD_VECTOR_MEMBER_OPERATORS_2D(F32x4x2, F32x4, float)

														
 
															 };

														
 
															+SIMD_VECTOR_INFIX_OPERATORS_2D(F32x4x2, F32x4, float)

														
 
															-inline F32x4x2 operator+(const F32x4x2 &left, const F32x4x2 &right) {

														
 
															-	return F32x4x2(left.v1 + right.v1, left.v2 + right.v2);

														
 
															-}

														
 
															-inline F32x4x2 operator+(const F32x4x2 &left, const F32x4 &right) {

														
 
															-	return F32x4x2(left.v1 + right, left.v2 + right);

														
 
															-}

														
 
															-inline F32x4x2 operator+(const F32x4x2 &left, const float &right) {

														
 
															-	return F32x4x2(left.v1 + right, left.v2 + right);

														
 
															-}

														
 
															-

														
 
															-inline F32x4x2 operator-(const F32x4x2 &left, const F32x4x2 &right) {

														
 
															-	return F32x4x2(left.v1 - right.v1, left.v2 - right.v2);

														
 
															-}

														
 
															-inline F32x4x2 operator-(const F32x4x2 &left, const F32x4 &right) {

														
 
															-	return F32x4x2(left.v1 - right, left.v2 - right);

														
 
															-}

														
 
															-inline F32x4x2 operator-(const F32x4x2 &left, const float &right) {

														
 
															-	return F32x4x2(left.v1 - right, left.v2 - right);

														
 
															-}

														
 
															-inline F32x4x2 operator-(const F32x4x2& value) {

														
 
															-	return F32x4x2(-value.v1, -value.v2);

														
 
															-}

														
 
															-

														
 
															-inline F32x4x2 operator*(const F32x4x2 &left, const F32x4x2 &right) {

														
 
															-	return F32x4x2(left.v1 * right.v1, left.v2 * right.v2);

														
 
															-}

														
 
															-inline F32x4x2 operator*(const F32x4x2 &left, const F32x4 &right) {

														
 
															-	return F32x4x2(left.v1 * right, left.v2 * right);

														
 
															-}

														
 
															-inline F32x4x2 operator*(const F32x4x2 &left, const float &right) {

														
 
															-	return F32x4x2(left.v1 * right, left.v2 * right);

														
 
															-}

														
 
															+// 256x2-bit SIMD vectorized 2D math vector stored in xxxxxxxxyyyyyyyy format (one planar SIMD vector per dimension).

														
 
															+struct F32x8x2 {

														
 
															+	F32x8 v1, v2;

														
 
															+	// Direct constructor given 2 rows of length 8

														
 
															+	F32x8x2(const F32x8& v1, const F32x8& v2)

														
 
															+	: v1(v1), v2(v2) {}

														
 
															+	// Gradient constructor from an initial vector and the increment for each element.

														
 
															+	static F32x8x2 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

														
 
															+		return F32x8x2(

														
 
															+		  F32x8::createGradient(start.x, increment.x),

														
 
															+		  F32x8::createGradient(start.y, increment.y)

														
 
															+		);

														
 
															+	}

														
 
															+	// Transposed constructor given 8 columns of length 2 (Only allowed for fixed size SIMD, not X or F vector lengths)

														
 
															+	F32x8x2(const dsr::FVector2D& a, const dsr::FVector2D& b, const dsr::FVector2D& c, const dsr::FVector2D& d, const dsr::FVector2D& e, const dsr::FVector2D& f, const dsr::FVector2D& g, const dsr::FVector2D& h)

														
 
															+	: v1(a.x, b.x, c.x, d.x, e.x, f.x, g.x, h.x),

														
 
															+	  v2(a.y, b.y, c.y, d.y, e.y, f.y, g.y, h.y) {}

														
 
															+	// Transposed constructor given a single repeated column

														
 
															+	F32x8x2(const dsr::FVector2D& v)

														
 
															+	: v1(F32x8(v.x)),

														
 
															+	  v2(F32x8(v.y)) {}

														
 
															+	// In-place math operations

														
 
															+	SIMD_VECTOR_MEMBER_OPERATORS_2D(F32x8x2, F32x8, float)

														
 
															+};

														
 
															+SIMD_VECTOR_INFIX_OPERATORS_2D(F32x8x2, F32x8, float)

														
 
															-inline F32x4 dotProduct(const F32x4x2 &a, const F32x4x2 &b) {

														
 
															-	return (a.v1 * b.v1) + (a.v2 * b.v2);

														
 
															-}

														
 
															+// 128x3-bit SIMD vectorized 3D math vector stored in xxxxyyyyzzzz format (one planar SIMD vector per dimension).

														
 
															+struct F32x4x3 {

														
 
															+	F32x4 v1, v2, v3;

														
 
															+	// Direct constructor given 3 rows of length 4

														
 
															+	F32x4x3(const F32x4& v1, const F32x4& v2, const F32x4& v3)

														
 
															+	: v1(v1), v2(v2), v3(v3) {}

														
 
															+	// Gradient constructor from an initial vector and the increment for each element.

														
 
															+	static F32x4x3 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

														
 
															+		return F32x4x3(

														
 
															+		  F32x4::createGradient(start.x, increment.x),

														
 
															+		  F32x4::createGradient(start.y, increment.y),

														
 
															+		  F32x4::createGradient(start.z, increment.z)

														
 
															+		);

														
 
															+	}

														
 
															+	// Transposed constructor given 4 columns of length 3

														
 
															+	F32x4x3(const dsr::FVector3D& a, const dsr::FVector3D& b, const dsr::FVector3D& c, const dsr::FVector3D& d)

														
 
															+	: v1(a.x, b.x, c.x, d.x),

														
 
															+	  v2(a.y, b.y, c.y, d.y),

														
 
															+	  v3(a.z, b.z, c.z, d.z) {}

														
 
															+	// Transposed constructor given a single repeated column

														
 
															+	F32x4x3(const dsr::FVector3D& v)

														
 
															+	: v1(F32x4(v.x)),

														
 
															+	  v2(F32x4(v.y)),

														
 
															+	  v3(F32x4(v.z)) {}

														
 
															+	// In-place math operations

														
 
															+	SIMD_VECTOR_MEMBER_OPERATORS_3D(F32x4x3, F32x4, float)

														
 
															+};

														
 
															+SIMD_VECTOR_INFIX_OPERATORS_3D(F32x4x3, F32x4, float)

														
 
															-inline F32x4 squareLength(const F32x4x2 &v) {

														
 
															-	return dotProduct(v, v);

														
 
															-}

														
 
															+// 256x3-bit SIMD vectorized 3D math vector stored in xxxxxxxxyyyyyyyyzzzzzzzz format (one planar SIMD vector per dimension).

														
 
															+struct F32x8x3 {

														
 
															+	F32x8 v1, v2, v3;

														
 
															+	// Direct constructor given 3 rows of length 8

														
 
															+	F32x8x3(const F32x8& v1, const F32x8& v2, const F32x8& v3)

														
 
															+	: v1(v1), v2(v2), v3(v3) {}

														
 
															+	// Gradient constructor from an initial vector and the increment for each element.

														
 
															+	static F32x8x3 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {

														
 
															+		return F32x8x3(

														
 
															+		  F32x8::createGradient(start.x, increment.x),

														
 
															+		  F32x8::createGradient(start.y, increment.y),

														
 
															+		  F32x8::createGradient(start.z, increment.z)

														
 
															+		);

														
 
															+	}

														
 
															+	// Transposed constructor given 8 columns of length 3

														
 
															+	F32x8x3(const dsr::FVector3D& a, const dsr::FVector3D& b, const dsr::FVector3D& c, const dsr::FVector3D& d, const dsr::FVector3D& e, const dsr::FVector3D& f, const dsr::FVector3D& g, const dsr::FVector3D& h)

														
 
															+	: v1(a.x, b.x, c.x, d.x, e.x, f.x, g.x, h.x),

														
 
															+	  v2(a.y, b.y, c.y, d.y, e.y, f.y, g.y, h.y),

														
 
															+	  v3(a.z, b.z, c.z, d.z, e.z, f.z, g.z, h.z) {}

														
 
															+	// Transposed constructor given a single repeated column

														
 
															+	F32x8x3(const dsr::FVector3D& v)

														
 
															+	: v1(F32x8(v.x)),

														
 
															+	  v2(F32x8(v.y)),

														
 
															+	  v3(F32x8(v.z)) {}

														
 
															+	// In-place math operations

														
 
															+	SIMD_VECTOR_MEMBER_OPERATORS_3D(F32x8x3, F32x8, float)

														
 
															+};

														
 
															+SIMD_VECTOR_INFIX_OPERATORS_3D(F32x8x3, F32x8, float)

														
 
															-inline F32x4 length(const F32x4x2 &v) {

														
 
															-	return squareLength(v).squareRoot();

														
 
															-}

														
 
															+// X vector aliases

														
 
															+#if DSR_DEFAULT_VECTOR_SIZE == 16

														
 
															+	using F32xXx3 = F32x4x3;

														
 
															+	using F32xXx2 = F32x4x2;

														
 
															+#elif DSR_DEFAULT_VECTOR_SIZE == 32

														
 
															+	using F32xXx3 = F32x8x3;

														
 
															+	using F32xXx2 = F32x8x2;

														
 
															+#endif

														
 
															-inline F32x4x2 normalize(const F32x4x2 &v) {

														
 
															-	return v * squareLength(v).reciprocalSquareRoot();

														
 
															-}

														
 
															+// F vector aliases

														
 
															+#if DSR_FLOAT_VECTOR_SIZE == 16

														
 
															+	using F32xFx3 = F32x4x3;

														
 
															+	using F32xFx2 = F32x4x2;

														
 
															+#elif DSR_FLOAT_VECTOR_SIZE == 32

														
 
															+	using F32xFx3 = F32x8x3;

														
 
															+	using F32xFx2 = F32x8x2;

														
 
															+#endif

														
 
															 #endif

														
--- a/Source/DFPSR/image/PackOrder.h
+++ b/Source/DFPSR/image/PackOrder.h
@@ -1,6 +1,6 @@
 
															 // zlib open source license

														
 
															 //

														
 
															-// Copyright (c) 2017 to 2019 David Forsgren Piuva

														
 
															+// Copyright (c) 2017 to 2023 David Forsgren Piuva

														
 
															 // 

														
 
															 // This software is provided 'as-is', without any express or implied

														
 
															 // warranty. In no event will the authors be held liable for any damages

														
@@ -93,22 +93,26 @@ inline bool operator==(const PackOrder &left, const PackOrder &right) {
 
															 }

														
 
															 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.

														
 
															-inline static U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+T packBytes(const T &s0, const T &s1, const T &s2) {

														
 
															 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16);

														
 
															 }

														
 
															 // Using a specified packing order

														
 
															-inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const PackOrder &order) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+T packBytes(const T &s0, const T &s1, const T &s2, const PackOrder &order) {

														
 
															 	return ENDIAN_POS_ADDR(s0, order.redOffset)

														
 
															 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)

														
 
															 	     | ENDIAN_POS_ADDR(s2, order.blueOffset);

														
 
															 }

														
 
															 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.

														
 
															-inline static U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const U32x4 &s3) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+T packBytes(const T &s0, const T &s1, const T &s2, const T &s3) {

														
 
															 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16) | ENDIAN_POS_ADDR(s3, 24);

														
 
															 }

														
 
															 // Using a specified packing order

														
 
															-inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const U32x4 &s3, const PackOrder &order) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+T packBytes(const T &s0, const T &s1, const T &s2, const T &s3, const PackOrder &order) {

														
 
															 	return ENDIAN_POS_ADDR(s0, order.redOffset)

														
 
															 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)

														
 
															 	     | ENDIAN_POS_ADDR(s2, order.blueOffset)

														
@@ -116,7 +120,15 @@ inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const
 
															 }

														
 
															 // Pack separate floats into saturated bytes

														
 
															-inline static U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4 &s2, const F32x4 &s3) {

														
 
															+inline U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4 &s2, const F32x4 &s3) {

														
 
															+	return packBytes(

														
 
															+	  truncateToU32(s0.clamp(0.1f, 255.1f)),

														
 
															+	  truncateToU32(s1.clamp(0.1f, 255.1f)),

														
 
															+	  truncateToU32(s2.clamp(0.1f, 255.1f)),

														
 
															+	  truncateToU32(s3.clamp(0.1f, 255.1f))

														
 
															+	);

														
 
															+}

														
 
															+inline U32x8 floatToSaturatedByte(const F32x8 &s0, const F32x8 &s1, const F32x8 &s2, const F32x8 &s3) {

														
 
															 	return packBytes(

														
 
															 	  truncateToU32(s0.clamp(0.1f, 255.1f)),

														
 
															 	  truncateToU32(s1.clamp(0.1f, 255.1f)),

														
@@ -134,54 +146,46 @@ inline U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4
 
															 	  order

														
 
															 	);

														
 
															 }

														
 
															-

														
 
															-inline uint32_t getRed(uint32_t color) {

														
 
															-	return color & ENDIAN32_BYTE_0;

														
 
															-}

														
 
															-inline uint32_t getRed(uint32_t color, const PackOrder &order) {

														
 
															-	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);

														
 
															-}

														
 
															-inline uint32_t getGreen(uint32_t color) {

														
 
															-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);

														
 
															-}

														
 
															-inline uint32_t getGreen(uint32_t color, const PackOrder &order) {

														
 
															-	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);

														
 
															-}

														
 
															-inline uint32_t getBlue(uint32_t color) {

														
 
															-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);

														
 
															-}

														
 
															-inline uint32_t getBlue(uint32_t color, const PackOrder &order) {

														
 
															-	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);

														
 
															-}

														
 
															-inline uint32_t getAlpha(uint32_t color) {

														
 
															-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);

														
 
															-}

														
 
															-inline uint32_t getAlpha(uint32_t color, const PackOrder &order) {

														
 
															-	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);

														
 
															+inline U32x8 floatToSaturatedByte(const F32x8 &s0, const F32x8 &s1, const F32x8 &s2, const F32x8 &s3, const PackOrder &order) {

														
 
															+	return packBytes(

														
 
															+	  truncateToU32(s0.clamp(0.1f, 255.1f)),

														
 
															+	  truncateToU32(s1.clamp(0.1f, 255.1f)),

														
 
															+	  truncateToU32(s2.clamp(0.1f, 255.1f)),

														
 
															+	  truncateToU32(s3.clamp(0.1f, 255.1f)),

														
 
															+	  order

														
 
															+	);

														
 
															 }

														
 
															-inline U32x4 getRed(const U32x4 &color) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getRed(T color) {

														
 
															 	return color & ENDIAN32_BYTE_0;

														
 
															 }

														
 
															-inline U32x4 getRed(const U32x4 &color, const PackOrder &order) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getRed(T color, const PackOrder &order) {

														
 
															 	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);

														
 
															 }

														
 
															-inline U32x4 getGreen(const U32x4 &color) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getGreen(T color) {

														
 
															 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);

														
 
															 }

														
 
															-inline U32x4 getGreen(const U32x4 &color, const PackOrder &order) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getGreen(T color, const PackOrder &order) {

														
 
															 	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);

														
 
															 }

														
 
															-inline U32x4 getBlue(const U32x4 &color) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getBlue(T color) {

														
 
															 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);

														
 
															 }

														
 
															-inline U32x4 getBlue(const U32x4 &color, const PackOrder &order) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getBlue(T color, const PackOrder &order) {

														
 
															 	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);

														
 
															 }

														
 
															-inline U32x4 getAlpha(const U32x4 &color) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getAlpha(T color) {

														
 
															 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);

														
 
															 }

														
 
															-inline U32x4 getAlpha(const U32x4 &color, const PackOrder &order) {

														
 
															+template<typename T> // Accepting uint32_t, U32x4, U32x8...

														
 
															+inline T getAlpha(T color, const PackOrder &order) {

														
 
															 	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);

														
 
															 }

														
--- a/Source/SDK/SpriteEngine/lightAPI.cpp
+++ b/Source/SDK/SpriteEngine/lightAPI.cpp
@@ -6,24 +6,24 @@
 
															 namespace dsr {

														
 
															 // Precondition: The packed color must be in the standard RGBA order, meaning no native packing

														
 
															-inline F32x4x3 unpackRgb_U32x4_to_F32x4x3(const U32x4& color) {

														
 
															-	return F32x4x3(floatFromU32(getRed(color)), floatFromU32(getGreen(color)), floatFromU32(getBlue(color)));

														
 
															+inline F32xXx3 unpackRgb_U32xX_to_F32xXx3(const U32xX& color) {

														
 
															+	return F32xXx3(floatFromU32(getRed(color)), floatFromU32(getGreen(color)), floatFromU32(getBlue(color)));

														
 
															 }

														
 
															-static inline void setLight(SafePointer<uint8_t> lightPixel, U8x16 newlight) {

														
 
															+static inline void setLight(SafePointer<uint8_t> lightPixel, U8xX newlight) {

														
 
															 	newlight.writeAligned(lightPixel, "setLight: writing light");

														
 
															 }

														
 
															-static inline void addLight(SafePointer<uint8_t> lightPixel, U8x16 addedlight) {

														
 
															-	U8x16 oldLight = U8x16::readAligned(lightPixel, "addLight: reading light");

														
 
															-	U8x16 newlight = saturatedAddition(oldLight, addedlight);

														
 
															+static inline void addLight(SafePointer<uint8_t> lightPixel, U8xX addedlight) {

														
 
															+	U8xX oldLight = U8xX::readAligned(lightPixel, "addLight: reading light");

														
 
															+	U8xX newlight = saturatedAddition(oldLight, addedlight);

														
 
															 	newlight.writeAligned(lightPixel, "addLight: writing light");

														
 
															 }

														
 
															 template <bool ADD_LIGHT>

														
 
															 void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lightBuffer, const OrderedImageRgbaU8& normalBuffer, const FVector3D& lightDirection, float lightIntensity, const ColorRgbI32& lightColor) {

														
 
															 	// Normals in range 0..255 - 128 have lengths of 127 and 128, so if we double the reverse light direction we'll end up near 0..255 again for colors

														
 
															-	F32x4x3 reverseLightDirection = F32x4x3(-normalize(normalToWorldSpace.transformTransposed(lightDirection)) * lightIntensity * 2.0f);

														
 
															+	F32xXx3 reverseLightDirection = F32xXx3(-normalize(normalToWorldSpace.transformTransposed(lightDirection)) * lightIntensity * 2.0f);

														
 
															 	IRect rectangleBound = image_getBound(lightBuffer);

														
 
															 	float colorR = std::max(0.0f, (float)lightColor.red / 255.0f);

														
 
															 	float colorG = std::max(0.0f, (float)lightColor.green / 255.0f);

														
@@ -37,27 +37,29 @@ void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lig
 
															 		for (int y = bound.top(); y < bound.bottom(); y++) {

														
 
															 			SafePointer<uint8_t> lightPixel = lightRow;

														
 
															 			SafePointer<uint32_t> normalPixel = normalRow;

														
 
															-			for (int x4 = bound.left(); x4 < bound.right(); x4+=4) {

														
 
															+			for (int x = bound.left(); x < bound.right(); x += laneCountX_32Bit) {

														
 
															 				// Read surface normals

														
 
															-				U32x4 normalColor = U32x4::readAligned(normalPixel, "directedLight: reading normal");

														
 
															-				F32x4x3 negativeSurfaceNormal = unpackRgb_U32x4_to_F32x4x3(normalColor) - 128.0f;

														
 
															+				U32xX normalColor = U32xX::readAligned(normalPixel, "directedLight: reading normal");

														
 
															+				// TODO: Port SIMD3D to handle arbitrary vector lengths.

														
 
															+				F32xXx3 negativeSurfaceNormal = unpackRgb_U32xX_to_F32xXx3(normalColor) - 128.0f;

														
 
															 				// Calculate light intensity

														
 
															 				//   Normalization and negation is already pre-multiplied into reverseLightDirection

														
 
															-				F32x4 intensity = dotProduct(negativeSurfaceNormal, reverseLightDirection).clampLower(0.0f);

														
 
															-				F32x4 red = intensity * colorR;

														
 
															-				F32x4 green = intensity * colorG;

														
 
															-				F32x4 blue = intensity * colorB;

														
 
															+				F32xX intensity = dotProduct(negativeSurfaceNormal, reverseLightDirection).clampLower(0.0f);

														
 
															+				F32xX red = intensity * colorR;

														
 
															+				F32xX green = intensity * colorG;

														
 
															+				F32xX blue = intensity * colorB;

														
 
															 				red = red.clampUpper(255.1f);

														
 
															 				green = green.clampUpper(255.1f);

														
 
															 				blue = blue.clampUpper(255.1f);

														
 
															-				U8x16 light = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

														
 
															+				// TODO: Let color packing handle arbitrary vector lengths.

														
 
															+				U8xX light = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

														
 
															 				if (ADD_LIGHT) {

														
 
															 					addLight(lightPixel, light);

														
 
															 				} else {

														
 
															 					setLight(lightPixel, light);

														
 
															 				}

														
 
															-				lightPixel += 16;

														
 
															-				normalPixel += 4;

														
 
															+				lightPixel += laneCountX_8Bit;

														
 
															+				normalPixel += laneCountX_32Bit;

														
 
															 			}

														
 
															 			lightRow.increaseBytes(lightStride);

														
 
															 			normalRow.increaseBytes(normalStride);

														
@@ -136,16 +138,33 @@ static float getShadowTransparency(SafePointer<float> pixelData, int32_t width,
 
															 	return reciDepth * 1.02f > shadowReciDepth ? 1.0f : 0.0f;

														
 
															 }

														
 
															-static inline F32x4 getShadowTransparency(SafePointer<float> pixelData, int32_t width, float halfWidth, const F32x4x3& lightOffset) {

														
 
															-	FVector4D offsetX = lightOffset.v1.get();

														
 
															-	FVector4D offsetY = lightOffset.v2.get();

														
 
															-	FVector4D offsetZ = lightOffset.v3.get();

														
 
															-	return F32x4(

														
 
															-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.x, offsetY.x, offsetZ.x)),

														
 
															-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.y, offsetY.y, offsetZ.y)),

														
 
															-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.z, offsetY.z, offsetZ.z)),

														
 
															-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.w, offsetY.w, offsetZ.w))

														
 
															-	);

														
 
															+static inline F32xX getShadowTransparency(SafePointer<float> pixelData, int32_t width, float halfWidth, const F32xXx3& lightOffset) {

														
 
															+	// TODO: Create a way to quickly iterate over elements in a SIMD vector for interfacing with scalar operations.

														
 
															+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetX[DSR_DEFAULT_VECTOR_SIZE];

														
 
															+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetY[DSR_DEFAULT_VECTOR_SIZE];

														
 
															+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetZ[DSR_DEFAULT_VECTOR_SIZE];

														
 
															+	lightOffset.v1.writeAlignedUnsafe(offsetX);

														
 
															+	lightOffset.v2.writeAlignedUnsafe(offsetY);

														
 
															+	lightOffset.v3.writeAlignedUnsafe(offsetZ);

														
 
															+	#if DSR_DEFAULT_VECTOR_SIZE == 16

														
 
															+		return F32x4(

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[0], offsetY[0], offsetZ[0])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[1], offsetY[1], offsetZ[1])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[2], offsetY[2], offsetZ[2])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[3], offsetY[3], offsetZ[3]))

														
 
															+		);

														
 
															+	#elif DSR_DEFAULT_VECTOR_SIZE == 32

														
 
															+		return F32x8(

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[0], offsetY[0], offsetZ[0])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[1], offsetY[1], offsetZ[1])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[2], offsetY[2], offsetZ[2])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[3], offsetY[3], offsetZ[3])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[4], offsetY[4], offsetZ[4])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[5], offsetY[5], offsetZ[5])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[6], offsetY[6], offsetZ[6])),

														
 
															+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[7], offsetY[7], offsetZ[7]))

														
 
															+		);

														
 
															+	#endif

														
 
															 }

														
 
															 template <bool SHADOW_CASTING>

														
@@ -154,11 +173,11 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 
															 	//   Normal-space defines the rotation for light-space

														
 
															 	FVector3D lightSpaceSourcePosition = camera.normalToWorldSpace.transformTransposed(lightPosition);

														
 
															 	// Align the rectangle with 8 pixels, because that's the widest read to align in the 16-bit height buffer

														
 
															-	IRect rectangleBound = calculateBound(camera, worldCenter, lightBuffer, lightSpaceSourcePosition, lightRadius, 4);

														
 
															+	IRect rectangleBound = calculateBound(camera, worldCenter, lightBuffer, lightSpaceSourcePosition, lightRadius, laneCountX_32Bit);

														
 
															 	if (rectangleBound.hasArea()) {

														
 
															 		// Uniform values

														
 
															 		// How much closer to your face in light-space does the pixel go per depth unit

														
 
															-		F32x4x3 inYourFaceAxis = F32x4x3(camera.screenDepthToLightSpace.zAxis);

														
 
															+		F32xXx3 inYourFaceAxis = F32xXx3(camera.screenDepthToLightSpace.zAxis);

														
 
															 		// Light color

														
 
															 		float colorR = std::max(0.0f, (float)lightColor.red * lightIntensity);

														
 
															 		float colorG = std::max(0.0f, (float)lightColor.green * lightIntensity);

														
@@ -173,14 +192,14 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 
															 			FVector3D dx = camera.screenDepthToLightSpace.xAxis;

														
 
															 			FVector3D dy = camera.screenDepthToLightSpace.yAxis;

														
 
															 			// Pack the offset for each of the 4 first pixels into a transposing constructor

														
 
															-			F32x4x3 lightBaseRowX4 = F32x4x3(lightBaseRow, lightBaseRow + dx, lightBaseRow + dx * 2.0f, lightBaseRow + dx * 3.0f);

														
 
															+			F32xXx3 lightBaseRowX = F32xXx3::createGradient(lightBaseRow, dx);

														
 
															 			// Derivatives for moving four pixels to the right in parallel

														
 
															 			//    (n+0, y0), (n+1, y0), (n+2, y0), (n+3, y0) -> (n+4, y0), (n+5, y0), (n+6, y0), (n+7, y0)

														
 
															-			F32x4x3 dx4 = F32x4x3(dx * 4.0f);

														
 
															+			F32xXx3 dxX = F32xXx3(dx * (float)laneCountX_32Bit);

														
 
															 			// Derivatives for moving one pixel down in parallel

														
 
															 			//    (x0, n+0), (x1, n+0), (x2, n+0), (x3, n+0)

														
 
															 			// -> (x0, n+1), (x1, n+1), (x2, n+1), (x3, n+1)

														
 
															-			F32x4x3 dy1 = F32x4x3(dy);

														
 
															+			F32xXx3 dy1 = F32xXx3(dy);

														
 
															 			// Get strides

														
 
															 			int lightStride = image_getStride(lightBuffer);

														
 
															 			int normalStride = image_getStride(normalBuffer);

														
@@ -194,56 +213,56 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 
															 			SafePointer<float> shadowCubeData;

														
 
															 			float shadowCubeCenter;

														
 
															 			if (SHADOW_CASTING) {

														
 
															-				shadowCubeWidth = image_getWidth(shadowCubeMap); assert(shadowCubeWidth % 4 == 0);

														
 
															+				shadowCubeWidth = image_getWidth(shadowCubeMap); assert(shadowCubeWidth % laneCountX_32Bit == 0);

														
 
															 				shadowCubeData = image_getSafePointer(shadowCubeMap);

														
 
															 				shadowCubeCenter = (float)shadowCubeWidth * 0.5f;

														
 
															 			}

														
 
															 			// Loop over the pixels to add light

														
 
															 			for (int y = bound.top(); y < bound.bottom(); y++) {

														
 
															 				// Initiate the leftmost pixels before iterating to the right

														
 
															-				F32x4x3 lightBasePixelx4 = lightBaseRowX4;

														
 
															+				F32xXx3 lightBasePixelxX = lightBaseRowX;

														
 
															 				SafePointer<uint8_t> lightPixel = lightRow;

														
 
															 				SafePointer<uint32_t> normalPixel = normalRow;

														
 
															 				SafePointer<float> heightPixel = heightRow;

														
 
															 				// Iterate over 16-bit pixels 8 at a time

														
 
															-				for (int x4 = bound.left(); x4 < bound.right(); x4+=4) {

														
 
															+				for (int x = bound.left(); x < bound.right(); x += laneCountX_32Bit) {

														
 
															 					// Read pixel height

														
 
															-					F32x4 depthOffset = F32x4::readAligned(heightPixel, "addPointLight: reading height");

														
 
															+					F32xX depthOffset = F32xX::readAligned(heightPixel, "addPointLight: reading height");

														
 
															 					// Extrude the pixel using positive values towards the camera to represent another height

														
 
															 					//   This will solve X and Z positions based on the height Y

														
 
															-					F32x4x3 lightOffset = lightBasePixelx4 + (inYourFaceAxis * depthOffset);

														
 
															+					F32xXx3 lightOffset = lightBasePixelxX + (inYourFaceAxis * depthOffset);

														
 
															 					// Get the linear distance, divide by sphere radius and limit to length 1 at intensity 0

														
 
															-					F32x4 lightRatio = min(F32x4(1.0f), length(lightOffset) * reciprocalRadius);

														
 
															+					F32xX lightRatio = min(F32xX(1.0f), length(lightOffset) * reciprocalRadius);

														
 
															 					// Read surface normal

														
 
															-					U32x4 normalColor = U32x4::readAligned(normalPixel, "addPointLight: reading normal");

														
 
															+					U32xX normalColor = U32xX::readAligned(normalPixel, "addPointLight: reading normal");

														
 
															 					// normalScale is used to negate the normals in advance so that opposing directions get positive values

														
 
															-					F32x4x3 negativeSurfaceNormal = (unpackRgb_U32x4_to_F32x4x3(normalColor) - 128.0f) * (-1.0f / 128.0f);

														
 
															+					F32xXx3 negativeSurfaceNormal = (unpackRgb_U32xX_to_F32xXx3(normalColor) - 128.0f) * (-1.0f / 128.0f);

														
 
															 					// Fade from 0 to 1 using 1 - 2x + x²

														
 
															-					F32x4 distanceIntensity = 1.0f - 2.0f * lightRatio + lightRatio * lightRatio;

														
 
															-					F32x4 angleIntensity = max(F32x4(0.0f), dotProduct(normalize(lightOffset), negativeSurfaceNormal));

														
 
															-					F32x4 intensity = angleIntensity * distanceIntensity;

														
 
															+					F32xX distanceIntensity = 1.0f - 2.0f * lightRatio + lightRatio * lightRatio;

														
 
															+					F32xX angleIntensity = max(F32xX(0.0f), dotProduct(normalize(lightOffset), negativeSurfaceNormal));

														
 
															+					F32xX intensity = angleIntensity * distanceIntensity;

														
 
															 					if (SHADOW_CASTING) {

														
 
															 						intensity = intensity * getShadowTransparency(shadowCubeData, shadowCubeWidth, shadowCubeCenter, lightOffset);

														
 
															 					}

														
 
															 					// TODO: Make an optimized version for white light replacing red, green and blue with a single LUMA

														
 
															-					F32x4 red = intensity * colorR;

														
 
															-					F32x4 green = intensity * colorG;

														
 
															-					F32x4 blue = intensity * colorB;

														
 
															+					F32xX red = intensity * colorR;

														
 
															+					F32xX green = intensity * colorG;

														
 
															+					F32xX blue = intensity * colorB;

														
 
															 					red = red.clampUpper(255.1f);

														
 
															 					green = green.clampUpper(255.1f);

														
 
															 					blue = blue.clampUpper(255.1f);

														
 
															 					// Add light to the image

														
 
															-					U8x16 morelight = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

														
 
															+					U8xX morelight = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));

														
 
															 					addLight(lightPixel, morelight);

														
 
															 					// Go to the next four pixels in light-space

														
 
															-					lightBasePixelx4 += dx4;

														
 
															+					lightBasePixelxX += dxX;

														
 
															 					// Go to the next 4 pixels of image data

														
 
															-					lightPixel += 16;

														
 
															-					normalPixel += 4;

														
 
															-					heightPixel += 4;

														
 
															+					lightPixel += laneCountX_8Bit;

														
 
															+					normalPixel += laneCountX_32Bit;

														
 
															+					heightPixel += laneCountX_32Bit;

														
 
															 				}

														
 
															 				// Go to the next row in light-space

														
 
															-				lightBaseRowX4 += dy1;

														
 
															+				lightBaseRowX += dy1;

														
 
															 				// Go to the next row of image data

														
 
															 				lightRow.increaseBytes(lightStride);

														
 
															 				normalRow.increaseBytes(normalStride);

														
@@ -276,25 +295,25 @@ void blendLight(AlignedImageRgbaU8& colorBuffer, const OrderedImageRgbaU8& diffu
 
															 		int targetStride = image_getStride(colorBuffer);

														
 
															 		int diffuseStride = image_getStride(diffuseBuffer);

														
 
															 		int lightStride = image_getStride(lightBuffer);

														
 
															-		F32x4 scale = F32x4(1.0 / 128.0f);

														
 
															+		F32xX scale = F32xX(1.0 / 128.0f);

														
 
															 		for (int y = startIndex; y < stopIndex; y++) {

														
 
															 			SafePointer<uint32_t> targetPixel = targetRow;

														
 
															 			SafePointer<uint32_t> diffusePixel = diffuseRow;

														
 
															 			SafePointer<uint32_t> lightPixel = lightRow;

														
 
															-			for (int x4 = 0; x4 < width; x4 += 4) {

														
 
															-				U32x4 diffuse = U32x4::readAligned(diffusePixel, "blendLight: reading diffuse");

														
 
															-				U32x4 light = U32x4::readAligned(lightPixel, "blendLight: reading light");

														
 
															-				F32x4 red = (floatFromU32(getRed(diffuse)) * floatFromU32(getRed(light))) * scale;

														
 
															-				F32x4 green = (floatFromU32(getGreen(diffuse)) * floatFromU32(getGreen(light))) * scale;

														
 
															-				F32x4 blue = (floatFromU32(getBlue(diffuse)) * floatFromU32(getBlue(light))) * scale;

														
 
															+			for (int x = 0; x < width; x += laneCountX_32Bit) {

														
 
															+				U32xX diffuse = U32xX::readAligned(diffusePixel, "blendLight: reading diffuse");

														
 
															+				U32xX light = U32xX::readAligned(lightPixel, "blendLight: reading light");

														
 
															+				F32xX red = (floatFromU32(getRed(diffuse)) * floatFromU32(getRed(light))) * scale;

														
 
															+				F32xX green = (floatFromU32(getGreen(diffuse)) * floatFromU32(getGreen(light))) * scale;

														
 
															+				F32xX blue = (floatFromU32(getBlue(diffuse)) * floatFromU32(getBlue(light))) * scale;

														
 
															 				red = red.clampUpper(255.1f);

														
 
															 				green = green.clampUpper(255.1f);

														
 
															 				blue = blue.clampUpper(255.1f);

														
 
															-				U32x4 color = packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue), targetOrder);

														
 
															+				U32xX color = packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue), targetOrder);

														
 
															 				color.writeAligned(targetPixel, "blendLight: writing color");

														
 
															-				targetPixel += 4;

														
 
															-				diffusePixel += 4;

														
 
															-				lightPixel += 4;

														
 
															+				targetPixel += laneCountX_32Bit;

														
 
															+				diffusePixel += laneCountX_32Bit;

														
 
															+				lightPixel += laneCountX_32Bit;

														
 
															 			}

														
 
															 			targetRow.increaseBytes(targetStride);

														
 
															 			diffuseRow.increaseBytes(diffuseStride);