Browse Source

Testing arbitrary length vectors in the Sprite Engine.

David Piuva 2 years ago
parent
commit
d54ee7027e
3 changed files with 346 additions and 233 deletions
  1. 222 132
      Source/DFPSR/base/simd3D.h
  2. 42 38
      Source/DFPSR/image/PackOrder.h
  3. 82 63
      Source/SDK/SpriteEngine/lightAPI.cpp

+ 222 - 132
Source/DFPSR/base/simd3D.h

@@ -1,6 +1,6 @@
 // zlib open source license
 // zlib open source license
 //
 //
-// Copyright (c) 2017 to 2022 David Forsgren Piuva
+// Copyright (c) 2017 to 2023 David Forsgren Piuva
 // 
 // 
 // This software is provided 'as-is', without any express or implied
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // warranty. In no event will the authors be held liable for any damages
@@ -31,157 +31,247 @@
 #ifndef DFPSR_SIMD_3D
 #ifndef DFPSR_SIMD_3D
 #define DFPSR_SIMD_3D
 #define DFPSR_SIMD_3D
 
 
-// 3D vector in xxxxyyyyzzzz format
-struct F32x4x3 {
-	F32x4 v1, v2, v3;
-	// Direct constructor given 3 rows of length 4
-	F32x4x3(const F32x4& v1, const F32x4& v2, const F32x4& v3)
-	: v1(v1), v2(v2), v3(v3) {}
-	// Transposed constructor given 4 columns of length 3
-	F32x4x3(const dsr::FVector3D& vx, const dsr::FVector3D& vy, const dsr::FVector3D& vz, const dsr::FVector3D& vw)
-	: v1(F32x4(vx.x, vy.x, vz.x, vw.x)),
-	  v2(F32x4(vx.y, vy.y, vz.y, vw.y)),
-	  v3(F32x4(vx.z, vy.z, vz.z, vw.z)) {}
-	// Transposed constructor given a single repeated column
-	F32x4x3(const dsr::FVector3D& v)
-	: v1(F32x4(v.x, v.x, v.x, v.x)),
-	  v2(F32x4(v.y, v.y, v.y, v.y)),
-	  v3(F32x4(v.z, v.z, v.z, v.z)) {}
-	// In-place math operations
-	inline F32x4x3& operator+=(const F32x4x3& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; this->v3 = this->v3 + offset.v3; return *this; }
-	inline F32x4x3& operator-=(const F32x4x3& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; this->v3 = this->v3 - offset.v3; return *this; }
-	inline F32x4x3& operator*=(const F32x4x3& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; this->v3 = this->v3 * scale.v3; return *this; }
-	inline F32x4x3& operator+=(const F32x4& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; }
-	inline F32x4x3& operator-=(const F32x4& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; }
-	inline F32x4x3& operator*=(const F32x4& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }
-	inline F32x4x3& operator+=(const float& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; }
-	inline F32x4x3& operator-=(const float& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; }
-	inline F32x4x3& operator*=(const float& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }
-};
-
-inline F32x4x3 operator+(const F32x4x3 &left, const F32x4x3 &right) {
-	return F32x4x3(left.v1 + right.v1, left.v2 + right.v2, left.v3 + right.v3);
-}
-inline F32x4x3 operator+(const F32x4x3 &left, const F32x4 &right) {
-	return F32x4x3(left.v1 + right, left.v2 + right, left.v3 + right);
-}
-inline F32x4x3 operator+(const F32x4x3 &left, const float &right) {
-	return F32x4x3(left.v1 + right, left.v2 + right, left.v3 + right);
+// These are the infix operations for 2D SIMD vectors F32x4x2, F32x8x2...
+#define SIMD_VECTOR_INFIX_OPERATORS_2D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \
+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 + right.v1, left.v2 + right.v2); \
+} \
+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 + right, left.v2 + right); \
+} \
+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 + right, left.v2 + right); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 - right.v1, left.v2 - right.v2); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 - right, left.v2 - right); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 - right, left.v2 - right); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE& value) { \
+	return VECTOR_TYPE(-value.v1, -value.v2); \
+} \
+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 * right.v1, left.v2 * right.v2); \
+} \
+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 * right, left.v2 * right); \
+} \
+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 * right, left.v2 * right); \
+} \
+inline SIMD_TYPE dotProduct(const VECTOR_TYPE &a, const VECTOR_TYPE &b) { \
+	return (a.v1 * b.v1) + (a.v2 * b.v2); \
+} \
+inline SIMD_TYPE squareLength(const VECTOR_TYPE &v) { \
+	return dotProduct(v, v); \
+} \
+inline SIMD_TYPE length(const VECTOR_TYPE &v) { \
+	return squareLength(v).squareRoot(); \
+} \
+inline VECTOR_TYPE normalize(const VECTOR_TYPE &v) { \
+	return v * squareLength(v).reciprocalSquareRoot(); \
 }
 }
 
 
-inline F32x4x3 operator-(const F32x4x3 &left, const F32x4x3 &right) {
-	return F32x4x3(left.v1 - right.v1, left.v2 - right.v2, left.v3 - right.v3);
-}
-inline F32x4x3 operator-(const F32x4x3 &left, const F32x4 &right) {
-	return F32x4x3(left.v1 - right, left.v2 - right, left.v3 - right);
-}
-inline F32x4x3 operator-(const F32x4x3 &left, const float &right) {
-	return F32x4x3(left.v1 - right, left.v2 - right, left.v3 - right);
-}
-inline F32x4x3 operator-(const F32x4x3& value) {
-	return F32x4x3(-value.v1, -value.v2, -value.v3);
+// These are the infix operations for 3D SIMD vectors F32x4x3, F32x8x3...
+#define SIMD_VECTOR_INFIX_OPERATORS_3D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \
+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 + right.v1, left.v2 + right.v2, left.v3 + right.v3); \
+} \
+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 + right, left.v2 + right, left.v3 + right); \
+} \
+inline VECTOR_TYPE operator+(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 + right, left.v2 + right, left.v3 + right); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 - right.v1, left.v2 - right.v2, left.v3 - right.v3); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 - right, left.v2 - right, left.v3 - right); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 - right, left.v2 - right, left.v3 - right); \
+} \
+inline VECTOR_TYPE operator-(const VECTOR_TYPE& value) { \
+	return VECTOR_TYPE(-value.v1, -value.v2, -value.v3); \
+} \
+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const VECTOR_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 * right.v1, left.v2 * right.v2, left.v3 * right.v3); \
+} \
+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const SIMD_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 * right, left.v2 * right, left.v3 * right); \
+} \
+inline VECTOR_TYPE operator*(const VECTOR_TYPE &left, const ELEMENT_TYPE &right) { \
+	return VECTOR_TYPE(left.v1 * right, left.v2 * right, left.v3 * right); \
+} \
+inline SIMD_TYPE dotProduct(const VECTOR_TYPE &a, const VECTOR_TYPE &b) { \
+	return (a.v1 * b.v1) + (a.v2 * b.v2) + (a.v3 * b.v3); \
+} \
+inline SIMD_TYPE squareLength(const VECTOR_TYPE &v) { \
+	return dotProduct(v, v); \
+} \
+inline SIMD_TYPE length(const VECTOR_TYPE &v) { \
+	return squareLength(v).squareRoot(); \
+} \
+inline VECTOR_TYPE normalize(const VECTOR_TYPE &v) { \
+	return v * squareLength(v).reciprocalSquareRoot(); \
 }
 }
 
 
-inline F32x4x3 operator*(const F32x4x3 &left, const F32x4x3 &right) {
-	return F32x4x3(left.v1 * right.v1, left.v2 * right.v2, left.v3 * right.v3);
-}
-inline F32x4x3 operator*(const F32x4x3 &left, const F32x4 &right) {
-	return F32x4x3(left.v1 * right, left.v2 * right, left.v3 * right);
-}
-inline F32x4x3 operator*(const F32x4x3 &left, const float &right) {
-	return F32x4x3(left.v1 * right, left.v2 * right, left.v3 * right);
-}
-
-inline F32x4 dotProduct(const F32x4x3 &a, const F32x4x3 &b) {
-	return (a.v1 * b.v1) + (a.v2 * b.v2) + (a.v3 * b.v3);
-}
-
-inline F32x4 squareLength(const F32x4x3 &v) {
-	return dotProduct(v, v);
-}
-
-inline F32x4 length(const F32x4x3 &v) {
-	return squareLength(v).squareRoot();
-}
+// These are the available in-place operations for 2D SIMD vectors F32x4x2, F32x8x2...
+#define SIMD_VECTOR_MEMBER_OPERATORS_2D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \
+	inline VECTOR_TYPE& operator+=(const VECTOR_TYPE& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; return *this; } \
+	inline VECTOR_TYPE& operator-=(const VECTOR_TYPE& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; return *this; } \
+	inline VECTOR_TYPE& operator*=(const VECTOR_TYPE& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; return *this; } \
+	inline VECTOR_TYPE& operator+=(const SIMD_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; } \
+	inline VECTOR_TYPE& operator-=(const SIMD_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; } \
+	inline VECTOR_TYPE& operator*=(const SIMD_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; } \
+	inline VECTOR_TYPE& operator+=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; } \
+	inline VECTOR_TYPE& operator-=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; } \
+	inline VECTOR_TYPE& operator*=(const ELEMENT_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }
 
 
-inline F32x4x3 normalize(const F32x4x3 &v) {
-	return v * squareLength(v).reciprocalSquareRoot();
-}
+// These are the available in-place operations for 3D SIMD vectors F32x4x3, F32x8x3...
+#define SIMD_VECTOR_MEMBER_OPERATORS_3D(VECTOR_TYPE, SIMD_TYPE, ELEMENT_TYPE) \
+	inline VECTOR_TYPE& operator+=(const VECTOR_TYPE& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; this->v3 = this->v3 + offset.v3; return *this; } \
+	inline VECTOR_TYPE& operator-=(const VECTOR_TYPE& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; this->v3 = this->v3 - offset.v3; return *this; } \
+	inline VECTOR_TYPE& operator*=(const VECTOR_TYPE& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; this->v3 = this->v3 * scale.v3; return *this; } \
+	inline VECTOR_TYPE& operator+=(const SIMD_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; } \
+	inline VECTOR_TYPE& operator-=(const SIMD_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; } \
+	inline VECTOR_TYPE& operator*=(const SIMD_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; } \
+	inline VECTOR_TYPE& operator+=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; this->v3 = this->v3 + offset; return *this; } \
+	inline VECTOR_TYPE& operator-=(const ELEMENT_TYPE& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; this->v3 = this->v3 - offset; return *this; } \
+	inline VECTOR_TYPE& operator*=(const ELEMENT_TYPE& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; this->v3 = this->v3 * scale; return *this; }
 
 
-// 2D vector in xxxxyyyy format
+// 128x2-bit SIMD vectorized 2D math vector stored in xxxxyyyy format (one planar SIMD vector per dimension).
 struct F32x4x2 {
 struct F32x4x2 {
 	F32x4 v1, v2;
 	F32x4 v1, v2;
 	// Direct constructor given 2 rows of length 4
 	// Direct constructor given 2 rows of length 4
 	F32x4x2(const F32x4& v1, const F32x4& v2)
 	F32x4x2(const F32x4& v1, const F32x4& v2)
 	: v1(v1), v2(v2) {}
 	: v1(v1), v2(v2) {}
-	// Transposed constructor given 4 columns of length 3
-	F32x4x2(const dsr::FVector2D& vx, const dsr::FVector2D& vy, const dsr::FVector2D& vz, const dsr::FVector2D& vw)
-	: v1(F32x4(vx.x, vy.x, vz.x, vw.x)),
-	  v2(F32x4(vx.y, vy.y, vz.y, vw.y)) {}
+	// Gradient constructor from an initial vector and the increment for each element.
+	static F32x4x2 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {
+		return F32x4x2(
+		  F32x4::createGradient(start.x, increment.x),
+		  F32x4::createGradient(start.y, increment.y)
+		);
+	}
+	// Transposed constructor given 4 columns of length 2 (Only allowed for fixed size SIMD, not X or F vector lengths)
+	F32x4x2(const dsr::FVector2D& a, const dsr::FVector2D& b, const dsr::FVector2D& c, const dsr::FVector2D& d)
+	: v1(a.x, b.x, c.x, d.x),
+	  v2(a.y, b.y, c.y, d.y) {}
 	// Transposed constructor given a single repeated column
 	// Transposed constructor given a single repeated column
 	F32x4x2(const dsr::FVector2D& v)
 	F32x4x2(const dsr::FVector2D& v)
-	: v1(F32x4(v.x, v.x, v.x, v.x)),
-	  v2(F32x4(v.y, v.y, v.y, v.y)) {}
+	: v1(F32x4(v.x)),
+	  v2(F32x4(v.y)) {}
 	// In-place math operations
 	// In-place math operations
-	inline F32x4x2& operator+=(const F32x4x2& offset) { this->v1 = this->v1 + offset.v1; this->v2 = this->v2 + offset.v2; return *this; }
-	inline F32x4x2& operator-=(const F32x4x2& offset) { this->v1 = this->v1 - offset.v1; this->v2 = this->v2 - offset.v2; return *this; }
-	inline F32x4x2& operator*=(const F32x4x2& scale) { this->v1 = this->v1 * scale.v1; this->v2 = this->v2 * scale.v2; return *this; }
-	inline F32x4x2& operator+=(const F32x4& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; }
-	inline F32x4x2& operator-=(const F32x4& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; }
-	inline F32x4x2& operator*=(const F32x4& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }
-	inline F32x4x2& operator+=(const float& offset) { this->v1 = this->v1 + offset; this->v2 = this->v2 + offset; return *this; }
-	inline F32x4x2& operator-=(const float& offset) { this->v1 = this->v1 - offset; this->v2 = this->v2 - offset; return *this; }
-	inline F32x4x2& operator*=(const float& scale) { this->v1 = this->v1 * scale; this->v2 = this->v2 * scale; return *this; }
+	SIMD_VECTOR_MEMBER_OPERATORS_2D(F32x4x2, F32x4, float)
 };
 };
+SIMD_VECTOR_INFIX_OPERATORS_2D(F32x4x2, F32x4, float)
 
 
-inline F32x4x2 operator+(const F32x4x2 &left, const F32x4x2 &right) {
-	return F32x4x2(left.v1 + right.v1, left.v2 + right.v2);
-}
-inline F32x4x2 operator+(const F32x4x2 &left, const F32x4 &right) {
-	return F32x4x2(left.v1 + right, left.v2 + right);
-}
-inline F32x4x2 operator+(const F32x4x2 &left, const float &right) {
-	return F32x4x2(left.v1 + right, left.v2 + right);
-}
-
-inline F32x4x2 operator-(const F32x4x2 &left, const F32x4x2 &right) {
-	return F32x4x2(left.v1 - right.v1, left.v2 - right.v2);
-}
-inline F32x4x2 operator-(const F32x4x2 &left, const F32x4 &right) {
-	return F32x4x2(left.v1 - right, left.v2 - right);
-}
-inline F32x4x2 operator-(const F32x4x2 &left, const float &right) {
-	return F32x4x2(left.v1 - right, left.v2 - right);
-}
-inline F32x4x2 operator-(const F32x4x2& value) {
-	return F32x4x2(-value.v1, -value.v2);
-}
-
-inline F32x4x2 operator*(const F32x4x2 &left, const F32x4x2 &right) {
-	return F32x4x2(left.v1 * right.v1, left.v2 * right.v2);
-}
-inline F32x4x2 operator*(const F32x4x2 &left, const F32x4 &right) {
-	return F32x4x2(left.v1 * right, left.v2 * right);
-}
-inline F32x4x2 operator*(const F32x4x2 &left, const float &right) {
-	return F32x4x2(left.v1 * right, left.v2 * right);
-}
+// 256x2-bit SIMD vectorized 2D math vector stored in xxxxxxxxyyyyyyyy format (one planar SIMD vector per dimension).
+struct F32x8x2 {
+	F32x8 v1, v2; // v1 holds the x components and v2 holds the y components
+	// Direct constructor given 2 rows of length 8 (one F32x8 per dimension)
+	F32x8x2(const F32x8& v1, const F32x8& v2)
+	: v1(v1), v2(v2) {}
+	// Gradient constructor from an initial vector and the increment for each element. NOTE(review): takes FVector3D but only x and y are read; confirm whether FVector2D was intended.
+	static F32x8x2 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {
+		return F32x8x2(
+		  F32x8::createGradient(start.x, increment.x),
+		  F32x8::createGradient(start.y, increment.y)
+		);
+	}
+	// Transposed constructor given 8 columns of length 2 (Only allowed for fixed size SIMD, not X or F vector lengths)
+	F32x8x2(const dsr::FVector2D& a, const dsr::FVector2D& b, const dsr::FVector2D& c, const dsr::FVector2D& d, const dsr::FVector2D& e, const dsr::FVector2D& f, const dsr::FVector2D& g, const dsr::FVector2D& h)
+	: v1(a.x, b.x, c.x, d.x, e.x, f.x, g.x, h.x),
+	  v2(a.y, b.y, c.y, d.y, e.y, f.y, g.y, h.y) {}
+	// Transposed constructor given a single repeated column (v.x fills v1 and v.y fills v2)
+	F32x8x2(const dsr::FVector2D& v)
+	: v1(F32x8(v.x)),
+	  v2(F32x8(v.y)) {}
+	// In-place math operations (+=, -= and *= taking a F32x8x2, F32x8 or float operand)
+	SIMD_VECTOR_MEMBER_OPERATORS_2D(F32x8x2, F32x8, float)
+};
+SIMD_VECTOR_INFIX_OPERATORS_2D(F32x8x2, F32x8, float)
 
 
-inline F32x4 dotProduct(const F32x4x2 &a, const F32x4x2 &b) {
-	return (a.v1 * b.v1) + (a.v2 * b.v2);
-}
+// 128x3-bit SIMD vectorized 3D math vector stored in xxxxyyyyzzzz format (one planar SIMD vector per dimension).
+struct F32x4x3 {
+	F32x4 v1, v2, v3; // v1 holds the x components, v2 the y components and v3 the z components
+	// Direct constructor given 3 rows of length 4 (one F32x4 per dimension)
+	F32x4x3(const F32x4& v1, const F32x4& v2, const F32x4& v3)
+	: v1(v1), v2(v2), v3(v3) {}
+	// Gradient constructor from an initial vector and the increment for each element, applied separately to x, y and z.
+	static F32x4x3 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {
+		return F32x4x3(
+		  F32x4::createGradient(start.x, increment.x),
+		  F32x4::createGradient(start.y, increment.y),
+		  F32x4::createGradient(start.z, increment.z)
+		);
+	}
+	// Transposed constructor given 4 columns of length 3 (each argument supplies the x, y, z of one lane)
+	F32x4x3(const dsr::FVector3D& a, const dsr::FVector3D& b, const dsr::FVector3D& c, const dsr::FVector3D& d)
+	: v1(a.x, b.x, c.x, d.x),
+	  v2(a.y, b.y, c.y, d.y),
+	  v3(a.z, b.z, c.z, d.z) {}
+	// Transposed constructor given a single repeated column (v.x fills v1, v.y fills v2 and v.z fills v3)
+	F32x4x3(const dsr::FVector3D& v)
+	: v1(F32x4(v.x)),
+	  v2(F32x4(v.y)),
+	  v3(F32x4(v.z)) {}
+	// In-place math operations (+=, -= and *= taking a F32x4x3, F32x4 or float operand)
+	SIMD_VECTOR_MEMBER_OPERATORS_3D(F32x4x3, F32x4, float)
+};
+SIMD_VECTOR_INFIX_OPERATORS_3D(F32x4x3, F32x4, float)
 
 
-inline F32x4 squareLength(const F32x4x2 &v) {
-	return dotProduct(v, v);
-}
+// 256x3-bit SIMD vectorized 3D math vector stored in xxxxxxxxyyyyyyyyzzzzzzzz format (one planar SIMD vector per dimension).
+struct F32x8x3 {
+	F32x8 v1, v2, v3; // v1 holds the x components, v2 the y components and v3 the z components
+	// Direct constructor given 3 rows of length 8 (one F32x8 per dimension)
+	F32x8x3(const F32x8& v1, const F32x8& v2, const F32x8& v3)
+	: v1(v1), v2(v2), v3(v3) {}
+	// Gradient constructor from an initial vector and the increment for each element, applied separately to x, y and z.
+	static F32x8x3 createGradient(const dsr::FVector3D& start, const dsr::FVector3D& increment) {
+		return F32x8x3(
+		  F32x8::createGradient(start.x, increment.x),
+		  F32x8::createGradient(start.y, increment.y),
+		  F32x8::createGradient(start.z, increment.z)
+		);
+	}
+	// Transposed constructor given 8 columns of length 3 (each argument supplies the x, y, z of one lane)
+	F32x8x3(const dsr::FVector3D& a, const dsr::FVector3D& b, const dsr::FVector3D& c, const dsr::FVector3D& d, const dsr::FVector3D& e, const dsr::FVector3D& f, const dsr::FVector3D& g, const dsr::FVector3D& h)
+	: v1(a.x, b.x, c.x, d.x, e.x, f.x, g.x, h.x),
+	  v2(a.y, b.y, c.y, d.y, e.y, f.y, g.y, h.y),
+	  v3(a.z, b.z, c.z, d.z, e.z, f.z, g.z, h.z) {}
+	// Transposed constructor given a single repeated column (v.x fills v1, v.y fills v2 and v.z fills v3)
+	F32x8x3(const dsr::FVector3D& v)
+	: v1(F32x8(v.x)),
+	  v2(F32x8(v.y)),
+	  v3(F32x8(v.z)) {}
+	// In-place math operations (+=, -= and *= taking a F32x8x3, F32x8 or float operand)
+	SIMD_VECTOR_MEMBER_OPERATORS_3D(F32x8x3, F32x8, float)
+};
+SIMD_VECTOR_INFIX_OPERATORS_3D(F32x8x3, F32x8, float)
 
 
-inline F32x4 length(const F32x4x2 &v) {
-	return squareLength(v).squareRoot();
-}
+// X vector aliases
+#if DSR_DEFAULT_VECTOR_SIZE == 16
+	using F32xXx3 = F32x4x3;
+	using F32xXx2 = F32x4x2;
+#elif DSR_DEFAULT_VECTOR_SIZE == 32
+	using F32xXx3 = F32x8x3;
+	using F32xXx2 = F32x8x2;
+#endif
 
 
-inline F32x4x2 normalize(const F32x4x2 &v) {
-	return v * squareLength(v).reciprocalSquareRoot();
-}
+// F vector aliases
+#if DSR_FLOAT_VECTOR_SIZE == 16
+	using F32xFx3 = F32x4x3;
+	using F32xFx2 = F32x4x2;
+#elif DSR_FLOAT_VECTOR_SIZE == 32
+	using F32xFx3 = F32x8x3;
+	using F32xFx2 = F32x8x2;
+#endif
 
 
 #endif
 #endif
 
 

+ 42 - 38
Source/DFPSR/image/PackOrder.h

@@ -1,6 +1,6 @@
 // zlib open source license
 // zlib open source license
 //
 //
-// Copyright (c) 2017 to 2019 David Forsgren Piuva
+// Copyright (c) 2017 to 2023 David Forsgren Piuva
 // 
 // 
 // This software is provided 'as-is', without any express or implied
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // warranty. In no event will the authors be held liable for any damages
@@ -93,22 +93,26 @@ inline bool operator==(const PackOrder &left, const PackOrder &right) {
 }
 }
 
 
 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.
 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.
-inline static U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+T packBytes(const T &s0, const T &s1, const T &s2) {
 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16);
 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16);
 }
 }
 // Using a specified packing order
 // Using a specified packing order
-inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const PackOrder &order) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+T packBytes(const T &s0, const T &s1, const T &s2, const PackOrder &order) {
 	return ENDIAN_POS_ADDR(s0, order.redOffset)
 	return ENDIAN_POS_ADDR(s0, order.redOffset)
 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)
 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)
 	     | ENDIAN_POS_ADDR(s2, order.blueOffset);
 	     | ENDIAN_POS_ADDR(s2, order.blueOffset);
 }
 }
 
 
 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.
 // Each input 32-bit element is from 0 to 255. Otherwise, the remainder will leak to other elements.
-inline static U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const U32x4 &s3) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+T packBytes(const T &s0, const T &s1, const T &s2, const T &s3) {
 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16) | ENDIAN_POS_ADDR(s3, 24);
 	return s0 | ENDIAN_POS_ADDR(s1, 8) | ENDIAN_POS_ADDR(s2, 16) | ENDIAN_POS_ADDR(s3, 24);
 }
 }
 // Using a specified packing order
 // Using a specified packing order
-inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const U32x4 &s3, const PackOrder &order) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+T packBytes(const T &s0, const T &s1, const T &s2, const T &s3, const PackOrder &order) {
 	return ENDIAN_POS_ADDR(s0, order.redOffset)
 	return ENDIAN_POS_ADDR(s0, order.redOffset)
 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)
 	     | ENDIAN_POS_ADDR(s1, order.greenOffset)
 	     | ENDIAN_POS_ADDR(s2, order.blueOffset)
 	     | ENDIAN_POS_ADDR(s2, order.blueOffset)
@@ -116,7 +120,15 @@ inline U32x4 packBytes(const U32x4 &s0, const U32x4 &s1, const U32x4 &s2, const
 }
 }
 
 
 // Pack separate floats into saturated bytes
 // Pack separate floats into saturated bytes
-inline static U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4 &s2, const F32x4 &s3) {
+inline U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4 &s2, const F32x4 &s3) {
+	return packBytes(
+	  truncateToU32(s0.clamp(0.1f, 255.1f)),
+	  truncateToU32(s1.clamp(0.1f, 255.1f)),
+	  truncateToU32(s2.clamp(0.1f, 255.1f)),
+	  truncateToU32(s3.clamp(0.1f, 255.1f))
+	);
+}
+inline U32x8 floatToSaturatedByte(const F32x8 &s0, const F32x8 &s1, const F32x8 &s2, const F32x8 &s3) {
 	return packBytes(
 	return packBytes(
 	  truncateToU32(s0.clamp(0.1f, 255.1f)),
 	  truncateToU32(s0.clamp(0.1f, 255.1f)),
 	  truncateToU32(s1.clamp(0.1f, 255.1f)),
 	  truncateToU32(s1.clamp(0.1f, 255.1f)),
@@ -134,54 +146,46 @@ inline U32x4 floatToSaturatedByte(const F32x4 &s0, const F32x4 &s1, const F32x4
 	  order
 	  order
 	);
 	);
 }
 }
-
-inline uint32_t getRed(uint32_t color) {
-	return color & ENDIAN32_BYTE_0;
-}
-inline uint32_t getRed(uint32_t color, const PackOrder &order) {
-	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);
-}
-inline uint32_t getGreen(uint32_t color) {
-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);
-}
-inline uint32_t getGreen(uint32_t color, const PackOrder &order) {
-	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);
-}
-inline uint32_t getBlue(uint32_t color) {
-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);
-}
-inline uint32_t getBlue(uint32_t color, const PackOrder &order) {
-	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);
-}
-inline uint32_t getAlpha(uint32_t color) {
-	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);
-}
-inline uint32_t getAlpha(uint32_t color, const PackOrder &order) {
-	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);
+inline U32x8 floatToSaturatedByte(const F32x8 &s0, const F32x8 &s1, const F32x8 &s2, const F32x8 &s3, const PackOrder &order) {
+	return packBytes(
+	  truncateToU32(s0.clamp(0.1f, 255.1f)),
+	  truncateToU32(s1.clamp(0.1f, 255.1f)),
+	  truncateToU32(s2.clamp(0.1f, 255.1f)),
+	  truncateToU32(s3.clamp(0.1f, 255.1f)),
+	  order
+	);
 }
 }
 
 
-inline U32x4 getRed(const U32x4 &color) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getRed(T color) {
 	return color & ENDIAN32_BYTE_0;
 	return color & ENDIAN32_BYTE_0;
 }
 }
-inline U32x4 getRed(const U32x4 &color, const PackOrder &order) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getRed(T color, const PackOrder &order) {
 	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);
 	return ENDIAN_NEG_ADDR(color & order.redMask, order.redOffset);
 }
 }
-inline U32x4 getGreen(const U32x4 &color) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getGreen(T color) {
 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);
 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_1, 8);
 }
 }
-inline U32x4 getGreen(const U32x4 &color, const PackOrder &order) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getGreen(T color, const PackOrder &order) {
 	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);
 	return ENDIAN_NEG_ADDR(color & order.greenMask, order.greenOffset);
 }
 }
-inline U32x4 getBlue(const U32x4 &color) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getBlue(T color) {
 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);
 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_2, 16);
 }
 }
-inline U32x4 getBlue(const U32x4 &color, const PackOrder &order) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getBlue(T color, const PackOrder &order) {
 	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);
 	return ENDIAN_NEG_ADDR(color & order.blueMask, order.blueOffset);
 }
 }
-inline U32x4 getAlpha(const U32x4 &color) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getAlpha(T color) {
 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);
 	return ENDIAN_NEG_ADDR(color & ENDIAN32_BYTE_3, 24);
 }
 }
-inline U32x4 getAlpha(const U32x4 &color, const PackOrder &order) {
+template<typename T> // Accepting uint32_t, U32x4, U32x8...
+inline T getAlpha(T color, const PackOrder &order) {
 	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);
 	return ENDIAN_NEG_ADDR(color & order.alphaMask, order.alphaOffset);
 }
 }
 
 

+ 82 - 63
Source/SDK/SpriteEngine/lightAPI.cpp

@@ -6,24 +6,24 @@
 namespace dsr {
 namespace dsr {
 
 
 // Precondition: The packed color must be in the standard RGBA order, meaning no native packing
 // Precondition: The packed color must be in the standard RGBA order, meaning no native packing
-inline F32x4x3 unpackRgb_U32x4_to_F32x4x3(const U32x4& color) {
-	return F32x4x3(floatFromU32(getRed(color)), floatFromU32(getGreen(color)), floatFromU32(getBlue(color)));
+inline F32xXx3 unpackRgb_U32xX_to_F32xXx3(const U32xX& color) {
+	return F32xXx3(floatFromU32(getRed(color)), floatFromU32(getGreen(color)), floatFromU32(getBlue(color)));
 }
 }
 
 
-static inline void setLight(SafePointer<uint8_t> lightPixel, U8x16 newlight) {
+static inline void setLight(SafePointer<uint8_t> lightPixel, U8xX newlight) {
 	newlight.writeAligned(lightPixel, "setLight: writing light");
 	newlight.writeAligned(lightPixel, "setLight: writing light");
 }
 }
 
 
-static inline void addLight(SafePointer<uint8_t> lightPixel, U8x16 addedlight) {
-	U8x16 oldLight = U8x16::readAligned(lightPixel, "addLight: reading light");
-	U8x16 newlight = saturatedAddition(oldLight, addedlight);
+static inline void addLight(SafePointer<uint8_t> lightPixel, U8xX addedlight) {
+	U8xX oldLight = U8xX::readAligned(lightPixel, "addLight: reading light");
+	U8xX newlight = saturatedAddition(oldLight, addedlight);
 	newlight.writeAligned(lightPixel, "addLight: writing light");
 	newlight.writeAligned(lightPixel, "addLight: writing light");
 }
 }
 
 
 template <bool ADD_LIGHT>
 template <bool ADD_LIGHT>
 void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lightBuffer, const OrderedImageRgbaU8& normalBuffer, const FVector3D& lightDirection, float lightIntensity, const ColorRgbI32& lightColor) {
 void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lightBuffer, const OrderedImageRgbaU8& normalBuffer, const FVector3D& lightDirection, float lightIntensity, const ColorRgbI32& lightColor) {
 	// Normals in range 0..255 - 128 have lengths of 127 and 128, so if we double the reverse light direction we'll end up near 0..255 again for colors
 	// Normals in range 0..255 - 128 have lengths of 127 and 128, so if we double the reverse light direction we'll end up near 0..255 again for colors
-	F32x4x3 reverseLightDirection = F32x4x3(-normalize(normalToWorldSpace.transformTransposed(lightDirection)) * lightIntensity * 2.0f);
+	F32xXx3 reverseLightDirection = F32xXx3(-normalize(normalToWorldSpace.transformTransposed(lightDirection)) * lightIntensity * 2.0f);
 	IRect rectangleBound = image_getBound(lightBuffer);
 	IRect rectangleBound = image_getBound(lightBuffer);
 	float colorR = std::max(0.0f, (float)lightColor.red / 255.0f);
 	float colorR = std::max(0.0f, (float)lightColor.red / 255.0f);
 	float colorG = std::max(0.0f, (float)lightColor.green / 255.0f);
 	float colorG = std::max(0.0f, (float)lightColor.green / 255.0f);
@@ -37,27 +37,29 @@ void directedLight(const FMatrix3x3& normalToWorldSpace, OrderedImageRgbaU8& lig
 		for (int y = bound.top(); y < bound.bottom(); y++) {
 		for (int y = bound.top(); y < bound.bottom(); y++) {
 			SafePointer<uint8_t> lightPixel = lightRow;
 			SafePointer<uint8_t> lightPixel = lightRow;
 			SafePointer<uint32_t> normalPixel = normalRow;
 			SafePointer<uint32_t> normalPixel = normalRow;
-			for (int x4 = bound.left(); x4 < bound.right(); x4+=4) {
+			for (int x = bound.left(); x < bound.right(); x += laneCountX_32Bit) {
 				// Read surface normals
 				// Read surface normals
-				U32x4 normalColor = U32x4::readAligned(normalPixel, "directedLight: reading normal");
-				F32x4x3 negativeSurfaceNormal = unpackRgb_U32x4_to_F32x4x3(normalColor) - 128.0f;
+				U32xX normalColor = U32xX::readAligned(normalPixel, "directedLight: reading normal");
+				// TODO: Port SIMD3D to handle arbitrary vector lengths.
+				F32xXx3 negativeSurfaceNormal = unpackRgb_U32xX_to_F32xXx3(normalColor) - 128.0f;
 				// Calculate light intensity
 				// Calculate light intensity
 				//   Normalization and negation is already pre-multiplied into reverseLightDirection
 				//   Normalization and negation is already pre-multiplied into reverseLightDirection
-				F32x4 intensity = dotProduct(negativeSurfaceNormal, reverseLightDirection).clampLower(0.0f);
-				F32x4 red = intensity * colorR;
-				F32x4 green = intensity * colorG;
-				F32x4 blue = intensity * colorB;
+				F32xX intensity = dotProduct(negativeSurfaceNormal, reverseLightDirection).clampLower(0.0f);
+				F32xX red = intensity * colorR;
+				F32xX green = intensity * colorG;
+				F32xX blue = intensity * colorB;
 				red = red.clampUpper(255.1f);
 				red = red.clampUpper(255.1f);
 				green = green.clampUpper(255.1f);
 				green = green.clampUpper(255.1f);
 				blue = blue.clampUpper(255.1f);
 				blue = blue.clampUpper(255.1f);
-				U8x16 light = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));
+				// TODO: Let color packing handle arbitrary vector lengths.
+				U8xX light = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));
 				if (ADD_LIGHT) {
 				if (ADD_LIGHT) {
 					addLight(lightPixel, light);
 					addLight(lightPixel, light);
 				} else {
 				} else {
 					setLight(lightPixel, light);
 					setLight(lightPixel, light);
 				}
 				}
-				lightPixel += 16;
-				normalPixel += 4;
+				lightPixel += laneCountX_8Bit;
+				normalPixel += laneCountX_32Bit;
 			}
 			}
 			lightRow.increaseBytes(lightStride);
 			lightRow.increaseBytes(lightStride);
 			normalRow.increaseBytes(normalStride);
 			normalRow.increaseBytes(normalStride);
@@ -136,16 +138,33 @@ static float getShadowTransparency(SafePointer<float> pixelData, int32_t width,
 	return reciDepth * 1.02f > shadowReciDepth ? 1.0f : 0.0f;
 	return reciDepth * 1.02f > shadowReciDepth ? 1.0f : 0.0f;
 }
 }
 
 
-static inline F32x4 getShadowTransparency(SafePointer<float> pixelData, int32_t width, float halfWidth, const F32x4x3& lightOffset) {
-	FVector4D offsetX = lightOffset.v1.get();
-	FVector4D offsetY = lightOffset.v2.get();
-	FVector4D offsetZ = lightOffset.v3.get();
-	return F32x4(
-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.x, offsetY.x, offsetZ.x)),
-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.y, offsetY.y, offsetZ.y)),
-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.z, offsetY.z, offsetZ.z)),
-		getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX.w, offsetY.w, offsetZ.w))
-	);
+static inline F32xX getShadowTransparency(SafePointer<float> pixelData, int32_t width, float halfWidth, const F32xXx3& lightOffset) {
+	// TODO: Create a way to quickly iterate over elements in a SIMD vector for interfacing with scalar operations.
+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetX[DSR_DEFAULT_VECTOR_SIZE];
+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetY[DSR_DEFAULT_VECTOR_SIZE];
+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) float offsetZ[DSR_DEFAULT_VECTOR_SIZE];
+	lightOffset.v1.writeAlignedUnsafe(offsetX);
+	lightOffset.v2.writeAlignedUnsafe(offsetY);
+	lightOffset.v3.writeAlignedUnsafe(offsetZ);
+	#if DSR_DEFAULT_VECTOR_SIZE == 16
+		return F32x4(
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[0], offsetY[0], offsetZ[0])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[1], offsetY[1], offsetZ[1])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[2], offsetY[2], offsetZ[2])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[3], offsetY[3], offsetZ[3]))
+		);
+	#elif DSR_DEFAULT_VECTOR_SIZE == 32
+		return F32x8(
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[0], offsetY[0], offsetZ[0])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[1], offsetY[1], offsetZ[1])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[2], offsetY[2], offsetZ[2])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[3], offsetY[3], offsetZ[3])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[4], offsetY[4], offsetZ[4])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[5], offsetY[5], offsetZ[5])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[6], offsetY[6], offsetZ[6])),
+			getShadowTransparency(pixelData, width, halfWidth, FVector3D(offsetX[7], offsetY[7], offsetZ[7]))
+		);
+	#endif
 }
 }
 
 
 template <bool SHADOW_CASTING>
 template <bool SHADOW_CASTING>
@@ -154,11 +173,11 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 	//   Normal-space defines the rotation for light-space
 	//   Normal-space defines the rotation for light-space
 	FVector3D lightSpaceSourcePosition = camera.normalToWorldSpace.transformTransposed(lightPosition);
 	FVector3D lightSpaceSourcePosition = camera.normalToWorldSpace.transformTransposed(lightPosition);
 	// Align the rectangle to whole SIMD vectors of laneCountX_32Bit pixels, because the 32-bit height and normal buffers are read one aligned vector at a time
 	// Align the rectangle to whole SIMD vectors of laneCountX_32Bit pixels, because the 32-bit height and normal buffers are read one aligned vector at a time
-	IRect rectangleBound = calculateBound(camera, worldCenter, lightBuffer, lightSpaceSourcePosition, lightRadius, 4);
+	IRect rectangleBound = calculateBound(camera, worldCenter, lightBuffer, lightSpaceSourcePosition, lightRadius, laneCountX_32Bit);
 	if (rectangleBound.hasArea()) {
 	if (rectangleBound.hasArea()) {
 		// Uniform values
 		// Uniform values
 		// How much closer to your face in light-space does the pixel go per depth unit
 		// How much closer to your face in light-space does the pixel go per depth unit
-		F32x4x3 inYourFaceAxis = F32x4x3(camera.screenDepthToLightSpace.zAxis);
+		F32xXx3 inYourFaceAxis = F32xXx3(camera.screenDepthToLightSpace.zAxis);
 		// Light color
 		// Light color
 		float colorR = std::max(0.0f, (float)lightColor.red * lightIntensity);
 		float colorR = std::max(0.0f, (float)lightColor.red * lightIntensity);
 		float colorG = std::max(0.0f, (float)lightColor.green * lightIntensity);
 		float colorG = std::max(0.0f, (float)lightColor.green * lightIntensity);
@@ -173,14 +192,14 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 			FVector3D dx = camera.screenDepthToLightSpace.xAxis;
 			FVector3D dx = camera.screenDepthToLightSpace.xAxis;
 			FVector3D dy = camera.screenDepthToLightSpace.yAxis;
 			FVector3D dy = camera.screenDepthToLightSpace.yAxis;
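 			// Pack the offsets for the first vector of pixels in the row, one lane per pixel (presumably lightBaseRow + dx * lane index, as the replaced constructor call did for 4 lanes)
 			// Pack the offsets for the first vector of pixels in the row, one lane per pixel (presumably lightBaseRow + dx * lane index, as the replaced constructor call did for 4 lanes)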
-			F32x4x3 lightBaseRowX4 = F32x4x3(lightBaseRow, lightBaseRow + dx, lightBaseRow + dx * 2.0f, lightBaseRow + dx * 3.0f);
+			F32xXx3 lightBaseRowX = F32xXx3::createGradient(lightBaseRow, dx);
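 			// Derivatives for moving one whole vector of laneCountX_32Bit pixels to the right in parallel (the example below shows 4 lanes)
 			// Derivatives for moving one whole vector of laneCountX_32Bit pixels to the right in parallel (the example below shows 4 lanes)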
 			//    (n+0, y0), (n+1, y0), (n+2, y0), (n+3, y0) -> (n+4, y0), (n+5, y0), (n+6, y0), (n+7, y0)
 			//    (n+0, y0), (n+1, y0), (n+2, y0), (n+3, y0) -> (n+4, y0), (n+5, y0), (n+6, y0), (n+7, y0)
-			F32x4x3 dx4 = F32x4x3(dx * 4.0f);
+			F32xXx3 dxX = F32xXx3(dx * (float)laneCountX_32Bit);
 			// Derivatives for moving one pixel down in parallel
 			// Derivatives for moving one pixel down in parallel
 			//    (x0, n+0), (x1, n+0), (x2, n+0), (x3, n+0)
 			//    (x0, n+0), (x1, n+0), (x2, n+0), (x3, n+0)
 			// -> (x0, n+1), (x1, n+1), (x2, n+1), (x3, n+1)
 			// -> (x0, n+1), (x1, n+1), (x2, n+1), (x3, n+1)
-			F32x4x3 dy1 = F32x4x3(dy);
+			F32xXx3 dy1 = F32xXx3(dy);
 			// Get strides
 			// Get strides
 			int lightStride = image_getStride(lightBuffer);
 			int lightStride = image_getStride(lightBuffer);
 			int normalStride = image_getStride(normalBuffer);
 			int normalStride = image_getStride(normalBuffer);
@@ -194,56 +213,56 @@ static void addPointLightSuper(const OrthoView& camera, const IVector2D& worldCe
 			SafePointer<float> shadowCubeData;
 			SafePointer<float> shadowCubeData;
 			float shadowCubeCenter;
 			float shadowCubeCenter;
 			if (SHADOW_CASTING) {
 			if (SHADOW_CASTING) {
-				shadowCubeWidth = image_getWidth(shadowCubeMap); assert(shadowCubeWidth % 4 == 0);
+				shadowCubeWidth = image_getWidth(shadowCubeMap); assert(shadowCubeWidth % laneCountX_32Bit == 0);
 				shadowCubeData = image_getSafePointer(shadowCubeMap);
 				shadowCubeData = image_getSafePointer(shadowCubeMap);
 				shadowCubeCenter = (float)shadowCubeWidth * 0.5f;
 				shadowCubeCenter = (float)shadowCubeWidth * 0.5f;
 			}
 			}
 			// Loop over the pixels to add light
 			// Loop over the pixels to add light
 			for (int y = bound.top(); y < bound.bottom(); y++) {
 			for (int y = bound.top(); y < bound.bottom(); y++) {
 				// Initiate the leftmost pixels before iterating to the right
 				// Initiate the leftmost pixels before iterating to the right
-				F32x4x3 lightBasePixelx4 = lightBaseRowX4;
+				F32xXx3 lightBasePixelxX = lightBaseRowX;
 				SafePointer<uint8_t> lightPixel = lightRow;
 				SafePointer<uint8_t> lightPixel = lightRow;
 				SafePointer<uint32_t> normalPixel = normalRow;
 				SafePointer<uint32_t> normalPixel = normalRow;
 				SafePointer<float> heightPixel = heightRow;
 				SafePointer<float> heightPixel = heightRow;
 				// Iterate over the row one SIMD vector of laneCountX_32Bit pixels at a time
 				// Iterate over the row one SIMD vector of laneCountX_32Bit pixels at a time
-				for (int x4 = bound.left(); x4 < bound.right(); x4+=4) {
+				for (int x = bound.left(); x < bound.right(); x += laneCountX_32Bit) {
 					// Read pixel height
 					// Read pixel height
-					F32x4 depthOffset = F32x4::readAligned(heightPixel, "addPointLight: reading height");
+					F32xX depthOffset = F32xX::readAligned(heightPixel, "addPointLight: reading height");
 					// Extrude the pixel using positive values towards the camera to represent another height
 					// Extrude the pixel using positive values towards the camera to represent another height
 					//   This will solve X and Z positions based on the height Y
 					//   This will solve X and Z positions based on the height Y
-					F32x4x3 lightOffset = lightBasePixelx4 + (inYourFaceAxis * depthOffset);
+					F32xXx3 lightOffset = lightBasePixelxX + (inYourFaceAxis * depthOffset);
 					// Get the linear distance, divide by sphere radius and limit to length 1 at intensity 0
 					// Get the linear distance, divide by sphere radius and limit to length 1 at intensity 0
-					F32x4 lightRatio = min(F32x4(1.0f), length(lightOffset) * reciprocalRadius);
+					F32xX lightRatio = min(F32xX(1.0f), length(lightOffset) * reciprocalRadius);
 					// Read surface normal
 					// Read surface normal
-					U32x4 normalColor = U32x4::readAligned(normalPixel, "addPointLight: reading normal");
+					U32xX normalColor = U32xX::readAligned(normalPixel, "addPointLight: reading normal");
 					// normalScale is used to negate the normals in advance so that opposing directions get positive values
 					// normalScale is used to negate the normals in advance so that opposing directions get positive values
-					F32x4x3 negativeSurfaceNormal = (unpackRgb_U32x4_to_F32x4x3(normalColor) - 128.0f) * (-1.0f / 128.0f);
+					F32xXx3 negativeSurfaceNormal = (unpackRgb_U32xX_to_F32xXx3(normalColor) - 128.0f) * (-1.0f / 128.0f);
 					// Fade from 0 to 1 using 1 - 2x + x²
 					// Fade from 0 to 1 using 1 - 2x + x²
-					F32x4 distanceIntensity = 1.0f - 2.0f * lightRatio + lightRatio * lightRatio;
-					F32x4 angleIntensity = max(F32x4(0.0f), dotProduct(normalize(lightOffset), negativeSurfaceNormal));
-					F32x4 intensity = angleIntensity * distanceIntensity;
+					F32xX distanceIntensity = 1.0f - 2.0f * lightRatio + lightRatio * lightRatio;
+					F32xX angleIntensity = max(F32xX(0.0f), dotProduct(normalize(lightOffset), negativeSurfaceNormal));
+					F32xX intensity = angleIntensity * distanceIntensity;
 					if (SHADOW_CASTING) {
 					if (SHADOW_CASTING) {
 						intensity = intensity * getShadowTransparency(shadowCubeData, shadowCubeWidth, shadowCubeCenter, lightOffset);
 						intensity = intensity * getShadowTransparency(shadowCubeData, shadowCubeWidth, shadowCubeCenter, lightOffset);
 					}
 					}
 					// TODO: Make an optimized version for white light replacing red, green and blue with a single LUMA
 					// TODO: Make an optimized version for white light replacing red, green and blue with a single LUMA
-					F32x4 red = intensity * colorR;
-					F32x4 green = intensity * colorG;
-					F32x4 blue = intensity * colorB;
+					F32xX red = intensity * colorR;
+					F32xX green = intensity * colorG;
+					F32xX blue = intensity * colorB;
 					red = red.clampUpper(255.1f);
 					red = red.clampUpper(255.1f);
 					green = green.clampUpper(255.1f);
 					green = green.clampUpper(255.1f);
 					blue = blue.clampUpper(255.1f);
 					blue = blue.clampUpper(255.1f);
 					// Add light to the image
 					// Add light to the image
-					U8x16 morelight = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));
+					U8xX morelight = reinterpret_U8FromU32(packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue)));
 					addLight(lightPixel, morelight);
 					addLight(lightPixel, morelight);
 					// Go to the next vector of laneCountX_32Bit pixels in light-space
 					// Go to the next vector of laneCountX_32Bit pixels in light-space
-					lightBasePixelx4 += dx4;
+					lightBasePixelxX += dxX;
 					// Go to the next vector of pixels in the image data
 					// Go to the next vector of pixels in the image data
-					lightPixel += 16;
-					normalPixel += 4;
-					heightPixel += 4;
+					lightPixel += laneCountX_8Bit;
+					normalPixel += laneCountX_32Bit;
+					heightPixel += laneCountX_32Bit;
 				}
 				}
 				// Go to the next row in light-space
 				// Go to the next row in light-space
-				lightBaseRowX4 += dy1;
+				lightBaseRowX += dy1;
 				// Go to the next row of image data
 				// Go to the next row of image data
 				lightRow.increaseBytes(lightStride);
 				lightRow.increaseBytes(lightStride);
 				normalRow.increaseBytes(normalStride);
 				normalRow.increaseBytes(normalStride);
@@ -276,25 +295,25 @@ void blendLight(AlignedImageRgbaU8& colorBuffer, const OrderedImageRgbaU8& diffu
 		int targetStride = image_getStride(colorBuffer);
 		int targetStride = image_getStride(colorBuffer);
 		int diffuseStride = image_getStride(diffuseBuffer);
 		int diffuseStride = image_getStride(diffuseBuffer);
 		int lightStride = image_getStride(lightBuffer);
 		int lightStride = image_getStride(lightBuffer);
-		F32x4 scale = F32x4(1.0 / 128.0f);
+		F32xX scale = F32xX(1.0 / 128.0f);
 		for (int y = startIndex; y < stopIndex; y++) {
 		for (int y = startIndex; y < stopIndex; y++) {
 			SafePointer<uint32_t> targetPixel = targetRow;
 			SafePointer<uint32_t> targetPixel = targetRow;
 			SafePointer<uint32_t> diffusePixel = diffuseRow;
 			SafePointer<uint32_t> diffusePixel = diffuseRow;
 			SafePointer<uint32_t> lightPixel = lightRow;
 			SafePointer<uint32_t> lightPixel = lightRow;
-			for (int x4 = 0; x4 < width; x4 += 4) {
-				U32x4 diffuse = U32x4::readAligned(diffusePixel, "blendLight: reading diffuse");
-				U32x4 light = U32x4::readAligned(lightPixel, "blendLight: reading light");
-				F32x4 red = (floatFromU32(getRed(diffuse)) * floatFromU32(getRed(light))) * scale;
-				F32x4 green = (floatFromU32(getGreen(diffuse)) * floatFromU32(getGreen(light))) * scale;
-				F32x4 blue = (floatFromU32(getBlue(diffuse)) * floatFromU32(getBlue(light))) * scale;
+			for (int x = 0; x < width; x += laneCountX_32Bit) {
+				U32xX diffuse = U32xX::readAligned(diffusePixel, "blendLight: reading diffuse");
+				U32xX light = U32xX::readAligned(lightPixel, "blendLight: reading light");
+				F32xX red = (floatFromU32(getRed(diffuse)) * floatFromU32(getRed(light))) * scale;
+				F32xX green = (floatFromU32(getGreen(diffuse)) * floatFromU32(getGreen(light))) * scale;
+				F32xX blue = (floatFromU32(getBlue(diffuse)) * floatFromU32(getBlue(light))) * scale;
 				red = red.clampUpper(255.1f);
 				red = red.clampUpper(255.1f);
 				green = green.clampUpper(255.1f);
 				green = green.clampUpper(255.1f);
 				blue = blue.clampUpper(255.1f);
 				blue = blue.clampUpper(255.1f);
-				U32x4 color = packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue), targetOrder);
+				U32xX color = packBytes(truncateToU32(red), truncateToU32(green), truncateToU32(blue), targetOrder);
 				color.writeAligned(targetPixel, "blendLight: writing color");
 				color.writeAligned(targetPixel, "blendLight: writing color");
-				targetPixel += 4;
-				diffusePixel += 4;
-				lightPixel += 4;
+				targetPixel += laneCountX_32Bit;
+				diffusePixel += laneCountX_32Bit;
+				lightPixel += laneCountX_32Bit;
 			}
 			}
 			targetRow.increaseBytes(targetStride);
 			targetRow.increaseBytes(targetStride);
 			diffuseRow.increaseBytes(diffuseStride);
 			diffuseRow.increaseBytes(diffuseStride);