Browse Source

Explicitly aligning SIMD vectors and supressing as many temps as possible to go around that g++ violates strict alignment requirements in Intel's ABI.

David Piuva 10 months ago
parent
commit
fb7fc94991

+ 62 - 28
Source/DFPSR/api/textureAPI.h

@@ -135,7 +135,7 @@ namespace dsr {
 	  bool HIGHEST_RESOLUTION = false,
 	  bool HIGHEST_RESOLUTION = false,
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
-	inline U texture_getPixelOffsetToLayer(const TextureRgbaU8 &texture, U mipLevel) {
+	inline U texture_getPixelOffsetToLayer(const TextureRgbaU8 &texture, const U &mipLevel) {
 		if (HIGHEST_RESOLUTION) {
 		if (HIGHEST_RESOLUTION) {
 			return U(texture.impl_startOffset);
 			return U(texture.impl_startOffset);
 		} else {
 		} else {
@@ -143,6 +143,9 @@ namespace dsr {
 		}
 		}
 	}
 	}
 
 
+	// TODO: Use SQUARE AND SINGLE_LAYER to generate faster specialized shaders.
+	// TODO: Generate an array of as many mip levels as the mip calculation generates ahead of time to accelerate texture sampling.
+
 	// mipLevel starts from 0 at the highest resolution and ends with the lowest resolution.
 	// mipLevel starts from 0 at the highest resolution and ends with the lowest resolution.
 	// Optimization arguments:
 	// Optimization arguments:
 	//   * SQUARE can be set to true when you know in compile time that texture has the same width and height.
 	//   * SQUARE can be set to true when you know in compile time that texture has the same width and height.
@@ -163,11 +166,11 @@ namespace dsr {
 	  bool HIGHEST_RESOLUTION = false, // Ignoring any lower layers.
 	  bool HIGHEST_RESOLUTION = false, // Ignoring any lower layers.
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
-	inline U texture_getPixelOffset(const TextureRgbaU8 &texture, U x, U y, U mipLevel) {
+	U texture_getPixelOffset(const TextureRgbaU8 &texture, const U &x, const U &y, const U &mipLevel) {
 		// Clamp the mip-level using bitwise operations in a logarithmic scale, by masking out excess bits with zeroes and filling missing bits with ones.
 		// Clamp the mip-level using bitwise operations in a logarithmic scale, by masking out excess bits with zeroes and filling missing bits with ones.
 		U tileMaskX = U(texture.impl_maxWidthAndMask );
 		U tileMaskX = U(texture.impl_maxWidthAndMask );
 		U tileMaskY = U(texture.impl_maxHeightAndMask);
 		U tileMaskY = U(texture.impl_maxHeightAndMask);
-		if (!HIGHEST_RESOLUTION) {
+		if (!SINGLE_LAYER && !HIGHEST_RESOLUTION) {
 			tileMaskX = tileMaskX >> mipLevel;
 			tileMaskX = tileMaskX >> mipLevel;
 			tileMaskY = tileMaskY >> mipLevel;
 			tileMaskY = tileMaskY >> mipLevel;
 		}
 		}
@@ -179,32 +182,40 @@ namespace dsr {
 			}
 			}
 		}
 		}
 		U log2PixelStride = U(texture.impl_log2width);
 		U log2PixelStride = U(texture.impl_log2width);
-		if (!HIGHEST_RESOLUTION) {
+		if (!SINGLE_LAYER && !HIGHEST_RESOLUTION) {
 			log2PixelStride = log2PixelStride - mipLevel;
 			log2PixelStride = log2PixelStride - mipLevel;
 		}
 		}
+		U tiledX = x;
+		U tiledY = y;
 		if (!XY_INSIDE) {
 		if (!XY_INSIDE) {
-			x = x & tileMaskX;
+			tiledX = tiledX & tileMaskX;
 			if (SQUARE) {
 			if (SQUARE) {
 				// Apply the same mask to both for square images, so that the other mask can be optimized away.
 				// Apply the same mask to both for square images, so that the other mask can be optimized away.
-				y = y & tileMaskX;
+				tiledY = tiledY & tileMaskX;
 			} else {
 			} else {
 				// Apply a separate mask for Y coordinates when the texture might not be square.
 				// Apply a separate mask for Y coordinates when the texture might not be square.
-				y = y & tileMaskY;
+				tiledY = tiledY & tileMaskY;
 			}
 			}
 		}
 		}
-		U coordinateOffset = ((y << log2PixelStride) | x);
+		U coordinateOffset = ((tiledY << log2PixelStride) | tiledX);
 		#ifndef NDEBUG
 		#ifndef NDEBUG
 			// In debug mode, wrong use of optimization arguments will throw errors.
 			// In debug mode, wrong use of optimization arguments will throw errors.
-			if (SQUARE && (texture.impl_log2width != texture.impl_log2height)) {
-				throwError(U"texture_getPixelOffset was told that the texture would have square dimensions using SQUARE, but ", texture_getMaxWidth(texture), U"x", texture_getMaxHeight(texture), U" is not square!\n");
+			if (SQUARE) {
+				if (texture.impl_log2width != texture.impl_log2height) {
+					throwError(U"texture_getPixelOffset was told that the texture would have square dimensions using SQUARE, but ", texture_getMaxWidth(texture), U"x", texture_getMaxHeight(texture), U" is not square!\n");
+				}
 			}
 			}
-			if (SINGLE_LAYER && (texture_getSmallestMipLevel(texture) > 0)) {
-				throwError(U"texture_getPixelOffset was told that the texture would only have a single layer using SINGLE_LAYER, but it has ", texture_getSmallestMipLevel(texture) + 1, U" layers!\n");
+			if (SINGLE_LAYER) {
+				if (texture_getSmallestMipLevel(texture) > 0) {
+					throwError(U"texture_getPixelOffset was told that the texture would only have a single layer using SINGLE_LAYER, but it has ", texture_getSmallestMipLevel(texture) + 1, U" layers!\n");
+				}
 			}
 			}
-			if (XY_INSIDE && !(allLanesEqual(x & ~tileMaskX, U(0)) && allLanesEqual(y & ~tileMaskY, U(0)))) {
-				throwError(U"texture_getPixelOffset was told that the pixel coordinates would stay inside using XY_INSIDE, but the coordinate (", x, U", ", y, U") is not within", texture_getMaxWidth(texture), U"x", texture_getMaxHeight(texture), U" pixels!\n");
+			if (XY_INSIDE) {
+				if (!(allLanesEqual(x & ~tileMaskX, U(0)) && allLanesEqual(y & ~tileMaskY, U(0)))) {
+					throwError(U"texture_getPixelOffset was told that the pixel coordinates would stay inside using XY_INSIDE, but the coordinate (", x, U", ", y, U") is not within", texture_getMaxWidth(texture), U"x", texture_getMaxHeight(texture), U" pixels!\n");
+				}
 			}
 			}
-			if (!HIGHEST_RESOLUTION) {
+			if (!SINGLE_LAYER && !HIGHEST_RESOLUTION) {
 				if (!allLanesLesserOrEqual(mipLevel, U(15u))) {
 				if (!allLanesLesserOrEqual(mipLevel, U(15u))) {
 					throwError(U"texture_getPixelOffset got mip level ", mipLevel, U", which is not within the fixed range of 0..15!\n");
 					throwError(U"texture_getPixelOffset got mip level ", mipLevel, U", which is not within the fixed range of 0..15!\n");
 				}
 				}
@@ -231,7 +242,7 @@ namespace dsr {
 	  bool HIGHEST_RESOLUTION = false,
 	  bool HIGHEST_RESOLUTION = false,
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
-	inline U texture_readPixel(const TextureRgbaU8 &texture, U x, U y, U mipLevel) {
+	inline U texture_readPixel(const TextureRgbaU8 &texture, const U &x, const U &y, const U &mipLevel) {
 		#ifndef NDEBUG
 		#ifndef NDEBUG
 			if (!texture_exists(texture)) {
 			if (!texture_exists(texture)) {
 				throwError(U"Tried to read pixels from a texture that does not exist!\n");
 				throwError(U"Tried to read pixels from a texture that does not exist!\n");
@@ -275,7 +286,7 @@ namespace dsr {
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  typename U, // uint32_t, U32x4, U32x8, U32xX
 	  typename F, // float, F32x4, F32x8, F32xX, F32xF
 	  typename F, // float, F32x4, F32x8, F32xX, F32xF
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U) && DSR_CHECK_PROPERTY(DsrTrait_Any_F32, F))>
 	  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U) && DSR_CHECK_PROPERTY(DsrTrait_Any_F32, F))>
-	inline U texture_sample_nearest(const TextureRgbaU8 &texture, F u, F v, U mipLevel) {
+	inline U texture_sample_nearest(const TextureRgbaU8 &texture, const F &u, const F &v, const U &mipLevel) {
 		U scaleU = U(1u) << U(texture.impl_log2width );
 		U scaleU = U(1u) << U(texture.impl_log2width );
 		U scaleV = U(1u) << U(texture.impl_log2height);
 		U scaleV = U(1u) << U(texture.impl_log2height);
 		if (!HIGHEST_RESOLUTION) {
 		if (!HIGHEST_RESOLUTION) {
@@ -287,6 +298,7 @@ namespace dsr {
 		return texture_readPixel<SQUARE, SINGLE_LAYER, false, MIP_INSIDE, HIGHEST_RESOLUTION, U>(texture, xPixel, yPixel, mipLevel);
 		return texture_readPixel<SQUARE, SINGLE_LAYER, false, MIP_INSIDE, HIGHEST_RESOLUTION, U>(texture, xPixel, yPixel, mipLevel);
 	}
 	}
 
 
+	// Internal helper function, not a part of the API!
 	// Returns (colorA * weightA + colorB * weightB) / 256 as bytes
 	// Returns (colorA * weightA + colorB * weightB) / 256 as bytes
 	// weightA and weightB should contain pairs of the same 16-bit weights for each of the 4 pixels in the corresponding A and B colors
 	// weightA and weightB should contain pairs of the same 16-bit weights for each of the 4 pixels in the corresponding A and B colors
 	template <typename U32, typename U16, DSR_ENABLE_IF(
 	template <typename U32, typename U16, DSR_ENABLE_IF(
@@ -305,8 +317,9 @@ namespace dsr {
 		return ((bitShiftRightImmediate<8>(lowColor) & lowMask) | (highColor & highMask));
 		return ((bitShiftRightImmediate<8>(lowColor) & lowMask) | (highColor & highMask));
 	}
 	}
 
 
+	// Internal helper function, not a part of the API!
 	// The more significant bits must be zero so that the lower bits can fill the space.
 	// The more significant bits must be zero so that the lower bits can fill the space.
-	//   lowBits[x] < 2^16
+	//   lowBits[x] < 2¹⁶
 	template <typename U32, DSR_ENABLE_IF(
 	template <typename U32, DSR_ENABLE_IF(
 	  DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U32)
 	  DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U32)
 	)>
 	)>
@@ -314,27 +327,50 @@ namespace dsr {
 		return reinterpret_U16FromU32(lowBits | bitShiftLeftImmediate<16>(lowBits));
 		return reinterpret_U16FromU32(lowBits | bitShiftLeftImmediate<16>(lowBits));
 	}
 	}
 
 
+	// Internal helper function, not a part of the API!
 	// Returns 256 - weight
 	// Returns 256 - weight
 	template <typename U16, DSR_ENABLE_IF(
 	template <typename U16, DSR_ENABLE_IF(
 	  DSR_CHECK_PROPERTY(DsrTrait_Any_U16, U16)
 	  DSR_CHECK_PROPERTY(DsrTrait_Any_U16, U16)
 	)>
 	)>
 	inline U16 invertWeight(const U16 &weight) {
 	inline U16 invertWeight(const U16 &weight) {
-		return U16(0x01000100u) - weight;
+		return U16(0x0100u) - weight;
 	}
 	}
 
 
-	/* TODO: Use for anisotropic or tri-linear sampling.
+	// TODO: Implement a scalar version for easy sampling of colors in reference implementations.
+	// A X-->    B
+	// Pre-condition:
+	//   0 <= weight <= 256
+	// Post-condition: Returns a bi-linear color mix of colors A and B.
+	// texture_interpolate_color_linear(a, b, 0) = a
+	// texture_interpolate_color_linear(a, b, 128) = floor((a + b) / 2)
+	// texture_interpolate_color_linear(a, b, 256) = b
 	template <typename U32, typename U16>
 	template <typename U32, typename U16>
-	inline U32 mix_L(const U32 &colorA, const U32 &colorB, const U32 &weight) {
+	inline U32 texture_interpolate_color_linear(const U32 &colorA, const U32 &colorB, const U32 &weight) {
 		// Get inverse weights
 		// Get inverse weights
 		U16 weightB = repeatAs16Bits(weight);
 		U16 weightB = repeatAs16Bits(weight);
 		U16 weightA = invertWeight(weightB);
 		U16 weightA = invertWeight(weightB);
 		// Multiply
 		// Multiply
-		return weightColors(colorA, weightA, colorB, weightB);
+		return weightColors<U32, U16>(colorA, weightA, colorB, weightB);
 	}
 	}
-	*/
 
 
+	// TODO: Implement a scalar version for easy sampling of colors in reference implementations.
+	// A X-->    B
+	// Y
+	// |
+	// V
+	//
+	// C         D
+	// Pre-condition:
+	//   0 <= weightX <= 256
+	//   0 <= weightY <= 256
+	// Post-condition: Returns a bi-linear color mix of colors A, B, C, D using weights X and Y.
+	// texture_interpolate_color_bilinear(a, b, c, d,   0,   0) = a
+	// texture_interpolate_color_bilinear(a, b, c, d, 256,   0) = b
+	// texture_interpolate_color_bilinear(a, b, c, d,   0, 256) = c
+	// texture_interpolate_color_bilinear(a, b, c, d, 256, 256) = d
+	// texture_interpolate_color_bilinear(a, b, c, d, 128, 128) = floor((a + b + c + d) / 4)
 	template <typename U32, typename U16>
 	template <typename U32, typename U16>
-	inline U32 mix_BL(const U32 &colorA, const U32 &colorB, const U32 &colorC, const U32 &colorD, const U32 &weightX, const U32 &weightY) {
+	inline U32 texture_interpolate_color_bilinear(const U32 &colorA, const U32 &colorB, const U32 &colorC, const U32 &colorD, const U32 &weightX, const U32 &weightY) {
 		// Get inverse weights
 		// Get inverse weights
 		U16 weightXR = repeatAs16Bits<U32>(weightX);
 		U16 weightXR = repeatAs16Bits<U32>(weightX);
 		U16 weightYB = repeatAs16Bits<U32>(weightY);
 		U16 weightYB = repeatAs16Bits<U32>(weightY);
@@ -357,7 +393,7 @@ namespace dsr {
 	    DSR_CHECK_PROPERTY(DsrTrait_Any_U16, U16) &&
 	    DSR_CHECK_PROPERTY(DsrTrait_Any_U16, U16) &&
 	    DSR_CHECK_PROPERTY(DsrTrait_Any_F32, F32)
 	    DSR_CHECK_PROPERTY(DsrTrait_Any_F32, F32)
 	  )>
 	  )>
-	inline U32 texture_sample_bilinear(const TextureRgbaU8 &texture, F32 u, F32 v, U32 mipLevel) {
+	inline U32 texture_sample_bilinear(const TextureRgbaU8 &texture, const F32 &u, const F32 &v, const U32 &mipLevel) {
 		U32 scaleU = U32(256u) << U32(texture.impl_log2width );
 		U32 scaleU = U32(256u) << U32(texture.impl_log2width );
 		U32 scaleV = U32(256u) << U32(texture.impl_log2height);
 		U32 scaleV = U32(256u) << U32(texture.impl_log2height);
 		if (!HIGHEST_RESOLUTION) {
 		if (!HIGHEST_RESOLUTION) {
@@ -406,8 +442,6 @@ namespace dsr {
 			pixelTop = pixelTop & tileMaskY;
 			pixelTop = pixelTop & tileMaskY;
 			pixelBottom = pixelBottom & tileMaskY;
 			pixelBottom = pixelBottom & tileMaskY;
 		}
 		}
-
-
 		#ifndef NDEBUG
 		#ifndef NDEBUG
 			// In debug mode, wrong use of optimization arguments will throw errors.
 			// In debug mode, wrong use of optimization arguments will throw errors.
 			if (SQUARE && (texture.impl_log2width != texture.impl_log2height)) {
 			if (SQUARE && (texture.impl_log2width != texture.impl_log2height)) {
@@ -445,7 +479,7 @@ namespace dsr {
 		U32 upperRightColor  = gather_U32(data, upperRightOffset );
 		U32 upperRightColor  = gather_U32(data, upperRightOffset );
 		U32 bottomLeftColor  = gather_U32(data, bottomLeftOffset );
 		U32 bottomLeftColor  = gather_U32(data, bottomLeftOffset );
 		U32 bottomRightColor = gather_U32(data, bottomRightOffset);
 		U32 bottomRightColor = gather_U32(data, bottomRightOffset);
-		return mix_BL<U32, U16>(upperLeftColor, upperRightColor, bottomLeftColor, bottomRightColor, weightX, weightY);
+		return texture_interpolate_color_bilinear<U32, U16>(upperLeftColor, upperRightColor, bottomLeftColor, bottomRightColor, weightX, weightY);
 	}
 	}
 
 
 	// resolutions is the maximum number of resolutions to create.
 	// resolutions is the maximum number of resolutions to create.

File diff suppressed because it is too large
+ 188 - 341
Source/DFPSR/base/simd.h


+ 7 - 0
Source/test/TestCaller.DsrProj

@@ -4,10 +4,17 @@ CompilerFlag "-std=c++14"
 # Use all locally available SIMD extensions.
 # Use all locally available SIMD extensions.
 CompilerFlag "-march=native"
 CompilerFlag "-march=native"
 
 
+# Enable debug information
+CompilerFlag "-g"
+
 Debug = 1
 Debug = 1
 Supressed = 1
 Supressed = 1
 Graphics = 0
 Graphics = 0
 Sound = 0
 Sound = 0
+# Turning off optimization will cause AVX2 to crash in GNU's g++!
+#   Because there is no way to create a U32x8 vector without getting an unaligned __m256i temp
+#   generated and moved by the broken compiler and they have refused to fix the bug since 2009.
+#Optimization = 0
 Import "../DFPSR/DFPSR.DsrHead"
 Import "../DFPSR/DFPSR.DsrHead"
 
 
 # Compile and run each source file ending with Test.cpp in tests as its own project.
 # Compile and run each source file ending with Test.cpp in tests as its own project.

+ 9 - 3
Source/test/testTools.h

@@ -124,8 +124,13 @@ void dsrMain(List<String> args) { \
 	stateName = string_combine(U"After evaluating condition ", #CONDITION, U"\n");
 	stateName = string_combine(U"After evaluating condition ", #CONDITION, U"\n");
 
 
 #define ASSERT_COMP(A, B, OP, OP_NAME) \
 #define ASSERT_COMP(A, B, OP, OP_NAME) \
-	stateName = string_combine(U"While evaluating comparison ", #A, " ", OP_NAME, U" ", #B, U"\n"); \
-	if (OP(A, B)) { \
+{ \
+	stateName = string_combine(U"While evaluating ", #A, U"\n"); \
+	auto lhs = A; \
+	stateName = string_combine(U"While evaluating ", #B, U"\n"); \
+	auto rhs = B; \
+	stateName = string_combine(U"While comparing ", #A, " ", OP_NAME, U" ", #B, U"\n"); \
+	if (OP(lhs, rhs)) { \
 		printText(U"*"); \
 		printText(U"*"); \
 	} else { \
 	} else { \
 	stateName = string_combine(U"While reporting failure for comparison ", #A, " ", OP_NAME, U" ", #B, U"\n"); \
 	stateName = string_combine(U"While reporting failure for comparison ", #A, " ", OP_NAME, U" ", #B, U"\n"); \
@@ -138,7 +143,8 @@ void dsrMain(List<String> args) { \
 			U"____________________________________________________________________\n" \
 			U"____________________________________________________________________\n" \
 		); \
 		); \
 	} \
 	} \
-	stateName = string_combine(U"After evaluating comparison ", #A, " ", OP_NAME, U" ", #B, U"\n");
+	stateName = string_combine(U"After evaluating comparison ", #A, " ", OP_NAME, U" ", #B, U"\n"); \
+}
 #define ASSERT_EQUAL(A, B) ASSERT_COMP(A, B, OP_EQUALS, "==")
 #define ASSERT_EQUAL(A, B) ASSERT_COMP(A, B, OP_EQUALS, "==")
 #define ASSERT_NOT_EQUAL(A, B) ASSERT_COMP(A, B, OP_NOT_EQUALS, "!=")
 #define ASSERT_NOT_EQUAL(A, B) ASSERT_COMP(A, B, OP_NOT_EQUALS, "!=")
 #define ASSERT_LESSER(A, B) ASSERT_COMP(A, B, OP_LESSER, "<")
 #define ASSERT_LESSER(A, B) ASSERT_COMP(A, B, OP_LESSER, "<")

+ 4 - 4
Source/test/tests/DataLoopTest.cpp

@@ -17,15 +17,15 @@
 				errors++; \
 				errors++; \
 			} \
 			} \
 		} \
 		} \
-		ASSERT(errors == 0); \
+		ASSERT_EQUAL(errors, 0); \
 	}
 	}
 
 
 START_TEST(DataLoop)
 START_TEST(DataLoop)
 	// Allocate aligned memory
 	// Allocate aligned memory
 	const int elements = 256;
 	const int elements = 256;
-	int32_t allocationA[elements] ALIGN16;
-	int32_t allocationB[elements] ALIGN16;
-	int32_t allocationC[elements] ALIGN16;
+	ALIGN16 int32_t allocationA[elements];
+	ALIGN16 int32_t allocationB[elements];
+	ALIGN16 int32_t allocationC[elements];
 	// The SafePointer class will emulate the behaviour of a raw data pointer while providing full bound checks in debug mode.
 	// The SafePointer class will emulate the behaviour of a raw data pointer while providing full bound checks in debug mode.
 	SafePointer<int32_t> bufferA("bufferA", allocationA, sizeof(allocationA));
 	SafePointer<int32_t> bufferA("bufferA", allocationA, sizeof(allocationA));
 	SafePointer<int32_t> bufferB("bufferB", allocationB, sizeof(allocationB));
 	SafePointer<int32_t> bufferB("bufferB", allocationB, sizeof(allocationB));

File diff suppressed because it is too large
+ 604 - 608
Source/test/tests/SimdTest.cpp


+ 159 - 14
Source/test/tests/TextureTest.cpp

@@ -1,21 +1,165 @@
 
 
 #include "../testTools.h"
 #include "../testTools.h"
 #include "../../DFPSR/base/simd.h"
 #include "../../DFPSR/base/simd.h"
+#include "../../DFPSR/implementation/image/PackOrder.h"
+
+#define ASSERT_EQUAL_SIMD(A, B) ASSERT_COMP(A, B, allLanesEqual, "==")
+#define ASSERT_NOTEQUAL_SIMD(A, B) ASSERT_COMP(A, B, !allLanesEqual, "!=")
+
+inline U32x8 shr_test(const U32x8& left, const U32x8 &bitOffsets) {
+stateName = U"shr_test: A.\n";
+	assert((uintptr_t(&left) & 31u) == 0);
+	#ifdef SAFE_POINTER_CHECKS
+stateName = U"shr_test: B.\n";
+		if(!allLanesLesser(bitOffsets, U32x8(32u))) {
+			throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
+		}
+	#endif
+	#if defined(USE_AVX2)
+stateName = U"shr_test: C 1.\n";
+		ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesA[8];
+		ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesB[8];
+stateName = U"shr_test: C 2.\n";
+		left.writeAlignedUnsafe(&(lanesA[0]));
+stateName = U"shr_test: C 3.\n";
+		bitOffsets.writeAlignedUnsafe(&(lanesB[0]));
+stateName = U"shr_test: C 4 calculate.\n";
+		uint32_t a1 = uint32_t(lanesA[0] >> lanesB[0]);
+		uint32_t a2 = uint32_t(lanesA[1] >> lanesB[1]);
+		uint32_t a3 = uint32_t(lanesA[2] >> lanesB[2]);
+		uint32_t a4 = uint32_t(lanesA[3] >> lanesB[3]);
+		uint32_t a5 = uint32_t(lanesA[4] >> lanesB[4]);
+		uint32_t a6 = uint32_t(lanesA[5] >> lanesB[5]);
+		uint32_t a7 = uint32_t(lanesA[6] >> lanesB[6]);
+		uint32_t a8 = uint32_t(lanesA[7] >> lanesB[7]);
+stateName = U"shr_test: C 5 _mm256_set_epi32.\n";
+		ALIGN32 __m256i result = _mm256_set_epi32(a8, a7, a6, a5, a4, a3, a2, a1);
+stateName = U"shr_test: C 6 return.\n";
+		return U32x8(result);
+	#else
+stateName = U"shr_test: D.\n";
+		IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
+	#endif
+}
+
+template<
+  bool SQUARE = false,             // Width and height must be the same.
+  bool SINGLE_LAYER = false,       // Demanding that the texture only has a single layer.
+  bool XY_INSIDE = false,          // No pixels may be sampled outside.
+  bool MIP_INSIDE = false,         // Mip level may not go outside of existing layer indices.
+  bool HIGHEST_RESOLUTION = false, // Ignoring any lower layers.
+  typename U, // uint32_t, U32x4, U32x8, U32xX
+  DSR_ENABLE_IF(DSR_CHECK_PROPERTY(DsrTrait_Any_U32, U))>
+U texture_getPixelOffset_test(const TextureRgbaU8 &texture, const U &x, const U &y, const U &mipLevel) {
+stateName = U"Getting eight pixel offsets A.\n";
+	// Clamp the mip-level using bitwise operations in a logarithmic scale, by masking out excess bits with zeroes and filling missing bits with ones.
+	U tileMaskX = U(texture.impl_maxWidthAndMask );
+	U tileMaskY = U(texture.impl_maxHeightAndMask);
+stateName = U"Getting eight pixel offsets B.\n"; // Crashes here on the server!
+	if (!SINGLE_LAYER && !HIGHEST_RESOLUTION) {
+		//tileMaskX = tileMaskX >> mipLevel;
+		//tileMaskY = tileMaskY >> mipLevel;
+		tileMaskX = shr_test(tileMaskX, mipLevel);
+		tileMaskY = shr_test(tileMaskY, mipLevel);
+	}
+stateName = U"Getting eight pixel offsets C.\n";
+	if (!MIP_INSIDE) {
+		// If the mip level index might be higher than what is used in the texture, make sure that the tile masks have at least enough bits for the lowest texture resolution.
+		tileMaskX = tileMaskX | texture.impl_minWidthOrMask;
+		if (!SQUARE) {
+			tileMaskY = tileMaskY | texture.impl_minHeightOrMask;
+		}
+	}
+stateName = U"Getting eight pixel offsets D.\n";
+	U log2PixelStride = U(texture.impl_log2width);
+	if (!SINGLE_LAYER && !HIGHEST_RESOLUTION) {
+stateName = U"Getting eight pixel offsets E.\n";
+		log2PixelStride = log2PixelStride - mipLevel;
+	}
+stateName = U"Getting untiled coordinates.\n";
+	U tiledX = x;
+	U tiledY = y;
+stateName = U"Getting eight pixel offsets F.\n";
+	if (!XY_INSIDE) {
+		tiledX = tiledX & tileMaskX;
+stateName = U"Getting eight pixel offsets G.\n";
+		if (SQUARE) {
+			// Apply the same mask to both for square images, so that the other mask can be optimized away.
+			tiledY = tiledY & tileMaskX;
+		} else {
+			// Apply a separate mask for Y coordinates when the texture might not be square.
+			tiledY = tiledY & tileMaskY;
+		}
+	}
+stateName = U"Getting eight pixel offsets H.\n";
+	U coordinateOffset = ((tiledY << log2PixelStride) | tiledX);
+stateName = U"Getting eight pixel offsets I.\n";
+	#ifndef NDEBUG
+		// In debug mode, wrong use of optimization arguments will throw errors.
+		if (SQUARE) {
+			if (texture.impl_log2width != texture.impl_log2height) {
+				throwError(U"texture_getPixelOffset was told that the texture would have square dimensions using SQUARE, but ", texture_getMaxWidth(texture), U"x", texture_getMaxHeight(texture), U" is not square!\n");
+			}
+		}
+		if (SINGLE_LAYER) {
+			if (texture_getSmallestMipLevel(texture) > 0) {
+				throwError(U"texture_getPixelOffset was told that the texture would only have a single layer using SINGLE_LAYER, but it has ", texture_getSmallestMipLevel(texture) + 1, U" layers!\n");
+			}
+		}
+		if (XY_INSIDE) {
+			if (!(allLanesEqual(x & ~tileMaskX, U(0)) && allLanesEqual(y & ~tileMaskY, U(0)))) {
+				throwError(U"texture_getPixelOffset was told that the pixel coordinates would stay inside using XY_INSIDE, but the coordinate (", x, U", ", y, U") is not within", texture_getMaxWidth(texture), U"x", texture_getMaxHeight(texture), U" pixels!\n");
+			}
+		}
+		if (!SINGLE_LAYER && !HIGHEST_RESOLUTION) {
+			if (!allLanesLesserOrEqual(mipLevel, U(15u))) {
+				throwError(U"texture_getPixelOffset got mip level ", mipLevel, U", which is not within the fixed range of 0..15!\n");
+			}
+			if (MIP_INSIDE) {
+				if (!allLanesLesserOrEqual(mipLevel, U(texture_getSmallestMipLevel(texture)))) {
+					throwError(U"texture_getPixelOffset was told that the mip level would stay within valid indices using MIP_INSIDE, but mip level ", mipLevel, U" is not within 0..", texture_getSmallestMipLevel(texture), U"!\n");
+				}
+			}
+		}
+	#endif
+stateName = U"Getting eight pixel offsets J.\n";
+	if (SINGLE_LAYER) {
+		return coordinateOffset;
+	} else {
+		U startOffset = texture_getPixelOffsetToLayer<HIGHEST_RESOLUTION, U>(texture, mipLevel);
+stateName = U"Getting eight pixel offsets K.\n";
+		return startOffset + coordinateOffset;
+	}
+}
 
 
 START_TEST(Texture)
 START_TEST(Texture)
+	{
+		// Linear blending of colors using unsigned integers.
+		U32x4 mixedColor = texture_interpolate_color_linear<U32x4, U16x8>(
+		    packOrder_packBytes(U32x4(255, 175, 253,  95), U32x4(255,  84, 255, 210), U32x4(  0, 253, 172, 100), U32x4(  0, 150, 241,  61)),
+		    packOrder_packBytes(U32x4(  0, 215,  62, 127), U32x4(255, 162, 152,  93), U32x4(255,  71,  62, 200), U32x4(  0, 139, 180, 124)),
+		    U32x4(  0, 128, 256, 256)
+		  );
+		U32x4 expectedColor = packOrder_packBytes(U32x4(255, 195,  62, 127), U32x4(255, 123, 152,  93), U32x4(  0, 162,  62, 200), U32x4(  0, 144, 180, 124));
+		ASSERT_EQUAL_SIMD(mixedColor, expectedColor);
+	}
 	{
 	{
 		// 1x1, 2x2, 4x4, 8x8, 16x16
 		// 1x1, 2x2, 4x4, 8x8, 16x16
 		TextureRgbaU8 texture = TextureRgbaU8(4, 4);
 		TextureRgbaU8 texture = TextureRgbaU8(4, 4);
 		{
 		{
-			stateName = U"Getting eight pixel offsets to layer.\n";
-			U32x8 layerOffsets = texture_getPixelOffsetToLayer(texture, U32x8(0u));
-			stateName = U"Comparing eight pixel offsets to layer.\n";
-			ASSERT(allLanesEqual(layerOffsets, U32x8(85u)));
+			stateName = U"Getting eight pixel offsets TEST.\n";
+			U32x8 pixelOffsets = texture_getPixelOffset_test(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(0u));
+			stateName = U"Comparing eight pixel offsets.\n";
+			ASSERT_EQUAL_SIMD(pixelOffsets, U32x8(85u, 86u, 87u, 88u, 101u, 102u, 103u, 104u));
+		}
+		/*
+		{
 			stateName = U"Getting eight pixel offsets.\n";
 			stateName = U"Getting eight pixel offsets.\n";
 			U32x8 pixelOffsets = texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(0u));
 			U32x8 pixelOffsets = texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(0u));
 			stateName = U"Comparing eight pixel offsets.\n";
 			stateName = U"Comparing eight pixel offsets.\n";
-			ASSERT(allLanesEqual((pixelOffsets), U32x8(85u, 86u, 87u, 88u, 101u, 102u, 103u, 104u)));
+			ASSERT_EQUAL_SIMD(pixelOffsets, U32x8(85u, 86u, 87u, 88u, 101u, 102u, 103u, 104u));
 		}
 		}
+		*/
 		ASSERT(texture_hasPyramid(texture));
 		ASSERT(texture_hasPyramid(texture));
 		ASSERT_EQUAL(texture_getMaxWidth(texture), 16);
 		ASSERT_EQUAL(texture_getMaxWidth(texture), 16);
 		ASSERT_EQUAL(texture_getMaxHeight(texture), 16);
 		ASSERT_EQUAL(texture_getMaxHeight(texture), 16);
@@ -151,7 +295,8 @@ START_TEST(Texture)
 		// The four first template arguments to texture_getPixelOffset are SQUARE, SINGLE_LAYER, XY_INSIDE and MIP_INSIDE, which can be used to simplify the calculations with any information known in compile time.
 		// The four first template arguments to texture_getPixelOffset are SQUARE, SINGLE_LAYER, XY_INSIDE and MIP_INSIDE, which can be used to simplify the calculations with any information known in compile time.
 
 
 		// Optimized by saying that the image is a square, with multiple levels, and mip level within used bounds.
 		// Optimized by saying that the image is a square, with multiple levels, and mip level within used bounds.
-		uint32_t result = texture_getPixelOffset<true, false, true, true>(texture, 0u, 0u, 0u); ASSERT_EQUAL(result, 85u);
+		uint32_t result = texture_getPixelOffset<true, false, true, true>(texture, 0u, 0u, 0u);
+		ASSERT_EQUAL(result, 85u);
 		#ifndef NDEBUG
 		#ifndef NDEBUG
 			// Should crash with an error when making a false claim that the texture only has a single layer.
 			// Should crash with an error when making a false claim that the texture only has a single layer.
 			BEGIN_CRASH(U"texture_getPixelOffset was told that the texture would only have a single layer");
 			BEGIN_CRASH(U"texture_getPixelOffset was told that the texture would only have a single layer");
@@ -159,14 +304,14 @@ START_TEST(Texture)
 			END_CRASH
 			END_CRASH
 		#endif
 		#endif
 
 
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x4(0u, 0u, 0u, 0u), U32x4(0u, 0u, 0u, 0u), U32x4(0u, 1u, 2u, 3u)), U32x4(85u, 21u, 5u, 1u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x4(0u, 1u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(3u, 3u, 3u, 3u)), U32x4(1u, 2u, 3u, 4u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x4(2u, 3u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(0u)), U32x4(87u, 88u, 101u, 102u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x4(2u, 3u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(1u)), U32x4(23u, 24u, 29u, 30u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x4(2u, 3u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(2u)), U32x4(7u, 8u, 9u, 10u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(0u)), U32x8(85u, 86u, 87u, 88u, 101u, 102u, 103u, 104u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(1u)), U32x8(21u, 22u, 23u, 24u, 29u, 30u, 31u, 32u)));
-		ASSERT(allLanesEqual(texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(2u)), U32x8(5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u)));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x4(0u, 0u, 0u, 0u), U32x4(0u, 0u, 0u, 0u), U32x4(0u, 1u, 2u, 3u)), U32x4(85u, 21u, 5u, 1u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x4(0u, 1u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(3u, 3u, 3u, 3u)), U32x4(1u, 2u, 3u, 4u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x4(2u, 3u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(0u)), U32x4(87u, 88u, 101u, 102u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x4(2u, 3u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(1u)), U32x4(23u, 24u, 29u, 30u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x4(2u, 3u, 0u, 1u), U32x4(0u, 0u, 1u, 1u), U32x4(2u)), U32x4(7u, 8u, 9u, 10u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(0u)), U32x8(85u, 86u, 87u, 88u, 101u, 102u, 103u, 104u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(1u)), U32x8(21u, 22u, 23u, 24u, 29u, 30u, 31u, 32u));
+		ASSERT_EQUAL_SIMD(texture_getPixelOffset(texture, U32x8(0u, 1u, 2u, 3u, 0u, 1u, 2u, 3u), U32x8(0u, 0u, 0u, 0u, 1u, 1u, 1u, 1u), U32x8(2u)), U32x8(5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u));
 	}
 	}
 	{
 	{
 		// 1x2, 2x4, 4x8
 		// 1x2, 2x4, 4x8

Some files were not shown because too many files changed in this diff