Browse Source

Minimum texture resolution is now 32x32 pixels for future 1024-bit SIMD support, but textures are automatically rescaled before generating texture pyramids.
Reallocating the existing image buffer for storing the pyramid, so that texture sampling only needs one pointer and offsets.
Started to vectorize image downscaling, but needed more SIMD functions.
Using the portable gather function in texture sampling.

David Piuva 2 years ago
parent
commit
4b28d8f085

+ 5 - 0
Source/DFPSR/api/imageAPI.cpp

@@ -185,6 +185,11 @@ PackOrderIndex dsr::image_getPackOrderIndex(const ImageRgbaU8& image) {
 }
 
 // Texture
+void dsr::image_makeIntoTexture(ImageRgbaU8& image) {
+	if (image) {
+		image->makeIntoTexture();
+	}
+}
 void dsr::image_generatePyramid(ImageRgbaU8& image) {
 	if (image) {
 		image->generatePyramid();

+ 16 - 8
Source/DFPSR/api/imageAPI.h

@@ -80,13 +80,21 @@ namespace dsr {
 	PackOrderIndex image_getPackOrderIndex(const ImageRgbaU8& image);
 
 // Texture
-	// Pre-condition: image must exist and qualify as a texture according to image_isTexture
-	// Side-effect: Creates a mip-map pyramid of lower resolution images from the current content
-	// If successful, image_hasPyramid should return true from the image
+	// Pre-condition: image must exist for something to happen
+	// Side-effect: If image is not a valid texture, it will be resized into a suitable power-of-two dimension.
+	// Applied automatically when calling image_generatePyramid for the first time on the image.
+	// Warning! May invalidate all SafePointers and raw pointers to the image's data.
+	void image_makeIntoTexture(ImageRgbaU8& image);
+	// Pre-condition: image must exist for something to happen
+	// Side-effects:
+	//  If the image does not have valid texture dimensions, it will be resized using image_makeIntoTexture before generating the pyramid.
+	//  Reallocates the image's buffer and uses the new memory to write smaller versions of the image.
+	// Afterwards, image_hasPyramid should return true for the image
+	// Warning! May invalidate all SafePointers and raw pointers to the image's data.
 	void image_generatePyramid(ImageRgbaU8& image);
 	// Pre-condition: image must exist
 	// Side-effect: Removes image's mip-map pyramid, including its buffer to save memory
-	// If successful, image_hasPyramid should return false from the image
+	// Afterwards, image_hasPyramid should return false for the image
 	void image_removePyramid(ImageRgbaU8& image);
 	// Post-condition: Returns true iff image contains a mip-map pyramid generated by image_generatePyramid
 	// Returns false without a warning if the image handle is empty
@@ -95,10 +103,10 @@ namespace dsr {
 	//   Returns true iff image fulfills the criterias for being a texture
 	//   Returns false without a warning if the image handle is empty
 	// Texture criterias:
-	//  * Each dimension of width and height should be a power-of-two from 4 to 16384
-	//    width = 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192 or 16384
-	//    height = 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192 or 16384
-	//    Large enough to allow padding-free SIMD vectorization of 128-bit vectors (4 x 32 = 128)
+	//  * Each dimension of width and height should be a power-of-two from 32 to 16384
+	//    width = 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192 or 16384
+	//    height = 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192 or 16384
+	//    Large enough to be padding-free with 1024-bit memory alignment.
 	//    Small enough to allow expressing the total size in bytes using a signed 32-bit integer
 	//  * If it's a sub-image, it must also consume the whole with of the original image so that width times pixel size equals the stride
 	//    Textures may not contain padding in the rows, but it's okay to use sub-images from a vertical atlas where the whole width is consumed

+ 2 - 2
Source/DFPSR/image/Image.h

@@ -38,9 +38,9 @@ namespace dsr {
 // See imageInternal.h for protected methods
 class ImageImpl {
 public:
-	const int32_t width, height, stride, pixelSize;
+	int32_t width, height, stride, pixelSize;
 	Buffer buffer; // Content
-	const intptr_t startOffset; // Byte offset of the first pixel
+	intptr_t startOffset; // Byte offset of the first pixel
 	bool isSubImage = false;
 private:
 	void validate() {

+ 159 - 85
Source/DFPSR/image/ImageRgbaU8.cpp

@@ -1,6 +1,6 @@
 // zlib open source license
 //
-// Copyright (c) 2017 to 2019 David Forsgren Piuva
+// Copyright (c) 2017 to 2023 David Forsgren Piuva
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
@@ -24,12 +24,15 @@
 #include "ImageRgbaU8.h"
 #include "internal/imageInternal.h"
 #include "internal/imageTemplate.h"
+#include "draw.h"
 #include <algorithm>
 #include "../base/simd.h"
 
 using namespace dsr;
 
-IMAGE_DEFINITION(ImageRgbaU8Impl, 4, Color4xU8, uint8_t);
+static const int pixelSize = 4;
+
+IMAGE_DEFINITION(ImageRgbaU8Impl, pixelSize, Color4xU8, uint8_t);
 
 ImageRgbaU8Impl::ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, int32_t newStride, Buffer buffer, intptr_t startOffset, const PackOrder &packOrder) :
   ImageImpl(newWidth, newHeight, newStride, sizeof(Color4xU8), buffer, startOffset), packOrder(packOrder) {
@@ -58,7 +61,7 @@ bool ImageRgbaU8Impl::isTexture(const ImageRgbaU8Impl* image) {
 }
 
 ImageRgbaU8Impl ImageRgbaU8Impl::getWithoutPadding() const {
-	if (this->stride == this->width * this->pixelSize) {
+	if (this->stride == this->width * pixelSize) {
 		// No padding
 		return *this;
 	} else {
@@ -104,20 +107,22 @@ ImageU8Impl ImageRgbaU8Impl::getChannel(int32_t channelIndex) const {
 	return result;
 }
 
+static const int32_t smallestSizeGroup = 5;
+static const int32_t largestSizeGroup = 14;
 static int32_t getSizeGroup(int32_t size) {
 	int32_t group = -1;
 	if (size == 1) {
 		group = 0; // Too small for 16-byte alignment!
 	} else if (size == 2) {
-		group = 1; // Too small for 16-byte alignment!
+		group = 1; // Too small for 16-byte alignment! (SSE2)
 	} else if (size == 4) {
-		group = 2; // Smallest allowed texture dimension
+		group = 2; // Too small for 32-byte alignment! (AVX2)
 	} else if (size == 8) {
-		group = 3;
+		group = 3; // Too small for 64-byte alignment! (AVX3)
 	} else if (size == 16) {
-		group = 4;
+		group = 4; // Too small for 128-byte alignment!
 	} else if (size == 32) {
-		group = 5;
+		group = 5; // Smallest allowed texture dimension, allowing 1024-bit SIMD.
 	} else if (size == 64) {
 		group = 6;
 	} else if (size == 128) {
@@ -140,7 +145,22 @@ static int32_t getSizeGroup(int32_t size) {
 	return group;
 }
 
-static int32_t getPyramidSize(int32_t width, int32_t height, int32_t pixelSize, int32_t levels) {
+inline int32_t sizeFromGroup(int32_t group) {
+	return 1 << group;
+}
+
+// Round the size down, unless it is already too small.
+static int32_t roundSize(int32_t size) {
+	for (int groupIndex = smallestSizeGroup; groupIndex < largestSizeGroup; groupIndex++) {
+		int currentSize = sizeFromGroup(groupIndex);
+		if (size < currentSize) {
+			return currentSize;
+		}
+	}
+	return sizeFromGroup(largestSizeGroup);
+}
+
+static int32_t getPyramidSize(int32_t width, int32_t height, int32_t levels) {
 	uint32_t result = 0;
 	uint32_t byteCount = width * height * pixelSize;
 	for (int32_t l = 0; l < levels; l++) {
@@ -150,36 +170,72 @@ static int32_t getPyramidSize(int32_t width, int32_t height, int32_t pixelSize,
 	return (int32_t)result;
 }
 
-static void downScaleByTwo(SafePointer<uint8_t> targetData, const SafePointer<uint8_t> sourceData, int32_t targetWidth, int32_t targetHeight, int32_t pixelSize, int32_t targetStride) {
+inline U32xX averageColor(const U32xX &colorA, const U32xX &colorB) {
+	// TODO: Expand to 16 bits or use built in average intrinsics for full bit depth.
+	// 7-bit precision for speed.
+	return reinterpret_U32FromU8(reinterpret_U8FromU32((colorA >> 1) & U32xX(0b01111111011111110111111101111111)) + reinterpret_U8FromU32((colorB >> 1) & U32xX(0b01111111011111110111111101111111)));
+}
+
+inline U32xX pairwiseAverageColor(const U32xX &colorA, const U32xX &colorB) {
+	// TODO: Vectorize with 32-bit unzipping of pixels and 8-bit average of channels.
+	// Reference implementation
+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) uint8_t elementsA[laneCountX_8Bit];
+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) uint8_t elementsB[laneCountX_8Bit];
+	ALIGN_BYTES(DSR_DEFAULT_ALIGNMENT) uint8_t elementsR[laneCountX_8Bit];
+	colorA.writeAlignedUnsafe((uint32_t*)elementsA);
+	colorB.writeAlignedUnsafe((uint32_t*)elementsB);
+	int32_t halfPixels = laneCountX_32Bit / 2;
+	for (int p = 0; p < halfPixels; p++) {
+		for (int c = 0; c < 4; c++) {
+			elementsR[p * 4 + c] = uint8_t((uint16_t(elementsA[p * 8 + c]) + uint16_t(elementsA[p * 8 + 4 + c])) >> 1);
+			elementsR[(p + halfPixels) * 4 + c] = uint8_t((uint16_t(elementsB[p * 8 + c]) + uint16_t(elementsB[p * 8 + 4 + c])) >> 1);
+		}
+	}
+	return U32xX::readAlignedUnsafe((uint32_t*)elementsR);
+}
+
+// Downscale the source image to half width and half height by averaging each 2x2 pixel group into one target pixel.
+// Strides are given in bytes; rows are assumed to be tightly packed and aligned for whole-vector reads.
+static void downScaleByTwo(SafePointer<uint32_t> targetData, const SafePointer<uint32_t> sourceData, int32_t targetWidth, int32_t targetHeight, int32_t targetStride) {
 	int32_t sourceStride = targetStride * 2;
 	int32_t doubleSourceStride = sourceStride * 2;
-	SafePointer<uint8_t> targetRow = targetData;
-	const SafePointer<uint8_t> sourceRow = sourceData;
+	SafePointer<uint32_t> targetRow = targetData;
+	const SafePointer<uint32_t> sourceRow = sourceData;
 	for (int32_t y = 0; y < targetHeight; y++) {
-		const SafePointer<uint8_t> sourcePixel = sourceRow;
-		SafePointer<uint8_t> targetPixel = targetRow;
-		for (int32_t x = 0; x < targetWidth; x++) {
-			// TODO: Use pariwise and vector average functions for fixed channel counts (SSE has _mm_avg_epu8 for vector average)
-			for (int32_t c = 0; c < pixelSize; c++) {
-				uint8_t value = (uint8_t)((
-				    (uint16_t)(*sourcePixel)
-				  + (uint16_t)(*(sourcePixel + pixelSize))
-				  + (uint16_t)(*(sourcePixel + sourceStride))
-				  + (uint16_t)(*(sourcePixel + sourceStride + pixelSize))) / 4);
-				*targetPixel = value;
-				targetPixel += 1;
-				sourcePixel += 1;
-			}
-			sourcePixel += pixelSize;
+		const SafePointer<uint32_t> upperSourcePixel = sourceRow;
+		const SafePointer<uint32_t> lowerSourcePixel = sourceRow;
+		lowerSourcePixel.increaseBytes(sourceStride);
+		SafePointer<uint32_t> targetPixel = targetRow;
+		for (int32_t x = 0; x < targetWidth; x += laneCountX_32Bit) {
+			// Read two vectors from the upper source row and two from the lower source row.
+			U32xX upperLeft = U32xX::readAligned(upperSourcePixel, "upperLeftSource in downScaleByTwo");
+			U32xX upperRight = U32xX::readAligned(upperSourcePixel + laneCountX_32Bit, "upperRightSource in downScaleByTwo");
+			U32xX lowerLeft = U32xX::readAligned(lowerSourcePixel, "lowerLeftSource in downScaleByTwo");
+			U32xX lowerRight = U32xX::readAligned(lowerSourcePixel + laneCountX_32Bit, "lowerRightSource in downScaleByTwo");
+			// Average horizontally within each row, then vertically between the two row averages.
+			U32xX upperAverage = pairwiseAverageColor(upperLeft, upperRight);
+			U32xX lowerAverage = pairwiseAverageColor(lowerLeft, lowerRight);
+			U32xX finalAverage = averageColor(upperAverage, lowerAverage);
+			finalAverage.writeAligned(targetPixel, "average result in downScaleByTwo");
+			targetPixel += laneCountX_32Bit;
+			upperSourcePixel += laneCountX_32Bit * 2;
+			lowerSourcePixel += laneCountX_32Bit * 2;
 		}
-		targetRow += targetStride;
-		sourceRow += doubleSourceStride;
+		targetRow.increaseBytes(targetStride);
+		sourceRow.increaseBytes(doubleSourceStride);
 	}
 }
 
+static void updatePyramid(TextureRgba &texture, int32_t layerCount) {
+	// Downscale each following layer from the previous.
+	for (int32_t targetIndex = 1; targetIndex < layerCount; targetIndex++) {
+		int32_t sourceIndex = targetIndex - 1;
+		int32_t targetWidth = texture.mips[targetIndex].width;
+		int32_t targetHeight = texture.mips[targetIndex].height;
+		downScaleByTwo(texture.mips[targetIndex].data, texture.mips[sourceIndex].data, targetWidth, targetHeight, targetWidth * pixelSize);
+	}
+	texture.layerCount = layerCount;
+}
+
 TextureRgbaLayer::TextureRgbaLayer() {}
 
-TextureRgbaLayer::TextureRgbaLayer(const uint8_t *data, int32_t width, int32_t height) :
+TextureRgbaLayer::TextureRgbaLayer(SafePointer<uint32_t> data, int32_t width, int32_t height) :
   data(data),
   strideShift(getSizeGroup(width) + 2),
   widthMask(width - 1),
@@ -191,74 +247,92 @@ TextureRgbaLayer::TextureRgbaLayer(const uint8_t *data, int32_t width, int32_t h
   halfPixelOffsetU(1.0f - (0.5f / width)),
   halfPixelOffsetV(1.0f - (0.5f / height)) {}
 
-void ImageRgbaU8Impl::generatePyramid() {
+void ImageRgbaU8Impl::generatePyramidStructure(int32_t layerCount) {
+	int32_t currentWidth = this->width;
+	int32_t currentHeight = this->height;
+	// Allocate smaller pyramid images within the buffer
+	SafePointer<uint32_t> currentStart = buffer_getSafeData<uint32_t>(this->buffer, "Pyramid generation target");
+	for (int32_t m = 0; m < layerCount; m++) {
+		this->texture.mips[m] = TextureRgbaLayer(currentStart, currentWidth, currentHeight);
+		currentStart += currentWidth * currentHeight;
+		currentWidth /= 2;
+		currentHeight /= 2;
+	}
+	// Fill unused mip levels with duplicates of the last mip level
+	for (int32_t m = layerCount; m < MIP_BIN_COUNT; m++) {
+		// m - 1 is never negative, because layerCount is clamped to at least 1 and nobody would choose zero for MIP_BIN_COUNT.
+		this->texture.mips[m] = this->texture.mips[m - 1];
+	}
+	this->texture.layerCount = layerCount;
+}
+
+void ImageRgbaU8Impl::removePyramidStructure() {
+	for (int32_t m = 0; m < MIP_BIN_COUNT; m++) {
+		this->texture.mips[m] = TextureRgbaLayer(imageInternal::getSafeData<uint32_t>(*this), this->width, this->height);
+	}
+	// Declare the old pyramid invalid so that it will not be displayed while rendering, but keep the extra memory for next time it is generated.
+	this->texture.layerCount = 1;
+}
+
+void ImageRgbaU8Impl::makeIntoTexture() {
+	// Check if the image is a valid texture.
 	if (!this->isTexture()) {
-		if (this->width < 4 || this->height < 4) {
-			printText("Cannot generate a pyramid from an image smaller than 4x4 pixels.\n");
-		} else if (this->width > 16384 || this->height > 16384) {
-			printText("Cannot generate a pyramid from an image larger than 16384x16384 pixels.\n");
-		} else if (getSizeGroup(this->width) == -1 || getSizeGroup(this->height) == -1) {
-			printText("Cannot generate a pyramid from image dimensions that are not powers of two.\n");
-		} else if (this->stride > this->width * pixelSize) {
-			printText("Cannot generate a pyramid from an image that contains padding.\n");
-		} else if (this->stride < this->width * pixelSize) {
-			printText("Cannot generate a pyramid from an image with corrupted stride.\n");
-		} else {
-			printText("Cannot generate a pyramid from an image that has not been initialized correctly.\n");
-		}
+		// Get valid dimensions.
+		int newWidth = roundSize(this->width);
+		int newHeight = roundSize(this->height);
+		// Create a new image with the correct dimensions.
+		ImageRgbaU8Impl result = ImageRgbaU8Impl(newWidth, newHeight, DSR_DEFAULT_ALIGNMENT);
+		// Resize the image content with bi-linear interpolation.
+		imageImpl_resizeToTarget(result, *this, true);
+		// Take over the new image's content.
+		this->buffer = result.buffer;
+		this->width = result.width;
+		this->height = result.height;
+		this->stride = result.stride;
+		this->startOffset = 0; // Starts from the beginning.
+		this->isSubImage = false; // No longer sharing buffer with any parent image.
+	}
+}
+
+void ImageRgbaU8Impl::generatePyramid() {
+	int32_t fullSizeGroup = getSizeGroup(std::min(this->width, this->height));
+	int32_t layerCount = std::min(std::max(fullSizeGroup - smallestSizeGroup, 1), MIP_BIN_COUNT);
+	if (this->texture.layerCount > 1) {
+		// Regenerate smaller images without wasting time with any redundant checks,
+		//   because the image has already been approved the first time it had the pyramid allocated.
+		updatePyramid(this->texture, layerCount);
 	} else {
-		int32_t pixelSize = this->pixelSize;
-		int32_t mipmaps = std::min(std::max(getSizeGroup(std::min(this->width, this->height)) - 1, 1), MIP_BIN_COUNT);
-		if (!this->texture.hasMipBuffer()) {
-			this->texture.pyramidBuffer = buffer_create(getPyramidSize(this->width / 2, this->height / 2, pixelSize, mipmaps - 1));
-		}
-		// Point to the image's original buffer in mip level 0
-		SafePointer<uint8_t> currentStart = imageInternal::getSafeData<uint8_t>(*this);
+		// In the event of having to correct a bad image into a valid texture, there will be two reallocations.
+		this->makeIntoTexture();
+		Buffer oldBuffer = this->buffer;
+		SafePointer<uint32_t> oldData = buffer_getSafeData<uint32_t>(oldBuffer, "Pyramid generation source") + this->startOffset;
+		this->buffer = buffer_create(getPyramidSize(this->width, this->height, layerCount));
 		int32_t currentWidth = this->width;
 		int32_t currentHeight = this->height;
-		this->texture.mips[0] = TextureRgbaLayer(currentStart.getUnsafe(), currentWidth, currentHeight);
-		// Create smaller pyramid images in the extra buffer
-		SafePointer<uint8_t> previousStart = currentStart;
-		currentStart = buffer_getSafeData<uint8_t>(this->texture.pyramidBuffer, "Pyramid generation target");
-		for (int32_t m = 1; m < mipmaps; m++) {
-			currentWidth /= 2;
-			currentHeight /= 2;
-			this->texture.mips[m] = TextureRgbaLayer(currentStart.getUnsafe(), currentWidth, currentHeight);
-			int32_t size = currentWidth * currentHeight * pixelSize;
-			// In-place downscaling by two.
-			downScaleByTwo(currentStart, previousStart, currentWidth, currentHeight, pixelSize, currentWidth * pixelSize);
-			previousStart = currentStart;
-			currentStart.increaseBytes(size);
-		}
-		// Fill unused mip levels with duplicates of the last mip level
-		for (int32_t m = mipmaps; m < MIP_BIN_COUNT; m++) {
-			// m - 1 is never negative, because mipmaps is clamped to at least 1 and nobody would choose zero for MIP_BIN_COUNT.
-			this->texture.mips[m] = this->texture.mips[m - 1];
-		}
+		this->generatePyramidStructure(layerCount);
+		// Copy the image's old content while assuming that there is no padding.
+		safeMemoryCopy(this->texture.mips[0].data, oldData, this->width * this->height * pixelSize);
+		// Generate smaller images.
+		updatePyramid(this->texture, layerCount);
+		// Once an image had a pyramid generated, the new buffer will remain for as long as the image exists.
+		this->texture.layerCount = layerCount;
+		// Remove start offset because the old data has been cloned to create the new pyramid image.
+		this->startOffset = 0;
 	}
 }
 
 void ImageRgbaU8Impl::removePyramid() {
-	// Only try to remove if it has a pyramid
-	if (buffer_exists(this->texture.pyramidBuffer)) {
-		// Remove the pyramid's buffer
-		this->texture.pyramidBuffer = Buffer();
-		// Re-initialize
-		for (int32_t m = 0; m < MIP_BIN_COUNT; m++) {
-			this->texture.mips[m] = TextureRgbaLayer(imageInternal::getSafeData<uint8_t>(*this).getUnsafe(), this->width, this->height);
-		}
-	}
+	// Duplicate the original image when no longer showing the pyramid.
+	this->removePyramidStructure();
 }
 
 void ImageRgbaU8Impl::initializeRgbaImage() {
 	// If the image fills the criterias of a texture
-	if (getSizeGroup(this->width) >= 2
-	 && getSizeGroup(this->height) >= 2
-	 && this->stride == this->width * this->pixelSize) {
+	if (getSizeGroup(this->width) >= smallestSizeGroup
+	 && getSizeGroup(this->height) >= smallestSizeGroup
+	 && this->stride == this->width * pixelSize) {
 		// Initialize each mip bin to show the original image
-		for (int32_t m = 0; m < MIP_BIN_COUNT; m++) {
-			this->texture.mips[m] = TextureRgbaLayer(imageInternal::getSafeData<uint8_t>(*this).getUnsafe(), this->width, this->height);
-		}
+		this->removePyramidStructure();
 	}
 };
 

+ 47 - 46
Source/DFPSR/image/ImageRgbaU8.h

@@ -1,6 +1,6 @@
 // zlib open source license
 //
-// Copyright (c) 2017 to 2019 David Forsgren Piuva
+// Copyright (c) 2017 to 2023 David Forsgren Piuva
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
@@ -30,72 +30,67 @@
 
 namespace dsr {
 
+// TODO: Figure out how to handle very small textures where the full resolution is smaller than the smallest allowed MIP level.
+//       Larger SIMD vectors will change the smallest allowed textures if the bit pattern is used, which may be inconsistent if it keeps growing.
+//       Having 32x32 pixels as the minimum size would allow using up to 1024-bit SIMD without a problem.
 // TODO: Reallocate the image's buffer, so that the pyramid images are placed into the same allocation.
 //       This allow reading a texture with multiple mip levels using different 32-bit offsets in the same SIMD vector holding multiple groups of 2x2 pixels.
 // TODO: Adapt how far down to go in mip resolutions based on DSR_DEFAULT_ALIGNMENT, so that no mip level is padded in memory.
 //       This is needed so that the stride can be calculated using bit shifting from the mip level.
 //       The visual appearance may differ between SIMD lengths for low resolution textures, but not between computers running the same executable.
-// TODO: Store the smallest layer first in memory, so that the offset is a multiple of the smallest size following a pre-determined bit pattern.
-//       If one s is the number of pixels in the smallest mip layer, then the size of each layer n equals s * 2^n.
-//       The offset in s units is then the sum of all previous unpadded dimensions.
-//         0000000000000000 0
-//         0000000000000001 1
-//         0000000000000101 5
-//         0000000000010101 21
-//         0000000001010101 85
-//         0000000101010101 341
-//         0000010101010101 1365
-//         0001010101010101 5461
-//         0101010101010101 21845
-//       Then one can start with the offset to the largest mip layer in pixels or bytes as the initial mask and then mask out initial ones using a mask directly from the MIP calculation.
-//         0000000000000000 4x2 pixels at offset 0
-//         0000000000001000 8x4 pixels at offset 8
-//         0000000000101000 16x32 pixels at offset 40
-//         0000000010101000 32x64 pixels at offset 168
-//         0000001010101000 64x128 pixels at offset 680
-//         0000101010101000 128x256 pixels at offset 2728 (Full unmasked offset leading to the highest mip level)
-//       Masks for different visibility.
-//         0000000000000011 Very far away or seen from the side
-//         0000000000111111 Far away or seen from the side
-//         0000001111111111 Normal viewing
-//         0011111111111111 Close with many screen pixels per texels.
-//       The difficult part is how to generate a good mip level offset mask from the pixel coordinate derivation from groups of 2x2 pixels.
-//         The offset is not exactly exponential, so there will be visual tradeoffs between artifacts in this approximation.
-//       One could take the texture coordinate offset per pixel as the initial value and
-//         then repeat shifting and or masking at power of two offsets to only get ones after the initial one, but this would require many cycles.
-//           Pixels per texel in full resolution times full resolution offset:
-//             00000000000010101000110110011001
-//           Mip offset mask:
-//             00000000000011111111111111111111
-//           Full resolution mip offset:
-//             00000000001010101010101010000000
-//           Final mip offset containing half width and height:
-//             00000000000010101010101010000000
+// TODO: Begin by replacing the lookup table for pyramid layers with template inline functions, because figuring out how to get start offset and stride consistently may take time.
+//       Pixel loops will later look up the masks once and store it in SIMD vectors to avoid fetching it from memory multiple times from potential memory aliasing.
+// IDEA: Keep the same order of mip layers, but mask out offset bits from the right side.
+//       When the most significant bit is masked out, it jumps to the full resolution image at offset zero.
+//       Offsets
+//         00000000000000000000000000000000 Full resolution of 64x64
+//         00000000000000000000010000000000 Half resolution of 32x32
+//         00000000000000000000010100000000 Quarter resolution of 16x16
+//         00000000000000000000010101000000 Low resolution of 8x8
+//         00000000000000000000010101010000 Lowest resolution of 4x4
+//       Power of 4 offset masks
+//         11111111111111111100000000000000 Show at most 16384 pixels (clamped to full resolution because no more bits are masked out)
+//         11111111111111111111000000000000 Show at most 4096 pixels (full resolution for the image)
+//         11111111111111111111110000000000 Show at most 1024 pixels
+//         11111111111111111111111100000000 Show at most 256 pixels
+//         11111111111111111111111111000000 Show at most 64 pixels
+//         11111111111111111111111111110000 Show at most 16 pixels
+// PROBLEMS:
+//   * How can stride be calculated in the same way as the start offset?
+//     - Consistently in base two, not using the base 4 mask.
+//     - Limited to the range of available resolutions, not going to stride 512 when the full resolution stride is 256.
+//   * What about the width and height masks, can they reuse the same bit masking to avoid looking up data with scalar operations?
+//   * What should be done about very small textures?
+//     Automatically scale them up to the minimum resolution and leave the original image in the middle of the buffer?
+//     Change minimum size requirements?
+//       This would be the simplest approach and nobody would want their textures up-scaled anyway if one can easily redraw images in a higher resolution.
 
 // Pointing to the parent image using raw pointers for fast rendering. May not exceed the lifetime of the parent image!
 struct TextureRgbaLayer {
-	const uint8_t *data = 0;
+	SafePointer<uint32_t> data;
 	int32_t strideShift = 0;
 	uint32_t widthMask = 0, heightMask = 0;
 	int32_t width = 0, height = 0;
 	float subWidth = 0.0f, subHeight = 0.0f; // TODO: Better names?
 	float halfPixelOffsetU = 0.0f, halfPixelOffsetV = 0.0f;
 	TextureRgbaLayer();
-	TextureRgbaLayer(const uint8_t *data, int32_t width, int32_t height);
+	TextureRgbaLayer(SafePointer<uint32_t> data, int32_t width, int32_t height);
 	// Can it be sampled as a texture
-	bool exists() const { return this->data != nullptr; }
+	bool exists() const { return this->data.getUnsafe() != nullptr; }
 };
 
+// TODO: Try to replace with generated bit masks from inline functions.
 #define MIP_BIN_COUNT 5
 
-// Pointing to the parent image using raw pointers for fast rendering. Not not separate from the image!
+// Pointing to the parent image using raw pointers for fast rendering. Do not separate from the image!
 struct TextureRgba {
-	Buffer pyramidBuffer; // Storing the smaller mip levels
 	TextureRgbaLayer mips[MIP_BIN_COUNT]; // Pointing to all mip levels including the original image
+	int32_t layerCount = 0; // 0 Means that there are no pointers, 1 means that you have a pyramid but only one layer.
 	// Can it be sampled as a texture
 	bool exists() const { return this->mips[0].exists(); }
 	// Does it have a mip pyramid generated for smoother sampling
-	bool hasMipBuffer() const { return this->pyramidBuffer.get() != nullptr; }
+	// TODO: Rename once there is no separate MIP buffer, just a single pyramid buffer.
+	bool hasMipBuffer() const { return this->layerCount != 0; }
 };
 
 class ImageRgbaU8Impl : public ImageImpl {
@@ -110,13 +105,19 @@ public:
 	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, int32_t alignment);
 	// Native canvas constructor
 	ImageRgbaU8Impl(int32_t newWidth, int32_t newHeight, PackOrderIndex packOrderIndex, int32_t alignment);
-	// Fast reading
-	TextureRgba texture; // The texture view
-	void initializeRgbaImage(); // Points to level 0 from all bins to allow rendering
+	// The texture view for fast reading
+	TextureRgba texture;
+	// Points to level 0 from all bins to allow rendering
+	void initializeRgbaImage();
+	// Resizes the image to valid texture dimensions
+	void makeIntoTexture();
 	void generatePyramid(); // Fills the following bins with smaller images
 	void removePyramid();
 	bool isTexture() const;
 	static bool isTexture(const ImageRgbaU8Impl* image); // Null cannot be sampled as a texture
+private:
+	void generatePyramidStructure(int32_t layerCount);
+	void removePyramidStructure();
 public:
 	// Conversion to monochrome by extracting a channel
 	ImageU8Impl getChannel(int32_t channelIndex) const;

+ 9 - 9
Source/DFPSR/render/shader/RgbaMultiply.h

@@ -84,8 +84,8 @@ public:
 	Rgba_F32 getPixels_2x2(const F32x4x3 &vertexWeights) const override {
 		if (HAS_DIFFUSE_MAP && !HAS_LIGHT_MAP && COLORLESS) {
 			// Optimized for diffuse only
-			ALIGN16 F32x4 u1(shaderMethods::interpolate(this->texCoords.u1, vertexWeights));
-			ALIGN16 F32x4 v1(shaderMethods::interpolate(this->texCoords.v1, vertexWeights));
+			F32x4 u1(shaderMethods::interpolate(this->texCoords.u1, vertexWeights));
+			F32x4 v1(shaderMethods::interpolate(this->texCoords.v1, vertexWeights));
 			if (DISABLE_MIPMAP) {
 				return shaderMethods::sample_F32<Interpolation::BL, false>(this->diffuseLayer, u1, v1);
 			} else {
@@ -93,18 +93,18 @@ public:
 			}
 		} else if (HAS_LIGHT_MAP && !HAS_DIFFUSE_MAP && COLORLESS) {
 			// Optimized for light only
-			ALIGN16 F32x4 u2(shaderMethods::interpolate(this->texCoords.u2, vertexWeights));
-			ALIGN16 F32x4 v2(shaderMethods::interpolate(this->texCoords.v2, vertexWeights));
+			F32x4 u2(shaderMethods::interpolate(this->texCoords.u2, vertexWeights));
+			F32x4 v2(shaderMethods::interpolate(this->texCoords.v2, vertexWeights));
 			return shaderMethods::sample_F32<Interpolation::BL, false>(this->lightLayer, u2, v2);
 		} else {
 			// Interpolate the vertex color
-			ALIGN16 Rgba_F32 color = HAS_VERTEX_FADING ?
+			Rgba_F32 color = HAS_VERTEX_FADING ?
 			  shaderMethods::interpolateVertexColor(this->colors.red, this->colors.green, this->colors.blue, this->colors.alpha, vertexWeights) :
 			  Rgba_F32(F32x4(this->colors.red.x), F32x4(this->colors.green.x), F32x4(this->colors.blue.x), F32x4(this->colors.alpha.x));
 			// Sample diffuse
 			if (HAS_DIFFUSE_MAP) {
-				ALIGN16 F32x4 u1(shaderMethods::interpolate(this->texCoords.u1, vertexWeights));
-				ALIGN16 F32x4 v1(shaderMethods::interpolate(this->texCoords.v1, vertexWeights));
+				F32x4 u1(shaderMethods::interpolate(this->texCoords.u1, vertexWeights));
+				F32x4 v1(shaderMethods::interpolate(this->texCoords.v1, vertexWeights));
 				if (DISABLE_MIPMAP) {
 					color = color * shaderMethods::sample_F32<Interpolation::BL, false>(this->diffuseLayer, u1, v1);
 				} else {
@@ -113,8 +113,8 @@ public:
 			}
 			// Sample lightmap
 			if (HAS_LIGHT_MAP) {
-				ALIGN16 F32x4 u2(shaderMethods::interpolate(this->texCoords.u2, vertexWeights));
-				ALIGN16 F32x4 v2(shaderMethods::interpolate(this->texCoords.v2, vertexWeights));
+				F32x4 u2(shaderMethods::interpolate(this->texCoords.u2, vertexWeights));
+				F32x4 v2(shaderMethods::interpolate(this->texCoords.v2, vertexWeights));
 				color = color * shaderMethods::sample_F32<Interpolation::BL, false>(this->lightLayer, u2, v2);
 			}
 			return color;

+ 27 - 30
Source/DFPSR/render/shader/shaderMethods.h

@@ -37,9 +37,9 @@ namespace dsr {
 namespace shaderMethods {
 	// Returns the linear interpolation of the values using corresponding weight ratios for A, B and C in 4 pixels at the same time.
 	inline F32x4 interpolate(const FVector3D &vertexData, const F32x4x3 &vertexWeights) {
-		ALIGN16 F32x4 vMA = vertexData.x * vertexWeights.v1;
-		ALIGN16 F32x4 vMB = vertexData.y * vertexWeights.v2;
-		ALIGN16 F32x4 vMC = vertexData.z * vertexWeights.v3;
+		F32x4 vMA = vertexData.x * vertexWeights.v1;
+		F32x4 vMB = vertexData.y * vertexWeights.v2;
+		F32x4 vMC = vertexData.z * vertexWeights.v3;
 		return vMA + vMB + vMC;
 	}
 
@@ -55,14 +55,14 @@ namespace shaderMethods {
 	// Returns (colorA * weightA + colorB * weightB) / 256 as bytes
 	// weightA and weightB should contain pairs of the same 16-bit weights for each of the 4 pixels in the corresponding A and B colors
 	inline U32x4 weightColors(const U32x4 &colorA, const U16x8 &weightA, const U32x4 &colorB, const U16x8 &weightB) {
-		ALIGN16 U32x4 lowMask(0x00FF00FFu);
-		ALIGN16 U16x8 lowColorA = U16x8(colorA & lowMask);
-		ALIGN16 U16x8 lowColorB = U16x8(colorB & lowMask);
-		ALIGN16 U32x4 highMask(0xFF00FF00u);
-		ALIGN16 U16x8 highColorA = U16x8((colorA & highMask) >> 8);
-		ALIGN16 U16x8 highColorB = U16x8((colorB & highMask) >> 8);
-		ALIGN16 U32x4 lowColor = (((lowColorA * weightA) + (lowColorB * weightB))).get_U32();
-		ALIGN16 U32x4 highColor = (((highColorA * weightA) + (highColorB * weightB))).get_U32();
+		U32x4 lowMask(0x00FF00FFu);
+		U16x8 lowColorA = U16x8(colorA & lowMask);
+		U16x8 lowColorB = U16x8(colorB & lowMask);
+		U32x4 highMask(0xFF00FF00u);
+		U16x8 highColorA = U16x8((colorA & highMask) >> 8);
+		U16x8 highColorB = U16x8((colorB & highMask) >> 8);
+		U32x4 lowColor = (((lowColorA * weightA) + (lowColorB * weightB))).get_U32();
+		U32x4 highColor = (((highColorA * weightA) + (highColorB * weightB))).get_U32();
 		return (((lowColor >> 8) & lowMask) | (highColor & highMask));
 	}
 
@@ -79,8 +79,8 @@ namespace shaderMethods {
 
 	inline U32x4 mix_L(const U32x4 &colorA, const U32x4 &colorB, const U32x4 &weight) {
 		// Get inverse weights
-		ALIGN16 U16x8 weightB = repeatAs16Bits(weight);
-		ALIGN16 U16x8 weightA = invertWeight(weightB);
+		U16x8 weightB = repeatAs16Bits(weight);
+		U16x8 weightA = invertWeight(weightB);
 		// Multiply
 		return weightColors(colorA, weightA, colorB, weightB);
 	}
@@ -97,19 +97,8 @@ namespace shaderMethods {
 
 	// Single layer sampling methods
 	inline U32x4 sample_U32(const TextureRgbaLayer *source, const U32x4 &col, const U32x4 &row) {
-		#ifdef USE_AVX2
-			U32x4 pixelOffset((col + (row << (source->strideShift - 2)))); // PixelOffset = Column + Row * PixelStride
-			return U32x4(GATHER_U32x4_AVX2(source->data, pixelOffset.v, 4));
-			// return gather(source->data, pixelOffset.v); TODO: Needs SafePointer, so that this function can use the gather function with automatic emulation instead of hardcoding for AVX2
-		#else
-			UVector4D byteOffset = ((col << 2) + (row << source->strideShift)).get(); // ByteOffset = Column * 4 + Row * ByteStride
-			return U32x4(
-			  *((uint32_t*)(source->data + byteOffset.x)),
-			  *((uint32_t*)(source->data + byteOffset.y)),
-			  *((uint32_t*)(source->data + byteOffset.z)),
-			  *((uint32_t*)(source->data + byteOffset.w))
-			);
-		#endif
+		U32x4 pixelOffset((col + (row << (source->strideShift - 2)))); // PixelOffset = Column + Row * PixelStride
+		return gather(source->data, pixelOffset);
 	}
 
 	// How many mip levels down from here should be sampled for the given texture coordinates
@@ -140,7 +129,9 @@ namespace shaderMethods {
 	}
 
 	// Single layer sampling method
-	// Precondition: u, v > -0.875f = 1 - (0.5 / minimumMipSize)
+	// Preconditions:
+	//   u >= -halfPixelOffsetU
+	//   v >= -halfPixelOffsetV
 	template<Interpolation INTERPOLATION>
 	inline U32x4 sample_U32(const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
 		if (INTERPOLATION == Interpolation::BL) {
@@ -174,7 +165,9 @@ namespace shaderMethods {
 		}
 	}
 
-	// Precondition: u, v > -0.875f = 1 - (0.5 / minimumMipSize)
+	// Preconditions:
+	//   u >= -halfPixelOffsetU
+	//   v >= -halfPixelOffsetV
 	template<Interpolation INTERPOLATION, bool HIGH_QUALITY>
 	inline Rgba_F32 sample_F32(const TextureRgbaLayer *source, const F32x4 &u, const F32x4 &v) {
 		if (INTERPOLATION == Interpolation::BL) {
@@ -211,14 +204,18 @@ namespace shaderMethods {
 	}
 
 	// Multi layer sampling method
-	// Precondition: u, v > -0.875f = 1 - (0.5 / minimumMipSize)
+	// Preconditions:
+	//   u >= -halfPixelOffsetU
+	//   v >= -halfPixelOffsetV
 	template<Interpolation INTERPOLATION>
 	inline U32x4 sample_U32(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
 		int mipLevel = getMipLevel(source, u, v);
 		return sample_U32<INTERPOLATION>(&(source->mips[mipLevel]), u, v);
 	}
 
-	// Precondition: u, v > -0.875f = 1 - (0.5 / minimumMipSize)
+	// Preconditions:
+	//   u >= -halfPixelOffsetU
+	//   v >= -halfPixelOffsetV
 	template<Interpolation INTERPOLATION, bool HIGH_QUALITY>
 	inline Rgba_F32 sample_F32(const TextureRgba *source, const F32x4 &u, const F32x4 &v) {
 		int mipLevel = getMipLevel(source, u, v);