Browse Source

Implemented memory alignment for the virtual stack allocator.

David Piuva 1 year ago
parent
commit
072d215085

+ 76 - 30
Source/DFPSR/base/virtualStack.cpp

@@ -24,49 +24,95 @@
 #include "virtualStack.h"
 #include "virtualStack.h"
 
 
 namespace dsr {
 namespace dsr {
+	// A structure that is placed in front of each stack allocation while allocating backwards along decreasing addresses to allow aligning memory quickly using bit masking.
+	struct StackAllocationHeader {
+		uint32_t totalSize; // Size of both header and payload.
+		#ifdef SAFE_POINTER_CHECKS
+		uint32_t identity; // A unique identifier that can be used to reduce the risk of using a block of memory after it has been freed.
+		#endif
+		StackAllocationHeader(uint32_t totalSize);
+	};
+
 	// How many bytes that are allocated directly in thread local memory.
 	// How many bytes that are allocated directly in thread local memory.
-	static const size_t DSR_VIRTUAL_STACK_SIZE = 131072;
+	static const size_t VIRTUAL_STACK_SIZE = 131072;
+	// How many bytes are reserved for the header placed in front of each allocation.
+	static const size_t ALLOCATION_HEAD_SIZE = memory_getPaddedSize<StackAllocationHeader>();
+	
+	static const uintptr_t stackHeaderPaddedSize = memory_getPaddedSize<StackAllocationHeader>();
+	static const uintptr_t stackHeaderAlignmentAndMask = memory_createAlignmentAndMask((uintptr_t)alignof(StackAllocationHeader));
 
 
-	// TODO: Allow expanding using recycled heap memory when running out of stack space.
-	// TODO: Align the first allocation in address space from unaligned memory.
-	//       The easiest way would be to allocate memory in reverse order from the end.
-	//       * Subtract the amount of allocated memory from the previous uint8_t pointer.
-	//       * Use the pre-defined alignment mask that has zeroes for the rounded bits in the address.
-	//       * Place the allocation in the aligned location.
-	//       * Store an allocation size integer in front of the allocation to allow freeing.
-	//         The integer stores the total size of the size integer, allocation with padded size and alignment padding.
-	//         Arrays are allowed to access the padded size of all elements to allow optimizations.
-	//       * Store the new stack location pointing at the integer, with a fixed offset from the topmost allocation.
-	//       This would also make it easier to unwind the allocations when freeing memory.
-	//       * Read the integer pointed to and add it to the pointer.
+	#ifdef SAFE_POINTER_CHECKS
+		// In debug mode, each new thread creates a hash from its own identity to catch most of the memory errors.
+		std::hash<std::thread::id> hasher;
+		thread_local uint32_t threadIdentity = hasher(std::this_thread::get_id());
+		//   To check the allocation identity, subtract the padded size of the header from the base pointer, cast to the header type and compare to the pointer's identity.
+		thread_local uint32_t nextIdentity = threadIdentity;
+	#endif
+	StackAllocationHeader::StackAllocationHeader(uint32_t totalSize) : totalSize(totalSize) {
+		#ifdef SAFE_POINTER_CHECKS
+			// No identity may be zero, because identity zero is no identity.
+			if (nextIdentity == 0) nextIdentity++;
+			this->identity = nextIdentity;
+			nextIdentity++;
+		#endif
+	}
 
 
 	struct StackMemory {
 	struct StackMemory {
-		uint8_t data[DSR_VIRTUAL_STACK_SIZE];
-		uint64_t stackLocation = 0;
-		// TODO: Try to store stack locations between the allocations to avoid heap allocations.
-		List<uint64_t> allocationEnds;
+		uint8_t data[VIRTUAL_STACK_SIZE];
+		uint8_t *top = nullptr;
+		StackMemory() {
+			this->top = this->data + VIRTUAL_STACK_SIZE;
+		}
 	};
 	};
 	thread_local StackMemory virtualStack;
 	thread_local StackMemory virtualStack;
 
 
-	uint8_t *virtualStack_push(uint64_t paddedSize, uint64_t alignment) {
-		uint64_t oldStackLocation = virtualStack.stackLocation;
-		// Align the start location by rounding up and then add padded elements.
-		uint64_t startOffset = roundUp(virtualStack.stackLocation, alignment);
-		virtualStack.stackLocation = startOffset + paddedSize;
-		if (virtualStack.stackLocation > DSR_VIRTUAL_STACK_SIZE) {
-			throwError(U"Ran out of stack memory!\n"); // TODO: Expand automatically using more memory blocks instead of crashing.
+	// Returns the size of the allocation including alignment.
+	inline uint64_t increaseTop(uint64_t paddedSize, uintptr_t alignmentAndMask) {
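+		// Subtract the padded payload size from the top address and round down to the alignment, because allocations grow towards lower addresses.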
+		uintptr_t oldAddress = (uintptr_t)virtualStack.top;
+		uintptr_t newAddress = (oldAddress - paddedSize) & alignmentAndMask;
+		virtualStack.top = (uint8_t*)newAddress;
+		return oldAddress - newAddress;
+	}
+
+	inline void decreaseTop(uint64_t totalSize) {
+		// Remove the data and alignment.
+		virtualStack.top += totalSize;
+	}
+
+	uint8_t *virtualStack_push(uint64_t paddedSize, uintptr_t alignmentAndMask) {
+		uint8_t *oldTop = virtualStack.top;
+		// Allocate memory for payload.
+		uint64_t payloadTotalSize = increaseTop(paddedSize, alignmentAndMask);
+		// Get a pointer to the payload.
+		uint8_t *result = virtualStack.top;
+		// Allocate memory for header.
+		uint64_t headerTotalSize = increaseTop(stackHeaderPaddedSize, stackHeaderAlignmentAndMask);
+		// Check that we did not run out of memory.
+		if (virtualStack.top < virtualStack.data) {
+			// TODO: Expand automatically using heap memory instead of crashing.
+			throwError(U"Ran out of stack memory to allocate!\n");
+			virtualStack.top = oldTop;
 			return nullptr;
 			return nullptr;
 		} else {
 		} else {
-			virtualStack.allocationEnds.push(oldStackLocation);
-			// Clear the allocation for determinism.
-			std::memset((char*)(virtualStack.data + startOffset), 0, paddedSize);
-			return virtualStack.data + startOffset;
+			// Write the header to memory.
+			*((StackAllocationHeader*)virtualStack.top) = StackAllocationHeader(payloadTotalSize + headerTotalSize);
+			// Clear the new allocation for determinism.
+			std::memset((char*)result, 0, payloadTotalSize);
+			// Return a pointer to the payload.
+			return result;
 		}
 		}
 	}
 	}
 
 
 	void virtualStack_pop() {
 	void virtualStack_pop() {
-		virtualStack.stackLocation = virtualStack.allocationEnds.last();
-		virtualStack.allocationEnds.pop();
+		if (virtualStack.top + stackHeaderPaddedSize > virtualStack.data + VIRTUAL_STACK_SIZE) {
+			throwError(U"No more stack memory to pop!\n");
+		} else {
+			// Read the header.
+			StackAllocationHeader header = *((StackAllocationHeader*)virtualStack.top);
+			// Deallocate both header and payload using the stored total size.
+			decreaseTop(header.totalSize);
+		}
 	}
 	}
 
 
 }
 }

+ 18 - 8
Source/DFPSR/base/virtualStack.h

@@ -24,9 +24,6 @@
 #include "SafePointer.h"
 #include "SafePointer.h"
 #include "../api/stringAPI.h"
 #include "../api/stringAPI.h"
 
 
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
 #include <mutex>
 #include <mutex>
 #include <thread>
 #include <thread>
 
 
@@ -37,14 +34,27 @@ namespace dsr {
 	}
 	}
 
 
 	template <typename T>
 	template <typename T>
-	constexpr uint64_t memory_getPaddedElementSize() {
+	constexpr uint64_t memory_getPaddedSize() {
 		return roundUp((uint64_t)sizeof(T), (uint64_t)alignof(T));
 		return roundUp((uint64_t)sizeof(T), (uint64_t)alignof(T));
 	}
 	}
 
 
+	// Pre-condition:
+	//   alignment is a power of two (1, 2, 4, 8, 16, 32, 64...)
+	// Post-condition:
+	//   Returns a bit mask for rounding an integer down to the closest multiple of alignment.
+	constexpr uintptr_t memory_createAlignmentAndMask(uintptr_t alignment) {
+		// alignment = ...00001000...
+		// Subtracting one from a power of two gives a mask with ones for the remainder bits.
+		// remainder = ...00000111...
+		// Then we simply invert all bits of the remainder mask (bitwise NOT) to get the alignment mask for rounding down.
+		// mask      = ...11111000...
+		return ~(alignment - 1);
+	}
+
 	// Allocate memory in the virtual stack owned by the current thread.
 	// Allocate memory in the virtual stack owned by the current thread.
 	//   paddedSize is the number of bytes to allocate including all elements and internal padding.
 	//   paddedSize is the number of bytes to allocate including all elements and internal padding.
-	//   alignment is what the start address should be divisible by in bytes, which must be a power of two.
-	uint8_t *virtualStack_push(uint64_t paddedSize, uint64_t alignment);
+	//   alignmentAndMask should only contain zeroes at the bits to round away for alignment.
+	uint8_t *virtualStack_push(uint64_t paddedSize, uintptr_t alignmentAndMask);
 
 
 	// A simpler way to get the correct alignment is to allocate a number of elements with a specific type.
 	// A simpler way to get the correct alignment is to allocate a number of elements with a specific type.
 	// TODO: Create another function for manual alignment exceeding the type's alignment using another template argument.
 	// TODO: Create another function for manual alignment exceeding the type's alignment using another template argument.
@@ -53,9 +63,9 @@ namespace dsr {
 	template <typename T>
 	template <typename T>
 	SafePointer<T> virtualStack_push(uint64_t elementCount, const char *name) {
 	SafePointer<T> virtualStack_push(uint64_t elementCount, const char *name) {
 		// Calculate element size and multiply by element count to get the total size.
 		// Calculate element size and multiply by element count to get the total size.
-		uint64_t paddedSize = memory_getPaddedElementSize<T>() * elementCount;
+		uint64_t paddedSize = memory_getPaddedSize<T>() * elementCount;
 		// Allocate the data with the amount of alignment requested by the element type T.
 		// Allocate the data with the amount of alignment requested by the element type T.
-		uint8_t *data = virtualStack_push(paddedSize, (uint64_t)alignof(T));
+		uint8_t *data = virtualStack_push(paddedSize, memory_createAlignmentAndMask((uintptr_t)alignof(T)));
 		// Return a safe pointer to the allocated data.
 		// Return a safe pointer to the allocated data.
 		return SafePointer<T>(name, (T*)data, (intptr_t)paddedSize);
 		return SafePointer<T>(name, (T*)data, (intptr_t)paddedSize);
 	}
 	}

+ 3 - 3
Source/DFPSR/render/renderCore.cpp

@@ -328,10 +328,10 @@ static void executeTriangleDrawingDepth(ImageF32Impl *depthBuffer, const ITriang
 	int32_t rowCount = triangle.getBufferSize(clipBound, 1, 1);
 	int32_t rowCount = triangle.getBufferSize(clipBound, 1, 1);
 	if (rowCount > 0) {
 	if (rowCount > 0) {
 		int startRow;
 		int startRow;
-		RowInterval rows[rowCount];
-		triangle.getShape(startRow, rows, clipBound, 1, 1);
+		VirtualStackAllocation<RowInterval> rows(rowCount);
+		triangle.getShape(startRow, rows.data.getUnsafe(), clipBound, 1, 1);
 		Projection projection = triangle.getProjection(FVector3D(), FVector3D(), !AFFINE); // TODO: Create a weight using only depth to save time
 		Projection projection = triangle.getProjection(FVector3D(), FVector3D(), !AFFINE); // TODO: Create a weight using only depth to save time
-		RowShape shape = RowShape(startRow, rowCount, rows);
+		RowShape shape = RowShape(startRow, rowCount, rows.data.getUnsafe());
 		// Draw the triangle
 		// Draw the triangle
 		const int depthBufferStride = imageInternal::getStride(depthBuffer);
 		const int depthBufferStride = imageInternal::getStride(depthBuffer);
 		SafePointer<float> depthDataRow = imageInternal::getSafeData<float>(depthBuffer, shape.startRow);
 		SafePointer<float> depthDataRow = imageInternal::getSafeData<float>(depthBuffer, shape.startRow);