Fix some D3D issues with structured buffers

Panagiotis Christopoulos Charitos committed 1 year ago · parent commit fd51a665c0

+ 26 - 51
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -15,70 +15,51 @@ namespace anki {
 /// @addtogroup core
 /// @{
 
-/// @memberof GpuVisibleTransientMemoryPool
-class GpuVisibleTransientMemoryAllocation
+/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
+class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
 {
-	friend class GpuVisibleTransientMemoryPool;
+	template<typename>
+	friend class MakeSingleton;
 
 public:
-	Buffer& getBuffer() const
-	{
-		ANKI_ASSERT(isValid());
-		return *m_buffer;
-	}
-
-	PtrSize getOffset() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_offset;
-	}
-
-	PtrSize getRange() const
+	BufferView allocate(PtrSize size, PtrSize alignment = 0)
 	{
-		ANKI_ASSERT(isValid());
-		return m_size;
+		alignment = (alignment == 0) ? m_alignment : alignment;
+		PtrSize offset;
+		Buffer* buffer;
+		m_pool.allocate(size, alignment, offset, buffer);
+		return BufferView(buffer, offset, size);
 	}
 
-	Bool isValid() const
+	template<typename T>
+	BufferView allocateStructuredBuffer(U32 count)
 	{
-		return m_buffer != nullptr;
+		return allocateStructuredBuffer(count, sizeof(T));
 	}
 
-	operator BufferView() const;
-
-private:
-	Buffer* m_buffer = nullptr;
-	PtrSize m_offset = kMaxPtrSize;
-	PtrSize m_size = 0;
-};
-
-/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
-class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
-{
-	template<typename>
-	friend class MakeSingleton;
-
-public:
-	GpuVisibleTransientMemoryAllocation allocate(PtrSize size)
+	BufferView allocateStructuredBuffer(U32 count, U32 structureSize)
 	{
-		GpuVisibleTransientMemoryAllocation out;
-		m_pool.allocate(size, out.m_offset, out.m_buffer);
-		out.m_size = size;
-		return out;
+		return allocate(PtrSize(structureSize) * count, (m_structuredBufferAlignment == kMaxU32) ? structureSize : m_structuredBufferAlignment);
 	}
 
 	void endFrame();
 
 private:
 	StackGpuMemoryPool m_pool;
+	U32 m_alignment = 0;
 	U32 m_frame = 0;
+	U32 m_structuredBufferAlignment = 0;
 
 	GpuVisibleTransientMemoryPool()
 	{
-		U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
+		m_structuredBufferAlignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+										  ? kMaxU32
+										  : GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
+
+		m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
 								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllTransfer;
@@ -86,17 +67,11 @@ private:
 		{
 			buffUsage |= (BufferUsageBit::kAccelerationStructureBuildScratch | BufferUsageBit::kAccelerationStructureBuild);
 		}
-		m_pool.init(10_MB, 2.0, 0, alignment, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
+		m_pool.init(10_MB, 2.0, 0, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
 	}
 
 	~GpuVisibleTransientMemoryPool() = default;
 };
-
-inline GpuVisibleTransientMemoryAllocation::operator BufferView() const
-{
-	ANKI_ASSERT(isValid());
-	return {m_buffer, m_offset, m_size};
-}
 /// @}
 
 } // end namespace anki
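
For context, a minimal sketch of the new call pattern, assuming an initialized GrManager; MyElement is a hypothetical element type:

#include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>

using namespace anki;

// Hypothetical 16-byte element type.
struct MyElement
{
	F32 m_x, m_y, m_z, m_w;
};

void allocateExample()
{
	// One call instead of allocate(sizeof(MyElement) * 64). The pool aligns to
	// sizeof(MyElement) when m_structuredBufferNaturalAlignment is set (D3D)
	// and to m_storageBufferBindOffsetAlignment otherwise (Vulkan).
	const BufferView view = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<MyElement>(64);
	ANKI_ASSERT(view.getRange() == sizeof(MyElement) * 64);
}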

+ 3 - 0
AnKi/Gr/Common.h

@@ -201,6 +201,9 @@ public:
 	/// API version.
 	U8 m_majorApiVersion = 0;
 
+	/// Align structured buffers using the structure's size and not the m_storageBufferBindOffsetAlignment.
+	Bool m_structuredBufferNaturalAlignment = false;
+
 	/// RT.
 	Bool m_rayTracingEnabled = false;
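
In practice the flag selects the bind-offset alignment for structured buffers; a sketch of the selection logic that the GpuVisibleTransientMemoryPool constructor above implements (the function name is an assumption for illustration):

// D3D requires "natural" alignment: a StructuredBuffer view must start at a
// multiple of its stride. Vulkan instead uses the device's storage buffer
// bind offset alignment, independent of the stride.
inline U32 effectiveStructuredBufferAlignment(Bool naturalAlignment, U32 stride, U32 storageBufferBindOffsetAlignment)
{
	return naturalAlignment ? stride : storageBufferBindOffsetAlignment;
}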
 

+ 48 - 4
AnKi/Gr/D3D/D3DDescriptor.cpp

@@ -575,9 +575,10 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
 					getDevice().CopyDescriptorsSimple(1, samplerHeapOffset.getCpuOffset(), outDescriptor.m_heapOffset,
 													  D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER);
 				}
-				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite)
+						&& !!(inDescriptor.m_flags & DescriptorFlag::kByteAddressBuffer))
 				{
-					// RWStructuredBuffer or RWByteAddressBuffer
+					// RWByteAddressBuffer
 
 					ANKI_ASSERT(!outDescriptor.m_isHandle);
 
@@ -596,9 +597,31 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
 
 					getDevice().CreateUnorderedAccessView(view.m_resource, nullptr, &uavDesc, cbvSrvUavHeapOffset.getCpuOffset());
 				}
-				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				{
+					// RWStructuredBuffer
+
+					ANKI_ASSERT(!outDescriptor.m_isHandle);
+
+					const BufferView& view = outDescriptor.m_bufferView;
+					D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+					uavDesc.Format = DXGI_FORMAT_UNKNOWN;
+					uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+
+					ANKI_ASSERT((view.m_offset % inDescriptor.m_structuredBufferStride) == 0);
+					uavDesc.Buffer.FirstElement = view.m_offset / inDescriptor.m_structuredBufferStride;
+
+					ANKI_ASSERT((view.m_range % inDescriptor.m_structuredBufferStride) == 0);
+					uavDesc.Buffer.NumElements = U32(view.m_range / inDescriptor.m_structuredBufferStride);
+
+					uavDesc.Buffer.StructureByteStride = inDescriptor.m_structuredBufferStride;
+
+					getDevice().CreateUnorderedAccessView(view.m_resource, nullptr, &uavDesc, cbvSrvUavHeapOffset.getCpuOffset());
+				}
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite)
+						&& !!(inDescriptor.m_flags & DescriptorFlag::kByteAddressBuffer))
 				{
-					// StructuredBuffer or ByteAddressBuffer
+					// ByteAddressBuffer
 
 					ANKI_ASSERT(!outDescriptor.m_isHandle);
 					const BufferView& view = outDescriptor.m_bufferView;
@@ -617,6 +640,27 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
 
 					getDevice().CreateShaderResourceView(view.m_resource, &srvDesc, cbvSrvUavHeapOffset.getCpuOffset());
 				}
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				{
+					// StructuredBuffer
+
+					ANKI_ASSERT(!outDescriptor.m_isHandle);
+					const BufferView& view = outDescriptor.m_bufferView;
+					D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
+					srvDesc.Format = DXGI_FORMAT_UNKNOWN;
+					srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
+					srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+
+					ANKI_ASSERT((view.m_offset % inDescriptor.m_structuredBufferStride) == 0);
+					srvDesc.Buffer.FirstElement = view.m_offset / inDescriptor.m_structuredBufferStride;
+
+					ANKI_ASSERT((view.m_range % inDescriptor.m_structuredBufferStride) == 0);
+					srvDesc.Buffer.NumElements = U32(view.m_range / inDescriptor.m_structuredBufferStride);
+
+					srvDesc.Buffer.StructureByteStride = inDescriptor.m_structuredBufferStride;
+
+					getDevice().CreateShaderResourceView(view.m_resource, &srvDesc, cbvSrvUavHeapOffset.getCpuOffset());
+				}
 				else if(inDescriptor.m_type == DescriptorType::kTexelBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
 				{
 					// RWBuffer
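
The split into four branches separates raw byte-address views from structured views, which use DXGI_FORMAT_UNKNOWN plus a StructureByteStride and address the buffer in elements rather than bytes. A standalone sketch of the element math the new asserts enforce (e.g. offset 256 B, range 512 B, stride 32 B gives FirstElement 8 and NumElements 16):

#include <cassert>
#include <cstdint>

// Both the byte offset and the byte range of the view must be multiples of
// the stride, matching the ANKI_ASSERTs in the new branches.
uint64_t firstElement(uint64_t offsetBytes, uint32_t stride)
{
	assert(offsetBytes % stride == 0);
	return offsetBytes / stride;
}

uint32_t numElements(uint64_t rangeBytes, uint32_t stride)
{
	assert(rangeBytes % stride == 0);
	return uint32_t(rangeBytes / stride);
}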

+ 1 - 0
AnKi/Gr/D3D/D3DGrManager.cpp

@@ -432,6 +432,7 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 		m_capabilities.m_uniformBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
 		m_capabilities.m_uniformBufferMaxRange = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * D3D12_STANDARD_VECTOR_SIZE * sizeof(F32);
 		m_capabilities.m_storageBufferBindOffsetAlignment = D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT;
+		m_capabilities.m_structuredBufferNaturalAlignment = true;
 		m_capabilities.m_storageBufferMaxRange = 1 << D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
 		m_capabilities.m_texelBufferBindOffsetAlignment = 32;
 		m_capabilities.m_textureBufferMaxRange = kMaxU32; // ?

+ 3 - 12
AnKi/Gr/Utils/StackGpuMemoryPool.cpp

@@ -30,18 +30,11 @@ public:
 	PtrSize m_bias = 0;
 	PtrSize m_allocatedMemory = 0;
 	GrString m_bufferName;
-	U32 m_alignment = 0;
 	BufferUsageBit m_bufferUsage = BufferUsageBit::kNone;
 	BufferMapAccessBit m_bufferMap = BufferMapAccessBit::kNone;
 	U8 m_chunkCount = 0;
 	Bool m_allowToGrow = false;
 
-	// Builder interface stuff:
-	U32 getMaxAlignment() const
-	{
-		return m_alignment;
-	}
-
 	PtrSize getInitialChunkSize() const
 	{
 		return m_initialSize;
@@ -123,11 +116,10 @@ StackGpuMemoryPool::~StackGpuMemoryPool()
 	}
 }
 
-void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
+void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage,
 							  BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName)
 {
 	ANKI_ASSERT(m_builder == nullptr);
-	ANKI_ASSERT(initialSize > 0 && alignment > 0);
 	ANKI_ASSERT(nextChunkGrowScale >= 1.0);
 
 	m_builder = newInstance<Builder>(GrMemoryPool::getSingleton());
@@ -136,7 +128,6 @@ void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSi
 	inter.m_scale = nextChunkGrowScale;
 	inter.m_bias = nextChunkGrowBias;
 	inter.m_bufferName = bufferName;
-	inter.m_alignment = alignment;
 	inter.m_bufferUsage = bufferUsage;
 	inter.m_bufferMap = bufferMapping;
 	inter.m_allowToGrow = allowToGrow;
@@ -147,11 +138,11 @@ void StackGpuMemoryPool::reset()
 	m_builder->reset();
 }
 
-void StackGpuMemoryPool::allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
+void StackGpuMemoryPool::allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
 {
 	Chunk* chunk;
 	PtrSize offset;
-	const Error err = m_builder->allocate(size, 1, chunk, offset);
+	const Error err = m_builder->allocate(size, alignment, chunk, offset);
 	if(err)
 	{
 		ANKI_GR_LOGF("Allocation failed");

+ 5 - 5
AnKi/Gr/Utils/StackGpuMemoryPool.h

@@ -25,18 +25,18 @@ public:
 
 	StackGpuMemoryPool& operator=(const StackGpuMemoryPool&) = delete; // Non-copyable
 
-	void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
-			  BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName);
+	void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage, BufferMapAccessBit bufferMapping,
+			  Bool allowToGrow, CString bufferName);
 
 	/// @note It's thread-safe against other allocate()
-	void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer)
+	void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer)
 	{
 		void* dummyMapped = nullptr;
-		allocate(size, outOffset, buffer, dummyMapped);
+		allocate(size, alignment, outOffset, buffer, dummyMapped);
 	}
 
 	/// @note It's thread-safe against other allocate()
-	void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
+	void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
 
 	void reset();
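
A minimal sketch of the updated call site, assuming a pool initialized as in GpuVisibleTransientMemoryPool above; the alignment is now chosen per allocation instead of being fixed in init():

#include <AnKi/Gr/Utils/StackGpuMemoryPool.h>

using namespace anki;

void suballocate(StackGpuMemoryPool& pool)
{
	PtrSize offset;
	Buffer* buffer;
	pool.allocate(256, 16, offset, buffer); // 256 bytes at a 16-byte boundary
	ANKI_ASSERT((offset % 16) == 0);
}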
 

+ 1 - 0
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -809,6 +809,7 @@ Error GrManagerImpl::initInstance()
 	m_capabilities.m_uniformBufferMaxRange = m_devProps.properties.limits.maxUniformBufferRange;
 	m_capabilities.m_storageBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minStorageBufferOffsetAlignment));
+	m_capabilities.m_structuredBufferNaturalAlignment = false;
 	m_capabilities.m_storageBufferMaxRange = m_devProps.properties.limits.maxStorageBufferRange;
 	m_capabilities.m_texelBufferBindOffsetAlignment = max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
 	m_capabilities.m_textureBufferMaxRange = kMaxU32;

+ 4 - 4
AnKi/Renderer/ClusterBinning.cpp

@@ -52,7 +52,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	// Allocate the clusters buffer
 	{
 		const U32 clusterCount = getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y() + getRenderer().getZSplitCount();
-		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(Cluster) * clusterCount);
+		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(clusterCount);
 		m_runCtx.m_clustersHandle = rgraph.importBuffer(m_runCtx.m_clustersBuffer, BufferUsageBit::kNone);
 	}
 
@@ -62,7 +62,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	{
 		// Allocate memory for the indirect args
 		constexpr U32 dispatchCount = U32(GpuSceneNonRenderableObjectType::kCount) * 2;
-		indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * dispatchCount);
+		indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(dispatchCount);
 		indirectArgsHandle = rgraph.importBuffer(indirectArgsBuff, BufferUsageBit::kNone);
 
 		// Create the pass
@@ -208,8 +208,8 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 		// Allocations
 		for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
 		{
-			m_runCtx.m_packedObjectsBuffers[type] =
-				GpuVisibleTransientMemoryPool::getSingleton().allocate(kClusteredObjectSizes[type] * kMaxVisibleClusteredObjects[type]);
+			m_runCtx.m_packedObjectsBuffers[type] = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer(
+				kMaxVisibleClusteredObjects[type], kClusteredObjectSizes[type]);
 			m_runCtx.m_packedObjectsHandles[type] = rgraph.importBuffer(m_runCtx.m_packedObjectsBuffers[type], BufferUsageBit::kNone);
 		}
 

+ 1 - 1
AnKi/Renderer/LensFlare.cpp

@@ -54,7 +54,7 @@ void LensFlare::populateRenderGraph(RenderingContext& ctx)
 	RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
 	// Create indirect buffer
-	m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * flareCount);
+	m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(flareCount);
 	m_runCtx.m_indirectBuffHandle = rgraph.importBuffer(m_runCtx.m_indirectBuff, BufferUsageBit::kNone);
 
 	// Create the pass

+ 1 - 1
AnKi/Renderer/RtShadows.cpp

@@ -208,7 +208,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 	BufferHandle sbtBuildIndirectArgsHandle;
 	BufferView sbtBuildIndirectArgsBuffer;
 	{
-		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs));
+		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(1);
 		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kStorageComputeWrite);
 
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtShadows setup build SBT");

+ 1 - 1
AnKi/Renderer/ShadowMapping.cpp

@@ -539,7 +539,7 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
 {
 	BufferView clearTileIndirectArgs;
 
-	clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs));
+	clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(1);
 
 	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(passName);
 

+ 47 - 47
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -79,16 +79,17 @@ private:
 	U64 m_frameIdx = kMaxU64;
 };
 
-BufferView allocateTransientGpuMem(PtrSize size)
+template<typename T>
+static BufferView allocateStructuredBuffer(U32 count)
 {
 	BufferView out = {};
 
-	if(size)
+	if(count > 0)
 	{
-		g_gpuVisMemoryAllocatedStatVar.increment(size);
-		out = GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
+		g_gpuVisMemoryAllocatedStatVar.increment(sizeof(T) * count);
+		out = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
 
-		GpuVisMemoryStats::getSingleton().informAboutAllocation(size);
+		GpuVisMemoryStats::getSingleton().informAboutAllocation(sizeof(T) * count);
 	}
 
 	return out;
@@ -296,22 +297,19 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		}
 
 		m_persistentMemory.m_stage1.m_visibleRenderables =
-			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleLegacyRenderables);
-		m_persistentMemory.m_stage1.m_visibleMeshlets =
-			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * maxLimits.m_maxVisibleMeshlets);
+			allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleLegacyRenderables);
+		m_persistentMemory.m_stage1.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(maxLimits.m_maxVisibleMeshlets);
 
-		m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables =
-			allocateTransientGpuMem(sizeof(UVec4) * maxLimits.m_maxVisibleLegacyRenderables);
+		m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(maxLimits.m_maxVisibleLegacyRenderables);
 		m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
-			allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * maxLimits.m_maxVisibleLegacyRenderables);
+			allocateStructuredBuffer<DrawIndexedIndirectArgs>(maxLimits.m_maxVisibleLegacyRenderables);
 
-		m_persistentMemory.m_stage2Meshlet.m_meshletInstances =
-			allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+		m_persistentMemory.m_stage2Meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
 
 		m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
-			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleMeshlets);
+			allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleMeshlets);
 
-		m_persistentMemory.m_stage3.m_meshletInstances = allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+		m_persistentMemory.m_stage3.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
 
 		m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
 																		   : m_persistentMemory.m_stage1.m_visibleRenderables,
@@ -361,7 +359,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		BufferView m_hash;
 	} stage1Mem;
 
-	stage1Mem.m_counters = allocateTransientGpuMem(sizeof(U32) * U32(GpuVisibilityCounter::kCount));
+	stage1Mem.m_counters = allocateStructuredBuffer<U32>(U32(GpuVisibilityCounter::kCount));
 	if(in.m_limitMemory)
 	{
 		PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
@@ -380,21 +378,21 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 	else
 	{
-		stage1Mem.m_visibleRenderables = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables);
-		stage1Mem.m_visibleMeshlets = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets);
+		stage1Mem.m_visibleRenderables = allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(limits.m_maxVisibleLegacyRenderables);
+		stage1Mem.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(limits.m_maxVisibleMeshlets);
 	}
-	stage1Mem.m_renderablePrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_meshletPrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::kCount));
+	stage1Mem.m_renderablePrefixSums = allocateStructuredBuffer<U32>(bucketCount);
+	stage1Mem.m_meshletPrefixSums = allocateStructuredBuffer<U32>(bucketCount);
+	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateStructuredBuffer<DispatchIndirectArgs>(U32(GpuVisibilityIndirectDispatches::kCount));
 
 	if(in.m_gatherAabbIndices)
 	{
-		stage1Mem.m_visibleAabbIndices = allocateTransientGpuMem(sizeof(U32) * buckets.getBucketsActiveUserCount(in.m_technique));
+		stage1Mem.m_visibleAabbIndices = allocateStructuredBuffer<U32>(buckets.getBucketsActiveUserCount(in.m_technique));
 	}
 
 	if(in.m_hashVisibles)
 	{
-		stage1Mem.m_hash = allocateTransientGpuMem(sizeof(GpuVisibilityHash));
+		stage1Mem.m_hash = allocateStructuredBuffer<GpuVisibilityHash>(1);
 	}
 
 	// Allocate memory for stage 2
@@ -436,47 +434,48 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		}
 		else
 		{
-			stage2Mem.m_legacy.m_instanceRateRenderables = allocateTransientGpuMem(sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables);
-			stage2Mem.m_legacy.m_drawIndexedIndirectArgs =
-				allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_drawIndexedIndirectArgs = allocateStructuredBuffer<DrawIndexedIndirectArgs>(limits.m_maxVisibleLegacyRenderables);
 		}
 
-		stage2Mem.m_legacy.m_mdiDrawCounts = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+		stage2Mem.m_legacy.m_mdiDrawCounts = allocateStructuredBuffer<U32>(bucketCount);
 	}
 
 	if(bMeshletRendering)
 	{
 		if(bHwMeshletRendering)
 		{
-			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
 		}
 		else
 		{
-			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
 		}
 
-		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		const U32 newCount = limits.m_maxVisibleMeshlets;
 		if(in.m_limitMemory)
 		{
-			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
-			stage2Mem.m_meshlet.m_meshletInstances = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newRange);
+			ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
+			stage2Mem.m_meshlet.m_meshletInstances =
+				BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
 		}
 		else
 		{
-			stage2Mem.m_meshlet.m_meshletInstances = allocateTransientGpuMem(newRange);
+			stage2Mem.m_meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
 		}
 
 		if(bStoreMeshletsFailedHzb)
 		{
-			const PtrSize newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
+			const U32 newCount = limits.m_maxVisibleMeshlets;
 			if(in.m_limitMemory)
 			{
-				ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
-				stage2Mem.m_meshlet.m_meshletsFailedHzb = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newRange);
+				ANKI_ASSERT(newCount * sizeof(GpuVisibilityVisibleMeshletDesc) <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
+				stage2Mem.m_meshlet.m_meshletsFailedHzb =
+					BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newCount * sizeof(GpuVisibilityVisibleMeshletDesc));
 			}
 			else
 			{
-				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateTransientGpuMem(newRange);
+				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(newCount);
 			}
 		}
 	}
@@ -495,22 +494,23 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	{
 		if(bHwMeshletRendering)
 		{
-			stage3Mem.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+			stage3Mem.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
 		}
 		else
 		{
-			stage3Mem.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+			stage3Mem.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
 		}
 
-		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		const U32 newCount = limits.m_maxVisibleMeshlets;
 		if(in.m_limitMemory)
 		{
-			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
-			stage3Mem.m_meshletInstances = BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newRange);
+			ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
+			stage3Mem.m_meshletInstances =
+				BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
 		}
 		else
 		{
-			stage3Mem.m_meshletInstances = allocateTransientGpuMem(newRange);
+			stage3Mem.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
 		}
 	}
 
@@ -991,7 +991,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 	}
 
 	// Allocate memory for the result
-	out.m_visiblesBuffer = allocateTransientGpuMem((objCount + 1) * sizeof(U32));
+	out.m_visiblesBuffer = allocateStructuredBuffer<U32>(objCount + 1);
 	out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
 
 	// Create the renderpass
@@ -1093,12 +1093,12 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 	// Allocate the transient buffers
 	const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
 
-	out.m_instancesBuffer = allocateTransientGpuMem(aabbCount * sizeof(AccelerationStructureInstance));
+	out.m_instancesBuffer = allocateStructuredBuffer<AccelerationStructureInstance>(aabbCount);
 	out.m_someBufferHandle = rgraph.importBuffer(out.m_instancesBuffer, BufferUsageBit::kStorageComputeWrite);
 
-	out.m_renderableIndicesBuffer = allocateTransientGpuMem((aabbCount + 1) * sizeof(U32));
+	out.m_renderableIndicesBuffer = allocateStructuredBuffer<U32>(aabbCount + 1);
 
-	const BufferView zeroInstancesDispatchArgsBuff = allocateTransientGpuMem(sizeof(DispatchIndirectArgs));
+	const BufferView zeroInstancesDispatchArgsBuff = allocateStructuredBuffer<DispatchIndirectArgs>(1);
 
 	// Create vis pass
 	{

+ 0 - 5
AnKi/Resource/TransferGpuAllocator.h

@@ -154,11 +154,6 @@ private:
 
 		// The rest of the functions implement the StackAllocatorBuilder TInterface.
 
-		constexpr PtrSize getMaxAlignment()
-		{
-			return kGpuBufferAlignment;
-		}
-
 		constexpr PtrSize getInitialChunkSize() const
 		{
 			return kChunkInitialSize;

+ 1 - 1
AnKi/Scene/SceneGraph.cpp

@@ -104,7 +104,7 @@ Error SceneGraph::init(AllocAlignedCallback allocCallback, void* allocCallbackDa
 {
 	SceneMemoryPool::allocateSingleton(allocCallback, allocCallbackData);
 
-	m_framePool.init(allocCallback, allocCallbackData, 1_MB, 2.0, 0, true, ANKI_SAFE_ALIGNMENT, "SceneGraphFramePool");
+	m_framePool.init(allocCallback, allocCallbackData, 1_MB, 2.0, 0, true, "SceneGraphFramePool");
 
 	// Init the default main camera
 	ANKI_CHECK(newSceneNode<SceneNode>("mainCamera", m_defaultMainCam));

+ 2 - 3
AnKi/Util/MemoryPool.cpp

@@ -276,15 +276,13 @@ void StackMemoryPool::StackAllocatorBuilderInterface::recycleChunk([[maybe_unuse
 }
 
 void StackMemoryPool::init(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale, PtrSize nextChunkBias,
-						   Bool ignoreDeallocationErrors, U32 alignmentBytes, const Char* name)
+						   Bool ignoreDeallocationErrors, const Char* name)
 {
 	ANKI_ASSERT(initialChunkSize > 0);
 	ANKI_ASSERT(nextChunkScale >= 1.0);
-	ANKI_ASSERT(alignmentBytes > 0 && alignmentBytes <= kMaxAlignment);
 	BaseMemoryPool::init(allocCb, allocCbUserData, name);
 
 	m_builder.getInterface().m_parent = this;
-	m_builder.getInterface().m_alignmentBytes = alignmentBytes;
 	m_builder.getInterface().m_ignoreDeallocationErrors = ignoreDeallocationErrors;
 	m_builder.getInterface().m_initialChunkSize = initialChunkSize;
 	m_builder.getInterface().m_nextChunkScale = nextChunkScale;
@@ -301,6 +299,7 @@ void StackMemoryPool::destroy()
 void* StackMemoryPool::allocate(PtrSize size, PtrSize alignment)
 {
 	ANKI_ASSERT(size > 0);
+	ANKI_ASSERT(alignment > 0 && alignment <= kMaxAlignment);
 
 	Chunk* chunk;
 	PtrSize offset;

+ 4 - 15
AnKi/Util/MemoryPool.h

@@ -195,11 +195,10 @@ public:
 
 	/// @see init
 	StackMemoryPool(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale = 2.0,
-					PtrSize nextChunkBias = 0, Bool ignoreDeallocationErrors = true, U32 alignmentBytes = ANKI_SAFE_ALIGNMENT,
-					const Char* name = nullptr)
+					PtrSize nextChunkBias = 0, Bool ignoreDeallocationErrors = true, const Char* name = nullptr)
 		: StackMemoryPool()
 	{
-		init(allocCb, allocCbUserData, initialChunkSize, nextChunkScale, nextChunkBias, ignoreDeallocationErrors, alignmentBytes, name);
+		init(allocCb, allocCbUserData, initialChunkSize, nextChunkScale, nextChunkBias, ignoreDeallocationErrors, name);
 	}
 
 	/// Destroy
@@ -214,12 +213,10 @@ public:
 	/// @param initialChunkSize The size of the first chunk.
 	/// @param nextChunkScale Value that controls the next chunk.
 	/// @param nextChunkBias Value that controls the next chunk.
-	/// @param ignoreDeallocationErrors Method free() may fail if the ptr is not in the top of the stack. Set that to
-	///        true to suppress such errors.
-	/// @param alignmentBytes The maximum supported alignment for returned memory.
+	/// @param ignoreDeallocationErrors Method free() may fail if the ptr is not in the top of the stack. Set that to true to suppress such errors.
 	/// @param name An optional name.
 	void init(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale = 2.0, PtrSize nextChunkBias = 0,
-			  Bool ignoreDeallocationErrors = true, U32 alignmentBytes = ANKI_SAFE_ALIGNMENT, const Char* name = nullptr);
+			  Bool ignoreDeallocationErrors = true, const Char* name = nullptr);
 
 	/// Manual destroy. The destructor calls that as well.
 	void destroy();
@@ -274,8 +271,6 @@ private:
 	public:
 		StackMemoryPool* m_parent = nullptr;
 
-		PtrSize m_alignmentBytes = 0;
-
 		Bool m_ignoreDeallocationErrors = false;
 
 		PtrSize m_initialChunkSize = 0;
@@ -286,12 +281,6 @@ private:
 
 		// The rest of the functions implement the StackAllocatorBuilder TInterface.
 
-		PtrSize getMaxAlignment() const
-		{
-			ANKI_ASSERT(m_alignmentBytes > 0);
-			return m_alignmentBytes;
-		}
-
 		PtrSize getInitialChunkSize() const
 		{
 			ANKI_ASSERT(m_initialChunkSize > 0);
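
The CPU-side StackMemoryPool gets the same treatment: the pool-wide alignmentBytes parameter is gone and allocate() validates its per-call alignment instead. A sketch under those assumptions:

#include <AnKi/Util/MemoryPool.h>

using namespace anki;

void cpuPoolExample(AllocAlignedCallback allocCb, void* userData)
{
	StackMemoryPool pool(allocCb, userData, 1_MB); // no alignment argument anymore
	void* p = pool.allocate(128, 16); // alignment is decided here, per call
	pool.free(p);
}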

+ 1 - 2
AnKi/Util/StackAllocatorBuilder.h

@@ -22,7 +22,6 @@ namespace anki {
 ///                @endcode
 /// @tparam TInterface This is the type of the interface that contains various info. Should have the following members:
 ///                    @code
-///                    U32 getMaxAlignment();
 ///                    PtrSize getInitialChunkSize();
 ///                    F64 getNextChunkGrowScale();
 ///                    PtrSize getNextChunkGrowBias();
@@ -52,7 +51,7 @@ public:
 
 	/// Allocate memory.
 	/// @param size The size to allocate.
-	/// @param alignment The alignment of the returned address.
+	/// @param alignment The alignment of the returned address. Can be anything, not only a power of two.
 	/// @param[out] chunk The chunk that the memory belongs to.
 	/// @param[out] offset The offset inside the chunk.
 	/// @note This is thread safe with itself.

+ 6 - 6
AnKi/Util/StackAllocatorBuilder.inl.h

@@ -38,12 +38,10 @@ void StackAllocatorBuilder<TChunk, TInterface, TLock>::destroy()
 }
 
 template<typename TChunk, typename TInterface, typename TLock>
-Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [[maybe_unused]] PtrSize alignment, TChunk*& chunk, PtrSize& offset)
+Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, PtrSize alignment, TChunk*& chunk, PtrSize& offset)
 {
-	ANKI_ASSERT(alignment <= m_interface.getMaxAlignment());
-
-	size = getAlignedRoundUp(m_interface.getMaxAlignment(), size);
 	ANKI_ASSERT(size > 0);
+	size += alignment;
 
 	chunk = nullptr;
 	offset = kMaxPtrSize;
@@ -73,7 +71,7 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 		}
 		else
 		{
-			// Need new chunk
+			// Need new chunk, create it and loop back
 
 			LockGuard<TLock> lock(m_lock);
 
@@ -98,7 +96,6 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 			}
 
 			nextChunkSize = max(size, nextChunkSize); // Can't have the allocation fail
-			alignRoundUp(m_interface.getMaxAlignment(), nextChunkSize); // Align again
 
 			TChunk* nextChunk;
 			if(crntChunk)
@@ -167,7 +164,10 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 		}
 	}
 
+	alignRoundUp(alignment, offset);
+
 	ANKI_ASSERT(chunk && offset != kMaxPtrSize);
+	ANKI_ASSERT(offset + size <= chunk->m_chunkSize);
 	return Error::kNone;
 }
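
Instead of rounding every size up to a pool-wide maximum alignment, the builder now reserves size + alignment bytes and rounds the returned offset up afterwards, which also works for alignments that are not powers of two. A standalone sketch of why the final assert always holds:

#include <cassert>
#include <cstdint>

// If [offset, offset + size + alignment) fits in the chunk, rounding offset
// up to the next multiple of alignment advances it by at most alignment - 1
// bytes, so the aligned allocation of size bytes still fits.
uint64_t alignedOffset(uint64_t offset, uint64_t size, uint64_t alignment, uint64_t chunkSize)
{
	assert(alignment > 0);
	assert(offset + size + alignment <= chunkSize); // what the builder reserved
	const uint64_t aligned = (offset + alignment - 1) / alignment * alignment; // any alignment, not only powers of two
	assert(aligned + size <= chunkSize); // mirrors the builder's final ANKI_ASSERT
	return aligned;
}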
 

+ 38 - 6
Tests/Gr/Gr.cpp

@@ -196,11 +196,18 @@ ANKI_TEST(Gr, Bindings)
 
 	{
 		constexpr const char* kSrc = R"(
+struct Foo3
+{
+	float x;
+	float y;
+	float z;
+};
+
 StructuredBuffer<float4> g_structured : register(t0);
 Texture2D g_tex : register(t2);
 Buffer<float4> g_buff : register(t3);
 
-RWStructuredBuffer<float4> g_rwstructured : register(u0, space2);
+RWStructuredBuffer<Foo3> g_rwstructured : register(u0, space2);
 RWTexture2D<float4> g_rwtex[3] : register(u2);
 RWBuffer<float4> g_rwbuff : register(u7);
 
@@ -228,7 +235,12 @@ SamplerState g_sampler : register(s2);
 [numthreads(1, 1, 1)]
 void main()
 {
-	g_rwstructured[0] = g_structured[0] + g_structured[1];
+	float3 tmp = (g_structured[0] + g_structured[1]).xyz;
+	Foo3 tmp3 = {tmp.x, tmp.y, tmp.z};
+	g_rwstructured[0] = tmp3;
+	tmp *= 2.0f;
+	Foo3 tmp3_ = {tmp.x, tmp.y, tmp.z};
+	g_rwstructured[1] = tmp3_;
 
 	g_rwtex[0][uint2(0, 0)] = g_consts.m_val;
 
@@ -244,6 +256,26 @@ void main()
 	g_rwbuff[0] = g_buff[0];
 }
 )";
+		struct Foo
+		{
+			F32 x;
+			F32 y;
+			F32 z;
+
+			Foo() = default;
+
+			Foo(Vec4 v)
+				: x(v.x())
+				, y(v.y())
+				, z(v.z())
+			{
+			}
+
+			Bool operator==(const Foo& b) const
+			{
+				return x == b.x && y == b.y && z == b.z;
+			}
+		};
 
 		TextureInitInfo texInit;
 		texInit.m_width = texInit.m_height = 1;
@@ -258,14 +290,14 @@ void main()
 		const Vec4 kMagicVec(1.0f, 2.0f, 3.0f, 4.0f);
 		const Vec4 kInvalidVec(1.0f, 2.0f, 3.0f, 4.0f);
 
-		const Array<Vec4, 2> data = {kMagicVec, kMagicVec};
+		const Array<Vec4, 2> data = {kMagicVec, kMagicVec * 2.0f};
 		BufferPtr structured = createBuffer(BufferUsageBit::kAllStorage, ConstWeakArray<Vec4>(data), "structured");
 
 		texInit.m_usage = TextureUsageBit::kSampledCompute | TextureUsageBit::kTransferDestination;
 		TexturePtr tex = createTexture2d(texInit, kMagicVec * 2.0f);
 
 		BufferPtr buff = createBuffer(BufferUsageBit::kAllTexel, kMagicVec * 2.0f, 1, "buff");
-		BufferPtr rwstructured = createBuffer(BufferUsageBit::kAllStorage, kInvalidVec, 1, "rwstructured");
+		BufferPtr rwstructured = createBuffer(BufferUsageBit::kAllStorage, Foo(kInvalidVec), 2, "rwstructured");
 		BufferPtr rwbuff = createBuffer(BufferUsageBit::kAllTexel, kInvalidVec, 1, "rwbuff");
 
 		Array<TexturePtr, 3> rwtex;
@@ -310,8 +342,8 @@ void main()
 		signalFence->clientWait(kMaxSecond);
 
 		// Check
-		validateBuffer(rwstructured, kMagicVec + kMagicVec);
-		validateBuffer(rwbuff, kMagicVec * 2.0f);
+		validateBuffer(rwstructured, ConstWeakArray(Array<Foo, 2>{kMagicVec + kMagicVec * 2.0f, (kMagicVec + kMagicVec * 2.0f) * 2.0f}));
+		validateBuffer(rwbuff, ConstWeakArray(Array<Vec4, 1>{kMagicVec * 2.0f}));
 	}
 
 	commonDestroy();

+ 7 - 4
Tests/Gr/GrCommon.h

@@ -184,7 +184,8 @@ inline void readBuffer(BufferPtr buff, DynamicArray<T>& out)
 		buffInit.m_usage = BufferUsageBit::kTransferDestination;
 		tmpBuff = GrManager::getSingleton().newBuffer(buffInit);
 
-		CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo(CommandBufferFlag::kSmallBatch));
+		CommandBufferPtr cmdb =
+			GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo(CommandBufferFlag::kGeneralWork | CommandBufferFlag::kSmallBatch));
 		cmdb->copyBufferToBuffer(BufferView(buff.get()), BufferView(tmpBuff.get()));
 		cmdb->endRecording();
 
@@ -202,14 +203,16 @@ inline void readBuffer(BufferPtr buff, DynamicArray<T>& out)
 }
 
 template<typename T>
-inline void validateBuffer(BufferPtr buff, T value)
+inline void validateBuffer(BufferPtr buff, ConstWeakArray<T> values)
 {
 	DynamicArray<T> cpuBuff;
 	readBuffer<T>(buff, cpuBuff);
 
-	for(const T& x : cpuBuff)
+	ANKI_ASSERT(values.getSize() == cpuBuff.getSize());
+
+	for(U32 i = 0; i < values.getSize(); ++i)
 	{
-		ANKI_TEST_EXPECT_EQ(x, value);
+		ANKI_TEST_EXPECT_EQ(cpuBuff[i], values[i]);
 	}
 }
 

+ 1 - 1
Tests/Gr/GrWorkGraphs.cpp

@@ -145,7 +145,7 @@ void thirdNode([MaxRecords(32)] GroupNodeInputRecords<ThirdNodeRecord> inp, uint
 		GrManager::getSingleton().submit(cmdb.get(), {}, &fence);
 		fence->clientWait(kMaxSecond);
 
-		validateBuffer(counterBuff, 122880);
+		validateBuffer(counterBuff, ConstWeakArray(Array<U32, 1>{122880}));
 	}
 
 	commonDestroy();