Fix some D3D issues with structured buffers

Panagiotis Christopoulos Charitos committed 1 year ago · parent commit fd51a665c0

+ 26 - 51
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -15,70 +15,51 @@ namespace anki {
 /// @addtogroup core
 /// @{
 
-/// @memberof GpuVisibleTransientMemoryPool
-class GpuVisibleTransientMemoryAllocation
+/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
+class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
 {
-	friend class GpuVisibleTransientMemoryPool;
+	template<typename>
+	friend class MakeSingleton;
 
 public:
-	Buffer& getBuffer() const
-	{
-		ANKI_ASSERT(isValid());
-		return *m_buffer;
-	}
-
-	PtrSize getOffset() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_offset;
-	}
-
-	PtrSize getRange() const
+	BufferView allocate(PtrSize size, PtrSize alignment = 0)
 	{
-		ANKI_ASSERT(isValid());
-		return m_size;
+		alignment = (alignment == 0) ? m_alignment : alignment;
+		PtrSize offset;
+		Buffer* buffer;
+		m_pool.allocate(size, alignment, offset, buffer);
+		return BufferView(buffer, offset, size);
 	}
 
-	Bool isValid() const
+	template<typename T>
+	BufferView allocateStructuredBuffer(U32 count)
 	{
-		return m_buffer != nullptr;
+		return allocateStructuredBuffer(count, sizeof(T));
 	}
 
-	operator BufferView() const;
-
-private:
-	Buffer* m_buffer = nullptr;
-	PtrSize m_offset = kMaxPtrSize;
-	PtrSize m_size = 0;
-};
-
-/// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
-class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMemoryPool>
-{
-	template<typename>
-	friend class MakeSingleton;
-
-public:
-	GpuVisibleTransientMemoryAllocation allocate(PtrSize size)
+	BufferView allocateStructuredBuffer(U32 count, U32 structureSize)
 	{
-		GpuVisibleTransientMemoryAllocation out;
-		m_pool.allocate(size, out.m_offset, out.m_buffer);
-		out.m_size = size;
-		return out;
+		return allocate(PtrSize(structureSize) * count, (m_structuredBufferAlignment == kMaxU32) ? structureSize : m_structuredBufferAlignment);
 	}
 
 	void endFrame();
 
 private:
 	StackGpuMemoryPool m_pool;
+	U32 m_alignment = 0;
 	U32 m_frame = 0;
+	U32 m_structuredBufferAlignment = 0;
 
 	GpuVisibleTransientMemoryPool()
 	{
-		U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
+		m_structuredBufferAlignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+										  ? kMaxU32
+										  : GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
+
+		m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
+		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
 								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllTransfer;
@@ -86,17 +67,11 @@ private:
 		{
 			buffUsage |= (BufferUsageBit::kAccelerationStructureBuildScratch | BufferUsageBit::kAccelerationStructureBuild);
 		}
-		m_pool.init(10_MB, 2.0, 0, alignment, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
+		m_pool.init(10_MB, 2.0, 0, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
 	}
 
 	~GpuVisibleTransientMemoryPool() = default;
 };
-
-inline GpuVisibleTransientMemoryAllocation::operator BufferView() const
-{
-	ANKI_ASSERT(isValid());
-	return {m_buffer, m_offset, m_size};
-}
 /// @}
 
 } // end namespace anki
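
For context, a minimal sketch of the new call pattern, assuming an initialized GrManager; MyElement is a hypothetical element type:

#include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>

using namespace anki;

// Hypothetical 16-byte element type.
struct MyElement
{
	F32 m_x, m_y, m_z, m_w;
};

void allocateExample()
{
	// One call instead of allocate(sizeof(MyElement) * 64). The pool aligns to
	// sizeof(MyElement) when m_structuredBufferNaturalAlignment is set (D3D)
	// and to m_storageBufferBindOffsetAlignment otherwise (Vulkan).
	const BufferView view = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<MyElement>(64);
	ANKI_ASSERT(view.getRange() == sizeof(MyElement) * 64);
}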

+ 3 - 0
AnKi/Gr/Common.h

@@ -201,6 +201,9 @@ public:
 	/// API version.
 	U8 m_majorApiVersion = 0;
 
+	/// Align structured buffers using the structure's size and not the m_storageBufferBindOffsetAlignment.
+	Bool m_structuredBufferNaturalAlignment = false;
+
 	/// RT.
 	Bool m_rayTracingEnabled = false;
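
In practice the flag selects the bind-offset alignment for structured buffers; a sketch of the selection logic that the GpuVisibleTransientMemoryPool constructor above implements (the function name is an assumption for illustration):

// D3D requires "natural" alignment: a StructuredBuffer view must start at a
// multiple of its stride. Vulkan instead uses the device's storage buffer
// bind offset alignment, independent of the stride.
inline U32 effectiveStructuredBufferAlignment(Bool naturalAlignment, U32 stride, U32 storageBufferBindOffsetAlignment)
{
	return naturalAlignment ? stride : storageBufferBindOffsetAlignment;
}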
 

+ 48 - 4
AnKi/Gr/D3D/D3DDescriptor.cpp

@@ -575,9 +575,10 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
 					getDevice().CopyDescriptorsSimple(1, samplerHeapOffset.getCpuOffset(), outDescriptor.m_heapOffset,
 													  D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER);
 				}
-				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite)
+						&& !!(inDescriptor.m_flags & DescriptorFlag::kByteAddressBuffer))
 				{
-					// RWStructuredBuffer or RWByteAddressBuffer
+					// RWByteAddressBuffer
 
 					ANKI_ASSERT(!outDescriptor.m_isHandle);
 
@@ -596,9 +597,31 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
 
 					getDevice().CreateUnorderedAccessView(view.m_resource, nullptr, &uavDesc, cbvSrvUavHeapOffset.getCpuOffset());
 				}
-				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				{
+					// RWStructuredBuffer
+
+					ANKI_ASSERT(!outDescriptor.m_isHandle);
+
+					const BufferView& view = outDescriptor.m_bufferView;
+					D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+					uavDesc.Format = DXGI_FORMAT_UNKNOWN;
+					uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+
+					ANKI_ASSERT((view.m_offset % inDescriptor.m_structuredBufferStride) == 0);
+					uavDesc.Buffer.FirstElement = view.m_offset / inDescriptor.m_structuredBufferStride;
+
+					ANKI_ASSERT((view.m_range % inDescriptor.m_structuredBufferStride) == 0);
+					uavDesc.Buffer.NumElements = U32(view.m_range / inDescriptor.m_structuredBufferStride);
+
+					uavDesc.Buffer.StructureByteStride = inDescriptor.m_structuredBufferStride;
+
+					getDevice().CreateUnorderedAccessView(view.m_resource, nullptr, &uavDesc, cbvSrvUavHeapOffset.getCpuOffset());
+				}
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite)
+						&& !!(inDescriptor.m_flags & DescriptorFlag::kByteAddressBuffer))
 				{
-					// StructuredBuffer or ByteAddressBuffer
+					// ByteAddressBuffer
 
 					ANKI_ASSERT(!outDescriptor.m_isHandle);
 					const BufferView& view = outDescriptor.m_bufferView;
@@ -617,6 +640,27 @@ void DescriptorState::flush(ID3D12GraphicsCommandList& cmdList)
 
 					getDevice().CreateShaderResourceView(view.m_resource, &srvDesc, cbvSrvUavHeapOffset.getCpuOffset());
 				}
+				else if(inDescriptor.m_type == DescriptorType::kStorageBuffer && !(inDescriptor.m_flags & DescriptorFlag::kWrite))
+				{
+					// StructuredBuffer
+
+					ANKI_ASSERT(!outDescriptor.m_isHandle);
+					const BufferView& view = outDescriptor.m_bufferView;
+					D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
+					srvDesc.Format = DXGI_FORMAT_UNKNOWN;
+					srvDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
+					srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+
+					ANKI_ASSERT((view.m_offset % inDescriptor.m_structuredBufferStride) == 0);
+					srvDesc.Buffer.FirstElement = view.m_offset / inDescriptor.m_structuredBufferStride;
+
+					ANKI_ASSERT((view.m_range % inDescriptor.m_structuredBufferStride) == 0);
+					srvDesc.Buffer.NumElements = U32(view.m_range / inDescriptor.m_structuredBufferStride);
+
+					srvDesc.Buffer.StructureByteStride = inDescriptor.m_structuredBufferStride;
+
+					getDevice().CreateShaderResourceView(view.m_resource, &srvDesc, cbvSrvUavHeapOffset.getCpuOffset());
+				}
 				else if(inDescriptor.m_type == DescriptorType::kTexelBuffer && !!(inDescriptor.m_flags & DescriptorFlag::kWrite))
 				{
 					// RWBuffer
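
The split into four branches separates raw byte-address views from structured views, which use DXGI_FORMAT_UNKNOWN plus a StructureByteStride and address the buffer in elements rather than bytes. A standalone sketch of the element math the new asserts enforce (e.g. offset 256 B, range 512 B, stride 32 B gives FirstElement 8 and NumElements 16):

#include <cassert>
#include <cstdint>

// Both the byte offset and the byte range of the view must be multiples of
// the stride, matching the ANKI_ASSERTs in the new branches.
uint64_t firstElement(uint64_t offsetBytes, uint32_t stride)
{
	assert(offsetBytes % stride == 0);
	return offsetBytes / stride;
}

uint32_t numElements(uint64_t rangeBytes, uint32_t stride)
{
	assert(rangeBytes % stride == 0);
	return uint32_t(rangeBytes / stride);
}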

+ 1 - 0
AnKi/Gr/D3D/D3DGrManager.cpp

@@ -432,6 +432,7 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 		m_capabilities.m_uniformBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
 		m_capabilities.m_uniformBufferMaxRange = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * D3D12_STANDARD_VECTOR_SIZE * sizeof(F32);
 		m_capabilities.m_storageBufferBindOffsetAlignment = D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT;
+		m_capabilities.m_structuredBufferNaturalAlignment = true;
 		m_capabilities.m_storageBufferMaxRange = 1 << D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
 		m_capabilities.m_texelBufferBindOffsetAlignment = 32;
 		m_capabilities.m_textureBufferMaxRange = kMaxU32; // ?

+ 3 - 12
AnKi/Gr/Utils/StackGpuMemoryPool.cpp

@@ -30,18 +30,11 @@ public:
 	PtrSize m_bias = 0;
 	PtrSize m_allocatedMemory = 0;
 	GrString m_bufferName;
-	U32 m_alignment = 0;
 	BufferUsageBit m_bufferUsage = BufferUsageBit::kNone;
 	BufferMapAccessBit m_bufferMap = BufferMapAccessBit::kNone;
 	U8 m_chunkCount = 0;
 	Bool m_allowToGrow = false;
 
-	// Builder interface stuff:
-	U32 getMaxAlignment() const
-	{
-		return m_alignment;
-	}
-
 	PtrSize getInitialChunkSize() const
 	{
 		return m_initialSize;
@@ -123,11 +116,10 @@ StackGpuMemoryPool::~StackGpuMemoryPool()
 	}
 }
 
-void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
+void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage,
 							  BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName)
 {
 	ANKI_ASSERT(m_builder == nullptr);
-	ANKI_ASSERT(initialSize > 0 && alignment > 0);
 	ANKI_ASSERT(nextChunkGrowScale >= 1.0);
 
 	m_builder = newInstance<Builder>(GrMemoryPool::getSingleton());
@@ -136,7 +128,6 @@ void StackGpuMemoryPool::init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSi
 	inter.m_scale = nextChunkGrowScale;
 	inter.m_bias = nextChunkGrowBias;
 	inter.m_bufferName = bufferName;
-	inter.m_alignment = alignment;
 	inter.m_bufferUsage = bufferUsage;
 	inter.m_bufferMap = bufferMapping;
 	inter.m_allowToGrow = allowToGrow;
@@ -147,11 +138,11 @@ void StackGpuMemoryPool::reset()
 	m_builder->reset();
 }
 
-void StackGpuMemoryPool::allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
+void StackGpuMemoryPool::allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory)
 {
 	Chunk* chunk;
 	PtrSize offset;
-	const Error err = m_builder->allocate(size, 1, chunk, offset);
+	const Error err = m_builder->allocate(size, alignment, chunk, offset);
 	if(err)
 	{
 		ANKI_GR_LOGF("Allocation failed");

+ 5 - 5
AnKi/Gr/Utils/StackGpuMemoryPool.h

@@ -25,18 +25,18 @@ public:
 
 	StackGpuMemoryPool& operator=(const StackGpuMemoryPool&) = delete; // Non-copyable
 
-	void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, U32 alignment, BufferUsageBit bufferUsage,
-			  BufferMapAccessBit bufferMapping, Bool allowToGrow, CString bufferName);
+	void init(PtrSize initialSize, F64 nextChunkGrowScale, PtrSize nextChunkGrowBias, BufferUsageBit bufferUsage, BufferMapAccessBit bufferMapping,
+			  Bool allowToGrow, CString bufferName);
 
 	/// @note It's thread-safe against other allocate()
-	void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer)
+	void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer)
 	{
 		void* dummyMapped = nullptr;
-		allocate(size, outOffset, buffer, dummyMapped);
+		allocate(size, alignment, outOffset, buffer, dummyMapped);
 	}
 
 	/// @note It's thread-safe against other allocate()
-	void allocate(PtrSize size, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
+	void allocate(PtrSize size, PtrSize alignment, PtrSize& outOffset, Buffer*& buffer, void*& mappedMemory);
 
 	void reset();
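
A minimal sketch of the updated call site, assuming a pool initialized as in GpuVisibleTransientMemoryPool above; the alignment is now chosen per allocation instead of being fixed in init():

#include <AnKi/Gr/Utils/StackGpuMemoryPool.h>

using namespace anki;

void suballocate(StackGpuMemoryPool& pool)
{
	PtrSize offset;
	Buffer* buffer;
	pool.allocate(256, 16, offset, buffer); // 256 bytes at a 16-byte boundary
	ANKI_ASSERT((offset % 16) == 0);
}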
 

+ 1 - 0
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -809,6 +809,7 @@ Error GrManagerImpl::initInstance()
 	m_capabilities.m_uniformBufferMaxRange = m_devProps.properties.limits.maxUniformBufferRange;
 	m_capabilities.m_storageBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minStorageBufferOffsetAlignment));
+	m_capabilities.m_structuredBufferNaturalAlignment = false;
 	m_capabilities.m_storageBufferMaxRange = m_devProps.properties.limits.maxStorageBufferRange;
 	m_capabilities.m_texelBufferBindOffsetAlignment = max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
 	m_capabilities.m_textureBufferMaxRange = kMaxU32;

+ 4 - 4
AnKi/Renderer/ClusterBinning.cpp

@@ -52,7 +52,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	// Allocate the clusters buffer
 	{
 		const U32 clusterCount = getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y() + getRenderer().getZSplitCount();
-		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(Cluster) * clusterCount);
+		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(clusterCount);
 		m_runCtx.m_clustersHandle = rgraph.importBuffer(m_runCtx.m_clustersBuffer, BufferUsageBit::kNone);
 	}
 
@@ -62,7 +62,7 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 	{
 		// Allocate memory for the indirect args
 		constexpr U32 dispatchCount = U32(GpuSceneNonRenderableObjectType::kCount) * 2;
-		indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * dispatchCount);
+		indirectArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(dispatchCount);
 		indirectArgsHandle = rgraph.importBuffer(indirectArgsBuff, BufferUsageBit::kNone);
 
 		// Create the pass
@@ -208,8 +208,8 @@ void ClusterBinning::populateRenderGraph(RenderingContext& ctx)
 		// Allocations
 		for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
 		{
-			m_runCtx.m_packedObjectsBuffers[type] =
-				GpuVisibleTransientMemoryPool::getSingleton().allocate(kClusteredObjectSizes[type] * kMaxVisibleClusteredObjects[type]);
+			m_runCtx.m_packedObjectsBuffers[type] = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer(
+				kMaxVisibleClusteredObjects[type], kClusteredObjectSizes[type]);
 			m_runCtx.m_packedObjectsHandles[type] = rgraph.importBuffer(m_runCtx.m_packedObjectsBuffers[type], BufferUsageBit::kNone);
 		}
 

+ 1 - 1
AnKi/Renderer/LensFlare.cpp

@@ -54,7 +54,7 @@ void LensFlare::populateRenderGraph(RenderingContext& ctx)
 	RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
 	// Create indirect buffer
-	m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * flareCount);
+	m_runCtx.m_indirectBuff = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(flareCount);
 	m_runCtx.m_indirectBuffHandle = rgraph.importBuffer(m_runCtx.m_indirectBuff, BufferUsageBit::kNone);
 
 	// Create the pass

+ 1 - 1
AnKi/Renderer/RtShadows.cpp

@@ -208,7 +208,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 	BufferHandle sbtBuildIndirectArgsHandle;
 	BufferView sbtBuildIndirectArgsBuffer;
 	{
-		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs));
+		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(1);
 		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kStorageComputeWrite);
 
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtShadows setup build SBT");

+ 1 - 1
AnKi/Renderer/ShadowMapping.cpp

@@ -539,7 +539,7 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
 {
 	BufferView clearTileIndirectArgs;
 
-	clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs));
+	clearTileIndirectArgs = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DrawIndirectArgs>(1);
 
 	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(passName);
 

+ 47 - 47
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -79,16 +79,17 @@ private:
 	U64 m_frameIdx = kMaxU64;
 };
 
-BufferView allocateTransientGpuMem(PtrSize size)
+template<typename T>
+static BufferView allocateStructuredBuffer(U32 count)
 {
 	BufferView out = {};
 
-	if(size)
+	if(count > 0)
 	{
-		g_gpuVisMemoryAllocatedStatVar.increment(size);
-		out = GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
+		g_gpuVisMemoryAllocatedStatVar.increment(sizeof(T) * count);
+		out = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
 
-		GpuVisMemoryStats::getSingleton().informAboutAllocation(size);
+		GpuVisMemoryStats::getSingleton().informAboutAllocation(sizeof(T) * count);
 	}
 
 	return out;
@@ -296,22 +297,19 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		}
 
 		m_persistentMemory.m_stage1.m_visibleRenderables =
-			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleLegacyRenderables);
-		m_persistentMemory.m_stage1.m_visibleMeshlets =
-			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * maxLimits.m_maxVisibleMeshlets);
+			allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleLegacyRenderables);
+		m_persistentMemory.m_stage1.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(maxLimits.m_maxVisibleMeshlets);
 
-		m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables =
-			allocateTransientGpuMem(sizeof(UVec4) * maxLimits.m_maxVisibleLegacyRenderables);
+		m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(maxLimits.m_maxVisibleLegacyRenderables);
 		m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
-			allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * maxLimits.m_maxVisibleLegacyRenderables);
+			allocateStructuredBuffer<DrawIndexedIndirectArgs>(maxLimits.m_maxVisibleLegacyRenderables);
 
-		m_persistentMemory.m_stage2Meshlet.m_meshletInstances =
-			allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+		m_persistentMemory.m_stage2Meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
 
 		m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
-			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleMeshlets);
+			allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(maxLimits.m_maxVisibleMeshlets);
 
-		m_persistentMemory.m_stage3.m_meshletInstances = allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+		m_persistentMemory.m_stage3.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(maxLimits.m_maxVisibleMeshlets);
 
 		m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
 																		   : m_persistentMemory.m_stage1.m_visibleRenderables,
@@ -361,7 +359,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		BufferView m_hash;
 	} stage1Mem;
 
-	stage1Mem.m_counters = allocateTransientGpuMem(sizeof(U32) * U32(GpuVisibilityCounter::kCount));
+	stage1Mem.m_counters = allocateStructuredBuffer<U32>(U32(GpuVisibilityCounter::kCount));
 	if(in.m_limitMemory)
 	{
 		PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
@@ -380,21 +378,21 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 	else
 	{
-		stage1Mem.m_visibleRenderables = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables);
-		stage1Mem.m_visibleMeshlets = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets);
+		stage1Mem.m_visibleRenderables = allocateStructuredBuffer<GpuVisibilityVisibleRenderableDesc>(limits.m_maxVisibleLegacyRenderables);
+		stage1Mem.m_visibleMeshlets = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(limits.m_maxVisibleMeshlets);
 	}
-	stage1Mem.m_renderablePrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_meshletPrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::kCount));
+	stage1Mem.m_renderablePrefixSums = allocateStructuredBuffer<U32>(bucketCount);
+	stage1Mem.m_meshletPrefixSums = allocateStructuredBuffer<U32>(bucketCount);
+	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateStructuredBuffer<DispatchIndirectArgs>(U32(GpuVisibilityIndirectDispatches::kCount));
 
 	if(in.m_gatherAabbIndices)
 	{
-		stage1Mem.m_visibleAabbIndices = allocateTransientGpuMem(sizeof(U32) * buckets.getBucketsActiveUserCount(in.m_technique));
+		stage1Mem.m_visibleAabbIndices = allocateStructuredBuffer<U32>(buckets.getBucketsActiveUserCount(in.m_technique));
 	}
 
 	if(in.m_hashVisibles)
 	{
-		stage1Mem.m_hash = allocateTransientGpuMem(sizeof(GpuVisibilityHash));
+		stage1Mem.m_hash = allocateStructuredBuffer<GpuVisibilityHash>(1);
 	}
 
 	// Allocate memory for stage 2
@@ -436,47 +434,48 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		}
 		else
 		{
-			stage2Mem.m_legacy.m_instanceRateRenderables = allocateTransientGpuMem(sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables);
-			stage2Mem.m_legacy.m_drawIndexedIndirectArgs =
-				allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_instanceRateRenderables = allocateStructuredBuffer<UVec4>(limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_drawIndexedIndirectArgs = allocateStructuredBuffer<DrawIndexedIndirectArgs>(limits.m_maxVisibleLegacyRenderables);
 		}
 
-		stage2Mem.m_legacy.m_mdiDrawCounts = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+		stage2Mem.m_legacy.m_mdiDrawCounts = allocateStructuredBuffer<U32>(bucketCount);
 	}
 
 	if(bMeshletRendering)
 	{
 		if(bHwMeshletRendering)
 		{
-			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
 		}
 		else
 		{
-			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
 		}
 
-		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		const U32 newCount = limits.m_maxVisibleMeshlets;
 		if(in.m_limitMemory)
 		{
-			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
-			stage2Mem.m_meshlet.m_meshletInstances = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newRange);
+			ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
+			stage2Mem.m_meshlet.m_meshletInstances =
+				BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
 		}
 		else
 		{
-			stage2Mem.m_meshlet.m_meshletInstances = allocateTransientGpuMem(newRange);
+			stage2Mem.m_meshlet.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
 		}
 
 		if(bStoreMeshletsFailedHzb)
 		{
-			const PtrSize newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
+			const U32 newCount = limits.m_maxVisibleMeshlets;
 			if(in.m_limitMemory)
 			{
-				ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
-				stage2Mem.m_meshlet.m_meshletsFailedHzb = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newRange);
+				ANKI_ASSERT(newCount * sizeof(GpuVisibilityVisibleMeshletDesc) <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
+				stage2Mem.m_meshlet.m_meshletsFailedHzb =
+					BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newCount * sizeof(GpuVisibilityVisibleMeshletDesc));
 			}
 			else
 			{
-				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateTransientGpuMem(newRange);
+				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateStructuredBuffer<GpuVisibilityVisibleMeshletDesc>(newCount);
 			}
 		}
 	}
@@ -495,22 +494,23 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	{
 		if(bHwMeshletRendering)
 		{
-			stage3Mem.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+			stage3Mem.m_dispatchMeshIndirectArgs = allocateStructuredBuffer<DispatchIndirectArgs>(bucketCount);
 		}
 		else
 		{
-			stage3Mem.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+			stage3Mem.m_indirectDrawArgs = allocateStructuredBuffer<DrawIndirectArgs>(bucketCount);
 		}
 
-		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		const U32 newCount = limits.m_maxVisibleMeshlets;
 		if(in.m_limitMemory)
 		{
-			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
-			stage3Mem.m_meshletInstances = BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newRange);
+			ANKI_ASSERT(newCount * sizeof(GpuSceneMeshletInstance) <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
+			stage3Mem.m_meshletInstances =
+				BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newCount * sizeof(GpuSceneMeshletInstance));
 		}
 		else
 		{
-			stage3Mem.m_meshletInstances = allocateTransientGpuMem(newRange);
+			stage3Mem.m_meshletInstances = allocateStructuredBuffer<GpuSceneMeshletInstance>(newCount);
 		}
 	}
 
@@ -991,7 +991,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 	}
 
 	// Allocate memory for the result
-	out.m_visiblesBuffer = allocateTransientGpuMem((objCount + 1) * sizeof(U32));
+	out.m_visiblesBuffer = allocateStructuredBuffer<U32>(objCount + 1);
 	out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
 
 	// Create the renderpass
@@ -1093,12 +1093,12 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 	// Allocate the transient buffers
 	const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
 
-	out.m_instancesBuffer = allocateTransientGpuMem(aabbCount * sizeof(AccelerationStructureInstance));
+	out.m_instancesBuffer = allocateStructuredBuffer<AccelerationStructureInstance>(aabbCount);
 	out.m_someBufferHandle = rgraph.importBuffer(out.m_instancesBuffer, BufferUsageBit::kStorageComputeWrite);
 
-	out.m_renderableIndicesBuffer = allocateTransientGpuMem((aabbCount + 1) * sizeof(U32));
+	out.m_renderableIndicesBuffer = allocateStructuredBuffer<U32>(aabbCount + 1);
 
-	const BufferView zeroInstancesDispatchArgsBuff = allocateTransientGpuMem(sizeof(DispatchIndirectArgs));
+	const BufferView zeroInstancesDispatchArgsBuff = allocateStructuredBuffer<DispatchIndirectArgs>(1);
 
 	// Create vis pass
 	{

+ 0 - 5
AnKi/Resource/TransferGpuAllocator.h

@@ -154,11 +154,6 @@ private:
 
 		// The rest of the functions implement the StackAllocatorBuilder TInterface.
 
-		constexpr PtrSize getMaxAlignment()
-		{
-			return kGpuBufferAlignment;
-		}
-
 		constexpr PtrSize getInitialChunkSize() const
 		{
 			return kChunkInitialSize;

+ 1 - 1
AnKi/Scene/SceneGraph.cpp

@@ -104,7 +104,7 @@ Error SceneGraph::init(AllocAlignedCallback allocCallback, void* allocCallbackDa
 {
 	SceneMemoryPool::allocateSingleton(allocCallback, allocCallbackData);
 
-	m_framePool.init(allocCallback, allocCallbackData, 1_MB, 2.0, 0, true, ANKI_SAFE_ALIGNMENT, "SceneGraphFramePool");
+	m_framePool.init(allocCallback, allocCallbackData, 1_MB, 2.0, 0, true, "SceneGraphFramePool");
 
 	// Init the default main camera
 	ANKI_CHECK(newSceneNode<SceneNode>("mainCamera", m_defaultMainCam));

+ 2 - 3
AnKi/Util/MemoryPool.cpp

@@ -276,15 +276,13 @@ void StackMemoryPool::StackAllocatorBuilderInterface::recycleChunk([[maybe_unuse
 }
 
 void StackMemoryPool::init(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale, PtrSize nextChunkBias,
-						   Bool ignoreDeallocationErrors, U32 alignmentBytes, const Char* name)
+						   Bool ignoreDeallocationErrors, const Char* name)
 {
 	ANKI_ASSERT(initialChunkSize > 0);
 	ANKI_ASSERT(nextChunkScale >= 1.0);
-	ANKI_ASSERT(alignmentBytes > 0 && alignmentBytes <= kMaxAlignment);
 	BaseMemoryPool::init(allocCb, allocCbUserData, name);
 
 	m_builder.getInterface().m_parent = this;
-	m_builder.getInterface().m_alignmentBytes = alignmentBytes;
 	m_builder.getInterface().m_ignoreDeallocationErrors = ignoreDeallocationErrors;
 	m_builder.getInterface().m_initialChunkSize = initialChunkSize;
 	m_builder.getInterface().m_nextChunkScale = nextChunkScale;
@@ -301,6 +299,7 @@ void StackMemoryPool::destroy()
 void* StackMemoryPool::allocate(PtrSize size, PtrSize alignment)
 {
 	ANKI_ASSERT(size > 0);
+	ANKI_ASSERT(alignment > 0 && alignment <= kMaxAlignment);
 
 	Chunk* chunk;
 	PtrSize offset;

+ 4 - 15
AnKi/Util/MemoryPool.h

@@ -195,11 +195,10 @@ public:
 
 	/// @see init
 	StackMemoryPool(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale = 2.0,
-					PtrSize nextChunkBias = 0, Bool ignoreDeallocationErrors = true, U32 alignmentBytes = ANKI_SAFE_ALIGNMENT,
-					const Char* name = nullptr)
+					PtrSize nextChunkBias = 0, Bool ignoreDeallocationErrors = true, const Char* name = nullptr)
 		: StackMemoryPool()
 	{
-		init(allocCb, allocCbUserData, initialChunkSize, nextChunkScale, nextChunkBias, ignoreDeallocationErrors, alignmentBytes, name);
+		init(allocCb, allocCbUserData, initialChunkSize, nextChunkScale, nextChunkBias, ignoreDeallocationErrors, name);
 	}
 
 	/// Destroy
@@ -214,12 +213,10 @@ public:
 	/// @param initialChunkSize The size of the first chunk.
 	/// @param nextChunkScale Value that controls the next chunk.
 	/// @param nextChunkBias Value that controls the next chunk.
-	/// @param ignoreDeallocationErrors Method free() may fail if the ptr is not in the top of the stack. Set that to
-	///        true to suppress such errors.
-	/// @param alignmentBytes The maximum supported alignment for returned memory.
+	/// @param ignoreDeallocationErrors Method free() may fail if the ptr is not in the top of the stack. Set that to true to suppress such errors.
 	/// @param name An optional name.
 	void init(AllocAlignedCallback allocCb, void* allocCbUserData, PtrSize initialChunkSize, F64 nextChunkScale = 2.0, PtrSize nextChunkBias = 0,
-			  Bool ignoreDeallocationErrors = true, U32 alignmentBytes = ANKI_SAFE_ALIGNMENT, const Char* name = nullptr);
+			  Bool ignoreDeallocationErrors = true, const Char* name = nullptr);
 
 	/// Manual destroy. The destructor calls that as well.
 	void destroy();
@@ -274,8 +271,6 @@ private:
 	public:
 		StackMemoryPool* m_parent = nullptr;
 
-		PtrSize m_alignmentBytes = 0;
-
 		Bool m_ignoreDeallocationErrors = false;
 
 		PtrSize m_initialChunkSize = 0;
@@ -286,12 +281,6 @@ private:
 
 		// The rest of the functions implement the StackAllocatorBuilder TInterface.
 
-		PtrSize getMaxAlignment() const
-		{
-			ANKI_ASSERT(m_alignmentBytes > 0);
-			return m_alignmentBytes;
-		}
-
 		PtrSize getInitialChunkSize() const
 		{
 			ANKI_ASSERT(m_initialChunkSize > 0);
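
The CPU-side StackMemoryPool gets the same treatment: the pool-wide alignmentBytes parameter is gone and allocate() validates its per-call alignment instead. A sketch under those assumptions:

#include <AnKi/Util/MemoryPool.h>

using namespace anki;

void cpuPoolExample(AllocAlignedCallback allocCb, void* userData)
{
	StackMemoryPool pool(allocCb, userData, 1_MB); // no alignment argument anymore
	void* p = pool.allocate(128, 16); // alignment is decided here, per call
	pool.free(p);
}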

+ 1 - 2
AnKi/Util/StackAllocatorBuilder.h

@@ -22,7 +22,6 @@ namespace anki {
 ///                @endcode
 /// @tparam TInterface This is the type of the interface that contains various info. Should have the following members:
 ///                    @code
-///                    U32 getMaxAlignment();
 ///                    PtrSize getInitialChunkSize();
 ///                    F64 getNextChunkGrowScale();
 ///                    PtrSize getNextChunkGrowBias();
@@ -52,7 +51,7 @@ public:
 
 	/// Allocate memory.
 	/// @param size The size to allocate.
-	/// @param alignment The alignment of the returned address.
+	/// @param alignment The alignment of the returned address. Can be anything, not only a power of two.
 	/// @param[out] chunk The chunk that the memory belongs to.
 	/// @param[out] offset The offset inside the chunk.
 	/// @note This is thread safe with itself.

+ 6 - 6
AnKi/Util/StackAllocatorBuilder.inl.h

@@ -38,12 +38,10 @@ void StackAllocatorBuilder<TChunk, TInterface, TLock>::destroy()
 }
 
 template<typename TChunk, typename TInterface, typename TLock>
-Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [[maybe_unused]] PtrSize alignment, TChunk*& chunk, PtrSize& offset)
+Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, PtrSize alignment, TChunk*& chunk, PtrSize& offset)
 {
-	ANKI_ASSERT(alignment <= m_interface.getMaxAlignment());
-
-	size = getAlignedRoundUp(m_interface.getMaxAlignment(), size);
 	ANKI_ASSERT(size > 0);
+	size += alignment;
 
 	chunk = nullptr;
 	offset = kMaxPtrSize;
@@ -73,7 +71,7 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 		}
 		else
 		{
-			// Need new chunk
+			// Need new chunk, create it and loop back
 
 			LockGuard<TLock> lock(m_lock);
 
@@ -98,7 +96,6 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 			}
 
 			nextChunkSize = max(size, nextChunkSize); // Can't have the allocation fail
-			alignRoundUp(m_interface.getMaxAlignment(), nextChunkSize); // Align again
 
 			TChunk* nextChunk;
 			if(crntChunk)
@@ -167,7 +164,10 @@ Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, [
 		}
 	}
 
+	alignRoundUp(alignment, offset);
+
 	ANKI_ASSERT(chunk && offset != kMaxPtrSize);
+	ANKI_ASSERT(offset + size <= chunk->m_chunkSize);
 	return Error::kNone;
 }
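
Instead of rounding every size up to a pool-wide maximum alignment, the builder now reserves size + alignment bytes and rounds the returned offset up afterwards, which also works for alignments that are not powers of two. A standalone sketch of why the final assert always holds:

#include <cassert>
#include <cstdint>

// If [offset, offset + size + alignment) fits in the chunk, rounding offset
// up to the next multiple of alignment advances it by at most alignment - 1
// bytes, so the aligned allocation of size bytes still fits.
uint64_t alignedOffset(uint64_t offset, uint64_t size, uint64_t alignment, uint64_t chunkSize)
{
	assert(alignment > 0);
	assert(offset + size + alignment <= chunkSize); // what the builder reserved
	const uint64_t aligned = (offset + alignment - 1) / alignment * alignment; // any alignment, not only powers of two
	assert(aligned + size <= chunkSize); // mirrors the builder's final ANKI_ASSERT
	return aligned;
}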
 

+ 38 - 6
Tests/Gr/Gr.cpp

@@ -196,11 +196,18 @@ ANKI_TEST(Gr, Bindings)
 
 	{
 		constexpr const char* kSrc = R"(
+struct Foo3
+{
+	float x;
+	float y;
+	float z;
+};
+
 StructuredBuffer<float4> g_structured : register(t0);
 Texture2D g_tex : register(t2);
 Buffer<float4> g_buff : register(t3);
 
-RWStructuredBuffer<float4> g_rwstructured : register(u0, space2);
+RWStructuredBuffer<Foo3> g_rwstructured : register(u0, space2);
 RWTexture2D<float4> g_rwtex[3] : register(u2);
 RWBuffer<float4> g_rwbuff : register(u7);
 
@@ -228,7 +235,12 @@ SamplerState g_sampler : register(s2);
 [numthreads(1, 1, 1)]
 void main()
 {
-	g_rwstructured[0] = g_structured[0] + g_structured[1];
+	float3 tmp = (g_structured[0] + g_structured[1]).xyz;
+	Foo3 tmp3 = {tmp.x, tmp.y, tmp.z};
+	g_rwstructured[0] = tmp3;
+	tmp *= 2.0f;
+	Foo3 tmp3_ = {tmp.x, tmp.y, tmp.z};
+	g_rwstructured[1] = tmp3_;
 
 	g_rwtex[0][uint2(0, 0)] = g_consts.m_val;
 
@@ -244,6 +256,26 @@ void main()
 	g_rwbuff[0] = g_buff[0];
 }
 )";
+		struct Foo
+		{
+			F32 x;
+			F32 y;
+			F32 z;
+
+			Foo() = default;
+
+			Foo(Vec4 v)
+				: x(v.x())
+				, y(v.y())
+				, z(v.z())
+			{
+			}
+
+			Bool operator==(const Foo& b) const
+			{
+				return x == b.x && y == b.y && z == b.z;
+			}
+		};
 
 		TextureInitInfo texInit;
 		texInit.m_width = texInit.m_height = 1;
@@ -258,14 +290,14 @@ void main()
 		const Vec4 kMagicVec(1.0f, 2.0f, 3.0f, 4.0f);
 		const Vec4 kInvalidVec(1.0f, 2.0f, 3.0f, 4.0f);
 
-		const Array<Vec4, 2> data = {kMagicVec, kMagicVec};
+		const Array<Vec4, 2> data = {kMagicVec, kMagicVec * 2.0f};
 		BufferPtr structured = createBuffer(BufferUsageBit::kAllStorage, ConstWeakArray<Vec4>(data), "structured");
 
 		texInit.m_usage = TextureUsageBit::kSampledCompute | TextureUsageBit::kTransferDestination;
 		TexturePtr tex = createTexture2d(texInit, kMagicVec * 2.0f);
 
 		BufferPtr buff = createBuffer(BufferUsageBit::kAllTexel, kMagicVec * 2.0f, 1, "buff");
-		BufferPtr rwstructured = createBuffer(BufferUsageBit::kAllStorage, kInvalidVec, 1, "rwstructured");
+		BufferPtr rwstructured = createBuffer(BufferUsageBit::kAllStorage, Foo(kInvalidVec), 2, "rwstructured");
 		BufferPtr rwbuff = createBuffer(BufferUsageBit::kAllTexel, kInvalidVec, 1, "rwbuff");
 
 		Array<TexturePtr, 3> rwtex;
@@ -310,8 +342,8 @@ void main()
 		signalFence->clientWait(kMaxSecond);
 
 		// Check
-		validateBuffer(rwstructured, kMagicVec + kMagicVec);
-		validateBuffer(rwbuff, kMagicVec * 2.0f);
+		validateBuffer(rwstructured, ConstWeakArray(Array<Foo, 2>{kMagicVec + kMagicVec * 2.0f, (kMagicVec + kMagicVec * 2.0f) * 2.0f}));
+		validateBuffer(rwbuff, ConstWeakArray(Array<Vec4, 1>{kMagicVec * 2.0f}));
 	}
 
 	commonDestroy();

+ 7 - 4
Tests/Gr/GrCommon.h

@@ -184,7 +184,8 @@ inline void readBuffer(BufferPtr buff, DynamicArray<T>& out)
 		buffInit.m_usage = BufferUsageBit::kTransferDestination;
 		tmpBuff = GrManager::getSingleton().newBuffer(buffInit);
 
-		CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo(CommandBufferFlag::kSmallBatch));
+		CommandBufferPtr cmdb =
+			GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo(CommandBufferFlag::kGeneralWork | CommandBufferFlag::kSmallBatch));
 		cmdb->copyBufferToBuffer(BufferView(buff.get()), BufferView(tmpBuff.get()));
 		cmdb->endRecording();
 
@@ -202,14 +203,16 @@ inline void readBuffer(BufferPtr buff, DynamicArray<T>& out)
 }
 
 template<typename T>
-inline void validateBuffer(BufferPtr buff, T value)
+inline void validateBuffer(BufferPtr buff, ConstWeakArray<T> values)
 {
 	DynamicArray<T> cpuBuff;
 	readBuffer<T>(buff, cpuBuff);
 
-	for(const T& x : cpuBuff)
+	ANKI_ASSERT(values.getSize() == cpuBuff.getSize());
+
+	for(U32 i = 0; i < values.getSize(); ++i)
 	{
-		ANKI_TEST_EXPECT_EQ(x, value);
+		ANKI_TEST_EXPECT_EQ(cpuBuff[i], values[i]);
 	}
 }
 

+ 1 - 1
Tests/Gr/GrWorkGraphs.cpp

@@ -145,7 +145,7 @@ void thirdNode([MaxRecords(32)] GroupNodeInputRecords<ThirdNodeRecord> inp, uint
 		GrManager::getSingleton().submit(cmdb.get(), {}, &fence);
 		fence->clientWait(kMaxSecond);
 
-		validateBuffer(counterBuff, 122880);
+		validateBuffer(counterBuff, ConstWeakArray(Array<U32, 1>{122880}));
 	}
 
 	commonDestroy();