
More refactoring

Panagiotis Christopoulos Charitos · 1 year ago · commit 2d5ecdb36b

+ 6 - 3
AnKi/Core/GpuMemory/GpuReadbackMemoryPool.cpp

@@ -17,17 +17,20 @@ GpuReadbackMemoryPool::GpuReadbackMemoryPool()
 
 	m_pool.init(buffUsage, classes, classes.getBack(), "GpuReadback", false, mapAccess);
 
-	m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
+	if(!GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+	{
+		m_structuredBufferAlignment = GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
+	}
 }
 
 GpuReadbackMemoryPool::~GpuReadbackMemoryPool()
 {
 }
 
-GpuReadbackMemoryAllocation GpuReadbackMemoryPool::allocate(PtrSize size)
+GpuReadbackMemoryAllocation GpuReadbackMemoryPool::allocate(PtrSize size, U32 alignment)
 {
 	GpuReadbackMemoryAllocation out;
-	m_pool.allocate(size, m_alignment, out.m_token);
+	m_pool.allocate(size, alignment, out.m_token);
 	out.m_buffer = &m_pool.getGpuBuffer();
 	out.m_mappedMemory = static_cast<U8*>(m_pool.getGpuBufferMappedMemory()) + out.m_token.m_offset;
 	return out;

+ 9 - 2
AnKi/Core/GpuMemory/GpuReadbackMemoryPool.h

@@ -84,7 +84,14 @@ class GpuReadbackMemoryPool : public MakeSingleton<GpuReadbackMemoryPool>
 	friend class MakeSingleton;
 
 public:
-	GpuReadbackMemoryAllocation allocate(PtrSize size);
+	GpuReadbackMemoryAllocation allocate(PtrSize size, U32 alignment);
+
+	template<typename T>
+	GpuReadbackMemoryAllocation allocateStructuredBuffer(U32 count)
+	{
+		const U32 alignment = (m_structuredBufferAlignment == kMaxU32) ? sizeof(T) : m_structuredBufferAlignment;
+		return allocate(sizeof(T) * count, alignment);
+	}
 
 	void deferredFree(GpuReadbackMemoryAllocation& allocation);
 
@@ -92,7 +99,7 @@ public:
 
 private:
 	SegregatedListsGpuMemoryPool m_pool;
-	U32 m_alignment = 0;
+	U32 m_structuredBufferAlignment = kMaxU32;
 
 	GpuReadbackMemoryPool();
 

+ 8 - 7
AnKi/Core/GpuMemory/GpuSceneBuffer.cpp

@@ -130,15 +130,16 @@ void GpuSceneMicroPatcher::patchGpuScene(CommandBuffer& cmdb)
 	ANKI_TRACE_INC_COUNTER(GpuSceneMicroPatches, m_crntFramePatchHeaders.getSize());
 	ANKI_TRACE_INC_COUNTER(GpuSceneMicroPatchUploadData, m_crntFramePatchData.getSizeInBytes());
 
-	void* mapped;
-	const RebarAllocation headersToken = RebarTransientMemoryPool::getSingleton().allocateFrame(m_crntFramePatchHeaders.getSizeInBytes(), mapped);
-	memcpy(mapped, &m_crntFramePatchHeaders[0], m_crntFramePatchHeaders.getSizeInBytes());
+	WeakArray<PatchHeader> mapped;
+	const BufferView headersBuff = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(m_crntFramePatchHeaders.getSize(), mapped);
+	memcpy(mapped.getBegin(), &m_crntFramePatchHeaders[0], m_crntFramePatchHeaders.getSizeInBytes());
 
-	const RebarAllocation dataToken = RebarTransientMemoryPool::getSingleton().allocateFrame(m_crntFramePatchData.getSizeInBytes(), mapped);
-	memcpy(mapped, &m_crntFramePatchData[0], m_crntFramePatchData.getSizeInBytes());
+	WeakArray<U32> mapped2;
+	const BufferView dataBuff = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(m_crntFramePatchData.getSize(), mapped2);
+	memcpy(mapped2.getBegin(), &m_crntFramePatchData[0], m_crntFramePatchData.getSizeInBytes());
 
-	cmdb.bindSrv(0, 0, headersToken);
-	cmdb.bindSrv(1, 0, dataToken);
+	cmdb.bindSrv(0, 0, headersBuff);
+	cmdb.bindSrv(1, 0, dataBuff);
 	cmdb.bindUav(0, 0, BufferView(&GpuSceneBuffer::getSingleton().getBuffer()));
 
 	cmdb.bindShaderProgram(m_grProgram.get());

+ 11 - 0
AnKi/Core/GpuMemory/GpuSceneBuffer.h

@@ -83,6 +83,17 @@ public:
 		return alloc;
 	}
 
+	template<typename T>
+	GpuSceneBufferAllocation allocateStructuredBuffer(U32 count)
+	{
+		const U32 alignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+								  ? sizeof(T)
+								  : GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
+		GpuSceneBufferAllocation alloc;
+		m_pool.allocate(count * sizeof(T), alignment, alloc.m_token);
+		return alloc;
+	}
+
 	void deferredFree(GpuSceneBufferAllocation& alloc)
 	{
 		m_pool.deferredFree(alloc.m_token);

+ 6 - 12
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -22,9 +22,8 @@ class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMe
 	friend class MakeSingleton;
 
 public:
-	BufferView allocate(PtrSize size, PtrSize alignment = 0)
+	BufferView allocate(PtrSize size, PtrSize alignment)
 	{
-		alignment = (alignment == 0) ? m_alignment : alignment;
 		PtrSize offset;
 		Buffer* buffer;
 		m_pool.allocate(size, alignment, offset, buffer);
@@ -46,20 +45,15 @@ public:
 
 private:
 	StackGpuMemoryPool m_pool;
-	U32 m_alignment = 0;
 	U32 m_frame = 0;
-	U32 m_structuredBufferAlignment = 0;
+	U32 m_structuredBufferAlignment = kMaxU32;
 
 	GpuVisibleTransientMemoryPool()
 	{
-		m_structuredBufferAlignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
-										  ? kMaxU32
-										  : GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
-
-		m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
+		if(!GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+		{
+			m_structuredBufferAlignment = GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
+		}
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllConstant | BufferUsageBit::kAllUav | BufferUsageBit::kAllSrv | BufferUsageBit::kIndirectDraw
 								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllCopy;

+ 11 - 21
AnKi/Core/GpuMemory/RebarTransientMemoryPool.cpp

@@ -36,28 +36,19 @@ void RebarTransientMemoryPool::init()
 
 	m_bufferSize = buffInit.m_size;
 
-	m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-	m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-	m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-
-	m_mappedMem = static_cast<U8*>(m_buffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite));
-}
-
-RebarAllocation RebarTransientMemoryPool::allocateFrame(PtrSize size, void*& mappedMem)
-{
-	RebarAllocation out = tryAllocateFrame(size, mappedMem);
-	if(!out.isValid()) [[unlikely]]
+	if(!GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
 	{
-		ANKI_CORE_LOGF("Out of ReBAR GPU memory");
+		m_structuredBufferAlignment = GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
 	}
 
-	return out;
+	m_mappedMem = static_cast<U8*>(m_buffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite));
 }
 
-RebarAllocation RebarTransientMemoryPool::tryAllocateFrame(PtrSize origSize, void*& mappedMem)
+BufferView RebarTransientMemoryPool::allocateInternal(PtrSize origSize, U32 alignment, void*& mappedMem)
 {
 	ANKI_ASSERT(origSize > 0);
-	const PtrSize size = getAlignedRoundUp(m_alignment, origSize);
+	ANKI_ASSERT(alignment > 0);
+	const PtrSize size = origSize + alignment;
 
 	// Try in a loop because we may end up with an allocation whose offset crosses the buffer's end
 	PtrSize offset;
@@ -65,17 +56,16 @@ RebarAllocation RebarTransientMemoryPool::tryAllocateFrame(PtrSize origSize, voi
 	do
 	{
 		offset = m_offset.fetchAdd(size) % m_bufferSize;
-		const PtrSize end = (offset + origSize) % (m_bufferSize + 1);
+		const PtrSize end = (offset + size) % (m_bufferSize + 1);
 
 		done = offset < end;
 	} while(!done);
 
-	mappedMem = m_mappedMem + offset;
-	RebarAllocation out;
-	out.m_offset = offset;
-	out.m_range = origSize;
+	const PtrSize alignedOffset = getAlignedRoundUp(alignment, offset);
+	ANKI_ASSERT(alignedOffset + origSize <= offset + size);
 
-	return out;
+	mappedMem = m_mappedMem + alignedOffset;
+	return BufferView(m_buffer.get(), alignedOffset, origSize);
 }
 
 void RebarTransientMemoryPool::endFrame()
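
A note on allocateInternal above: rather than rounding every request up to one global alignment, it over-allocates by the caller's alignment and aligns the returned offset afterwards, so the assert can never fire. A standalone sketch of the lock-free ring allocation, assuming uint64_t stands in for AnKi's PtrSize and std::atomic for AnKi's Atomic:

#include <atomic>
#include <cassert>
#include <cstdint>

using PtrSize = uint64_t; // assumption: stands in for AnKi's PtrSize

PtrSize getAlignedRoundUp(PtrSize alignment, PtrSize value)
{
	return ((value + alignment - 1) / alignment) * alignment;
}

std::atomic<PtrSize> g_offset{0};
constexpr PtrSize kBufferSize = 16 * 1024 * 1024;

// Returns an aligned offset whose [offset, offset + origSize) fits in the ring.
PtrSize allocateRing(PtrSize origSize, PtrSize alignment)
{
	assert(origSize > 0 && alignment > 0);
	const PtrSize size = origSize + alignment; // over-allocate so the align-up always fits

	PtrSize offset;
	bool done;
	do
	{
		offset = g_offset.fetch_add(size) % kBufferSize;
		const PtrSize end = (offset + size) % (kBufferSize + 1);
		done = offset < end; // retry if the allocation wrapped past the buffer's end
	} while(!done);

	const PtrSize alignedOffset = getAlignedRoundUp(alignment, offset);
	assert(alignedOffset + origSize <= offset + size);
	return alignedOffset;
}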

+ 28 - 63
AnKi/Core/GpuMemory/RebarTransientMemoryPool.h

@@ -13,47 +13,6 @@ namespace anki {
 /// @addtogroup core
 /// @{
 
-/// Token that gets returned when requesting for memory to write to a resource.
-class RebarAllocation
-{
-	friend class RebarTransientMemoryPool;
-
-public:
-	RebarAllocation() = default;
-
-	~RebarAllocation() = default;
-
-	Bool operator==(const RebarAllocation& b) const
-	{
-		return m_offset == b.m_offset && m_range == b.m_range;
-	}
-
-	Bool isValid() const
-	{
-		return m_range != 0;
-	}
-
-	PtrSize getOffset() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_offset;
-	}
-
-	PtrSize getRange() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_range;
-	}
-
-	Buffer& getBuffer() const;
-
-	operator BufferView() const;
-
-private:
-	PtrSize m_offset = kMaxPtrSize;
-	PtrSize m_range = 0;
-};
-
 /// Manages staging GPU memory.
 class RebarTransientMemoryPool : public MakeSingleton<RebarTransientMemoryPool>
 {
@@ -70,28 +29,43 @@ public:
 	void endFrame();
 
 	/// Allocate staging memory for various operations. The memory will be reclaimed at the beginning of the N-(kMaxFramesInFlight-1) frame.
-	RebarAllocation allocateFrame(PtrSize size, void*& mappedMem);
-
 	template<typename T>
-	RebarAllocation allocateFrame(U32 count, T*& mappedMem)
+	BufferView allocate(PtrSize size, U32 alignment, T*& mappedMem)
 	{
 		void* mem;
-		const RebarAllocation out = allocateFrame(count * sizeof(T), mem);
+		const BufferView out = allocateInternal(size, alignment, mem);
 		mappedMem = static_cast<T*>(mem);
 		return out;
 	}
 
+	/// @copydoc allocate
 	template<typename T>
-	RebarAllocation allocateFrame(U32 count, WeakArray<T>& arr)
+	BufferView allocateConstantBuffer(T*& mappedMem)
 	{
-		void* mem;
-		const RebarAllocation out = allocateFrame(count * sizeof(T), mem);
-		arr = {static_cast<T*>(mem), count};
+		return allocate(sizeof(T), GrManager::getSingleton().getDeviceCapabilities().m_constantBufferBindOffsetAlignment, mappedMem);
+	}
+
+	/// @copydoc allocate
+	template<typename T>
+	BufferView allocateStructuredBuffer(U32 count, WeakArray<T>& arr)
+	{
+		T* mem;
+		const U32 alignment = (m_structuredBufferAlignment == kMaxU32) ? sizeof(T) : m_structuredBufferAlignment;
+		const BufferView out = allocate(count * sizeof(T), alignment, mem);
+		arr = {mem, count};
 		return out;
 	}
 
-	/// Allocate staging memory for various operations. The memory will be reclaimed at the begining of the N-(kMaxFramesInFlight-1) frame.
-	RebarAllocation tryAllocateFrame(PtrSize size, void*& mappedMem);
+	/// @copydoc allocate
+	template<typename T>
+	BufferView allocateCopyBuffer(U32 count, WeakArray<T>& arr)
+	{
+		T* mem;
+		const U32 alignment = sizeof(U32);
+		const BufferView out = allocate(sizeof(T) * count, alignment, mem);
+		arr = {mem, count};
+		return out;
+	}
 
 	ANKI_PURE Buffer& getBuffer() const
 	{
@@ -109,23 +83,14 @@ private:
 	PtrSize m_bufferSize = 0; ///< Cache it.
 	Atomic<PtrSize> m_offset = {0};
 	PtrSize m_previousFrameEndOffset = 0;
-	U32 m_alignment = 0;
+	U32 m_structuredBufferAlignment = kMaxU32;
 
 	RebarTransientMemoryPool() = default;
 
 	~RebarTransientMemoryPool();
-};
 
-inline Buffer& RebarAllocation::getBuffer() const
-{
-	return RebarTransientMemoryPool::getSingleton().getBuffer();
-}
-
-inline RebarAllocation::operator BufferView() const
-{
-	ANKI_ASSERT(isValid());
-	return {&RebarTransientMemoryPool::getSingleton().getBuffer(), m_offset, m_range};
-}
+	BufferView allocateInternal(PtrSize size, U32 alignment, void*& mappedMem);
+};
 /// @}
 
 } // end namespace anki
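
For reference, the three typed entry points as this commit uses them elsewhere (excerpt-style, not standalone; the types come from the call sites in Renderer.cpp, GpuVisibility.cpp and ParticleEmitterComponent.cpp):

// Constant buffer: a single T, aligned for constant-buffer binding.
GlobalRendererConstants* consts;
const BufferView constsView = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(consts);

// Structured buffer: count elements, returned as a bounds-aware WeakArray.
WeakArray<U32> counters;
const BufferView countersView = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, counters);
counters[0] = 0;

// Copy buffer: staging for vertex/index uploads, element-size alignment only.
WeakArray<U16> indices;
const BufferView indicesView = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(6, indices);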

+ 6 - 16
AnKi/Gr/Common.h

@@ -63,7 +63,7 @@ constexpr U32 kMaxBindingsPerRegisterSpace = 32;
 constexpr U32 kMaxFramesInFlight = 3; ///< Triple buffering.
 constexpr U32 kMaxGrObjectNameLength = 61;
 constexpr U32 kMaxBindlessTextures = 512;
-constexpr U32 kMaxFastConstantsSize = 128; ///< Thanks AMD!!
+constexpr U32 kMaxFastConstantsSize = 128; ///< Push/root constants size. Thanks AMD!!
 
 /// The number of commands in a command buffer that make it a small batch command buffer.
 constexpr U32 kCommandBufferSmallBatchMaxCommands = 100;
@@ -90,7 +90,6 @@ ANKI_GR_CLASS(Texture)
 ANKI_GR_CLASS(Sampler)
 ANKI_GR_CLASS(CommandBuffer)
 ANKI_GR_CLASS(Shader)
-ANKI_GR_CLASS(Framebuffer)
 ANKI_GR_CLASS(OcclusionQuery)
 ANKI_GR_CLASS(TimestampQuery)
 ANKI_GR_CLASS(PipelineQuery)
@@ -144,23 +143,14 @@ class GpuDeviceCapabilities
 {
 public:
 	/// The alignment of offsets when binding constant buffers.
-	U32 m_uniformBufferBindOffsetAlignment = kMaxU32;
+	U32 m_constantBufferBindOffsetAlignment = kMaxU32;
 
-	/// The max visible range of constant buffers inside the shaders.
-	PtrSize m_uniformBufferMaxRange = 0;
-
-	/// The alignment of offsets when bounding storage buffers.
-	U32 m_storageBufferBindOffsetAlignment = kMaxU32;
-
-	/// The max visible range of storage buffers inside the shaders.
-	PtrSize m_storageBufferMaxRange = 0;
+	/// The alignment of offsets when binding structured buffers.
+	U32 m_structuredBufferBindOffsetAlignment = kMaxU32;
 
 	/// The alignment of offsets when binding texture buffers.
 	U32 m_texelBufferBindOffsetAlignment = kMaxU32;
 
-	/// The max visible range of texture buffers inside the shaders.
-	PtrSize m_textureBufferMaxRange = 0;
-
 	/// Max push/root constant size.
 	PtrSize m_fastConstantsSize = 128;
 
@@ -177,10 +167,10 @@ public:
 	U32 m_shaderGroupHandleSize = 0;
 
 	/// Min wave size of the GPU.
-	U32 m_minSubgroupSize = 0;
+	U32 m_minWaveSize = 0;
 
 	/// Max wave size of the GPU.
-	U32 m_maxSubgroupSize = 0;
+	U32 m_maxWaveSize = 0;
 
 	/// Min size of a texel in the shading rate image.
 	U32 m_minShadingRateImageTexelSize = 0;

+ 14 - 17
AnKi/Gr/D3D/D3DGrManager.cpp

@@ -328,35 +328,35 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 	{
 	case 0x13B5:
 		m_capabilities.m_gpuVendor = GpuVendor::kArm;
-		m_capabilities.m_minSubgroupSize = 16;
-		m_capabilities.m_maxSubgroupSize = 16;
+		m_capabilities.m_minWaveSize = 16;
+		m_capabilities.m_maxWaveSize = 16;
 		break;
 	case 0x10DE:
 		m_capabilities.m_gpuVendor = GpuVendor::kNvidia;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x1002:
 	case 0x1022:
 		m_capabilities.m_gpuVendor = GpuVendor::kAMD;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 64;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 64;
 		break;
 	case 0x8086:
 		m_capabilities.m_gpuVendor = GpuVendor::kIntel;
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x5143:
 		m_capabilities.m_gpuVendor = GpuVendor::kQualcomm;
-		m_capabilities.m_minSubgroupSize = 64;
-		m_capabilities.m_maxSubgroupSize = 128;
+		m_capabilities.m_minWaveSize = 64;
+		m_capabilities.m_maxWaveSize = 128;
 		break;
 	default:
 		m_capabilities.m_gpuVendor = GpuVendor::kUnknown;
 		// Choose something really low
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 8;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 8;
 	}
 	ANKI_D3D_LOGI("Vendor identified as %s", &kGPUVendorStrings[m_capabilities.m_gpuVendor][0]);
 
@@ -431,13 +431,10 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 			ANKI_D3D_LOGW("ReBAR not supported");
 		}
 
-		m_capabilities.m_uniformBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
-		m_capabilities.m_uniformBufferMaxRange = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * D3D12_STANDARD_VECTOR_SIZE * sizeof(F32);
-		m_capabilities.m_storageBufferBindOffsetAlignment = D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT;
+		m_capabilities.m_constantBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
+		m_capabilities.m_structuredBufferBindOffsetAlignment = 0; // Not applicable on D3D
 		m_capabilities.m_structuredBufferNaturalAlignment = true;
-		m_capabilities.m_storageBufferMaxRange = 1 << D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
 		m_capabilities.m_texelBufferBindOffsetAlignment = 32;
-		m_capabilities.m_textureBufferMaxRange = kMaxU32; // ?
 		m_capabilities.m_fastConstantsSize = kMaxFastConstantsSize;
 		m_capabilities.m_computeSharedMemorySize = D3D12_CS_TGSM_REGISTER_COUNT * sizeof(F32);
 		m_capabilities.m_accelerationStructureBuildScratchOffsetAlignment = 32; // ?

+ 0 - 1
AnKi/Gr/GrManager.h

@@ -75,7 +75,6 @@ public:
 	[[nodiscard]] ShaderPtr newShader(const ShaderInitInfo& init);
 	[[nodiscard]] ShaderProgramPtr newShaderProgram(const ShaderProgramInitInfo& init);
 	[[nodiscard]] CommandBufferPtr newCommandBuffer(const CommandBufferInitInfo& init);
-	[[nodiscard]] FramebufferPtr newFramebuffer(const FramebufferInitInfo& init);
 	[[nodiscard]] OcclusionQueryPtr newOcclusionQuery();
 	[[nodiscard]] TimestampQueryPtr newTimestampQuery();
 	[[nodiscard]] PipelineQueryPtr newPipelineQuery(const PipelineQueryInitInfo& inf);

+ 14 - 17
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -771,48 +771,45 @@ Error GrManagerImpl::initInstance()
 	{
 	case 0x13B5:
 		m_capabilities.m_gpuVendor = GpuVendor::kArm;
-		m_capabilities.m_minSubgroupSize = 16;
-		m_capabilities.m_maxSubgroupSize = 16;
+		m_capabilities.m_minWaveSize = 16;
+		m_capabilities.m_maxWaveSize = 16;
 		break;
 	case 0x10DE:
 		m_capabilities.m_gpuVendor = GpuVendor::kNvidia;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x1002:
 	case 0x1022:
 		m_capabilities.m_gpuVendor = GpuVendor::kAMD;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 64;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 64;
 		break;
 	case 0x8086:
 		m_capabilities.m_gpuVendor = GpuVendor::kIntel;
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x5143:
 		m_capabilities.m_gpuVendor = GpuVendor::kQualcomm;
-		m_capabilities.m_minSubgroupSize = 64;
-		m_capabilities.m_maxSubgroupSize = 128;
+		m_capabilities.m_minWaveSize = 64;
+		m_capabilities.m_maxWaveSize = 128;
 		break;
 	default:
 		m_capabilities.m_gpuVendor = GpuVendor::kUnknown;
 		// Choose something really low
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 8;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 8;
 	}
 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName, &kGPUVendorStrings[m_capabilities.m_gpuVendor][0]);
 
 	// Set limits
-	m_capabilities.m_uniformBufferBindOffsetAlignment =
+	m_capabilities.m_constantBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minUniformBufferOffsetAlignment));
-	m_capabilities.m_uniformBufferMaxRange = m_devProps.properties.limits.maxUniformBufferRange;
-	m_capabilities.m_storageBufferBindOffsetAlignment =
+	m_capabilities.m_structuredBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minStorageBufferOffsetAlignment));
 	m_capabilities.m_structuredBufferNaturalAlignment = false;
-	m_capabilities.m_storageBufferMaxRange = m_devProps.properties.limits.maxStorageBufferRange;
 	m_capabilities.m_texelBufferBindOffsetAlignment = max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
-	m_capabilities.m_textureBufferMaxRange = kMaxU32;
 	m_capabilities.m_computeSharedMemorySize = m_devProps.properties.limits.maxComputeSharedMemorySize;
 	m_capabilities.m_maxDrawIndirectCount = m_devProps.properties.limits.maxDrawIndirectCount;
 

+ 3 - 1
AnKi/Renderer/AccelerationStructureBuilder.cpp

@@ -51,7 +51,9 @@ void AccelerationStructureBuilder::populateRenderGraph(RenderingContext& ctx)
 	{
 		RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
-		const BufferView scratchBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(m_runCtx.m_tlas->getBuildScratchBufferSize());
+		const BufferView scratchBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(
+			m_runCtx.m_tlas->getBuildScratchBufferSize(),
+			GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		m_runCtx.m_tlasHandle = rgraph.importAccelerationStructure(m_runCtx.m_tlas.get(), AccelerationStructureUsageBit::kNone);
 

+ 4 - 5
AnKi/Renderer/LensFlare.cpp

@@ -75,11 +75,11 @@ void LensFlare::populateRenderGraph(RenderingContext& ctx)
 		cmdb.setFastConstants(&ctx.m_matrices.m_viewProjectionJitter, sizeof(ctx.m_matrices.m_viewProjectionJitter));
 
 		// Write flare info
-		Vec4* flarePositions = allocateAndBindSrvStructuredBuffer<Vec4>(cmdb, 0, 0, flareCount);
+		WeakArray<Vec4> flarePositions = allocateAndBindSrvStructuredBuffer<Vec4>(cmdb, 0, 0, flareCount);
+		U32 count = 0;
 		for(const LensFlareComponent& comp : SceneGraph::getSingleton().getComponentArrays().getLensFlares())
 		{
-			*flarePositions = Vec4(comp.getWorldPosition(), 1.0f);
-			++flarePositions;
+			flarePositions[count++] = Vec4(comp.getWorldPosition(), 1.0f);
 		}
 
 		rgraphCtx.bindUav(0, 0, m_runCtx.m_indirectBuffHandle);
@@ -122,8 +122,7 @@ void LensFlare::runDrawFlares(const RenderingContext& ctx, CommandBuffer& cmdb)
 		U32 spritesCount = max<U32>(1, m_maxSpritesPerFlare);
 
 		// Get uniform memory
-		LensFlareSprite* tmpSprites = allocateAndBindSrvStructuredBuffer<LensFlareSprite>(cmdb, 0, 0, spritesCount);
-		WeakArray<LensFlareSprite> sprites(tmpSprites, spritesCount);
+		WeakArray<LensFlareSprite> sprites = allocateAndBindSrvStructuredBuffer<LensFlareSprite>(cmdb, 0, 0, spritesCount);
 
 		// misc
 		Vec2 posNdc = posClip.xy() / posClip.w();

+ 5 - 4
AnKi/Renderer/PrimaryNonRenderableVisibility.cpp

@@ -81,9 +81,9 @@ void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 		{
 			// No objects, point to a buffer with zeros
 
-			void* mem;
-			RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(sizeof(U32), mem);
-			memset(mem, 0, sizeof(U32));
+			WeakArray<U32> mem;
+			const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, mem);
+			mem[0] = 0;
 
 			m_runCtx.m_visibleIndicesBuffers[type] = alloc;
 			m_runCtx.m_visibleIndicesHandles[type] = rgraph.importBuffer(m_runCtx.m_visibleIndicesBuffers[type], BufferUsageBit::kNone);
@@ -135,7 +135,8 @@ void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 				}
 
 				// Allocate feedback buffer for this frame
-				getRenderer().getReadbackManager().allocateData(m_readbacks[feedbackType], (objCount * 2 + 1) * sizeof(U32), in.m_cpuFeedbackBuffer);
+				in.m_cpuFeedbackBuffer =
+					getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_readbacks[feedbackType], objCount * 2 + 1);
 			}
 
 			GpuVisibilityNonRenderablesOutput out;

+ 1 - 1
AnKi/Renderer/Renderer.cpp

@@ -299,7 +299,7 @@ Error Renderer::populateRenderGraph(RenderingContext& ctx)
 
 	// Allocate global constants
 	GlobalRendererConstants* globalUnis;
-	ctx.m_globalRenderingConstantsBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(1, globalUnis);
+	ctx.m_globalRenderingConstantsBuffer = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(globalUnis);
 
 	// Import RTs first
 	m_downscaleBlur->importRenderTargets(ctx);

+ 11 - 11
AnKi/Renderer/RendererObject.h

@@ -65,30 +65,30 @@ protected:
 	static T* allocateAndBindConstants(CommandBuffer& cmdb, U32 reg, U32 space)
 	{
 		T* ptr;
-		const RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(1, ptr);
+		const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer<T>(ptr);
 		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(ptr)));
 		cmdb.bindConstantBuffer(reg, space, alloc);
 		return ptr;
 	}
 
 	template<typename T>
-	static T* allocateAndBindSrvStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
+	static WeakArray<T> allocateAndBindSrvStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
 	{
-		T* ptr;
-		const RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(count, ptr);
-		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(ptr)));
+		WeakArray<T> out;
+		const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count, out);
+		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(out.getBegin())));
 		cmdb.bindSrv(reg, space, alloc);
-		return ptr;
+		return out;
 	}
 
 	template<typename T>
-	static T* allocateAndBindUavStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
+	static WeakArray<T> allocateAndBindUavStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
 	{
-		T* ptr;
-		const RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(count, ptr);
-		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(ptr)));
+		WeakArray<T> out;
+		const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count, out);
+		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(out.getBegin())));
 		cmdb.bindUav(reg, space, alloc);
-		return ptr;
+		return out;
 	}
 
 	void registerDebugRenderTarget(CString rtName);
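
Returning WeakArray<T> instead of a bare T* lets call sites index instead of bumping a pointer (see the LensFlare.cpp and GpuVisibility.cpp hunks in this commit). A minimal sketch of the WeakArray idea, assuming only the members these call sites rely on:

#include <cassert>
#include <cstdint>

// Non-owning pointer plus element count, with bounds-checked indexing.
template<typename T>
class WeakArray
{
public:
	WeakArray() = default;

	WeakArray(T* mem, uint32_t count)
		: m_mem(mem)
		, m_count(count)
	{
	}

	T& operator[](uint32_t i)
	{
		assert(i < m_count); // the bare T* the old API returned could not check this
		return m_mem[i];
	}

	T* getBegin()
	{
		return m_mem;
	}

	uint32_t getSize() const
	{
		return m_count;
	}

private:
	T* m_mem = nullptr;
	uint32_t m_count = 0;
};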

+ 6 - 6
AnKi/Renderer/RtShadows.cpp

@@ -233,16 +233,16 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 	BufferView sbtBuffer;
 	{
 		// Allocate SBT
-		U8* sbtMem;
-		sbtBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(
-			(GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount() + 2) * m_sbtRecordSize, sbtMem);
+		WeakArray<U32> sbtMem;
+		sbtBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(
+			(GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount() + 2) * m_sbtRecordSize / sizeof(U32), sbtMem);
 		sbtHandle = rgraph.importBuffer(sbtBuffer, BufferUsageBit::kUavCompute);
 
 		// Write the first 2 entries of the SBT
 		ConstWeakArray<U8> shaderGroupHandles = m_rtLibraryGrProg->getShaderGroupHandles();
 		const U32 shaderHandleSize = GrManager::getSingleton().getDeviceCapabilities().m_shaderGroupHandleSize;
-		memcpy(sbtMem, &shaderGroupHandles[m_rayGenShaderGroupIdx * shaderHandleSize], shaderHandleSize);
-		memcpy(sbtMem + m_sbtRecordSize, &shaderGroupHandles[m_missShaderGroupIdx * shaderHandleSize], shaderHandleSize);
+		memcpy(&sbtMem[0], &shaderGroupHandles[m_rayGenShaderGroupIdx * shaderHandleSize], shaderHandleSize);
+		memcpy(&sbtMem[m_sbtRecordSize / sizeof(U32)], &shaderGroupHandles[m_missShaderGroupIdx * shaderHandleSize], shaderHandleSize);
 
 		// Create the pass
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtShadows build SBT");
@@ -304,7 +304,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 			// Allocate, set and bind global uniforms
 			{
 				MaterialGlobalConstants* globalConstants;
-				const RebarAllocation globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateFrame(1, globalConstants);
+				const BufferView globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(globalConstants);
 
 				memset(globalConstants, 0, sizeof(*globalConstants)); // Don't care for now
 

+ 1 - 1
AnKi/Renderer/Utils/Drawer.cpp

@@ -32,7 +32,7 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 	// Allocate, set and bind global uniforms
 	{
 		MaterialGlobalConstants* globalConstants;
-		const RebarAllocation globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateFrame(1, globalConstants);
+		const BufferView globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(globalConstants);
 
 		globalConstants->m_viewProjectionMatrix = args.m_viewProjectionMatrix;
 		globalConstants->m_previousViewProjectionMatrix = args.m_previousViewProjectionMatrix;

+ 20 - 14
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -267,7 +267,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			ANKI_RESOURCE_LOGE("GPU visibility went out of memory: %s", who.cstr());
 		}
 
-		getRenderer().getReadbackManager().allocateData(m_outOfMemoryReadback, sizeof(U32), m_outOfMemoryReadbackBuffer);
+		m_outOfMemoryReadbackBuffer = getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_outOfMemoryReadback, 1);
 	}
 
 	// Get some limits
@@ -738,13 +738,12 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 				cmdb.bindSrv(4, 0, stage1Mem.m_counters);
 				cmdb.bindSrv(5, 0, stage1Mem.m_renderablePrefixSums);
 
-				UVec2* firstDrawIndirectArgAndCount =
+				WeakArray<UVec2> firstDrawIndirectArgAndCount =
 					allocateAndBindSrvStructuredBuffer<UVec2>(cmdb, 6, 0, out.m_legacy.m_bucketIndirectArgsRanges.getSize());
 				for(U32 ibucket = 0; ibucket < out.m_legacy.m_bucketIndirectArgsRanges.getSize(); ++ibucket)
 				{
-					firstDrawIndirectArgAndCount->x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
-					firstDrawIndirectArgAndCount->y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
-					++firstDrawIndirectArgAndCount;
+					firstDrawIndirectArgAndCount[ibucket].x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
+					firstDrawIndirectArgAndCount[ibucket].y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
 				}
 
 				cmdb.bindUav(0, 0, stage2Mem.m_legacy.m_instanceRateRenderables);
@@ -936,9 +935,9 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 
 	if(objCount == 0)
 	{
-		U32* count;
-		out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(sizeof(U32), count);
-		*count = 0;
+		WeakArray<U32> count;
+		out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, count);
+		count[0] = 0;
 		out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
 
 		return;
@@ -958,16 +957,23 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 		m_counterBufferZeroingHandle = {};
 	}
 
-	constexpr U32 kCountersPerDispatch = 3; // 1 for the threadgroup, 1 for the visbile object count and 1 for objects with feedback
-	const U32 counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment,
-														   U32(kCountersPerDispatch * sizeof(U32)));
+	U32 counterBufferElementSize;
+	if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+	{
+		counterBufferElementSize = sizeof(GpuVisibilityNonRenderablesCounters);
+	}
+	else
+	{
+		counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment,
+													 U32(sizeof(GpuVisibilityNonRenderablesCounters)));
+	}
+
 	if(!m_counterBuffer.isCreated() || m_counterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
 	{
 		// Counter buffer not created or not big enough, create a new one
 
 		BufferInitInfo buffInit("GpuVisibilityNonRenderablesCounters");
-		buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2
-														: kCountersPerDispatch * counterBufferElementSize * kInitialCounterArraySize;
+		buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2 : counterBufferElementSize * kInitialCounterArraySize;
 		buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
 		m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
 
@@ -1051,7 +1057,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 		cmdb.setFastConstants(&consts, sizeof(consts));
 
 		rgraph.bindUav(0, 0, visibleIndicesBuffHandle);
-		cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(U32) * kCountersPerDispatch));
+		cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(GpuVisibilityNonRenderablesCounters)));
 
 		if(needsFeedback)
 		{

+ 9 - 1
AnKi/Renderer/Utils/HzbGenerator.cpp

@@ -50,7 +50,15 @@ Error HzbGenerator::init()
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/HzbMaxDepth.ankiprogbin", m_maxDepthProg, m_maxDepthGrProg));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/HzbMaxDepthProject.ankiprogbin", m_maxBoxProg, m_maxBoxGrProg));
 
-	m_counterBufferElementSize = max<U32>(sizeof(U32), GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
+	if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+	{
+		m_counterBufferElementSize = sizeof(U32);
+	}
+	else
+	{
+		m_counterBufferElementSize = max<U32>(sizeof(U32), GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment);
+	}
+
 	BufferInitInfo buffInit("HzbCounterBuffer");
 	buffInit.m_size = m_counterBufferElementSize * kCounterBufferElementCount;
 	buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kCopyDestination;
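
Worked numbers for the element-size selection above, assuming for illustration a 256-byte bind-offset alignment; getAlignedRoundUp is the same helper the GpuVisibility.cpp hunk uses, and for a single U32 it agrees with the max<U32>() form here:

#include <cstdint>
#include <cstdio>

uint32_t getAlignedRoundUp(uint32_t alignment, uint32_t value)
{
	return ((value + alignment - 1) / alignment) * alignment;
}

int main()
{
	const uint32_t bindOffsetAlignment = 256; // assumed device limit, for illustration

	// Natural alignment: counters can be packed back to back.
	printf("natural: %u bytes/element\n", uint32_t(sizeof(uint32_t))); // 4

	// No natural alignment: each element must start on a bindable offset.
	printf("padded:  %u bytes/element\n", getAlignedRoundUp(bindOffsetAlignment, uint32_t(sizeof(uint32_t)))); // 256
	return 0;
}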

+ 0 - 25
AnKi/Renderer/Utils/Readback.cpp

@@ -7,31 +7,6 @@
 
 namespace anki {
 
-void ReadbackManager::allocateData(MultiframeReadbackToken& token, PtrSize size, BufferView& buffer) const
-{
-	for([[maybe_unused]] U64 frame : token.m_frameIds)
-	{
-		ANKI_ASSERT(frame != m_frameId && "Can't allocate multiple times in a frame");
-	}
-
-	GpuReadbackMemoryAllocation& allocation = token.m_allocations[token.m_slot];
-
-	if(allocation.isValid() && allocation.getAllocatedSize() != size)
-	{
-		GpuReadbackMemoryPool::getSingleton().deferredFree(allocation);
-	}
-
-	if(!allocation.isValid())
-	{
-		allocation = GpuReadbackMemoryPool::getSingleton().allocate(size);
-	}
-	token.m_frameIds[token.m_slot] = m_frameId;
-
-	buffer = BufferView(&allocation.getBuffer(), allocation.getOffset(), size);
-
-	token.m_slot = (token.m_slot + 1) % kMaxFramesInFlight;
-}
-
 U32 ReadbackManager::findBestSlot(const MultiframeReadbackToken& token) const
 {
 	const U64 earliestFrame = m_frameId - (kMaxFramesInFlight - 1);

+ 27 - 1
AnKi/Renderer/Utils/Readback.h

@@ -56,7 +56,33 @@ public:
 	}
 
 	/// Allocate new data for the following frame. 2nd thing to call in a frame.
-	void allocateData(MultiframeReadbackToken& token, PtrSize size, BufferView& buffer) const;
+	template<typename T>
+	BufferView allocateStructuredBuffer(MultiframeReadbackToken& token, U32 count) const
+	{
+		ANKI_ASSERT(count > 0);
+
+		for([[maybe_unused]] U64 frame : token.m_frameIds)
+		{
+			ANKI_ASSERT(frame != m_frameId && "Can't allocate multiple times in a frame");
+		}
+
+		GpuReadbackMemoryAllocation& allocation = token.m_allocations[token.m_slot];
+
+		if(allocation.isValid() && allocation.getAllocatedSize() != sizeof(T) * count)
+		{
+			GpuReadbackMemoryPool::getSingleton().deferredFree(allocation);
+		}
+
+		if(!allocation.isValid())
+		{
+			allocation = GpuReadbackMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
+		}
+		token.m_frameIds[token.m_slot] = m_frameId;
+
+		token.m_slot = (token.m_slot + 1) % kMaxFramesInFlight;
+
+		return BufferView(&allocation.getBuffer(), allocation.getOffset(), sizeof(T) * count);
+	}
 
 	/// Last thing to call in a frame.
 	void endFrame(Fence* fence);
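
A standalone sketch of the slot rotation performed above, with the GPU allocation reduced to frame-id bookkeeping (frame ids are assumed to start at 1 so the zero-initialized slots never collide):

#include <array>
#include <cassert>
#include <cstdint>

constexpr uint32_t kMaxFramesInFlight = 3; // matches AnKi/Gr/Common.h

struct Token
{
	std::array<uint64_t, kMaxFramesInFlight> m_frameIds{}; // 0 == never used
	uint32_t m_slot = 0;
};

// Returns the slot to (re)use for this frame's readback allocation.
uint32_t rotateSlot(Token& token, uint64_t frameId)
{
	for([[maybe_unused]] const uint64_t frame : token.m_frameIds)
	{
		assert(frame != frameId && "Can't allocate multiple times in a frame");
	}

	const uint32_t slot = token.m_slot;
	token.m_frameIds[slot] = frameId;
	token.m_slot = (token.m_slot + 1) % kMaxFramesInFlight; // GPU writes this slot, CPU reads an older one
	return slot;
}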

+ 2 - 2
AnKi/Renderer/VrsSriGeneration.cpp

@@ -51,12 +51,12 @@ Error VrsSriGeneration::initInternal()
 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
 
-	if(m_sriTexelDimension == 16 && GrManager::getSingleton().getDeviceCapabilities().m_minSubgroupSize >= 32)
+	if(m_sriTexelDimension == 16 && GrManager::getSingleton().getDeviceCapabilities().m_minWaveSize >= 32)
 	{
 		// Algorithm's workgroup size is 32, GPU's subgroup size is min 32 -> each workgroup has 1 subgroup -> No need for shared mem
 		variantInit.addMutation("SHARED_MEMORY", 0);
 	}
-	else if(m_sriTexelDimension == 8 && GrManager::getSingleton().getDeviceCapabilities().m_minSubgroupSize >= 16)
+	else if(m_sriTexelDimension == 8 && GrManager::getSingleton().getDeviceCapabilities().m_minWaveSize >= 16)
 	{
 		// Algorithm's workgroup size is 16, GPU's subgroup size is min 16 -> each workgroup has 1 subgroup -> No need for shared mem
 		variantInit.addMutation("SHARED_MEMORY", 0);

+ 3 - 3
AnKi/Scene/Components/ParticleEmitterComponent.cpp

@@ -206,7 +206,7 @@ ParticleEmitterComponent::ParticleEmitterComponent(SceneNode* node)
 
 	static_assert(kMeshRelatedVertexStreamFormats[VertexStreamId::kPosition] == Format::kR16G16B16A16_Unorm);
 	WeakArray<U16Vec4> transientPositions;
-	const RebarAllocation positionsAlloc = RebarTransientMemoryPool::getSingleton().allocateFrame(vertCount, transientPositions);
+	const BufferView positionsAlloc = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(vertCount, transientPositions);
 	transientPositions[0] = U16Vec4(0, 0, 0, 0);
 	transientPositions[1] = U16Vec4(kMaxU16, 0, 0, 0);
 	transientPositions[2] = U16Vec4(kMaxU16, kMaxU16, 0, 0);
@@ -214,14 +214,14 @@ ParticleEmitterComponent::ParticleEmitterComponent(SceneNode* node)
 
 	static_assert(kMeshRelatedVertexStreamFormats[VertexStreamId::kUv] == Format::kR32G32_Sfloat);
 	WeakArray<Vec2> transientUvs;
-	const RebarAllocation uvsAlloc = RebarTransientMemoryPool::getSingleton().allocateFrame(vertCount, transientUvs);
+	const BufferView uvsAlloc = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(vertCount, transientUvs);
 	transientUvs[0] = Vec2(0.0f);
 	transientUvs[1] = Vec2(1.0f, 0.0f);
 	transientUvs[2] = Vec2(1.0f, 1.0f);
 	transientUvs[3] = Vec2(0.0f, 1.0f);
 
 	WeakArray<U16> transientIndices;
-	const RebarAllocation indicesAlloc = RebarTransientMemoryPool::getSingleton().allocateFrame(indexCount, transientIndices);
+	const BufferView indicesAlloc = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(indexCount, transientIndices);
 	transientIndices[0] = 0;
 	transientIndices[1] = 1;
 	transientIndices[2] = 3;

+ 1 - 2
AnKi/Scene/GpuSceneArray.inl.h

@@ -13,8 +13,7 @@ template<typename TGpuSceneObject, U32 kId>
 GpuSceneArray<TGpuSceneObject, kId>::GpuSceneArray(U32 maxArraySize)
 {
 	maxArraySize = getAlignedRoundUp(sizeof(SubMask), maxArraySize);
-	const U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
-	m_gpuSceneAllocation = GpuSceneBuffer::getSingleton().allocate(sizeof(TGpuSceneObject) * maxArraySize, alignment);
+	m_gpuSceneAllocation = GpuSceneBuffer::getSingleton().allocateStructuredBuffer<TGpuSceneObject>(maxArraySize);
 
 	m_inUseIndicesMask.resize(maxArraySize / sizeof(SubMask), false);
 	ANKI_ASSERT(m_inUseIndicesCount == 0);

+ 1 - 1
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -433,7 +433,7 @@ main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, out vertices
 
 			primitives[idx].m_constantsOffset = constantsOffset;
 #	if VISUALIZE_MESHLETS
-			primitives[idx].m_meshletIndex = relativeMeshletIdx;
+			primitives[idx].m_meshletIndex = instance.m_meshletGeometryDescriptorIndex;
 #	endif
 		}
 	}

+ 16 - 20
AnKi/Shaders/GpuVisibilityNonRenderables.ankiprog

@@ -34,10 +34,7 @@ RWStructuredBuffer<U32> g_visibleIndices : register(u0); // 1st element is the c
 
 ANKI_FAST_CONSTANTS(GpuVisibilityNonRenderableConstants, g_consts)
 
-constexpr U32 kVisibleObjCounterIdx = 1;
-constexpr U32 kThreadgroupCounterIdx = 0;
-constexpr U32 kFeedbackCounterIdx = 2;
-globallycoherent RWStructuredBuffer<U32> g_counterBuffer : register(u1); // 2 counters per dispatch with an optional 3rd for feedback
+RWStructuredBuffer<GpuVisibilityNonRenderablesCounters> g_counterBuffer : register(u1);
 
 #if CPU_FEEDBACK
 // 1st element is a count. What follows is an array of (UUID, array index) pairs.
@@ -104,7 +101,7 @@ Vec4 getSphere(GpuSceneGlobalIlluminationProbe l)
 	if(!skip)
 	{
 		U32 idx;
-		InterlockedAdd(g_counterBuffer[kVisibleObjCounterIdx], 1, idx);
+		InterlockedAdd(g_counterBuffer[0].m_visibleObjectCount, 1, idx);
 
 		g_visibleIndices[idx + 1] = svDispatchThreadId;
 	}
@@ -115,36 +112,35 @@ Vec4 getSphere(GpuSceneGlobalIlluminationProbe l)
 	if(!skip && g_objects[svDispatchThreadId].m_uuid != 0)
 	{
 		U32 idx;
-		InterlockedAdd(g_counterBuffer[kFeedbackCounterIdx], 1, idx);
+		InterlockedAdd(g_counterBuffer[0].m_feedbackObjectCount, 1, idx);
 
 		g_cpuFeedbackBuffer[idx * 2 + 1] = g_objects[svDispatchThreadId].m_uuid;
 		g_cpuFeedbackBuffer[idx * 2 + 2] = g_objects[svDispatchThreadId].m_componentArrayIndex;
 	}
 #endif
 
+	// Sync to make sure all the atomic ops have finished before the following code reads them
+	AllMemoryBarrierWithGroupSync();
+
 	// Store the counters to the actual buffers
 	//
-	Bool lastThreadgroupExecuting = false;
 	if(svGroupIndex == 0)
 	{
 		U32 threadgroupIdx;
-		InterlockedAdd(g_counterBuffer[kThreadgroupCounterIdx], 1, threadgroupIdx);
+		InterlockedAdd(g_counterBuffer[0].m_threadgroupCount, 1, threadgroupIdx);
 		const U32 threadgroupCount = (objectCount + NUMTHREADS - 1) / NUMTHREADS;
-		lastThreadgroupExecuting = (threadgroupIdx + 1 == threadgroupCount);
-	}
+		const Bool lastThreadgroupExecuting = (threadgroupIdx + 1 == threadgroupCount);
 
-	// Sync to make sure all the atomic ops have finished before the following code reads them
-	AllMemoryBarrierWithGroupSync();
-
-	if(lastThreadgroupExecuting)
-	{
-		g_visibleIndices[0] = g_counterBuffer[kVisibleObjCounterIdx];
-		g_counterBuffer[kVisibleObjCounterIdx] = 0;
+		if(lastThreadgroupExecuting)
+		{
+			g_visibleIndices[0] = g_counterBuffer[0].m_visibleObjectCount;
+			g_counterBuffer[0].m_visibleObjectCount = 0;
 #if CPU_FEEDBACK
-		g_cpuFeedbackBuffer[0] = g_counterBuffer[kFeedbackCounterIdx];
-		g_counterBuffer[kFeedbackCounterIdx] = 0;
+			g_cpuFeedbackBuffer[0] = g_counterBuffer[0].m_feedbackObjectCount;
+			g_counterBuffer[0].m_feedbackObjectCount = 0;
 #endif
-		g_counterBuffer[kThreadgroupCounterIdx] = 0;
+			g_counterBuffer[0].m_threadgroupCount = 0;
+		}
 	}
 }
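
The restructured epilogue above makes the last threadgroup to finish publish the counts and zero the counters, so the buffer needs no CPU-side clear between dispatches. A host-side C++ sketch of that pattern, with one std::thread standing in for one threadgroup and sequentially consistent atomics standing in for the shader's barriers (illustrative only, not the HLSL):

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

struct Counters
{
	std::atomic<uint32_t> m_threadgroupCount{0};
	std::atomic<uint32_t> m_visibleObjectCount{0};
};

// Every "threadgroup" bumps a counter when it finishes; whichever sees the
// final value publishes the result and zeroes the counters for the next run.
void groupMain(Counters& c, uint32_t groupCount, std::atomic<uint32_t>& published)
{
	c.m_visibleObjectCount.fetch_add(1); // stand-in for the per-group visibility work

	const uint32_t idx = c.m_threadgroupCount.fetch_add(1);
	if(idx + 1 == groupCount) // last threadgroup executing
	{
		published.store(c.m_visibleObjectCount.load());
		c.m_visibleObjectCount.store(0);
		c.m_threadgroupCount.store(0);
	}
}

int main()
{
	Counters c;
	std::atomic<uint32_t> published{0};
	const uint32_t groupCount = 8;

	std::vector<std::thread> groups;
	for(uint32_t i = 0; i < groupCount; ++i)
	{
		groups.emplace_back(groupMain, std::ref(c), groupCount, std::ref(published));
	}
	for(std::thread& t : groups)
	{
		t.join();
	}

	return (published.load() == groupCount) ? 0 : 1;
}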
 

+ 8 - 0
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -96,4 +96,12 @@ enum class GpuVisibilityIndirectDispatches : U32
 	kCount
 };
 
+/// Counters used in non-renderables visibility
+class GpuVisibilityNonRenderablesCounters
+{
+	U32 m_threadgroupCount; ///< Counts the number of threadgroups
+	U32 m_visibleObjectCount; ///< Counts the visible objects
+	U32 m_feedbackObjectCount; ///< Counts the visible objects that need feedback
+};
+
 ANKI_END_NAMESPACE

+ 3 - 3
AnKi/Ui/Canvas.cpp

@@ -190,7 +190,7 @@ void Canvas::appendToCommandBufferInternal(CommandBuffer& cmdb)
 	ImDrawData& drawData = *ImGui::GetDrawData();
 
 	// Allocate index and vertex buffers
-	RebarAllocation vertsToken, indicesToken;
+	BufferView vertsToken, indicesToken;
 	{
 		if(drawData.TotalVtxCount == 0 || drawData.TotalIdxCount == 0)
 		{
@@ -198,9 +198,9 @@ void Canvas::appendToCommandBufferInternal(CommandBuffer& cmdb)
 		}
 
 		ImDrawVert* verts;
-		vertsToken = RebarTransientMemoryPool::getSingleton().allocateFrame(drawData.TotalVtxCount, verts);
+		vertsToken = RebarTransientMemoryPool::getSingleton().allocate(drawData.TotalVtxCount * sizeof(ImDrawVert), sizeof(F32), verts);
 		ImDrawIdx* indices;
-		indicesToken = RebarTransientMemoryPool::getSingleton().allocateFrame(drawData.TotalIdxCount, indices);
+		indicesToken = RebarTransientMemoryPool::getSingleton().allocate(drawData.TotalIdxCount * sizeof(ImDrawIdx), sizeof(ImDrawIdx), indices);
 
 		for(I n = 0; n < drawData.CmdListsCount; ++n)
 		{

+ 1 - 0
AnKi/Util/StackAllocatorBuilder.inl.h

@@ -41,6 +41,7 @@ template<typename TChunk, typename TInterface, typename TLock>
 Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, PtrSize alignment, TChunk*& chunk, PtrSize& offset)
 {
 	ANKI_ASSERT(size > 0);
+	ANKI_ASSERT(alignment > 0);
 	size += alignment;
 
 	chunk = nullptr;

+ 1 - 1
Sandbox/Main.cpp

@@ -342,7 +342,7 @@ Error MyApp::userMainLoop(Bool& quit, Second elapsedTime)
 
 	if(in.getKey(KeyCode::kU) == 1)
 	{
-		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "Ssr") ? "" : "Ssr");
+		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "GBufferAlbedo") ? "" : "GBufferAlbedo");
 	}
 
 	if(in.getKey(KeyCode::kI) == 1)