
More refactoring

Panagiotis Christopoulos Charitos · 1 year ago · commit 2d5ecdb36b

+ 6 - 3
AnKi/Core/GpuMemory/GpuReadbackMemoryPool.cpp

@@ -17,17 +17,20 @@ GpuReadbackMemoryPool::GpuReadbackMemoryPool()
 
 	m_pool.init(buffUsage, classes, classes.getBack(), "GpuReadback", false, mapAccess);
 
-	m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
+	if(!GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+	{
+		m_structuredBufferAlignment = GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
+	}
 }
 
 GpuReadbackMemoryPool::~GpuReadbackMemoryPool()
 {
 }
 
-GpuReadbackMemoryAllocation GpuReadbackMemoryPool::allocate(PtrSize size)
+GpuReadbackMemoryAllocation GpuReadbackMemoryPool::allocate(PtrSize size, U32 alignment)
 {
 	GpuReadbackMemoryAllocation out;
-	m_pool.allocate(size, m_alignment, out.m_token);
+	m_pool.allocate(size, alignment, out.m_token);
 	out.m_buffer = &m_pool.getGpuBuffer();
 	out.m_mappedMemory = static_cast<U8*>(m_pool.getGpuBufferMappedMemory()) + out.m_token.m_offset;
 	return out;

+ 9 - 2
AnKi/Core/GpuMemory/GpuReadbackMemoryPool.h

@@ -84,7 +84,14 @@ class GpuReadbackMemoryPool : public MakeSingleton<GpuReadbackMemoryPool>
 	friend class MakeSingleton;
 
 public:
-	GpuReadbackMemoryAllocation allocate(PtrSize size);
+	GpuReadbackMemoryAllocation allocate(PtrSize size, U32 alignment);
+
+	template<typename T>
+	GpuReadbackMemoryAllocation allocateStructuredBuffer(U32 count)
+	{
+		const U32 alignment = (m_structuredBufferAlignment == kMaxU32) ? sizeof(T) : m_structuredBufferAlignment;
+		return allocate(sizeof(T) * count, alignment);
+	}
 
 	void deferredFree(GpuReadbackMemoryAllocation& allocation);
 
@@ -92,7 +99,7 @@ public:
 
 private:
 	SegregatedListsGpuMemoryPool m_pool;
-	U32 m_alignment = 0;
+	U32 m_structuredBufferAlignment = kMaxU32;
 
 	GpuReadbackMemoryPool();
 

+ 8 - 7
AnKi/Core/GpuMemory/GpuSceneBuffer.cpp

@@ -130,15 +130,16 @@ void GpuSceneMicroPatcher::patchGpuScene(CommandBuffer& cmdb)
 	ANKI_TRACE_INC_COUNTER(GpuSceneMicroPatches, m_crntFramePatchHeaders.getSize());
 	ANKI_TRACE_INC_COUNTER(GpuSceneMicroPatchUploadData, m_crntFramePatchData.getSizeInBytes());
 
-	void* mapped;
-	const RebarAllocation headersToken = RebarTransientMemoryPool::getSingleton().allocateFrame(m_crntFramePatchHeaders.getSizeInBytes(), mapped);
-	memcpy(mapped, &m_crntFramePatchHeaders[0], m_crntFramePatchHeaders.getSizeInBytes());
+	WeakArray<PatchHeader> mapped;
+	const BufferView headersBuff = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(m_crntFramePatchHeaders.getSize(), mapped);
+	memcpy(mapped.getBegin(), &m_crntFramePatchHeaders[0], m_crntFramePatchHeaders.getSizeInBytes());
 
-	const RebarAllocation dataToken = RebarTransientMemoryPool::getSingleton().allocateFrame(m_crntFramePatchData.getSizeInBytes(), mapped);
-	memcpy(mapped, &m_crntFramePatchData[0], m_crntFramePatchData.getSizeInBytes());
+	WeakArray<U32> mapped2;
+	const BufferView dataBuff = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(m_crntFramePatchData.getSize(), mapped2);
+	memcpy(mapped2.getBegin(), &m_crntFramePatchData[0], m_crntFramePatchData.getSizeInBytes());
 
-	cmdb.bindSrv(0, 0, headersToken);
-	cmdb.bindSrv(1, 0, dataToken);
+	cmdb.bindSrv(0, 0, headersBuff);
+	cmdb.bindSrv(1, 0, dataBuff);
 	cmdb.bindUav(0, 0, BufferView(&GpuSceneBuffer::getSingleton().getBuffer()));
 
 	cmdb.bindShaderProgram(m_grProgram.get());

+ 11 - 0
AnKi/Core/GpuMemory/GpuSceneBuffer.h

@@ -83,6 +83,17 @@ public:
 		return alloc;
 	}
 
+	template<typename T>
+	GpuSceneBufferAllocation allocateStructuredBuffer(U32 count)
+	{
+		const U32 alignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+								  ? sizeof(T)
+								  : GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
+		GpuSceneBufferAllocation alloc;
+		m_pool.allocate(count * sizeof(T), alignment, alloc.m_token);
+		return alloc;
+	}
+
 	void deferredFree(GpuSceneBufferAllocation& alloc)
 	{
 		m_pool.deferredFree(alloc.m_token);

+ 6 - 12
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -22,9 +22,8 @@ class GpuVisibleTransientMemoryPool : public MakeSingleton<GpuVisibleTransientMe
 	friend class MakeSingleton;
 
 public:
-	BufferView allocate(PtrSize size, PtrSize alignment = 0)
+	BufferView allocate(PtrSize size, PtrSize alignment)
 	{
-		alignment = (alignment == 0) ? m_alignment : alignment;
 		PtrSize offset;
 		Buffer* buffer;
 		m_pool.allocate(size, alignment, offset, buffer);
@@ -46,20 +45,15 @@ public:
 
 private:
 	StackGpuMemoryPool m_pool;
-	U32 m_alignment = 0;
 	U32 m_frame = 0;
-	U32 m_structuredBufferAlignment = 0;
+	U32 m_structuredBufferAlignment = kMaxU32;
 
 	GpuVisibleTransientMemoryPool()
 	{
-		m_structuredBufferAlignment = (GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
-										  ? kMaxU32
-										  : GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
-
-		m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-		m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
+		if(!GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+		{
+			m_structuredBufferAlignment = GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
+		}
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllConstant | BufferUsageBit::kAllUav | BufferUsageBit::kAllSrv | BufferUsageBit::kIndirectDraw
 								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllCopy;

+ 11 - 21
AnKi/Core/GpuMemory/RebarTransientMemoryPool.cpp

@@ -36,28 +36,19 @@ void RebarTransientMemoryPool::init()
 
 	m_bufferSize = buffInit.m_size;
 
-	m_alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
-	m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-	m_alignment = max(m_alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
-
-	m_mappedMem = static_cast<U8*>(m_buffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite));
-}
-
-RebarAllocation RebarTransientMemoryPool::allocateFrame(PtrSize size, void*& mappedMem)
-{
-	RebarAllocation out = tryAllocateFrame(size, mappedMem);
-	if(!out.isValid()) [[unlikely]]
+	if(!GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
 	{
-		ANKI_CORE_LOGF("Out of ReBAR GPU memory");
+		m_structuredBufferAlignment = GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment;
 	}
 
-	return out;
+	m_mappedMem = static_cast<U8*>(m_buffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite));
 }
 
-RebarAllocation RebarTransientMemoryPool::tryAllocateFrame(PtrSize origSize, void*& mappedMem)
+BufferView RebarTransientMemoryPool::allocateInternal(PtrSize origSize, U32 alignment, void*& mappedMem)
 {
 	ANKI_ASSERT(origSize > 0);
-	const PtrSize size = getAlignedRoundUp(m_alignment, origSize);
+	ANKI_ASSERT(alignment > 0);
+	const PtrSize size = origSize + alignment;
 
 	// Try in a loop because we may end up with an allocation whose offset crosses the buffer's end
 	PtrSize offset;
@@ -65,17 +56,16 @@ RebarAllocation RebarTransientMemoryPool::tryAllocateFrame(PtrSize origSize, voi
 	do
 	{
 		offset = m_offset.fetchAdd(size) % m_bufferSize;
-		const PtrSize end = (offset + origSize) % (m_bufferSize + 1);
+		const PtrSize end = (offset + size) % (m_bufferSize + 1);
 
 		done = offset < end;
 	} while(!done);
 
-	mappedMem = m_mappedMem + offset;
-	RebarAllocation out;
-	out.m_offset = offset;
-	out.m_range = origSize;
+	const PtrSize alignedOffset = getAlignedRoundUp(alignment, offset);
+	ANKI_ASSERT(alignedOffset + origSize <= offset + size);
 
-	return out;
+	mappedMem = m_mappedMem + alignedOffset;
+	return BufferView(m_buffer.get(), alignedOffset, origSize);
 }
 
 void RebarTransientMemoryPool::endFrame()
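
A note on allocateInternal above: rather than rounding every request up to one global alignment, it over-allocates by the caller's alignment and aligns the returned offset afterwards, so the assert can never fire. A standalone sketch of the lock-free ring allocation, assuming uint64_t stands in for AnKi's PtrSize and std::atomic for AnKi's Atomic:

#include <atomic>
#include <cassert>
#include <cstdint>

using PtrSize = uint64_t; // assumption: stands in for AnKi's PtrSize

PtrSize getAlignedRoundUp(PtrSize alignment, PtrSize value)
{
	return ((value + alignment - 1) / alignment) * alignment;
}

std::atomic<PtrSize> g_offset{0};
constexpr PtrSize kBufferSize = 16 * 1024 * 1024;

// Returns an aligned offset whose [offset, offset + origSize) fits in the ring.
PtrSize allocateRing(PtrSize origSize, PtrSize alignment)
{
	assert(origSize > 0 && alignment > 0);
	const PtrSize size = origSize + alignment; // over-allocate so the align-up always fits

	PtrSize offset;
	bool done;
	do
	{
		offset = g_offset.fetch_add(size) % kBufferSize;
		const PtrSize end = (offset + size) % (kBufferSize + 1);
		done = offset < end; // retry if the allocation wrapped past the buffer's end
	} while(!done);

	const PtrSize alignedOffset = getAlignedRoundUp(alignment, offset);
	assert(alignedOffset + origSize <= offset + size);
	return alignedOffset;
}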

+ 28 - 63
AnKi/Core/GpuMemory/RebarTransientMemoryPool.h

@@ -13,47 +13,6 @@ namespace anki {
 /// @addtogroup core
 /// @{
 
-/// Token that gets returned when requesting for memory to write to a resource.
-class RebarAllocation
-{
-	friend class RebarTransientMemoryPool;
-
-public:
-	RebarAllocation() = default;
-
-	~RebarAllocation() = default;
-
-	Bool operator==(const RebarAllocation& b) const
-	{
-		return m_offset == b.m_offset && m_range == b.m_range;
-	}
-
-	Bool isValid() const
-	{
-		return m_range != 0;
-	}
-
-	PtrSize getOffset() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_offset;
-	}
-
-	PtrSize getRange() const
-	{
-		ANKI_ASSERT(isValid());
-		return m_range;
-	}
-
-	Buffer& getBuffer() const;
-
-	operator BufferView() const;
-
-private:
-	PtrSize m_offset = kMaxPtrSize;
-	PtrSize m_range = 0;
-};
-
 /// Manages staging GPU memory.
 class RebarTransientMemoryPool : public MakeSingleton<RebarTransientMemoryPool>
 {
@@ -70,28 +29,43 @@ public:
 	void endFrame();
 
 	/// Allocate staging memory for various operations. The memory will be reclaimed at the beginning of the N-(kMaxFramesInFlight-1) frame.
-	RebarAllocation allocateFrame(PtrSize size, void*& mappedMem);
-
 	template<typename T>
-	RebarAllocation allocateFrame(U32 count, T*& mappedMem)
+	BufferView allocate(PtrSize size, U32 alignment, T*& mappedMem)
 	{
 		void* mem;
-		const RebarAllocation out = allocateFrame(count * sizeof(T), mem);
+		const BufferView out = allocateInternal(size, alignment, mem);
 		mappedMem = static_cast<T*>(mem);
 		return out;
 	}
 
+	/// @copydoc allocate
 	template<typename T>
-	RebarAllocation allocateFrame(U32 count, WeakArray<T>& arr)
+	BufferView allocateConstantBuffer(T*& mappedMem)
 	{
-		void* mem;
-		const RebarAllocation out = allocateFrame(count * sizeof(T), mem);
-		arr = {static_cast<T*>(mem), count};
+		return allocate(sizeof(T), GrManager::getSingleton().getDeviceCapabilities().m_constantBufferBindOffsetAlignment, mappedMem);
+	}
+
+	/// @copydoc allocate
+	template<typename T>
+	BufferView allocateStructuredBuffer(U32 count, WeakArray<T>& arr)
+	{
+		T* mem;
+		const U32 alignment = (m_structuredBufferAlignment == kMaxU32) ? sizeof(T) : m_structuredBufferAlignment;
+		const BufferView out = allocate(count * sizeof(T), alignment, mem);
+		arr = {mem, count};
 		return out;
 	}
 
-	/// Allocate staging memory for various operations. The memory will be reclaimed at the begining of the N-(kMaxFramesInFlight-1) frame.
-	RebarAllocation tryAllocateFrame(PtrSize size, void*& mappedMem);
+	/// @copydoc allocate
+	template<typename T>
+	BufferView allocateCopyBuffer(U32 count, WeakArray<T>& arr)
+	{
+		T* mem;
+		const U32 alignment = sizeof(U32);
+		const BufferView out = allocate(sizeof(T) * count, alignment, mem);
+		arr = {mem, count};
+		return out;
+	}
 
 	ANKI_PURE Buffer& getBuffer() const
 	{
@@ -109,23 +83,14 @@ private:
 	PtrSize m_bufferSize = 0; ///< Cache it.
 	Atomic<PtrSize> m_offset = {0};
 	PtrSize m_previousFrameEndOffset = 0;
-	U32 m_alignment = 0;
+	U32 m_structuredBufferAlignment = kMaxU32;
 
 	RebarTransientMemoryPool() = default;
 
 	~RebarTransientMemoryPool();
-};
 
-inline Buffer& RebarAllocation::getBuffer() const
-{
-	return RebarTransientMemoryPool::getSingleton().getBuffer();
-}
-
-inline RebarAllocation::operator BufferView() const
-{
-	ANKI_ASSERT(isValid());
-	return {&RebarTransientMemoryPool::getSingleton().getBuffer(), m_offset, m_range};
-}
+	BufferView allocateInternal(PtrSize size, U32 alignment, void*& mappedMem);
+};
 /// @}
 
 } // end namespace anki
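
For reference, the three typed entry points as this commit uses them elsewhere (excerpt-style, not standalone; the types come from the call sites in Renderer.cpp, GpuVisibility.cpp and ParticleEmitterComponent.cpp):

// Constant buffer: a single T, aligned for constant-buffer binding.
GlobalRendererConstants* consts;
const BufferView constsView = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(consts);

// Structured buffer: count elements, returned as a bounds-aware WeakArray.
WeakArray<U32> counters;
const BufferView countersView = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, counters);
counters[0] = 0;

// Copy buffer: staging for vertex/index uploads, element-size alignment only.
WeakArray<U16> indices;
const BufferView indicesView = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(6, indices);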

+ 6 - 16
AnKi/Gr/Common.h

@@ -63,7 +63,7 @@ constexpr U32 kMaxBindingsPerRegisterSpace = 32;
 constexpr U32 kMaxFramesInFlight = 3; ///< Triple buffering.
 constexpr U32 kMaxGrObjectNameLength = 61;
 constexpr U32 kMaxBindlessTextures = 512;
-constexpr U32 kMaxFastConstantsSize = 128; ///< Thanks AMD!!
+constexpr U32 kMaxFastConstantsSize = 128; ///< Push/root constants size. Thanks AMD!!
 
 /// The number of commands in a command buffer that make it a small batch command buffer.
 constexpr U32 kCommandBufferSmallBatchMaxCommands = 100;
@@ -90,7 +90,6 @@ ANKI_GR_CLASS(Texture)
 ANKI_GR_CLASS(Sampler)
 ANKI_GR_CLASS(CommandBuffer)
 ANKI_GR_CLASS(Shader)
-ANKI_GR_CLASS(Framebuffer)
 ANKI_GR_CLASS(OcclusionQuery)
 ANKI_GR_CLASS(TimestampQuery)
 ANKI_GR_CLASS(PipelineQuery)
@@ -144,23 +143,14 @@ class GpuDeviceCapabilities
 {
 public:
 	/// The alignment of offsets when binding constant buffers.
-	U32 m_uniformBufferBindOffsetAlignment = kMaxU32;
+	U32 m_constantBufferBindOffsetAlignment = kMaxU32;
 
-	/// The max visible range of constant buffers inside the shaders.
-	PtrSize m_uniformBufferMaxRange = 0;
-
-	/// The alignment of offsets when bounding storage buffers.
-	U32 m_storageBufferBindOffsetAlignment = kMaxU32;
-
-	/// The max visible range of storage buffers inside the shaders.
-	PtrSize m_storageBufferMaxRange = 0;
+	/// The alignment of offsets when binding structured buffers.
+	U32 m_structuredBufferBindOffsetAlignment = kMaxU32;
 
 	/// The alignment of offsets when binding texture buffers.
 	U32 m_texelBufferBindOffsetAlignment = kMaxU32;
 
-	/// The max visible range of texture buffers inside the shaders.
-	PtrSize m_textureBufferMaxRange = 0;
-
 	/// Max push/root constant size.
 	PtrSize m_fastConstantsSize = 128;
 
@@ -177,10 +167,10 @@ public:
 	U32 m_shaderGroupHandleSize = 0;
 
 	/// Min wave size of the GPU.
-	U32 m_minSubgroupSize = 0;
+	U32 m_minWaveSize = 0;
 
 	/// Max wave size of the GPU.
-	U32 m_maxSubgroupSize = 0;
+	U32 m_maxWaveSize = 0;
 
 	/// Min size of a texel in the shading rate image.
 	U32 m_minShadingRateImageTexelSize = 0;

+ 14 - 17
AnKi/Gr/D3D/D3DGrManager.cpp

@@ -328,35 +328,35 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 	{
 	case 0x13B5:
 		m_capabilities.m_gpuVendor = GpuVendor::kArm;
-		m_capabilities.m_minSubgroupSize = 16;
-		m_capabilities.m_maxSubgroupSize = 16;
+		m_capabilities.m_minWaveSize = 16;
+		m_capabilities.m_maxWaveSize = 16;
 		break;
 	case 0x10DE:
 		m_capabilities.m_gpuVendor = GpuVendor::kNvidia;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x1002:
 	case 0x1022:
 		m_capabilities.m_gpuVendor = GpuVendor::kAMD;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 64;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 64;
 		break;
 	case 0x8086:
 		m_capabilities.m_gpuVendor = GpuVendor::kIntel;
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x5143:
 		m_capabilities.m_gpuVendor = GpuVendor::kQualcomm;
-		m_capabilities.m_minSubgroupSize = 64;
-		m_capabilities.m_maxSubgroupSize = 128;
+		m_capabilities.m_minWaveSize = 64;
+		m_capabilities.m_maxWaveSize = 128;
 		break;
 	default:
 		m_capabilities.m_gpuVendor = GpuVendor::kUnknown;
 		// Choose something really low
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 8;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 8;
 	}
 	ANKI_D3D_LOGI("Vendor identified as %s", &kGPUVendorStrings[m_capabilities.m_gpuVendor][0]);
 
@@ -431,13 +431,10 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 			ANKI_D3D_LOGW("ReBAR not supported");
 		}
 
-		m_capabilities.m_uniformBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
-		m_capabilities.m_uniformBufferMaxRange = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * D3D12_STANDARD_VECTOR_SIZE * sizeof(F32);
-		m_capabilities.m_storageBufferBindOffsetAlignment = D3D12_RAW_UAV_SRV_BYTE_ALIGNMENT;
+		m_capabilities.m_constantBufferBindOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
+		m_capabilities.m_structuredBufferBindOffsetAlignment = 0; // Not applicable on D3D
 		m_capabilities.m_structuredBufferNaturalAlignment = true;
-		m_capabilities.m_storageBufferMaxRange = 1 << D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP;
 		m_capabilities.m_texelBufferBindOffsetAlignment = 32;
-		m_capabilities.m_textureBufferMaxRange = kMaxU32; // ?
 		m_capabilities.m_fastConstantsSize = kMaxFastConstantsSize;
 		m_capabilities.m_computeSharedMemorySize = D3D12_CS_TGSM_REGISTER_COUNT * sizeof(F32);
 		m_capabilities.m_accelerationStructureBuildScratchOffsetAlignment = 32; // ?

+ 0 - 1
AnKi/Gr/GrManager.h

@@ -75,7 +75,6 @@ public:
 	[[nodiscard]] ShaderPtr newShader(const ShaderInitInfo& init);
 	[[nodiscard]] ShaderProgramPtr newShaderProgram(const ShaderProgramInitInfo& init);
 	[[nodiscard]] CommandBufferPtr newCommandBuffer(const CommandBufferInitInfo& init);
-	[[nodiscard]] FramebufferPtr newFramebuffer(const FramebufferInitInfo& init);
 	[[nodiscard]] OcclusionQueryPtr newOcclusionQuery();
 	[[nodiscard]] TimestampQueryPtr newTimestampQuery();
 	[[nodiscard]] PipelineQueryPtr newPipelineQuery(const PipelineQueryInitInfo& inf);

+ 14 - 17
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -771,48 +771,45 @@ Error GrManagerImpl::initInstance()
 	{
 	case 0x13B5:
 		m_capabilities.m_gpuVendor = GpuVendor::kArm;
-		m_capabilities.m_minSubgroupSize = 16;
-		m_capabilities.m_maxSubgroupSize = 16;
+		m_capabilities.m_minWaveSize = 16;
+		m_capabilities.m_maxWaveSize = 16;
 		break;
 	case 0x10DE:
 		m_capabilities.m_gpuVendor = GpuVendor::kNvidia;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x1002:
 	case 0x1022:
 		m_capabilities.m_gpuVendor = GpuVendor::kAMD;
-		m_capabilities.m_minSubgroupSize = 32;
-		m_capabilities.m_maxSubgroupSize = 64;
+		m_capabilities.m_minWaveSize = 32;
+		m_capabilities.m_maxWaveSize = 64;
 		break;
 	case 0x8086:
 		m_capabilities.m_gpuVendor = GpuVendor::kIntel;
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 32;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 32;
 		break;
 	case 0x5143:
 		m_capabilities.m_gpuVendor = GpuVendor::kQualcomm;
-		m_capabilities.m_minSubgroupSize = 64;
-		m_capabilities.m_maxSubgroupSize = 128;
+		m_capabilities.m_minWaveSize = 64;
+		m_capabilities.m_maxWaveSize = 128;
 		break;
 	default:
 		m_capabilities.m_gpuVendor = GpuVendor::kUnknown;
 		// Choose something really low
-		m_capabilities.m_minSubgroupSize = 8;
-		m_capabilities.m_maxSubgroupSize = 8;
+		m_capabilities.m_minWaveSize = 8;
+		m_capabilities.m_maxWaveSize = 8;
 	}
 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName, &kGPUVendorStrings[m_capabilities.m_gpuVendor][0]);
 
 	// Set limits
-	m_capabilities.m_uniformBufferBindOffsetAlignment =
+	m_capabilities.m_constantBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minUniformBufferOffsetAlignment));
-	m_capabilities.m_uniformBufferMaxRange = m_devProps.properties.limits.maxUniformBufferRange;
-	m_capabilities.m_storageBufferBindOffsetAlignment =
+	m_capabilities.m_structuredBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minStorageBufferOffsetAlignment));
 	m_capabilities.m_structuredBufferNaturalAlignment = false;
-	m_capabilities.m_storageBufferMaxRange = m_devProps.properties.limits.maxStorageBufferRange;
 	m_capabilities.m_texelBufferBindOffsetAlignment = max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
-	m_capabilities.m_textureBufferMaxRange = kMaxU32;
 	m_capabilities.m_computeSharedMemorySize = m_devProps.properties.limits.maxComputeSharedMemorySize;
 	m_capabilities.m_maxDrawIndirectCount = m_devProps.properties.limits.maxDrawIndirectCount;
 

+ 3 - 1
AnKi/Renderer/AccelerationStructureBuilder.cpp

@@ -51,7 +51,9 @@ void AccelerationStructureBuilder::populateRenderGraph(RenderingContext& ctx)
 	{
 		RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
-		const BufferView scratchBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(m_runCtx.m_tlas->getBuildScratchBufferSize());
+		const BufferView scratchBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(
+			m_runCtx.m_tlas->getBuildScratchBufferSize(),
+			GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		m_runCtx.m_tlasHandle = rgraph.importAccelerationStructure(m_runCtx.m_tlas.get(), AccelerationStructureUsageBit::kNone);
 

+ 4 - 5
AnKi/Renderer/LensFlare.cpp

@@ -75,11 +75,11 @@ void LensFlare::populateRenderGraph(RenderingContext& ctx)
 		cmdb.setFastConstants(&ctx.m_matrices.m_viewProjectionJitter, sizeof(ctx.m_matrices.m_viewProjectionJitter));
 
 		// Write flare info
-		Vec4* flarePositions = allocateAndBindSrvStructuredBuffer<Vec4>(cmdb, 0, 0, flareCount);
+		WeakArray<Vec4> flarePositions = allocateAndBindSrvStructuredBuffer<Vec4>(cmdb, 0, 0, flareCount);
+		U32 count = 0;
 		for(const LensFlareComponent& comp : SceneGraph::getSingleton().getComponentArrays().getLensFlares())
 		{
-			*flarePositions = Vec4(comp.getWorldPosition(), 1.0f);
-			++flarePositions;
+			flarePositions[count++] = Vec4(comp.getWorldPosition(), 1.0f);
 		}
 
 		rgraphCtx.bindUav(0, 0, m_runCtx.m_indirectBuffHandle);
@@ -122,8 +122,7 @@ void LensFlare::runDrawFlares(const RenderingContext& ctx, CommandBuffer& cmdb)
 		U32 spritesCount = max<U32>(1, m_maxSpritesPerFlare);
 
 		// Get uniform memory
-		LensFlareSprite* tmpSprites = allocateAndBindSrvStructuredBuffer<LensFlareSprite>(cmdb, 0, 0, spritesCount);
-		WeakArray<LensFlareSprite> sprites(tmpSprites, spritesCount);
+		WeakArray<LensFlareSprite> sprites = allocateAndBindSrvStructuredBuffer<LensFlareSprite>(cmdb, 0, 0, spritesCount);
 
 		// misc
 		Vec2 posNdc = posClip.xy() / posClip.w();

+ 5 - 4
AnKi/Renderer/PrimaryNonRenderableVisibility.cpp

@@ -81,9 +81,9 @@ void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 		{
 			// No objects, point to a buffer with zeros
 
-			void* mem;
-			RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(sizeof(U32), mem);
-			memset(mem, 0, sizeof(U32));
+			WeakArray<U32> mem;
+			const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, mem);
+			mem[0] = 0;
 
 			m_runCtx.m_visibleIndicesBuffers[type] = alloc;
 			m_runCtx.m_visibleIndicesHandles[type] = rgraph.importBuffer(m_runCtx.m_visibleIndicesBuffers[type], BufferUsageBit::kNone);
@@ -135,7 +135,8 @@ void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 				}
 
 				// Allocate feedback buffer for this frame
-				getRenderer().getReadbackManager().allocateData(m_readbacks[feedbackType], (objCount * 2 + 1) * sizeof(U32), in.m_cpuFeedbackBuffer);
+				in.m_cpuFeedbackBuffer =
+					getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_readbacks[feedbackType], objCount * 2 + 1);
 			}
 
 			GpuVisibilityNonRenderablesOutput out;

+ 1 - 1
AnKi/Renderer/Renderer.cpp

@@ -299,7 +299,7 @@ Error Renderer::populateRenderGraph(RenderingContext& ctx)
 
 	// Allocate global constants
 	GlobalRendererConstants* globalUnis;
-	ctx.m_globalRenderingConstantsBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(1, globalUnis);
+	ctx.m_globalRenderingConstantsBuffer = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(globalUnis);
 
 	// Import RTs first
 	m_downscaleBlur->importRenderTargets(ctx);

+ 11 - 11
AnKi/Renderer/RendererObject.h

@@ -65,30 +65,30 @@ protected:
 	static T* allocateAndBindConstants(CommandBuffer& cmdb, U32 reg, U32 space)
 	{
 		T* ptr;
-		const RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(1, ptr);
+		const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer<T>(ptr);
 		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(ptr)));
 		cmdb.bindConstantBuffer(reg, space, alloc);
 		return ptr;
 	}
 
 	template<typename T>
-	static T* allocateAndBindSrvStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
+	static WeakArray<T> allocateAndBindSrvStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
 	{
-		T* ptr;
-		const RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(count, ptr);
-		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(ptr)));
+		WeakArray<T> out;
+		const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count, out);
+		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(out.getBegin())));
 		cmdb.bindSrv(reg, space, alloc);
-		return ptr;
+		return out;
 	}
 
 	template<typename T>
-	static T* allocateAndBindUavStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
+	static WeakArray<T> allocateAndBindUavStructuredBuffer(CommandBuffer& cmdb, U32 reg, U32 space, U32 count = 1)
 	{
-		T* ptr;
-		const RebarAllocation alloc = RebarTransientMemoryPool::getSingleton().allocateFrame(count, ptr);
-		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(ptr)));
+		WeakArray<T> out;
+		const BufferView alloc = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer<T>(count, out);
+		ANKI_ASSERT(isAligned(alignof(T), ptrToNumber(out.getBegin())));
 		cmdb.bindUav(reg, space, alloc);
-		return ptr;
+		return out;
 	}
 
 	void registerDebugRenderTarget(CString rtName);
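
Returning WeakArray<T> instead of a bare T* lets call sites index instead of bumping a pointer (see the LensFlare.cpp and GpuVisibility.cpp hunks in this commit). A minimal sketch of the WeakArray idea, assuming only the members these call sites rely on:

#include <cassert>
#include <cstdint>

// Non-owning pointer plus element count, with bounds-checked indexing.
template<typename T>
class WeakArray
{
public:
	WeakArray() = default;

	WeakArray(T* mem, uint32_t count)
		: m_mem(mem)
		, m_count(count)
	{
	}

	T& operator[](uint32_t i)
	{
		assert(i < m_count); // the bare T* the old API returned could not check this
		return m_mem[i];
	}

	T* getBegin()
	{
		return m_mem;
	}

	uint32_t getSize() const
	{
		return m_count;
	}

private:
	T* m_mem = nullptr;
	uint32_t m_count = 0;
};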

+ 6 - 6
AnKi/Renderer/RtShadows.cpp

@@ -233,16 +233,16 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 	BufferView sbtBuffer;
 	{
 		// Allocate SBT
-		U8* sbtMem;
-		sbtBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(
-			(GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount() + 2) * m_sbtRecordSize, sbtMem);
+		WeakArray<U32> sbtMem;
+		sbtBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(
+			(GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount() + 2) * m_sbtRecordSize / sizeof(U32), sbtMem);
 		sbtHandle = rgraph.importBuffer(sbtBuffer, BufferUsageBit::kUavCompute);
 
 		// Write the first 2 entries of the SBT
 		ConstWeakArray<U8> shaderGroupHandles = m_rtLibraryGrProg->getShaderGroupHandles();
 		const U32 shaderHandleSize = GrManager::getSingleton().getDeviceCapabilities().m_shaderGroupHandleSize;
-		memcpy(sbtMem, &shaderGroupHandles[m_rayGenShaderGroupIdx * shaderHandleSize], shaderHandleSize);
-		memcpy(sbtMem + m_sbtRecordSize, &shaderGroupHandles[m_missShaderGroupIdx * shaderHandleSize], shaderHandleSize);
+		memcpy(&sbtMem[0], &shaderGroupHandles[m_rayGenShaderGroupIdx * shaderHandleSize], shaderHandleSize);
+		memcpy(&sbtMem[m_sbtRecordSize / sizeof(U32)], &shaderGroupHandles[m_missShaderGroupIdx * shaderHandleSize], shaderHandleSize);
 
 		// Create the pass
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtShadows build SBT");
@@ -304,7 +304,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 			// Allocate, set and bind global uniforms
 			{
 				MaterialGlobalConstants* globalConstants;
-				const RebarAllocation globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateFrame(1, globalConstants);
+				const BufferView globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(globalConstants);
 
 				memset(globalConstants, 0, sizeof(*globalConstants)); // Don't care for now
 

+ 1 - 1
AnKi/Renderer/Utils/Drawer.cpp

@@ -32,7 +32,7 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 	// Allocate, set and bind global uniforms
 	{
 		MaterialGlobalConstants* globalConstants;
-		const RebarAllocation globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateFrame(1, globalConstants);
+		const BufferView globalConstantsToken = RebarTransientMemoryPool::getSingleton().allocateConstantBuffer(globalConstants);
 
 		globalConstants->m_viewProjectionMatrix = args.m_viewProjectionMatrix;
 		globalConstants->m_previousViewProjectionMatrix = args.m_previousViewProjectionMatrix;

+ 20 - 14
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -267,7 +267,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			ANKI_RESOURCE_LOGE("GPU visibility went out of memory: %s", who.cstr());
 		}
 
-		getRenderer().getReadbackManager().allocateData(m_outOfMemoryReadback, sizeof(U32), m_outOfMemoryReadbackBuffer);
+		m_outOfMemoryReadbackBuffer = getRenderer().getReadbackManager().allocateStructuredBuffer<U32>(m_outOfMemoryReadback, 1);
 	}
 
 	// Get some limits
@@ -738,13 +738,12 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 				cmdb.bindSrv(4, 0, stage1Mem.m_counters);
 				cmdb.bindSrv(5, 0, stage1Mem.m_renderablePrefixSums);
 
-				UVec2* firstDrawIndirectArgAndCount =
+				WeakArray<UVec2> firstDrawIndirectArgAndCount =
 					allocateAndBindSrvStructuredBuffer<UVec2>(cmdb, 6, 0, out.m_legacy.m_bucketIndirectArgsRanges.getSize());
 				for(U32 ibucket = 0; ibucket < out.m_legacy.m_bucketIndirectArgsRanges.getSize(); ++ibucket)
 				{
-					firstDrawIndirectArgAndCount->x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
-					firstDrawIndirectArgAndCount->y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
-					++firstDrawIndirectArgAndCount;
+					firstDrawIndirectArgAndCount[ibucket].x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
+					firstDrawIndirectArgAndCount[ibucket].y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
 				}
 
 				cmdb.bindUav(0, 0, stage2Mem.m_legacy.m_instanceRateRenderables);
@@ -936,9 +935,9 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 
 	if(objCount == 0)
 	{
-		U32* count;
-		out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(sizeof(U32), count);
-		*count = 0;
+		WeakArray<U32> count;
+		out.m_visiblesBuffer = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer(1, count);
+		count[0] = 0;
 		out.m_visiblesBufferHandle = rgraph.importBuffer(out.m_visiblesBuffer, BufferUsageBit::kNone);
 
 		return;
@@ -958,16 +957,23 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 		m_counterBufferZeroingHandle = {};
 	}
 
-	constexpr U32 kCountersPerDispatch = 3; // 1 for the threadgroup, 1 for the visbile object count and 1 for objects with feedback
-	const U32 counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment,
-														   U32(kCountersPerDispatch * sizeof(U32)));
+	U32 counterBufferElementSize;
+	if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+	{
+		counterBufferElementSize = sizeof(GpuVisibilityNonRenderablesCounters);
+	}
+	else
+	{
+		counterBufferElementSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment,
+													 U32(sizeof(GpuVisibilityNonRenderablesCounters)));
+	}
+
 	if(!m_counterBuffer.isCreated() || m_counterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
 	{
 		// Counter buffer not created or not big enough, create a new one
 
 		BufferInitInfo buffInit("GpuVisibilityNonRenderablesCounters");
-		buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2
-														: kCountersPerDispatch * counterBufferElementSize * kInitialCounterArraySize;
+		buffInit.m_size = (m_counterBuffer.isCreated()) ? m_counterBuffer->getSize() * 2 : counterBufferElementSize * kInitialCounterArraySize;
 		buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kSrvCompute | BufferUsageBit::kCopyDestination;
 		m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
 
@@ -1051,7 +1057,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 		cmdb.setFastConstants(&consts, sizeof(consts));
 
 		rgraph.bindUav(0, 0, visibleIndicesBuffHandle);
-		cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(U32) * kCountersPerDispatch));
+		cmdb.bindUav(1, 0, BufferView(counterBuffer.get(), counterBufferOffset, sizeof(GpuVisibilityNonRenderablesCounters)));
 
 		if(needsFeedback)
 		{

+ 9 - 1
AnKi/Renderer/Utils/HzbGenerator.cpp

@@ -50,7 +50,15 @@ Error HzbGenerator::init()
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/HzbMaxDepth.ankiprogbin", m_maxDepthProg, m_maxDepthGrProg));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/HzbMaxDepthProject.ankiprogbin", m_maxBoxProg, m_maxBoxGrProg));
 
-	m_counterBufferElementSize = max<U32>(sizeof(U32), GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
+	if(GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferNaturalAlignment)
+	{
+		m_counterBufferElementSize = sizeof(U32);
+	}
+	else
+	{
+		m_counterBufferElementSize = max<U32>(sizeof(U32), GrManager::getSingleton().getDeviceCapabilities().m_structuredBufferBindOffsetAlignment);
+	}
+
 	BufferInitInfo buffInit("HzbCounterBuffer");
 	buffInit.m_size = m_counterBufferElementSize * kCounterBufferElementCount;
 	buffInit.m_usage = BufferUsageBit::kUavCompute | BufferUsageBit::kCopyDestination;
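
Worked numbers for the element-size selection above, assuming for illustration a 256-byte bind-offset alignment; getAlignedRoundUp is the same helper the GpuVisibility.cpp hunk uses, and for a single U32 it agrees with the max<U32>() form here:

#include <cstdint>
#include <cstdio>

uint32_t getAlignedRoundUp(uint32_t alignment, uint32_t value)
{
	return ((value + alignment - 1) / alignment) * alignment;
}

int main()
{
	const uint32_t bindOffsetAlignment = 256; // assumed device limit, for illustration

	// Natural alignment: counters can be packed back to back.
	printf("natural: %u bytes/element\n", uint32_t(sizeof(uint32_t))); // 4

	// No natural alignment: each element must start on a bindable offset.
	printf("padded:  %u bytes/element\n", getAlignedRoundUp(bindOffsetAlignment, uint32_t(sizeof(uint32_t)))); // 256
	return 0;
}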

+ 0 - 25
AnKi/Renderer/Utils/Readback.cpp

@@ -7,31 +7,6 @@
 
 namespace anki {
 
-void ReadbackManager::allocateData(MultiframeReadbackToken& token, PtrSize size, BufferView& buffer) const
-{
-	for([[maybe_unused]] U64 frame : token.m_frameIds)
-	{
-		ANKI_ASSERT(frame != m_frameId && "Can't allocate multiple times in a frame");
-	}
-
-	GpuReadbackMemoryAllocation& allocation = token.m_allocations[token.m_slot];
-
-	if(allocation.isValid() && allocation.getAllocatedSize() != size)
-	{
-		GpuReadbackMemoryPool::getSingleton().deferredFree(allocation);
-	}
-
-	if(!allocation.isValid())
-	{
-		allocation = GpuReadbackMemoryPool::getSingleton().allocate(size);
-	}
-	token.m_frameIds[token.m_slot] = m_frameId;
-
-	buffer = BufferView(&allocation.getBuffer(), allocation.getOffset(), size);
-
-	token.m_slot = (token.m_slot + 1) % kMaxFramesInFlight;
-}
-
 U32 ReadbackManager::findBestSlot(const MultiframeReadbackToken& token) const
 {
 	const U64 earliestFrame = m_frameId - (kMaxFramesInFlight - 1);

+ 27 - 1
AnKi/Renderer/Utils/Readback.h

@@ -56,7 +56,33 @@ public:
 	}
 
 	/// Allocate new data for the following frame. 2nd thing to call in a frame.
-	void allocateData(MultiframeReadbackToken& token, PtrSize size, BufferView& buffer) const;
+	template<typename T>
+	BufferView allocateStructuredBuffer(MultiframeReadbackToken& token, U32 count) const
+	{
+		ANKI_ASSERT(count > 0);
+
+		for([[maybe_unused]] U64 frame : token.m_frameIds)
+		{
+			ANKI_ASSERT(frame != m_frameId && "Can't allocate multiple times in a frame");
+		}
+
+		GpuReadbackMemoryAllocation& allocation = token.m_allocations[token.m_slot];
+
+		if(allocation.isValid() && allocation.getAllocatedSize() != sizeof(T) * count)
+		{
+			GpuReadbackMemoryPool::getSingleton().deferredFree(allocation);
+		}
+
+		if(!allocation.isValid())
+		{
+			allocation = GpuReadbackMemoryPool::getSingleton().allocateStructuredBuffer<T>(count);
+		}
+		token.m_frameIds[token.m_slot] = m_frameId;
+
+		token.m_slot = (token.m_slot + 1) % kMaxFramesInFlight;
+
+		return BufferView(&allocation.getBuffer(), allocation.getOffset(), sizeof(T) * count);
+	}
 
 	/// Last thing to call in a frame.
 	void endFrame(Fence* fence);
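
A standalone sketch of the slot rotation performed above, with the GPU allocation reduced to frame-id bookkeeping (frame ids are assumed to start at 1 so the zero-initialized slots never collide):

#include <array>
#include <cassert>
#include <cstdint>

constexpr uint32_t kMaxFramesInFlight = 3; // matches AnKi/Gr/Common.h

struct Token
{
	std::array<uint64_t, kMaxFramesInFlight> m_frameIds{}; // 0 == never used
	uint32_t m_slot = 0;
};

// Returns the slot to (re)use for this frame's readback allocation.
uint32_t rotateSlot(Token& token, uint64_t frameId)
{
	for([[maybe_unused]] const uint64_t frame : token.m_frameIds)
	{
		assert(frame != frameId && "Can't allocate multiple times in a frame");
	}

	const uint32_t slot = token.m_slot;
	token.m_frameIds[slot] = frameId;
	token.m_slot = (token.m_slot + 1) % kMaxFramesInFlight; // GPU writes this slot, CPU reads an older one
	return slot;
}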

+ 2 - 2
AnKi/Renderer/VrsSriGeneration.cpp

@@ -51,12 +51,12 @@ Error VrsSriGeneration::initInternal()
 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
 
-	if(m_sriTexelDimension == 16 && GrManager::getSingleton().getDeviceCapabilities().m_minSubgroupSize >= 32)
+	if(m_sriTexelDimension == 16 && GrManager::getSingleton().getDeviceCapabilities().m_minWaveSize >= 32)
 	{
 		// Algorithm's workgroup size is 32, GPU's subgroup size is min 32 -> each workgroup has 1 subgroup -> No need for shared mem
 		variantInit.addMutation("SHARED_MEMORY", 0);
 	}
-	else if(m_sriTexelDimension == 8 && GrManager::getSingleton().getDeviceCapabilities().m_minSubgroupSize >= 16)
+	else if(m_sriTexelDimension == 8 && GrManager::getSingleton().getDeviceCapabilities().m_minWaveSize >= 16)
 	{
 		// Algorithm's workgroup size is 16, GPU's subgroup size is min 16 -> each workgroup has 1 subgroup -> No need for shared mem
 		variantInit.addMutation("SHARED_MEMORY", 0);

+ 3 - 3
AnKi/Scene/Components/ParticleEmitterComponent.cpp

@@ -206,7 +206,7 @@ ParticleEmitterComponent::ParticleEmitterComponent(SceneNode* node)
 
 	static_assert(kMeshRelatedVertexStreamFormats[VertexStreamId::kPosition] == Format::kR16G16B16A16_Unorm);
 	WeakArray<U16Vec4> transientPositions;
-	const RebarAllocation positionsAlloc = RebarTransientMemoryPool::getSingleton().allocateFrame(vertCount, transientPositions);
+	const BufferView positionsAlloc = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(vertCount, transientPositions);
 	transientPositions[0] = U16Vec4(0, 0, 0, 0);
 	transientPositions[1] = U16Vec4(kMaxU16, 0, 0, 0);
 	transientPositions[2] = U16Vec4(kMaxU16, kMaxU16, 0, 0);
@@ -214,14 +214,14 @@ ParticleEmitterComponent::ParticleEmitterComponent(SceneNode* node)
 
 	static_assert(kMeshRelatedVertexStreamFormats[VertexStreamId::kUv] == Format::kR32G32_Sfloat);
 	WeakArray<Vec2> transientUvs;
-	const RebarAllocation uvsAlloc = RebarTransientMemoryPool::getSingleton().allocateFrame(vertCount, transientUvs);
+	const BufferView uvsAlloc = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(vertCount, transientUvs);
 	transientUvs[0] = Vec2(0.0f);
 	transientUvs[1] = Vec2(1.0f, 0.0f);
 	transientUvs[2] = Vec2(1.0f, 1.0f);
 	transientUvs[3] = Vec2(0.0f, 1.0f);
 
 	WeakArray<U16> transientIndices;
-	const RebarAllocation indicesAlloc = RebarTransientMemoryPool::getSingleton().allocateFrame(indexCount, transientIndices);
+	const BufferView indicesAlloc = RebarTransientMemoryPool::getSingleton().allocateCopyBuffer(indexCount, transientIndices);
 	transientIndices[0] = 0;
 	transientIndices[1] = 1;
 	transientIndices[2] = 3;

+ 1 - 2
AnKi/Scene/GpuSceneArray.inl.h

@@ -13,8 +13,7 @@ template<typename TGpuSceneObject, U32 kId>
 GpuSceneArray<TGpuSceneObject, kId>::GpuSceneArray(U32 maxArraySize)
 {
 	maxArraySize = getAlignedRoundUp(sizeof(SubMask), maxArraySize);
-	const U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment;
-	m_gpuSceneAllocation = GpuSceneBuffer::getSingleton().allocate(sizeof(TGpuSceneObject) * maxArraySize, alignment);
+	m_gpuSceneAllocation = GpuSceneBuffer::getSingleton().allocateStructuredBuffer<TGpuSceneObject>(maxArraySize);
 
 	m_inUseIndicesMask.resize(maxArraySize / sizeof(SubMask), false);
 	ANKI_ASSERT(m_inUseIndicesCount == 0);

+ 1 - 1
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -433,7 +433,7 @@ main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, out vertices
 
 			primitives[idx].m_constantsOffset = constantsOffset;
 #	if VISUALIZE_MESHLETS
-			primitives[idx].m_meshletIndex = relativeMeshletIdx;
+			primitives[idx].m_meshletIndex = instance.m_meshletGeometryDescriptorIndex;
 #	endif
 		}
 	}

+ 16 - 20
AnKi/Shaders/GpuVisibilityNonRenderables.ankiprog

@@ -34,10 +34,7 @@ RWStructuredBuffer<U32> g_visibleIndices : register(u0); // 1st element is the c
 
 ANKI_FAST_CONSTANTS(GpuVisibilityNonRenderableConstants, g_consts)
 
-constexpr U32 kVisibleObjCounterIdx = 1;
-constexpr U32 kThreadgroupCounterIdx = 0;
-constexpr U32 kFeedbackCounterIdx = 2;
-globallycoherent RWStructuredBuffer<U32> g_counterBuffer : register(u1); // 2 counters per dispatch with an optional 3rd for feedback
+RWStructuredBuffer<GpuVisibilityNonRenderablesCounters> g_counterBuffer : register(u1);
 
 #if CPU_FEEDBACK
 // 1st element is a count. What follows is an array of (UUID, array index) pairs.
@@ -104,7 +101,7 @@ Vec4 getSphere(GpuSceneGlobalIlluminationProbe l)
 	if(!skip)
 	{
 		U32 idx;
-		InterlockedAdd(g_counterBuffer[kVisibleObjCounterIdx], 1, idx);
+		InterlockedAdd(g_counterBuffer[0].m_visibleObjectCount, 1, idx);
 
 		g_visibleIndices[idx + 1] = svDispatchThreadId;
 	}
@@ -115,36 +112,35 @@ Vec4 getSphere(GpuSceneGlobalIlluminationProbe l)
 	if(!skip && g_objects[svDispatchThreadId].m_uuid != 0)
 	{
 		U32 idx;
-		InterlockedAdd(g_counterBuffer[kFeedbackCounterIdx], 1, idx);
+		InterlockedAdd(g_counterBuffer[0].m_feedbackObjectCount, 1, idx);
 
 		g_cpuFeedbackBuffer[idx * 2 + 1] = g_objects[svDispatchThreadId].m_uuid;
 		g_cpuFeedbackBuffer[idx * 2 + 2] = g_objects[svDispatchThreadId].m_componentArrayIndex;
 	}
 #endif
 
+	// Sync to make sure all the atomic ops have finished before the following code reads them
+	AllMemoryBarrierWithGroupSync();
+
 	// Store the counters to the actual buffers
 	//
-	Bool lastThreadgroupExecuting = false;
 	if(svGroupIndex == 0)
 	{
 		U32 threadgroupIdx;
-		InterlockedAdd(g_counterBuffer[kThreadgroupCounterIdx], 1, threadgroupIdx);
+		InterlockedAdd(g_counterBuffer[0].m_threadgroupCount, 1, threadgroupIdx);
 		const U32 threadgroupCount = (objectCount + NUMTHREADS - 1) / NUMTHREADS;
-		lastThreadgroupExecuting = (threadgroupIdx + 1 == threadgroupCount);
-	}
+		const Bool lastThreadgroupExecuting = (threadgroupIdx + 1 == threadgroupCount);
 
-	// Sync to make sure all the atomic ops have finished before the following code reads them
-	AllMemoryBarrierWithGroupSync();
-
-	if(lastThreadgroupExecuting)
-	{
-		g_visibleIndices[0] = g_counterBuffer[kVisibleObjCounterIdx];
-		g_counterBuffer[kVisibleObjCounterIdx] = 0;
+		if(lastThreadgroupExecuting)
+		{
+			g_visibleIndices[0] = g_counterBuffer[0].m_visibleObjectCount;
+			g_counterBuffer[0].m_visibleObjectCount = 0;
 #if CPU_FEEDBACK
-		g_cpuFeedbackBuffer[0] = g_counterBuffer[kFeedbackCounterIdx];
-		g_counterBuffer[kFeedbackCounterIdx] = 0;
+			g_cpuFeedbackBuffer[0] = g_counterBuffer[0].m_feedbackObjectCount;
+			g_counterBuffer[0].m_feedbackObjectCount = 0;
 #endif
-		g_counterBuffer[kThreadgroupCounterIdx] = 0;
+			g_counterBuffer[0].m_threadgroupCount = 0;
+		}
 	}
 }
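
The restructured epilogue above makes the last threadgroup to finish publish the counts and zero the counters, so the buffer needs no CPU-side clear between dispatches. A host-side C++ sketch of that pattern, with one std::thread standing in for one threadgroup and sequentially consistent atomics standing in for the shader's barriers (illustrative only, not the HLSL):

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

struct Counters
{
	std::atomic<uint32_t> m_threadgroupCount{0};
	std::atomic<uint32_t> m_visibleObjectCount{0};
};

// Every "threadgroup" bumps a counter when it finishes; whichever sees the
// final value publishes the result and zeroes the counters for the next run.
void groupMain(Counters& c, uint32_t groupCount, std::atomic<uint32_t>& published)
{
	c.m_visibleObjectCount.fetch_add(1); // stand-in for the per-group visibility work

	const uint32_t idx = c.m_threadgroupCount.fetch_add(1);
	if(idx + 1 == groupCount) // last threadgroup executing
	{
		published.store(c.m_visibleObjectCount.load());
		c.m_visibleObjectCount.store(0);
		c.m_threadgroupCount.store(0);
	}
}

int main()
{
	Counters c;
	std::atomic<uint32_t> published{0};
	const uint32_t groupCount = 8;

	std::vector<std::thread> groups;
	for(uint32_t i = 0; i < groupCount; ++i)
	{
		groups.emplace_back(groupMain, std::ref(c), groupCount, std::ref(published));
	}
	for(std::thread& t : groups)
	{
		t.join();
	}

	return (published.load() == groupCount) ? 0 : 1;
}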
 

+ 8 - 0
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -96,4 +96,12 @@ enum class GpuVisibilityIndirectDispatches : U32
 	kCount
 };
 
+/// Counters used in non-renderables visibility
+class GpuVisibilityNonRenderablesCounters
+{
+	U32 m_threadgroupCount; ///< Counts the number of threadgroups
+	U32 m_visibleObjectCount; ///< Counts the visible objects
+	U32 m_feedbackObjectCount; ///< Counts the visible objects that need feedback
+};
+
 ANKI_END_NAMESPACE

+ 3 - 3
AnKi/Ui/Canvas.cpp

@@ -190,7 +190,7 @@ void Canvas::appendToCommandBufferInternal(CommandBuffer& cmdb)
 	ImDrawData& drawData = *ImGui::GetDrawData();
 
 	// Allocate index and vertex buffers
-	RebarAllocation vertsToken, indicesToken;
+	BufferView vertsToken, indicesToken;
 	{
 		if(drawData.TotalVtxCount == 0 || drawData.TotalIdxCount == 0)
 		{
@@ -198,9 +198,9 @@ void Canvas::appendToCommandBufferInternal(CommandBuffer& cmdb)
 		}
 
 		ImDrawVert* verts;
-		vertsToken = RebarTransientMemoryPool::getSingleton().allocateFrame(drawData.TotalVtxCount, verts);
+		vertsToken = RebarTransientMemoryPool::getSingleton().allocate(drawData.TotalVtxCount * sizeof(ImDrawVert), sizeof(F32), verts);
 		ImDrawIdx* indices;
-		indicesToken = RebarTransientMemoryPool::getSingleton().allocateFrame(drawData.TotalIdxCount, indices);
+		indicesToken = RebarTransientMemoryPool::getSingleton().allocate(drawData.TotalIdxCount * sizeof(ImDrawIdx), sizeof(ImDrawIdx), indices);
 
 		for(I n = 0; n < drawData.CmdListsCount; ++n)
 		{

+ 1 - 0
AnKi/Util/StackAllocatorBuilder.inl.h

@@ -41,6 +41,7 @@ template<typename TChunk, typename TInterface, typename TLock>
 Error StackAllocatorBuilder<TChunk, TInterface, TLock>::allocate(PtrSize size, PtrSize alignment, TChunk*& chunk, PtrSize& offset)
 {
 	ANKI_ASSERT(size > 0);
+	ANKI_ASSERT(alignment > 0);
 	size += alignment;
 
 	chunk = nullptr;

+ 1 - 1
Sandbox/Main.cpp

@@ -342,7 +342,7 @@ Error MyApp::userMainLoop(Bool& quit, Second elapsedTime)
 
 	if(in.getKey(KeyCode::kU) == 1)
 	{
-		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "Ssr") ? "" : "Ssr");
+		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "GBufferAlbedo") ? "" : "GBufferAlbedo");
 	}
 
 	if(in.getKey(KeyCode::kI) == 1)