Browse Source

Revise how meshlet memory allocation works

Panagiotis Christopoulos Charitos 1 year ago
parent
commit
bf74976c02

+ 5 - 0
AnKi/Gr/Common.h

@@ -1021,6 +1021,11 @@ public:
 	Buffer* m_buffer = nullptr;
 	PtrSize m_offset = kMaxPtrSize;
 	PtrSize m_range = 0;
+
+	Bool isValid() const
+	{
+		return m_buffer != nullptr && m_offset < kMaxPtrSize && m_range > 0;
+	}
 };
 
 /// Compute max number of mipmaps for a 2D texture.

+ 3 - 0
AnKi/Gr/Vulkan/DescriptorSet.h

@@ -243,6 +243,7 @@ public:
 
 	void bindConstantBuffer(U32 binding, U32 arrayIdx, const Buffer* buff, PtrSize offset, PtrSize range)
 	{
+		ANKI_ASSERT(range > 0);
 		AnyBinding& b = getBindingToPopulate(binding, arrayIdx);
 		b = {};
 		b.m_type = DescriptorType::kUniformBuffer;
@@ -258,6 +259,7 @@ public:
 
 	void bindUavBuffer(U32 binding, U32 arrayIdx, const Buffer* buff, PtrSize offset, PtrSize range)
 	{
+		ANKI_ASSERT(range > 0);
 		AnyBinding& b = getBindingToPopulate(binding, arrayIdx);
 		b = {};
 		b.m_type = DescriptorType::kStorageBuffer;
@@ -273,6 +275,7 @@ public:
 
 	void bindReadOnlyTextureBuffer(U32 binding, U32 arrayIdx, const Buffer* buff, PtrSize offset, PtrSize range, Format fmt)
 	{
+		ANKI_ASSERT(range > 0);
 		const VkBufferView view = static_cast<const BufferImpl*>(buff)->getOrCreateBufferView(fmt, offset, range);
 		AnyBinding& b = getBindingToPopulate(binding, arrayIdx);
 		b = {};

+ 5 - 0
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -300,6 +300,11 @@ Error GrManagerImpl::initInstance()
 		disabledValidationFeatures.emplaceBack(VK_VALIDATION_FEATURE_DISABLE_ALL_EXT);
 	}
 
+	if(g_validationCVar.get())
+	{
+		enabledValidationFeatures.emplaceBack(VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT);
+	}
+
 	VkValidationFeaturesEXT validationFeatures = {};
 	if(enabledValidationFeatures.getSize() || disabledValidationFeatures.getSize())
 	{

+ 3 - 5
AnKi/Renderer/ForwardShading.cpp

@@ -49,13 +49,11 @@ void ForwardShading::populateRenderGraph(RenderingContext& ctx)
 		meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
 		meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
 		meshIn.m_viewportSize = getRenderer().getInternalResolution();
-		meshIn.m_taskShaderIndirectArgsBuffer = m_runCtx.m_visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-		meshIn.m_taskShaderPayloadBuffer = m_runCtx.m_visOut.m_mesh.m_taskShaderPayloadBuffer;
-		meshIn.m_dependency = m_runCtx.m_visOut.m_dependency;
 		meshIn.m_rgraph = &rgraph;
 		meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
+		meshIn.fillBuffers(m_runCtx.m_visOut);
 
-		getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, m_runCtx.m_meshVisOut);
+		getRenderer().getGpuVisibility().populateRenderGraph(meshIn, m_runCtx.m_meshVisOut);
 	}
 }
 
@@ -99,7 +97,7 @@ void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgr
 		args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
 		args.m_renderingTechinuqe = RenderingTechnique::kForward;
 		args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
-		args.fillMdi(m_runCtx.m_visOut);
+		args.fill(m_runCtx.m_visOut);
 
 		if(m_runCtx.m_meshVisOut.isFilled())
 		{

+ 3 - 5
AnKi/Renderer/GBuffer.cpp

@@ -159,13 +159,11 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 			meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
 			meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
 			meshIn.m_viewportSize = getRenderer().getInternalResolution();
-			meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-			meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
-			meshIn.m_dependency = visOut.m_dependency;
 			meshIn.m_rgraph = &rgraph;
 			meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
+			meshIn.fillBuffers(visOut);
 
-			getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+			getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
 		}
 	}
 
@@ -241,7 +239,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 			args.m_hzbTexture = hzbView.get();
 		}
 
-		args.fillMdi(visOut);
+		args.fill(visOut);
 		if(meshletVisOut.isFilled())
 		{
 			args.fill(meshletVisOut);

+ 6 - 10
AnKi/Renderer/IndirectDiffuseProbes.cpp

@@ -230,12 +230,10 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
 					meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
 					meshIn.m_viewportSize = UVec2(m_tileSize);
-					meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-					meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
-					meshIn.m_dependency = visOut.m_dependency;
 					meshIn.m_rgraph = &rgraph;
+					meshIn.fillBuffers(visOut);
 
-					getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+					getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
 				}
 			}
 
@@ -282,7 +280,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
 					args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
 					args.m_viewport = UVec4(0, 0, m_tileSize, m_tileSize);
-					args.fillMdi(visOut);
+					args.fill(visOut);
 
 					if(meshletVisOut.isFilled())
 					{
@@ -330,12 +328,10 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
 					meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
 					meshIn.m_viewportSize = visIn.m_viewportSize;
-					meshIn.m_taskShaderIndirectArgsBuffer = shadowVisOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-					meshIn.m_taskShaderPayloadBuffer = shadowVisOut.m_mesh.m_taskShaderPayloadBuffer;
-					meshIn.m_dependency = shadowVisOut.m_dependency;
 					meshIn.m_rgraph = &rgraph;
+					meshIn.fillBuffers(shadowVisOut);
 
-					getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
+					getRenderer().getGpuVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
 				}
 			}
 
@@ -367,7 +363,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
 					args.m_renderingTechinuqe = RenderingTechnique::kDepth;
 					args.m_viewport = UVec4(0, 0, rez, rez);
-					args.fillMdi(shadowVisOut);
+					args.fill(shadowVisOut);
 
 					if(shadowMeshletVisOut.isFilled())
 					{

+ 6 - 10
AnKi/Renderer/ProbeReflections.cpp

@@ -226,12 +226,10 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
 				meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
 				meshIn.m_viewportSize = UVec2(m_gbuffer.m_tileSize);
-				meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-				meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
-				meshIn.m_dependency = visOut.m_dependency;
 				meshIn.m_rgraph = &rgraph;
+				meshIn.fillBuffers(visOut);
 
-				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
 			}
 		}
 
@@ -277,7 +275,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
 				args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
 				args.m_viewport = UVec4(0, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
-				args.fillMdi(visOut);
+				args.fill(visOut);
 
 				if(meshletVisOut.isFilled())
 				{
@@ -325,12 +323,10 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
 				meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
 				meshIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
-				meshIn.m_taskShaderIndirectArgsBuffer = shadowVisOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-				meshIn.m_taskShaderPayloadBuffer = shadowVisOut.m_mesh.m_taskShaderPayloadBuffer;
-				meshIn.m_dependency = shadowVisOut.m_dependency;
 				meshIn.m_rgraph = &rgraph;
+				meshIn.fillBuffers(shadowVisOut);
 
-				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
+				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
 			}
 		}
 
@@ -362,7 +358,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAniso.get();
 				args.m_renderingTechinuqe = RenderingTechnique::kDepth;
 				args.m_viewport = UVec4(0, 0, rez, rez);
-				args.fillMdi(shadowVisOut);
+				args.fill(shadowVisOut);
 
 				if(shadowMeshletVisOut.isFilled())
 				{

+ 0 - 1
AnKi/Renderer/Renderer.cpp

@@ -245,7 +245,6 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 	}
 
 	ANKI_CHECK(m_visibility.init());
-	ANKI_CHECK(m_visibilityMeshlets.init());
 	ANKI_CHECK(m_nonRenderablesVisibility.init());
 	ANKI_CHECK(m_asVisibility.init());
 	ANKI_CHECK(m_hzbGenerator.init());

+ 0 - 6
AnKi/Renderer/Renderer.h

@@ -110,11 +110,6 @@ public:
 		return m_visibility;
 	}
 
-	GpuMeshletVisibility& getGpuMeshletVisibility()
-	{
-		return m_visibilityMeshlets;
-	}
-
 	Bool runSoftwareMeshletRendering() const
 	{
 		return g_meshletRenderingCVar.get() && !GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
@@ -236,7 +231,6 @@ private:
 
 	RenderableDrawer m_sceneDrawer;
 	GpuVisibility m_visibility;
-	GpuMeshletVisibility m_visibilityMeshlets;
 	GpuVisibilityNonRenderables m_nonRenderablesVisibility;
 	GpuVisibilityAccelerationStructures m_asVisibility;
 	HzbGenerator m_hzbGenerator;

+ 7 - 13
AnKi/Renderer/ShadowMapping.cpp

@@ -398,12 +398,10 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 					meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
 					meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
 					meshIn.m_viewportSize = atlasViewports[face].zw();
-					meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-					meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
-					meshIn.m_dependency = visOut.m_dependency;
 					meshIn.m_rgraph = &rgraph;
+					meshIn.fillBuffers(visOut);
 
-					getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+					getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
 				}
 
 				createDrawShadowsPass(atlasViewports[face], frustum.getViewProjectionMatrix(), frustum.getViewMatrix(), visOut, meshletVisOut,
@@ -478,12 +476,10 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 				meshIn.m_viewProjectionMatrix = lightc->getSpotLightViewProjectionMatrix();
 				meshIn.m_cameraTransform = lightc->getSpotLightViewMatrix().getInverseTransformation();
 				meshIn.m_viewportSize = atlasViewport.zw();
-				meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-				meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
-				meshIn.m_dependency = visOut.m_dependency;
 				meshIn.m_rgraph = &rgraph;
+				meshIn.fillBuffers(visOut);
 
-				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
 			}
 
 			// Add draw pass
@@ -566,12 +562,10 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 				meshIn.m_viewProjectionMatrix = cascadeViewProjMats[cascade];
 				meshIn.m_cameraTransform = cascadeViewMats[cascade].getInverseTransformation();
 				meshIn.m_viewportSize = dirLightAtlasViewports[cascade].zw();
-				meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-				meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
-				meshIn.m_dependency = visOut.m_dependency;
 				meshIn.m_rgraph = &rgraph;
+				meshIn.fillBuffers(visOut);
 
-				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
 			}
 
 			// Draw
@@ -662,7 +656,7 @@ void ShadowMapping::createDrawShadowsPass(const UVec4& viewport, const Mat4& vie
 		args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care
 		args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
 		args.m_viewport = UVec4(viewport[0], viewport[1], viewport[2], viewport[3]);
-		args.fillMdi(visOut);
+		args.fill(visOut);
 
 		TextureViewPtr hzbView;
 		if(hzbRt.isValid())

+ 56 - 163
AnKi/Renderer/Utils/Drawer.cpp

@@ -63,7 +63,10 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 					   UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kMeshletGeometryDescriptors),
 					   UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
-	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kTaskShaderPayloads), args.m_mesh.m_taskShaderPayloadsBuffer);
+	if(args.m_mesh.m_meshletGroupInstancesBuffer.m_range)
+	{
+		cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kMeshletGroups), args.m_mesh.m_meshletGroupInstancesBuffer);
+	}
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kRenderables),
 					   GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kMeshLods), GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
@@ -102,188 +105,78 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 	const Bool meshShaderHwSupport = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
 
-	// Gather the drawcalls
-	class Command
-	{
-	public:
-		class LegacyDraw
-		{
-		public:
-			Buffer* m_drawIndirectArgsBuffer;
-			PtrSize m_drawIndirectArgsBufferOffset;
-			Buffer* m_mdiDrawCountsBuffer;
-			PtrSize m_mdiDrawCountsBufferOffset;
-			Buffer* m_instancesBuffer;
-			PtrSize m_instancesBufferOffset;
-			U32 m_maxDrawCount;
-			PrimitiveTopology m_primitiveTopology;
-		};
-
-		class MeshDraw
-		{
-		public:
-			U32 m_firstPayload;
-			Buffer* m_taskShaderIndirectArgsBuffer;
-			PtrSize m_taskShaderIndirectArgsBufferOffset;
-		};
-
-		class SwMeshDraw
-		{
-		public:
-			Buffer* m_drawIndirectArgsBuffer;
-			PtrSize m_drawIndirectArgsBufferOffset;
-			Buffer* m_instancesBuffer;
-			PtrSize m_instancesBufferOffset;
-		};
-
-		union
-		{
-			LegacyDraw m_legacyDraw;
-			MeshDraw m_meshDraw;
-			SwMeshDraw m_swMeshDraw;
-		};
-
-		ShaderProgram* m_program;
-		U64 m_shaderBinarySize;
-		U8 m_drawType;
-		Bool m_hasDiscard;
-	};
-
-	Array<Command, 16> commands;
-	U32 commandCount = 0;
-
-	U32 allUserCount = 0;
-	U32 bucketCount = 0;
-	U32 allMeshletGroupCount = 0;
-	U32 legacyGeometryFlowUserCount = 0;
-	PtrSize meshletInstancesBufferOffset = 0;
-	RenderStateBucketContainer::getSingleton().iterateBuckets(
-		args.m_renderingTechinuqe, [&](const RenderStateInfo& state, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
+	cmdb.setVertexAttribute(0, 0, Format::kR32G32B32A32_Uint, 0);
+
+	RenderStateBucketContainer::getSingleton().iterateBucketsPerformanceOrder(
+		args.m_renderingTechinuqe,
+		[&](const RenderStateInfo& state, U32 bucketIdx, U32 userCount, U32 meshletGroupCount, [[maybe_unused]] U32 meshletCount) {
 			if(userCount == 0)
 			{
-				++bucketCount;
 				return;
 			}
 
-			Command& cmd = commands[commandCount++];
-
-			cmd.m_program = state.m_program.get();
-			cmd.m_shaderBinarySize = U64(state.m_program->getShaderBinarySize(ShaderType::kFragment)) << 32u;
-			cmd.m_hasDiscard = state.m_program->hasDiscard();
+			cmdb.bindShaderProgram(state.m_program.get());
 
 			const Bool meshlets = meshletGroupCount > 0;
 
 			if(meshlets && meshShaderHwSupport)
 			{
-				cmd.m_drawType = 2;
-				cmd.m_shaderBinarySize |= state.m_program->getShaderBinarySize(ShaderType::kMesh);
-
-				cmd.m_meshDraw.m_firstPayload = allMeshletGroupCount;
-				cmd.m_meshDraw.m_taskShaderIndirectArgsBuffer = args.m_mesh.m_taskShaderIndirectArgsBuffer.m_buffer;
-				cmd.m_meshDraw.m_taskShaderIndirectArgsBufferOffset =
-					args.m_mesh.m_taskShaderIndirectArgsBuffer.m_offset + sizeof(DispatchIndirectArgs) * bucketCount;
+				const UVec4 firstPayload(args.m_mesh.m_bucketMeshletGroupInstanceRanges[bucketIdx].getFirstInstance());
+				cmdb.setPushConstants(&firstPayload, sizeof(firstPayload));
 
-				allMeshletGroupCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
+				cmdb.drawMeshTasksIndirect(args.m_mesh.m_taskShaderIndirectArgsBuffer.m_buffer,
+										   args.m_mesh.m_taskShaderIndirectArgsBuffer.m_offset + sizeof(DispatchIndirectArgs) * bucketIdx);
 			}
 			else if(meshlets)
 			{
-				cmd.m_drawType = 3;
-				cmd.m_shaderBinarySize |= state.m_program->getShaderBinarySize(ShaderType::kVertex);
-
-				cmd.m_swMeshDraw.m_drawIndirectArgsBuffer = args.m_softwareMesh.m_drawIndirectArgsBuffer.m_buffer;
-				cmd.m_swMeshDraw.m_drawIndirectArgsBufferOffset =
-					args.m_softwareMesh.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketCount;
-
-				cmd.m_swMeshDraw.m_instancesBuffer = args.m_softwareMesh.m_meshletInstancesBuffer.m_buffer;
-				cmd.m_swMeshDraw.m_instancesBufferOffset = args.m_softwareMesh.m_meshletInstancesBuffer.m_offset + meshletInstancesBufferOffset;
+				cmdb.bindVertexBuffer(0, args.m_softwareMesh.m_meshletInstancesBuffer.m_buffer,
+									  args.m_softwareMesh.m_meshletInstancesBuffer.m_offset
+										  + args.m_softwareMesh.m_bucketMeshletInstanceRanges[bucketIdx].getFirstInstance()
+												* sizeof(GpuSceneMeshletInstance),
+									  sizeof(GpuSceneMeshletInstance), VertexStepRate::kInstance);
+
+				cmdb.drawIndirect(PrimitiveTopology::kTriangles, 1,
+								  args.m_softwareMesh.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketIdx,
+								  args.m_softwareMesh.m_drawIndirectArgsBuffer.m_buffer);
 			}
 			else
 			{
-				const U32 maxDrawCount = userCount;
-
-				cmd.m_drawType = (state.m_indexedDrawcall) ? 0 : 1;
-				cmd.m_shaderBinarySize |= state.m_program->getShaderBinarySize(ShaderType::kVertex);
-
-				cmd.m_legacyDraw.m_primitiveTopology = state.m_primitiveTopology;
-				cmd.m_legacyDraw.m_drawIndirectArgsBuffer = args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_buffer;
-				cmd.m_legacyDraw.m_drawIndirectArgsBufferOffset =
-					args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_offset + sizeof(DrawIndexedIndirectArgs) * legacyGeometryFlowUserCount;
-				cmd.m_legacyDraw.m_maxDrawCount = maxDrawCount;
-				cmd.m_legacyDraw.m_mdiDrawCountsBuffer = args.m_legacy.m_mdiDrawCountsBuffer.m_buffer;
-				cmd.m_legacyDraw.m_mdiDrawCountsBufferOffset = args.m_legacy.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount;
-				cmd.m_legacyDraw.m_instancesBuffer = args.m_legacy.m_renderableInstancesBuffer.m_buffer;
-				cmd.m_legacyDraw.m_instancesBufferOffset =
-					args.m_legacy.m_renderableInstancesBuffer.m_offset + legacyGeometryFlowUserCount * sizeof(GpuSceneRenderableInstance);
-
-				legacyGeometryFlowUserCount += userCount;
+				const U32 maxDrawCount = args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx].getInstanceCount();
+
+				if(state.m_indexedDrawcall)
+				{
+					cmdb.bindVertexBuffer(0, args.m_legacy.m_renderableInstancesBuffer.m_buffer,
+										  args.m_legacy.m_renderableInstancesBuffer.m_offset
+											  + args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx].getFirstInstance()
+													* sizeof(GpuSceneRenderableInstance),
+										  sizeof(GpuSceneRenderableInstance), VertexStepRate::kInstance);
+
+					cmdb.drawIndexedIndirectCount(state.m_primitiveTopology, args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_buffer,
+												  args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_offset
+													  + args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx].getFirstInstance()
+															* sizeof(DrawIndexedIndirectArgs),
+												  sizeof(DrawIndexedIndirectArgs), args.m_legacy.m_mdiDrawCountsBuffer.m_buffer,
+												  args.m_legacy.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketIdx, maxDrawCount);
+				}
+				else
+				{
+					cmdb.bindVertexBuffer(0, args.m_legacy.m_renderableInstancesBuffer.m_buffer,
+										  args.m_legacy.m_renderableInstancesBuffer.m_offset
+											  + args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx].getFirstInstance()
+													* sizeof(GpuSceneRenderableInstance),
+										  sizeof(GpuSceneRenderableInstance), VertexStepRate::kInstance);
+
+					// Yes, the DrawIndexedIndirectArgs is intentional
+					cmdb.drawIndirectCount(state.m_primitiveTopology, args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_buffer,
+										   args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_offset
+											   + args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx].getFirstInstance()
+													 * sizeof(DrawIndexedIndirectArgs),
+										   sizeof(DrawIndexedIndirectArgs), args.m_legacy.m_mdiDrawCountsBuffer.m_buffer,
+										   args.m_legacy.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketIdx, maxDrawCount);
+				}
 			}
-
-			++bucketCount;
-			allUserCount += userCount;
-			meshletInstancesBufferOffset += sizeof(GpuSceneMeshletInstance) * min(meshletCount, kMaxVisibleMeshletsPerRenderStateBucket);
 		});
 
-	ANKI_ASSERT(bucketCount == RenderStateBucketContainer::getSingleton().getBucketCount(args.m_renderingTechinuqe));
-
-	// Sort the drawcalls from the least expensive to the most expensive, leave alpha tested at the end
-	std::sort(&commands[0], &commands[0] + commandCount, [](const Command& a, const Command& b) {
-		if(a.m_hasDiscard != b.m_hasDiscard)
-		{
-			return !a.m_hasDiscard;
-		}
-		else
-		{
-			return a.m_shaderBinarySize < b.m_shaderBinarySize;
-		}
-	});
-
-	cmdb.setVertexAttribute(0, 0, Format::kR32G32B32A32_Uint, 0);
-
-	// Now draw
-	for(const Command* it = commands.getBegin(); it < commands.getBegin() + commandCount; ++it)
-	{
-		cmdb.bindShaderProgram(it->m_program);
-
-		if(it->m_drawType == 0)
-		{
-			cmdb.bindVertexBuffer(0, it->m_legacyDraw.m_instancesBuffer, it->m_legacyDraw.m_instancesBufferOffset, sizeof(GpuSceneRenderableInstance),
-								  VertexStepRate::kInstance);
-
-			cmdb.drawIndexedIndirectCount(it->m_legacyDraw.m_primitiveTopology, it->m_legacyDraw.m_drawIndirectArgsBuffer,
-										  it->m_legacyDraw.m_drawIndirectArgsBufferOffset, sizeof(DrawIndexedIndirectArgs),
-										  it->m_legacyDraw.m_mdiDrawCountsBuffer, it->m_legacyDraw.m_mdiDrawCountsBufferOffset,
-										  it->m_legacyDraw.m_maxDrawCount);
-		}
-		else if(it->m_drawType == 1)
-		{
-			cmdb.bindVertexBuffer(0, it->m_legacyDraw.m_instancesBuffer, it->m_legacyDraw.m_instancesBufferOffset, sizeof(GpuSceneRenderableInstance),
-								  VertexStepRate::kInstance);
-
-			// Yes, the DrawIndexedIndirectArgs is intentional
-			cmdb.drawIndirectCount(it->m_legacyDraw.m_primitiveTopology, it->m_legacyDraw.m_drawIndirectArgsBuffer,
-								   it->m_legacyDraw.m_drawIndirectArgsBufferOffset, sizeof(DrawIndexedIndirectArgs),
-								   it->m_legacyDraw.m_mdiDrawCountsBuffer, it->m_legacyDraw.m_mdiDrawCountsBufferOffset,
-								   it->m_legacyDraw.m_maxDrawCount);
-		}
-		else if(it->m_drawType == 2)
-		{
-			const UVec4 firstPayload(it->m_meshDraw.m_firstPayload);
-			cmdb.setPushConstants(&firstPayload, sizeof(firstPayload));
-
-			cmdb.drawMeshTasksIndirect(it->m_meshDraw.m_taskShaderIndirectArgsBuffer, it->m_meshDraw.m_taskShaderIndirectArgsBufferOffset);
-		}
-		else
-		{
-			ANKI_ASSERT(it->m_drawType == 3);
-
-			cmdb.bindVertexBuffer(0, it->m_swMeshDraw.m_instancesBuffer, it->m_swMeshDraw.m_instancesBufferOffset, sizeof(GpuSceneMeshletInstance),
-								  VertexStepRate::kInstance);
-
-			cmdb.drawIndirect(PrimitiveTopology::kTriangles, 1, it->m_swMeshDraw.m_drawIndirectArgsBufferOffset,
-							  it->m_swMeshDraw.m_drawIndirectArgsBuffer);
-		}
-	}
-
 #if ANKI_STATS_ENABLED
 	if(pplineQuery.isCreated())
 	{

+ 14 - 5
AnKi/Renderer/Utils/Drawer.h

@@ -36,15 +36,19 @@ public:
 	{
 	public:
 		BufferOffsetRange m_mdiDrawCountsBuffer;
-		BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
 		BufferOffsetRange m_renderableInstancesBuffer;
+		BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
+
+		ConstWeakArray<InstanceRange> m_bucketRenderableInstanceRanges;
 	} m_legacy; ///< Legacy vertex flow
 
 	class
 	{
 	public:
 		BufferOffsetRange m_taskShaderIndirectArgsBuffer;
-		BufferOffsetRange m_taskShaderPayloadsBuffer;
+		BufferOffsetRange m_meshletGroupInstancesBuffer;
+
+		ConstWeakArray<InstanceRange> m_bucketMeshletGroupInstanceRanges;
 	} m_mesh;
 
 	class
@@ -52,21 +56,26 @@ public:
 	public:
 		BufferOffsetRange m_meshletInstancesBuffer;
 		BufferOffsetRange m_drawIndirectArgsBuffer;
+
+		ConstWeakArray<InstanceRange> m_bucketMeshletInstanceRanges;
 	} m_softwareMesh;
 
-	void fillMdi(const GpuVisibilityOutput& visOut)
+	void fill(const GpuVisibilityOutput& visOut)
 	{
 		m_legacy.m_mdiDrawCountsBuffer = visOut.m_legacy.m_mdiDrawCountsBuffer;
-		m_legacy.m_drawIndexedIndirectArgsBuffer = visOut.m_legacy.m_drawIndexedIndirectArgsBuffer;
 		m_legacy.m_renderableInstancesBuffer = visOut.m_legacy.m_renderableInstancesBuffer;
+		m_legacy.m_drawIndexedIndirectArgsBuffer = visOut.m_legacy.m_drawIndexedIndirectArgsBuffer;
+		m_legacy.m_bucketRenderableInstanceRanges = visOut.m_legacy.m_bucketRenderableInstanceRanges;
 		m_mesh.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-		m_mesh.m_taskShaderPayloadsBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+		m_mesh.m_meshletGroupInstancesBuffer = visOut.m_mesh.m_meshletGroupInstancesBuffer;
+		m_mesh.m_bucketMeshletGroupInstanceRanges = visOut.m_mesh.m_bucketMeshletGroupInstanceRanges;
 	}
 
 	void fill(const GpuMeshletVisibilityOutput& visOut)
 	{
 		ANKI_ASSERT(visOut.isFilled());
 		m_softwareMesh.m_meshletInstancesBuffer = visOut.m_meshletInstancesBuffer;
+		m_softwareMesh.m_bucketMeshletInstanceRanges = visOut.m_bucketMeshletInstanceRanges;
 		m_softwareMesh.m_drawIndirectArgsBuffer = visOut.m_drawIndirectArgsBuffer;
 	}
 };

+ 293 - 205
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -14,54 +14,38 @@
 #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
 #include <AnKi/Core/GpuMemory/UnifiedGeometryBuffer.h>
 #include <AnKi/Core/StatsSet.h>
+#include <AnKi/Core/CVarSet.h>
 
 namespace anki {
 
+constexpr U32 kMaxVisibleObjects = 30 * 1024;
+
+constexpr U32 kMaxVisiblePrimitives = 40'000'000;
+constexpr U32 kMaxVisibleMeshlets = kMaxVisiblePrimitives / kMaxPrimitivesPerMeshlet;
+constexpr PtrSize kMaxMeshletMemory = kMaxVisibleMeshlets * sizeof(GpuSceneMeshletInstance);
+
+constexpr U32 kVisibleMaxMeshletGroups = max(kMaxVisibleObjects, (kMaxVisibleMeshlets + kMeshletGroupSize - 1) / kMeshletGroupSize);
+constexpr PtrSize kMaxMeshletGroupMemory = kVisibleMaxMeshletGroups * sizeof(GpuSceneMeshletGroupInstance);
+
+static NumericCVar<PtrSize> g_maxMeshletMemoryPerTest(CVarSubsystem::kRenderer, "MaxMeshletMemoryPerTest", kMaxMeshletMemory, 1_KB, 100_MB,
+													  "Max memory that will be allocated per GPU occlusion test for storing meshlets");
+static NumericCVar<PtrSize> g_maxMeshletGroupMemoryPerTest(CVarSubsystem::kRenderer, "MaxMeshletGroupMemoryPerTest", kMaxMeshletGroupMemory, 1_KB,
+														   100_MB,
+														   "Max memory that will be allocated per GPU occlusion test for storing meshlet groups");
+
 static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU visibility mem",
 												  StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame);
 
 static BufferOffsetRange allocateTransientGpuMem(PtrSize size)
 {
-	g_gpuVisMemoryAllocatedStatVar.increment(size);
-	return GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
-}
-
-GpuVisibilityCommonBase::Counts GpuVisibilityCommonBase::countTechnique(RenderingTechnique t)
-{
-	Counts out = {};
+	BufferOffsetRange out = {};
 
-	switch(t)
+	if(size)
 	{
-	case RenderingTechnique::kGBuffer:
-		out.m_aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
-		break;
-	case RenderingTechnique::kDepth:
-		out.m_aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
-		break;
-	case RenderingTechnique::kForward:
-		out.m_aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
-		break;
-	default:
-		ANKI_ASSERT(0);
+		g_gpuVisMemoryAllocatedStatVar.increment(size);
+		out = GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
 	}
 
-	out.m_bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(t);
-
-	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
-		if(meshletGroupCount)
-		{
-			out.m_modernGeometryFlowUserCount += userCount;
-			out.m_meshletGroupCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
-			out.m_meshletCount += min(meshletCount, kMaxVisibleMeshletsPerRenderStateBucket);
-		}
-		else
-		{
-			out.m_legacyGeometryFlowUserCount += userCount;
-		}
-	});
-
-	out.m_allUserCount = out.m_legacyGeometryFlowUserCount + out.m_modernGeometryFlowUserCount;
-
 	return out;
 }
 
@@ -73,9 +57,16 @@ Error GpuVisibility::init()
 		{
 			for(MutatorValue genHash = 0; genHash < 2; ++genHash)
 			{
-				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
-											 {{"HZB_TEST", hzb}, {"DISTANCE_TEST", 0}, {"GATHER_AABBS", gatherAabbs}, {"HASH_VISIBLES", genHash}},
-											 m_prog, m_frustumGrProgs[hzb][gatherAabbs][genHash]));
+				for(MutatorValue gatherType = 0; gatherType < 3; ++gatherType)
+				{
+					ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
+												 {{"HZB_TEST", hzb},
+												  {"DISTANCE_TEST", 0},
+												  {"GATHER_AABBS", gatherAabbs},
+												  {"HASH_VISIBLES", genHash},
+												  {"GATHER_TYPE", gatherType + 1}},
+												 m_prog, m_frustumGrProgs[hzb][gatherAabbs][genHash][gatherType]));
+				}
 			}
 		}
 	}
@@ -84,18 +75,93 @@ Error GpuVisibility::init()
 	{
 		for(MutatorValue genHash = 0; genHash < 2; ++genHash)
 		{
-			ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
-										 {{"HZB_TEST", 0}, {"DISTANCE_TEST", 1}, {"GATHER_AABBS", gatherAabbs}, {"HASH_VISIBLES", genHash}}, m_prog,
-										 m_distGrProgs[gatherAabbs][genHash]));
+			for(MutatorValue gatherType = 0; gatherType < 3; ++gatherType)
+			{
+				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
+											 {{"HZB_TEST", 0},
+											  {"DISTANCE_TEST", 1},
+											  {"GATHER_AABBS", gatherAabbs},
+											  {"HASH_VISIBLES", genHash},
+											  {"GATHER_TYPE", gatherType + 1}},
+											 m_prog, m_distGrProgs[gatherAabbs][genHash][gatherType]));
+			}
 		}
 	}
 
+	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
+	{
+		ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityMeshlet.ankiprogbin", {{"HZB_TEST", hzb}}, m_meshletCullingProg,
+									 m_meshletCullingGrProgs[hzb]));
+	}
+
 	return Error::kNone;
 }
 
+void GpuVisibility::computeGpuVisibilityMemoryRequirements(RenderingTechnique t, MemoryRequirements& total, WeakArray<MemoryRequirements> perBucket)
+{
+	ANKI_ASSERT(perBucket.getSize() == RenderStateBucketContainer::getSingleton().getBucketCount(t));
+
+	U32 totalMeshletCount = 0;
+	U32 totalMeshletGroupCount = 0;
+	U32 totalRenderableCount = 0;
+
+	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
+		if(meshletCount)
+		{
+			totalMeshletCount += meshletCount;
+			totalMeshletGroupCount += meshletGroupCount;
+		}
+		else
+		{
+			totalRenderableCount += userCount;
+		}
+	});
+
+	const U32 maxVisibleMeshlets = min(U32(g_maxMeshletMemoryPerTest.get() / sizeof(GpuSceneMeshletInstance)), totalMeshletCount);
+	const U32 maxVisibleMeshletGroups = min(U32(g_maxMeshletGroupMemoryPerTest.get() / sizeof(GpuSceneMeshletGroupInstance)), totalMeshletGroupCount);
+	const U32 maxVisibleRenderables = min(kMaxVisibleObjects, totalRenderableCount);
+
+	total = {};
+
+	U32 bucketCount = 0;
+	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
+		MemoryRequirements& bucket = perBucket[bucketCount++];
+
+		// Use U64 because some intermediate expressions below overflow 32 bits
+
+		if(meshletCount)
+		{
+			ANKI_ASSERT(meshletGroupCount > 0);
+
+			ANKI_ASSERT(totalMeshletCount > 0);
+			bucket.m_meshletInstanceCount = max(1u, U32(U64(meshletCount) * maxVisibleMeshlets / totalMeshletCount));
+
+			ANKI_ASSERT(totalMeshletGroupCount > 0);
+			bucket.m_meshletGroupInstanceCount = max(1u, U32(U64(meshletGroupCount) * maxVisibleMeshletGroups / totalMeshletGroupCount));
+		}
+		else if(userCount > 0)
+		{
+			ANKI_ASSERT(totalRenderableCount > 0);
+			bucket.m_renderableInstanceCount = max(1u, U32(U64(userCount) * maxVisibleRenderables / totalRenderableCount));
+		}
+
+		total.m_meshletInstanceCount += bucket.m_meshletInstanceCount;
+		total.m_meshletGroupInstanceCount += bucket.m_meshletGroupInstanceCount;
+		total.m_renderableInstanceCount += bucket.m_renderableInstanceCount;
+	});
+}
+
 void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out)
 {
 	ANKI_ASSERT(in.m_lodReferencePoint.x() != kMaxF32);
+
+	if(RenderStateBucketContainer::getSingleton().getBucketsActiveUserCount(in.m_technique) == 0) [[unlikely]]
+	{
+		// Early exit
+		in = {};
+		return;
+	}
+
 	RenderGraphDescription& rgraph = *in.m_rgraph;
 
 	class DistanceTestData
@@ -131,61 +197,102 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		frustumTestData->m_finalRenderTargetSize = fin.m_viewportSize;
 	}
 
-	const Counts counts = countTechnique(in.m_technique);
-
-	if(counts.m_allUserCount == 0) [[unlikely]]
-	{
-		// Early exit
-		return;
-	}
-
 	// Allocate memory
 	const Bool firstCallInFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
 	if(firstCallInFrame)
 	{
-		// Allocate the big buffers once at the beginning of the frame
+		// First call in the frame: initialize the per-frame state
 
 		m_runCtx.m_frameIdx = getRenderer().getFrameCount();
-		m_runCtx.m_populateRenderGraphFrameCallCount = 0;
+		m_runCtx.m_populateRenderGraphCallCount = 0;
+		m_runCtx.m_populateRenderGraphMeshletRenderingCallCount = 0;
 
-		// Find the max counts of all techniques
-		Counts maxCounts = {};
+		// Calc memory requirements
+		MemoryRequirements maxTotalMemReq;
+		WeakArray<MemoryRequirements> bucketsMemReqs;
 		for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
 		{
-			maxCounts = maxCounts.max((in.m_technique == t) ? counts : countTechnique(t));
+			const U32 tBucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(t);
+			if(tBucketCount == 0)
+			{
+				continue;
+			}
+
+			newArray<MemoryRequirements>(getRenderer().getFrameMemoryPool(), tBucketCount, bucketsMemReqs);
+
+			computeGpuVisibilityMemoryRequirements(t, m_runCtx.m_totalMemRequirements[t], bucketsMemReqs);
+
+			maxTotalMemReq = maxTotalMemReq.max(m_runCtx.m_totalMemRequirements[t]);
+
+			newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), tBucketCount, m_runCtx.m_renderableInstanceRanges[t]);
+			newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), tBucketCount, m_runCtx.m_meshletGroupInstanceRanges[t]);
+			newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), tBucketCount, m_runCtx.m_meshletInstanceRanges[t]);
+
+			U32 renderablesFirstInstance = 0, groupsFirstInstance = 0, meshletsFirstInstance = 0;
+			for(U32 i = 0; i < tBucketCount; ++i)
+			{
+				m_runCtx.m_renderableInstanceRanges[t][i].m_firstInstance = renderablesFirstInstance;
+				m_runCtx.m_renderableInstanceRanges[t][i].m_instanceCount = bucketsMemReqs[i].m_renderableInstanceCount;
+
+				m_runCtx.m_meshletGroupInstanceRanges[t][i].m_firstInstance = groupsFirstInstance;
+				m_runCtx.m_meshletGroupInstanceRanges[t][i].m_instanceCount = bucketsMemReqs[i].m_meshletGroupInstanceCount;
+
+				m_runCtx.m_meshletInstanceRanges[t][i].m_firstInstance = meshletsFirstInstance;
+				m_runCtx.m_meshletInstanceRanges[t][i].m_instanceCount = bucketsMemReqs[i].m_meshletInstanceCount;
+
+				renderablesFirstInstance += bucketsMemReqs[i].m_renderableInstanceCount;
+				groupsFirstInstance += bucketsMemReqs[i].m_meshletGroupInstanceCount;
+				meshletsFirstInstance += bucketsMemReqs[i].m_meshletInstanceCount;
+			}
 		}
 
-		// Allocate memory
+		// Allocate persistent memory
 		for(PersistentMemory& mem : m_runCtx.m_persistentMem)
 		{
 			mem = {};
 
-			mem.m_drawIndexedIndirectArgsBuffer =
-				allocateTransientGpuMem(max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
-			mem.m_renderableInstancesBuffer =
-				allocateTransientGpuMem(max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableInstance));
+			mem.m_drawIndexedIndirectArgsBuffer = allocateTransientGpuMem(maxTotalMemReq.m_renderableInstanceCount * sizeof(DrawIndexedIndirectArgs));
+			mem.m_renderableInstancesBuffer = allocateTransientGpuMem(maxTotalMemReq.m_renderableInstanceCount * sizeof(GpuSceneRenderableInstance));
+
+			mem.m_meshletGroupsInstancesBuffer =
+				allocateTransientGpuMem(maxTotalMemReq.m_meshletGroupInstanceCount * sizeof(GpuSceneMeshletGroupInstance));
+
+			mem.m_bufferDepedency =
+				rgraph.importBuffer(BufferUsageBit::kNone, (mem.m_drawIndexedIndirectArgsBuffer.m_buffer) ? mem.m_drawIndexedIndirectArgsBuffer
+																										  : mem.m_meshletGroupsInstancesBuffer);
+		}
+
+		if(getRenderer().runSoftwareMeshletRendering())
+		{
+			// Allocate up-front: a later software meshlet rendering pass will need this memory
+
+			for(PersistentMemoryMeshletRendering& mem : m_runCtx.m_persistentMeshletRenderingMem)
+			{
+				mem = {};
 
-			mem.m_taskShaderPayloadBuffer = allocateTransientGpuMem(max(1u, maxCounts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload));
+				mem.m_meshletInstancesBuffer = allocateTransientGpuMem(maxTotalMemReq.m_meshletInstanceCount * sizeof(GpuSceneMeshletInstance));
 
-			mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_drawIndexedIndirectArgsBuffer);
+				mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_meshletInstancesBuffer);
+			}
 		}
 	}
 
-	PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphFrameCallCount % m_runCtx.m_persistentMem.getSize()];
-	++m_runCtx.m_populateRenderGraphFrameCallCount;
+	const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique);
+	const MemoryRequirements& req = m_runCtx.m_totalMemRequirements[in.m_technique];
+	const PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphCallCount++ % m_runCtx.m_persistentMem.getSize()];
 
 	out.m_legacy.m_drawIndexedIndirectArgsBuffer = mem.m_drawIndexedIndirectArgsBuffer;
-	out.m_legacy.m_drawIndexedIndirectArgsBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs);
+	out.m_legacy.m_drawIndexedIndirectArgsBuffer.m_range = req.m_renderableInstanceCount * sizeof(DrawIndexedIndirectArgs);
 
 	out.m_legacy.m_renderableInstancesBuffer = mem.m_renderableInstancesBuffer;
-	out.m_legacy.m_renderableInstancesBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableInstance);
+	out.m_legacy.m_renderableInstancesBuffer.m_range = req.m_renderableInstanceCount * sizeof(GpuSceneRenderableInstance);
 
-	out.m_legacy.m_mdiDrawCountsBuffer = allocateTransientGpuMem(sizeof(U32) * counts.m_bucketCount);
+	out.m_legacy.m_mdiDrawCountsBuffer = allocateTransientGpuMem(sizeof(U32) * bucketCount);
 
-	out.m_mesh.m_taskShaderPayloadBuffer = mem.m_taskShaderPayloadBuffer;
-	out.m_mesh.m_taskShaderPayloadBuffer.m_range = max(1u, counts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload);
+	out.m_mesh.m_meshletGroupInstancesBuffer = mem.m_meshletGroupsInstancesBuffer;
+	out.m_mesh.m_meshletGroupInstancesBuffer.m_range = req.m_meshletGroupInstanceCount * sizeof(GpuSceneMeshletGroupInstance);
 
-	out.m_mesh.m_taskShaderIndirectArgsBuffer = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * counts.m_bucketCount);
+	out.m_mesh.m_taskShaderIndirectArgsBuffer = allocateTransientGpuMem(bucketCount * sizeof(DispatchIndirectArgs));
 
 	if(in.m_hashVisibles)
 	{
@@ -194,9 +301,14 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 	if(in.m_gatherAabbIndices)
 	{
-		out.m_visibleAaabbIndicesBuffer = allocateTransientGpuMem((counts.m_allUserCount + 1) * sizeof(U32));
+		out.m_visibleAaabbIndicesBuffer =
+			allocateTransientGpuMem(RenderStateBucketContainer::getSingleton().getBucketsActiveUserCount(in.m_technique) * sizeof(U32));
 	}
 
+	// Set instance sub-ranges
+	out.m_legacy.m_bucketRenderableInstanceRanges = m_runCtx.m_renderableInstanceRanges[in.m_technique];
+	out.m_mesh.m_bucketMeshletGroupInstanceRanges = m_runCtx.m_meshletGroupInstanceRanges[in.m_technique];
+
 	// Zero some stuff
 	const BufferHandle zeroStuffDependency = rgraph.importBuffer(BufferUsageBit::kNone, out.m_legacy.m_mdiDrawCountsBuffer);
 	{
@@ -256,32 +368,48 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 
 	pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
-				  technique = in.m_technique, aabbCount = counts.m_aabbCount, out](RenderPassWorkContext& rpass) {
+				  technique = in.m_technique, out](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 		const Bool gatherAabbIndices = out.m_visibleAaabbIndicesBuffer.m_buffer != nullptr;
 		const Bool genHash = out.m_visiblesHashBuffer.m_buffer != nullptr;
 
+		U32 gatherType = 0;
+		if(out.m_mesh.m_meshletGroupInstancesBuffer.m_range > 0)
+		{
+			gatherType |= 2u;
+		}
+
+		if(out.m_legacy.m_renderableInstancesBuffer.m_range > 0)
+		{
+			gatherType |= 1u;
+		}
+		ANKI_ASSERT(gatherType != 0);
+
 		if(frustumTestData)
 		{
-			cmdb.bindShaderProgram(m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash].get());
+			cmdb.bindShaderProgram(m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash][gatherType - 1u].get());
 		}
 		else
 		{
-			cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash].get());
+			cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash][gatherType - 1u].get());
 		}
 
 		BufferOffsetRange aabbsBuffer;
+		U32 aabbCount = 0;
 		switch(technique)
 		{
 		case RenderingTechnique::kGBuffer:
 			aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getBufferOffsetRange();
+			aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
 			break;
 		case RenderingTechnique::kDepth:
 			aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getBufferOffsetRange();
+			aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
 			break;
 		case RenderingTechnique::kForward:
 			aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getBufferOffsetRange();
+			aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
 			break;
 		default:
 			ANKI_ASSERT(0);
@@ -291,37 +419,35 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
 		cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
 		cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
-		cmdb.bindUavBuffer(0, 4, out.m_legacy.m_renderableInstancesBuffer);
-		cmdb.bindUavBuffer(0, 5, out.m_legacy.m_drawIndexedIndirectArgsBuffer);
-		cmdb.bindUavBuffer(0, 6, out.m_legacy.m_mdiDrawCountsBuffer);
-		cmdb.bindUavBuffer(0, 7, out.m_mesh.m_taskShaderIndirectArgsBuffer);
-		cmdb.bindUavBuffer(0, 8, out.m_mesh.m_taskShaderPayloadBuffer);
-
-		U32* drawIndirectArgsIndexOrTaskPayloadIndex =
-			allocateAndBindUav<U32>(cmdb, 0, 9, RenderStateBucketContainer::getSingleton().getBucketCount(technique));
-		U32 bucketCount = 0;
-		U32 legacyGeometryFlowDrawCount = 0;
-		U32 taskPayloadCount = 0;
-		RenderStateBucketContainer::getSingleton().iterateBuckets(
-			technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, [[maybe_unused]] U32 meshletCount) {
-				if(userCount == 0)
-				{
-					// Empty bucket
-					drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = kMaxU32;
-				}
-				else if(meshletGroupCount)
-				{
-					drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = taskPayloadCount;
-					taskPayloadCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
-				}
-				else
-				{
-					drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = legacyGeometryFlowDrawCount;
-					legacyGeometryFlowDrawCount += userCount;
-				}
+		if(gatherType & 1u)
+		{
+			cmdb.bindUavBuffer(0, 4, out.m_legacy.m_renderableInstancesBuffer);
+			cmdb.bindUavBuffer(0, 5, out.m_legacy.m_drawIndexedIndirectArgsBuffer);
+			cmdb.bindUavBuffer(0, 6, out.m_legacy.m_mdiDrawCountsBuffer);
+		}
+		if(gatherType & 2u)
+		{
+			cmdb.bindUavBuffer(0, 7, out.m_mesh.m_taskShaderIndirectArgsBuffer);
+			cmdb.bindUavBuffer(0, 8, out.m_mesh.m_meshletGroupInstancesBuffer);
+		}
+
+		const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(technique);
+		UVec2* instanceRanges = allocateAndBindUav<UVec2>(cmdb, 0, 9, bucketCount);
+		for(U32 i = 0; i < bucketCount; ++i)
+		{
+			const Bool legacyBucket = m_runCtx.m_renderableInstanceRanges[technique][i].m_instanceCount > 0;
 
-				++bucketCount;
-			});
+			if(legacyBucket)
+			{
+				instanceRanges[i].x() = m_runCtx.m_renderableInstanceRanges[technique][i].m_firstInstance;
+				instanceRanges[i].y() = m_runCtx.m_renderableInstanceRanges[technique][i].m_instanceCount;
+			}
+			else
+			{
+				instanceRanges[i].x() = m_runCtx.m_meshletGroupInstanceRanges[technique][i].m_firstInstance;
+				instanceRanges[i].y() = m_runCtx.m_meshletGroupInstanceRanges[technique][i].m_instanceCount;
+			}
+		}
 
 		if(frustumTestData)
 		{
@@ -380,63 +506,29 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	});
 }
 
-Error GpuMeshletVisibility::init()
-{
-	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
-	{
-		ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityMeshlet.ankiprogbin", {{"HZB_TEST", hzb}}, m_meshletCullingProg,
-									 m_meshletCullingGrProgs[hzb]));
-	}
-
-	return Error::kNone;
-}
-
-void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out)
+void GpuVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out)
 {
 	RenderGraphDescription& rgraph = *in.m_rgraph;
 
-	const Counts counts = countTechnique(in.m_technique);
-
-	if(counts.m_allUserCount == 0) [[unlikely]]
+	if(in.m_taskShaderIndirectArgsBuffer.m_buffer == nullptr) [[unlikely]]
 	{
 		// Early exit
 		return;
 	}
 
 	// Allocate memory
-	const Bool firstCallInFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
-	if(firstCallInFrame)
-	{
-		// Allocate the big buffers once at the beginning of the frame
+	const U32 bucketCount = m_runCtx.m_renderableInstanceRanges[in.m_technique].getSize();
+	ANKI_ASSERT(RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique) == bucketCount);
 
-		m_runCtx.m_frameIdx = getRenderer().getFrameCount();
-		m_runCtx.m_populateRenderGraphFrameCallCount = 0;
-
-		// Find the max counts of all techniques
-		Counts maxCounts = {};
-		for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
-		{
-			maxCounts = maxCounts.max((in.m_technique == t) ? counts : countTechnique(t));
-		}
+	const PersistentMemoryMeshletRendering& mem = m_runCtx.m_persistentMeshletRenderingMem[m_runCtx.m_populateRenderGraphMeshletRenderingCallCount++
+																						   % m_runCtx.m_persistentMeshletRenderingMem.getSize()];
 
-		// Allocate memory
-		for(PersistentMemory& mem : m_runCtx.m_persistentMem)
-		{
-			mem = {};
-
-			mem.m_meshletInstancesBuffer = allocateTransientGpuMem(maxCounts.m_meshletCount * sizeof(GpuSceneMeshletInstance));
-
-			mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_meshletInstancesBuffer);
-		}
-	}
-
-	PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphFrameCallCount % m_runCtx.m_persistentMem.getSize()];
-	++m_runCtx.m_populateRenderGraphFrameCallCount;
-
-	out.m_drawIndirectArgsBuffer = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * counts.m_bucketCount);
+	out.m_drawIndirectArgsBuffer = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
 
 	out.m_meshletInstancesBuffer = mem.m_meshletInstancesBuffer;
-	out.m_meshletInstancesBuffer.m_range = counts.m_meshletCount * sizeof(GpuSceneMeshletInstance);
+	out.m_meshletInstancesBuffer.m_range = m_runCtx.m_totalMemRequirements[in.m_technique].m_meshletInstanceCount * sizeof(GpuSceneMeshletInstance);
+
+	out.m_bucketMeshletInstanceRanges = m_runCtx.m_meshletInstanceRanges[in.m_technique];
 
 	// Zero some stuff
 	const BufferHandle indirectArgsDep = rgraph.importBuffer(BufferUsageBit::kNone, out.m_drawIndirectArgsBuffer);
@@ -468,70 +560,66 @@ void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, Gp
 	pass.newBufferDependency(mem.m_bufferDepedency, BufferUsageBit::kUavComputeWrite);
 	pass.newBufferDependency(in.m_dependency, BufferUsageBit::kIndirectCompute);
 
-	pass.setWork([this, technique = in.m_technique, hzbRt = in.m_hzbRt, taskShaderPayloadsBuff = in.m_taskShaderPayloadBuffer,
-				  viewProjMat = in.m_viewProjectionMatrix, camTrf = in.m_cameraTransform, viewportSize = in.m_viewportSize,
-				  computeIndirectArgs = in.m_taskShaderIndirectArgsBuffer, out](RenderPassWorkContext& rpass) {
+	pass.setWork([this, technique = in.m_technique, hzbRt = in.m_hzbRt, viewProjMat = in.m_viewProjectionMatrix, camTrf = in.m_cameraTransform,
+				  viewportSize = in.m_viewportSize, computeIndirectArgs = in.m_taskShaderIndirectArgsBuffer, out,
+				  meshletGroupInstancesBuffer = in.m_meshletGroupInstancesBuffer,
+				  bucketMeshletGroupInstanceRanges = in.m_bucketMeshletGroupInstanceRanges](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
-		U32 bucketIdx = 0;
-		U32 firstPayload = 0;
-		PtrSize instancesBufferOffset = 0;
-		RenderStateBucketContainer::getSingleton().iterateBuckets(
-			technique, [&](const RenderStateInfo&, [[maybe_unused]] U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
-				if(!meshletGroupCount)
-				{
-					++bucketIdx;
-					return;
-				}
-
-				// Create a depedency to a part of the indirect args buffer
-				const BufferOffsetRange drawIndirectArgsBufferChunk = {out.m_drawIndirectArgsBuffer.m_buffer,
-																	   out.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketIdx,
-																	   sizeof(DrawIndirectArgs)};
+		const U32 bucketCount = out.m_bucketMeshletInstanceRanges.getSize();
 
-				const PtrSize instancesBufferSize = min(meshletCount, kMaxVisibleMeshletsPerRenderStateBucket) * sizeof(GpuSceneMeshletInstance);
-				const BufferOffsetRange instancesBuffer = {out.m_meshletInstancesBuffer.m_buffer,
-														   out.m_meshletInstancesBuffer.m_offset + instancesBufferOffset, instancesBufferSize};
+		for(U32 i = 0; i < bucketCount; ++i)
+		{
+			if(out.m_bucketMeshletInstanceRanges[i].m_instanceCount == 0)
+			{
+				continue;
+			}
 
-				const Bool hasHzb = hzbRt.isValid();
+			const Bool hasHzb = hzbRt.isValid();
 
-				cmdb.bindShaderProgram(m_meshletCullingGrProgs[hasHzb].get());
+			cmdb.bindShaderProgram(m_meshletCullingGrProgs[hasHzb].get());
 
-				cmdb.bindUavBuffer(0, 0, taskShaderPayloadsBuff);
-				cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
-				cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
-				cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
-				cmdb.bindUavBuffer(0, 4, UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
-				cmdb.bindUavBuffer(0, 5, drawIndirectArgsBufferChunk);
-				cmdb.bindUavBuffer(0, 6, instancesBuffer);
-				if(hasHzb)
-				{
-					rpass.bindColorTexture(0, 7, hzbRt);
-					cmdb.bindSampler(0, 8, getRenderer().getSamplers().m_nearestNearestClamp.get());
-				}
+			cmdb.bindUavBuffer(0, 0, meshletGroupInstancesBuffer);
+			cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 4, UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 5, out.m_drawIndirectArgsBuffer);
+			cmdb.bindUavBuffer(0, 6, out.m_meshletInstancesBuffer);
+			if(hasHzb)
+			{
+				rpass.bindColorTexture(0, 7, hzbRt);
+				cmdb.bindSampler(0, 8, getRenderer().getSamplers().m_nearestNearestClamp.get());
+			}
 
-				class MaterialGlobalConstants
-				{
-				public:
-					Mat4 m_viewProjectionMatrix;
-					Mat3x4 m_cameraTransform;
-
-					Vec2 m_viewportSizef;
-					U32 m_firstPayload;
-					U32 m_padding;
-				} consts;
-				consts.m_viewProjectionMatrix = viewProjMat;
-				consts.m_cameraTransform = camTrf;
-				consts.m_viewportSizef = Vec2(viewportSize);
-				consts.m_firstPayload = firstPayload;
-				cmdb.setPushConstants(&consts, sizeof(consts));
-
-				cmdb.dispatchComputeIndirect(computeIndirectArgs.m_buffer, computeIndirectArgs.m_offset + bucketIdx * sizeof(DispatchIndirectArgs));
-
-				firstPayload += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
-				instancesBufferOffset += instancesBufferSize;
-				++bucketIdx;
-			});
+			class Consts
+			{
+			public:
+				Mat4 m_viewProjectionMatrix;
+
+				Vec3 m_cameraPos;
+				U32 m_firstDrawArg;
+
+				Vec2 m_viewportSizef;
+				U32 m_firstMeshletGroup;
+				U32 m_firstMeshlet;
+
+				U32 m_meshletCount;
+				U32 m_padding1;
+				U32 m_padding2;
+				U32 m_padding3;
+			} consts;
+			consts.m_viewProjectionMatrix = viewProjMat;
+			consts.m_cameraPos = camTrf.getTranslationPart().xyz();
+			consts.m_firstDrawArg = i;
+			consts.m_viewportSizef = Vec2(viewportSize);
+			consts.m_firstMeshletGroup = bucketMeshletGroupInstanceRanges[i].getFirstInstance();
+			consts.m_firstMeshlet = out.m_bucketMeshletInstanceRanges[i].getFirstInstance();
+			consts.m_meshletCount = out.m_bucketMeshletInstanceRanges[i].getInstanceCount();
+			cmdb.setPushConstants(&consts, sizeof(consts));
+
+			cmdb.dispatchComputeIndirect(computeIndirectArgs.m_buffer, computeIndirectArgs.m_offset + i * sizeof(DispatchIndirectArgs));
+		};
 	});
 }
 

+ 119 - 93
AnKi/Renderer/Utils/GpuVisibility.h

@@ -14,37 +14,32 @@ namespace anki {
 /// @addtogroup renderer
 /// @{
 
-class GpuVisibilityCommonBase : public RendererObject
+/// @memberof GpuVisibility
+class InstanceRange
 {
-protected:
-	class Counts
+	friend class GpuVisibility;
+
+public:
+	U32 getFirstInstance() const
 	{
-	public:
-		U32 m_aabbCount;
-		U32 m_bucketCount;
-		U32 m_legacyGeometryFlowUserCount;
-		U32 m_modernGeometryFlowUserCount;
-		U32 m_meshletGroupCount;
-		U32 m_meshletCount;
-		U32 m_allUserCount;
-
-		Counts max(const Counts& b) const
-		{
-			Counts out;
-#define ANKI_MAX(member) out.member = anki::max(member, b.member)
-			ANKI_MAX(m_aabbCount);
-			ANKI_MAX(m_bucketCount);
-			ANKI_MAX(m_legacyGeometryFlowUserCount);
-			ANKI_MAX(m_modernGeometryFlowUserCount);
-			ANKI_MAX(m_meshletGroupCount);
-			ANKI_MAX(m_meshletCount);
-			ANKI_MAX(m_allUserCount);
-#undef ANKI_MAX
-			return out;
-		}
-	};
+		ANKI_ASSERT(isValid());
+		return m_firstInstance;
+	}
+
+	U32 getInstanceCount() const
+	{
+		ANKI_ASSERT(isValid());
+		return m_instanceCount;
+	}
+
+	Bool isValid() const
+	{
+		return m_instanceCount > 0;
+	}
 
-	static Counts countTechnique(RenderingTechnique t);
+private:
+	U32 m_firstInstance = 0;
+	U32 m_instanceCount = 0;
 };
 
 /// @memberof GpuVisibility
@@ -93,16 +88,21 @@ public:
 	{
 	public:
 		BufferOffsetRange m_renderableInstancesBuffer; ///< An array of GpuSceneRenderableInstance.
-		BufferOffsetRange m_drawIndexedIndirectArgsBuffer; ///< An array of DrawIndexedIndirectArgs.
 		BufferOffsetRange m_mdiDrawCountsBuffer; ///< An array of U32, one for each render state bucket (even those that use task/mesh flow).
+		BufferOffsetRange m_drawIndexedIndirectArgsBuffer; ///< Array of DrawIndexedIndirectArgs or DrawIndirectArgs.
+
+		/// Defines the element sub-ranges in the m_renderableInstancesBuffer and m_drawIndexedIndirectArgsBuffer per render state bucket.
+		ConstWeakArray<InstanceRange> m_bucketRenderableInstanceRanges;
 	} m_legacy; ///< Legacy vertex shading.
 
 	class
 	{
 	public:
-		/// An array of DispatchIndirectArgs, one for each render state bucket (even those that use legacy flow).
-		BufferOffsetRange m_taskShaderIndirectArgsBuffer;
-		BufferOffsetRange m_taskShaderPayloadBuffer; ///< The payloads of task shaders. One for each task shader threadgroup / meshlet group.
+		BufferOffsetRange m_taskShaderIndirectArgsBuffer; ///< An array of DispatchIndirectArgs, one for each render state bucket.
+		BufferOffsetRange m_meshletGroupInstancesBuffer; ///< Array with GpuSceneMeshletGroupInstance.
+
+		/// Defines the element sub-ranges in the m_meshletGroupInstancesBuffer per render state bucket.
+		ConstWeakArray<InstanceRange> m_bucketMeshletGroupInstanceRanges;
 	} m_mesh; ///< S/W meshlets or H/W mesh shading.
 
 	BufferOffsetRange m_visibleAaabbIndicesBuffer; ///< [Optional] Indices to the AABB buffer. The 1st element is the count.
@@ -115,61 +115,7 @@ public:
 	}
 };
 
-/// Performs GPU visibility for some pass.
-class GpuVisibility : public GpuVisibilityCommonBase
-{
-public:
-	Error init();
-
-	/// Perform frustum visibility testing.
-	/// @note Not thread-safe.
-	void populateRenderGraph(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
-	{
-		ANKI_ASSERT(in.m_viewProjectionMatrix != Mat4::getZero());
-		ANKI_ASSERT(in.m_viewportSize != UVec2(0u));
-		populateRenderGraphInternal(false, in, out);
-	}
-
-	/// Perform simple distance-based visibility testing.
-	/// @note Not thread-safe.
-	void populateRenderGraph(DistanceGpuVisibilityInput& in, GpuVisibilityOutput& out)
-	{
-		populateRenderGraphInternal(true, in, out);
-	}
-
-private:
-	ShaderProgramResourcePtr m_prog;
-	Array3d<ShaderProgramPtr, 2, 2, 2> m_frustumGrProgs;
-	Array2d<ShaderProgramPtr, 2, 2> m_distGrProgs;
-
-	// Contains quite large buffer that we want want to reuse muptiple times in a single frame.
-	class PersistentMemory
-	{
-	public:
-		// Legacy
-		BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
-		BufferOffsetRange m_renderableInstancesBuffer; ///< Instance rate vertex buffer.
-
-		// Mesh
-		BufferOffsetRange m_taskShaderPayloadBuffer;
-
-		BufferHandle m_bufferDepedency;
-	};
-
-	class
-	{
-	public:
-		U64 m_frameIdx = kMaxU64;
-		U32 m_populateRenderGraphFrameCallCount = 0;
-
-		/// The more persistent memory there is the more passes will be able to run in parallel but the more memory is used.
-		Array<PersistentMemory, 4> m_persistentMem;
-	} m_runCtx;
-
-	void populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out);
-};
-
-/// @memberof GpuMeshletVisibility
+/// @memberof GpuVisibility
 class GpuMeshletVisibilityInput
 {
 public:
@@ -184,21 +130,33 @@ public:
 	UVec2 m_viewportSize;
 
 	BufferOffsetRange m_taskShaderIndirectArgsBuffer; ///< Taken from GpuVisibilityOutput.
-	BufferOffsetRange m_taskShaderPayloadBuffer; ///< Taken from GpuVisibilityOutput.
+	BufferOffsetRange m_meshletGroupInstancesBuffer; ///< Taken from GpuVisibilityOutput.
+	ConstWeakArray<InstanceRange> m_bucketMeshletGroupInstanceRanges; ///< Taken from GpuVisibilityOutput.
 
 	BufferHandle m_dependency;
 
 	RenderGraphDescription* m_rgraph = nullptr;
 
 	RenderTargetHandle m_hzbRt; ///< Optional.
+
+	void fillBuffers(const GpuVisibilityOutput& perObjVisOut)
+	{
+		m_taskShaderIndirectArgsBuffer = perObjVisOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+		m_meshletGroupInstancesBuffer = perObjVisOut.m_mesh.m_meshletGroupInstancesBuffer;
+		m_bucketMeshletGroupInstanceRanges = perObjVisOut.m_mesh.m_bucketMeshletGroupInstanceRanges;
+		m_dependency = perObjVisOut.m_dependency;
+	}
 };
 
-/// @memberof GpuMeshletVisibility
+/// @memberof GpuVisibility
 class GpuMeshletVisibilityOutput
 {
 public:
-	BufferOffsetRange m_meshletInstancesBuffer; ///< Array of UVec4 (encodes GpuSceneMeshletInstance) per instance vertex. One for each meshlet.
 	BufferOffsetRange m_drawIndirectArgsBuffer; ///< Array of DrawIndirectArgs. One for every render state bucket (even those that use that flow).
+	BufferOffsetRange m_meshletInstancesBuffer; ///< Array of GpuSceneMeshletInstance.
+
+	/// Defines the element sub-ranges in the m_meshletInstancesBuffer per render state bucket.
+	ConstWeakArray<InstanceRange> m_bucketMeshletInstanceRanges;
 
 	BufferHandle m_dependency; ///< Some dependency to wait on. Wait usage is indirect draw.
 
@@ -208,17 +166,37 @@ public:
 	}
 };
 
-/// Performs meshlet GPU visibility when the GPU doesn't support mesh shaders.
-class GpuMeshletVisibility : public GpuVisibilityCommonBase
+/// Performs GPU visibility for some pass.
+class GpuVisibility : public RendererObject
 {
 public:
 	Error init();
 
+	/// Perform frustum visibility testing.
+	/// @param in The test input. Its view-projection matrix and viewport size must be non-zero (asserted below).
+	/// @param out Populated with the results of the visibility pass.
+	/// @note Not thread-safe.
+	void populateRenderGraph(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
+	{
+		ANKI_ASSERT(in.m_viewProjectionMatrix != Mat4::getZero());
+		ANKI_ASSERT(in.m_viewportSize != UVec2(0u));
+		populateRenderGraphInternal(false, in, out);
+	}
+
+	/// Perform simple distance-based visibility testing.
+	/// Same flow as the frustum overload but drives populateRenderGraphInternal with distanceBased == true.
+	/// @note Not thread-safe.
+	void populateRenderGraph(DistanceGpuVisibilityInput& in, GpuVisibilityOutput& out)
+	{
+		populateRenderGraphInternal(true, in, out);
+	}
+	}
+
 	/// Perform meshlet GPU visibility.
 	/// @note Not thread-safe.
 	void populateRenderGraph(GpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out);
 
 private:
+	ShaderProgramResourcePtr m_prog;
+	Array4d<ShaderProgramPtr, 2, 2, 2, 3> m_frustumGrProgs;
+	Array3d<ShaderProgramPtr, 2, 2, 3> m_distGrProgs;
+
 	ShaderProgramResourcePtr m_meshletCullingProg;
 	Array<ShaderProgramPtr, 2> m_meshletCullingGrProgs;
 
@@ -226,20 +204,68 @@ private:
 	class PersistentMemory
 	{
 	public:
+		// Legacy
+		BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
+		BufferOffsetRange m_renderableInstancesBuffer; ///< Instance rate vertex buffer.
+
+		// HW & SW Meshlet rendering
+		BufferOffsetRange m_meshletGroupsInstancesBuffer;
+
+		// SW meshlet rendering
+		BufferOffsetRange m_meshletInstancesBuffer; ///< Instance rate vertex buffer.
+
+		BufferHandle m_bufferDepedency; ///< NOTE(review): "Depedency" misspells "Dependency"; rename in a follow-up commit.
+	};
+
+	/// Like PersistentMemory but holds only what the meshlet-rendering path needs.
+	class PersistentMemoryMeshletRendering
+	{
+	public:
+		// SW meshlet rendering
+		BufferOffsetRange m_meshletInstancesBuffer; ///< Instance rate vertex buffer.
 
 		BufferHandle m_bufferDepedency;
 	};
 
+	/// Instance counts a visibility pass (or a whole frame) needs memory for.
+	class MemoryRequirements
+	{
+	public:
+		U32 m_renderableInstanceCount = 0; ///< Count of GpuSceneRenderableInstance and a few other things
+		U32 m_meshletGroupInstanceCount = 0; ///< Count of GpuSceneMeshletGroupInstance
+		U32 m_meshletInstanceCount = 0; ///< Count of GpuSceneMeshletInstance
+
+		/// Component-wise maximum of this and @a b. Doesn't modify this object (const-qualified so it's
+		/// callable through const references as well).
+		MemoryRequirements max(const MemoryRequirements& b) const
+		{
+			MemoryRequirements out;
+			out.m_renderableInstanceCount = anki::max(m_renderableInstanceCount, b.m_renderableInstanceCount);
+			out.m_meshletGroupInstanceCount = anki::max(m_meshletGroupInstanceCount, b.m_meshletGroupInstanceCount);
+			out.m_meshletInstanceCount = anki::max(m_meshletInstanceCount, b.m_meshletInstanceCount);
+			return out;
+		}
+	};
+
 	class
 	{
 	public:
 		U64 m_frameIdx = kMaxU64;
-		U32 m_populateRenderGraphFrameCallCount = 0;
+		U32 m_populateRenderGraphCallCount = 0;
+		U32 m_populateRenderGraphMeshletRenderingCallCount = 0;
 
 		/// The more persistent memory there is the more passes will be able to run in parallel but the more memory is used.
 		Array<PersistentMemory, 4> m_persistentMem;
+		Array<PersistentMemoryMeshletRendering, 4> m_persistentMeshletRenderingMem; ///< See m_persistentMem.
+
+		/// Memory requirements of the whole frame, one entry per rendering technique.
+		Array<MemoryRequirements, U32(RenderingTechnique::kCount)> m_totalMemRequirements;
+
+		/// Per-technique sub-ranges that split the frame's instance buffers per render state bucket.
+		Array<WeakArray<InstanceRange>, U32(RenderingTechnique::kCount)> m_renderableInstanceRanges;
+		Array<WeakArray<InstanceRange>, U32(RenderingTechnique::kCount)> m_meshletGroupInstanceRanges;
+		Array<WeakArray<InstanceRange>, U32(RenderingTechnique::kCount)> m_meshletInstanceRanges;
 	} m_runCtx;
+
+	void populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out);
+
+	static void computeGpuVisibilityMemoryRequirements(RenderingTechnique t, MemoryRequirements& total, WeakArray<MemoryRequirements> perBucket);
 };
 
 /// @memberof GpuVisibilityNonRenderables

+ 69 - 16
AnKi/Scene/RenderStateBucket.cpp

@@ -13,12 +13,12 @@ RenderStateBucketContainer::~RenderStateBucketContainer()
 	{
 		for([[maybe_unused]] ExtendedBucket& b : m_buckets[t])
 		{
-			ANKI_ASSERT(!b.m_program.isCreated() && b.m_userCount == 0 && b.m_meshletGroupCount == 0);
+			ANKI_ASSERT(!b.m_program.isCreated() && b.m_userCount == 0 && b.m_lod0MeshletGroupCount == 0 && b.m_lod0MeshletCount == 0);
 		}
 
-		ANKI_ASSERT(m_bucketUserCount[t] == 0);
+		ANKI_ASSERT(m_bucketActiveUserCount[t] == 0);
 		ANKI_ASSERT(m_activeBucketCount[t] == 0);
-		ANKI_ASSERT(m_meshletGroupCount[t] == 0);
+		ANKI_ASSERT(m_lod0MeshletGroupCount[t] == 0);
 	}
 }
 
@@ -31,7 +31,7 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	toHash[2] = state.m_indexedDrawcall;
 	const U64 hash = computeHash(toHash.getBegin(), toHash.getSizeInBytes());
 
-	const U32 meshletGroupCount = lod0MeshletCount + (kMeshletGroupSize - 1) / kMeshletGroupSize;
+	const U32 meshletGroupCount = (lod0MeshletCount + (kMeshletGroupSize - 1)) / kMeshletGroupSize;
 
 	SceneDynamicArray<ExtendedBucket>& buckets = m_buckets[technique];
 
@@ -40,8 +40,9 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 
 	LockGuard lock(m_mtx);
 
-	++m_bucketUserCount[technique];
-	m_meshletGroupCount[technique] += meshletGroupCount;
+	++m_bucketActiveUserCount[technique];
+	m_lod0MeshletGroupCount[technique] += meshletGroupCount;
+	m_lod0MeshletCount[technique] += lod0MeshletCount;
 
 	// Search bucket
 	for(U32 i = 0; i < buckets.getSize(); ++i)
@@ -49,15 +50,17 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 		if(buckets[i].m_hash == hash)
 		{
 			++buckets[i].m_userCount;
-			buckets[i].m_meshletGroupCount += meshletGroupCount;
+			buckets[i].m_lod0MeshletGroupCount += meshletGroupCount;
 			buckets[i].m_lod0MeshletCount += lod0MeshletCount;
 
 			if(buckets[i].m_userCount == 1)
 			{
 				ANKI_ASSERT(!buckets[i].m_program.isCreated());
-				ANKI_ASSERT(buckets[i].m_meshletGroupCount == meshletGroupCount && buckets[i].m_meshletGroupCount == lod0MeshletCount);
+				ANKI_ASSERT(buckets[i].m_lod0MeshletGroupCount == meshletGroupCount && buckets[i].m_lod0MeshletCount == lod0MeshletCount);
 				buckets[i].m_program = state.m_program;
 				++m_activeBucketCount[technique];
+
+				createPerfOrder(technique);
 			}
 			else
 			{
@@ -77,11 +80,13 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	newBucket.m_primitiveTopology = state.m_primitiveTopology;
 	newBucket.m_program = state.m_program;
 	newBucket.m_userCount = 1;
-	newBucket.m_meshletGroupCount = meshletGroupCount;
+	newBucket.m_lod0MeshletGroupCount = meshletGroupCount;
 	newBucket.m_lod0MeshletCount = lod0MeshletCount;
 
 	++m_activeBucketCount[technique];
 
+	createPerfOrder(technique);
+
 	out.m_index = buckets.getSize() - 1;
 	out.m_lod0MeshletCount = lod0MeshletCount;
 	return out;
@@ -96,7 +101,7 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 
 	const RenderingTechnique technique = bucketIndex.m_technique;
 	const U32 idx = bucketIndex.m_index;
-	const U32 meshletGroupCount = bucketIndex.m_lod0MeshletCount + (kMeshletGroupSize - 1) / kMeshletGroupSize;
+	const U32 meshletGroupCount = (bucketIndex.m_lod0MeshletCount + (kMeshletGroupSize - 1)) / kMeshletGroupSize;
 	const U32 meshletCount = bucketIndex.m_lod0MeshletCount;
 	bucketIndex.invalidate();
 
@@ -104,18 +109,21 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 
 	ANKI_ASSERT(idx < m_buckets[technique].getSize());
 
-	ANKI_ASSERT(m_bucketUserCount[technique] > 0);
-	--m_bucketUserCount[technique];
+	ANKI_ASSERT(m_bucketActiveUserCount[technique] > 0);
+	--m_bucketActiveUserCount[technique];
+
+	ANKI_ASSERT(m_lod0MeshletGroupCount[technique] >= meshletGroupCount);
+	m_lod0MeshletGroupCount[technique] -= meshletGroupCount;
 
-	ANKI_ASSERT(m_meshletGroupCount[technique] >= meshletGroupCount);
-	m_meshletGroupCount[technique] -= meshletGroupCount;
+	ANKI_ASSERT(m_lod0MeshletCount[technique] >= meshletCount);
+	m_lod0MeshletCount[technique] -= meshletCount;
 
 	ExtendedBucket& bucket = m_buckets[technique][idx];
-	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_meshletGroupCount >= meshletGroupCount
+	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_lod0MeshletGroupCount >= meshletGroupCount
 				&& bucket.m_lod0MeshletCount >= meshletCount);
 
 	--bucket.m_userCount;
-	bucket.m_meshletGroupCount -= meshletGroupCount;
+	bucket.m_lod0MeshletGroupCount -= meshletGroupCount;
 	bucket.m_lod0MeshletCount -= meshletCount;
 
 	if(bucket.m_userCount == 0)
@@ -125,7 +133,52 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 
 		ANKI_ASSERT(m_activeBucketCount[technique] > 0);
 		--m_activeBucketCount[technique];
+
+		createPerfOrder(technique);
 	}
 }
 
+/// Recompute the performance-sorted bucket order of a technique. Called (under m_mtx) every time a bucket
+/// becomes active or inactive.
+void RenderStateBucketContainer::createPerfOrder(RenderingTechnique t)
+{
+	const U32 bucketCount = m_buckets[t].getSize();
+
+	// Start from the identity permutation.
+	m_bucketPerfOrder[t].resize(bucketCount);
+	for(U32 i = 0; i < bucketCount; ++i)
+	{
+		m_bucketPerfOrder[t][i] = i;
+	}
+
+	// Sort the bucket indices from the cheapest program to the heaviest. The lambda only needs "this" and "t"
+	// so capture exactly those instead of the redundant [&, this].
+	std::sort(m_bucketPerfOrder[t].getBegin(), m_bucketPerfOrder[t].getBegin() + bucketCount, [this, t](U32 a, U32 b) {
+		// Single U64 sort key: fragment shader binary size in the high 32 bits (it matters the most), vertex or
+		// mesh shader size in the low 32 bits.
+		auto getProgramHeaviness = [](const ShaderProgram& p) {
+			U64 size = U64(p.getShaderBinarySize(ShaderType::kFragment)) << 32u; // Fragment is more important
+			if(!!(p.getShaderTypes() & ShaderTypeBit::kVertex))
+			{
+				size |= p.getShaderBinarySize(ShaderType::kVertex);
+			}
+			else
+			{
+				ANKI_ASSERT(!!(p.getShaderTypes() & ShaderTypeBit::kMesh));
+				size |= p.getShaderBinarySize(ShaderType::kMesh);
+			}
+			return size;
+		};
+
+		// Inactive (empty) buckets have no program. Treat them as zero-cost with no discard.
+		const Bool aIsActive = m_buckets[t][a].m_program.isCreated();
+		const Bool bIsActive = m_buckets[t][b].m_program.isCreated();
+		const Bool aHasDiscard = (aIsActive) ? m_buckets[t][a].m_program->hasDiscard() : false;
+		const Bool bHasDiscard = (bIsActive) ? m_buckets[t][b].m_program->hasDiscard() : false;
+		const U64 aProgramHeaviness = (aIsActive) ? getProgramHeaviness(*m_buckets[t][a].m_program) : 0;
+		const U64 bProgramHeaviness = (bIsActive) ? getProgramHeaviness(*m_buckets[t][b].m_program) : 0;
+
+		// Programs with discard always count as heavier than any discard-free program.
+		if(aHasDiscard != bHasDiscard)
+		{
+			return !aHasDiscard;
+		}
+		else
+		{
+			return aProgramHeaviness < bProgramHeaviness;
+		}
+	});
+}
+
 } // end namespace anki

+ 29 - 8
AnKi/Scene/RenderStateBucket.h

@@ -96,20 +96,37 @@ public:
 	{
 		for(const ExtendedBucket& b : m_buckets[technique])
 		{
-			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_meshletGroupCount, b.m_lod0MeshletCount);
+			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_lod0MeshletGroupCount, b.m_lod0MeshletCount);
+		}
+	}
+
+	/// Iterate empty and non-empty buckets, from the bucket with the least heavy shader program to the one with
+	/// the heaviest (the order that createPerfOrder computes).
+	/// @param func Called as func(state, bucketIndex, userCount, lod0MeshletGroupCount, lod0MeshletCount).
+	template<typename TFunc>
+	void iterateBucketsPerformanceOrder(RenderingTechnique technique, TFunc func) const
+	{
+		for(U32 i : m_bucketPerfOrder[technique])
+		{
+			const ExtendedBucket& b = m_buckets[technique][i];
+			func(static_cast<const RenderStateInfo&>(b), i, b.m_userCount, b.m_lod0MeshletGroupCount, b.m_lod0MeshletCount);
+		}
+	}
 
 	/// Get the number of renderables of all the buckets of a specific rendering technique.
-	U32 getBucketsUserCount(RenderingTechnique technique) const
+	U32 getBucketsActiveUserCount(RenderingTechnique technique) const
 	{
-		return m_bucketUserCount[technique];
+		return m_bucketActiveUserCount[technique];
 	}
 
 	/// Get the number of meshlet groups of a technique.
-	U32 getBucketsMeshletGroupCount(RenderingTechnique technique) const
+	U32 getBucketsLod0MeshletGroupCount(RenderingTechnique technique) const
 	{
-		return m_meshletGroupCount[technique];
+		return m_lod0MeshletGroupCount[technique];
+	}
+
+	/// Get the number of meshlets of a technique of LOD 0.
+	U32 getBucketsLod0MeshletCount(RenderingTechnique technique) const
+	{
+		return m_lod0MeshletCount[technique];
 	}
 
 	/// Get number of empty and non-empty buckets.
@@ -130,20 +147,24 @@ private:
 	public:
 		U64 m_hash = 0;
 		U32 m_userCount = 0;
-		U32 m_meshletGroupCount = 0;
+		U32 m_lod0MeshletGroupCount = 0;
 		U32 m_lod0MeshletCount = 0;
 	};
 
 	Array<SceneDynamicArray<ExtendedBucket>, U32(RenderingTechnique::kCount)> m_buckets;
-	Array<U32, U32(RenderingTechnique::kCount)> m_bucketUserCount = {};
-	Array<U32, U32(RenderingTechnique::kCount)> m_meshletGroupCount = {};
+	Array<U32, U32(RenderingTechnique::kCount)> m_bucketActiveUserCount = {};
+	Array<U32, U32(RenderingTechnique::kCount)> m_lod0MeshletGroupCount = {};
+	Array<U32, U32(RenderingTechnique::kCount)> m_lod0MeshletCount = {};
 	Array<U32, U32(RenderingTechnique::kCount)> m_activeBucketCount = {};
+	Array<SceneDynamicArray<U32>, U32(RenderingTechnique::kCount)> m_bucketPerfOrder; ///< Orders the buckets from the least heavy to the most.
 
 	Mutex m_mtx;
 
 	RenderStateBucketContainer() = default;
 
 	~RenderStateBucketContainer();
+
+	void createPerfOrder(RenderingTechnique t);
 };
 
 inline RenderStateBucketIndex::~RenderStateBucketIndex()

+ 1 - 1
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -322,7 +322,7 @@ struct FirstPayload
 
 [numthreads(ANKI_TASK_SHADER_THREADGROUP_SIZE, 1, 1)] void main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX)
 {
-	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_firstPayload.m_val.x + svGroupId];
+	const GpuSceneMeshletGroupInstance inPayload = g_meshletGroups[g_firstPayload.m_val.x + svGroupId];
 
 	const U32 lod = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
 	const U32 renderableIdx = (inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);

+ 69 - 46
AnKi/Shaders/GpuVisibility.ankiprog

@@ -7,9 +7,13 @@
 #pragma anki mutator DISTANCE_TEST 0 1
 #pragma anki mutator GATHER_AABBS 0 1
 #pragma anki mutator HASH_VISIBLES 0 1
+#pragma anki mutator GATHER_TYPE 1 2 3
 
 #pragma anki skip_mutation DISTANCE_TEST 1 HZB_TEST 1
 
+#define GATHER_MDI (GATHER_TYPE & 1u)
+#define GATHER_MESHLET_GROUPS (GATHER_TYPE & 2u)
+
 #pragma anki technique_start comp
 
 #include <AnKi/Shaders/Common.hlsl>
@@ -32,6 +36,7 @@ struct DrawIndirectArgsWithPadding
 [[vk::binding(2)]] StructuredBuffer<GpuSceneMeshLod> g_meshLods;
 [[vk::binding(3)]] ByteAddressBuffer g_gpuScene;
 
+#if GATHER_MDI
 // These 3 have the same size
 [[vk::binding(4)]] RWStructuredBuffer<UVec4> g_instanceRateRenderables;
 [[vk::binding(5)]] RWStructuredBuffer<DrawIndexedIndirectArgs> g_drawIndexedIndirectArgs;
@@ -39,13 +44,16 @@ struct DrawIndirectArgsWithPadding
 
 // The MDI counts. One for each render state bucket
 [[vk::binding(6)]] RWStructuredBuffer<U32> g_mdiDrawCounts;
+#endif
 
+#if GATHER_MESHLET_GROUPS
 // For mesh shading
 [[vk::binding(7)]] RWStructuredBuffer<DispatchIndirectArgs> g_taskShaderIndirectArgs;
-[[vk::binding(8)]] RWStructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
+[[vk::binding(8)]] RWStructuredBuffer<GpuSceneMeshletGroupInstance> g_meshletGroupInstances;
+#endif
 
-// One for each render state bucket. It's either the index of the next indirect args or the index to the next task payload
-[[vk::binding(9)]] StructuredBuffer<U32> g_drawIndirectArgsIndexOrTaskPayloadIndex;
+// One for each render state bucket. x: first element of the bucket's sub-range in the instance buffer, y: max
+// element count of that sub-range
+[[vk::binding(9)]] StructuredBuffer<UVec2> g_instanceRanges;
 
 #if DISTANCE_TEST == 0
 [[vk::binding(10)]] ConstantBuffer<FrustumGpuVisibilityConstants> g_consts;
@@ -154,84 +162,99 @@ struct DrawIndirectArgsWithPadding
 	const GpuSceneMeshLod meshLod = g_meshLods[meshLodIndex];
 
 	const Bool isParticleEmitter = renderable.m_particleEmitterOffset != 0;
-	const Bool usesMeshShaders = meshLod.m_meshletCount != 0u;
+	ANKI_MAYBE_UNUSED(isParticleEmitter);
 
+	const Bool usesMeshShaders = meshLod.m_meshletCount != 0u;
 	if(usesMeshShaders)
 	{
+#if GATHER_MESHLET_GROUPS
 		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMeshletGroupSize - 1u)) / kMeshletGroupSize;
 
-		U32 payloadIdx;
-		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, payloadIdx);
+		U32 instanceIdx;
+		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, instanceIdx);
 
-		if(payloadIdx == 0u)
+		if(instanceIdx == 0u)
 		{
 			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 1u;
 			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountZ = 1u;
 		}
-		else if(payloadIdx >= kMaxMeshletGroupCountPerRenderStateBucket)
+		else if(instanceIdx >= g_instanceRanges[renderStateBucket].y)
 		{
 			// Reached a memory limit, cancel the job
 			ANKI_ASSERT(0);
-			payloadIdx = 0;
+			instanceIdx = 0;
 			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 0u;
 		}
 
-		payloadIdx += g_drawIndirectArgsIndexOrTaskPayloadIndex[renderStateBucket];
+		instanceIdx += g_instanceRanges[renderStateBucket].x;
 
 		// Divide the mesh into meshlet groups and add them as task payloads
-		GpuSceneTaskShaderPayload payload;
-		payload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit = (lod << 30u) | (renderableIdx << 9u);
+		GpuSceneMeshletGroupInstance instance;
+		instance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit = (lod << 30u) | (renderableIdx << 9u);
 
 		for(U32 i = 0; i < meshletGroupCount; ++i)
 		{
-			g_taskShaderPayloads[payloadIdx + i] = payload;
+			g_meshletGroupInstances[instanceIdx + i] = instance;
 
-			++payload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
+			++instance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
 		}
+#endif
 	}
 	else
 	{
+#if GATHER_MDI
 		U32 bucketDrawcallIdx;
 		InterlockedAdd(g_mdiDrawCounts[renderStateBucket], 1, bucketDrawcallIdx);
-		const U32 indirectIdx = bucketDrawcallIdx + g_drawIndirectArgsIndexOrTaskPayloadIndex[renderStateBucket];
 
-		if(!isParticleEmitter)
+		if(bucketDrawcallIdx >= g_instanceRanges[renderStateBucket].y)
 		{
-			// Regular renderables are always indexed
-
-			DrawIndexedIndirectArgs indirect;
-			indirect.m_indexCount = meshLod.m_indexCount;
-			indirect.m_instanceCount = 1;
-			indirect.m_firstIndex = meshLod.m_firstIndex;
-			indirect.m_vertexOffset = 0;
-			indirect.m_firstInstance = bucketDrawcallIdx;
-			g_drawIndexedIndirectArgs[indirectIdx] = indirect;
-
-			UVec4 instanceVertex;
-			instanceVertex.x = renderable.m_worldTransformsOffset;
-			instanceVertex.y = renderable.m_constantsOffset;
-			instanceVertex.z = meshLodIndex;
-			instanceVertex.w = renderable.m_boneTransformsOffset;
-			g_instanceRateRenderables[indirectIdx] = instanceVertex;
+			// OoM, ignore
+			ANKI_ASSERT(0);
+			U32 orig;
+			InterlockedExchange(g_mdiDrawCounts[renderStateBucket], g_instanceRanges[renderStateBucket].y, orig);
 		}
 		else
 		{
-			const GpuSceneParticleEmitter emitter = g_gpuScene.Load<GpuSceneParticleEmitter>(renderable.m_particleEmitterOffset);
-
-			DrawIndirectArgsWithPadding indirect;
-			indirect.m_vertexCount = emitter.m_aliveParticleCount * meshLod.m_indexCount;
-			indirect.m_instanceCount = 1;
-			indirect.m_firstVertex = 0;
-			indirect.m_firstInstance = bucketDrawcallIdx;
-			g_drawIndirectArgs[indirectIdx] = indirect;
-
-			UVec4 instanceVertex;
-			instanceVertex.x = renderable.m_worldTransformsOffset;
-			instanceVertex.y = renderable.m_constantsOffset;
-			instanceVertex.z = meshLodIndex;
-			instanceVertex.w = renderable.m_particleEmitterOffset;
-			g_instanceRateRenderables[indirectIdx] = instanceVertex;
+			const U32 indirectIdx = bucketDrawcallIdx + g_instanceRanges[renderStateBucket].x;
+			if(!isParticleEmitter)
+			{
+				// Regular renderables are always indexed
+
+				DrawIndexedIndirectArgs indirect;
+				indirect.m_indexCount = meshLod.m_indexCount;
+				indirect.m_instanceCount = 1;
+				indirect.m_firstIndex = meshLod.m_firstIndex;
+				indirect.m_vertexOffset = 0;
+				indirect.m_firstInstance = bucketDrawcallIdx;
+				g_drawIndexedIndirectArgs[indirectIdx] = indirect;
+
+				UVec4 instanceVertex;
+				instanceVertex.x = renderable.m_worldTransformsOffset;
+				instanceVertex.y = renderable.m_constantsOffset;
+				instanceVertex.z = meshLodIndex;
+				instanceVertex.w = renderable.m_boneTransformsOffset;
+				g_instanceRateRenderables[indirectIdx] = instanceVertex;
+			}
+			else
+			{
+				const GpuSceneParticleEmitter emitter = g_gpuScene.Load<GpuSceneParticleEmitter>(renderable.m_particleEmitterOffset);
+
+				DrawIndirectArgsWithPadding indirect;
+				indirect.m_vertexCount = emitter.m_aliveParticleCount * meshLod.m_indexCount;
+				indirect.m_instanceCount = 1;
+				indirect.m_firstVertex = 0;
+				indirect.m_firstInstance = bucketDrawcallIdx;
+				g_drawIndirectArgs[indirectIdx] = indirect;
+
+				UVec4 instanceVertex;
+				instanceVertex.x = renderable.m_worldTransformsOffset;
+				instanceVertex.y = renderable.m_constantsOffset;
+				instanceVertex.z = meshLodIndex;
+				instanceVertex.w = renderable.m_particleEmitterOffset;
+				g_instanceRateRenderables[indirectIdx] = instanceVertex;
+			}
 		}
+#endif
 	}
 
 #if HASH_VISIBLES

+ 42 - 27
AnKi/Shaders/GpuVisibilityMeshlet.ankiprog

@@ -19,35 +19,42 @@
 
 #define THREADGROUP_SIZE ANKI_TASK_SHADER_THREADGROUP_SIZE
 
-[[vk::binding(0)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
+[[vk::binding(0)]] StructuredBuffer<GpuSceneMeshletGroupInstance> g_meshletGroupInstances;
 [[vk::binding(1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
 [[vk::binding(2)]] StructuredBuffer<GpuSceneMeshLod> g_meshLods;
 [[vk::binding(3)]] ByteAddressBuffer g_gpuScene;
 [[vk::binding(4)]] StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes;
-[[vk::binding(5)]] RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArg;
-[[vk::binding(6)]] RWStructuredBuffer<GpuSceneMeshletInstance> g_drawInstances;
+[[vk::binding(5)]] RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArgs;
+[[vk::binding(6)]] RWStructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances;
 [[vk::binding(7)]] Texture2D<Vec4> g_hzbTexture;
 [[vk::binding(8)]] SamplerState g_nearestClampSampler;
 
-struct MaterialGlobalConstants
+/// Push constants of the meshlet culling dispatch (bound via vk::push_constant below).
+struct Consts
 {
 	Mat4 m_viewProjectionMatrix;
-	Mat3x4 m_cameraTransform;
+
+	Vec3 m_cameraPos; ///< Camera origin. Used by the meshlet backface (cone) culling.
+	U32 m_firstDrawArg; ///< Index of this bucket's arguments inside g_indirectDrawArgs.
+
+	Vec2 m_viewportSizef;
+	U32 m_firstMeshletGroup; ///< First element of the bucket's sub-range inside g_meshletGroupInstances.
+	U32 m_firstMeshlet; ///< First element of the bucket's sub-range inside g_meshletInstances.
+
+	U32 m_maxMeshlets; ///< Capacity of the bucket's sub-range in g_meshletInstances. Exceeding it is treated as OoM.
+	U32 m_padding1;
+	U32 m_padding2;
+	U32 m_padding3;
 };
-[[vk::push_constant]] ConstantBuffer<MaterialGlobalConstants> g_consts;
+[[vk::push_constant]] ConstantBuffer<Consts> g_consts;
 
 [numthreads(THREADGROUP_SIZE, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupId : SV_GROUPID,
 											   U32 svGroupIndex : SV_GROUPINDEX)
 {
-	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_consts.m_firstPayload + svGroupId];
+	const GpuSceneMeshletGroupInstance groupInstance = g_meshletGroupInstances[g_consts.m_firstMeshletGroup + svGroupId];
 
-	const U32 lod = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
-	const U32 renderableIdx = (inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);
-	const U32 meshletGroup = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit & ((1u << 9u) - 1u);
+	const U32 lod = groupInstance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
+	const U32 renderableIdx = (groupInstance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);
+	const U32 meshletGroup = groupInstance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit & ((1u << 9u) - 1u);
 
 	const GpuSceneRenderable renderable = g_renderables[renderableIdx];
 	const GpuSceneMeshLod meshLod = g_meshLods[renderable.m_meshLodsIndex + lod];
@@ -66,8 +73,7 @@ struct MaterialGlobalConstants
 
 #if MESHLET_BACKFACE_CULLING
 		const Vec4 coneDirAndAng = unpackSnorm4x8(meshletBoundingVol.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
-		cull = cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform,
-								   g_consts.m_cameraTransform.getTranslationPart());
+		cull = cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform, g_consts.m_cameraPos);
 #endif
 
 		const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
@@ -96,19 +102,28 @@ struct MaterialGlobalConstants
 		if(!cull)
 		{
 			U32 instanceIdx;
-			InterlockedAdd(g_indirectDrawArg[0].m_instanceCount, 1u, instanceIdx);
-
-			InterlockedMax(g_indirectDrawArg[0].m_vertexCount, meshletBoundingVol.m_primitiveCount * 3u);
-
-			GpuSceneMeshletInstance instance;
-			instance.m_meshletGeometryDescriptorIndex = firstMeshletGeometryDescriptor + svGroupIndex;
-			instance.m_worldTransformsOffset_25bit_meshletPrimitiveCount_7bit = renderable.m_worldTransformsOffset << 7u;
-			instance.m_worldTransformsOffset_25bit_meshletPrimitiveCount_7bit |= meshletBoundingVol.m_primitiveCount;
-			instance.m_constantsOffset = renderable.m_constantsOffset;
-			instance.m_boneTransformsOrParticleEmitterOffset =
-				(renderable.m_boneTransformsOffset) ? renderable.m_boneTransformsOffset : renderable.m_particleEmitterOffset;
-
-			g_drawInstances[instanceIdx] = instance;
+			InterlockedAdd(g_indirectDrawArgs[g_consts.m_firstDrawArg].m_instanceCount, 1u, instanceIdx);
+
+			if(instanceIdx >= g_consts.m_maxMeshlets)
+			{
+				// OoM, ignore
+				U32 orig;
+				InterlockedExchange(g_indirectDrawArgs[g_consts.m_firstDrawArg].m_instanceCount, g_consts.m_maxMeshlets, orig);
+			}
+			else
+			{
+				InterlockedMax(g_indirectDrawArgs[g_consts.m_firstDrawArg].m_vertexCount, meshletBoundingVol.m_primitiveCount * 3u);
+
+				GpuSceneMeshletInstance instance;
+				instance.m_meshletGeometryDescriptorIndex = firstMeshletGeometryDescriptor + svGroupIndex;
+				instance.m_worldTransformsOffset_25bit_meshletPrimitiveCount_7bit = renderable.m_worldTransformsOffset << 7u;
+				instance.m_worldTransformsOffset_25bit_meshletPrimitiveCount_7bit |= meshletBoundingVol.m_primitiveCount;
+				instance.m_constantsOffset = renderable.m_constantsOffset;
+				instance.m_boneTransformsOrParticleEmitterOffset =
+					(renderable.m_boneTransformsOffset) ? renderable.m_boneTransformsOffset : renderable.m_particleEmitterOffset;
+
+				g_meshletInstances[g_consts.m_firstMeshlet + instanceIdx] = instance;
+			}
 		}
 	}
 }

+ 0 - 4
AnKi/Shaders/Include/Common.h

@@ -795,10 +795,6 @@ constexpr U32 kMeshletGroupSize = ANKI_TASK_SHADER_THREADGROUP_SIZE;
 #define ANKI_MESH_SHADER_THREADGROUP_SIZE 32u
 static_assert(kMaxVerticesPerMeshlet % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
 
-/// Assume that a render state bucket can't go beyond 100M triangles. This helps ground some memory allocations.
-constexpr U32 kMaxVisibleMeshletsPerRenderStateBucket = 100000000 / kMaxPrimitivesPerMeshlet;
-constexpr U32 kMaxMeshletGroupCountPerRenderStateBucket = kMaxVisibleMeshletsPerRenderStateBucket / kMeshletGroupSize;
-
 struct DrawIndirectArgs
 {
 	U32 m_vertexCount;

+ 1 - 1
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -39,7 +39,7 @@ struct GpuSceneRenderableInstance
 static_assert(sizeof(GpuSceneRenderableInstance) == sizeof(UVec4));
 
 /// Input to a single task shader threadgroup. Something similar to GpuSceneRenderableInstance but for mesh shading.
-struct GpuSceneTaskShaderPayload
+struct GpuSceneMeshletGroupInstance
 {
 	U32 m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
 };

+ 1 - 1
AnKi/Shaders/Include/MaterialTypes.h

@@ -47,7 +47,7 @@ enum class MaterialBinding : U32
 	// For mesh shading
 	kMeshletBoundingVolumes, ///< Points to the unified geom buffer
 	kMeshletGeometryDescriptors, ///< Points to the unified geom buffer
-	kTaskShaderPayloads,
+	kMeshletGroups,
 	kRenderables,
 	kMeshLods,
 	kHzbTexture,

+ 1 - 1
AnKi/Shaders/MaterialShadersCommon.hlsl

@@ -27,7 +27,7 @@ ANKI_BINDLESS_SET(MaterialSet::kBindless)
 [[vk::binding(MaterialBinding::kMeshletBoundingVolumes, MaterialSet::kGlobal)]] StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes;
 [[vk::binding(MaterialBinding::kMeshletGeometryDescriptors, MaterialSet::kGlobal)]] StructuredBuffer<MeshletGeometryDescriptor>
 	g_meshletGeometryDescriptors;
-[[vk::binding(MaterialBinding::kTaskShaderPayloads, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
+[[vk::binding(MaterialBinding::kMeshletGroups, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneMeshletGroupInstance> g_meshletGroups;
 [[vk::binding(MaterialBinding::kRenderables, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
 [[vk::binding(MaterialBinding::kMeshLods, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneMeshLod> g_meshLods;
 [[vk::binding(MaterialBinding::kHzbTexture, MaterialSet::kGlobal)]] Texture2D<Vec4> g_hzbTexture;