Browse Source

Change GPU visibility to 2 stage. Remove task shaders

Panagiotis Christopoulos Charitos 1 year ago
parent
commit
e71a8b1106
35 changed files with 1420 additions and 1503 deletions
  1. 1 1
      AnKi/Core/CVarSet.cpp
  2. 1 1
      AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
  3. 35 0
      AnKi/Core/StatsSet.h
  4. 9 4
      AnKi/Gr/Vulkan/VkDescriptor.cpp
  5. 1 0
      AnKi/Gr/Vulkan/VkDescriptor.h
  6. 1 1
      AnKi/Gr/Vulkan/VkGraphicsState.cpp
  7. 1 22
      AnKi/Renderer/ForwardShading.cpp
  8. 0 1
      AnKi/Renderer/ForwardShading.h
  9. 2 26
      AnKi/Renderer/GBuffer.cpp
  10. 0 3
      AnKi/Renderer/GBuffer.h
  11. 6 46
      AnKi/Renderer/IndirectDiffuseProbes.cpp
  12. 25 64
      AnKi/Renderer/ProbeReflections.cpp
  13. 0 3
      AnKi/Renderer/ProbeReflections.h
  14. 19 63
      AnKi/Renderer/ShadowMapping.cpp
  15. 3 5
      AnKi/Renderer/ShadowMapping.h
  16. 71 77
      AnKi/Renderer/Utils/Drawer.cpp
  17. 9 23
      AnKi/Renderer/Utils/Drawer.h
  18. 490 405
      AnKi/Renderer/Utils/GpuVisibility.cpp
  19. 37 144
      AnKi/Renderer/Utils/GpuVisibility.h
  20. 0 6
      AnKi/Resource/MaterialResource.cpp
  21. 23 14
      AnKi/Scene/RenderStateBucket.cpp
  22. 14 7
      AnKi/Scene/RenderStateBucket.h
  23. 18 0
      AnKi/Shaders/Common.hlsl
  24. 14 120
      AnKi/Shaders/GBufferGeneric.ankiprog
  25. 0 292
      AnKi/Shaders/GpuVisibility.ankiprog
  26. 0 137
      AnKi/Shaders/GpuVisibilityMeshlet.ankiprog
  27. 318 0
      AnKi/Shaders/GpuVisibilityStage1.ankiprog
  28. 261 0
      AnKi/Shaders/GpuVisibilityStage2.ankiprog
  29. 1 8
      AnKi/Shaders/Include/GpuSceneTypes.h
  30. 11 0
      AnKi/Shaders/Include/GpuVisibilityTypes.h
  31. 9 8
      AnKi/Shaders/Include/MaterialTypes.h
  32. 6 0
      AnKi/Shaders/Intellisense.hlsl
  33. 11 1
      AnKi/Shaders/MaterialShadersCommon.hlsl
  34. 16 12
      AnKi/Shaders/ShadowMappingVetVisibility.ankiprog
  35. 7 9
      Samples/PhysicsPlayground/Assets/Smoke.ankimtl

+ 1 - 1
AnKi/Core/CVarSet.cpp

@@ -35,7 +35,7 @@ void CVarSet::registerCVar(CVar* cvar)
 
 Error CVarSet::setMultiple(ConstWeakArray<const Char*> arr)
 {
-	for(U i = 0; i < arr.getSize(); ++i)
+	for(U32 i = 0; i < arr.getSize(); ++i)
 	{
 		ANKI_ASSERT(arr[i]);
 		const CString varName = arr[i];

+ 1 - 1
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -81,7 +81,7 @@ private:
 		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
-								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kTransferDestination;
+								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kAllTransfer;
 		if(GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled)
 		{
 			buffUsage |= (BufferUsageBit::kAccelerationStructureBuildScratch | BufferUsageBit::kAccelerationStructureBuild);

+ 35 - 0
AnKi/Core/StatsSet.h

@@ -176,6 +176,41 @@ public:
 #endif
 	}
 
+	template<std::integral T>
+	U64 max(T value)
+	{
+#if ANKI_STATS_ENABLED
+		ANKI_ASSERT(!(m_flags & StatFlag::kFloat));
+		checkThread();
+		U64 orig;
+		if(!!(m_flags & StatFlag::kMainThreadUpdates))
+		{
+			orig = m_u;
+			m_u = value;
+		}
+		else
+		{
+			orig = m_atomic.max(value);
+		}
+		return orig;
+#else
+		(void)value;
+		return 0;
+#endif
+	}
+
+	template<std::floating_point T>
+	F64 max([[maybe_unused]] T value)
+	{
+#if ANKI_STATS_ENABLED
+		ANKI_ASSERT("Not supported for floats");
+		return 0.0;
+#else
+		(void)value;
+		return 0.0;
+#endif
+	}
+
 	template<std::integral T>
 	U64 getValue() const
 	{

+ 9 - 4
AnKi/Gr/Vulkan/VkDescriptor.cpp

@@ -73,8 +73,9 @@ void DescriptorAllocator::createNewBlock()
 	inf.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
 	inf.flags = 0;
 	inf.maxSets = g_dsAllocatorConsts.m_maxSets * powu(kDescriptorSetGrowScale, m_blocks.getSize());
-	static_assert(DescriptorType::kAccelerationStructure == DescriptorType::kCount - 1, "Needs to be the last for the bellow to work");
-	inf.poolSizeCount = rtEnabled ? U32(DescriptorType::kCount) : U32(DescriptorType::kCount) - 1;
+	ANKI_ASSERT(g_dsAllocatorConsts.m_descriptorCount.getBack().first == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR
+				&& "Needs to be the last for the below to work");
+	inf.poolSizeCount = rtEnabled ? g_dsAllocatorConsts.m_descriptorCount.getSize() : g_dsAllocatorConsts.m_descriptorCount.getSize() - 1;
 	inf.pPoolSizes = poolSizes.getBegin();
 
 	VkDescriptorPool handle;
@@ -82,6 +83,7 @@ void DescriptorAllocator::createNewBlock()
 
 	Block& block = *m_blocks.emplaceBack();
 	block.m_pool = handle;
+	block.m_maxDsets = inf.maxSets;
 
 	g_descriptorSetsAllocatedStatVar.increment(1);
 }
@@ -101,7 +103,7 @@ void DescriptorAllocator::allocate(VkDescriptorSetLayout layout, VkDescriptorSet
 	do
 	{
 		VkResult res;
-		if(m_blocks[m_activeBlock].m_dsetsAllocatedCount > g_dsAllocatorConsts.m_maxSets * powu(kDescriptorSetGrowScale, m_activeBlock) * 2)
+		if(m_blocks[m_activeBlock].m_dsetsAllocatedCount > m_blocks[m_activeBlock].m_maxDsets * 2)
 		{
 			// The driver doesn't respect VkDescriptorPoolCreateInfo::maxSets. It should have thrown OoM already. To avoid growing the same DS forever
 			// force OoM
@@ -163,7 +165,10 @@ void DescriptorAllocator::reset()
 	// Reset the remaining pools
 	for(Block& b : m_blocks)
 	{
-		vkResetDescriptorPool(getVkDevice(), b.m_pool, 0);
+		if(b.m_dsetsAllocatedCount > 0)
+		{
+			vkResetDescriptorPool(getVkDevice(), b.m_pool, 0);
+		}
 		b.m_dsetsAllocatedCount = 0;
 	}
 

+ 1 - 0
AnKi/Gr/Vulkan/VkDescriptor.h

@@ -45,6 +45,7 @@ private:
 	public:
 		VkDescriptorPool m_pool = VK_NULL_HANDLE;
 		U32 m_dsetsAllocatedCount = 0;
+		U32 m_maxDsets = 0;
 	};
 
 	static constexpr U32 kDescriptorSetGrowScale = 2;

+ 1 - 1
AnKi/Gr/Vulkan/VkGraphicsState.cpp

@@ -470,7 +470,7 @@ Error PipelineCache::init(CString cacheDir)
 	ANKI_VK_CHECK(vkCreatePipelineCache(getVkDevice(), &ci, nullptr, &m_cacheHandle));
 
 #if ANKI_PLATFORM_MOBILE
-	ANKI_ASSERT(GrManager::getSingleton().getDeviceCapabilities() != GpuVendor::kNone);
+	ANKI_ASSERT(GrManager::getSingleton().getDeviceCapabilities().m_gpuVendor != GpuVendor::kUnknown);
 	if(GrManager::getSingleton().getDeviceCapabilities().m_gpuVendor == GpuVendor::kQualcomm)
 	{
 		// Calling vkCreateGraphicsPipeline from multiple threads crashes qualcomm's compiler

+ 1 - 22
AnKi/Renderer/ForwardShading.cpp

@@ -42,21 +42,6 @@ void ForwardShading::populateRenderGraph(RenderingContext& ctx)
 	visIn.m_viewportSize = getRenderer().getInternalResolution();
 
 	getRenderer().getGpuVisibility().populateRenderGraph(visIn, m_runCtx.m_visOut);
-
-	if(getRenderer().runSoftwareMeshletRendering())
-	{
-		GpuMeshletVisibilityInput meshIn;
-		meshIn.m_passesName = "FW shading";
-		meshIn.m_technique = RenderingTechnique::kForward;
-		meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
-		meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
-		meshIn.m_viewportSize = getRenderer().getInternalResolution();
-		meshIn.m_rgraph = &rgraph;
-		meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
-		meshIn.fillBuffers(m_runCtx.m_visOut);
-
-		getRenderer().getGpuVisibility().populateRenderGraph(meshIn, m_runCtx.m_meshVisOut);
-	}
 }
 
 void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgraphCtx)
@@ -101,11 +86,6 @@ void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgr
 		args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
 		args.fill(m_runCtx.m_visOut);
 
-		if(m_runCtx.m_meshVisOut.isFilled())
-		{
-			args.fill(m_runCtx.m_meshVisOut);
-		}
-
 		getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 
 		// Restore state
@@ -130,8 +110,7 @@ void ForwardShading::setDependencies(GraphicsRenderPass& pass)
 
 	if(m_runCtx.m_visOut.containsDrawcalls())
 	{
-		pass.newBufferDependency((m_runCtx.m_meshVisOut.isFilled()) ? m_runCtx.m_meshVisOut.m_dependency : m_runCtx.m_visOut.m_dependency,
-								 BufferUsageBit::kIndirectDraw);
+		pass.newBufferDependency(m_runCtx.m_visOut.m_dependency, BufferUsageBit::kIndirectDraw);
 	}
 }
 

+ 0 - 1
AnKi/Renderer/ForwardShading.h

@@ -45,7 +45,6 @@ private:
 	{
 	public:
 		GpuVisibilityOutput m_visOut;
-		GpuMeshletVisibilityOutput m_meshVisOut;
 	} m_runCtx;
 };
 /// @}

+ 2 - 26
AnKi/Renderer/GBuffer.cpp

@@ -107,7 +107,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 
 	// Visibility
 	GpuVisibilityOutput visOut;
-	GpuMeshletVisibilityOutput meshletVisOut;
 	{
 		const CommonMatrices& matrices = (getRenderer().getFrameCount() <= 1) ? ctx.m_matrices : ctx.m_prevMatrices;
 		const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
@@ -127,21 +126,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 
 		m_runCtx.m_visibleAaabbIndicesBuffer = visOut.m_visibleAaabbIndicesBuffer;
 		m_runCtx.m_visibleAaabbIndicesBufferDepedency = visOut.m_dependency;
-
-		if(getRenderer().runSoftwareMeshletRendering())
-		{
-			GpuMeshletVisibilityInput meshIn;
-			meshIn.m_passesName = "GBuffer";
-			meshIn.m_technique = RenderingTechnique::kGBuffer;
-			meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
-			meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
-			meshIn.m_viewportSize = getRenderer().getInternalResolution();
-			meshIn.m_rgraph = &rgraph;
-			meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
-			meshIn.fillBuffers(visOut);
-
-			getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
-		}
 	}
 
 	const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
@@ -181,7 +165,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 	pass.setRenderpassInfo(WeakArray{colorRti}, &depthRti, 0, 0, kMaxU32, kMaxU32, (enableVrs) ? &sriRt : nullptr,
 						   (enableVrs) ? getRenderer().getVrsSriGeneration().getSriTexelDimension() : 0,
 						   (enableVrs) ? getRenderer().getVrsSriGeneration().getSriTexelDimension() : 0);
-	pass.setWork([this, &ctx, visOut, meshletVisOut](RenderPassWorkContext& rgraphCtx) {
+	pass.setWork([this, &ctx, visOut](RenderPassWorkContext& rgraphCtx) {
 		ANKI_TRACE_SCOPED_EVENT(GBuffer);
 
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -214,10 +198,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 		}
 
 		args.fill(visOut);
-		if(meshletVisOut.isFilled())
-		{
-			args.fill(meshletVisOut);
-		}
 
 		cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
 		getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
@@ -243,11 +223,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageGeometryRead | BufferUsageBit::kStorageFragmentRead);
 
 	// Only add one dependency to the GPU visibility. No need to track all buffers
-	if(meshletVisOut.isFilled())
-	{
-		pass.newBufferDependency(meshletVisOut.m_dependency, BufferUsageBit::kIndirectDraw);
-	}
-	else if(visOut.containsDrawcalls())
+	if(visOut.containsDrawcalls())
 	{
 		pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);
 	}

+ 0 - 3
AnKi/Renderer/GBuffer.h

@@ -10,9 +10,6 @@
 
 namespace anki {
 
-// Forward
-class GpuVisibilityOutput;
-
 /// @addtogroup renderer
 /// @{
 

+ 6 - 46
AnKi/Renderer/IndirectDiffuseProbes.cpp

@@ -197,7 +197,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 		{
 			// GBuffer visibility
 			GpuVisibilityOutput visOut;
-			GpuMeshletVisibilityOutput meshletVisOut;
 			Frustum frustum;
 			{
 				frustum.setPerspective(kClusterObjectFrustumNearPlane, probeToRefresh->getRenderRadius(), kPi / 2.0f, kPi / 2.0f);
@@ -215,22 +214,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				visIn.m_lodDistances = lodDistances;
 				visIn.m_rgraph = &rgraph;
 				visIn.m_viewportSize = UVec2(m_tileSize);
+				visIn.m_limitMemory = true;
 
 				getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
-
-				if(getRenderer().runSoftwareMeshletRendering())
-				{
-					GpuMeshletVisibilityInput meshIn;
-					meshIn.m_passesName = visIn.m_passesName;
-					meshIn.m_technique = RenderingTechnique::kGBuffer;
-					meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
-					meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
-					meshIn.m_viewportSize = UVec2(m_tileSize);
-					meshIn.m_rgraph = &rgraph;
-					meshIn.fillBuffers(visOut);
-
-					getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
-				}
 			}
 
 			// GBuffer
@@ -258,10 +244,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				pass.newTextureDependency(gbufferDepthRt, TextureUsageBit::kAllFramebuffer,
 										  TextureSubresourceDesc::firstSurface(DepthStencilAspectBit::kDepth));
 
-				pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency,
-										 BufferUsageBit::kIndirectDraw);
+				pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);
 
-				pass.setWork([this, visOut, meshletVisOut, viewProjMat = frustum.getViewProjectionMatrix(),
+				pass.setWork([this, visOut, viewProjMat = frustum.getViewProjectionMatrix(),
 							  viewMat = frustum.getViewMatrix()](RenderPassWorkContext& rgraphCtx) {
 					ANKI_TRACE_SCOPED_EVENT(RIndirectDiffuse);
 					CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -278,11 +263,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					args.m_viewport = UVec4(0, 0, m_tileSize, m_tileSize);
 					args.fill(visOut);
 
-					if(meshletVisOut.isFilled())
-					{
-						args.fill(meshletVisOut);
-					}
-
 					getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 
 					// It's secondary, no need to restore any state
@@ -291,7 +271,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 
 			// Shadow visibility. Optional
 			GpuVisibilityOutput shadowVisOut;
-			GpuMeshletVisibilityOutput shadowMeshletVisOut;
 			Mat4 cascadeProjMat;
 			Mat3x4 cascadeViewMat;
 			Mat4 cascadeViewProjMat;
@@ -313,22 +292,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				visIn.m_lodDistances = lodDistances;
 				visIn.m_rgraph = &rgraph;
 				visIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
+				visIn.m_limitMemory = true;
 
 				getRenderer().getGpuVisibility().populateRenderGraph(visIn, shadowVisOut);
-
-				if(getRenderer().runSoftwareMeshletRendering())
-				{
-					GpuMeshletVisibilityInput meshIn;
-					meshIn.m_passesName = visIn.m_passesName;
-					meshIn.m_technique = RenderingTechnique::kDepth;
-					meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
-					meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
-					meshIn.m_viewportSize = visIn.m_viewportSize;
-					meshIn.m_rgraph = &rgraph;
-					meshIn.fillBuffers(shadowVisOut);
-
-					getRenderer().getGpuVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
-				}
 			}
 
 			// Shadow pass. Optional
@@ -345,10 +311,9 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 
 				pass.newTextureDependency(shadowsRt, TextureUsageBit::kAllFramebuffer,
 										  TextureSubresourceDesc::firstSurface(DepthStencilAspectBit::kDepth));
-				pass.newBufferDependency((shadowMeshletVisOut.isFilled()) ? shadowMeshletVisOut.m_dependency : shadowVisOut.m_dependency,
-										 BufferUsageBit::kIndirectDraw);
+				pass.newBufferDependency(shadowVisOut.m_dependency, BufferUsageBit::kIndirectDraw);
 
-				pass.setWork([this, shadowVisOut, shadowMeshletVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
+				pass.setWork([this, shadowVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
 					ANKI_TRACE_SCOPED_EVENT(RIndirectDiffuse);
 					CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
@@ -367,11 +332,6 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					args.m_viewport = UVec4(0, 0, rez, rez);
 					args.fill(shadowVisOut);
 
-					if(shadowMeshletVisOut.isFilled())
-					{
-						args.fill(shadowMeshletVisOut);
-					}
-
 					getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 
 					// It's secondary, no need to restore the state

+ 25 - 64
AnKi/Renderer/ProbeReflections.cpp

@@ -197,7 +197,6 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 	{
 		// GBuffer visibility
 		GpuVisibilityOutput visOut;
-		GpuMeshletVisibilityOutput meshletVisOut;
 		Frustum frustum;
 		{
 			frustum.setPerspective(kClusterObjectFrustumNearPlane, probeToRefresh->getRenderRadius(), kPi / 2.0f, kPi / 2.0f);
@@ -215,22 +214,9 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			visIn.m_lodDistances = lodDistances;
 			visIn.m_rgraph = &rgraph;
 			visIn.m_viewportSize = UVec2(m_gbuffer.m_tileSize);
+			visIn.m_limitMemory = true;
 
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
-
-			if(getRenderer().runSoftwareMeshletRendering())
-			{
-				GpuMeshletVisibilityInput meshIn;
-				meshIn.m_passesName = "Cube refl: GBuffer";
-				meshIn.m_technique = RenderingTechnique::kGBuffer;
-				meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
-				meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
-				meshIn.m_viewportSize = UVec2(m_gbuffer.m_tileSize);
-				meshIn.m_rgraph = &rgraph;
-				meshIn.fillBuffers(visOut);
-
-				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
-			}
 		}
 
 		// GBuffer pass
@@ -258,37 +244,31 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			}
 
 			pass.newTextureDependency(gbufferDepthRt, TextureUsageBit::kAllFramebuffer, DepthStencilAspectBit::kDepth);
-			pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency, BufferUsageBit::kIndirectDraw);
-
-			pass.setWork([this, visOut, meshletVisOut, viewProjMat = frustum.getViewProjectionMatrix(),
-						  viewMat = frustum.getViewMatrix()](RenderPassWorkContext& rgraphCtx) {
-				ANKI_TRACE_SCOPED_EVENT(ProbeReflections);
-				CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
-
-				cmdb.setViewport(0, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
-
-				RenderableDrawerArguments args;
-				args.m_viewMatrix = viewMat;
-				args.m_cameraTransform = viewMat.getInverseTransformation();
-				args.m_viewProjectionMatrix = viewProjMat;
-				args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care about prev mats
-				args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
-				args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
-				args.m_viewport = UVec4(0, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
-				args.fill(visOut);
-
-				if(meshletVisOut.isFilled())
-				{
-					args.fill(meshletVisOut);
-				}
-
-				getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
-			});
+			pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);
+
+			pass.setWork(
+				[this, visOut, viewProjMat = frustum.getViewProjectionMatrix(), viewMat = frustum.getViewMatrix()](RenderPassWorkContext& rgraphCtx) {
+					ANKI_TRACE_SCOPED_EVENT(ProbeReflections);
+					CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+					cmdb.setViewport(0, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
+
+					RenderableDrawerArguments args;
+					args.m_viewMatrix = viewMat;
+					args.m_cameraTransform = viewMat.getInverseTransformation();
+					args.m_viewProjectionMatrix = viewProjMat;
+					args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care about prev mats
+					args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
+					args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
+					args.m_viewport = UVec4(0, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
+					args.fill(visOut);
+
+					getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
+				});
 		}
 
 		// Shadow visibility. Optional
 		GpuVisibilityOutput shadowVisOut;
-		GpuMeshletVisibilityOutput shadowMeshletVisOut;
 		Mat4 cascadeViewProjMat;
 		Mat3x4 cascadeViewMat;
 		Mat4 cascadeProjMat;
@@ -310,22 +290,9 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			visIn.m_lodDistances = lodDistances;
 			visIn.m_rgraph = &rgraph;
 			visIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
+			visIn.m_limitMemory = true;
 
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, shadowVisOut);
-
-			if(getRenderer().runSoftwareMeshletRendering())
-			{
-				GpuMeshletVisibilityInput meshIn;
-				meshIn.m_passesName = "Cube refl: Shadows";
-				meshIn.m_technique = RenderingTechnique::kDepth;
-				meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
-				meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
-				meshIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
-				meshIn.m_rgraph = &rgraph;
-				meshIn.fillBuffers(shadowVisOut);
-
-				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
-			}
 		}
 
 		// Shadows. Optional
@@ -342,10 +309,9 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			pass.setRenderpassInfo({}, &depthRti);
 
 			pass.newTextureDependency(shadowMapRt, TextureUsageBit::kAllFramebuffer, DepthStencilAspectBit::kDepth);
-			pass.newBufferDependency((shadowMeshletVisOut.isFilled()) ? shadowMeshletVisOut.m_dependency : shadowVisOut.m_dependency,
-									 BufferUsageBit::kIndirectDraw);
+			pass.newBufferDependency(shadowVisOut.m_dependency, BufferUsageBit::kIndirectDraw);
 
-			pass.setWork([this, shadowVisOut, shadowMeshletVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
+			pass.setWork([this, shadowVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
 				ANKI_TRACE_SCOPED_EVENT(ProbeReflections);
 
 				CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -364,11 +330,6 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				args.m_viewport = UVec4(0, 0, rez, rez);
 				args.fill(shadowVisOut);
 
-				if(shadowMeshletVisOut.isFilled())
-				{
-					args.fill(shadowMeshletVisOut);
-				}
-
 				getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 			});
 		}

+ 0 - 3
AnKi/Renderer/ProbeReflections.h

@@ -11,9 +11,6 @@
 
 namespace anki {
 
-// Forward
-class GpuVisibilityOutput;
-
 /// @addtogroup renderer
 /// @{
 

+ 19 - 63
AnKi/Renderer/ShadowMapping.cpp

@@ -375,19 +375,6 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 					createVetVisibilityPass(generateTempPassName("Shadows: Vet point light lightIdx:%u", lightIdx), *lightc, visOut, rgraph);
 			}
 
-			// Additional visibility
-			GpuMeshletVisibilityOutput meshletVisOut;
-			if(getRenderer().runSoftwareMeshletRendering())
-			{
-				PassthroughGpuMeshletVisibilityInput meshIn;
-				meshIn.m_passesName = generateTempPassName("Shadows point light lightIdx:%u", lightIdx);
-				meshIn.m_technique = RenderingTechnique::kDepth;
-				meshIn.m_rgraph = &rgraph;
-				meshIn.fillBuffers(visOut);
-
-				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
-			}
-
 			// Draw
 			Array<ShadowSubpassInfo, 6> subpasses;
 			for(U32 face = 0; face < 6; ++face)
@@ -405,7 +392,7 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 				subpasses[face].m_viewProjMat = frustum.getViewProjectionMatrix();
 			}
 
-			createDrawShadowsPass(subpasses, visOut, meshletVisOut, generateTempPassName("Shadows: Point light lightIdx:%u", lightIdx), rgraph);
+			createDrawShadowsPass(subpasses, visOut, generateTempPassName("Shadows: Point light lightIdx:%u", lightIdx), rgraph);
 		}
 		else
 		{
@@ -466,24 +453,8 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 					createVetVisibilityPass(generateTempPassName("Shadows: Vet spot light lightIdx:%u", lightIdx), *lightc, visOut, rgraph);
 			}
 
-			// Additional visibility
-			GpuMeshletVisibilityOutput meshletVisOut;
-			if(getRenderer().runSoftwareMeshletRendering())
-			{
-				GpuMeshletVisibilityInput meshIn;
-				meshIn.m_passesName = generateTempPassName("Shadows spot light lightIdx:%u", lightIdx);
-				meshIn.m_technique = RenderingTechnique::kDepth;
-				meshIn.m_viewProjectionMatrix = lightc->getSpotLightViewProjectionMatrix();
-				meshIn.m_cameraTransform = lightc->getSpotLightViewMatrix().getInverseTransformation();
-				meshIn.m_viewportSize = atlasViewport.zw();
-				meshIn.m_rgraph = &rgraph;
-				meshIn.fillBuffers(visOut);
-
-				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
-			}
-
 			// Add draw pass
-			createDrawShadowsPass(atlasViewport, lightc->getSpotLightViewProjectionMatrix(), lightc->getSpotLightViewMatrix(), visOut, meshletVisOut,
+			createDrawShadowsPass(atlasViewport, lightc->getSpotLightViewProjectionMatrix(), lightc->getSpotLightViewMatrix(), visOut,
 								  clearTileIndirectArgs, {}, generateTempPassName("Shadows: Spot light lightIdx:%u", lightIdx), rgraph);
 		}
 		else
@@ -552,24 +523,8 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 			GpuVisibilityOutput visOut;
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
 
-			// Additional visibility
-			GpuMeshletVisibilityOutput meshletVisOut;
-			if(getRenderer().runSoftwareMeshletRendering())
-			{
-				GpuMeshletVisibilityInput meshIn;
-				meshIn.m_passesName = generateTempPassName("Shadows: Dir light cascade lightIdx:%u cascade:%u", lightIdx, cascade);
-				meshIn.m_technique = RenderingTechnique::kDepth;
-				meshIn.m_viewProjectionMatrix = cascadeViewProjMats[cascade];
-				meshIn.m_cameraTransform = cascadeViewMats[cascade].getInverseTransformation();
-				meshIn.m_viewportSize = dirLightAtlasViewports[cascade].zw();
-				meshIn.m_rgraph = &rgraph;
-				meshIn.fillBuffers(visOut);
-
-				getRenderer().getGpuVisibility().populateRenderGraph(meshIn, meshletVisOut);
-			}
-
 			// Draw
-			createDrawShadowsPass(dirLightAtlasViewports[cascade], cascadeViewProjMats[cascade], cascadeViewMats[cascade], visOut, meshletVisOut, {},
+			createDrawShadowsPass(dirLightAtlasViewports[cascade], cascadeViewProjMats[cascade], cascadeViewMats[cascade], visOut, {},
 								  hzbGenIn.m_cascades[cascade].m_hzbRt,
 								  generateTempPassName("Shadows: Dir light lightIdx:%u cascade:%u", lightIdx, cascade), rgraph);
 
@@ -592,7 +547,8 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
 	pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kStorageComputeWrite);
 
 	pass.setWork([this, &lightc, hashBuff = visOut.m_visiblesHashBuffer, mdiBuff = visOut.m_legacy.m_mdiDrawCountsBuffer, clearTileIndirectArgs,
-				  taskShadersIndirectArgs = visOut.m_mesh.m_taskShaderIndirectArgsBuffer](RenderPassWorkContext& rpass) {
+				  dispatchMeshIndirectArgs = visOut.m_mesh.m_dispatchMeshIndirectArgsBuffer,
+				  drawIndirectArgs = visOut.m_mesh.m_drawIndirectArgs](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 		cmdb.bindShaderProgram(m_vetVisibilityGrProg.get());
@@ -601,11 +557,16 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
 		cmdb.setPushConstants(&lightIndex, sizeof(lightIndex));
 
 		cmdb.bindStorageBuffer(ANKI_REG(t0), hashBuff);
-		cmdb.bindStorageBuffer(ANKI_REG(u0), mdiBuff);
+		cmdb.bindStorageBuffer(ANKI_REG(u0), mdiBuff.isValid() ? mdiBuff : BufferView(&getRenderer().getDummyBuffer()).setRange(sizeof(U32)));
 		cmdb.bindStorageBuffer(ANKI_REG(u1), GpuSceneArrays::Light::getSingleton().getBufferView());
 		cmdb.bindStorageBuffer(ANKI_REG(u2), GpuSceneArrays::LightVisibleRenderablesHash::getSingleton().getBufferView());
 		cmdb.bindStorageBuffer(ANKI_REG(u3), clearTileIndirectArgs);
-		cmdb.bindStorageBuffer(ANKI_REG(u4), taskShadersIndirectArgs);
+		cmdb.bindStorageBuffer(ANKI_REG(u4), dispatchMeshIndirectArgs.isValid()
+												 ? dispatchMeshIndirectArgs
+												 : BufferView(&getRenderer().getDummyBuffer()).setRange(sizeof(DispatchIndirectArgs)));
+		cmdb.bindStorageBuffer(ANKI_REG(u5), drawIndirectArgs.isValid()
+												 ? drawIndirectArgs
+												 : BufferView(&getRenderer().getDummyBuffer()).setRange(sizeof(DrawIndirectArgs)));
 
 		ANKI_ASSERT(RenderStateBucketContainer::getSingleton().getBucketCount(RenderingTechnique::kDepth) <= 64 && "TODO");
 		cmdb.dispatchCompute(1, 1, 1);
@@ -615,8 +576,8 @@ BufferView ShadowMapping::createVetVisibilityPass(CString passName, const LightC
 }
 
 void ShadowMapping::createDrawShadowsPass(const UVec4& viewport, const Mat4& viewProjMat, const Mat3x4& viewMat, const GpuVisibilityOutput& visOut,
-										  const GpuMeshletVisibilityOutput& meshletVisOut, const BufferView& clearTileIndirectArgs,
-										  const RenderTargetHandle hzbRt, CString passName, RenderGraphBuilder& rgraph)
+										  const BufferView& clearTileIndirectArgs, const RenderTargetHandle hzbRt, CString passName,
+										  RenderGraphBuilder& rgraph)
 {
 	ShadowSubpassInfo spass;
 	spass.m_clearTileIndirectArgs = clearTileIndirectArgs;
@@ -625,11 +586,11 @@ void ShadowMapping::createDrawShadowsPass(const UVec4& viewport, const Mat4& vie
 	spass.m_viewport = viewport;
 	spass.m_viewProjMat = viewProjMat;
 
-	createDrawShadowsPass({&spass, 1}, visOut, meshletVisOut, passName, rgraph);
+	createDrawShadowsPass({&spass, 1}, visOut, passName, rgraph);
 }
 
-void ShadowMapping::createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subpasses_, const GpuVisibilityOutput& visOut,
-										  const GpuMeshletVisibilityOutput& meshletVisOut, CString passName, RenderGraphBuilder& rgraph)
+void ShadowMapping::createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subpasses_, const GpuVisibilityOutput& visOut, CString passName,
+										  RenderGraphBuilder& rgraph)
 {
 	WeakArray<ShadowSubpassInfo> subpasses;
 	newArray<ShadowSubpassInfo>(getRenderer().getFrameMemoryPool(), subpasses_.getSize(), subpasses);
@@ -666,10 +627,10 @@ void ShadowMapping::createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subp
 	smRti.m_subresource.m_depthStencilAspect = DepthStencilAspectBit::kDepth;
 	pass.setRenderpassInfo({}, &smRti, viewport[0], viewport[1], viewport[2], viewport[3]);
 
-	pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency, BufferUsageBit::kIndirectDraw);
+	pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);
 	pass.newTextureDependency(m_runCtx.m_rt, TextureUsageBit::kFramebufferWrite);
 
-	pass.setWork([this, visOut, meshletVisOut, subpasses, loadFb](RenderPassWorkContext& rgraphCtx) {
+	pass.setWork([this, visOut, subpasses, loadFb](RenderPassWorkContext& rgraphCtx) {
 		ANKI_TRACE_SCOPED_EVENT(ShadowMapping);
 
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -716,11 +677,6 @@ void ShadowMapping::createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subp
 				args.m_hzbTexture = rgraphCtx.createTextureView(spass.m_hzbRt, TextureSubresourceDesc::all());
 			}
 
-			if(meshletVisOut.isFilled())
-			{
-				args.fill(meshletVisOut);
-			}
-
 			getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 		}
 	});

+ 3 - 5
AnKi/Renderer/ShadowMapping.h

@@ -14,7 +14,6 @@ namespace anki {
 
 // Forward
 class GpuVisibilityOutput;
-class GpuMeshletVisibilityOutput;
 extern NumericCVar<U32> g_shadowMappingPcfCVar;
 
 /// @addtogroup renderer
@@ -84,11 +83,10 @@ private:
 									   RenderGraphBuilder& rgraph) const;
 
 	void createDrawShadowsPass(const UVec4& viewport, const Mat4& viewProjMat, const Mat3x4& viewMat, const GpuVisibilityOutput& visOut,
-							   const GpuMeshletVisibilityOutput& meshletVisOut, const BufferView& clearTileIndirectArgs,
-							   const RenderTargetHandle hzbRt, CString passName, RenderGraphBuilder& rgraph);
+							   const BufferView& clearTileIndirectArgs, const RenderTargetHandle hzbRt, CString passName, RenderGraphBuilder& rgraph);
 
-	void createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subPasses, const GpuVisibilityOutput& visOut,
-							   const GpuMeshletVisibilityOutput& meshletVisOut, CString passName, RenderGraphBuilder& rgraph);
+	void createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subPasses, const GpuVisibilityOutput& visOut, CString passName,
+							   RenderGraphBuilder& rgraph);
 };
 /// @}
 

+ 71 - 77
AnKi/Renderer/Utils/Drawer.cpp

@@ -63,9 +63,9 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESHLET_BOUNDING_VOLUMES), UnifiedGeometryBuffer::getSingleton().getBufferView());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESHLET_GEOMETRY_DESCRIPTORS), UnifiedGeometryBuffer::getSingleton().getBufferView());
-	if(args.m_mesh.m_meshletGroupInstancesBuffer.isValid())
+	if(args.m_mesh.m_meshletInstancesBuffer.isValid())
 	{
-		cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESHLET_GROUPS), args.m_mesh.m_meshletGroupInstancesBuffer);
+		cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESHLET_INSTANCES), args.m_mesh.m_meshletInstancesBuffer);
 	}
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_RENDERABLES), GpuSceneArrays::Renderable::getSingleton().getBufferView());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESH_LODS), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
@@ -76,6 +76,11 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 												   : TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::all()));
 	cmdb.bindSampler(ANKI_REG(ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER), getRenderer().getSamplers().m_nearestNearestClamp.get());
 
+	if(args.m_mesh.m_firstMeshletBuffer.isValid())
+	{
+		cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_FIRST_MESHLET), args.m_mesh.m_firstMeshletBuffer);
+	}
+
 	// Misc
 	cmdb.bindIndexBuffer(UnifiedGeometryBuffer::getSingleton().getBufferView(), IndexType::kU16);
 }
@@ -109,81 +114,70 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 	cmdb.setVertexAttribute(VertexAttributeSemantic::kMisc0, 0, Format::kR32G32B32A32_Uint, 0);
 
-	RenderStateBucketContainer::getSingleton().iterateBucketsPerformanceOrder(
-		args.m_renderingTechinuqe,
-		[&](const RenderStateInfo& state, U32 bucketIdx, U32 userCount, U32 meshletGroupCount, [[maybe_unused]] U32 meshletCount) {
-			if(userCount == 0)
-			{
-				return;
-			}
-
-			cmdb.bindShaderProgram(state.m_program.get());
-
-			const Bool meshlets = meshletGroupCount > 0;
-
-			if(meshlets && meshShaderHwSupport)
-			{
-				const UVec4 firstPayload(args.m_mesh.m_bucketMeshletGroupInstanceRanges[bucketIdx].getFirstInstance());
-				cmdb.setPushConstants(&firstPayload, sizeof(firstPayload));
-
-				cmdb.drawMeshTasksIndirect(BufferView(
-					&args.m_mesh.m_taskShaderIndirectArgsBuffer.getBuffer(),
-					args.m_mesh.m_taskShaderIndirectArgsBuffer.getOffset() + sizeof(DispatchIndirectArgs) * bucketIdx, sizeof(DispatchIndirectArgs)));
-			}
-			else if(meshlets)
-			{
-				const InstanceRange& instanceRange = args.m_softwareMesh.m_bucketMeshletInstanceRanges[bucketIdx];
-				const BufferView vertBufferView = BufferView(args.m_softwareMesh.m_meshletInstancesBuffer)
-													  .incrementOffset(instanceRange.getFirstInstance() * sizeof(GpuSceneMeshletInstance))
-													  .setRange(instanceRange.getInstanceCount() * sizeof(GpuSceneMeshletInstance));
-				cmdb.bindVertexBuffer(0, vertBufferView, sizeof(GpuSceneMeshletInstance), VertexStepRate::kInstance);
-
-				const BufferView indirectArgsBuffView = BufferView(args.m_softwareMesh.m_drawIndirectArgsBuffer)
-															.incrementOffset(sizeof(DrawIndirectArgs) * bucketIdx)
-															.setRange(sizeof(DrawIndirectArgs));
-				cmdb.drawIndirect(PrimitiveTopology::kTriangles, indirectArgsBuffView);
-			}
-			else if(state.m_indexedDrawcall)
-			{
-				// Legacy
-
-				const InstanceRange& instanceRange = args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx];
-				const U32 maxDrawCount = instanceRange.getInstanceCount();
-
-				const BufferView vertBufferView = BufferView(args.m_legacy.m_renderableInstancesBuffer)
-													  .incrementOffset(instanceRange.getFirstInstance() * sizeof(GpuSceneRenderableInstance))
-													  .setRange(instanceRange.getInstanceCount() * sizeof(GpuSceneRenderableInstance));
-				cmdb.bindVertexBuffer(0, vertBufferView, sizeof(GpuSceneRenderableInstance), VertexStepRate::kInstance);
-
-				const BufferView indirectArgsBuffView = BufferView(args.m_legacy.m_drawIndexedIndirectArgsBuffer)
-															.incrementOffset(instanceRange.getFirstInstance() * sizeof(DrawIndexedIndirectArgs))
-															.setRange(instanceRange.getInstanceCount() * sizeof(DrawIndexedIndirectArgs));
-				const BufferView mdiCountBuffView =
-					BufferView(args.m_legacy.m_mdiDrawCountsBuffer).incrementOffset(sizeof(U32) * bucketIdx).setRange(sizeof(U32));
-				cmdb.drawIndexedIndirectCount(state.m_primitiveTopology, indirectArgsBuffView, sizeof(DrawIndexedIndirectArgs), mdiCountBuffView,
-											  maxDrawCount);
-			}
-			else
-			{
-				// Legacy
-
-				const InstanceRange& instanceRange = args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx];
-				const U32 maxDrawCount = instanceRange.getInstanceCount();
-
-				const BufferView vertBufferView = BufferView(args.m_legacy.m_renderableInstancesBuffer)
-													  .incrementOffset(instanceRange.getFirstInstance() * sizeof(GpuSceneRenderableInstance))
-													  .setRange(instanceRange.getInstanceCount() * sizeof(GpuSceneRenderableInstance));
-				cmdb.bindVertexBuffer(0, vertBufferView, sizeof(GpuSceneRenderableInstance), VertexStepRate::kInstance);
-
-				// Yes, the DrawIndexedIndirectArgs is intentional
-				const BufferView indirectArgsBuffView = BufferView(args.m_legacy.m_drawIndexedIndirectArgsBuffer)
-															.incrementOffset(instanceRange.getFirstInstance() * sizeof(DrawIndexedIndirectArgs))
-															.setRange(instanceRange.getInstanceCount() * sizeof(DrawIndexedIndirectArgs));
-				const BufferView countBuffView =
-					BufferView(args.m_legacy.m_mdiDrawCountsBuffer).incrementOffset(sizeof(U32) * bucketIdx).setRange(sizeof(U32));
-				cmdb.drawIndirectCount(state.m_primitiveTopology, indirectArgsBuffView, sizeof(DrawIndexedIndirectArgs), countBuffView, maxDrawCount);
-			}
-		});
+	RenderStateBucketContainer::getSingleton().iterateBucketsPerformanceOrder(args.m_renderingTechinuqe, [&](const RenderStateInfo& state,
+																											 U32 bucketIdx, U32 userCount,
+																											 U32 meshletCount) {
+		if(userCount == 0)
+		{
+			return;
+		}
+
+		cmdb.bindShaderProgram(state.m_program.get());
+
+		const Bool bMeshlets = meshletCount > 0;
+
+		if(bMeshlets && meshShaderHwSupport)
+		{
+			const UVec4 consts(bucketIdx);
+			cmdb.setPushConstants(&consts, sizeof(consts));
+
+			cmdb.drawMeshTasksIndirect(BufferView(args.m_mesh.m_dispatchMeshIndirectArgsBuffer)
+										   .incrementOffset(sizeof(DispatchIndirectArgs) * bucketIdx)
+										   .setRange(sizeof(DispatchIndirectArgs)));
+		}
+		else if(bMeshlets)
+		{
+			cmdb.bindVertexBuffer(0, args.m_mesh.m_meshletInstancesBuffer, sizeof(GpuSceneMeshletInstance), VertexStepRate::kInstance);
+
+			const BufferView indirectArgsBuffView =
+				BufferView(args.m_mesh.m_indirectDrawArgs).incrementOffset(sizeof(DrawIndirectArgs) * bucketIdx).setRange(sizeof(DrawIndirectArgs));
+			cmdb.drawIndirect(PrimitiveTopology::kTriangles, indirectArgsBuffView);
+		}
+		else if(state.m_indexedDrawcall)
+		{
+			// Legacy indexed
+
+			const InstanceRange& instanceRange = args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx];
+			const U32 maxDrawCount = instanceRange.getInstanceCount();
+
+			cmdb.bindVertexBuffer(0, args.m_legacy.m_renderableInstancesBuffer, sizeof(GpuSceneRenderableInstance), VertexStepRate::kInstance);
+
+			const BufferView indirectArgsBuffView = BufferView(args.m_legacy.m_drawIndexedIndirectArgsBuffer)
+														.incrementOffset(instanceRange.getFirstInstance() * sizeof(DrawIndexedIndirectArgs))
+														.setRange(instanceRange.getInstanceCount() * sizeof(DrawIndexedIndirectArgs));
+			const BufferView mdiCountBuffView =
+				BufferView(args.m_legacy.m_mdiDrawCountsBuffer).incrementOffset(sizeof(U32) * bucketIdx).setRange(sizeof(U32));
+			cmdb.drawIndexedIndirectCount(state.m_primitiveTopology, indirectArgsBuffView, sizeof(DrawIndexedIndirectArgs), mdiCountBuffView,
+										  maxDrawCount);
+		}
+		else
+		{
+			// Legacy non-indexed
+
+			const InstanceRange& instanceRange = args.m_legacy.m_bucketRenderableInstanceRanges[bucketIdx];
+			const U32 maxDrawCount = instanceRange.getInstanceCount();
+
+			cmdb.bindVertexBuffer(0, args.m_legacy.m_renderableInstancesBuffer, sizeof(GpuSceneRenderableInstance), VertexStepRate::kInstance);
+
+			// Yes, the DrawIndexedIndirectArgs is intentional
+			const BufferView indirectArgsBuffView = BufferView(args.m_legacy.m_drawIndexedIndirectArgsBuffer)
+														.incrementOffset(instanceRange.getFirstInstance() * sizeof(DrawIndexedIndirectArgs))
+														.setRange(instanceRange.getInstanceCount() * sizeof(DrawIndexedIndirectArgs));
+			const BufferView countBuffView =
+				BufferView(args.m_legacy.m_mdiDrawCountsBuffer).incrementOffset(sizeof(U32) * bucketIdx).setRange(sizeof(U32));
+			cmdb.drawIndirectCount(state.m_primitiveTopology, indirectArgsBuffView, sizeof(DrawIndexedIndirectArgs), countBuffView, maxDrawCount);
+		}
+	});
 
 #if ANKI_STATS_ENABLED
 	if(pplineQuery.isCreated())

+ 9 - 23
AnKi/Renderer/Utils/Drawer.h

@@ -45,38 +45,24 @@ public:
 	class
 	{
 	public:
-		BufferView m_taskShaderIndirectArgsBuffer;
-		BufferView m_meshletGroupInstancesBuffer;
+		BufferView m_dispatchMeshIndirectArgsBuffer;
+		BufferView m_indirectDrawArgs;
 
-		ConstWeakArray<InstanceRange> m_bucketMeshletGroupInstanceRanges;
-	} m_mesh;
-
-	class
-	{
-	public:
 		BufferView m_meshletInstancesBuffer;
-		BufferView m_drawIndirectArgsBuffer;
 
-		ConstWeakArray<InstanceRange> m_bucketMeshletInstanceRanges;
-	} m_softwareMesh;
+		BufferView m_firstMeshletBuffer;
+	} m_mesh;
 
 	void fill(const GpuVisibilityOutput& visOut)
 	{
 		m_legacy.m_mdiDrawCountsBuffer = visOut.m_legacy.m_mdiDrawCountsBuffer;
 		m_legacy.m_renderableInstancesBuffer = visOut.m_legacy.m_renderableInstancesBuffer;
 		m_legacy.m_drawIndexedIndirectArgsBuffer = visOut.m_legacy.m_drawIndexedIndirectArgsBuffer;
-		m_legacy.m_bucketRenderableInstanceRanges = visOut.m_legacy.m_bucketRenderableInstanceRanges;
-		m_mesh.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-		m_mesh.m_meshletGroupInstancesBuffer = visOut.m_mesh.m_meshletGroupInstancesBuffer;
-		m_mesh.m_bucketMeshletGroupInstanceRanges = visOut.m_mesh.m_bucketMeshletGroupInstanceRanges;
-	}
-
-	void fill(const GpuMeshletVisibilityOutput& visOut)
-	{
-		ANKI_ASSERT(visOut.isFilled());
-		m_softwareMesh.m_meshletInstancesBuffer = visOut.m_meshletInstancesBuffer;
-		m_softwareMesh.m_bucketMeshletInstanceRanges = visOut.m_bucketMeshletInstanceRanges;
-		m_softwareMesh.m_drawIndirectArgsBuffer = visOut.m_drawIndirectArgsBuffer;
+		m_legacy.m_bucketRenderableInstanceRanges = visOut.m_legacy.m_bucketIndirectArgsRanges;
+		m_mesh.m_dispatchMeshIndirectArgsBuffer = visOut.m_mesh.m_dispatchMeshIndirectArgsBuffer;
+		m_mesh.m_indirectDrawArgs = visOut.m_mesh.m_drawIndirectArgs;
+		m_mesh.m_meshletInstancesBuffer = visOut.m_mesh.m_meshletInstancesBuffer;
+		m_mesh.m_firstMeshletBuffer = visOut.m_mesh.m_firstMeshletBuffer;
 	}
 };
 

+ 490 - 405
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -22,21 +22,64 @@ constexpr U32 kMaxVisibleObjects = 30 * 1024;
 
 constexpr U32 kMaxVisiblePrimitives = 40'000'000;
 constexpr U32 kMaxVisibleMeshlets = kMaxVisiblePrimitives / kMaxPrimitivesPerMeshlet;
-constexpr PtrSize kMaxMeshletMemory = kMaxVisibleMeshlets * sizeof(GpuSceneMeshletInstance);
 
-constexpr U32 kVisibleMaxMeshletGroups = max(kMaxVisibleObjects, (kMaxVisibleMeshlets + kMeshletGroupSize - 1) / kMeshletGroupSize);
-constexpr PtrSize kMaxMeshletGroupMemory = kVisibleMaxMeshletGroups * sizeof(GpuSceneMeshletGroupInstance);
+static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem",
+												  StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame);
 
-static NumericCVar<PtrSize> g_maxMeshletMemoryPerTest(CVarSubsystem::kRenderer, "MaxMeshletMemoryPerTest", kMaxMeshletMemory, 1_KB, 100_MB,
-													  "Max memory that will be allocated per GPU occlusion test for storing meshlets");
-static NumericCVar<PtrSize> g_maxMeshletGroupMemoryPerTest(CVarSubsystem::kRenderer, "MaxMeshletGroupMemoryPerTest", kMaxMeshletGroupMemory, 1_KB,
-														   100_MB,
-														   "Max memory that will be allocated per GPU occlusion test for storing meshlet groups");
+static StatCounter g_maxGpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU vis mem: max ever used/frame",
+													 StatFlag::kBytes | StatFlag::kMainThreadUpdates);
 
-static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU visibility mem",
-												  StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame);
+class GpuVisLimits
+{
+public:
+	U32 m_maxVisibleLegacyRenderables = 0;
+	U32 m_totalLegacyRenderables = 0;
+
+	U32 m_maxVisibleMeshlets = 0;
+};
+
+static GpuVisLimits computeLimits(RenderingTechnique t)
+{
+	GpuVisLimits out;
+
+	const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
+
+	const U32 meshletUserCount = buckets.getBucketsActiveUserCountWithMeshletSupport(t);
+	ANKI_ASSERT(meshletUserCount == 0 || (g_meshletRenderingCVar.get() || GrManager::getSingleton().getDeviceCapabilities().m_meshShaders));
+	out.m_totalLegacyRenderables = buckets.getBucketsActiveUserCountWithNoMeshletSupport(t);
+	out.m_maxVisibleLegacyRenderables = min(out.m_totalLegacyRenderables, kMaxVisibleObjects);
 
-static BufferView allocateTransientGpuMem(PtrSize size)
+	out.m_maxVisibleMeshlets = (meshletUserCount) ? min(kMaxVisibleMeshlets, buckets.getBucketsLod0MeshletCount(t)) : 0;
+
+	return out;
+}
+
+class GpuVisMemoryStats : public RendererObject, public MakeSingletonSimple<GpuVisMemoryStats>
+{
+public:
+	void informAboutAllocation(PtrSize size)
+	{
+		if(m_frameIdx != getRenderer().getFrameCount())
+		{
+			// First call in the frame, update the stat var
+
+			m_frameIdx = getRenderer().getFrameCount();
+
+			m_maxMemUsedInFrame = max(m_maxMemUsedInFrame, m_memUsedThisFrame);
+			m_memUsedThisFrame = 0;
+			g_maxGpuVisMemoryAllocatedStatVar.set(m_maxMemUsedInFrame);
+		}
+
+		m_memUsedThisFrame += size;
+	}
+
+private:
+	PtrSize m_memUsedThisFrame = 0;
+	PtrSize m_maxMemUsedInFrame = 0;
+	U64 m_frameIdx = kMaxU64;
+};
+
+BufferView allocateTransientGpuMem(PtrSize size)
 {
 	BufferView out = {};
 
@@ -44,6 +87,8 @@ static BufferView allocateTransientGpuMem(PtrSize size)
 	{
 		g_gpuVisMemoryAllocatedStatVar.increment(size);
 		out = GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
+
+		GpuVisMemoryStats::getSingleton().informAboutAllocation(size);
 	}
 
 	return out;
@@ -57,15 +102,15 @@ Error GpuVisibility::init()
 		{
 			for(MutatorValue genHash = 0; genHash < 2; ++genHash)
 			{
-				for(MutatorValue gatherType = 0; gatherType < 3; ++gatherType)
+				for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
 				{
-					ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
+					ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
 												 {{"HZB_TEST", hzb},
 												  {"DISTANCE_TEST", 0},
 												  {"GATHER_AABBS", gatherAabbs},
 												  {"HASH_VISIBLES", genHash},
-												  {"GATHER_TYPE", gatherType + 1}},
-												 m_prog, m_frustumGrProgs[hzb][gatherAabbs][genHash][gatherType]));
+												  {"GATHER_MESHLETS", gatherMeshlets}},
+												 m_1stStageProg, m_frustumGrProgs[hzb][gatherAabbs][genHash][gatherMeshlets]));
 				}
 			}
 		}
@@ -75,85 +120,38 @@ Error GpuVisibility::init()
 	{
 		for(MutatorValue genHash = 0; genHash < 2; ++genHash)
 		{
-			for(MutatorValue gatherType = 0; gatherType < 3; ++gatherType)
+			for(MutatorValue gatherMeshlets = 0; gatherMeshlets < 2; ++gatherMeshlets)
 			{
-				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
+				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage1.ankiprogbin",
 											 {{"HZB_TEST", 0},
 											  {"DISTANCE_TEST", 1},
 											  {"GATHER_AABBS", gatherAabbs},
 											  {"HASH_VISIBLES", genHash},
-											  {"GATHER_TYPE", gatherType + 1}},
-											 m_prog, m_distGrProgs[gatherAabbs][genHash][gatherType]));
+											  {"GATHER_MESHLETS", gatherMeshlets}},
+											 m_1stStageProg, m_distGrProgs[gatherAabbs][genHash][gatherMeshlets]));
 			}
 		}
 	}
 
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2.ankiprogbin", {{"HZB_TEST", 0}, {"PASSTHROUGH", 0}, {"MESH_SHADERS", 0}},
+								 m_2ndStageProg, m_gatherGrProg, "Legacy"));
+
 	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
 	{
 		for(MutatorValue passthrough = 0; passthrough < 2; ++passthrough)
 		{
-			ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityMeshlet.ankiprogbin", {{"HZB_TEST", hzb}, {"PASSTHROUGH", passthrough}},
-										 m_meshletCullingProg, m_meshletCullingGrProgs[hzb][passthrough]));
+			for(MutatorValue meshShaders = 0; meshShaders < 2; ++meshShaders)
+			{
+				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2.ankiprogbin",
+											 {{"HZB_TEST", hzb}, {"PASSTHROUGH", passthrough}, {"MESH_SHADERS", meshShaders}}, m_2ndStageProg,
+											 m_meshletGrProgs[hzb][passthrough][meshShaders], "Meshlets"));
+			}
 		}
 	}
 
 	return Error::kNone;
 }
 
-void GpuVisibility::computeGpuVisibilityMemoryRequirements(RenderingTechnique t, MemoryRequirements& total, WeakArray<MemoryRequirements> perBucket)
-{
-	ANKI_ASSERT(perBucket.getSize() == RenderStateBucketContainer::getSingleton().getBucketCount(t));
-
-	U32 totalMeshletCount = 0;
-	U32 totalMeshletGroupCount = 0;
-	U32 totalRenderableCount = 0;
-
-	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
-		if(meshletCount)
-		{
-			totalMeshletCount += meshletCount;
-			totalMeshletGroupCount += meshletGroupCount;
-		}
-		else
-		{
-			totalRenderableCount += userCount;
-		}
-	});
-
-	const U32 maxVisibleMeshlets = min(U32(g_maxMeshletMemoryPerTest.get() / sizeof(GpuSceneMeshletInstance)), totalMeshletCount);
-	const U32 maxVisibleMeshletGroups = min(U32(g_maxMeshletGroupMemoryPerTest.get() / sizeof(GpuSceneMeshletGroupInstance)), totalMeshletGroupCount);
-	const U32 maxVisibleRenderables = min(kMaxVisibleObjects, totalRenderableCount);
-
-	total = {};
-
-	U32 bucketCount = 0;
-	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
-		MemoryRequirements& bucket = perBucket[bucketCount++];
-
-		// Use U64 cause some expressions are overflowing
-
-		if(meshletCount)
-		{
-			ANKI_ASSERT(meshletGroupCount > 0);
-
-			ANKI_ASSERT(totalMeshletCount > 0);
-			bucket.m_meshletInstanceCount = max(1u, U32(U64(meshletCount) * maxVisibleMeshlets / totalMeshletCount));
-
-			ANKI_ASSERT(totalMeshletGroupCount > 0);
-			bucket.m_meshletGroupInstanceCount = max(1u, U32(U64(meshletGroupCount) * maxVisibleMeshletGroups / totalMeshletGroupCount));
-		}
-		else if(userCount > 0)
-		{
-			ANKI_ASSERT(totalRenderableCount > 0);
-			bucket.m_renderableInstanceCount = max(1u, U32(U64(userCount) * maxVisibleRenderables / totalRenderableCount));
-		}
-
-		total.m_meshletInstanceCount += bucket.m_meshletInstanceCount;
-		total.m_meshletGroupInstanceCount += bucket.m_meshletGroupInstanceCount;
-		total.m_renderableInstanceCount += bucket.m_renderableInstanceCount;
-	});
-}
-
 void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out)
 {
 	ANKI_ASSERT(in.m_lodReferencePoint.x() != kMaxF32);
@@ -198,455 +196,542 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		const FrustumGpuVisibilityInput& fin = static_cast<FrustumGpuVisibilityInput&>(in);
 		frustumTestData->m_viewProjMat = fin.m_viewProjectionMatrix;
 		frustumTestData->m_finalRenderTargetSize = fin.m_viewportSize;
+
+		if(fin.m_hzbRt)
+		{
+			frustumTestData->m_hzbRt = *fin.m_hzbRt;
+		}
 	}
 
-	// Allocate memory
-	const Bool firstCallInFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
+	const Bool firstCallInFrame = m_persistentMemory.m_frameIdx != getRenderer().getFrameCount();
 	if(firstCallInFrame)
 	{
-		// First call in frame. Init stuff
+		m_persistentMemory.m_frameIdx = getRenderer().getFrameCount();
+	}
 
-		m_runCtx.m_frameIdx = getRenderer().getFrameCount();
-		m_runCtx.m_populateRenderGraphCallCount = 0;
-		m_runCtx.m_populateRenderGraphMeshletRenderingCallCount = 0;
+	// OoM
+	if(firstCallInFrame)
+	{
+		U32 data;
+		PtrSize dataReadSize;
+		getRenderer().getReadbackManager().readMostRecentData(m_outOfMemoryReadback, &data, sizeof(data), dataReadSize);
 
-		// Calc memory requirements
-		MemoryRequirements maxTotalMemReq;
-		WeakArray<MemoryRequirements> bucketsMemReqs;
-		for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
+		if(dataReadSize == sizeof(U32) && data != 0)
 		{
-			const U32 tBucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(t);
-			if(tBucketCount == 0)
+			CString who;
+			switch(data)
 			{
-				continue;
+			case 0b1:
+				who = "Stage 1";
+				break;
+			case 0b10:
+				who = "Stage 2";
+				break;
+			case 0b11:
+				who = "Both stages";
+				break;
+			default:
+				ANKI_ASSERT(0);
 			}
 
-			newArray<MemoryRequirements>(getRenderer().getFrameMemoryPool(), tBucketCount, bucketsMemReqs);
+			ANKI_RESOURCE_LOGE("GPU visibility went out of memory: %s", who.cstr());
+		}
 
-			computeGpuVisibilityMemoryRequirements(t, m_runCtx.m_totalMemRequirements[t], bucketsMemReqs);
+		getRenderer().getReadbackManager().allocateData(m_outOfMemoryReadback, sizeof(U32), m_outOfMemoryReadbackBuffer);
+	}
 
-			maxTotalMemReq = maxTotalMemReq.max(m_runCtx.m_totalMemRequirements[t]);
+	// Get some limits
+	const RenderStateBucketContainer& buckets = RenderStateBucketContainer::getSingleton();
+	const U32 bucketCount = buckets.getBucketCount(in.m_technique);
+	const GpuVisLimits limits = computeLimits(in.m_technique);
 
-			newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), tBucketCount, m_runCtx.m_renderableInstanceRanges[t]);
-			newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), tBucketCount, m_runCtx.m_meshletGroupInstanceRanges[t]);
-			newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), tBucketCount, m_runCtx.m_meshletInstanceRanges[t]);
+	const Bool bHwMeshletRendering = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders && limits.m_maxVisibleMeshlets > 0;
+	const Bool bSwMeshletRendering = g_meshletRenderingCVar.get() && !bHwMeshletRendering && limits.m_maxVisibleMeshlets > 0;
+	const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
+	const Bool bLegacyRendering = limits.m_maxVisibleLegacyRenderables > 0;
 
-			U32 renderablesFirstInstance = 0, groupsFirstInstance = 0, meshletsFirstInstance = 0;
-			for(U32 i = 0; i < tBucketCount; ++i)
-			{
-				m_runCtx.m_renderableInstanceRanges[t][i].m_firstInstance = renderablesFirstInstance;
-				m_runCtx.m_renderableInstanceRanges[t][i].m_instanceCount = bucketsMemReqs[i].m_renderableInstanceCount;
+	// Allocate persistent memory for the frame
+	if(firstCallInFrame)
+	{
+		GpuVisLimits maxLimits;
+		for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
+		{
+			const GpuVisLimits limits = computeLimits(t);
+			maxLimits.m_maxVisibleLegacyRenderables = max(maxLimits.m_maxVisibleLegacyRenderables, limits.m_maxVisibleLegacyRenderables);
+			maxLimits.m_maxVisibleMeshlets = max(maxLimits.m_maxVisibleMeshlets, limits.m_maxVisibleMeshlets);
+		}
+
+		m_persistentMemory.m_stage1.m_visibleRenderables =
+			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleLegacyRenderables);
+		m_persistentMemory.m_stage1.m_visibleMeshlets =
+			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * maxLimits.m_maxVisibleMeshlets);
+
+		m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables =
+			allocateTransientGpuMem(sizeof(UVec4) * maxLimits.m_maxVisibleLegacyRenderables);
+		m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs =
+			allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * maxLimits.m_maxVisibleLegacyRenderables);
+
+		m_persistentMemory.m_stage2Meshlet.m_meshletInstances =
+			allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+
+		m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
+																		   : m_persistentMemory.m_stage1.m_visibleRenderables,
+													   BufferUsageBit::kNone);
+	}
 
-				m_runCtx.m_meshletGroupInstanceRanges[t][i].m_firstInstance = groupsFirstInstance;
-				m_runCtx.m_meshletGroupInstanceRanges[t][i].m_instanceCount = bucketsMemReqs[i].m_meshletGroupInstanceCount;
+	// Compute the MDI sub-ranges
+	if(limits.m_maxVisibleLegacyRenderables)
+	{
+		newArray<InstanceRange>(getRenderer().getFrameMemoryPool(), bucketCount, out.m_legacy.m_bucketIndirectArgsRanges);
+
+		U32 ibucket = 0;
+		U32 offset = 0;
+		buckets.iterateBuckets(in.m_technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletCount) {
+			out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance = offset;
 
-				m_runCtx.m_meshletInstanceRanges[t][i].m_firstInstance = meshletsFirstInstance;
-				m_runCtx.m_meshletInstanceRanges[t][i].m_instanceCount = bucketsMemReqs[i].m_meshletInstanceCount;
+			if(meshletCount == 0 && userCount > 0)
+			{
+				out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount =
+					max(1u, U32(U64(userCount) * limits.m_maxVisibleLegacyRenderables / limits.m_totalLegacyRenderables));
 
-				renderablesFirstInstance += bucketsMemReqs[i].m_renderableInstanceCount;
-				groupsFirstInstance += bucketsMemReqs[i].m_meshletGroupInstanceCount;
-				meshletsFirstInstance += bucketsMemReqs[i].m_meshletInstanceCount;
+				offset += out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
 			}
-		}
 
-		// Allocate persistent memory
-		for(PersistentMemory& mem : m_runCtx.m_persistentMem)
-		{
-			mem = {};
+			++ibucket;
+		});
 
-			mem.m_drawIndexedIndirectArgsBuffer = allocateTransientGpuMem(maxTotalMemReq.m_renderableInstanceCount * sizeof(DrawIndexedIndirectArgs));
-			mem.m_renderableInstancesBuffer = allocateTransientGpuMem(maxTotalMemReq.m_renderableInstanceCount * sizeof(GpuSceneRenderableInstance));
+		// The last element should point to the limit of the buffer
+		InstanceRange& last = out.m_legacy.m_bucketIndirectArgsRanges.getBack();
+		ANKI_ASSERT(limits.m_maxVisibleLegacyRenderables >= last.m_firstInstance);
+		last.m_instanceCount = limits.m_maxVisibleLegacyRenderables - last.m_firstInstance;
+	}
 
-			mem.m_meshletGroupsInstancesBuffer =
-				allocateTransientGpuMem(maxTotalMemReq.m_meshletGroupInstanceCount * sizeof(GpuSceneMeshletGroupInstance));
+	// Allocate memory for stage 1
+	class Stage1Mem
+	{
+	public:
+		BufferView m_counters;
+		BufferView m_visibleRenderables;
+		BufferView m_visibleMeshlets;
 
-			mem.m_bufferDepedency = rgraph.importBuffer((mem.m_drawIndexedIndirectArgsBuffer.isValid()) ? mem.m_drawIndexedIndirectArgsBuffer
-																										: mem.m_meshletGroupsInstancesBuffer,
-														BufferUsageBit::kNone);
-		}
+		BufferView m_renderablePrefixSums;
+		BufferView m_meshletPrefixSums;
+		BufferView m_stage2IndirectArgs;
 
-		if(getRenderer().runSoftwareMeshletRendering())
+		BufferView m_visibleAabbIndices;
+		BufferView m_hash;
+	} stage1Mem;
+
+	stage1Mem.m_counters = allocateTransientGpuMem(sizeof(U32) * 3);
+	if(in.m_limitMemory)
+	{
+		PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
+		if(newRange)
 		{
-			// Because someone will need it later
+			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleRenderables.getRange());
+			stage1Mem.m_visibleRenderables = BufferView(m_persistentMemory.m_stage1.m_visibleRenderables).setRange(newRange);
+		}
 
-			for(PersistentMemoryMeshletRendering& mem : m_runCtx.m_persistentMeshletRenderingMem)
-			{
-				mem = {};
+		newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
+		if(newRange)
+		{
+			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage1.m_visibleMeshlets.getRange());
+			stage1Mem.m_visibleMeshlets = BufferView(m_persistentMemory.m_stage1.m_visibleMeshlets).setRange(newRange);
+		}
+	}
+	else
+	{
+		stage1Mem.m_visibleRenderables = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables);
+		stage1Mem.m_visibleMeshlets = allocateTransientGpuMem(sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets);
+	}
+	stage1Mem.m_renderablePrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+	stage1Mem.m_meshletPrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+	stage1Mem.m_stage2IndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * 2);
 
-				mem.m_meshletInstancesBuffer = allocateTransientGpuMem(maxTotalMemReq.m_meshletInstanceCount * sizeof(GpuSceneMeshletInstance));
+	if(in.m_gatherAabbIndices)
+	{
+		stage1Mem.m_visibleAabbIndices = allocateTransientGpuMem(sizeof(U32) * buckets.getBucketsActiveUserCount(in.m_technique));
+	}
 
-				mem.m_bufferDepedency = rgraph.importBuffer(mem.m_meshletInstancesBuffer, BufferUsageBit::kNone);
-			}
-		}
+	if(in.m_hashVisibles)
+	{
+		stage1Mem.m_hash = allocateTransientGpuMem(sizeof(GpuVisibilityHash));
 	}
 
-	const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique);
-	const MemoryRequirements& req = m_runCtx.m_totalMemRequirements[in.m_technique];
-	const PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphCallCount++ % m_runCtx.m_persistentMem.getSize()];
+	// Allocate memory for stage 2
+	class Stage2Mem
+	{
+	public:
+		class
+		{
+		public:
+			BufferView m_instanceRateRenderables;
+			BufferView m_drawIndexedIndirectArgs;
 
-	out.m_legacy.m_drawIndexedIndirectArgsBuffer =
-		(req.m_renderableInstanceCount)
-			? BufferView(mem.m_drawIndexedIndirectArgsBuffer).setRange(req.m_renderableInstanceCount * sizeof(DrawIndexedIndirectArgs))
-			: BufferView();
+			BufferView m_mdiDrawCounts;
+		} m_legacy;
 
-	out.m_legacy.m_renderableInstancesBuffer =
-		(req.m_renderableInstanceCount)
-			? BufferView(mem.m_renderableInstancesBuffer).setRange(req.m_renderableInstanceCount * sizeof(GpuSceneRenderableInstance))
-			: BufferView();
+		class
+		{
+		public:
+			BufferView m_indirectDrawArgs;
+			BufferView m_dispatchMeshIndirectArgs;
 
-	out.m_legacy.m_mdiDrawCountsBuffer = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+			BufferView m_meshletInstances;
+		} m_meshlet;
+	} stage2Mem;
 
-	out.m_mesh.m_meshletGroupInstancesBuffer =
-		(req.m_meshletGroupInstanceCount)
-			? BufferView(mem.m_meshletGroupsInstancesBuffer).setRange(req.m_meshletGroupInstanceCount * sizeof(GpuSceneMeshletGroupInstance))
-			: BufferView();
+	if(bLegacyRendering)
+	{
+		if(in.m_limitMemory)
+		{
+			PtrSize newRange = sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables;
+			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables.getRange());
+			stage2Mem.m_legacy.m_instanceRateRenderables = BufferView(m_persistentMemory.m_stage2Legacy.m_instanceRateRenderables).setRange(newRange);
 
-	out.m_mesh.m_taskShaderIndirectArgsBuffer = allocateTransientGpuMem(bucketCount * sizeof(DispatchIndirectArgs));
+			newRange = sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables;
+			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs.getRange());
+			stage2Mem.m_legacy.m_drawIndexedIndirectArgs = BufferView(m_persistentMemory.m_stage2Legacy.m_drawIndexedIndirectArgs).setRange(newRange);
+		}
+		else
+		{
+			stage2Mem.m_legacy.m_instanceRateRenderables = allocateTransientGpuMem(sizeof(UVec4) * limits.m_maxVisibleLegacyRenderables);
+			stage2Mem.m_legacy.m_drawIndexedIndirectArgs =
+				allocateTransientGpuMem(sizeof(DrawIndexedIndirectArgs) * limits.m_maxVisibleLegacyRenderables);
+		}
 
-	if(in.m_hashVisibles)
+		stage2Mem.m_legacy.m_mdiDrawCounts = allocateTransientGpuMem(sizeof(U32) * bucketCount);
+	}
+
+	if(bMeshletRendering)
 	{
-		out.m_visiblesHashBuffer = allocateTransientGpuMem(sizeof(GpuVisibilityHash));
+		if(bHwMeshletRendering)
+		{
+			stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+		}
+		else
+		{
+			stage2Mem.m_meshlet.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+		}
+
+		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		if(in.m_limitMemory)
+		{
+			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletInstances.getRange());
+			stage2Mem.m_meshlet.m_meshletInstances = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletInstances).setRange(newRange);
+		}
+		else
+		{
+			stage2Mem.m_meshlet.m_meshletInstances = allocateTransientGpuMem(newRange);
+		}
 	}
 
-	if(in.m_gatherAabbIndices)
+	// Setup output
+	out.m_legacy.m_renderableInstancesBuffer = stage2Mem.m_legacy.m_instanceRateRenderables;
+	out.m_legacy.m_mdiDrawCountsBuffer = stage2Mem.m_legacy.m_mdiDrawCounts;
+	out.m_legacy.m_drawIndexedIndirectArgsBuffer = stage2Mem.m_legacy.m_drawIndexedIndirectArgs;
+	out.m_mesh.m_dispatchMeshIndirectArgsBuffer = stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs;
+	out.m_mesh.m_drawIndirectArgs = stage2Mem.m_meshlet.m_indirectDrawArgs;
+	out.m_mesh.m_meshletInstancesBuffer = stage2Mem.m_meshlet.m_meshletInstances;
+	out.m_visibleAaabbIndicesBuffer = stage1Mem.m_visibleAabbIndices;
+	out.m_visiblesHashBuffer = stage1Mem.m_hash;
+	if(bHwMeshletRendering)
 	{
-		out.m_visibleAaabbIndicesBuffer =
-			allocateTransientGpuMem((RenderStateBucketContainer::getSingleton().getBucketsActiveUserCount(in.m_technique) + 1) * sizeof(U32));
+		out.m_mesh.m_firstMeshletBuffer = stage1Mem.m_meshletPrefixSums;
 	}
 
-	// Set instance sub-ranges
-	out.m_legacy.m_bucketRenderableInstanceRanges = m_runCtx.m_renderableInstanceRanges[in.m_technique];
-	out.m_mesh.m_bucketMeshletGroupInstanceRanges = m_runCtx.m_meshletGroupInstanceRanges[in.m_technique];
+	// Use one buffer as a dependency. Doesn't matter which
+	out.m_dependency = (in.m_limitMemory) ? m_persistentMemory.m_dep : rgraph.importBuffer(stage1Mem.m_stage2IndirectArgs, BufferUsageBit::kNone);
 
 	// Zero some stuff
-	const BufferHandle zeroStuffDependency = rgraph.importBuffer(out.m_legacy.m_mdiDrawCountsBuffer, BufferUsageBit::kNone);
+	const BufferHandle zeroMemDep = rgraph.importBuffer(stage1Mem.m_counters, BufferUsageBit::kNone);
 	{
-		Array<Char, 128> passName;
-		snprintf(passName.getBegin(), passName.getSizeInBytes(), "GPU vis zero: %s", in.m_passesName.cstr());
-
-		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(passName.getBegin());
-		pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kTransferDestination);
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis zero: %s", in.m_passesName.cstr()));
+		pass.newBufferDependency(zeroMemDep, BufferUsageBit::kTransferDestination);
 
-		pass.setWork([out](RenderPassWorkContext& rpass) {
+		pass.setWork([stage1Mem, stage2Mem, this](RenderPassWorkContext& rpass) {
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
-			cmdb.pushDebugMarker("MDI counts", Vec3(1.0f, 1.0f, 1.0f));
-			cmdb.fillBuffer(out.m_legacy.m_mdiDrawCountsBuffer, 0);
+			cmdb.pushDebugMarker("Temp counters", Vec3(1.0f, 1.0f, 1.0f));
+			cmdb.fillBuffer(stage1Mem.m_counters, 0);
 			cmdb.popDebugMarker();
 
-			if(out.m_mesh.m_taskShaderIndirectArgsBuffer.isValid())
+			if(stage1Mem.m_renderablePrefixSums.isValid())
 			{
-				cmdb.pushDebugMarker("Task shader indirect args", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(out.m_mesh.m_taskShaderIndirectArgsBuffer, 0);
+				cmdb.pushDebugMarker("Renderable prefix sums", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(stage1Mem.m_renderablePrefixSums, 0);
 				cmdb.popDebugMarker();
 			}
 
-			if(out.m_visiblesHashBuffer.isValid())
+			if(stage1Mem.m_meshletPrefixSums.isValid())
 			{
-				cmdb.pushDebugMarker("Visibles hash", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(out.m_visiblesHashBuffer, 0);
+				cmdb.pushDebugMarker("Meshlet prefix sums", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(stage1Mem.m_meshletPrefixSums, 0);
 				cmdb.popDebugMarker();
 			}
 
-			if(out.m_visibleAaabbIndicesBuffer.isValid())
+			if(stage2Mem.m_legacy.m_drawIndexedIndirectArgs.isValid())
 			{
-				cmdb.pushDebugMarker("Visible AABB indices", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(BufferView(out.m_visibleAaabbIndicesBuffer).setRange(sizeof(U32)), 0);
+				cmdb.pushDebugMarker("Draw indexed indirect args", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(stage2Mem.m_legacy.m_drawIndexedIndirectArgs, 0);
 				cmdb.popDebugMarker();
 			}
-		});
-	}
 
-	// Set the out dependency. Use one of the big buffers.
-	out.m_dependency = mem.m_bufferDepedency;
+			if(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs.isValid())
+			{
+				cmdb.pushDebugMarker("Dispatch indirect args", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs, 0);
+				cmdb.popDebugMarker();
+			}
 
-	// Create the renderpass
-	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis: %s", in.m_passesName.cstr()));
+			if(stage2Mem.m_meshlet.m_indirectDrawArgs.isValid())
+			{
+				cmdb.pushDebugMarker("Draw indirect args (S/W meshlet rendering)", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(stage2Mem.m_meshlet.m_indirectDrawArgs, 0);
+				cmdb.popDebugMarker();
+			}
 
-	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
-	pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kStorageComputeWrite);
-	pass.newBufferDependency(out.m_dependency, BufferUsageBit::kStorageComputeWrite);
+			if(stage2Mem.m_legacy.m_mdiDrawCounts.isValid())
+			{
+				cmdb.pushDebugMarker("MDI counts", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(stage2Mem.m_legacy.m_mdiDrawCounts, 0);
+				cmdb.popDebugMarker();
+			}
 
-	if(!distanceBased && static_cast<FrustumGpuVisibilityInput&>(in).m_hzbRt)
-	{
-		frustumTestData->m_hzbRt = *static_cast<FrustumGpuVisibilityInput&>(in).m_hzbRt;
-		pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSampledCompute);
+			cmdb.pushDebugMarker("OoM readback", Vec3(1.0f, 1.0f, 1.0f));
+			cmdb.fillBuffer(m_outOfMemoryReadbackBuffer, 0);
+			cmdb.popDebugMarker();
+		});
 	}
 
-	pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
-				  technique = in.m_technique, out](RenderPassWorkContext& rpass) {
-		CommandBuffer& cmdb = *rpass.m_commandBuffer;
-
-		const Bool gatherAabbIndices = out.m_visibleAaabbIndicesBuffer.isValid();
-		const Bool genHash = out.m_visiblesHashBuffer.isValid();
-
-		U32 gatherType = 0;
-		if(out.m_mesh.m_meshletGroupInstancesBuffer.isValid())
-		{
-			gatherType |= 2u;
-		}
-
-		if(out.m_legacy.m_renderableInstancesBuffer.isValid())
-		{
-			gatherType |= 1u;
-		}
-		ANKI_ASSERT(gatherType != 0);
+	// 1st stage
+	{
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 1st pass: %s", in.m_passesName.cstr()));
 
-		if(frustumTestData)
-		{
-			cmdb.bindShaderProgram(m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash][gatherType - 1u].get());
-		}
-		else
-		{
-			cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash][gatherType - 1u].get());
-		}
+		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
+		pass.newBufferDependency(out.m_dependency, BufferUsageBit::kStorageComputeWrite);
+		pass.newBufferDependency(zeroMemDep, BufferUsageBit::kStorageComputeWrite);
 
-		BufferView aabbsBuffer;
-		U32 aabbCount = 0;
-		switch(technique)
+		if(frustumTestData && frustumTestData->m_hzbRt.isValid())
 		{
-		case RenderingTechnique::kGBuffer:
-			aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getBufferView();
-			aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
-			break;
-		case RenderingTechnique::kDepth:
-			aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getBufferView();
-			aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
-			break;
-		case RenderingTechnique::kForward:
-			aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getBufferView();
-			aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
-			break;
-		default:
-			ANKI_ASSERT(0);
+			pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSampledCompute);
 		}
 
-		cmdb.bindStorageBuffer(ANKI_REG(t0), aabbsBuffer);
-		cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::Renderable::getSingleton().getBufferView());
-		cmdb.bindStorageBuffer(ANKI_REG(t2), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
-		cmdb.bindStorageBuffer(ANKI_REG(t3), GpuSceneArrays::Transform::getSingleton().getBufferView());
-		cmdb.bindStorageBuffer(ANKI_REG(t4), GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
-		if(gatherType & 1u)
-		{
-			cmdb.bindStorageBuffer(ANKI_REG(u0), out.m_legacy.m_renderableInstancesBuffer);
-			cmdb.bindStorageBuffer(ANKI_REG(u1), out.m_legacy.m_drawIndexedIndirectArgsBuffer);
-			cmdb.bindStorageBuffer(ANKI_REG(u2), out.m_legacy.m_drawIndexedIndirectArgsBuffer);
-			cmdb.bindStorageBuffer(ANKI_REG(u3), out.m_legacy.m_mdiDrawCountsBuffer);
-		}
-		if(gatherType & 2u)
-		{
-			cmdb.bindStorageBuffer(ANKI_REG(u4), out.m_mesh.m_taskShaderIndirectArgsBuffer);
-			cmdb.bindStorageBuffer(ANKI_REG(u5), out.m_mesh.m_meshletGroupInstancesBuffer);
-		}
+		pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
+					  technique = in.m_technique, stage1Mem, bLegacyRendering, bMeshletRendering](RenderPassWorkContext& rpass) {
+			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
-		const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(technique);
-		UVec2* instanceRanges = allocateAndBindStorageBuffer<UVec2>(cmdb, ANKI_REG(t5), bucketCount);
-		for(U32 i = 0; i < bucketCount; ++i)
-		{
-			const Bool legacyBucket = m_runCtx.m_renderableInstanceRanges[technique][i].m_instanceCount > 0;
+			const Bool gatherAabbIndices = stage1Mem.m_visibleAabbIndices.isValid();
+			const Bool genHash = stage1Mem.m_hash.isValid();
+			const Bool gatherMeshlets = stage1Mem.m_visibleMeshlets.isValid();
 
-			if(legacyBucket)
+			if(frustumTestData)
 			{
-				instanceRanges[i].x() = m_runCtx.m_renderableInstanceRanges[technique][i].m_firstInstance;
-				instanceRanges[i].y() = m_runCtx.m_renderableInstanceRanges[technique][i].m_instanceCount;
+				cmdb.bindShaderProgram(m_frustumGrProgs[frustumTestData->m_hzbRt.isValid()][gatherAabbIndices][genHash][gatherMeshlets].get());
 			}
 			else
 			{
-				instanceRanges[i].x() = m_runCtx.m_meshletGroupInstanceRanges[technique][i].m_firstInstance;
-				instanceRanges[i].y() = m_runCtx.m_meshletGroupInstanceRanges[technique][i].m_instanceCount;
+				cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices][genHash][gatherMeshlets].get());
 			}
-		}
-
-		if(frustumTestData)
-		{
-			FrustumGpuVisibilityUniforms* unis = allocateAndBindConstants<FrustumGpuVisibilityUniforms>(cmdb, ANKI_REG(b0));
 
-			Array<Plane, 6> planes;
-			extractClipPlanes(frustumTestData->m_viewProjMat, planes);
-			for(U32 i = 0; i < 6; ++i)
+			BufferView aabbsBuffer;
+			U32 aabbCount = 0;
+			switch(technique)
 			{
-				unis->m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
+			case RenderingTechnique::kGBuffer:
+				aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getBufferView();
+				aabbCount = GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount();
+				break;
+			case RenderingTechnique::kDepth:
+				aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getBufferView();
+				aabbCount = GpuSceneArrays::RenderableBoundingVolumeDepth::getSingleton().getElementCount();
+				break;
+			case RenderingTechnique::kForward:
+				aabbsBuffer = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getBufferView();
+				aabbCount = GpuSceneArrays::RenderableBoundingVolumeForward::getSingleton().getElementCount();
+				break;
+			default:
+				ANKI_ASSERT(0);
 			}
 
-			ANKI_ASSERT(kMaxLodCount == 3);
-			unis->m_maxLodDistances[0] = lodDistances[0];
-			unis->m_maxLodDistances[1] = lodDistances[1];
-			unis->m_maxLodDistances[2] = kMaxF32;
-			unis->m_maxLodDistances[3] = kMaxF32;
+			cmdb.bindStorageBuffer(ANKI_REG(t0), aabbsBuffer);
+			cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::Renderable::getSingleton().getBufferView());
+			cmdb.bindStorageBuffer(ANKI_REG(t2), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
+			cmdb.bindStorageBuffer(ANKI_REG(t3), GpuSceneArrays::Transform::getSingleton().getBufferView());
+			cmdb.bindStorageBuffer(ANKI_REG(t4), GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
+
+			cmdb.bindStorageBuffer(ANKI_REG(u0), stage1Mem.m_counters);
 
-			unis->m_lodReferencePoint = lodReferencePoint;
-			unis->m_viewProjectionMat = frustumTestData->m_viewProjMat;
-			unis->m_finalRenderTargetSize = Vec2(frustumTestData->m_finalRenderTargetSize);
+			cmdb.bindStorageBuffer(ANKI_REG(u1), (bLegacyRendering) ? stage1Mem.m_visibleRenderables : BufferView(&getRenderer().getDummyBuffer()));
+			cmdb.bindStorageBuffer(ANKI_REG(u2), (bMeshletRendering) ? stage1Mem.m_visibleMeshlets : BufferView(&getRenderer().getDummyBuffer()));
 
-			if(frustumTestData->m_hzbRt.isValid())
+			cmdb.bindStorageBuffer(ANKI_REG(u3), (bLegacyRendering) ? stage1Mem.m_renderablePrefixSums : BufferView(&getRenderer().getDummyBuffer()));
+			cmdb.bindStorageBuffer(ANKI_REG(u4), (bMeshletRendering) ? stage1Mem.m_meshletPrefixSums : BufferView(&getRenderer().getDummyBuffer()));
+
+			cmdb.bindStorageBuffer(ANKI_REG(u5), stage1Mem.m_stage2IndirectArgs);
+
+			cmdb.bindStorageBuffer(ANKI_REG(u6), m_outOfMemoryReadbackBuffer);
+
+			if(gatherAabbIndices)
 			{
-				rpass.bindTexture(ANKI_REG(t6), frustumTestData->m_hzbRt);
-				cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
+				cmdb.bindStorageBuffer(ANKI_REG(u7), stage1Mem.m_visibleAabbIndices);
 			}
-		}
-		else
-		{
-			DistanceGpuVisibilityUniforms unis;
-			unis.m_pointOfTest = distTestData->m_pointOfTest;
-			unis.m_testRadius = distTestData->m_testRadius;
 
-			unis.m_maxLodDistances[0] = lodDistances[0];
-			unis.m_maxLodDistances[1] = lodDistances[1];
-			unis.m_maxLodDistances[2] = kMaxF32;
-			unis.m_maxLodDistances[3] = kMaxF32;
+			if(genHash)
+			{
+				cmdb.bindStorageBuffer(ANKI_REG(u8), stage1Mem.m_hash);
+			}
 
-			unis.m_lodReferencePoint = lodReferencePoint;
+			if(frustumTestData)
+			{
+				FrustumGpuVisibilityUniforms* unis = allocateAndBindConstants<FrustumGpuVisibilityUniforms>(cmdb, ANKI_REG(b0));
 
-			cmdb.setPushConstants(&unis, sizeof(unis));
-		}
+				Array<Plane, 6> planes;
+				extractClipPlanes(frustumTestData->m_viewProjMat, planes);
+				for(U32 i = 0; i < 6; ++i)
+				{
+					unis->m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
+				}
 
-		if(gatherAabbIndices)
-		{
-			cmdb.bindStorageBuffer(ANKI_REG(u6), out.m_visibleAaabbIndicesBuffer);
-		}
+				ANKI_ASSERT(kMaxLodCount == 3);
+				unis->m_maxLodDistances[0] = lodDistances[0];
+				unis->m_maxLodDistances[1] = lodDistances[1];
+				unis->m_maxLodDistances[2] = kMaxF32;
+				unis->m_maxLodDistances[3] = kMaxF32;
 
-		if(genHash)
-		{
-			cmdb.bindStorageBuffer(ANKI_REG(u7), out.m_visiblesHashBuffer);
-		}
+				unis->m_lodReferencePoint = lodReferencePoint;
+				unis->m_viewProjectionMat = frustumTestData->m_viewProjMat;
+				unis->m_finalRenderTargetSize = Vec2(frustumTestData->m_finalRenderTargetSize);
 
-		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
-	});
-}
+				if(frustumTestData->m_hzbRt.isValid())
+				{
+					rpass.bindTexture(ANKI_REG(t5), frustumTestData->m_hzbRt);
+					cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
+				}
+			}
+			else
+			{
+				DistanceGpuVisibilityUniforms unis;
+				unis.m_pointOfTest = distTestData->m_pointOfTest;
+				unis.m_testRadius = distTestData->m_testRadius;
 
-void GpuVisibility::populateRenderGraphMeshletInternal(Bool passthrough, BaseGpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out)
-{
-	RenderGraphBuilder& rgraph = *in.m_rgraph;
+				unis.m_maxLodDistances[0] = lodDistances[0];
+				unis.m_maxLodDistances[1] = lodDistances[1];
+				unis.m_maxLodDistances[2] = kMaxF32;
+				unis.m_maxLodDistances[3] = kMaxF32;
 
-	if(!in.m_taskShaderIndirectArgsBuffer.isValid()) [[unlikely]]
-	{
-		// Early exit
-		return;
-	}
+				unis.m_lodReferencePoint = lodReferencePoint;
 
-	class NonPassthrough
+				cmdb.setPushConstants(&unis, sizeof(unis));
+			}
+
+			dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
+		});
+	} // end 1st stage
+
+	// 2nd stage
 	{
-	public:
-		Mat4 m_viewProjectionMatrix;
-		Mat3x4 m_cameraTransform;
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 2nd pass: %s", in.m_passesName.cstr()));
 
-		UVec2 m_viewportSize;
+		pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kStorageComputeWrite);
 
-		RenderTargetHandle m_hzbRt;
-	}* nonPassthroughData = nullptr;
+		if(frustumTestData && frustumTestData->m_hzbRt.isValid())
+		{
+			pass.newTextureDependency(frustumTestData->m_hzbRt, TextureUsageBit::kSampledCompute);
+		}
 
-	if(!passthrough)
-	{
-		GpuMeshletVisibilityInput& nonPassthroughIn = static_cast<GpuMeshletVisibilityInput&>(in);
+		pass.setWork([this, stage1Mem, stage2Mem, bLegacyRendering, bMeshletRendering, bHwMeshletRendering, out, frustumTestData,
+					  lodReferencePoint = in.m_lodReferencePoint](RenderPassWorkContext& rpass) {
+			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
-		nonPassthroughData = newInstance<NonPassthrough>(getRenderer().getFrameMemoryPool());
-		nonPassthroughData->m_viewProjectionMatrix = nonPassthroughIn.m_viewProjectionMatrix;
-		nonPassthroughData->m_cameraTransform = nonPassthroughIn.m_cameraTransform;
-		nonPassthroughData->m_viewportSize = nonPassthroughIn.m_viewportSize;
-		nonPassthroughData->m_hzbRt = nonPassthroughIn.m_hzbRt;
-	}
+			if(bLegacyRendering)
+			{
+				cmdb.bindShaderProgram(m_gatherGrProg.get());
 
-	// Allocate memory
-	const U32 bucketCount = m_runCtx.m_renderableInstanceRanges[in.m_technique].getSize();
-	ANKI_ASSERT(RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique) == bucketCount);
+				cmdb.bindStorageBuffer(ANKI_REG(t0), GpuSceneArrays::Renderable::getSingleton().getBufferView());
+				cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
+				cmdb.bindStorageBuffer(ANKI_REG(t2), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
 
-	const PersistentMemoryMeshletRendering& mem = m_runCtx.m_persistentMeshletRenderingMem[m_runCtx.m_populateRenderGraphMeshletRenderingCallCount++
-																						   % m_runCtx.m_persistentMeshletRenderingMem.getSize()];
+				cmdb.bindStorageBuffer(ANKI_REG(t3), stage1Mem.m_visibleRenderables);
+				cmdb.bindStorageBuffer(ANKI_REG(t4), stage1Mem.m_counters);
+				cmdb.bindStorageBuffer(ANKI_REG(t5), stage1Mem.m_renderablePrefixSums);
 
-	out.m_drawIndirectArgsBuffer = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+				UVec2* firstDrawIndirectArgAndCount =
+					allocateAndBindStorageBuffer<UVec2>(cmdb, ANKI_REG(t6), out.m_legacy.m_bucketIndirectArgsRanges.getSize());
+				for(U32 ibucket = 0; ibucket < out.m_legacy.m_bucketIndirectArgsRanges.getSize(); ++ibucket)
+				{
+					firstDrawIndirectArgAndCount->x() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_firstInstance;
+					firstDrawIndirectArgAndCount->y() = out.m_legacy.m_bucketIndirectArgsRanges[ibucket].m_instanceCount;
+					++firstDrawIndirectArgAndCount;
+				}
 
-	out.m_meshletInstancesBuffer =
-		BufferView(mem.m_meshletInstancesBuffer)
-			.setRange(m_runCtx.m_totalMemRequirements[in.m_technique].m_meshletInstanceCount * sizeof(GpuSceneMeshletInstance));
+				cmdb.bindStorageBuffer(ANKI_REG(u0), stage2Mem.m_legacy.m_instanceRateRenderables);
+				cmdb.bindStorageBuffer(ANKI_REG(u1), stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
+				cmdb.bindStorageBuffer(ANKI_REG(u2), stage2Mem.m_legacy.m_drawIndexedIndirectArgs);
 
-	out.m_bucketMeshletInstanceRanges = m_runCtx.m_meshletInstanceRanges[in.m_technique];
+				cmdb.bindStorageBuffer(ANKI_REG(u3), stage2Mem.m_legacy.m_mdiDrawCounts);
 
-	// Zero some stuff
-	const BufferHandle indirectArgsDep = rgraph.importBuffer(out.m_drawIndirectArgsBuffer, BufferUsageBit::kNone);
-	{
-		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU meshlet vis zero: %s", in.m_passesName.cstr()));
-		pass.newBufferDependency(indirectArgsDep, BufferUsageBit::kTransferDestination);
+				cmdb.bindStorageBuffer(ANKI_REG(u4), m_outOfMemoryReadbackBuffer);
 
-		pass.setWork([drawIndirectArgsBuffer = out.m_drawIndirectArgsBuffer](RenderPassWorkContext& rpass) {
-			CommandBuffer& cmdb = *rpass.m_commandBuffer;
+				cmdb.dispatchComputeIndirect(BufferView(stage1Mem.m_stage2IndirectArgs).setRange(sizeof(DispatchIndirectArgs)));
+			}
 
-			cmdb.pushDebugMarker("Draw indirect args", Vec3(1.0f, 1.0f, 1.0f));
-			cmdb.fillBuffer(drawIndirectArgsBuffer, 0);
-			cmdb.popDebugMarker();
-		});
-	}
+			if(bMeshletRendering)
+			{
+				const Bool hzbTex = frustumTestData && frustumTestData->m_hzbRt.isValid();
+				const Bool passthrough = frustumTestData == nullptr;
+				const Bool meshShaders = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
 
-	out.m_dependency = mem.m_bufferDepedency;
+				cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][meshShaders].get());
 
-	// Create the renderpass
-	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU meshlet vis: %s", in.m_passesName.cstr()));
+				cmdb.bindStorageBuffer(ANKI_REG(t0), GpuSceneArrays::Renderable::getSingleton().getBufferView());
+				cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
+				cmdb.bindStorageBuffer(ANKI_REG(t2), GpuSceneArrays::Transform::getSingleton().getBufferView());
 
-	pass.newBufferDependency(indirectArgsDep, BufferUsageBit::kStorageComputeWrite);
-	pass.newBufferDependency(mem.m_bufferDepedency, BufferUsageBit::kStorageComputeWrite);
-	pass.newBufferDependency(in.m_dependency, BufferUsageBit::kIndirectCompute);
+				cmdb.bindStorageBuffer(ANKI_REG(t3), UnifiedGeometryBuffer::getSingleton().getBufferView());
 
-	pass.setWork([this, nonPassthroughData, computeIndirectArgs = in.m_taskShaderIndirectArgsBuffer, out,
-				  meshletGroupInstancesBuffer = in.m_meshletGroupInstancesBuffer,
-				  bucketMeshletGroupInstanceRanges = in.m_bucketMeshletGroupInstanceRanges](RenderPassWorkContext& rpass) {
-		CommandBuffer& cmdb = *rpass.m_commandBuffer;
+				if(hzbTex)
+				{
+					rpass.bindTexture(ANKI_REG(t4), frustumTestData->m_hzbRt);
+					cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
+				}
 
-		const U32 bucketCount = out.m_bucketMeshletInstanceRanges.getSize();
+				cmdb.bindStorageBuffer(ANKI_REG(t5), stage1Mem.m_counters);
+				cmdb.bindStorageBuffer(ANKI_REG(t6), stage1Mem.m_meshletPrefixSums);
+				cmdb.bindStorageBuffer(ANKI_REG(t7), stage1Mem.m_visibleMeshlets);
 
-		for(U32 i = 0; i < bucketCount; ++i)
-		{
-			if(out.m_bucketMeshletInstanceRanges[i].m_instanceCount == 0)
-			{
-				continue;
-			}
+				cmdb.bindStorageBuffer(ANKI_REG(u0), (bHwMeshletRendering) ? stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs
+																		   : stage2Mem.m_meshlet.m_indirectDrawArgs);
+				cmdb.bindStorageBuffer(ANKI_REG(u1), stage2Mem.m_meshlet.m_meshletInstances);
 
-			const Bool hasHzb = (nonPassthroughData) ? nonPassthroughData->m_hzbRt.isValid() : false;
-			const Bool isPassthrough = (nonPassthroughData == nullptr);
+				cmdb.bindStorageBuffer(ANKI_REG(u2), m_outOfMemoryReadbackBuffer);
 
-			cmdb.bindShaderProgram(m_meshletCullingGrProgs[hasHzb][isPassthrough].get());
+				if(!passthrough)
+				{
+					class Consts
+					{
+					public:
+						Mat4 m_viewProjectionMatrix;
+
+						Vec3 m_cameraPos;
+						U32 m_padding1;
+
+						Vec2 m_viewportSizef;
+						UVec2 m_padding2;
+					} consts;
+					consts.m_viewProjectionMatrix = frustumTestData->m_viewProjMat;
+					consts.m_cameraPos = lodReferencePoint;
+					consts.m_viewportSizef = Vec2(frustumTestData->m_finalRenderTargetSize);
+
+					cmdb.setPushConstants(&consts, sizeof(consts));
+				}
 
-			cmdb.bindStorageBuffer(ANKI_REG(t0), meshletGroupInstancesBuffer);
-			cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::Renderable::getSingleton().getBufferView());
-			cmdb.bindStorageBuffer(ANKI_REG(t2), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
-			cmdb.bindStorageBuffer(ANKI_REG(t3), GpuSceneArrays::Transform::getSingleton().getBufferView());
-			cmdb.bindStorageBuffer(ANKI_REG(t4), UnifiedGeometryBuffer::getSingleton().getBufferView());
-			cmdb.bindStorageBuffer(ANKI_REG(u0), out.m_drawIndirectArgsBuffer);
-			cmdb.bindStorageBuffer(ANKI_REG(u1), out.m_meshletInstancesBuffer);
-			if(hasHzb)
-			{
-				rpass.bindTexture(ANKI_REG(t5), nonPassthroughData->m_hzbRt);
-				cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
+				cmdb.dispatchComputeIndirect(
+					BufferView(stage1Mem.m_stage2IndirectArgs).incrementOffset(sizeof(DispatchIndirectArgs)).setRange(sizeof(DispatchIndirectArgs)));
 			}
+		});
 
-			class Consts
-			{
-			public:
-				Mat4 m_viewProjectionMatrix;
-
-				Vec3 m_cameraPos;
-				U32 m_firstDrawArg;
-
-				Vec2 m_viewportSizef;
-				U32 m_firstMeshletGroup;
-				U32 m_firstMeshlet;
-
-				U32 m_meshletCount;
-				U32 m_padding1;
-				U32 m_padding2;
-				U32 m_padding3;
-			} consts;
-			consts.m_viewProjectionMatrix = (!isPassthrough) ? nonPassthroughData->m_viewProjectionMatrix : Mat4::getIdentity();
-			consts.m_cameraPos = (!isPassthrough) ? nonPassthroughData->m_cameraTransform.getTranslationPart().xyz() : Vec3(0.0f);
-			consts.m_firstDrawArg = i;
-			consts.m_viewportSizef = (!isPassthrough) ? Vec2(nonPassthroughData->m_viewportSize) : Vec2(0.0f);
-			consts.m_firstMeshletGroup = bucketMeshletGroupInstanceRanges[i].getFirstInstance();
-			consts.m_firstMeshlet = out.m_bucketMeshletInstanceRanges[i].getFirstInstance();
-			consts.m_meshletCount = out.m_bucketMeshletInstanceRanges[i].getInstanceCount();
-			cmdb.setPushConstants(&consts, sizeof(consts));
-
-			cmdb.dispatchComputeIndirect(
-				BufferView(computeIndirectArgs).incrementOffset(i * sizeof(DispatchIndirectArgs)).setRange(sizeof(DispatchIndirectArgs)));
-		};
-	});
+	} // end 2nd stage
 }
 
 Error GpuVisibilityNonRenderables::init()

+ 37 - 144
AnKi/Renderer/Utils/GpuVisibility.h

@@ -56,6 +56,8 @@ public:
 
 	Bool m_gatherAabbIndices = false; ///< For debug draw.
 	Bool m_hashVisibles = false; ///< Create a hash for the visible renderables.
+
+	Bool m_limitMemory = false; ///< Use less memory, at the cost of some extra work-scheduling overhead.
 };
 
 /// @memberof GpuVisibility
@@ -91,19 +93,20 @@ public:
 		BufferView m_mdiDrawCountsBuffer; ///< An array of U32, one for each render state bucket (even those that use task/mesh flow).
 		BufferView m_drawIndexedIndirectArgsBuffer; ///< Array of DrawIndexedIndirectArgs or DrawIndirectArgs.
 
-		/// Defines the element sub-ranges in the m_renderableInstancesBuffer an m_drawIndexedIndirectArgsBuffer per render state bucket.
-		ConstWeakArray<InstanceRange> m_bucketRenderableInstanceRanges;
+		/// Defines the element sub-ranges in the m_drawIndexedIndirectArgsBuffer per render state bucket.
+		WeakArray<InstanceRange> m_bucketIndirectArgsRanges;
 	} m_legacy; ///< Legacy vertex shading.
 
 	class
 	{
 	public:
-		BufferView m_taskShaderIndirectArgsBuffer; ///< An array of DispatchIndirectArgs, one for each render state bucket.
-		BufferView m_meshletGroupInstancesBuffer; ///< Array with GpuSceneMeshletGroupInstance.
+		BufferView m_dispatchMeshIndirectArgsBuffer; ///< H/W meshlet rendering: array of DispatchIndirectArgs, one for each render state bucket.
+		BufferView m_drawIndirectArgs; ///< S/W meshlet rendering: array of DrawIndirectArgs, one for each render state bucket.
+
+		BufferView m_meshletInstancesBuffer;
 
-		/// Defines the element sub-ranges in the m_meshletGroupInstancesBuffer per render state bucket.
-		ConstWeakArray<InstanceRange> m_bucketMeshletGroupInstanceRanges;
-	} m_mesh; ///< S/W meshlets or H/W mesh shading.
+		BufferView m_firstMeshletBuffer; ///< For H/W meshlet rendering. Points to the first meshlet in the m_meshletInstancesBuffer. One per bucket.
+	} m_mesh; ///< S/W or H/W meshlet rendering.
 
 	BufferView m_visibleAaabbIndicesBuffer; ///< [Optional] Indices to the AABB buffer. The 1st element is the count.
 
@@ -115,67 +118,6 @@ public:
 	}
 };
 
-/// @memberof GpuVisibility
-class BaseGpuMeshletVisibilityInput
-{
-public:
-	CString m_passesName;
-
-	RenderingTechnique m_technique = RenderingTechnique::kCount;
-
-	BufferView m_taskShaderIndirectArgsBuffer; ///< Taken from GpuVisibilityOutput.
-	BufferView m_meshletGroupInstancesBuffer; ///< Taken from GpuVisibilityOutput.
-	ConstWeakArray<InstanceRange> m_bucketMeshletGroupInstanceRanges; ///< Taken from GpuVisibilityOutput.
-
-	BufferHandle m_dependency;
-
-	RenderGraphBuilder* m_rgraph = nullptr;
-
-	void fillBuffers(const GpuVisibilityOutput& perObjVisOut)
-	{
-		m_taskShaderIndirectArgsBuffer = perObjVisOut.m_mesh.m_taskShaderIndirectArgsBuffer;
-		m_meshletGroupInstancesBuffer = perObjVisOut.m_mesh.m_meshletGroupInstancesBuffer;
-		m_bucketMeshletGroupInstanceRanges = perObjVisOut.m_mesh.m_bucketMeshletGroupInstanceRanges;
-		m_dependency = perObjVisOut.m_dependency;
-	}
-};
-
-/// @memberof GpuVisibility
-class GpuMeshletVisibilityInput : public BaseGpuMeshletVisibilityInput
-{
-public:
-	Mat4 m_viewProjectionMatrix;
-	Mat3x4 m_cameraTransform;
-
-	/// The size of the viewport the visibility results will be used on. Used to kill objects that don't touch the sampling positions.
-	UVec2 m_viewportSize;
-
-	RenderTargetHandle m_hzbRt; ///< Optional.
-};
-
-/// @memberof GpuVisibility
-class PassthroughGpuMeshletVisibilityInput : public BaseGpuMeshletVisibilityInput
-{
-};
-
-/// @memberof GpuVisibility
-class GpuMeshletVisibilityOutput
-{
-public:
-	BufferView m_drawIndirectArgsBuffer; ///< Array of DrawIndirectArgs. One for every render state bucket (even those that use that flow).
-	BufferView m_meshletInstancesBuffer; ///< Array of GpuSceneMeshletInstance.
-
-	/// Defines the element sub-ranges in the m_meshletInstancesBuffer per render state bucket.
-	ConstWeakArray<InstanceRange> m_bucketMeshletInstanceRanges;
-
-	BufferHandle m_dependency; ///< Some dependency to wait on. Wait usage is indirect draw.
-
-	Bool isFilled() const
-	{
-		return m_dependency.isValid();
-	}
-};
-
 /// Performs GPU visibility for some pass.
 class GpuVisibility : public RendererObject
 {
@@ -198,96 +140,47 @@ public:
 		populateRenderGraphInternal(true, in, out);
 	}
 
-	/// Perform meshlet GPU visibility.
-	/// @note Not thread-safe.
-	void populateRenderGraph(GpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out)
-	{
-		populateRenderGraphMeshletInternal(false, in, out);
-	}
-
-	/// Perform meshlet GPU visibility.
-	/// @note Not thread-safe.
-	void populateRenderGraph(PassthroughGpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out)
-	{
-		populateRenderGraphMeshletInternal(true, in, out);
-	}
-
 private:
-	ShaderProgramResourcePtr m_prog;
-	Array4d<ShaderProgramPtr, 2, 2, 2, 3> m_frustumGrProgs;
-	Array3d<ShaderProgramPtr, 2, 2, 3> m_distGrProgs;
-
-	ShaderProgramResourcePtr m_meshletCullingProg;
-	Array2d<ShaderProgramPtr, 2, 2> m_meshletCullingGrProgs;
+	ShaderProgramResourcePtr m_1stStageProg;
+	Array4d<ShaderProgramPtr, 2, 2, 2, 2> m_frustumGrProgs;
+	Array3d<ShaderProgramPtr, 2, 2, 2> m_distGrProgs;
 
-	// Contains quite large buffer that we want want to reuse muptiple times in a single frame.
-	class PersistentMemory
-	{
-	public:
-		// Legacy
-		BufferView m_drawIndexedIndirectArgsBuffer;
-		BufferView m_renderableInstancesBuffer; ///< Instance rate vertex buffer.
-
-		// HW & SW Meshlet rendering
-		BufferView m_meshletGroupsInstancesBuffer;
-
-		// SW meshlet rendering
-		BufferView m_meshletInstancesBuffer; ///< Instance rate vertex buffer.
-
-		BufferHandle m_bufferDepedency;
-	};
+	ShaderProgramResourcePtr m_2ndStageProg;
+	ShaderProgramPtr m_gatherGrProg;
+	Array3d<ShaderProgramPtr, 2, 2, 2> m_meshletGrProgs;
 
-	class PersistentMemoryMeshletRendering
+	class
 	{
 	public:
-		// SW meshlet rendering
-		BufferView m_meshletInstancesBuffer; ///< Instance rate vertex buffer.
-
-		BufferHandle m_bufferDepedency;
-	};
+		class
+		{
+		public:
+			BufferView m_visibleRenderables;
+			BufferView m_visibleMeshlets;
+		} m_stage1;
 
-	class MemoryRequirements
-	{
-	public:
-		U32 m_renderableInstanceCount = 0; ///< Count of GpuSceneRenderableInstance and a few other things
-		U32 m_meshletGroupInstanceCount = 0; ///< Count of GpuSceneMeshletGroupInstance
-		U32 m_meshletInstanceCount = 0; ///< Count of GpuSceneMeshletInstance
+		class
+		{
+		public:
+			BufferView m_instanceRateRenderables;
+			BufferView m_drawIndexedIndirectArgs;
+		} m_stage2Legacy;
 
-		MemoryRequirements max(const MemoryRequirements& b)
+		class
 		{
-			MemoryRequirements out;
-#define ANKI_MAX(member) out.member = anki::max(member, b.member)
-			ANKI_MAX(m_renderableInstanceCount);
-			ANKI_MAX(m_meshletGroupInstanceCount);
-			ANKI_MAX(m_meshletInstanceCount);
-#undef ANKI_MAX
-			return out;
-		}
-	};
+		public:
+			BufferView m_meshletInstances;
+		} m_stage2Meshlet;
 
-	class
-	{
-	public:
 		U64 m_frameIdx = kMaxU64;
-		U32 m_populateRenderGraphCallCount = 0;
-		U32 m_populateRenderGraphMeshletRenderingCallCount = 0;
-
-		/// The more persistent memory there is the more passes will be able to run in parallel but the more memory is used.
-		Array<PersistentMemory, 4> m_persistentMem;
-		Array<PersistentMemoryMeshletRendering, 4> m_persistentMeshletRenderingMem; ///< See m_persistentMem.
 
-		Array<MemoryRequirements, U32(RenderingTechnique::kCount)> m_totalMemRequirements;
+		BufferHandle m_dep;
+	} m_persistentMemory;
 
-		Array<WeakArray<InstanceRange>, U32(RenderingTechnique::kCount)> m_renderableInstanceRanges;
-		Array<WeakArray<InstanceRange>, U32(RenderingTechnique::kCount)> m_meshletGroupInstanceRanges;
-		Array<WeakArray<InstanceRange>, U32(RenderingTechnique::kCount)> m_meshletInstanceRanges;
-	} m_runCtx;
+	MultiframeReadbackToken m_outOfMemoryReadback;
+	BufferView m_outOfMemoryReadbackBuffer;
 
 	void populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out);
-
-	void populateRenderGraphMeshletInternal(Bool passthrough, BaseGpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out);
-
-	static void computeGpuVisibilityMemoryRequirements(RenderingTechnique t, MemoryRequirements& total, WeakArray<MemoryRequirements> perBucket);
 };
 
 /// @memberof GpuVisibilityNonRenderables

+ 0 - 6
AnKi/Resource/MaterialResource.cpp

@@ -210,10 +210,6 @@ Error MaterialResource::parseShaderProgram(XmlElement shaderProgramEl, Bool asyn
 			m_techniquesMask |= RenderingTechniqueBit::kForward;
 			m_shaderTechniques |= ShaderTechniqueBit::kLegacy;
 		}
-		else if(t.m_name.getBegin() == CString("CommonTask"))
-		{
-			// Ignore
-		}
 		else
 		{
 			ANKI_RESOURCE_LOGE("Found unneeded technique in the shader: %s", t.m_name.getBegin());
@@ -579,7 +575,6 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 		if(key.getMeshletRendering() && meshShadersSupported)
 		{
 			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kMesh | ShaderTypeBit::kFragment, "GBufferMeshShaders");
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kTask, "CommonTask");
 		}
 		else if(key.getMeshletRendering())
 		{
@@ -594,7 +589,6 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 		if(key.getMeshletRendering() && meshShadersSupported)
 		{
 			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kMesh | ShaderTypeBit::kFragment, "ShadowsMeshShaders");
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kTask, "CommonTask");
 		}
 		else if(key.getMeshletRendering())
 		{

+ 23 - 14
AnKi/Scene/RenderStateBucket.cpp

@@ -13,12 +13,11 @@ RenderStateBucketContainer::~RenderStateBucketContainer()
 	{
 		for([[maybe_unused]] ExtendedBucket& b : m_buckets[t])
 		{
-			ANKI_ASSERT(!b.m_program.isCreated() && b.m_userCount == 0 && b.m_lod0MeshletGroupCount == 0 && b.m_lod0MeshletCount == 0);
+			ANKI_ASSERT(!b.m_program.isCreated() && b.m_userCount == 0 && b.m_lod0MeshletCount == 0);
 		}
 
 		ANKI_ASSERT(m_bucketActiveUserCount[t] == 0);
 		ANKI_ASSERT(m_activeBucketCount[t] == 0);
-		ANKI_ASSERT(m_lod0MeshletGroupCount[t] == 0);
 	}
 }
 
@@ -31,8 +30,6 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	toHash[2] = state.m_indexedDrawcall;
 	const U64 hash = computeHash(toHash.getBegin(), toHash.getSizeInBytes());
 
-	const U32 meshletGroupCount = (lod0MeshletCount + (kMeshletGroupSize - 1)) / kMeshletGroupSize;
-
 	SceneDynamicArray<ExtendedBucket>& buckets = m_buckets[technique];
 
 	RenderStateBucketIndex out;
@@ -41,7 +38,7 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	LockGuard lock(m_mtx);
 
 	++m_bucketActiveUserCount[technique];
-	m_lod0MeshletGroupCount[technique] += meshletGroupCount;
+	m_bucketActiveUserCountWithMeshlets[technique] += (lod0MeshletCount) ? 1 : 0;
 	m_lod0MeshletCount[technique] += lod0MeshletCount;
 
 	// Search bucket
@@ -49,14 +46,27 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	{
 		if(buckets[i].m_hash == hash)
 		{
+			// Bucket found
+
+			if(buckets[i].m_userCount > 0)
+			{
+				if(lod0MeshletCount)
+				{
+					ANKI_ASSERT(buckets[i].m_lod0MeshletCount > 0 && "A bucket either does meshlet rendering or not");
+				}
+				else
+				{
+					ANKI_ASSERT(buckets[i].m_lod0MeshletCount == 0 && "A bucket either does meshlet rendering or not");
+				}
+			}
+
 			++buckets[i].m_userCount;
-			buckets[i].m_lod0MeshletGroupCount += meshletGroupCount;
 			buckets[i].m_lod0MeshletCount += lod0MeshletCount;
 
 			if(buckets[i].m_userCount == 1)
 			{
 				ANKI_ASSERT(!buckets[i].m_program.isCreated());
-				ANKI_ASSERT(buckets[i].m_lod0MeshletGroupCount == meshletGroupCount && buckets[i].m_lod0MeshletCount == lod0MeshletCount);
+				ANKI_ASSERT(buckets[i].m_lod0MeshletCount == lod0MeshletCount);
 				buckets[i].m_program = state.m_program;
 				++m_activeBucketCount[technique];
 
@@ -80,7 +90,6 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	newBucket.m_primitiveTopology = state.m_primitiveTopology;
 	newBucket.m_program = state.m_program;
 	newBucket.m_userCount = 1;
-	newBucket.m_lod0MeshletGroupCount = meshletGroupCount;
 	newBucket.m_lod0MeshletCount = lod0MeshletCount;
 
 	++m_activeBucketCount[technique];
@@ -101,7 +110,6 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 
 	const RenderingTechnique technique = bucketIndex.m_technique;
 	const U32 idx = bucketIndex.m_index;
-	const U32 meshletGroupCount = (bucketIndex.m_lod0MeshletCount + (kMeshletGroupSize - 1)) / kMeshletGroupSize;
 	const U32 meshletCount = bucketIndex.m_lod0MeshletCount;
 	bucketIndex.invalidate();
 
@@ -112,18 +120,19 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 	ANKI_ASSERT(m_bucketActiveUserCount[technique] > 0);
 	--m_bucketActiveUserCount[technique];
 
-	ANKI_ASSERT(m_lod0MeshletGroupCount[technique] >= meshletGroupCount);
-	m_lod0MeshletGroupCount[technique] -= meshletGroupCount;
+	if(meshletCount)
+	{
+		ANKI_ASSERT(m_bucketActiveUserCountWithMeshlets[technique] >= 1);
+		--m_bucketActiveUserCountWithMeshlets[technique];
+	}
 
 	ANKI_ASSERT(m_lod0MeshletCount[technique] >= meshletCount);
 	m_lod0MeshletCount[technique] -= meshletCount;
 
 	ExtendedBucket& bucket = m_buckets[technique][idx];
-	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_lod0MeshletGroupCount >= meshletGroupCount
-				&& bucket.m_lod0MeshletCount >= meshletCount);
+	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_lod0MeshletCount >= meshletCount);
 
 	--bucket.m_userCount;
-	bucket.m_lod0MeshletGroupCount -= meshletGroupCount;
 	bucket.m_lod0MeshletCount -= meshletCount;
 
 	if(bucket.m_userCount == 0)

+ 14 - 7
AnKi/Scene/RenderStateBucket.h

@@ -76,6 +76,7 @@ private:
 };
 
 /// Holds an array of all render state buckets.
+/// It creates buckets at will. If a bucket loses all its users then it becomes inactive but still remains part of the bucket list.
 class RenderStateBucketContainer : public MakeSingleton<RenderStateBucketContainer>
 {
 	template<typename>
@@ -96,7 +97,7 @@ public:
 	{
 		for(const ExtendedBucket& b : m_buckets[technique])
 		{
-			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_lod0MeshletGroupCount, b.m_lod0MeshletCount);
+			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_lod0MeshletCount);
 		}
 	}
 
@@ -107,7 +108,7 @@ public:
 		for(U32 i : m_bucketPerfOrder[technique])
 		{
 			const ExtendedBucket& b = m_buckets[technique][i];
-			func(static_cast<const RenderStateInfo&>(b), i, b.m_userCount, b.m_lod0MeshletGroupCount, b.m_lod0MeshletCount);
+			func(static_cast<const RenderStateInfo&>(b), i, b.m_userCount, b.m_lod0MeshletCount);
 		}
 	}
 
@@ -117,10 +118,17 @@ public:
 		return m_bucketActiveUserCount[technique];
 	}
 
-	/// Get the number of meshlet groups of a technique.
-	U32 getBucketsLod0MeshletGroupCount(RenderingTechnique technique) const
+	/// Get the number of renderables of all the buckets that support meshlets.
+	U32 getBucketsActiveUserCountWithMeshletSupport(RenderingTechnique technique) const
 	{
-		return m_lod0MeshletGroupCount[technique];
+		return m_bucketActiveUserCountWithMeshlets[technique];
+	}
+
+	/// Get the number of renderables of all the buckets that don't support meshlets.
+	U32 getBucketsActiveUserCountWithNoMeshletSupport(RenderingTechnique technique) const
+	{
+		ANKI_ASSERT(m_bucketActiveUserCount[technique] >= m_bucketActiveUserCountWithMeshlets[technique]);
+		return m_bucketActiveUserCount[technique] - m_bucketActiveUserCountWithMeshlets[technique];
 	}
 
 	/// Get the number of meshlets of a technique of LOD 0.
@@ -147,13 +155,12 @@ private:
 	public:
 		U64 m_hash = 0;
 		U32 m_userCount = 0;
-		U32 m_lod0MeshletGroupCount = 0;
 		U32 m_lod0MeshletCount = 0;
 	};
 
 	Array<SceneDynamicArray<ExtendedBucket>, U32(RenderingTechnique::kCount)> m_buckets;
 	Array<U32, U32(RenderingTechnique::kCount)> m_bucketActiveUserCount = {};
-	Array<U32, U32(RenderingTechnique::kCount)> m_lod0MeshletGroupCount = {};
+	Array<U32, U32(RenderingTechnique::kCount)> m_bucketActiveUserCountWithMeshlets = {};
 	Array<U32, U32(RenderingTechnique::kCount)> m_lod0MeshletCount = {};
 	Array<U32, U32(RenderingTechnique::kCount)> m_activeBucketCount = {};
 	Array<SceneDynamicArray<U32>, U32(RenderingTechnique::kCount)> m_bucketPerfOrder; ///< Orders the buckets from the least heavy to the most.

+ 18 - 0
AnKi/Shaders/Common.hlsl

@@ -113,3 +113,21 @@ DEFINE_COMPARISON2(max)
 
 #undef DEFINE_COMPARISON2
 #undef DEFINE_COMPARISON
+
+template<typename T>
+U32 getStructuredBufferElementCount(T x)
+{
+	U32 size, stride;
+	x.GetDimensions(size, stride);
+	return size;
+}
+
+template<typename T>
+U32 checkStructuredBuffer(T buff, U32 idx)
+{
+	ANKI_ASSERT(idx < getStructuredBufferElementCount(buff));
+	return 0u;
+}
+
+// Safely access a structured buffer. Triggers an assertion if the index is out of bounds.
+#define SBUFF(buff, idx) buff[(idx) + checkStructuredBuffer(buff, idx)]

+ 14 - 120
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -161,15 +161,6 @@ struct FragOut
 	Vec2 m_color3 : SV_TARGET3;
 };
 
-struct TaskOut
-{
-	U32 m_firstMeshletGeometryDescriptor;
-	U32 m_visibleMeshletsRelativeIndices[kMeshletGroupSize / sizeof(U32)];
-	U32 m_worldTransformsIndex;
-	U32 m_uniformsOffset;
-	U32 m_boneTransformsOrParticleEmitterOffset;
-};
-
 struct Mat3x4_2
 {
 	Mat3x4 m_a;
@@ -317,103 +308,6 @@ VertOut main(VertIn input)
 };
 #endif // ANKI_VERTEX_SHADER
 
-// ===========================================================================
-// Task                                                                      =
-// ===========================================================================
-#if ANKI_TASK_SHADER
-groupshared TaskOut s_payload;
-groupshared U32 s_visibleMeshletCount;
-
-struct FirstPayload
-{
-	UVec4 m_val;
-};
-
-ANKI_PUSH_CONSTANTS(FirstPayload, g_firstPayload)
-
-[numthreads(ANKI_TASK_SHADER_THREADGROUP_SIZE, 1, 1)] void main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX)
-{
-	const GpuSceneMeshletGroupInstance inPayload = g_meshletGroups[g_firstPayload.m_val.x + svGroupId];
-
-	const U32 lod = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
-	const U32 renderableIdx = (inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);
-	const U32 meshletGroup = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit & ((1u << 9u) - 1u);
-
-	const GpuSceneRenderable renderable = g_renderables[renderableIdx];
-	const GpuSceneMeshLod meshLod = g_meshLods[renderable.m_meshLodsIndex + lod];
-	U32 firstMeshletBoundingVolume = meshletGroup * kMeshletGroupSize;
-	const U32 meshletCount = min(kMeshletGroupSize, meshLod.m_meshletCount - firstMeshletBoundingVolume);
-	firstMeshletBoundingVolume += meshLod.m_firstMeshletBoundingVolume;
-	const U32 firstMeshletGeometryDescriptor = meshletGroup * kMeshletGroupSize + meshLod.m_firstMeshletGeometryDescriptor;
-
-	if(svGroupIndex == 0u)
-	{
-		s_payload.m_firstMeshletGeometryDescriptor = firstMeshletGeometryDescriptor;
-		s_payload.m_worldTransformsIndex = renderable.m_worldTransformsIndex;
-		s_payload.m_uniformsOffset = renderable.m_uniformsOffset;
-		s_payload.m_boneTransformsOrParticleEmitterOffset = renderable.m_boneTransformsOffset;
-
-		s_visibleMeshletCount = 0;
-
-		[unroll] for(U32 i = 0; i < kMeshletGroupSize / sizeof(U32); ++i)
-		{
-			s_payload.m_visibleMeshletsRelativeIndices[i] = 0u;
-		}
-	}
-
-	GroupMemoryBarrierWithGroupSync();
-
-	if(svGroupIndex < meshletCount)
-	{
-		Bool cull = false;
-
-		const MeshletBoundingVolume meshletBoundingVol = g_meshletBoundingVolumes[firstMeshletBoundingVolume + svGroupIndex];
-		const Mat3x4 worldTransform = g_transforms[renderable.m_worldTransformsIndex];
-
-#	if MESHLET_BACKFACE_CULLING
-		cull = cullBackfaceMeshlet(meshletBoundingVol, worldTransform, g_globalUniforms.m_cameraTransform.getTranslationPart());
-#	endif
-
-		const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
-		const Mat4 mvp = mul(g_globalUniforms.m_viewProjectionMatrix, wordTransform4);
-
-		Vec2 minNdc, maxNdc;
-		F32 aabbMinDepth;
-		projectAabb(meshletBoundingVol.m_aabbMin, meshletBoundingVol.m_aabbMax, mvp, minNdc, maxNdc, aabbMinDepth);
-
-#	if MESHLET_OUTSIDE_OF_SCREEN_CULLING
-		// Outside of the screen
-		cull = cull || (any(minNdc > 1.0f) || any(maxNdc < -1.0f));
-#	endif
-
-#	if MESHLET_NO_SAMPLING_POINT_CULLING
-		// Sampling points test
-		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_globalUniforms.m_viewport.zw;
-		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_globalUniforms.m_viewport.zw;
-		cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
-#	endif
-
-#	if MESHLET_HZB_CULLING
-		cull = cull || (g_globalUniforms.m_enableHzbTesting == 1u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
-#	endif
-
-		if(!cull)
-		{
-			U32 idx;
-			InterlockedAdd(s_visibleMeshletCount, 1u, idx);
-
-			const U32 groupIdx = idx / 4u;
-			const U32 localIdx = idx % 4u;
-
-			const U32 mask = svGroupIndex << (localIdx * 8u);
-			InterlockedOr(s_payload.m_visibleMeshletsRelativeIndices[groupIdx], mask);
-		}
-	}
-
-	DispatchMesh(s_visibleMeshletCount, 1, 1, s_payload);
-}
-#endif
-
 // ===========================================================================
 // Mesh                                                                      =
 // ===========================================================================
@@ -426,15 +320,18 @@ groupshared F32 s_clipW[kMaxVerticesPerMeshlet];
 constexpr int g_trick = 0; // Trick the formatter
 
 [numthreads(ANKI_MESH_SHADER_THREADGROUP_SIZE, 1, 1)] [outputtopology("triangle")] void
-main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, in payload TaskOut payload,
-	 out vertices MeshPerVertOut verts[kMaxVerticesPerMeshlet], out primitives MeshPerPrimitiveOut primitives[kMaxPrimitivesPerMeshlet],
-	 out indices UVec3 indices[kMaxPrimitivesPerMeshlet])
+main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, out vertices MeshPerVertOut verts[kMaxVerticesPerMeshlet],
+	 out primitives MeshPerPrimitiveOut primitives[kMaxPrimitivesPerMeshlet], out indices UVec3 indices[kMaxPrimitivesPerMeshlet])
 {
-	const U32 groupIdx = svGroupId / 4u;
-	const U32 localIdx = svGroupId % 4u;
-	const U32 relativeMeshletIdx = (payload.m_visibleMeshletsRelativeIndices[groupIdx] >> (localIdx * 8u)) & 0xFFu;
+	const U32 instanceIdx = g_firstMeshlet[g_pushConsts.m_bucketIndex] + svGroupId;
 
-	const MeshletGeometryDescriptor meshlet = g_meshletGeometryDescriptors[payload.m_firstMeshletGeometryDescriptor + relativeMeshletIdx];
+	const GpuSceneMeshletInstance instance = g_meshletInstances[instanceIdx];
+	const U32 uniformsOffset = instance.m_uniformsOffset;
+	const U32 worldTransformsIndex = instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit >> 7u;
+	const U32 boneTransformsOffset = instance.m_boneTransformsOffsetOrParticleEmitterIndex;
+	ANKI_MAYBE_UNUSED(boneTransformsOffset);
+
+	const MeshletGeometryDescriptor meshlet = g_meshletGeometryDescriptors[instance.m_meshletGeometryDescriptorIndex];
 	const U32 primCount = meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint >> 16u;
 	const U32 vertCount = meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint & 0xFFFFu;
 
@@ -452,8 +349,8 @@ main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, in payload Ta
 
 			UnpackedMeshVertex vert = loadVertex(meshlet, idx, ANKI_BONES);
 
-			const Mat3x4 worldTransform = g_transforms[payload.m_worldTransformsIndex];
-			const Mat3x4 prevWorldTransform = g_transforms[payload.m_worldTransformsIndex];
+			const Mat3x4 worldTransform = g_transforms[worldTransformsIndex];
+			const Mat3x4 prevWorldTransform = g_transforms[worldTransformsIndex + 1u];
 			ANKI_MAYBE_UNUSED(prevWorldTransform);
 
 #	if UVS
@@ -464,7 +361,7 @@ main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, in payload Ta
 
 			// Do stuff
 #	if ANKI_BONES
-			skinning(vert, payload.m_boneTransformsOrParticleEmitterOffset, vert.m_position, prevPos, vert.m_normal);
+			skinning(vert, boneTransformsOffset, vert.m_position, prevPos, vert.m_normal);
 #	endif
 
 			const Vec3 worldPos = mul(worldTransform, Vec4(vert.m_position, 1.0));
@@ -534,7 +431,7 @@ main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX, in payload Ta
 			primitives[idx].m_cullPrimitive = cull;
 #	endif
 
-			primitives[idx].m_uniformsOffset = payload.m_uniformsOffset;
+			primitives[idx].m_uniformsOffset = uniformsOffset;
 #	if VISUALIZE_MESHLETS
 			primitives[idx].m_meshletIndex = relativeMeshletIdx;
 #	endif
@@ -782,9 +679,6 @@ FragOut main(
 #pragma anki technique_start vert ShadowsSwMeshletRendering uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
 #pragma anki technique_end vert ShadowsSwMeshletRendering
 
-#pragma anki technique_start task CommonTask uses_mutators
-#pragma anki technique_end task CommonTask
-
 #pragma anki technique_start mesh GBufferMeshShaders uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
 #pragma anki technique_end mesh GBufferMeshShaders
 

+ 0 - 292
AnKi/Shaders/GpuVisibility.ankiprog

@@ -1,292 +0,0 @@
-// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
-// All rights reserved.
-// Code licensed under the BSD License.
-// http://www.anki3d.org/LICENSE
-
-#pragma anki mutator HZB_TEST 0 1
-#pragma anki mutator DISTANCE_TEST 0 1
-#pragma anki mutator GATHER_AABBS 0 1
-#pragma anki mutator HASH_VISIBLES 0 1
-#pragma anki mutator GATHER_TYPE 1 2 3
-
-#pragma anki skip_mutation DISTANCE_TEST 1 HZB_TEST 1
-
-#define GATHER_MDI (GATHER_TYPE & 1u)
-#define GATHER_MESHLET_GROUPS (GATHER_TYPE & 2u)
-
-#pragma anki technique_start comp
-
-#include <AnKi/Shaders/Common.hlsl>
-#include <AnKi/Shaders/Include/GpuSceneTypes.h>
-#include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
-#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
-
-struct DrawIndirectArgsWithPadding
-{
-	U32 m_vertexCount;
-	U32 m_instanceCount;
-	U32 m_firstVertex;
-	U32 m_firstInstance;
-	U32 m_padding;
-};
-
-// Buffers that point to the GPU scene
-StructuredBuffer<GpuSceneRenderableBoundingVolume> g_renderableBoundingVolumes : register(t0);
-StructuredBuffer<GpuSceneRenderable> g_renderables : register(t1);
-StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t2);
-StructuredBuffer<Mat3x4> g_transforms : register(t3);
-StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(t4);
-
-#if GATHER_MDI
-// These 3 have the same size
-RWStructuredBuffer<UVec4> g_instanceRateRenderables : register(u0);
-RWStructuredBuffer<DrawIndexedIndirectArgs> g_drawIndexedIndirectArgs : register(u1);
-RWStructuredBuffer<DrawIndirectArgsWithPadding> g_drawIndirectArgs : register(u2);
-
-// The MDI counts. One for each render state bucket
-RWStructuredBuffer<U32> g_mdiDrawCounts : register(u3);
-#endif
-
-#if GATHER_MESHLET_GROUPS
-// For mesh shading
-RWStructuredBuffer<DispatchIndirectArgs> g_taskShaderIndirectArgs : register(u4);
-RWStructuredBuffer<GpuSceneMeshletGroupInstance> g_meshletGroupInstances : register(u5);
-#endif
-
-// One for each render state bucket. It's either the index of the next indirect args or the index to the next task payload
-StructuredBuffer<UVec2> g_instanceRanges : register(t5);
-
-#if DISTANCE_TEST == 0
-ConstantBuffer<FrustumGpuVisibilityUniforms> g_unis : register(b0);
-#else
-ANKI_PUSH_CONSTANTS(DistanceGpuVisibilityUniforms, g_unis)
-#endif
-
-#if HZB_TEST
-Texture2D<Vec4> g_hzbTex : register(t6);
-SamplerState g_nearestAnyClampSampler : register(s0);
-#endif
-
-#if GATHER_AABBS
-RWStructuredBuffer<U32> g_visibleAabbIndices : register(u6); ///< Indices of the visible AABBs. The 1st element is the count.
-#endif
-
-#if HASH_VISIBLES
-RWStructuredBuffer<GpuVisibilityHash> g_hash : register(u7);
-#endif
-
-[numthreads(64, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
-{
-	const U32 bvolumeIdx = svDispatchThreadId.x;
-	U32 bvolumeCount;
-	U32 unused;
-	g_renderableBoundingVolumes.GetDimensions(bvolumeCount, unused);
-	if(bvolumeIdx >= bvolumeCount)
-	{
-		return;
-	}
-
-	const GpuSceneRenderableBoundingVolume bvolume = g_renderableBoundingVolumes[bvolumeIdx];
-
-	const Vec3 sphereCenter = (bvolume.m_aabbMin + bvolume.m_aabbMax) * 0.5f;
-	const F32 sphereRadius = bvolume.m_sphereRadius;
-
-#if DISTANCE_TEST == 0
-	// Frustum test
-	//
-	if(!frustumTest(g_unis.m_clipPlanes, sphereCenter, sphereRadius))
-	{
-		return;
-	}
-
-	// Screen-space AABB calculation and checking
-	//
-	Vec2 minNdc, maxNdc;
-	F32 aabbMinDepth;
-	projectAabb(bvolume.m_aabbMin, bvolume.m_aabbMax, g_unis.m_viewProjectionMat, minNdc, maxNdc, aabbMinDepth);
-
-	if(any(minNdc > 1.0f) || any(maxNdc < -1.0f))
-	{
-		// Outside of the screen
-		return;
-	}
-
-	const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_unis.m_finalRenderTargetSize;
-	const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_unis.m_finalRenderTargetSize;
-	if(any(round(windowCoordsMin) == round(windowCoordsMax)))
-	{
-		// Doesn't touch the sampling points
-		return;
-	}
-
-	// HiZ culling
-	//
-#	if HZB_TEST
-	if(cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTex, g_nearestAnyClampSampler))
-	{
-		return;
-	}
-#	endif // HZB_TEST
-
-#else // DISTANCE_TEST == 1
-	if(!testSphereSphereCollision(sphereCenter, sphereRadius, g_unis.m_pointOfTest, g_unis.m_testRadius))
-	{
-		return;
-	}
-#endif
-
-	// Compute the LOD
-	//
-	const F32 distFromLodPoint = length(sphereCenter - g_unis.m_lodReferencePoint) - sphereRadius;
-
-	U32 lod;
-	if(distFromLodPoint < g_unis.m_maxLodDistances[0])
-	{
-		lod = 0u;
-	}
-	else if(distFromLodPoint < g_unis.m_maxLodDistances[1])
-	{
-		lod = 1u;
-	}
-	else
-	{
-		lod = 2u;
-	}
-
-	// Add the drawcall
-	//
-	const U32 renderStateBucket = bvolume.m_renderableIndex_20bit_renderStateBucket_12bit & ((1u << 12u) - 1u);
-	const U32 renderableIdx = bvolume.m_renderableIndex_20bit_renderStateBucket_12bit >> 12u;
-
-	const GpuSceneRenderable renderable = g_renderables[renderableIdx];
-	const U32 meshLodIndex = renderable.m_meshLodsIndex + lod;
-	const GpuSceneMeshLod meshLod = g_meshLods[meshLodIndex];
-
-	const Bool isParticleEmitter = renderable.m_particleEmitterIndex < kMaxU32;
-	ANKI_MAYBE_UNUSED(isParticleEmitter);
-
-	const Bool usesMeshShaders = meshLod.m_meshletCount != 0u;
-	if(usesMeshShaders)
-	{
-#if GATHER_MESHLET_GROUPS
-		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMeshletGroupSize - 1u)) / kMeshletGroupSize;
-
-		U32 instanceIdx;
-		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, instanceIdx);
-
-		if(instanceIdx == 0u)
-		{
-			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 1u;
-			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountZ = 1u;
-		}
-		else if(instanceIdx >= g_instanceRanges[renderStateBucket].y)
-		{
-			// Reached a memory limit, cancel the job
-			ANKI_ASSERT(0);
-			instanceIdx = 0;
-			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 0u;
-		}
-
-		instanceIdx += g_instanceRanges[renderStateBucket].x;
-
-		// Divide the mesh into meshlet groups and add them as task payloads
-		GpuSceneMeshletGroupInstance instance;
-		instance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit = (lod << 30u) | (renderableIdx << 9u);
-
-		for(U32 i = 0; i < meshletGroupCount; ++i)
-		{
-			g_meshletGroupInstances[instanceIdx + i] = instance;
-
-			++instance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
-		}
-#endif
-	}
-	else
-	{
-#if GATHER_MDI
-		U32 bucketDrawcallIdx;
-		InterlockedAdd(g_mdiDrawCounts[renderStateBucket], 1, bucketDrawcallIdx);
-
-		if(bucketDrawcallIdx >= g_instanceRanges[renderStateBucket].y)
-		{
-			// OoM, ignore
-			ANKI_ASSERT(0);
-			U32 orig;
-			InterlockedExchange(g_mdiDrawCounts[renderStateBucket], g_instanceRanges[renderStateBucket].y, orig);
-		}
-		else
-		{
-			const U32 indirectIdx = bucketDrawcallIdx + g_instanceRanges[renderStateBucket].x;
-			if(!isParticleEmitter)
-			{
-				// Regular renderables are always indexed
-
-				DrawIndexedIndirectArgs indirect;
-				indirect.m_indexCount = meshLod.m_indexCount;
-				indirect.m_instanceCount = 1;
-				indirect.m_firstIndex = meshLod.m_firstIndex;
-				indirect.m_vertexOffset = 0;
-				indirect.m_firstInstance = bucketDrawcallIdx;
-				g_drawIndexedIndirectArgs[indirectIdx] = indirect;
-
-				UVec4 instanceVertex;
-				instanceVertex.x = renderable.m_worldTransformsIndex;
-				instanceVertex.y = renderable.m_uniformsOffset;
-				instanceVertex.z = meshLodIndex;
-				instanceVertex.w = renderable.m_boneTransformsOffset;
-				g_instanceRateRenderables[indirectIdx] = instanceVertex;
-			}
-			else
-			{
-				const GpuSceneParticleEmitter emitter = g_particleEmitters[renderable.m_particleEmitterIndex];
-
-				DrawIndirectArgsWithPadding indirect;
-				indirect.m_vertexCount = emitter.m_aliveParticleCount * meshLod.m_indexCount;
-				indirect.m_instanceCount = 1;
-				indirect.m_firstVertex = 0;
-				indirect.m_firstInstance = bucketDrawcallIdx;
-				indirect.m_padding = 0;
-				g_drawIndirectArgs[indirectIdx] = indirect;
-
-				UVec4 instanceVertex;
-				instanceVertex.x = renderable.m_worldTransformsIndex;
-				instanceVertex.y = renderable.m_uniformsOffset;
-				instanceVertex.z = meshLodIndex;
-				instanceVertex.w = renderable.m_particleEmitterIndex;
-				g_instanceRateRenderables[indirectIdx] = instanceVertex;
-			}
-		}
-#endif
-	}
-
-#if HASH_VISIBLES
-	// Update the renderables hash
-	{
-		// Transform a random point as a way to get a feel for the transform
-		const Mat3x4 trf = g_transforms[renderable.m_worldTransformsIndex];
-		const Vec3 pt = mul(trf, Vec4(1503.98f, 2006.8f, -1400.16f, 1.0f));
-		const UVec3 ptu = UVec3(asuint(pt.x), asuint(pt.y), asuint(pt.z));
-
-		U32 hash = ptu.x;
-		hash ^= ptu.y;
-		hash ^= ptu.z;
-		hash ^= renderable.m_uuid;
-
-		InterlockedXor(g_hash[0].m_renderablesHash, hash);
-
-		const Bool deformable = isParticleEmitter || renderable.m_boneTransformsOffset != 0;
-		if(deformable)
-		{
-			g_hash[0].m_containsDeformable = 1;
-		}
-	}
-#endif
-
-#if GATHER_AABBS
-	U32 index;
-	InterlockedAdd(g_visibleAabbIndices[0], 1, index);
-	g_visibleAabbIndices[index + 1] = bvolumeIdx;
-#endif
-}
-
-#pragma anki technique_end comp

+ 0 - 137
AnKi/Shaders/GpuVisibilityMeshlet.ankiprog

@@ -1,137 +0,0 @@
-// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
-// All rights reserved.
-// Code licensed under the BSD License.
-// http://www.anki3d.org/LICENSE
-
-#pragma anki mutator HZB_TEST 0 1
-#pragma anki mutator PASSTHROUGH 0 1
-
-#pragma anki technique_start comp
-
-#include <AnKi/Shaders/Common.hlsl>
-#include <AnKi/Shaders/Include/GpuSceneTypes.h>
-#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
-#include <AnKi/Shaders/PackFunctions.hlsl>
-
-#define MESHLET_BACKFACE_CULLING 1
-#define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
-#define MESHLET_NO_SAMPLING_POINT_CULLING 1
-#define MESHLET_HZB_CULLING HZB_TEST
-
-#define THREADGROUP_SIZE ANKI_TASK_SHADER_THREADGROUP_SIZE
-
-StructuredBuffer<GpuSceneMeshletGroupInstance> g_meshletGroupInstances : register(t0);
-StructuredBuffer<GpuSceneRenderable> g_renderables : register(t1);
-StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t2);
-StructuredBuffer<Mat3x4> g_transforms : register(t3);
-StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes : register(t4);
-RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArgs : register(u0);
-RWStructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances : register(u1);
-Texture2D<Vec4> g_hzbTexture : register(t5);
-SamplerState g_nearestClampSampler : register(s0);
-
-struct Consts
-{
-	Mat4 m_viewProjectionMatrix;
-
-	Vec3 m_cameraPos;
-	U32 m_firstDrawArg;
-
-	Vec2 m_viewportSizef;
-	U32 m_firstMeshletGroup;
-	U32 m_firstMeshlet;
-
-	U32 m_maxMeshlets;
-	U32 m_padding1;
-	U32 m_padding2;
-	U32 m_padding3;
-};
-ANKI_PUSH_CONSTANTS(Consts, g_unis)
-
-[numthreads(THREADGROUP_SIZE, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupId : SV_GROUPID,
-											   U32 svGroupIndex : SV_GROUPINDEX)
-{
-	const GpuSceneMeshletGroupInstance groupInstance = g_meshletGroupInstances[g_unis.m_firstMeshletGroup + svGroupId];
-
-	const U32 lod = groupInstance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
-	const U32 renderableIdx = (groupInstance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);
-	const U32 meshletGroup = groupInstance.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit & ((1u << 9u) - 1u);
-
-	const GpuSceneRenderable renderable = g_renderables[renderableIdx];
-	const GpuSceneMeshLod meshLod = g_meshLods[renderable.m_meshLodsIndex + lod];
-	U32 firstMeshletBoundingVolume = meshletGroup * kMeshletGroupSize;
-	const U32 meshletCount = min(kMeshletGroupSize, meshLod.m_meshletCount - firstMeshletBoundingVolume);
-	firstMeshletBoundingVolume += meshLod.m_firstMeshletBoundingVolume;
-	const U32 firstMeshletGeometryDescriptor = meshletGroup * kMeshletGroupSize + meshLod.m_firstMeshletGeometryDescriptor;
-
-	// Meshlet culling
-	if(svGroupIndex < meshletCount)
-	{
-		Bool cull = false;
-
-		const MeshletBoundingVolume meshletBoundingVol = g_meshletBoundingVolumes[firstMeshletBoundingVolume + svGroupIndex];
-
-#if !PASSTHROUGH
-
-		const Mat3x4 worldTransform = g_transforms[renderable.m_worldTransformsIndex];
-
-#	if MESHLET_BACKFACE_CULLING
-		const Vec4 coneDirAndAng = unpackSnorm4x8(meshletBoundingVol.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
-		cull = cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform, g_unis.m_cameraPos);
-#	endif
-
-		const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
-		const Mat4 mvp = mul(g_unis.m_viewProjectionMatrix, wordTransform4);
-
-		Vec2 minNdc, maxNdc;
-		F32 aabbMinDepth;
-		projectAabb(meshletBoundingVol.m_aabbMin, meshletBoundingVol.m_aabbMax, mvp, minNdc, maxNdc, aabbMinDepth);
-
-#	if MESHLET_OUTSIDE_OF_SCREEN_CULLING
-		// Outside of the screen
-		cull = cull || (any(minNdc > 1.0f) || any(maxNdc < -1.0f));
-#	endif
-
-#	if MESHLET_NO_SAMPLING_POINT_CULLING
-		// Sampling points test
-		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_unis.m_viewportSizef;
-		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_unis.m_viewportSizef;
-		cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
-#	endif
-
-#	if MESHLET_HZB_CULLING
-		cull = cull || (renderable.m_boneTransformsOffset == 0u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
-#	endif
-
-#endif // !PASSTHROUGH
-
-		if(!cull)
-		{
-			U32 instanceIdx;
-			InterlockedAdd(g_indirectDrawArgs[g_unis.m_firstDrawArg].m_instanceCount, 1u, instanceIdx);
-
-			if(instanceIdx >= g_unis.m_maxMeshlets)
-			{
-				// OoM, ignore
-				U32 orig;
-				InterlockedExchange(g_indirectDrawArgs[g_unis.m_firstDrawArg].m_instanceCount, g_unis.m_maxMeshlets, orig);
-			}
-			else
-			{
-				InterlockedMax(g_indirectDrawArgs[g_unis.m_firstDrawArg].m_vertexCount, meshletBoundingVol.m_primitiveCount * 3u);
-
-				GpuSceneMeshletInstance instance;
-				instance.m_meshletGeometryDescriptorIndex = firstMeshletGeometryDescriptor + svGroupIndex;
-				instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit = renderable.m_worldTransformsIndex << 7u;
-				instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit |= meshletBoundingVol.m_primitiveCount;
-				instance.m_uniformsOffset = renderable.m_uniformsOffset;
-				instance.m_boneTransformsOffsetOrParticleEmitterIndex =
-					(renderable.m_boneTransformsOffset) ? renderable.m_boneTransformsOffset : renderable.m_particleEmitterIndex;
-
-				g_meshletInstances[g_unis.m_firstMeshlet + instanceIdx] = instance;
-			}
-		}
-	}
-}
-
-#pragma anki technique_end comp

+ 318 - 0
AnKi/Shaders/GpuVisibilityStage1.ankiprog

@@ -0,0 +1,318 @@
+// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki mutator HZB_TEST 0 1
+#pragma anki mutator DISTANCE_TEST 0 1
+#pragma anki mutator GATHER_AABBS 0 1
+#pragma anki mutator HASH_VISIBLES 0 1
+#pragma anki mutator GATHER_MESHLETS 0 1
+
+#pragma anki skip_mutation DISTANCE_TEST 1 HZB_TEST 1
+
+#pragma anki technique_start comp
+
+#include <AnKi/Shaders/Common.hlsl>
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+#include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
+#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
+
+#define NUMTHREADS 64
+
+// Buffers that point to the GPU scene
+StructuredBuffer<GpuSceneRenderableBoundingVolume> g_renderableBoundingVolumes : register(t0);
+StructuredBuffer<GpuSceneRenderable> g_renderables : register(t1);
+StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t2);
+StructuredBuffer<Mat3x4> g_transforms : register(t3);
+StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(t4);
+
+// 1st counter is the visible renderable count, 2nd the visible meshlet count and 3rd the number of threadgroups having been executed
+RWStructuredBuffer<U32> g_counters : register(u0);
+
+RWStructuredBuffer<GpuVisibilityVisibleRenderableDesc> g_visibleRenderables : register(u1); // Indices of visible renderables
+RWStructuredBuffer<GpuVisibilityVisibleMeshletDesc> g_visibleMeshlets : register(u2); // Descriptors of visible meshlets
+
+// One U32 per bucket. For each bucket it's the offset where stage 2 will put the per-bucket results
+RWStructuredBuffer<U32> g_renderablePrefixSums : register(u3);
+RWStructuredBuffer<U32> g_meshletPrefixSums : register(u4);
+
+RWStructuredBuffer<DispatchIndirectArgs> g_stage2IndirectArgs : register(u5); // 2 elements. One for MDI and another for meshlets
+
+RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u6);
+
+#if GATHER_AABBS
+RWStructuredBuffer<U32> g_visibleAabbIndices : register(u7); // Indices of the visible AABBs. The 1st element is the count.
+#endif
+
+#if HASH_VISIBLES
+RWStructuredBuffer<GpuVisibilityHash> g_hash : register(u8);
+#endif
+
+#if DISTANCE_TEST == 0
+ConstantBuffer<FrustumGpuVisibilityUniforms> g_unis : register(b0);
+#else
+ANKI_PUSH_CONSTANTS(DistanceGpuVisibilityUniforms, g_unis)
+#endif
+
+#if HZB_TEST
+Texture2D<Vec4> g_hzbTex : register(t5);
+SamplerState g_nearestAnyClampSampler : register(s0);
+#endif
+
+Bool isVisible(GpuSceneRenderableBoundingVolume bvolume)
+{
+	const Vec3 sphereCenter = (bvolume.m_aabbMin + bvolume.m_aabbMax) * 0.5f;
+	const F32 sphereRadius = bvolume.m_sphereRadius;
+
+#if DISTANCE_TEST == 0
+	// Frustum test
+	//
+	if(!frustumTest(g_unis.m_clipPlanes, sphereCenter, sphereRadius))
+	{
+		return false;
+	}
+
+	// Screen-space AABB calculation and checking
+	//
+	Vec2 minNdc, maxNdc;
+	F32 aabbMinDepth;
+	projectAabb(bvolume.m_aabbMin, bvolume.m_aabbMax, g_unis.m_viewProjectionMat, minNdc, maxNdc, aabbMinDepth);
+
+	if(any(minNdc > 1.0f) || any(maxNdc < -1.0f))
+	{
+		// Outside of the screen
+		return false;
+	}
+
+	const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_unis.m_finalRenderTargetSize;
+	const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_unis.m_finalRenderTargetSize;
+	if(any(round(windowCoordsMin) == round(windowCoordsMax)))
+	{
+		// Doesn't touch the sampling points
+		return false;
+	}
+
+	// HiZ culling
+	//
+#	if HZB_TEST
+	if(cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTex, g_nearestAnyClampSampler))
+	{
+		return false;
+	}
+#	endif // HZB_TEST
+
+#else // DISTANCE_TEST == 1
+	if(!testSphereSphereCollision(sphereCenter, sphereRadius, g_unis.m_pointOfTest, g_unis.m_testRadius))
+	{
+		return false;
+	}
+#endif
+
+	return true;
+}
+
+[numthreads(NUMTHREADS, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX)
+{
+	const U32 bvolumeIdx = svDispatchThreadId.x;
+	const U32 bvolumeCount = getStructuredBufferElementCount(g_renderableBoundingVolumes);
+	Bool skip = (bvolumeIdx >= bvolumeCount);
+
+	GpuSceneRenderableBoundingVolume bvolume;
+	if(!skip)
+	{
+		bvolume = SBUFF(g_renderableBoundingVolumes, bvolumeIdx);
+		skip = !isVisible(bvolume);
+	}
+
+	const U32 maxVisibleMeshlets = getStructuredBufferElementCount(g_visibleMeshlets);
+	const U32 maxVisibleInstances = getStructuredBufferElementCount(g_visibleRenderables);
+
+	if(!skip)
+	{
+		// Object is visible, add it to a bunch of buffers
+
+		// Compute the LOD
+		//
+		const Vec3 sphereCenter = (bvolume.m_aabbMin + bvolume.m_aabbMax) * 0.5f;
+		const F32 sphereRadius = bvolume.m_sphereRadius;
+		const F32 distFromLodPoint = length(sphereCenter - g_unis.m_lodReferencePoint) - sphereRadius;
+
+		U32 lod;
+		if(distFromLodPoint < g_unis.m_maxLodDistances[0])
+		{
+			lod = 0u;
+		}
+		else if(distFromLodPoint < g_unis.m_maxLodDistances[1])
+		{
+			lod = 1u;
+		}
+		else
+		{
+			lod = 2u;
+		}
+
+		// Add the object
+		//
+		const U32 renderableIdx = bvolume.m_renderableIndex_20bit_renderStateBucket_12bit >> 12u;
+		const GpuSceneRenderable renderable = SBUFF(g_renderables, renderableIdx);
+		const U32 renderStateBucket = bvolume.m_renderableIndex_20bit_renderStateBucket_12bit & ((1u << 12u) - 1u);
+		const U32 meshLodIndex = renderable.m_meshLodsIndex + lod;
+		const GpuSceneMeshLod meshLod = SBUFF(g_meshLods, meshLodIndex);
+
+		const Bool isParticleEmitter = renderable.m_particleEmitterIndex < kMaxU32;
+		ANKI_MAYBE_UNUSED(isParticleEmitter);
+
+		const Bool hasMeshlets = meshLod.m_meshletCount != 0u;
+		if(GATHER_MESHLETS && hasMeshlets)
+		{
+			GpuVisibilityVisibleMeshletDesc desc;
+			desc.m_renderableIndex_30bit_renderStageBucket_12bit = (renderableIdx << 12u) | renderStateBucket;
+			desc.m_lod_2bit_meshletIndex_30bit = lod << 30u;
+
+			// X dimension will be fixed later
+			U32 firstMeshletIndex;
+			InterlockedAdd(SBUFF(g_counters, 1), meshLod.m_meshletCount, firstMeshletIndex);
+
+			if(firstMeshletIndex + meshLod.m_meshletCount > maxVisibleMeshlets)
+			{
+				// OoM, do nothing
+			}
+			else
+			{
+				// All good
+
+				// Store the meshlet descriptors
+				for(U32 i = 0; i < meshLod.m_meshletCount; ++i)
+				{
+					SBUFF(g_visibleMeshlets, firstMeshletIndex + i) = desc;
+					++desc.m_lod_2bit_meshletIndex_30bit;
+				}
+
+				// Add to the prefix sum
+				const U32 bucketCount = getStructuredBufferElementCount(g_meshletPrefixSums);
+				for(U32 i = renderStateBucket + 1; i < bucketCount; ++i)
+				{
+					InterlockedAdd(SBUFF(g_meshletPrefixSums, i), meshLod.m_meshletCount);
+				}
+			}
+		}
+		else
+		{
+			// X dimension will be fixed later
+			U32 firstInstance;
+			InterlockedAdd(SBUFF(g_counters, 0), 1, firstInstance);
+
+			if(firstInstance >= maxVisibleInstances)
+			{
+				// OoM, do nothing
+			}
+			else
+			{
+				// All good
+
+				// Store the renderable
+				GpuVisibilityVisibleRenderableDesc visRenderable;
+				visRenderable.m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit = lod << 30u;
+				visRenderable.m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit |= renderableIdx << 10u;
+				visRenderable.m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit |= renderStateBucket;
+				SBUFF(g_visibleRenderables, firstInstance) = visRenderable;
+
+				// Add to the prefix sum
+				const U32 bucketCount = getStructuredBufferElementCount(g_renderablePrefixSums);
+				for(U32 i = renderStateBucket + 1; i < bucketCount; ++i)
+				{
+					InterlockedAdd(SBUFF(g_renderablePrefixSums, i), 1);
+				}
+			}
+		}
+
+#if HASH_VISIBLES
+		// Update the renderables hash
+		{
+			// Transform a random point as a way to get a feel for the transform
+			const Mat3x4 trf = SBUFF(g_transforms, renderable.m_worldTransformsIndex);
+			const Vec3 pt = mul(trf, Vec4(1503.98f, 2006.8f, -1400.16f, 1.0f));
+			const UVec3 ptu = UVec3(asuint(pt.x), asuint(pt.y), asuint(pt.z));
+
+			U32 hash = ptu.x;
+			hash ^= ptu.y;
+			hash ^= ptu.z;
+			hash ^= renderable.m_uuid;
+
+			InterlockedXor(SBUFF(g_hash, 0).m_renderablesHash, hash);
+
+			const Bool deformable = isParticleEmitter || renderable.m_boneTransformsOffset != 0;
+			if(deformable)
+			{
+				SBUFF(g_hash, 0).m_containsDeformable = 1;
+			}
+		}
+#endif
+
+#if GATHER_AABBS
+		U32 index;
+		InterlockedAdd(SBUFF(g_visibleAabbIndices, 0), 1, index);
+		SBUFF(g_visibleAabbIndices, index + 1) = bvolumeIdx;
+#endif
+	}
+
+	// Check if it's the last threadgroup running
+	Bool lastThreadExecuting = false;
+	if(svGroupIndex == 0)
+	{
+		U32 threadgroupIdx;
+		InterlockedAdd(SBUFF(g_counters, 2), 1, threadgroupIdx);
+		const U32 threadgroupCount = (bvolumeCount + NUMTHREADS - 1) / NUMTHREADS;
+		lastThreadExecuting = (threadgroupIdx + 1 == threadgroupCount);
+	}
+
+	// Group-scope barrier: ensure this threadgroup's atomic updates above are complete before the last-group path below reads the counters
+	AllMemoryBarrierWithGroupSync();
+
+	if(lastThreadExecuting)
+	{
+		// This is the last threadgroup to finish: clamp the counters on OoM and build the stage-2 indirect dispatch args
+
+		// Renderables
+		U32 visibleInstancesCount;
+		if(SBUFF(g_counters, 0) <= maxVisibleInstances)
+		{
+			visibleInstancesCount = SBUFF(g_counters, 0);
+		}
+		else
+		{
+			// OoM, fix a few things and inform the CPU
+			visibleInstancesCount = maxVisibleInstances;
+			SBUFF(g_counters, 0) = maxVisibleInstances;
+			InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 1);
+		}
+
+		SBUFF(g_stage2IndirectArgs, 0).m_threadGroupCountX = (visibleInstancesCount + NUMTHREADS - 1) / NUMTHREADS;
+		SBUFF(g_stage2IndirectArgs, 0).m_threadGroupCountY = 1;
+		SBUFF(g_stage2IndirectArgs, 0).m_threadGroupCountZ = 1;
+
+		// Meshlets
+		U32 visibleMeshletCount;
+		if(SBUFF(g_counters, 1) <= maxVisibleMeshlets)
+		{
+			visibleMeshletCount = SBUFF(g_counters, 1);
+		}
+		else
+		{
+			// OoM, fix a few things and inform the CPU
+			visibleMeshletCount = maxVisibleMeshlets;
+			SBUFF(g_counters, 1) = maxVisibleMeshlets;
+			InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 1);
+		}
+
+		SBUFF(g_stage2IndirectArgs, 1).m_threadGroupCountX = (visibleMeshletCount + NUMTHREADS - 1) / NUMTHREADS;
+		SBUFF(g_stage2IndirectArgs, 1).m_threadGroupCountY = 1;
+		SBUFF(g_stage2IndirectArgs, 1).m_threadGroupCountZ = 1;
+
+		// Reset it for the next job
+		SBUFF(g_counters, 2) = 0;
+	}
+}
+
+#pragma anki technique_end comp

+ 261 - 0
AnKi/Shaders/GpuVisibilityStage2.ankiprog

@@ -0,0 +1,261 @@
+// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki mutator HZB_TEST 0 1
+#pragma anki mutator PASSTHROUGH 0 1
+#pragma anki mutator MESH_SHADERS 0 1
+
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+#include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
+#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
+#include <AnKi/Shaders/PackFunctions.hlsl>
+
+#pragma anki technique_start comp Legacy uses_mutators
+
+struct DrawIndirectArgsWithPadding
+{
+	U32 m_vertexCount;
+	U32 m_instanceCount;
+	U32 m_firstVertex;
+	U32 m_firstInstance;
+	U32 m_padding;
+};
+
+// GPU scene
+StructuredBuffer<GpuSceneRenderable> g_renderables : register(t0);
+StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(t1);
+StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t2);
+
+StructuredBuffer<GpuVisibilityVisibleRenderableDesc> g_visibleRenderables : register(t3);
+StructuredBuffer<U32> g_visibleRenderableCount : register(t4);
+StructuredBuffer<U32> g_renderablePrefixSums : register(t5);
+
+// One element per bucket: .x is the index of the bucket's 1st indirect args struct, .y is the max number of args the bucket can hold
+StructuredBuffer<UVec2> g_firstDrawIndirectArgAndCount : register(t6);
+
+// These 3 have the same size
+RWStructuredBuffer<UVec4> g_instanceRateRenderables : register(u0);
+RWStructuredBuffer<DrawIndexedIndirectArgs> g_drawIndexedIndirectArgs : register(u1);
+RWStructuredBuffer<DrawIndirectArgsWithPadding> g_drawIndirectArgs : register(u2); // This points to the same buffer as the above
+
+// The MDI counts. One for each render state bucket
+RWStructuredBuffer<U32> g_mdiDrawCounts : register(u3);
+
+RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u4);
+
+[numthreads(64, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	if(svDispatchThreadId.x >= g_visibleRenderableCount[0])
+	{
+		return;
+	}
+
+	const GpuVisibilityVisibleRenderableDesc desc = SBUFF(g_visibleRenderables, svDispatchThreadId.x);
+
+	const U32 renderStateBucket = desc.m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit & ((1u << 10u) - 1u);
+	const U32 renderableIdx = (desc.m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit >> 10u) & ((1u << 20u) - 1u);
+	const U32 lod = desc.m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit >> 30u;
+
+	const GpuSceneRenderable renderable = SBUFF(g_renderables, renderableIdx);
+	const U32 meshLodIndex = renderable.m_meshLodsIndex + lod;
+	const GpuSceneMeshLod meshLod = SBUFF(g_meshLods, meshLodIndex);
+
+	const Bool isParticleEmitter = renderable.m_particleEmitterIndex < kMaxU32;
+
+	U32 bucketLocalIndex;
+	InterlockedAdd(SBUFF(g_mdiDrawCounts, renderStateBucket), 1u, bucketLocalIndex);
+
+	const U32 instanceIndex = bucketLocalIndex + SBUFF(g_renderablePrefixSums, renderStateBucket);
+	const U32 indirectArgsIndex = bucketLocalIndex + SBUFF(g_firstDrawIndirectArgAndCount, renderStateBucket).x;
+
+	if(bucketLocalIndex >= SBUFF(g_firstDrawIndirectArgAndCount, renderStateBucket).y)
+	{
+		// OoM, try to recover and inform the CPU
+		U32 orig;
+		InterlockedExchange(SBUFF(g_mdiDrawCounts, renderStateBucket), SBUFF(g_firstDrawIndirectArgAndCount, renderStateBucket).y, orig);
+		InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 2);
+	}
+	else if(!isParticleEmitter)
+	{
+		// Regular renderables are always indexed
+
+		DrawIndexedIndirectArgs indirect;
+		indirect.m_indexCount = meshLod.m_indexCount;
+		indirect.m_instanceCount = 1;
+		indirect.m_firstIndex = meshLod.m_firstIndex;
+		indirect.m_vertexOffset = 0;
+		indirect.m_firstInstance = instanceIndex;
+		SBUFF(g_drawIndexedIndirectArgs, indirectArgsIndex) = indirect;
+
+		UVec4 instanceVertex;
+		instanceVertex.x = renderable.m_worldTransformsIndex;
+		instanceVertex.y = renderable.m_uniformsOffset;
+		instanceVertex.z = meshLodIndex;
+		instanceVertex.w = renderable.m_boneTransformsOffset;
+		SBUFF(g_instanceRateRenderables, instanceIndex) = instanceVertex;
+	}
+	else
+	{
+		const GpuSceneParticleEmitter emitter = SBUFF(g_particleEmitters, renderable.m_particleEmitterIndex);
+
+		DrawIndirectArgsWithPadding indirect;
+		indirect.m_vertexCount = emitter.m_aliveParticleCount * meshLod.m_indexCount;
+		indirect.m_instanceCount = 1;
+		indirect.m_firstVertex = 0;
+		indirect.m_firstInstance = instanceIndex;
+		indirect.m_padding = 0;
+		SBUFF(g_drawIndirectArgs, indirectArgsIndex) = indirect;
+
+		UVec4 instanceVertex;
+		instanceVertex.x = renderable.m_worldTransformsIndex;
+		instanceVertex.y = renderable.m_uniformsOffset;
+		instanceVertex.z = meshLodIndex;
+		instanceVertex.w = renderable.m_particleEmitterIndex;
+		SBUFF(g_instanceRateRenderables, instanceIndex) = instanceVertex;
+	}
+}
+
+#pragma anki technique_end comp Legacy
+
+#pragma anki technique_start comp Meshlets uses_mutators HZB_TEST PASSTHROUGH MESH_SHADERS
+
+#define MESHLET_BACKFACE_CULLING 1
+#define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
+#define MESHLET_NO_SAMPLING_POINT_CULLING 1
+#define MESHLET_HZB_CULLING HZB_TEST
+
+// GPU scene
+StructuredBuffer<GpuSceneRenderable> g_renderables : register(t0);
+StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t1);
+StructuredBuffer<Mat3x4> g_transforms : register(t2);
+
+// UGB
+StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes : register(t3);
+
+#if MESHLET_HZB_CULLING
+Texture2D<Vec4> g_hzbTexture : register(t4);
+SamplerState g_nearestClampSampler : register(s0);
+#endif
+
+// Prev stage results
+StructuredBuffer<U32> g_counters : register(t5); // 2nd element is the visible meshlet count
+StructuredBuffer<U32> g_meshletPrefixSums : register(t6);
+StructuredBuffer<GpuVisibilityVisibleMeshletDesc> g_visibleMeshlets : register(t7);
+
+// New results
+#if MESH_SHADERS
+RWStructuredBuffer<DispatchIndirectArgs> g_dispatchMeshIndirectArgs : register(u0);
+#else
+RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArgs : register(u0);
+#endif
+RWStructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances : register(u1);
+
+RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u2);
+
+struct Consts
+{
+	Mat4 m_viewProjectionMatrix;
+
+	Vec3 m_cameraPos;
+	U32 m_padding1;
+
+	Vec2 m_viewportSizef;
+	UVec2 m_padding2;
+};
+ANKI_PUSH_CONSTANTS(Consts, g_unis)
+
+[numthreads(64, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	const U32 visibleMeshlets = SBUFF(g_counters, 1);
+	if(svDispatchThreadId >= visibleMeshlets)
+	{
+		return;
+	}
+
+	const GpuVisibilityVisibleMeshletDesc desc = SBUFF(g_visibleMeshlets, svDispatchThreadId);
+
+	const U32 renderableIdx = desc.m_renderableIndex_30bit_renderStageBucket_12bit >> 12u;
+	const U32 renderStateBucket = desc.m_renderableIndex_30bit_renderStageBucket_12bit & ((1u << 12u) - 1u);
+	const U32 lod = desc.m_lod_2bit_meshletIndex_30bit >> 30u;
+	const U32 meshletIdx = desc.m_lod_2bit_meshletIndex_30bit & ((1u << 30u) - 1u);
+
+	const GpuSceneRenderable renderable = SBUFF(g_renderables, renderableIdx);
+	const GpuSceneMeshLod meshLod = SBUFF(g_meshLods, renderable.m_meshLodsIndex + lod);
+	const MeshletBoundingVolume meshletBoundingVol = SBUFF(g_meshletBoundingVolumes, meshLod.m_firstMeshletBoundingVolume + meshletIdx);
+
+	// Meshlet culling
+	Bool cull = false;
+
+#if !PASSTHROUGH
+	const Mat3x4 worldTransform = SBUFF(g_transforms, renderable.m_worldTransformsIndex);
+
+#	if MESHLET_BACKFACE_CULLING
+	const Vec4 coneDirAndAng = unpackSnorm4x8(meshletBoundingVol.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
+	cull = cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform, g_unis.m_cameraPos);
+#	endif
+
+	const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
+	const Mat4 mvp = mul(g_unis.m_viewProjectionMatrix, wordTransform4);
+
+	Vec2 minNdc, maxNdc;
+	F32 aabbMinDepth;
+	projectAabb(meshletBoundingVol.m_aabbMin, meshletBoundingVol.m_aabbMax, mvp, minNdc, maxNdc, aabbMinDepth);
+
+#	if MESHLET_OUTSIDE_OF_SCREEN_CULLING
+	// Outside of the screen
+	cull = cull || (any(minNdc > 1.0f) || any(maxNdc < -1.0f));
+#	endif
+
+#	if MESHLET_NO_SAMPLING_POINT_CULLING
+	// Sampling points test
+	const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_unis.m_viewportSizef;
+	const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_unis.m_viewportSizef;
+	cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
+#	endif
+
+#	if MESHLET_HZB_CULLING
+	cull = cull || (renderable.m_boneTransformsOffset == 0u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
+#	endif
+
+#endif // !PASSTHROUGH
+
+	if(!cull)
+	{
+		U32 instanceIdx;
+#if MESH_SHADERS
+		InterlockedAdd(SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountX, 1u, instanceIdx);
+#else
+		InterlockedAdd(SBUFF(g_indirectDrawArgs, renderStateBucket).m_instanceCount, 1u, instanceIdx);
+#endif
+
+		if(instanceIdx == 0)
+		{
+			// First instance, init the drawcall
+#if MESH_SHADERS
+			SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountY = 1u;
+			SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountZ = 1u;
+#else
+			SBUFF(g_indirectDrawArgs, renderStateBucket).m_firstInstance = SBUFF(g_meshletPrefixSums, renderStateBucket);
+#endif
+		}
+
+#if !MESH_SHADERS
+		// Raise the draw's vertex count to cover the largest visible meshlet (3 vertices per primitive)
+		InterlockedMax(SBUFF(g_indirectDrawArgs, renderStateBucket).m_vertexCount, meshletBoundingVol.m_primitiveCount * 3u);
+#endif
+
+		GpuSceneMeshletInstance instance;
+		instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit = renderable.m_worldTransformsIndex << 7u;
+		instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit |= meshletBoundingVol.m_primitiveCount;
+		instance.m_uniformsOffset = renderable.m_uniformsOffset;
+		instance.m_boneTransformsOffsetOrParticleEmitterIndex =
+			(renderable.m_boneTransformsOffset) ? renderable.m_boneTransformsOffset : renderable.m_particleEmitterIndex;
+		instance.m_meshletGeometryDescriptorIndex = meshLod.m_firstMeshletGeometryDescriptor + meshletIdx;
+
+		SBUFF(g_meshletInstances, SBUFF(g_meshletPrefixSums, renderStateBucket) + instanceIdx) = instance;
+	}
+}
+
+#pragma anki technique_end comp Meshlets

+ 1 - 8
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -38,13 +38,6 @@ struct GpuSceneRenderableInstance
 };
 static_assert(sizeof(GpuSceneRenderableInstance) == sizeof(UVec4));
 
-/// Input to a single task shader threadgroup. Something similar to GpuSceneRenderableInstance but for mesh shading.
-struct GpuSceneMeshletGroupInstance
-{
-	U32 m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
-};
-static_assert(kMaxLodCount == 3);
-
 /// Minimal data passed to the vertex shaders in the case of meshlet rendering.
 struct GpuSceneMeshletInstance
 {
@@ -62,7 +55,7 @@ struct GpuSceneRenderableBoundingVolume
 	F32 m_sphereRadius ANKI_CPP_CODE(= 0.0f);
 
 	Vec3 m_aabbMax ANKI_CPP_CODE(= Vec3(kSomeFarDistance));
-	U32 m_renderableIndex_20bit_renderStateBucket_12bit; ///< High 20bits point to a GpuSceneRenderable. Rest 12bits are the render state bucket idx.
+	U32 m_renderableIndex_20bit_renderStateBucket_12bit;
 };
 static_assert(sizeof(GpuSceneRenderableBoundingVolume) == sizeof(Vec4) * 2);
 

+ 11 - 0
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -55,4 +55,15 @@ struct GpuVisibilityHash
 	U32 m_containsDeformable;
 };
 
+struct GpuVisibilityVisibleRenderableDesc
+{
+	U32 m_lod_2bit_renderableIndex_20bit_renderStageBucket_10bit;
+};
+
+struct GpuVisibilityVisibleMeshletDesc
+{
+	U32 m_renderableIndex_30bit_renderStageBucket_12bit;
+	U32 m_lod_2bit_meshletIndex_30bit;
+};
+
 ANKI_END_NAMESPACE

+ 9 - 8
AnKi/Shaders/Include/MaterialTypes.h

@@ -32,26 +32,27 @@ static_assert(sizeof(MaterialGlobalUniforms) == 16 * sizeof(Vec4));
 
 #define ANKI_MATERIAL_REGISTER_MESHLET_BOUNDING_VOLUMES t1 ///< Points to the unified geom buffer
 #define ANKI_MATERIAL_REGISTER_MESHLET_GEOMETRY_DESCRIPTORS t2 ///< Points to the unified geom buffer
-#define ANKI_MATERIAL_REGISTER_MESHLET_GROUPS t3
+#define ANKI_MATERIAL_REGISTER_MESHLET_INSTANCES t3
 #define ANKI_MATERIAL_REGISTER_RENDERABLES t4
 #define ANKI_MATERIAL_REGISTER_MESH_LODS t5
 #define ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS t6
 #define ANKI_MATERIAL_REGISTER_TRANSFORMS t7
 #define ANKI_MATERIAL_REGISTER_HZB_TEXTURE t8
 #define ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER s1
+#define ANKI_MATERIAL_REGISTER_FIRST_MESHLET t9
 
 // For FW shading:
 #define ANKI_MATERIAL_REGISTER_LINEAR_CLAMP_SAMPLER s2
 #define ANKI_MATERIAL_REGISTER_SHADOW_SAMPLER s3
-#define ANKI_MATERIAL_REGISTER_SCENE_DEPTH t9
-#define ANKI_MATERIAL_REGISTER_LIGHT_VOLUME t10
+#define ANKI_MATERIAL_REGISTER_SCENE_DEPTH t10
+#define ANKI_MATERIAL_REGISTER_LIGHT_VOLUME t11
 #define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_UNIFORMS b1
-#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_POINT_LIGHTS t11
-#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_SPOT_LIGHTS t12
-#define ANKI_MATERIAL_REGISTER_SHADOW_ATLAS t13
-#define ANKI_MATERIAL_REGISTER_CLUSTERS t14
+#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_POINT_LIGHTS t12
+#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_SPOT_LIGHTS t13
+#define ANKI_MATERIAL_REGISTER_SHADOW_ATLAS t14
+#define ANKI_MATERIAL_REGISTER_CLUSTERS t15
 
 // Always last because it's variable. Texture buffer bindings pointing to unified geom buffer:
-#define ANKI_MATERIAL_REGISTER_UNIFIED_GEOMETRY_START t15
+#define ANKI_MATERIAL_REGISTER_UNIFIED_GEOMETRY_START t16
 
 ANKI_END_NAMESPACE

+ 6 - 0
AnKi/Shaders/Intellisense.hlsl

@@ -264,6 +264,12 @@ void InterlockedMax(T dest, T value);
 template<typename T>
 void InterlockedExchange(T dest, T value, T& originalValue);
 
+template<typename T>
+void InterlockedOr(T dest, T value, T& originalValue);
+
+template<typename T>
+void InterlockedOr(T dest, T value);
+
 // Wave ops
 
 template<typename T>

+ 11 - 1
AnKi/Shaders/MaterialShadersCommon.hlsl

@@ -23,13 +23,23 @@ ByteAddressBuffer g_gpuScene : register(ANKI_MATERIAL_REGISTER_GPU_SCENE);
 
 StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes : register(ANKI_MATERIAL_REGISTER_MESHLET_BOUNDING_VOLUMES);
 StructuredBuffer<MeshletGeometryDescriptor> g_meshletGeometryDescriptors : register(ANKI_MATERIAL_REGISTER_MESHLET_GEOMETRY_DESCRIPTORS);
-StructuredBuffer<GpuSceneMeshletGroupInstance> g_meshletGroups : register(ANKI_MATERIAL_REGISTER_MESHLET_GROUPS);
+StructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances : register(ANKI_MATERIAL_REGISTER_MESHLET_INSTANCES);
 StructuredBuffer<GpuSceneRenderable> g_renderables : register(ANKI_MATERIAL_REGISTER_RENDERABLES);
 StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(ANKI_MATERIAL_REGISTER_MESH_LODS);
 StructuredBuffer<Mat3x4> g_transforms : register(ANKI_MATERIAL_REGISTER_TRANSFORMS);
 Texture2D<Vec4> g_hzbTexture : register(ANKI_MATERIAL_REGISTER_HZB_TEXTURE);
 SamplerState g_nearestClampSampler : register(ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER);
 StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS);
+StructuredBuffer<U32> g_firstMeshlet : register(ANKI_MATERIAL_REGISTER_FIRST_MESHLET);
+
+#if ANKI_MESH_SHADER
+struct PushConsts
+{
+	UVec3 m_padding;
+	U32 m_bucketIndex;
+};
+ANKI_PUSH_CONSTANTS(PushConsts, g_pushConsts)
+#endif
 
 // FW shading specific
 #if defined(FORWARD_SHADING)

+ 16 - 12
AnKi/Shaders/ShadowMappingVetVisibility.ankiprog

@@ -16,7 +16,8 @@ RWStructuredBuffer<U32> g_mdiDrawCounts : register(u0);
 RWStructuredBuffer<GpuSceneLight> g_lights : register(u1);
 RWStructuredBuffer<GpuSceneLightVisibleRenderablesHash> g_lightHashes : register(u2);
 RWStructuredBuffer<DrawIndirectArgs> g_clearTileIndirectArgs : register(u3);
-RWStructuredBuffer<DispatchIndirectArgs> g_taskShaderIndirectArgs : register(u4);
+RWStructuredBuffer<DispatchIndirectArgs> g_dispatchMeshIndirectArgs : register(u4);
+RWStructuredBuffer<DrawIndirectArgs> g_drawIndirectArgs : register(u5);
 
 struct Uniforms
 {
@@ -56,24 +57,27 @@ groupshared U32 s_renderLight;
 
 	if(s_renderLight == 0)
 	{
-		// Won't render, nullify MDI counts
+		// Nullify indirect args
 
-		U32 renderStateBucketCount, unused;
-		g_mdiDrawCounts.GetDimensions(renderStateBucketCount, unused);
-		ANKI_ASSERT(renderStateBucketCount <= 64);
-
-		if(svGroupIndex < renderStateBucketCount)
+		const U32 mdiCounts = getStructuredBufferElementCount(g_mdiDrawCounts);
+		ANKI_ASSERT(mdiCounts <= 64);
+		if(svGroupIndex < mdiCounts)
 		{
 			g_mdiDrawCounts[svGroupIndex] = 0u;
 		}
 
-		// Do the same for the task shaders
-		g_taskShaderIndirectArgs.GetDimensions(renderStateBucketCount, unused);
-		ANKI_ASSERT(renderStateBucketCount <= 64);
+		const U32 argCount = getStructuredBufferElementCount(g_dispatchMeshIndirectArgs);
+		ANKI_ASSERT(argCount <= 64);
+		if(svGroupIndex < argCount)
+		{
+			g_dispatchMeshIndirectArgs[svGroupIndex].m_threadGroupCountX = 0u;
+		}
 
-		if(svGroupIndex < renderStateBucketCount)
+		const U32 drawCount = getStructuredBufferElementCount(g_drawIndirectArgs);
+		ANKI_ASSERT(drawCount <= 64);
+		if(svGroupIndex < drawCount)
 		{
-			g_taskShaderIndirectArgs[svGroupIndex].m_threadGroupCountX = 0u;
+			g_drawIndirectArgs[svGroupIndex].m_vertexCount = 0;
 		}
 	}
 }

+ 7 - 9
Samples/PhysicsPlayground/Assets/Smoke.ankimtl

@@ -1,13 +1,11 @@
 <?xml version="1.0" encoding="UTF-8" ?>
-<material>
-	<shaderPrograms>
-		<shaderProgram name="ForwardShadingParticles">
-			<mutation>
-				<mutator name="ANIMATED_TEXTURE" value="0"/>
-				<mutator name="LIGHT" value="1"/>
-			</mutation>
-		</shaderProgram>
-	</shaderPrograms>
+<material shadow="0">
+	<shaderProgram name="ForwardShadingParticles">
+		<mutation>
+			<mutator name="ANIMATED_TEXTURE" value="0"/>
+			<mutator name="LIGHT" value="1"/>
+		</mutation>
+	</shaderProgram>
 
 	<inputs>
 		<input name="m_diffuseMap" value="Assets/Smoke.ankitex"/>