Browse Source

Add two-phase occlusion culling

Panagiotis Christopoulos Charitos 1 year ago
parent
commit
2b05fae952

+ 69 - 84
AnKi/Renderer/GBuffer.cpp

@@ -107,11 +107,11 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 
 
 	// Visibility
 	// Visibility
 	GpuVisibilityOutput visOut;
 	GpuVisibilityOutput visOut;
+	FrustumGpuVisibilityInput visIn;
 	{
 	{
-		const CommonMatrices& matrices = (getRenderer().getFrameCount() <= 1) ? ctx.m_matrices : ctx.m_prevMatrices;
+		const CommonMatrices& matrices = ctx.m_matrices;
 		const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 		const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 
-		FrustumGpuVisibilityInput visIn;
 		visIn.m_passesName = "GBuffer";
 		visIn.m_passesName = "GBuffer";
 		visIn.m_technique = RenderingTechnique::kGBuffer;
 		visIn.m_technique = RenderingTechnique::kGBuffer;
 		visIn.m_viewProjectionMatrix = matrices.m_viewProjection;
 		visIn.m_viewProjectionMatrix = matrices.m_viewProjection;
@@ -121,6 +121,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 		visIn.m_hzbRt = &m_runCtx.m_hzbRt;
 		visIn.m_hzbRt = &m_runCtx.m_hzbRt;
 		visIn.m_gatherAabbIndices = g_dbgCVar.get();
 		visIn.m_gatherAabbIndices = g_dbgCVar.get();
 		visIn.m_viewportSize = getRenderer().getInternalResolution();
 		visIn.m_viewportSize = getRenderer().getInternalResolution();
+		visIn.m_twoPhaseOcclusionCulling = getRenderer().getMeshletRenderingType() != MeshletRenderingType::kNone;
 
 
 		getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
 		getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
 
 
@@ -128,8 +129,6 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 		m_runCtx.m_visibleAaabbIndicesBufferDepedency = visOut.m_dependency;
 		m_runCtx.m_visibleAaabbIndicesBufferDepedency = visOut.m_dependency;
 	}
 	}
 
 
-	const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
-
 	// Create RTs
 	// Create RTs
 	Array<RenderTargetHandle, kMaxColorRenderTargets> rts;
 	Array<RenderTargetHandle, kMaxColorRenderTargets> rts;
 	for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
 	for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
@@ -138,104 +137,90 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 		rts[i] = m_runCtx.m_colorRts[i];
 		rts[i] = m_runCtx.m_colorRts[i];
 	}
 	}
 
 
-	RenderTargetHandle sriRt;
-	if(enableVrs)
-	{
-		sriRt = getRenderer().getVrsSriGeneration().getSriRt();
-	}
+	// Create the GBuffer pass
+	auto genGBuffer = [&](Bool firstPass) {
+		GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass((firstPass) ? "GBuffer" : "GBuffer 2nd phase");
 
 
-	// Create pass
-	GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass("GBuffer");
-
-	Array<GraphicsRenderPassTargetDesc, kGBufferColorRenderTargetCount> colorRti;
-	colorRti[0].m_handle = rts[0];
-	colorRti[0].m_loadOperation = RenderTargetLoadOperation::kClear;
-	colorRti[1].m_handle = rts[1];
-	colorRti[1].m_loadOperation = RenderTargetLoadOperation::kClear;
-	colorRti[2].m_handle = rts[2];
-	colorRti[2].m_loadOperation = RenderTargetLoadOperation::kClear;
-	colorRti[3].m_handle = rts[3];
-	colorRti[3].m_loadOperation = RenderTargetLoadOperation::kClear;
-	colorRti[3].m_clearValue.m_colorf = {1.0f, 1.0f, 1.0f, 1.0f};
-	GraphicsRenderPassTargetDesc depthRti(m_runCtx.m_crntFrameDepthRt);
-	depthRti.m_loadOperation = RenderTargetLoadOperation::kClear;
-	depthRti.m_clearValue.m_depthStencil.m_depth = 1.0f;
-	depthRti.m_subresource.m_depthStencilAspect = DepthStencilAspectBit::kDepth;
-
-	pass.setRenderpassInfo(WeakArray{colorRti}, &depthRti, 0, 0, kMaxU32, kMaxU32, (enableVrs) ? &sriRt : nullptr,
-						   (enableVrs) ? getRenderer().getVrsSriGeneration().getSriTexelDimension() : 0,
-						   (enableVrs) ? getRenderer().getVrsSriGeneration().getSriTexelDimension() : 0);
-	pass.setWork([this, &ctx, visOut](RenderPassWorkContext& rgraphCtx) {
-		ANKI_TRACE_SCOPED_EVENT(GBuffer);
-
-		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
-
-		// Set some state, leave the rest to default
-		cmdb.setViewport(0, 0, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
-
-		const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
-		if(enableVrs)
+		const TextureUsageBit rtUsage =
+			(firstPass) ? TextureUsageBit::kFramebufferWrite : (TextureUsageBit::kFramebufferRead | TextureUsageBit::kFramebufferWrite);
+		for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
 		{
 		{
-			// Just set some low value, the attachment will take over
-			cmdb.setVrsRate(VrsRate::k1x1);
+			pass.newTextureDependency(m_runCtx.m_colorRts[i], rtUsage);
 		}
 		}
 
 
-		RenderableDrawerArguments args;
-		args.m_viewMatrix = ctx.m_matrices.m_view;
-		args.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
-		args.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjectionJitter;
-		args.m_previousViewProjectionMatrix = ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection;
-		args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
-		args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
-		args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
+		pass.newTextureDependency(m_runCtx.m_crntFrameDepthRt, rtUsage);
+
+		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(),
+								 BufferUsageBit::kStorageGeometryRead | BufferUsageBit::kStorageFragmentRead);
 
 
-		if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+		// Only add one depedency to the GPU visibility. No need to track all buffers
+		if(visOut.containsDrawcalls())
 		{
 		{
-			const TextureSubresourceDesc subresource = TextureSubresourceDesc::all();
-			Texture* tex;
-			rgraphCtx.getRenderTargetState(m_runCtx.m_hzbRt, subresource, tex);
-			args.m_hzbTexture = TextureView(tex, subresource);
+			pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw | BufferUsageBit::kStorageGeometryRead);
+		}
+		else
+		{
+			// Weird, make a check
+			ANKI_ASSERT(GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount() == 0);
 		}
 		}
 
 
-		args.fill(visOut);
+		const RenderTargetLoadOperation loadOp = (firstPass) ? RenderTargetLoadOperation::kClear : RenderTargetLoadOperation::kLoad;
+		Array<GraphicsRenderPassTargetDesc, kGBufferColorRenderTargetCount> colorRti;
+		for(U32 i = 0; i < 4; ++i)
+		{
+			colorRti[i].m_handle = rts[i];
+			colorRti[i].m_loadOperation = loadOp;
+		}
+		colorRti[3].m_clearValue.m_colorf = {1.0f, 1.0f, 1.0f, 1.0f};
+		GraphicsRenderPassTargetDesc depthRti(m_runCtx.m_crntFrameDepthRt);
+		depthRti.m_loadOperation = loadOp;
+		depthRti.m_clearValue.m_depthStencil.m_depth = 1.0f;
+		depthRti.m_subresource.m_depthStencilAspect = DepthStencilAspectBit::kDepth;
+		pass.setRenderpassInfo(WeakArray{colorRti}, &depthRti, 0, 0, kMaxU32, kMaxU32);
 
 
-		cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
-		getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
-	});
+		pass.setWork([this, &ctx, visOut](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(GBuffer);
 
 
-	for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
-	{
-		pass.newTextureDependency(m_runCtx.m_colorRts[i], TextureUsageBit::kFramebufferWrite);
-	}
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 
-	pass.newTextureDependency(m_runCtx.m_crntFrameDepthRt, TextureUsageBit::kAllFramebuffer);
+			// Set some state, leave the rest to default
+			cmdb.setViewport(0, 0, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
 
 
-	if(enableVrs)
-	{
-		pass.newTextureDependency(sriRt, TextureUsageBit::kFramebufferShadingRate);
-	}
+			RenderableDrawerArguments args;
+			args.m_viewMatrix = ctx.m_matrices.m_view;
+			args.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
+			args.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjectionJitter;
+			args.m_previousViewProjectionMatrix = ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection;
+			args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
+			args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
+			args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
 
 
-	if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
-	{
-		pass.newTextureDependency(m_runCtx.m_hzbRt, TextureUsageBit::kSampledGeometry);
-	}
+			args.fill(visOut);
 
 
-	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageGeometryRead | BufferUsageBit::kStorageFragmentRead);
+			cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
+			getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
+		});
+	};
 
 
-	// Only add one depedency to the GPU visibility. No need to track all buffers
-	if(visOut.containsDrawcalls())
-	{
-		pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kIndirectDraw);
-	}
-	else
-	{
-		// Weird, make a check
-		ANKI_ASSERT(GpuSceneArrays::RenderableBoundingVolumeGBuffer::getSingleton().getElementCount() == 0);
-	}
+	genGBuffer(true);
 
 
-	// HZB generation for the next frame
+	// HZB generation for the 3rd stage or next frame
 	getRenderer().getHzbGenerator().populateRenderGraph(m_runCtx.m_crntFrameDepthRt, getRenderer().getInternalResolution(), m_runCtx.m_hzbRt,
 	getRenderer().getHzbGenerator().populateRenderGraph(m_runCtx.m_crntFrameDepthRt, getRenderer().getInternalResolution(), m_runCtx.m_hzbRt,
 														UVec2(m_hzbRt->getWidth(), m_hzbRt->getHeight()), rgraph);
 														UVec2(m_hzbRt->getWidth(), m_hzbRt->getHeight()), rgraph);
+
+	// 2nd phase
+	if(visIn.m_twoPhaseOcclusionCulling)
+	{
+		// Visibility (again)
+		getRenderer().getGpuVisibility().populateRenderGraphStage3(visIn, visOut);
+
+		// GBuffer again
+		genGBuffer(false);
+
+		// HZB generation for the next frame
+		getRenderer().getHzbGenerator().populateRenderGraph(m_runCtx.m_crntFrameDepthRt, getRenderer().getInternalResolution(), m_runCtx.m_hzbRt,
+															UVec2(m_hzbRt->getWidth(), m_hzbRt->getHeight()), rgraph);
+	}
 }
 }
 
 
 void GBuffer::getDebugRenderTarget(CString rtName, Array<RenderTargetHandle, kMaxDebugRenderTargets>& handles,
 void GBuffer::getDebugRenderTarget(CString rtName, Array<RenderTargetHandle, kMaxDebugRenderTargets>& handles,

+ 13 - 0
AnKi/Renderer/Renderer.cpp

@@ -159,6 +159,19 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 	m_tileCounts.y() = (m_internalResolution.y() + kClusteredShadingTileSize - 1) / kClusteredShadingTileSize;
 	m_tileCounts.y() = (m_internalResolution.y() + kClusteredShadingTileSize - 1) / kClusteredShadingTileSize;
 	m_zSplitCount = g_zSplitCountCVar.get();
 	m_zSplitCount = g_zSplitCountCVar.get();
 
 
+	if(g_meshletRenderingCVar.get() && !GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+	{
+		m_meshletRenderingType = MeshletRenderingType::kSoftware;
+	}
+	else if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+	{
+		m_meshletRenderingType = MeshletRenderingType::kMeshShaders;
+	}
+	else
+	{
+		m_meshletRenderingType = MeshletRenderingType::kNone;
+	}
+
 	// A few sanity checks
 	// A few sanity checks
 	if(m_internalResolution.x() < 64 || m_internalResolution.y() < 64)
 	if(m_internalResolution.x() < 64 || m_internalResolution.y() < 64)
 	{
 	{

+ 11 - 2
AnKi/Renderer/Renderer.h

@@ -43,6 +43,13 @@ public:
 	SamplerPtr m_trilinearClampShadow;
 	SamplerPtr m_trilinearClampShadow;
 };
 };
 
 
+enum class MeshletRenderingType
+{
+	kNone,
+	kMeshShaders,
+	kSoftware
+};
+
 /// Offscreen renderer.
 /// Offscreen renderer.
 class Renderer
 class Renderer
 {
 {
@@ -91,9 +98,9 @@ public:
 		return m_frameCount;
 		return m_frameCount;
 	}
 	}
 
 
-	Bool runSoftwareMeshletRendering() const
+	MeshletRenderingType getMeshletRenderingType() const
 	{
 	{
-		return g_meshletRenderingCVar.get() && !GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
+		return m_meshletRenderingType;
 	}
 	}
 
 
 	/// Create the init info for a 2D texture that will be used as a render target.
 	/// Create the init info for a 2D texture that will be used as a render target.
@@ -226,6 +233,8 @@ private:
 	Mutex m_pipelineQueriesMtx;
 	Mutex m_pipelineQueriesMtx;
 #endif
 #endif
 
 
+	MeshletRenderingType m_meshletRenderingType = MeshletRenderingType::kNone;
+
 	Error initInternal(UVec2 swapchainSize);
 	Error initInternal(UVec2 swapchainSize);
 
 
 	void gpuSceneCopy(RenderingContext& ctx);
 	void gpuSceneCopy(RenderingContext& ctx);

+ 0 - 5
AnKi/Renderer/ShadowMapping.cpp

@@ -672,11 +672,6 @@ void ShadowMapping::createDrawShadowsPass(ConstWeakArray<ShadowSubpassInfo> subp
 			args.m_viewport = UVec4(spass.m_viewport[0], spass.m_viewport[1], spass.m_viewport[2], spass.m_viewport[3]);
 			args.m_viewport = UVec4(spass.m_viewport[0], spass.m_viewport[1], spass.m_viewport[2], spass.m_viewport[3]);
 			args.fill(visOut);
 			args.fill(visOut);
 
 
-			if(spass.m_hzbRt.isValid())
-			{
-				args.m_hzbTexture = rgraphCtx.createTextureView(spass.m_hzbRt, TextureSubresourceDesc::all());
-			}
-
 			getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 			getRenderer().getRenderableDrawer().drawMdi(args, cmdb);
 		}
 		}
 	});
 	});

+ 0 - 5
AnKi/Renderer/Utils/Drawer.cpp

@@ -44,8 +44,6 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 		ANKI_ASSERT(args.m_viewport != UVec4(0u));
 		ANKI_ASSERT(args.m_viewport != UVec4(0u));
 		globalUniforms->m_viewport = Vec4(args.m_viewport);
 		globalUniforms->m_viewport = Vec4(args.m_viewport);
 
 
-		globalUniforms->m_enableHzbTesting = args.m_hzbTexture.isValid();
-
 		cmdb.bindUniformBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_GLOBAL_UNIFORMS), globalUniformsToken);
 		cmdb.bindUniformBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_GLOBAL_UNIFORMS), globalUniformsToken);
 	}
 	}
 
 
@@ -71,9 +69,6 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESH_LODS), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_MESH_LODS), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_TRANSFORMS), GpuSceneArrays::Transform::getSingleton().getBufferView());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_TRANSFORMS), GpuSceneArrays::Transform::getSingleton().getBufferView());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS), GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
 	cmdb.bindStorageBuffer(ANKI_REG(ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS), GpuSceneArrays::ParticleEmitter::getSingleton().getBufferViewSafe());
-	cmdb.bindTexture(ANKI_REG(ANKI_MATERIAL_REGISTER_HZB_TEXTURE),
-					 (args.m_hzbTexture.isValid()) ? args.m_hzbTexture
-												   : TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::all()));
 	cmdb.bindSampler(ANKI_REG(ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER), getRenderer().getSamplers().m_nearestNearestClamp.get());
 	cmdb.bindSampler(ANKI_REG(ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER), getRenderer().getSamplers().m_nearestNearestClamp.get());
 
 
 	if(args.m_mesh.m_firstMeshletBuffer.isValid())
 	if(args.m_mesh.m_firstMeshletBuffer.isValid())

+ 0 - 2
AnKi/Renderer/Utils/Drawer.h

@@ -26,8 +26,6 @@ public:
 
 
 	UVec4 m_viewport;
 	UVec4 m_viewport;
 
 
-	TextureView m_hzbTexture; ///< Optional.
-
 	Sampler* m_sampler = nullptr;
 	Sampler* m_sampler = nullptr;
 
 
 	RenderingTechnique m_renderingTechinuqe = RenderingTechnique::kCount;
 	RenderingTechnique m_renderingTechinuqe = RenderingTechnique::kCount;

+ 220 - 84
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -151,8 +151,9 @@ Error GpuVisibility::init()
 		}
 		}
 	}
 	}
 
 
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2.ankiprogbin", {{"HZB_TEST", 0}, {"PASSTHROUGH", 0}, {"MESH_SHADERS", 0}},
-								 m_2ndStageProg, m_gatherGrProg, "Legacy"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
+								 {{"HZB_TEST", 0}, {"PASSTHROUGH", 0}, {"MESH_SHADERS", 0}, {"STORE_MESHLETS_FAILED_HZB", 1}}, m_2ndStageProg,
+								 m_gatherGrProg, "Legacy"));
 
 
 	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
 	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
 	{
 	{
@@ -160,9 +161,16 @@ Error GpuVisibility::init()
 		{
 		{
 			for(MutatorValue meshShaders = 0; meshShaders < 2; ++meshShaders)
 			for(MutatorValue meshShaders = 0; meshShaders < 2; ++meshShaders)
 			{
 			{
-				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2.ankiprogbin",
-											 {{"HZB_TEST", hzb}, {"PASSTHROUGH", passthrough}, {"MESH_SHADERS", meshShaders}}, m_2ndStageProg,
-											 m_meshletGrProgs[hzb][passthrough][meshShaders], "Meshlets"));
+				for(MutatorValue storeMeshletsFailedHzb = 0; storeMeshletsFailedHzb < 2; ++storeMeshletsFailedHzb)
+				{
+					ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityStage2And3.ankiprogbin",
+												 {{"HZB_TEST", hzb},
+												  {"PASSTHROUGH", passthrough},
+												  {"MESH_SHADERS", meshShaders},
+												  {"STORE_MESHLETS_FAILED_HZB", storeMeshletsFailedHzb}},
+												 m_2ndStageProg, m_meshletGrProgs[hzb][passthrough][meshShaders][storeMeshletsFailedHzb],
+												 "Meshlets"));
+				}
 			}
 			}
 		}
 		}
 	}
 	}
@@ -201,6 +209,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	FrustumTestData* frustumTestData = nullptr;
 	FrustumTestData* frustumTestData = nullptr;
 	DistanceTestData* distTestData = nullptr;
 	DistanceTestData* distTestData = nullptr;
 
 
+	Bool bStoreMeshletsFailedHzb = false;
 	if(distanceBased)
 	if(distanceBased)
 	{
 	{
 		distTestData = newInstance<DistanceTestData>(getRenderer().getFrameMemoryPool());
 		distTestData = newInstance<DistanceTestData>(getRenderer().getFrameMemoryPool());
@@ -219,6 +228,8 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		{
 		{
 			frustumTestData->m_hzbRt = *fin.m_hzbRt;
 			frustumTestData->m_hzbRt = *fin.m_hzbRt;
 		}
 		}
+
+		bStoreMeshletsFailedHzb = fin.m_twoPhaseOcclusionCulling;
 	}
 	}
 
 
 	const Bool firstCallInFrame = m_persistentMemory.m_frameIdx != getRenderer().getFrameCount();
 	const Bool firstCallInFrame = m_persistentMemory.m_frameIdx != getRenderer().getFrameCount();
@@ -263,11 +274,16 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	const U32 bucketCount = buckets.getBucketCount(in.m_technique);
 	const U32 bucketCount = buckets.getBucketCount(in.m_technique);
 	const GpuVisLimits limits = computeLimits(in.m_technique);
 	const GpuVisLimits limits = computeLimits(in.m_technique);
 
 
-	const Bool bHwMeshletRendering = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders && limits.m_maxVisibleMeshlets > 0;
-	const Bool bSwMeshletRendering = g_meshletRenderingCVar.get() && !bHwMeshletRendering && limits.m_maxVisibleMeshlets > 0;
+	const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
+	const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
 	const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
 	const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
 	const Bool bLegacyRendering = limits.m_maxVisibleLegacyRenderables > 0;
 	const Bool bLegacyRendering = limits.m_maxVisibleLegacyRenderables > 0;
 
 
+	if(bStoreMeshletsFailedHzb)
+	{
+		ANKI_ASSERT(bMeshletRendering && frustumTestData->m_hzbRt.isValid());
+	}
+
 	// Allocate persistent memory for the frame
 	// Allocate persistent memory for the frame
 	if(firstCallInFrame)
 	if(firstCallInFrame)
 	{
 	{
@@ -292,6 +308,11 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		m_persistentMemory.m_stage2Meshlet.m_meshletInstances =
 		m_persistentMemory.m_stage2Meshlet.m_meshletInstances =
 			allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
 			allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
 
 
+		m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb =
+			allocateTransientGpuMem(sizeof(GpuVisibilityVisibleRenderableDesc) * maxLimits.m_maxVisibleMeshlets);
+
+		m_persistentMemory.m_stage3.m_meshletInstances = allocateTransientGpuMem(sizeof(GpuSceneMeshletInstance) * maxLimits.m_maxVisibleMeshlets);
+
 		m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
 		m_persistentMemory.m_dep = rgraph.importBuffer((bMeshletRendering) ? m_persistentMemory.m_stage1.m_visibleMeshlets
 																		   : m_persistentMemory.m_stage1.m_visibleRenderables,
 																		   : m_persistentMemory.m_stage1.m_visibleRenderables,
 													   BufferUsageBit::kNone);
 													   BufferUsageBit::kNone);
@@ -334,13 +355,13 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 
 		BufferView m_renderablePrefixSums;
 		BufferView m_renderablePrefixSums;
 		BufferView m_meshletPrefixSums;
 		BufferView m_meshletPrefixSums;
-		BufferView m_stage2IndirectArgs;
+		BufferView m_gpuVisIndirectDispatchArgs;
 
 
 		BufferView m_visibleAabbIndices;
 		BufferView m_visibleAabbIndices;
 		BufferView m_hash;
 		BufferView m_hash;
 	} stage1Mem;
 	} stage1Mem;
 
 
-	stage1Mem.m_counters = allocateTransientGpuMem(sizeof(U32) * 3);
+	stage1Mem.m_counters = allocateTransientGpuMem(sizeof(U32) * U32(GpuVisibilityCounter::kCount));
 	if(in.m_limitMemory)
 	if(in.m_limitMemory)
 	{
 	{
 		PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
 		PtrSize newRange = sizeof(GpuVisibilityVisibleRenderableDesc) * limits.m_maxVisibleLegacyRenderables;
@@ -364,7 +385,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 	}
 	stage1Mem.m_renderablePrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
 	stage1Mem.m_renderablePrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
 	stage1Mem.m_meshletPrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
 	stage1Mem.m_meshletPrefixSums = allocateTransientGpuMem(sizeof(U32) * bucketCount);
-	stage1Mem.m_stage2IndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * 2);
+	stage1Mem.m_gpuVisIndirectDispatchArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::kCount));
 
 
 	if(in.m_gatherAabbIndices)
 	if(in.m_gatherAabbIndices)
 	{
 	{
@@ -396,6 +417,8 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			BufferView m_dispatchMeshIndirectArgs;
 			BufferView m_dispatchMeshIndirectArgs;
 
 
 			BufferView m_meshletInstances;
 			BufferView m_meshletInstances;
+
+			BufferView m_meshletsFailedHzb;
 		} m_meshlet;
 		} m_meshlet;
 	} stage2Mem;
 	} stage2Mem;
 
 
@@ -442,6 +465,53 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		{
 		{
 			stage2Mem.m_meshlet.m_meshletInstances = allocateTransientGpuMem(newRange);
 			stage2Mem.m_meshlet.m_meshletInstances = allocateTransientGpuMem(newRange);
 		}
 		}
+
+		if(bStoreMeshletsFailedHzb)
+		{
+			const PtrSize newRange = sizeof(GpuVisibilityVisibleMeshletDesc) * limits.m_maxVisibleMeshlets;
+			if(in.m_limitMemory)
+			{
+				ANKI_ASSERT(newRange <= m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb.getRange());
+				stage2Mem.m_meshlet.m_meshletsFailedHzb = BufferView(m_persistentMemory.m_stage2Meshlet.m_meshletsFailedHzb).setRange(newRange);
+			}
+			else
+			{
+				stage2Mem.m_meshlet.m_meshletsFailedHzb = allocateTransientGpuMem(newRange);
+			}
+		}
+	}
+
+	// Stage 3 memory
+	class Stage3Mem
+	{
+	public:
+		BufferView m_indirectDrawArgs;
+		BufferView m_dispatchMeshIndirectArgs;
+
+		BufferView m_meshletInstances;
+	} stage3Mem;
+
+	if(bStoreMeshletsFailedHzb)
+	{
+		if(bHwMeshletRendering)
+		{
+			stage3Mem.m_dispatchMeshIndirectArgs = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * bucketCount);
+		}
+		else
+		{
+			stage3Mem.m_indirectDrawArgs = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * bucketCount);
+		}
+
+		const PtrSize newRange = sizeof(GpuSceneMeshletInstance) * limits.m_maxVisibleMeshlets;
+		if(in.m_limitMemory)
+		{
+			ANKI_ASSERT(newRange <= m_persistentMemory.m_stage3.m_meshletInstances.getRange());
+			stage3Mem.m_meshletInstances = BufferView(m_persistentMemory.m_stage3.m_meshletInstances).setRange(newRange);
+		}
+		else
+		{
+			stage3Mem.m_meshletInstances = allocateTransientGpuMem(newRange);
+		}
 	}
 	}
 
 
 	// Setup output
 	// Setup output
@@ -457,9 +527,21 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	{
 	{
 		out.m_mesh.m_firstMeshletBuffer = stage1Mem.m_meshletPrefixSums;
 		out.m_mesh.m_firstMeshletBuffer = stage1Mem.m_meshletPrefixSums;
 	}
 	}
+	if(bStoreMeshletsFailedHzb)
+	{
+		out.m_stage1And2Mem.m_meshletsFailedHzb = stage2Mem.m_meshlet.m_meshletsFailedHzb;
+		out.m_stage1And2Mem.m_counters = stage1Mem.m_counters;
+		out.m_stage1And2Mem.m_meshletPrefixSums = stage1Mem.m_meshletPrefixSums;
+		out.m_stage1And2Mem.m_gpuVisIndirectDispatchArgs = stage1Mem.m_gpuVisIndirectDispatchArgs;
+
+		out.m_stage3Mem.m_indirectDrawArgs = stage3Mem.m_indirectDrawArgs;
+		out.m_stage3Mem.m_dispatchMeshIndirectArgs = stage3Mem.m_dispatchMeshIndirectArgs;
+		out.m_stage3Mem.m_meshletInstances = stage3Mem.m_meshletInstances;
+	}
 
 
 	// Use one buffer as a depedency. Doesn't matter which
 	// Use one buffer as a depedency. Doesn't matter which
-	out.m_dependency = (in.m_limitMemory) ? m_persistentMemory.m_dep : rgraph.importBuffer(stage1Mem.m_stage2IndirectArgs, BufferUsageBit::kNone);
+	out.m_dependency =
+		(in.m_limitMemory) ? m_persistentMemory.m_dep : rgraph.importBuffer(stage1Mem.m_gpuVisIndirectDispatchArgs, BufferUsageBit::kNone);
 
 
 	// Zero some stuff
 	// Zero some stuff
 	const BufferHandle zeroMemDep = rgraph.importBuffer(stage1Mem.m_counters, BufferUsageBit::kNone);
 	const BufferHandle zeroMemDep = rgraph.importBuffer(stage1Mem.m_counters, BufferUsageBit::kNone);
@@ -467,64 +549,46 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis zero: %s", in.m_passesName.cstr()));
 		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis zero: %s", in.m_passesName.cstr()));
 		pass.newBufferDependency(zeroMemDep, BufferUsageBit::kTransferDestination);
 		pass.newBufferDependency(zeroMemDep, BufferUsageBit::kTransferDestination);
 
 
-		pass.setWork([stage1Mem, stage2Mem, this](RenderPassWorkContext& rpass) {
+		pass.setWork([stage1Mem, stage2Mem, stage3Mem, this](RenderPassWorkContext& rpass) {
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 
-			cmdb.pushDebugMarker("Temp counters", Vec3(1.0f, 1.0f, 1.0f));
-			cmdb.fillBuffer(stage1Mem.m_counters, 0);
-			cmdb.popDebugMarker();
-
-			if(stage1Mem.m_renderablePrefixSums.isValid())
-			{
-				cmdb.pushDebugMarker("Renderable prefix sums", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(stage1Mem.m_renderablePrefixSums, 0);
-				cmdb.popDebugMarker();
-			}
-
-			if(stage1Mem.m_meshletPrefixSums.isValid())
-			{
-				cmdb.pushDebugMarker("Meshlet prefix sums", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(stage1Mem.m_meshletPrefixSums, 0);
-				cmdb.popDebugMarker();
-			}
-
-			if(stage2Mem.m_legacy.m_drawIndexedIndirectArgs.isValid())
-			{
-				cmdb.pushDebugMarker("Draw indexed indirect args", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(stage2Mem.m_legacy.m_drawIndexedIndirectArgs, 0);
-				cmdb.popDebugMarker();
-			}
+			constexpr Bool debugZeroing = false; // For debugging purposes zero everything
 
 
-			if(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs.isValid())
-			{
-				cmdb.pushDebugMarker("Dispatch indirect args", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs, 0);
-				cmdb.popDebugMarker();
-			}
-
-			if(stage2Mem.m_meshlet.m_indirectDrawArgs.isValid())
-			{
-				cmdb.pushDebugMarker("Draw indirect args (S/W meshlet rendering)", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(stage2Mem.m_meshlet.m_indirectDrawArgs, 0);
-				cmdb.popDebugMarker();
-			}
-
-			if(stage2Mem.m_legacy.m_mdiDrawCounts.isValid())
-			{
-				cmdb.pushDebugMarker("MDI counts", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(stage2Mem.m_legacy.m_mdiDrawCounts, 0);
-				cmdb.popDebugMarker();
-			}
-
-			cmdb.pushDebugMarker("OoM readback", Vec3(1.0f, 1.0f, 1.0f));
-			cmdb.fillBuffer(m_outOfMemoryReadbackBuffer, 0);
-			cmdb.popDebugMarker();
+#define ANKI_ZERO(buff, alwaysZero) \
+	if((alwaysZero || debugZeroing) && buff.isValid()) \
+	{ \
+		cmdb.pushDebugMarker(#buff, Vec3(1.0f, 1.0f, 1.0f)); \
+		cmdb.fillBuffer(buff, 0); \
+		cmdb.popDebugMarker(); \
+	}
+			ANKI_ZERO(stage1Mem.m_counters, true)
+			ANKI_ZERO(stage1Mem.m_visibleRenderables, false)
+			ANKI_ZERO(stage1Mem.m_visibleMeshlets, false)
+			ANKI_ZERO(stage1Mem.m_renderablePrefixSums, true)
+			ANKI_ZERO(stage1Mem.m_meshletPrefixSums, true)
+			ANKI_ZERO(stage1Mem.m_gpuVisIndirectDispatchArgs, false)
+			ANKI_ZERO(stage1Mem.m_visibleAabbIndices, false)
+			ANKI_ZERO(stage1Mem.m_hash, true)
+
+			ANKI_ZERO(stage2Mem.m_legacy.m_instanceRateRenderables, false)
+			ANKI_ZERO(stage2Mem.m_legacy.m_drawIndexedIndirectArgs, true)
+			ANKI_ZERO(stage2Mem.m_legacy.m_mdiDrawCounts, true)
+			ANKI_ZERO(stage2Mem.m_meshlet.m_indirectDrawArgs, true)
+			ANKI_ZERO(stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs, true)
+			ANKI_ZERO(stage2Mem.m_meshlet.m_meshletInstances, false)
+			ANKI_ZERO(stage2Mem.m_meshlet.m_meshletsFailedHzb, false)
+
+			ANKI_ZERO(stage3Mem.m_indirectDrawArgs, true)
+			ANKI_ZERO(stage3Mem.m_dispatchMeshIndirectArgs, true)
+			ANKI_ZERO(stage3Mem.m_meshletInstances, false)
+
+#undef ANKI_ZERO
 		});
 		});
 	}
 	}
 
 
 	// 1st stage
 	// 1st stage
 	{
 	{
-		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 1st pass: %s", in.m_passesName.cstr()));
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 1st stage: %s", in.m_passesName.cstr()));
 
 
 		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
 		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
 		pass.newBufferDependency(out.m_dependency, BufferUsageBit::kStorageComputeWrite);
 		pass.newBufferDependency(out.m_dependency, BufferUsageBit::kStorageComputeWrite);
@@ -586,7 +650,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			cmdb.bindStorageBuffer(ANKI_REG(u3), (bLegacyRendering) ? stage1Mem.m_renderablePrefixSums : BufferView(&getRenderer().getDummyBuffer()));
 			cmdb.bindStorageBuffer(ANKI_REG(u3), (bLegacyRendering) ? stage1Mem.m_renderablePrefixSums : BufferView(&getRenderer().getDummyBuffer()));
 			cmdb.bindStorageBuffer(ANKI_REG(u4), (bMeshletRendering) ? stage1Mem.m_meshletPrefixSums : BufferView(&getRenderer().getDummyBuffer()));
 			cmdb.bindStorageBuffer(ANKI_REG(u4), (bMeshletRendering) ? stage1Mem.m_meshletPrefixSums : BufferView(&getRenderer().getDummyBuffer()));
 
 
-			cmdb.bindStorageBuffer(ANKI_REG(u5), stage1Mem.m_stage2IndirectArgs);
+			cmdb.bindStorageBuffer(ANKI_REG(u5), stage1Mem.m_gpuVisIndirectDispatchArgs);
 
 
 			cmdb.bindStorageBuffer(ANKI_REG(u6), m_outOfMemoryReadbackBuffer);
 			cmdb.bindStorageBuffer(ANKI_REG(u6), m_outOfMemoryReadbackBuffer);
 
 
@@ -649,7 +713,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 
 	// 2nd stage
 	// 2nd stage
 	{
 	{
-		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 2nd pass: %s", in.m_passesName.cstr()));
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 2nd stage: %s", in.m_passesName.cstr()));
 
 
 		pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kStorageComputeWrite);
 		pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kStorageComputeWrite);
 
 
@@ -659,7 +723,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		}
 		}
 
 
 		pass.setWork([this, stage1Mem, stage2Mem, bLegacyRendering, bMeshletRendering, bHwMeshletRendering, out, frustumTestData,
 		pass.setWork([this, stage1Mem, stage2Mem, bLegacyRendering, bMeshletRendering, bHwMeshletRendering, out, frustumTestData,
-					  lodReferencePoint = in.m_lodReferencePoint](RenderPassWorkContext& rpass) {
+					  lodReferencePoint = in.m_lodReferencePoint, bStoreMeshletsFailedHzb](RenderPassWorkContext& rpass) {
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 
 			if(bLegacyRendering)
 			if(bLegacyRendering)
@@ -691,7 +755,10 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 
 				cmdb.bindStorageBuffer(ANKI_REG(u4), m_outOfMemoryReadbackBuffer);
 				cmdb.bindStorageBuffer(ANKI_REG(u4), m_outOfMemoryReadbackBuffer);
 
 
-				cmdb.dispatchComputeIndirect(BufferView(stage1Mem.m_stage2IndirectArgs).setRange(sizeof(DispatchIndirectArgs)));
+				cmdb.dispatchComputeIndirect(
+					BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
+						.incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageLegacy))
+						.setRange(sizeof(DispatchIndirectArgs)));
 			}
 			}
 
 
 			if(bMeshletRendering)
 			if(bMeshletRendering)
@@ -700,7 +767,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 				const Bool passthrough = frustumTestData == nullptr;
 				const Bool passthrough = frustumTestData == nullptr;
 				const Bool meshShaders = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
 				const Bool meshShaders = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
 
 
-				cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][meshShaders].get());
+				cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][meshShaders][bStoreMeshletsFailedHzb].get());
 
 
 				cmdb.bindStorageBuffer(ANKI_REG(t0), GpuSceneArrays::Renderable::getSingleton().getBufferView());
 				cmdb.bindStorageBuffer(ANKI_REG(t0), GpuSceneArrays::Renderable::getSingleton().getBufferView());
 				cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
 				cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
@@ -714,29 +781,25 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 					cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
 					cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
 				}
 				}
 
 
-				cmdb.bindStorageBuffer(ANKI_REG(t5), stage1Mem.m_counters);
-				cmdb.bindStorageBuffer(ANKI_REG(t6), stage1Mem.m_meshletPrefixSums);
-				cmdb.bindStorageBuffer(ANKI_REG(t7), stage1Mem.m_visibleMeshlets);
+				cmdb.bindStorageBuffer(ANKI_REG(u0), stage1Mem.m_counters);
+				cmdb.bindStorageBuffer(ANKI_REG(t5), stage1Mem.m_meshletPrefixSums);
+				cmdb.bindStorageBuffer(ANKI_REG(t6), stage1Mem.m_visibleMeshlets);
 
 
-				cmdb.bindStorageBuffer(ANKI_REG(u0), (bHwMeshletRendering) ? stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs
+				cmdb.bindStorageBuffer(ANKI_REG(u1), (bHwMeshletRendering) ? stage2Mem.m_meshlet.m_dispatchMeshIndirectArgs
 																		   : stage2Mem.m_meshlet.m_indirectDrawArgs);
 																		   : stage2Mem.m_meshlet.m_indirectDrawArgs);
-				cmdb.bindStorageBuffer(ANKI_REG(u1), stage2Mem.m_meshlet.m_meshletInstances);
+				cmdb.bindStorageBuffer(ANKI_REG(u2), stage2Mem.m_meshlet.m_meshletInstances);
 
 
-				cmdb.bindStorageBuffer(ANKI_REG(u2), m_outOfMemoryReadbackBuffer);
+				cmdb.bindStorageBuffer(ANKI_REG(u3), m_outOfMemoryReadbackBuffer);
 
 
-				if(!passthrough)
+				if(bStoreMeshletsFailedHzb)
 				{
 				{
-					class Consts
-					{
-					public:
-						Mat4 m_viewProjectionMatrix;
-
-						Vec3 m_cameraPos;
-						U32 m_padding1;
+					cmdb.bindStorageBuffer(ANKI_REG(u4), stage2Mem.m_meshlet.m_meshletsFailedHzb);
+					cmdb.bindStorageBuffer(ANKI_REG(u5), stage1Mem.m_gpuVisIndirectDispatchArgs);
+				}
 
 
-						Vec2 m_viewportSizef;
-						UVec2 m_padding2;
-					} consts;
+				if(!passthrough)
+				{
+					GpuVisibilityMeshletUniforms consts;
 					consts.m_viewProjectionMatrix = frustumTestData->m_viewProjMat;
 					consts.m_viewProjectionMatrix = frustumTestData->m_viewProjMat;
 					consts.m_cameraPos = lodReferencePoint;
 					consts.m_cameraPos = lodReferencePoint;
 					consts.m_viewportSizef = Vec2(frustumTestData->m_finalRenderTargetSize);
 					consts.m_viewportSizef = Vec2(frustumTestData->m_finalRenderTargetSize);
@@ -745,13 +808,86 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 				}
 				}
 
 
 				cmdb.dispatchComputeIndirect(
 				cmdb.dispatchComputeIndirect(
-					BufferView(stage1Mem.m_stage2IndirectArgs).incrementOffset(sizeof(DispatchIndirectArgs)).setRange(sizeof(DispatchIndirectArgs)));
+					BufferView(stage1Mem.m_gpuVisIndirectDispatchArgs)
+						.incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k2ndStageMeshlets))
+						.setRange(sizeof(DispatchIndirectArgs)));
 			}
 			}
 		});
 		});
 
 
 	} // end 2nd stage
 	} // end 2nd stage
 }
 }
 
 
+void GpuVisibility::populateRenderGraphStage3(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
+{
+	RenderGraphBuilder& rgraph = *in.m_rgraph;
+
+	// Meshlet rendering needs the renderer to have it enabled and the technique to have a non-zero visible meshlet limit
+	const GpuVisLimits limits = computeLimits(in.m_technique);
+	const Bool bHwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kMeshShaders && limits.m_maxVisibleMeshlets > 0;
+	const Bool bSwMeshletRendering = getRenderer().getMeshletRenderingType() == MeshletRenderingType::kSoftware && limits.m_maxVisibleMeshlets > 0;
+	const Bool bMeshletRendering = bHwMeshletRendering || bSwMeshletRendering;
+
+	if(!bMeshletRendering)
+	{
+		// Nothing to do without meshlets. No pass is created and the output is left untouched
+		return;
+	}
+
+	// m_hzbRt is declared optional in the input but this stage always samples the HZB (hzbTex is hardcoded below). Catch a missing one here
+	// instead of dereferencing a null pointer further down
+	ANKI_ASSERT(in.m_hzbRt != nullptr);
+
+	// Set the output. The mesh outputs are overwritten to point to the 3rd stage buffers
+	if(bHwMeshletRendering)
+	{
+		out.m_mesh.m_dispatchMeshIndirectArgsBuffer = out.m_stage3Mem.m_dispatchMeshIndirectArgs;
+	}
+	else
+	{
+		out.m_mesh.m_drawIndirectArgs = out.m_stage3Mem.m_indirectDrawArgs;
+	}
+	out.m_mesh.m_meshletInstancesBuffer = out.m_stage3Mem.m_meshletInstances;
+
+	// Create the pass
+	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU vis 3rd stage: %s", in.m_passesName.cstr()));
+
+	pass.newBufferDependency(out.m_dependency, BufferUsageBit::kIndirectCompute | BufferUsageBit::kStorageComputeWrite);
+	pass.newBufferDependency(m_persistentMemory.m_dep, BufferUsageBit::kIndirectCompute | BufferUsageBit::kStorageComputeWrite);
+	pass.newTextureDependency(*in.m_hzbRt, TextureUsageBit::kSampledCompute);
+
+	// Capture by value only the fields the work reads (like the 2nd stage does with lodReferencePoint) instead of copying the whole input
+	// together with its raw pointers
+	pass.setWork([this, hzbRt = *in.m_hzbRt, bHwMeshletRendering, stage1And2Mem = out.m_stage1And2Mem, stage3Mem = out.m_stage3Mem,
+				  viewProjectionMatrix = in.m_viewProjectionMatrix, lodReferencePoint = in.m_lodReferencePoint,
+				  viewportSize = in.m_viewportSize](RenderPassWorkContext& rpass) {
+		CommandBuffer& cmdb = *rpass.m_commandBuffer;
+
+		// Program permutation: HZB test on, no passthrough, mesh shaders only for HW meshlet rendering, no storing of meshlets that fail the HZB
+		const Bool hzbTex = true;
+		const Bool passthrough = false;
+		const Bool bStoreMeshletsFailedHzb = false;
+		cmdb.bindShaderProgram(m_meshletGrProgs[hzbTex][passthrough][bHwMeshletRendering][bStoreMeshletsFailedHzb].get());
+
+		// GPU scene arrays and geometry
+		cmdb.bindStorageBuffer(ANKI_REG(t0), GpuSceneArrays::Renderable::getSingleton().getBufferView());
+		cmdb.bindStorageBuffer(ANKI_REG(t1), GpuSceneArrays::MeshLod::getSingleton().getBufferView());
+		cmdb.bindStorageBuffer(ANKI_REG(t2), GpuSceneArrays::Transform::getSingleton().getBufferView());
+
+		cmdb.bindStorageBuffer(ANKI_REG(t3), UnifiedGeometryBuffer::getSingleton().getBufferView());
+
+		// HZB and its sampler
+		rpass.bindTexture(ANKI_REG(t4), hzbRt);
+		cmdb.bindSampler(ANKI_REG(s0), getRenderer().getSamplers().m_nearestNearestClamp.get());
+
+		// Results of the previous stages. The failed-HZB meshlet list takes the t6 slot where the 2nd stage binds the visible meshlets
+		cmdb.bindStorageBuffer(ANKI_REG(u0), stage1And2Mem.m_counters);
+		cmdb.bindStorageBuffer(ANKI_REG(t5), stage1And2Mem.m_meshletPrefixSums);
+		cmdb.bindStorageBuffer(ANKI_REG(t6), stage1And2Mem.m_meshletsFailedHzb);
+
+		// Results of this stage
+		cmdb.bindStorageBuffer(ANKI_REG(u1), (bHwMeshletRendering) ? stage3Mem.m_dispatchMeshIndirectArgs : stage3Mem.m_indirectDrawArgs);
+		cmdb.bindStorageBuffer(ANKI_REG(u2), stage3Mem.m_meshletInstances);
+
+		cmdb.bindStorageBuffer(ANKI_REG(u3), m_outOfMemoryReadbackBuffer);
+
+		GpuVisibilityMeshletUniforms consts;
+		consts.m_viewProjectionMatrix = viewProjectionMatrix;
+		consts.m_cameraPos = lodReferencePoint;
+		consts.m_viewportSizef = Vec2(viewportSize);
+		cmdb.setPushConstants(&consts, sizeof(consts));
+
+		// The threadgroup count is read from the k3rdStageMeshlets element of the indirect dispatch args
+		cmdb.dispatchComputeIndirect(BufferView(stage1And2Mem.m_gpuVisIndirectDispatchArgs)
+										 .incrementOffset(sizeof(DispatchIndirectArgs) * U32(GpuVisibilityIndirectDispatches::k3rdStageMeshlets))
+										 .setRange(sizeof(DispatchIndirectArgs)));
+	});
+}
+
 Error GpuVisibilityNonRenderables::init()
 Error GpuVisibilityNonRenderables::init()
 {
 {
 	ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin", m_prog));
 	ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin", m_prog));

+ 34 - 1
AnKi/Renderer/Utils/GpuVisibility.h

@@ -70,6 +70,8 @@ public:
 	UVec2 m_viewportSize;
 	UVec2 m_viewportSize;
 
 
 	const RenderTargetHandle* m_hzbRt = nullptr; ///< Optional.
 	const RenderTargetHandle* m_hzbRt = nullptr; ///< Optional.
+
+	Bool m_twoPhaseOcclusionCulling = false; ///< If it's false then it's only a single phase. Only applies when meshlet rendering is enabled.
 };
 };
 
 
 /// @memberof GpuVisibility
 /// @memberof GpuVisibility
@@ -83,6 +85,8 @@ public:
 /// @memberof GpuVisibility
 /// @memberof GpuVisibility
 class GpuVisibilityOutput
 class GpuVisibilityOutput
 {
 {
+	friend class GpuVisibility;
+
 public:
 public:
 	BufferHandle m_dependency; ///< Just expose one handle for depedencies. No need to track all buffers. Wait on it using indirect draw usage.
 	BufferHandle m_dependency; ///< Just expose one handle for depedencies. No need to track all buffers. Wait on it using indirect draw usage.
 
 
@@ -116,6 +120,24 @@ public:
 	{
 	{
 		return m_dependency.isValid();
 		return m_dependency.isValid();
 	}
 	}
+
+private:
+	class
+	{
+	public:
+		BufferView m_meshletsFailedHzb;
+		BufferView m_counters;
+		BufferView m_meshletPrefixSums;
+		BufferView m_gpuVisIndirectDispatchArgs;
+	} m_stage1And2Mem; ///< Output of the 2nd (or 1st) stage that will be used in the 3rd
+
+	class
+	{
+	public:
+		BufferView m_indirectDrawArgs;
+		BufferView m_dispatchMeshIndirectArgs;
+		BufferView m_meshletInstances;
+	} m_stage3Mem; ///< Output of the 3rd stage.
 };
 };
 
 
 /// Performs GPU visibility for some pass.
 /// Performs GPU visibility for some pass.
@@ -133,6 +155,10 @@ public:
 		populateRenderGraphInternal(false, in, out);
 		populateRenderGraphInternal(false, in, out);
 	}
 	}
 
 
+	/// Perform the optional stage 3: 2nd phase of the 2-phase occlusion culling.
+	/// @note Not thread-safe.
+	void populateRenderGraphStage3(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out);
+
 	/// Perform simple distance-based visibility testing.
 	/// Perform simple distance-based visibility testing.
 	/// @note Not thread-safe.
 	/// @note Not thread-safe.
 	void populateRenderGraph(DistanceGpuVisibilityInput& in, GpuVisibilityOutput& out)
 	void populateRenderGraph(DistanceGpuVisibilityInput& in, GpuVisibilityOutput& out)
@@ -147,7 +173,7 @@ private:
 
 
 	ShaderProgramResourcePtr m_2ndStageProg;
 	ShaderProgramResourcePtr m_2ndStageProg;
 	ShaderProgramPtr m_gatherGrProg;
 	ShaderProgramPtr m_gatherGrProg;
-	Array3d<ShaderProgramPtr, 2, 2, 2> m_meshletGrProgs;
+	Array4d<ShaderProgramPtr, 2, 2, 2, 2> m_meshletGrProgs;
 
 
 	class
 	class
 	{
 	{
@@ -170,8 +196,15 @@ private:
 		{
 		{
 		public:
 		public:
 			BufferView m_meshletInstances;
 			BufferView m_meshletInstances;
+			BufferView m_meshletsFailedHzb;
 		} m_stage2Meshlet;
 		} m_stage2Meshlet;
 
 
+		class
+		{
+		public:
+			BufferView m_meshletInstances;
+		} m_stage3;
+
 		U64 m_frameIdx = kMaxU64;
 		U64 m_frameIdx = kMaxU64;
 
 
 		BufferHandle m_dep;
 		BufferHandle m_dep;

+ 15 - 30
AnKi/Renderer/Utils/HzbGenerator.cpp

@@ -57,20 +57,9 @@ Error HzbGenerator::init()
 	m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
 	m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
 
 
 	// Zero counter buffer
 	// Zero counter buffer
-	{
-		CommandBufferInitInfo cmdbInit;
-		cmdbInit.m_flags |= CommandBufferFlag::kSmallBatch;
-		CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(cmdbInit);
-
-		cmdb->fillBuffer(BufferView(m_counterBuffer.get()), 0);
-
-		FencePtr fence;
-		cmdb->endRecording();
-		GrManager::getSingleton().submit(cmdb.get(), {}, &fence);
-
-		fence->clientWait(6.0_sec);
-	}
+	zeroBuffer(m_counterBuffer.get());
 
 
+	// Boxes buffer
 	buffInit = BufferInitInfo("HzbBoxIndices");
 	buffInit = BufferInitInfo("HzbBoxIndices");
 	buffInit.m_size = sizeof(kBoxIndices);
 	buffInit.m_size = sizeof(kBoxIndices);
 	buffInit.m_usage = BufferUsageBit::kIndex;
 	buffInit.m_usage = BufferUsageBit::kIndex;
@@ -84,24 +73,19 @@ Error HzbGenerator::init()
 	return Error::kNone;
 	return Error::kNone;
 }
 }
 
 
-void HzbGenerator::populateRenderGraphInternal(ConstWeakArray<DispatchInput> dispatchInputs, U32 firstCounterBufferElement, CString customName,
-											   RenderGraphBuilder& rgraph) const
+void HzbGenerator::populateRenderGraphInternal(ConstWeakArray<DispatchInput> dispatchInputs, CString customName, RenderGraphBuilder& rgraph)
 {
 {
 	const U32 dispatchCount = dispatchInputs.getSize();
 	const U32 dispatchCount = dispatchInputs.getSize();
 
 
-#if ANKI_ASSERTIONS_ENABLED
 	if(m_crntFrame != getRenderer().getFrameCount())
 	if(m_crntFrame != getRenderer().getFrameCount())
 	{
 	{
 		m_crntFrame = getRenderer().getFrameCount();
 		m_crntFrame = getRenderer().getFrameCount();
-		m_counterBufferElementUseMask = 0;
+		m_counterBufferCrntElementCount = 0;
 	}
 	}
 
 
-	for(U32 i = 0; i < dispatchCount; ++i)
-	{
-		ANKI_ASSERT(!(m_counterBufferElementUseMask & (1 << (firstCounterBufferElement + i))));
-		m_counterBufferElementUseMask |= (1 << (firstCounterBufferElement + i));
-	}
-#endif
+	// Reserve one counter element per dispatch. Every dispatch addresses (counterBufferElement + dispatch), so the end of the reserved range
+	// has to stay within kCounterBufferElementCount, not just its first element
+	const U32 counterBufferElement = m_counterBufferCrntElementCount;
+	m_counterBufferCrntElementCount += dispatchCount;
+	ANKI_ASSERT(m_counterBufferCrntElementCount <= kCounterBufferElementCount);
 
 
 	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass((customName.isEmpty()) ? "HZB generation" : customName);
 	NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass((customName.isEmpty()) ? "HZB generation" : customName);
 
 
@@ -115,7 +99,7 @@ void HzbGenerator::populateRenderGraphInternal(ConstWeakArray<DispatchInput> dis
 		dispatchInputsCopy[i] = dispatchInputs[i];
 		dispatchInputsCopy[i] = dispatchInputs[i];
 	}
 	}
 
 
-	pass.setWork([this, dispatchInputsCopy, dispatchCount, firstCounterBufferElement](RenderPassWorkContext& rgraphCtx) {
+	pass.setWork([this, dispatchInputsCopy, dispatchCount, counterBufferElement](RenderPassWorkContext& rgraphCtx) {
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 
 		cmdb.bindShaderProgram(m_genPyramidGrProg.get());
 		cmdb.bindShaderProgram(m_genPyramidGrProg.get());
@@ -166,8 +150,9 @@ void HzbGenerator::populateRenderGraphInternal(ConstWeakArray<DispatchInput> dis
 				++mipsReg.m_bindPoint;
 				++mipsReg.m_bindPoint;
 			}
 			}
 
 
-			cmdb.bindStorageBuffer(
-				ANKI_REG(u0), BufferView(m_counterBuffer.get(), (firstCounterBufferElement + dispatch) * m_counterBufferElementSize, sizeof(U32)));
+			cmdb.bindStorageBuffer(ANKI_REG(u0), BufferView(m_counterBuffer.get())
+													 .incrementOffset((counterBufferElement + dispatch) * m_counterBufferElementSize)
+													 .setRange(sizeof(U32)));
 			rgraphCtx.bindTexture(ANKI_REG(t0), in.m_srcDepthRt, TextureSubresourceDesc::firstSurface(DepthStencilAspectBit::kDepth));
 			rgraphCtx.bindTexture(ANKI_REG(t0), in.m_srcDepthRt, TextureSubresourceDesc::firstSurface(DepthStencilAspectBit::kDepth));
 
 
 			cmdb.dispatchCompute(dispatchThreadGroupCountXY[0], dispatchThreadGroupCountXY[1], 1);
 			cmdb.dispatchCompute(dispatchThreadGroupCountXY[0], dispatchThreadGroupCountXY[1], 1);
@@ -176,17 +161,17 @@ void HzbGenerator::populateRenderGraphInternal(ConstWeakArray<DispatchInput> dis
 }
 }
 
 
 void HzbGenerator::populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
 void HzbGenerator::populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
-									   RenderGraphBuilder& rgraph, CString customName) const
+									   RenderGraphBuilder& rgraph, CString customName)
 {
 {
 	DispatchInput in;
 	DispatchInput in;
 	in.m_dstHzbRt = dstHzbRt;
 	in.m_dstHzbRt = dstHzbRt;
 	in.m_dstHzbRtSize = dstHzbRtSize;
 	in.m_dstHzbRtSize = dstHzbRtSize;
 	in.m_srcDepthRt = srcDepthRt;
 	in.m_srcDepthRt = srcDepthRt;
 	in.m_srcDepthRtSize = srcDepthRtSize;
 	in.m_srcDepthRtSize = srcDepthRtSize;
-	populateRenderGraphInternal({&in, 1}, 0, customName, rgraph);
+	populateRenderGraphInternal({&in, 1}, customName, rgraph);
 }
 }
 
 
-void HzbGenerator::populateRenderGraphDirectionalLight(const HzbDirectionalLightInput& in, RenderGraphBuilder& rgraph) const
+void HzbGenerator::populateRenderGraphDirectionalLight(const HzbDirectionalLightInput& in, RenderGraphBuilder& rgraph)
 {
 {
 	const U32 cascadeCount = in.m_cascadeCount;
 	const U32 cascadeCount = in.m_cascadeCount;
 	ANKI_ASSERT(cascadeCount > 0);
 	ANKI_ASSERT(cascadeCount > 0);
@@ -323,7 +308,7 @@ void HzbGenerator::populateRenderGraphDirectionalLight(const HzbDirectionalLight
 		inputs[i].m_srcDepthRt = depthRts[i];
 		inputs[i].m_srcDepthRt = depthRts[i];
 		inputs[i].m_srcDepthRtSize = cascade.m_hzbRtSize * 2;
 		inputs[i].m_srcDepthRtSize = cascade.m_hzbRtSize * 2;
 	}
 	}
-	populateRenderGraphInternal({&inputs[0], cascadeCount}, 1, "HZB generation shadow cascades", rgraph);
+	populateRenderGraphInternal({&inputs[0], cascadeCount}, "HZB generation shadow cascades", rgraph);
 }
 }
 
 
 } // end namespace anki
 } // end namespace anki

+ 7 - 12
AnKi/Renderer/Utils/HzbGenerator.h

@@ -44,9 +44,9 @@ public:
 	Error init();
 	Error init();
 
 
 	void populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
 	void populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
-							 RenderGraphBuilder& rgraph, CString customName = {}) const;
+							 RenderGraphBuilder& rgraph, CString customName = {});
 
 
-	void populateRenderGraphDirectionalLight(const HzbDirectionalLightInput& in, RenderGraphBuilder& rgraph) const;
+	void populateRenderGraphDirectionalLight(const HzbDirectionalLightInput& in, RenderGraphBuilder& rgraph);
 
 
 private:
 private:
 	class DispatchInput
 	class DispatchInput
@@ -70,20 +70,15 @@ private:
 	SamplerPtr m_maxSampler;
 	SamplerPtr m_maxSampler;
 
 
 	// This class assumes that the populateRenderGraph and the populateRenderGraphDirectionalLight will be called once per frame
 	// This class assumes that the populateRenderGraph and the populateRenderGraphDirectionalLight will be called once per frame
-	static constexpr U32 kCounterBufferElementCount = 1 + kMaxShadowCascades; ///< One for the main pass and a few for shadow cascades
-	U32 m_counterBufferElementSize = 0;
+	static constexpr U32 kCounterBufferElementCount = 2 + kMaxShadowCascades; ///< Two for the main pass and a few for shadow cascades
 	BufferPtr m_counterBuffer;
 	BufferPtr m_counterBuffer;
+	U64 m_crntFrame = 0;
+	U32 m_counterBufferElementSize = 0;
+	U32 m_counterBufferCrntElementCount = 0;
 
 
 	BufferPtr m_boxIndexBuffer;
 	BufferPtr m_boxIndexBuffer;
 
 
-#if ANKI_ASSERTIONS_ENABLED
-	// Some helper things to make sure that we don't re-use the counters inside a frame
-	mutable U64 m_crntFrame = 0;
-	mutable U8 m_counterBufferElementUseMask = 0;
-#endif
-
-	void populateRenderGraphInternal(ConstWeakArray<DispatchInput> dispatchInputs, U32 firstCounterBufferElement, CString customName,
-									 RenderGraphBuilder& rgraph) const;
+	void populateRenderGraphInternal(ConstWeakArray<DispatchInput> dispatchInputs, CString customName, RenderGraphBuilder& rgraph);
 };
 };
 /// @}
 /// @}
 
 

+ 57 - 52
AnKi/Shaders/GpuVisibilityStage1.ankiprog

@@ -47,7 +47,7 @@ RWStructuredBuffer<U32> g_renderablePrefixSums : register(u3);
 RWStructuredBuffer<U32> g_meshletPrefixSums : register(u4);
 RWStructuredBuffer<U32> g_meshletPrefixSums : register(u4);
 #endif
 #endif
 
 
-RWStructuredBuffer<DispatchIndirectArgs> g_stage2IndirectArgs : register(u5); // 2 elements. One for MDI and another for meshlets
+RWStructuredBuffer<DispatchIndirectArgs> g_gpuVisIndirectDispatchArgs : register(u5); // Indexed by GpuVisibilityIndirectDispatches. This stage writes the k2ndStageLegacy and k2ndStageMeshlets elements
 
 
 RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u6);
 RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u6);
 
 
@@ -195,7 +195,7 @@ Bool isVisible(GpuSceneRenderableBoundingVolume bvolume)
 
 
 			// X dimension will be fixed later
 			// X dimension will be fixed later
 			U32 firstMeshletIndex;
 			U32 firstMeshletIndex;
-			InterlockedAdd(SBUFF(g_counters, 1), meshLod.m_meshletCount, firstMeshletIndex);
+			InterlockedAdd(SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsSurvivingStage1Count), meshLod.m_meshletCount, firstMeshletIndex);
 
 
 			if(firstMeshletIndex + meshLod.m_meshletCount > maxVisibleMeshlets)
 			if(firstMeshletIndex + meshLod.m_meshletCount > maxVisibleMeshlets)
 			{
 			{
@@ -225,7 +225,7 @@ Bool isVisible(GpuSceneRenderableBoundingVolume bvolume)
 #if GATHER_LEGACY
 #if GATHER_LEGACY
 			// X dimension will be fixed later
 			// X dimension will be fixed later
 			U32 firstInstance;
 			U32 firstInstance;
-			InterlockedAdd(SBUFF(g_counters, 0), 1, firstInstance);
+			InterlockedAdd(SBUFF(g_counters, (U32)GpuVisibilityCounter::kVisibleRenderableCount), 1, firstInstance);
 
 
 			if(firstInstance >= maxVisibleInstances)
 			if(firstInstance >= maxVisibleInstances)
 			{
 			{
@@ -281,77 +281,82 @@ Bool isVisible(GpuSceneRenderableBoundingVolume bvolume)
 #endif
 #endif
 	}
 	}
 
 
+	// Sync to make sure all the atomic ops have finished before the following code reads them
+	AllMemoryBarrierWithGroupSync();
+
 	// Check if it's the last threadgroup running
 	// Check if it's the last threadgroup running
-	Bool lastThreadExecuting = false;
 	if(svGroupIndex == 0)
 	if(svGroupIndex == 0)
 	{
 	{
 		U32 threadgroupIdx;
 		U32 threadgroupIdx;
-		InterlockedAdd(SBUFF(g_counters, 2), 1, threadgroupIdx);
+		InterlockedAdd(SBUFF(g_counters, (U32)GpuVisibilityCounter::kThreadgroupCount), 1, threadgroupIdx);
 		const U32 threadgroupCount = (bvolumeCount + NUMTHREADS - 1) / NUMTHREADS;
 		const U32 threadgroupCount = (bvolumeCount + NUMTHREADS - 1) / NUMTHREADS;
-		lastThreadExecuting = (threadgroupIdx + 1 == threadgroupCount);
-	}
+		const Bool lastThreadExecuting = (threadgroupIdx + 1 == threadgroupCount);
 
 
-	// Sync to make sure all the atomic ops have finished before the following code reads them
-	AllMemoryBarrierWithGroupSync();
+		if(lastThreadExecuting)
+		{
+			// Last thing executing, fixup some sizes
 
 
-	if(lastThreadExecuting)
-	{
-		// Last thing executing, fixup some sizes
+			DispatchIndirectArgs args;
 
 
-		// Renderables
+			// Renderables
 #if GATHER_LEGACY
 #if GATHER_LEGACY
-		U32 visibleInstancesCount;
-		if(SBUFF(g_counters, 0) <= maxVisibleInstances)
-		{
-			visibleInstancesCount = SBUFF(g_counters, 0);
-		}
-		else
-		{
-			// OoM, fix a few things and inform the CPU
-			visibleInstancesCount = maxVisibleInstances;
-			SBUFF(g_counters, 0) = maxVisibleInstances;
-			InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 1);
-		}
+			U32 visibleInstancesCount = SBUFF(g_counters, (U32)GpuVisibilityCounter::kVisibleRenderableCount);
+			if(visibleInstancesCount > maxVisibleInstances)
+			{
+				// OoM, fix a few things and inform the CPU
+				visibleInstancesCount = maxVisibleInstances;
+				SBUFF(g_counters, (U32)GpuVisibilityCounter::kVisibleRenderableCount) = maxVisibleInstances;
+				InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 1);
+			}
 
 
-		SBUFF(g_stage2IndirectArgs, 0).m_threadGroupCountX = (visibleInstancesCount + NUMTHREADS - 1) / NUMTHREADS;
-		SBUFF(g_stage2IndirectArgs, 0).m_threadGroupCountY = 1;
-		SBUFF(g_stage2IndirectArgs, 0).m_threadGroupCountZ = 1;
+			args.m_threadGroupCountX = (visibleInstancesCount + NUMTHREADS - 1) / NUMTHREADS;
+			args.m_threadGroupCountY = 1;
+			args.m_threadGroupCountZ = 1;
+			SBUFF(g_gpuVisIndirectDispatchArgs, (U32)GpuVisibilityIndirectDispatches::k2ndStageLegacy) = args;
 #endif
 #endif
 
 
-		// Meshlets
+			// Meshlets
 #if GATHER_MESHLETS
 #if GATHER_MESHLETS
-		U32 visibleMeshletCount;
-		if(SBUFF(g_counters, 1) <= maxVisibleMeshlets)
-		{
-			visibleMeshletCount = SBUFF(g_counters, 1);
-		}
-		else
-		{
-			// OoM, fix a few things and inform the CPU
-			visibleMeshletCount = maxVisibleMeshlets;
-			SBUFF(g_counters, 1) = maxVisibleMeshlets;
-			InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 1);
-		}
+			U32 meshletsForStage2Count = SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsSurvivingStage1Count);
+			if(meshletsForStage2Count > maxVisibleMeshlets)
+			{
+				// OoM, fix a few things and inform the CPU
+				meshletsForStage2Count = maxVisibleMeshlets;
+				SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsSurvivingStage1Count) = maxVisibleMeshlets;
+				InterlockedOr(SBUFF(g_outOfMemoryBuffer, 0), 1);
+			}
 
 
-		SBUFF(g_stage2IndirectArgs, 1).m_threadGroupCountX = (visibleMeshletCount + NUMTHREADS - 1) / NUMTHREADS;
-		SBUFF(g_stage2IndirectArgs, 1).m_threadGroupCountY = 1;
-		SBUFF(g_stage2IndirectArgs, 1).m_threadGroupCountZ = 1;
+			args.m_threadGroupCountX = (meshletsForStage2Count + NUMTHREADS - 1) / NUMTHREADS;
+			args.m_threadGroupCountY = 1;
+			args.m_threadGroupCountZ = 1;
+			SBUFF(g_gpuVisIndirectDispatchArgs, (U32)GpuVisibilityIndirectDispatches::k2ndStageMeshlets) = args;
 #endif
 #endif
 
 
-		// Prefix sums
-		for(U32 i = 1; i < bucketCount; ++i)
-		{
+			// Prefix sums. Use atomics because nVidia flickers
+			U32 prevLegacyVal = 0;
+			U32 prevMeshletVal = 0;
+			ANKI_MAYBE_UNUSED(prevMeshletVal);
+			ANKI_MAYBE_UNUSED(prevLegacyVal);
+			for(U32 i = 1; i < bucketCount; ++i)
+			{
+				U32 old;
+
 #if GATHER_LEGACY
 #if GATHER_LEGACY
-			SBUFF(g_renderablePrefixSums, i) += SBUFF(g_renderablePrefixSums, i - 1u);
+				// Equivalent to: g_renderablePrefixSums[i] += g_renderablePrefixSums[i - 1u]
+				InterlockedAdd(SBUFF(g_renderablePrefixSums, i), prevLegacyVal, old);
+				prevLegacyVal += old;
 #endif
 #endif
 
 
 #if GATHER_MESHLETS
 #if GATHER_MESHLETS
-			SBUFF(g_meshletPrefixSums, i) += SBUFF(g_meshletPrefixSums, i - 1u);
+				// Equivalent to: g_meshletPrefixSums[i] += g_meshletPrefixSums[i - 1u]
+				InterlockedAdd(SBUFF(g_meshletPrefixSums, i), prevMeshletVal, old);
+				prevMeshletVal += old;
 #endif
 #endif
-		}
+			}
 
 
-		// Reset it for the next job
-		SBUFF(g_counters, 2) = 0;
+			// Reset it for the next job
+			SBUFF(g_counters, (U32)GpuVisibilityCounter::kThreadgroupCount) = 0;
+		}
 	}
 	}
 }
 }
 
 

+ 119 - 66
AnKi/Shaders/GpuVisibilityStage2.ankiprog → AnKi/Shaders/GpuVisibilityStage2And3.ankiprog

@@ -6,12 +6,16 @@
 #pragma anki mutator HZB_TEST 0 1
 #pragma anki mutator HZB_TEST 0 1
 #pragma anki mutator PASSTHROUGH 0 1
 #pragma anki mutator PASSTHROUGH 0 1
 #pragma anki mutator MESH_SHADERS 0 1
 #pragma anki mutator MESH_SHADERS 0 1
+#pragma anki mutator STORE_MESHLETS_FAILED_HZB 0 1 // Two-phase occlusion culling
 
 
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
 #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
 #include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
 #include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
 #include <AnKi/Shaders/PackFunctions.hlsl>
 #include <AnKi/Shaders/PackFunctions.hlsl>
 
 
+#define NUMTHREADS 64u
+
+// This technique is used for legacy rendering. It gathers the visible renderables and places them into the correct buckets.
 #pragma anki technique_start comp Legacy uses_mutators
 #pragma anki technique_start comp Legacy uses_mutators
 
 
 struct DrawIndirectArgsWithPadding
 struct DrawIndirectArgsWithPadding
@@ -29,7 +33,7 @@ StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(t1);
 StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t2);
 StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(t2);
 
 
 StructuredBuffer<GpuVisibilityVisibleRenderableDesc> g_visibleRenderables : register(t3);
 StructuredBuffer<GpuVisibilityVisibleRenderableDesc> g_visibleRenderables : register(t3);
-StructuredBuffer<U32> g_visibleRenderableCount : register(t4);
+StructuredBuffer<U32> g_counters : register(t4);
 StructuredBuffer<U32> g_renderablePrefixSums : register(t5);
 StructuredBuffer<U32> g_renderablePrefixSums : register(t5);
 
 
 // One for each bucket. Points to the 1st indirect args struct. 2nd element contains the max count
 // One for each bucket. Points to the 1st indirect args struct. 2nd element contains the max count
@@ -45,9 +49,9 @@ RWStructuredBuffer<U32> g_mdiDrawCounts : register(u3);
 
 
 RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u4);
 RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u4);
 
 
-[numthreads(64, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
+[numthreads(NUMTHREADS, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 {
-	if(svDispatchThreadId.x >= g_visibleRenderableCount[0])
+	if(svDispatchThreadId.x >= g_counters[(U32)GpuVisibilityCounter::kVisibleRenderableCount])
 	{
 	{
 		return;
 		return;
 	}
 	}
@@ -119,9 +123,9 @@ RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u4);
 
 
 #pragma anki technique_end comp Legacy
 #pragma anki technique_end comp Legacy
 
 
-#pragma anki technique_start comp Meshlets uses_mutators HZB_TEST PASSTHROUGH MESH_SHADERS
+#pragma anki technique_start comp Meshlets uses_mutators HZB_TEST PASSTHROUGH MESH_SHADERS STORE_MESHLETS_FAILED_HZB
 
 
-#define MESHLET_BACKFACE_CULLING 1
+#define MESHLET_BACKFACE_CULLING 0 // Doesn't work correctly for some reason
 #define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
 #define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
 #define MESHLET_NO_SAMPLING_POINT_CULLING 1
 #define MESHLET_NO_SAMPLING_POINT_CULLING 1
 #define MESHLET_HZB_CULLING HZB_TEST
 #define MESHLET_HZB_CULLING HZB_TEST
@@ -140,60 +144,41 @@ SamplerState g_nearestClampSampler : register(s0);
 #endif
 #endif
 
 
 // Prev stage results
 // Prev stage results
-StructuredBuffer<U32> g_counters : register(t5); // 2nd element is the visible meshlet count
-StructuredBuffer<U32> g_meshletPrefixSums : register(t6);
-StructuredBuffer<GpuVisibilityVisibleMeshletDesc> g_visibleMeshlets : register(t7);
+RWStructuredBuffer<U32> g_counters : register(u0); // 2nd element is the visible meshlet count
+StructuredBuffer<U32> g_meshletPrefixSums : register(t5);
+StructuredBuffer<GpuVisibilityVisibleMeshletDesc> g_visibleMeshlets : register(t6);
 
 
 // New results
 // New results
 #if MESH_SHADERS
 #if MESH_SHADERS
-RWStructuredBuffer<DispatchIndirectArgs> g_dispatchMeshIndirectArgs : register(u0);
+RWStructuredBuffer<DispatchIndirectArgs> g_dispatchMeshIndirectArgs : register(u1);
 #else
 #else
-RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArgs : register(u0);
+RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArgs : register(u1);
 #endif
 #endif
-RWStructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances : register(u1);
+RWStructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances : register(u2);
 
 
-RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u2);
+RWStructuredBuffer<U32> g_outOfMemoryBuffer : register(u3);
 
 
-struct Consts
-{
-	Mat4 m_viewProjectionMatrix;
+#if STORE_MESHLETS_FAILED_HZB
+RWStructuredBuffer<GpuVisibilityVisibleMeshletDesc> g_meshletsFailedHzb : register(u4);
 
 
-	Vec3 m_cameraPos;
-	U32 m_padding1;
+RWStructuredBuffer<DispatchIndirectArgs> g_gpuVisIndirectDispatchArgs : register(u5);
+#endif
 
 
-	Vec2 m_viewportSizef;
-	UVec2 m_padding2;
-};
-ANKI_PUSH_CONSTANTS(Consts, g_unis)
+ANKI_PUSH_CONSTANTS(GpuVisibilityMeshletUniforms, g_unis)
 
 
-[numthreads(64, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID)
+Bool cullMeshlet(GpuSceneRenderable renderable, const MeshletBoundingVolume meshletBoundingVol, out Bool meshletCulledByHzb)
 {
 {
-	const U32 visibleMeshlets = SBUFF(g_counters, 1);
-	if(svDispatchThreadId >= visibleMeshlets)
-	{
-		return;
-	}
-
-	const GpuVisibilityVisibleMeshletDesc desc = SBUFF(g_visibleMeshlets, svDispatchThreadId);
-
-	const U32 renderableIdx = desc.m_renderableIndex_30bit_renderStageBucket_12bit >> 12u;
-	const U32 renderStateBucket = desc.m_renderableIndex_30bit_renderStageBucket_12bit & ((1u << 12u) - 1u);
-	const U32 lod = desc.m_lod_2bit_meshletIndex_30bit >> 30u;
-	const U32 meshletIdx = desc.m_lod_2bit_meshletIndex_30bit & ((1u << 30u) - 1u);
-
-	const GpuSceneRenderable renderable = SBUFF(g_renderables, renderableIdx);
-	const GpuSceneMeshLod meshLod = SBUFF(g_meshLods, renderable.m_meshLodsIndex + lod);
-	const MeshletBoundingVolume meshletBoundingVol = SBUFF(g_meshletBoundingVolumes, meshLod.m_firstMeshletBoundingVolume + meshletIdx);
-
-	// Meshlet culling
-	Bool cull = false;
+	meshletCulledByHzb = false;
 
 
 #if !PASSTHROUGH
 #if !PASSTHROUGH
 	const Mat3x4 worldTransform = SBUFF(g_transforms, renderable.m_worldTransformsIndex);
 	const Mat3x4 worldTransform = SBUFF(g_transforms, renderable.m_worldTransformsIndex);
 
 
 #	if MESHLET_BACKFACE_CULLING
 #	if MESHLET_BACKFACE_CULLING
 	const Vec4 coneDirAndAng = unpackSnorm4x8(meshletBoundingVol.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
 	const Vec4 coneDirAndAng = unpackSnorm4x8(meshletBoundingVol.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
-	cull = cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform, g_unis.m_cameraPos);
+	if(cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform, g_unis.m_cameraPos))
+	{
+		return true;
+	}
 #	endif
 #	endif
 
 
 	const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
 	const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
@@ -205,57 +190,125 @@ ANKI_PUSH_CONSTANTS(Consts, g_unis)
 
 
 #	if MESHLET_OUTSIDE_OF_SCREEN_CULLING
 #	if MESHLET_OUTSIDE_OF_SCREEN_CULLING
 	// Outside of the screen
 	// Outside of the screen
-	cull = cull || (any(minNdc > 1.0f) || any(maxNdc < -1.0f));
+	if(any(minNdc > 1.0f) || any(maxNdc < -1.0f))
+	{
+		return true;
+	}
 #	endif
 #	endif
 
 
 #	if MESHLET_NO_SAMPLING_POINT_CULLING
 #	if MESHLET_NO_SAMPLING_POINT_CULLING
 	// Sampling points test
 	// Sampling points test
 	const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_unis.m_viewportSizef;
 	const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_unis.m_viewportSizef;
 	const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_unis.m_viewportSizef;
 	const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_unis.m_viewportSizef;
-	cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
+	if(any(round(windowCoordsMin) == round(windowCoordsMax)))
+	{
+		return true;
+	}
 #	endif
 #	endif
 
 
 #	if MESHLET_HZB_CULLING
 #	if MESHLET_HZB_CULLING
-	cull = cull || (renderable.m_boneTransformsOffset == 0u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
+	meshletCulledByHzb = (renderable.m_boneTransformsOffset == 0u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
+	return meshletCulledByHzb;
 #	endif
 #	endif
 
 
 #endif // !PASSTHROUGH
 #endif // !PASSTHROUGH
 
 
-	if(!cull)
+	return false;
+}
+
+[numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX)
+{
+	const U32 meshletsSurvivingStage1Count = SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsSurvivingStage1Count);
+	if(svDispatchThreadId < meshletsSurvivingStage1Count)
 	{
 	{
-		U32 instanceIdx;
+		const GpuVisibilityVisibleMeshletDesc desc = SBUFF(g_visibleMeshlets, svDispatchThreadId);
+
+		const U32 renderableIdx = desc.m_renderableIndex_30bit_renderStageBucket_12bit >> 12u;
+		const U32 renderStateBucket = desc.m_renderableIndex_30bit_renderStageBucket_12bit & ((1u << 12u) - 1u);
+		const U32 lod = desc.m_lod_2bit_meshletIndex_30bit >> 30u;
+		const U32 meshletIdx = desc.m_lod_2bit_meshletIndex_30bit & ((1u << 30u) - 1u);
+
+		const GpuSceneRenderable renderable = SBUFF(g_renderables, renderableIdx);
+		const GpuSceneMeshLod meshLod = SBUFF(g_meshLods, renderable.m_meshLodsIndex + lod);
+		const MeshletBoundingVolume meshletBoundingVol = SBUFF(g_meshletBoundingVolumes, meshLod.m_firstMeshletBoundingVolume + meshletIdx);
+
+		// Meshlet culling
+		Bool meshletCulledByHzb;
+		const Bool cull = cullMeshlet(renderable, meshletBoundingVol, meshletCulledByHzb);
+
+		if(!cull)
+		{
+			U32 instanceIdx;
 #if MESH_SHADERS
 #if MESH_SHADERS
-		InterlockedAdd(SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountX, 1u, instanceIdx);
+			InterlockedAdd(SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountX, 1u, instanceIdx);
 #else
 #else
-		InterlockedAdd(SBUFF(g_indirectDrawArgs, renderStateBucket).m_instanceCount, 1u, instanceIdx);
+			InterlockedAdd(SBUFF(g_indirectDrawArgs, renderStateBucket).m_instanceCount, 1u, instanceIdx);
 #endif
 #endif
 
 
-		if(instanceIdx == 0)
-		{
-			// First instance, init the drawcall
+			if(instanceIdx == 0)
+			{
+				// First instance, init the drawcall
 #if MESH_SHADERS
 #if MESH_SHADERS
-			SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountY = 1u;
-			SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountZ = 1u;
+				SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountY = 1u;
+				SBUFF(g_dispatchMeshIndirectArgs, renderStateBucket).m_threadGroupCountZ = 1u;
 #else
 #else
-			SBUFF(g_indirectDrawArgs, renderStateBucket).m_firstInstance = SBUFF(g_meshletPrefixSums, renderStateBucket);
+				SBUFF(g_indirectDrawArgs, renderStateBucket).m_firstInstance = SBUFF(g_meshletPrefixSums, renderStateBucket);
 #endif
 #endif
-		}
+			}
 
 
 #if !MESH_SHADERS
 #if !MESH_SHADERS
-		// Try to limit the vertex size
-		InterlockedMax(SBUFF(g_indirectDrawArgs, renderStateBucket).m_vertexCount, meshletBoundingVol.m_primitiveCount * 3u);
+			// Try to limit the vertex size
+			InterlockedMax(SBUFF(g_indirectDrawArgs, renderStateBucket).m_vertexCount, meshletBoundingVol.m_primitiveCount * 3u);
+#endif
+
+			GpuSceneMeshletInstance instance;
+			instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit = renderable.m_worldTransformsIndex << 7u;
+			instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit |= meshletBoundingVol.m_primitiveCount;
+			instance.m_uniformsOffset = renderable.m_uniformsOffset;
+			instance.m_boneTransformsOffsetOrParticleEmitterIndex = renderable.m_boneTransformsOffset;
+			instance.m_meshletGeometryDescriptorIndex = meshLod.m_firstMeshletGeometryDescriptor + meshletIdx;
+
+			SBUFF(g_meshletInstances, SBUFF(g_meshletPrefixSums, renderStateBucket) + instanceIdx) = instance;
+		}
+
+#if STORE_MESHLETS_FAILED_HZB
+		if(cull && meshletCulledByHzb)
+		{
+			U32 idx;
+			InterlockedAdd(SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsCulledByHzbCount), 1u, idx);
+			SBUFF(g_meshletsFailedHzb, idx) = desc;
+		}
 #endif
 #endif
+	}
 
 
-		GpuSceneMeshletInstance instance;
-		instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit = renderable.m_worldTransformsIndex << 7u;
-		instance.m_worldTransformsIndex_25bit_meshletPrimitiveCount_7bit |= meshletBoundingVol.m_primitiveCount;
-		instance.m_uniformsOffset = renderable.m_uniformsOffset;
-		instance.m_boneTransformsOffsetOrParticleEmitterIndex =
-			(renderable.m_boneTransformsOffset) ? renderable.m_boneTransformsOffset : renderable.m_particleEmitterIndex;
-		instance.m_meshletGeometryDescriptorIndex = meshLod.m_firstMeshletGeometryDescriptor + meshletIdx;
+#if STORE_MESHLETS_FAILED_HZB
+	// Sync to make sure all the atomic ops have finished before the following code reads them
+	AllMemoryBarrierWithGroupSync();
 
 
-		SBUFF(g_meshletInstances, SBUFF(g_meshletPrefixSums, renderStateBucket) + instanceIdx) = instance;
+	// Check if it's the last threadgroup running
+	if(svGroupIndex == 0)
+	{
+		U32 threadgroupIdx;
+		InterlockedAdd(SBUFF(g_counters, (U32)GpuVisibilityCounter::kThreadgroupCount), 1, threadgroupIdx);
+		const U32 threadgroupCount = (meshletsSurvivingStage1Count + NUMTHREADS - 1) / NUMTHREADS;
+		const Bool lastThreadExecuting = (threadgroupIdx + 1 == threadgroupCount);
+
+		if(lastThreadExecuting)
+		{
+			// Last thing executing, prepare stage 3
+
+			const U32 meshletsNeedReTestingCount = SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsCulledByHzbCount);
+
+			DispatchIndirectArgs args;
+			args.m_threadGroupCountX = (meshletsNeedReTestingCount + NUMTHREADS - 1) / NUMTHREADS;
+			args.m_threadGroupCountY = 1;
+			args.m_threadGroupCountZ = 1;
+			SBUFF(g_gpuVisIndirectDispatchArgs, (U32)GpuVisibilityIndirectDispatches::k3rdStageMeshlets) = args;
+
+			SBUFF(g_counters, (U32)GpuVisibilityCounter::kMeshletsSurvivingStage1Count) = meshletsNeedReTestingCount;
+		}
 	}
 	}
+#endif
 }
 }
 
 
 #pragma anki technique_end comp Meshlets
 #pragma anki technique_end comp Meshlets

+ 30 - 0
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -66,4 +66,34 @@ struct GpuVisibilityVisibleMeshletDesc
 	U32 m_lod_2bit_meshletIndex_30bit;
 	U32 m_lod_2bit_meshletIndex_30bit;
 };
 };
 
 
+struct GpuVisibilityMeshletUniforms
+{
+	Mat4 m_viewProjectionMatrix;
+
+	Vec3 m_cameraPos;
+	U32 m_padding1;
+
+	Vec2 m_viewportSizef;
+	UVec2 m_padding2;
+};
+
+enum class GpuVisibilityCounter : U32
+{
+	kVisibleRenderableCount,
+	kMeshletsSurvivingStage1Count,
+	kThreadgroupCount,
+	kMeshletsCulledByHzbCount,
+
+	kCount
+};
+
+enum class GpuVisibilityIndirectDispatches : U32
+{
+	k2ndStageLegacy,
+	k2ndStageMeshlets,
+	k3rdStageMeshlets,
+
+	kCount
+};
+
 ANKI_END_NAMESPACE
 ANKI_END_NAMESPACE

+ 9 - 15
AnKi/Shaders/Include/MaterialTypes.h

@@ -18,13 +18,8 @@ struct MaterialGlobalUniforms
 	Mat3x4 m_cameraTransform;
 	Mat3x4 m_cameraTransform;
 
 
 	Vec4 m_viewport;
 	Vec4 m_viewport;
-
-	U32 m_enableHzbTesting;
-	U32 m_padding0;
-	U32 m_padding1;
-	U32 m_padding2;
 };
 };
-static_assert(sizeof(MaterialGlobalUniforms) == 16 * sizeof(Vec4));
+static_assert(sizeof(MaterialGlobalUniforms) == 15 * sizeof(Vec4));
 
 
 #define ANKI_MATERIAL_REGISTER_TILINEAR_REPEAT_SAMPLER s0
 #define ANKI_MATERIAL_REGISTER_TILINEAR_REPEAT_SAMPLER s0
 #define ANKI_MATERIAL_REGISTER_GLOBAL_UNIFORMS b0
 #define ANKI_MATERIAL_REGISTER_GLOBAL_UNIFORMS b0
@@ -37,22 +32,21 @@ static_assert(sizeof(MaterialGlobalUniforms) == 16 * sizeof(Vec4));
 #define ANKI_MATERIAL_REGISTER_MESH_LODS t5
 #define ANKI_MATERIAL_REGISTER_MESH_LODS t5
 #define ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS t6
 #define ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS t6
 #define ANKI_MATERIAL_REGISTER_TRANSFORMS t7
 #define ANKI_MATERIAL_REGISTER_TRANSFORMS t7
-#define ANKI_MATERIAL_REGISTER_HZB_TEXTURE t8
 #define ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER s1
 #define ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER s1
-#define ANKI_MATERIAL_REGISTER_FIRST_MESHLET t9
+#define ANKI_MATERIAL_REGISTER_FIRST_MESHLET t8
 
 
 // For FW shading:
 // For FW shading:
 #define ANKI_MATERIAL_REGISTER_LINEAR_CLAMP_SAMPLER s2
 #define ANKI_MATERIAL_REGISTER_LINEAR_CLAMP_SAMPLER s2
 #define ANKI_MATERIAL_REGISTER_SHADOW_SAMPLER s3
 #define ANKI_MATERIAL_REGISTER_SHADOW_SAMPLER s3
-#define ANKI_MATERIAL_REGISTER_SCENE_DEPTH t10
-#define ANKI_MATERIAL_REGISTER_LIGHT_VOLUME t11
+#define ANKI_MATERIAL_REGISTER_SCENE_DEPTH t9
+#define ANKI_MATERIAL_REGISTER_LIGHT_VOLUME t10
 #define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_UNIFORMS b1
 #define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_UNIFORMS b1
-#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_POINT_LIGHTS t12
-#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_SPOT_LIGHTS t13
-#define ANKI_MATERIAL_REGISTER_SHADOW_ATLAS t14
-#define ANKI_MATERIAL_REGISTER_CLUSTERS t15
+#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_POINT_LIGHTS t11
+#define ANKI_MATERIAL_REGISTER_CLUSTER_SHADING_SPOT_LIGHTS t12
+#define ANKI_MATERIAL_REGISTER_SHADOW_ATLAS t13
+#define ANKI_MATERIAL_REGISTER_CLUSTERS t14
 
 
 // Always last because it's variable. Texture buffer bindings pointing to unified geom buffer:
 // Always last because it's variable. Texture buffer bindings pointing to unified geom buffer:
-#define ANKI_MATERIAL_REGISTER_UNIFIED_GEOMETRY_START t16
+#define ANKI_MATERIAL_REGISTER_UNIFIED_GEOMETRY_START t15
 
 
 ANKI_END_NAMESPACE
 ANKI_END_NAMESPACE

+ 0 - 1
AnKi/Shaders/MaterialShadersCommon.hlsl

@@ -27,7 +27,6 @@ StructuredBuffer<GpuSceneMeshletInstance> g_meshletInstances : register(ANKI_MAT
 StructuredBuffer<GpuSceneRenderable> g_renderables : register(ANKI_MATERIAL_REGISTER_RENDERABLES);
 StructuredBuffer<GpuSceneRenderable> g_renderables : register(ANKI_MATERIAL_REGISTER_RENDERABLES);
 StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(ANKI_MATERIAL_REGISTER_MESH_LODS);
 StructuredBuffer<GpuSceneMeshLod> g_meshLods : register(ANKI_MATERIAL_REGISTER_MESH_LODS);
 StructuredBuffer<Mat3x4> g_transforms : register(ANKI_MATERIAL_REGISTER_TRANSFORMS);
 StructuredBuffer<Mat3x4> g_transforms : register(ANKI_MATERIAL_REGISTER_TRANSFORMS);
-Texture2D<Vec4> g_hzbTexture : register(ANKI_MATERIAL_REGISTER_HZB_TEXTURE);
 SamplerState g_nearestClampSampler : register(ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER);
 SamplerState g_nearestClampSampler : register(ANKI_MATERIAL_REGISTER_NEAREST_CLAMP_SAMPLER);
 StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS);
 StructuredBuffer<GpuSceneParticleEmitter> g_particleEmitters : register(ANKI_MATERIAL_REGISTER_PARTICLE_EMITTERS);
 StructuredBuffer<U32> g_firstMeshlet : register(ANKI_MATERIAL_REGISTER_FIRST_MESHLET);
 StructuredBuffer<U32> g_firstMeshlet : register(ANKI_MATERIAL_REGISTER_FIRST_MESHLET);

+ 1 - 1
Samples/Common/SampleApp.cpp

@@ -63,7 +63,7 @@ Error SampleApp::userMainLoop(Bool& quit, Second elapsedTime)
 
 
 	if(in.getKey(KeyCode::kY) == 1)
 	if(in.getKey(KeyCode::kY) == 1)
 	{
 	{
-		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "SkyLut") ? "" : "SkyLut");
+		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "GBufferAlbedo") ? "" : "GBufferAlbedo");
 	}
 	}
 
 
 	if(in.getKey(KeyCode::kU) == 1)
 	if(in.getKey(KeyCode::kU) == 1)