Browse Source

Implement SW meshlet culling

Panagiotis Christopoulos Charitos 1 year ago
parent
commit
a566493446
47 changed files with 663 additions and 434 deletions
  1. 1 0
      AnKi/Core/App.cpp
  2. 1 0
      AnKi/Core/App.h
  3. 24 6
      AnKi/Gr/RenderGraph.cpp
  4. 1 1
      AnKi/Gr/RenderGraph.h
  5. 26 2
      AnKi/Renderer/ForwardShading.cpp
  6. 2 1
      AnKi/Renderer/ForwardShading.h
  7. 61 44
      AnKi/Renderer/GBuffer.cpp
  8. 0 2
      AnKi/Renderer/GBuffer.h
  9. 59 13
      AnKi/Renderer/IndirectDiffuseProbes.cpp
  10. 51 6
      AnKi/Renderer/ProbeReflections.cpp
  11. 1 0
      AnKi/Renderer/Renderer.cpp
  12. 12 0
      AnKi/Renderer/Renderer.h
  13. 10 3
      AnKi/Renderer/RendererObject.cpp
  14. 3 1
      AnKi/Renderer/RendererObject.h
  15. 101 96
      AnKi/Renderer/ShadowMapping.cpp
  16. 4 16
      AnKi/Renderer/ShadowMapping.h
  17. 2 2
      AnKi/Renderer/Utils/Drawer.cpp
  18. 1 0
      AnKi/Renderer/Utils/Drawer.h
  19. 122 111
      AnKi/Renderer/Utils/GpuVisibility.cpp
  20. 15 8
      AnKi/Renderer/Utils/GpuVisibility.h
  21. 47 29
      AnKi/Resource/MaterialResource.cpp
  22. 11 1
      AnKi/Resource/MaterialResource.h
  23. 5 2
      AnKi/Resource/MeshResource.cpp
  24. 4 1
      AnKi/Resource/ModelResource.cpp
  25. 9 9
      AnKi/Resource/RenderingKey.h
  26. 1 1
      AnKi/Resource/ShaderProgramResource.h
  27. 3 3
      AnKi/Scene/Components/ModelComponent.cpp
  28. 7 11
      AnKi/Scene/RenderStateBucket.cpp
  29. 2 1
      AnKi/Scene/RenderStateBucket.h
  30. 39 28
      AnKi/Shaders/GBufferGeneric.ankiprog
  31. 18 14
      AnKi/Shaders/GpuVisibilityMeshlet.ankiprog
  32. 1 1
      AnKi/Shaders/Include/ClusteredShadingTypes.h
  33. 1 0
      AnKi/Shaders/Include/Common.h
  34. 2 2
      AnKi/Shaders/Include/MeshTypes.h
  35. 1 1
      AnKi/Shaders/LightShading.ankiprog
  36. 3 16
      AnKi/Shaders/MaterialShadersCommon.hlsl
  37. 1 1
      AnKi/Shaders/ShadowmapsResolve.ankiprog
  38. 10 0
      AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl
  39. 1 1
      AnKi/Shaders/VolumetricLightingAccumulation.ankiprog
  40. BIN
      Samples/SimpleScene/Assets/Mesh_0_d56f58fc33de003f.ankimesh
  41. BIN
      Samples/SimpleScene/Assets/Mesh_1_266a0dd9d2092f46.ankimesh
  42. BIN
      Samples/SimpleScene/Assets/Mesh_2_be53007bec464649.ankimesh
  43. BIN
      Samples/SimpleScene/Assets/Mesh_3_c026fdb5b74773ed.ankimesh
  44. BIN
      Samples/SimpleScene/Assets/Mesh_4_4d4aae6c030c4fd5.ankimesh
  45. BIN
      Samples/SimpleScene/Assets/Mesh_5_629309b27fa549a7.ankimesh
  46. BIN
      Samples/SimpleScene/Assets/Mesh_6_a078cf217893be6f.ankimesh
  47. BIN
      Samples/SimpleScene/Assets/Mesh_7_4b76b132380d8a62.ankimesh

+ 1 - 0
AnKi/Core/App.cpp

@@ -62,6 +62,7 @@ BoolCVar g_verboseLogCVar(CVarSubsystem::kCore, "VerboseLog", false, "Verbose lo
 BoolCVar g_benchmarkModeCVar(CVarSubsystem::kCore, "BenchmarkMode", false, "Run in a benchmark mode. Fixed timestep, unlimited target FPS");
 NumericCVar<U32> g_benchmarkModeFrameCountCVar(CVarSubsystem::kCore, "BenchmarkModeFrameCount", 60 * 60 * 2, 1, kMaxU32,
 											   "How many frames the benchmark will run before it quits");
+BoolCVar g_meshletRenderingCVar(CVarSubsystem::kCore, "MeshletRendering", false, "Do meshlet culling and rendering");
 
 #if ANKI_PLATFORM_MOBILE
 static StatCounter g_maliGpuActiveStatVar(StatCategory::kGpuMisc, "Mali active cycles", StatFlag::kMainThreadUpdates);

+ 1 - 0
AnKi/Core/App.h

@@ -21,6 +21,7 @@ extern NumericCVar<U32> g_windowHeightCVar;
 extern NumericCVar<U32> g_windowFullscreenCVar;
 extern NumericCVar<U32> g_targetFpsCVar;
 extern NumericCVar<U32> g_displayStatsCVar;
+extern BoolCVar g_meshletRenderingCVar;
 extern StatCounter g_cpuTotalTimeStatVar;
 extern StatCounter g_rendererGpuTimeStatVar;
 

+ 24 - 6
AnKi/Gr/RenderGraph.cpp

@@ -161,6 +161,23 @@ public:
 		, m_asBarriersBefore(pool)
 	{
 	}
+
+	Batch(Batch&& b)
+	{
+		*this = std::move(b);
+	}
+
+	Batch& operator=(Batch&& b)
+	{
+		m_passIndices = std::move(b.m_passIndices);
+		m_textureBarriersBefore = std::move(b.m_textureBarriersBefore);
+		m_bufferBarriersBefore = std::move(b.m_bufferBarriersBefore);
+		m_asBarriersBefore = std::move(b.m_asBarriersBefore);
+		m_cmdb = b.m_cmdb;
+		b.m_cmdb = nullptr;
+
+		return *this;
+	}
 };
 
 /// The RenderGraph build context.
@@ -662,7 +679,7 @@ Bool RenderGraph::passHasUnmetDependencies(const BakeContext& ctx, U32 passIdx)
 
 		for(const U32 depPassIdx : ctx.m_passes[passIdx].m_dependsOn)
 		{
-			if(ctx.m_passIsInBatch.get(depPassIdx) == false)
+			if(!ctx.m_passIsInBatch.get(depPassIdx))
 			{
 				// Dependency pass is not in a batch
 				depends = true;
@@ -828,6 +845,7 @@ void RenderGraph::initRenderPassesAndSetDeps(const RenderGraphDescription& descr
 		while(prevPassIdx--)
 		{
 			const RenderPassDescriptionBase& prevPass = *descr.m_passes[prevPassIdx];
+
 			if(passADependsOnB(inPass, prevPass))
 			{
 				outPass.m_dependsOn.emplaceBack(prevPassIdx);
@@ -846,9 +864,7 @@ void RenderGraph::initBatches()
 	Bool setTimestamp = m_ctx->m_gatherStatistics;
 	while(passesAssignedToBatchCount < passCount)
 	{
-		m_ctx->m_batches.emplaceBack(m_ctx->m_as.getMemoryPool().m_pool);
-		Batch& batch = m_ctx->m_batches.getBack();
-
+		Batch batch(m_ctx->m_as.getMemoryPool().m_pool);
 		Bool drawsToPresentable = false;
 
 		for(U32 i = 0; i < passCount; ++i)
@@ -896,11 +912,13 @@ void RenderGraph::initBatches()
 		}
 
 		// Mark batch's passes done
-		for(U32 passIdx : m_ctx->m_batches.getBack().m_passIndices)
+		for(U32 passIdx : batch.m_passIndices)
 		{
 			m_ctx->m_passIsInBatch.set(passIdx);
-			m_ctx->m_passes[passIdx].m_batchIdx = m_ctx->m_batches.getSize() - 1;
+			m_ctx->m_passes[passIdx].m_batchIdx = m_ctx->m_batches.getSize();
 		}
+
+		m_ctx->m_batches.emplaceBack(std::move(batch));
 	}
 }
 

+ 1 - 1
AnKi/Gr/RenderGraph.h

@@ -29,7 +29,7 @@ class RenderGraphDescription;
 
 /// @name RenderGraph constants
 /// @{
-constexpr U32 kMaxRenderGraphPasses = 256;
+constexpr U32 kMaxRenderGraphPasses = 512;
 constexpr U32 kMaxRenderGraphRenderTargets = 64; ///< Max imported or not render targets in RenderGraph.
 constexpr U32 kMaxRenderGraphBuffers = 256;
 constexpr U32 kMaxRenderGraphAccelerationStructures = 32;

+ 26 - 2
AnKi/Renderer/ForwardShading.cpp

@@ -37,9 +37,26 @@ void ForwardShading::populateRenderGraph(RenderingContext& ctx)
 	visIn.m_gatherAabbIndices = g_dbgCVar.get();
 	RenderTargetHandle hzb = getRenderer().getGBuffer().getHzbRt();
 	visIn.m_hzbRt = &hzb;
-	visIn.m_finalRenderTargetSize = getRenderer().getInternalResolution();
+	visIn.m_viewportSize = getRenderer().getInternalResolution();
 
 	getRenderer().getGpuVisibility().populateRenderGraph(visIn, m_runCtx.m_visOut);
+
+	if(getRenderer().runSoftwareMeshletRendering())
+	{
+		GpuMeshletVisibilityInput meshIn;
+		meshIn.m_passesName = "FW shading";
+		meshIn.m_technique = RenderingTechnique::kForward;
+		meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
+		meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
+		meshIn.m_viewportSize = getRenderer().getInternalResolution();
+		meshIn.m_taskShaderIndirectArgsBuffer = m_runCtx.m_visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+		meshIn.m_taskShaderPayloadBuffer = m_runCtx.m_visOut.m_mesh.m_taskShaderPayloadBuffer;
+		meshIn.m_dependency = m_runCtx.m_visOut.m_dependency;
+		meshIn.m_rgraph = &rgraph;
+		meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
+
+		getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, m_runCtx.m_meshVisOut);
+	}
 }
 
 void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgraphCtx)
@@ -83,6 +100,12 @@ void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgr
 		args.m_renderingTechinuqe = RenderingTechnique::kForward;
 		args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
 		args.fillMdi(m_runCtx.m_visOut);
+
+		if(m_runCtx.m_meshVisOut.isFilled())
+		{
+			args.fill(m_runCtx.m_meshVisOut);
+		}
+
 		getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 
 		// Restore state
@@ -107,7 +130,8 @@ void ForwardShading::setDependencies(GraphicsRenderPassDescription& pass)
 
 	if(m_runCtx.m_visOut.containsDrawcalls())
 	{
-		pass.newBufferDependency(m_runCtx.m_visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+		pass.newBufferDependency((m_runCtx.m_meshVisOut.isFilled()) ? m_runCtx.m_meshVisOut.m_dependency : m_runCtx.m_visOut.m_dependency,
+								 BufferUsageBit::kIndirectDraw);
 	}
 }
 

+ 2 - 1
AnKi/Renderer/ForwardShading.h

@@ -36,7 +36,7 @@ public:
 	void getVisibleAabbsBuffer(BufferOffsetRange& visibleAaabbIndicesBuffer, BufferHandle& dep) const
 	{
 		visibleAaabbIndicesBuffer = m_runCtx.m_visOut.m_visibleAaabbIndicesBuffer;
-		dep = m_runCtx.m_visOut.m_someBufferHandle;
+		dep = m_runCtx.m_visOut.m_dependency;
 		ANKI_ASSERT(visibleAaabbIndicesBuffer.m_buffer != nullptr && dep.isValid());
 	}
 
@@ -45,6 +45,7 @@ private:
 	{
 	public:
 		GpuVisibilityOutput m_visOut;
+		GpuMeshletVisibilityOutput m_meshVisOut;
 	} m_runCtx;
 };
 /// @}

+ 61 - 44
AnKi/Renderer/GBuffer.cpp

@@ -100,45 +100,6 @@ Error GBuffer::initInternal()
 	return Error::kNone;
 }
 
-void GBuffer::runInThread(const RenderingContext& ctx, const GpuVisibilityOutput& visOut, RenderPassWorkContext& rgraphCtx) const
-{
-	ANKI_TRACE_SCOPED_EVENT(GBuffer);
-
-	CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
-
-	// Set some state, leave the rest to default
-	cmdb.setViewport(0, 0, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
-	cmdb.setRasterizationOrder(RasterizationOrder::kRelaxed);
-
-	const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
-	if(enableVrs)
-	{
-		// Just set some low value, the attachment will take over
-		cmdb.setVrsRate(VrsRate::k1x1);
-	}
-
-	RenderableDrawerArguments args;
-	args.m_viewMatrix = ctx.m_matrices.m_view;
-	args.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
-	args.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjectionJitter;
-	args.m_previousViewProjectionMatrix = ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection;
-	args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
-	args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
-	args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
-
-	TextureViewPtr hzbView;
-	if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
-	{
-		hzbView = rgraphCtx.createTextureView(m_runCtx.m_hzbRt);
-		args.m_hzbTexture = hzbView.get();
-	}
-
-	args.fillMdi(visOut);
-
-	cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
-	getRenderer().getSceneDrawer().drawMdi(args, cmdb);
-}
-
 void GBuffer::importRenderTargets(RenderingContext& ctx)
 {
 	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
@@ -169,6 +130,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 
 	// Visibility
 	GpuVisibilityOutput visOut;
+	GpuMeshletVisibilityOutput meshletVisOut;
 	{
 		const CommonMatrices& matrices = (getRenderer().getFrameCount() <= 1) ? ctx.m_matrices : ctx.m_prevMatrices;
 		const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
@@ -182,12 +144,29 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 		visIn.m_rgraph = &rgraph;
 		visIn.m_hzbRt = &m_runCtx.m_hzbRt;
 		visIn.m_gatherAabbIndices = g_dbgCVar.get();
-		visIn.m_finalRenderTargetSize = getRenderer().getInternalResolution();
+		visIn.m_viewportSize = getRenderer().getInternalResolution();
 
 		getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
 
 		m_runCtx.m_visibleAaabbIndicesBuffer = visOut.m_visibleAaabbIndicesBuffer;
-		m_runCtx.m_visibleAaabbIndicesBufferDepedency = visOut.m_someBufferHandle;
+		m_runCtx.m_visibleAaabbIndicesBufferDepedency = visOut.m_dependency;
+
+		if(getRenderer().runSoftwareMeshletRendering())
+		{
+			GpuMeshletVisibilityInput meshIn;
+			meshIn.m_passesName = "GBuffer";
+			meshIn.m_technique = RenderingTechnique::kGBuffer;
+			meshIn.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
+			meshIn.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
+			meshIn.m_viewportSize = getRenderer().getInternalResolution();
+			meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+			meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+			meshIn.m_dependency = visOut.m_dependency;
+			meshIn.m_rgraph = &rgraph;
+			meshIn.m_hzbRt = getRenderer().getGBuffer().getHzbRt();
+
+			getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+		}
 	}
 
 	const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
@@ -230,8 +209,46 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 
 	pass.setFramebufferInfo(m_fbDescr, ConstWeakArray<RenderTargetHandle>(&rts[0], kGBufferColorRenderTargetCount), m_runCtx.m_crntFrameDepthRt,
 							sriRt);
-	pass.setWork(1, [this, &ctx, visOut](RenderPassWorkContext& rgraphCtx) {
-		runInThread(ctx, visOut, rgraphCtx);
+	pass.setWork(1, [this, &ctx, visOut, meshletVisOut](RenderPassWorkContext& rgraphCtx) {
+		ANKI_TRACE_SCOPED_EVENT(GBuffer);
+
+		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+		// Set some state, leave the rest to default
+		cmdb.setViewport(0, 0, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
+		cmdb.setRasterizationOrder(RasterizationOrder::kRelaxed);
+
+		const Bool enableVrs = GrManager::getSingleton().getDeviceCapabilities().m_vrs && g_vrsCVar.get() && g_gbufferVrsCVar.get();
+		if(enableVrs)
+		{
+			// Just set some low value, the attachment will take over
+			cmdb.setVrsRate(VrsRate::k1x1);
+		}
+
+		RenderableDrawerArguments args;
+		args.m_viewMatrix = ctx.m_matrices.m_view;
+		args.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
+		args.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjectionJitter;
+		args.m_previousViewProjectionMatrix = ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection;
+		args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
+		args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
+		args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
+
+		TextureViewPtr hzbView;
+		if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+		{
+			hzbView = rgraphCtx.createTextureView(m_runCtx.m_hzbRt);
+			args.m_hzbTexture = hzbView.get();
+		}
+
+		args.fillMdi(visOut);
+		if(meshletVisOut.isFilled())
+		{
+			args.fill(meshletVisOut);
+		}
+
+		cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
+		getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 	});
 
 	for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
@@ -255,7 +272,7 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kUavGeometryRead | BufferUsageBit::kUavFragmentRead);
 
 	// Only add one depedency to the GPU visibility. No need to track all buffers
-	pass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+	pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency, BufferUsageBit::kIndirectDraw);
 
 	// HZB generation for the next frame
 	getRenderer().getHzbGenerator().populateRenderGraph(m_runCtx.m_crntFrameDepthRt, getRenderer().getInternalResolution(), m_runCtx.m_hzbRt,

+ 0 - 2
AnKi/Renderer/GBuffer.h

@@ -89,8 +89,6 @@ private:
 	} m_runCtx;
 
 	Error initInternal();
-
-	void runInThread(const RenderingContext& ctx, const GpuVisibilityOutput& visOut, RenderPassWorkContext& rgraphCtx) const;
 };
 /// @}
 

+ 59 - 13
AnKi/Renderer/IndirectDiffuseProbes.cpp

@@ -202,6 +202,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 		{
 			// GBuffer visibility
 			GpuVisibilityOutput visOut;
+			GpuMeshletVisibilityOutput meshletVisOut;
 			Frustum frustum;
 			{
 				frustum.setPerspective(kClusterObjectFrustumNearPlane, probeToRefresh->getRenderRadius(), kPi / 2.0f, kPi / 2.0f);
@@ -211,15 +212,31 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 				FrustumGpuVisibilityInput visIn;
-				visIn.m_passesName = computeTempPassName("GI: GBuffer", cellIdx * 100 + f);
+				visIn.m_passesName = computeTempPassName("GI: GBuffer", cellIdx, "face", f);
 				visIn.m_technique = RenderingTechnique::kGBuffer;
 				visIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
 				visIn.m_lodReferencePoint = cellCenter;
 				visIn.m_lodDistances = lodDistances;
 				visIn.m_rgraph = &rgraph;
-				visIn.m_finalRenderTargetSize = UVec2(m_tileSize);
+				visIn.m_viewportSize = UVec2(m_tileSize);
 
 				getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
+
+				if(getRenderer().runSoftwareMeshletRendering())
+				{
+					GpuMeshletVisibilityInput meshIn;
+					meshIn.m_passesName = visIn.m_passesName;
+					meshIn.m_technique = RenderingTechnique::kGBuffer;
+					meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
+					meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
+					meshIn.m_viewportSize = UVec2(m_tileSize);
+					meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+					meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+					meshIn.m_dependency = visOut.m_dependency;
+					meshIn.m_rgraph = &rgraph;
+
+					getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+				}
 			}
 
 			// GBuffer
@@ -238,7 +255,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				fbDescr.bake();
 
 				// Create the pass
-				GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(computeTempPassName("GI: GBuffer", cellIdx * 100 + f));
+				GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(computeTempPassName("GI: GBuffer", cellIdx, "face", f));
 				pass.setFramebufferInfo(fbDescr, gbufferColorRts, gbufferDepthRt);
 
 				for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
@@ -247,9 +264,10 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				}
 				pass.newTextureDependency(gbufferDepthRt, TextureUsageBit::kAllFramebuffer, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
 
-				pass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+				pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency,
+										 BufferUsageBit::kIndirectDraw);
 
-				pass.setWork(1, [this, visOut, viewProjMat = frustum.getViewProjectionMatrix(),
+				pass.setWork(1, [this, visOut, meshletVisOut, viewProjMat = frustum.getViewProjectionMatrix(),
 								 viewMat = frustum.getViewMatrix()](RenderPassWorkContext& rgraphCtx) {
 					ANKI_TRACE_SCOPED_EVENT(RIndirectDiffuse);
 					CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -266,6 +284,11 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					args.m_viewport = UVec4(0, 0, m_tileSize, m_tileSize);
 					args.fillMdi(visOut);
 
+					if(meshletVisOut.isFilled())
+					{
+						args.fill(meshletVisOut);
+					}
+
 					getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 
 					// It's secondary, no need to restore any state
@@ -274,6 +297,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 
 			// Shadow visibility. Optional
 			GpuVisibilityOutput shadowVisOut;
+			GpuMeshletVisibilityOutput shadowMeshletVisOut;
 			Mat4 cascadeProjMat;
 			Mat3x4 cascadeViewMat;
 			Mat4 cascadeViewProjMat;
@@ -288,28 +312,45 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 				FrustumGpuVisibilityInput visIn;
-				visIn.m_passesName = computeTempPassName("GI: Shadows", cellIdx * 100 + f);
+				visIn.m_passesName = computeTempPassName("GI: Shadows", cellIdx, "face", f);
 				visIn.m_technique = RenderingTechnique::kDepth;
 				visIn.m_viewProjectionMatrix = cascadeViewProjMat;
 				visIn.m_lodReferencePoint = cellCenter;
 				visIn.m_lodDistances = lodDistances;
 				visIn.m_rgraph = &rgraph;
-				visIn.m_finalRenderTargetSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
+				visIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
 
 				getRenderer().getGpuVisibility().populateRenderGraph(visIn, shadowVisOut);
+
+				if(getRenderer().runSoftwareMeshletRendering())
+				{
+					GpuMeshletVisibilityInput meshIn;
+					meshIn.m_passesName = visIn.m_passesName;
+					meshIn.m_technique = RenderingTechnique::kDepth;
+					meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
+					meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
+					meshIn.m_viewportSize = visIn.m_viewportSize;
+					meshIn.m_taskShaderIndirectArgsBuffer = shadowVisOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+					meshIn.m_taskShaderPayloadBuffer = shadowVisOut.m_mesh.m_taskShaderPayloadBuffer;
+					meshIn.m_dependency = shadowVisOut.m_dependency;
+					meshIn.m_rgraph = &rgraph;
+
+					getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
+				}
 			}
 
 			// Shadow pass. Optional
 			if(doShadows)
 			{
 				// Create the pass
-				GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(computeTempPassName("GI: Shadows", cellIdx * 100 + f));
+				GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(computeTempPassName("GI: Shadows", cellIdx, "face", f));
 				pass.setFramebufferInfo(m_shadowMapping.m_fbDescr, {}, shadowsRt);
 
 				pass.newTextureDependency(shadowsRt, TextureUsageBit::kAllFramebuffer, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
-				pass.newBufferDependency(shadowVisOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+				pass.newBufferDependency((shadowMeshletVisOut.isFilled()) ? shadowMeshletVisOut.m_dependency : shadowVisOut.m_dependency,
+										 BufferUsageBit::kIndirectDraw);
 
-				pass.setWork(1, [this, shadowVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
+				pass.setWork(1, [this, shadowVisOut, shadowMeshletVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
 					ANKI_TRACE_SCOPED_EVENT(RIndirectDiffuse);
 					CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
@@ -328,6 +369,11 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 					args.m_viewport = UVec4(0, 0, rez, rez);
 					args.fillMdi(shadowVisOut);
 
+					if(shadowMeshletVisOut.isFilled())
+					{
+						args.fill(shadowMeshletVisOut);
+					}
+
 					getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 
 					// It's secondary, no need to restore the state
@@ -338,7 +384,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 			GpuVisibilityNonRenderablesOutput lightVis;
 			{
 				GpuVisibilityNonRenderablesInput in;
-				in.m_passesName = computeTempPassName("GI: Light visibility", cellIdx * 100 + f);
+				in.m_passesName = computeTempPassName("GI: Light visibility", cellIdx, "face", f);
 				in.m_objectType = GpuSceneNonRenderableObjectType::kLight;
 				in.m_viewProjectionMat = frustum.getViewProjectionMatrix();
 				in.m_rgraph = &rgraph;
@@ -355,7 +401,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 				fbDescr.bake();
 
 				// Create the pass
-				GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(computeTempPassName("GI: Light shading", cellIdx * 100 + f));
+				GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(computeTempPassName("GI: Light shading", cellIdx, "face", f));
 				pass.setFramebufferInfo(fbDescr, {lightShadingRt});
 
 				pass.newBufferDependency(lightVis.m_visiblesBufferHandle, BufferUsageBit::kUavFragmentRead);
@@ -423,7 +469,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 
 		// Irradiance pass. First & 2nd bounce
 		{
-			ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(computeTempPassName("GI: Irradiance", cellIdx * 100));
+			ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(computeTempPassName("GI: Irradiance", cellIdx));
 
 			pass.newTextureDependency(lightShadingRt, TextureUsageBit::kSampledCompute);
 			pass.newTextureDependency(irradianceVolume, TextureUsageBit::kUavComputeWrite);

+ 51 - 6
AnKi/Renderer/ProbeReflections.cpp

@@ -198,6 +198,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 	{
 		// GBuffer visibility
 		GpuVisibilityOutput visOut;
+		GpuMeshletVisibilityOutput meshletVisOut;
 		Frustum frustum;
 		{
 			frustum.setPerspective(kClusterObjectFrustumNearPlane, probeToRefresh->getRenderRadius(), kPi / 2.0f, kPi / 2.0f);
@@ -213,9 +214,25 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			visIn.m_lodReferencePoint = probeToRefresh->getWorldPosition();
 			visIn.m_lodDistances = lodDistances;
 			visIn.m_rgraph = &rgraph;
-			visIn.m_finalRenderTargetSize = UVec2(m_gbuffer.m_tileSize);
+			visIn.m_viewportSize = UVec2(m_gbuffer.m_tileSize);
 
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
+
+			if(getRenderer().runSoftwareMeshletRendering())
+			{
+				GpuMeshletVisibilityInput meshIn;
+				meshIn.m_passesName = "Cube refl: GBuffer";
+				meshIn.m_technique = RenderingTechnique::kGBuffer;
+				meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
+				meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
+				meshIn.m_viewportSize = UVec2(m_gbuffer.m_tileSize);
+				meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+				meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+				meshIn.m_dependency = visOut.m_dependency;
+				meshIn.m_rgraph = &rgraph;
+
+				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+			}
 		}
 
 		// GBuffer pass
@@ -243,9 +260,9 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			}
 
 			pass.newTextureDependency(gbufferDepthRt, TextureUsageBit::kAllFramebuffer, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
-			pass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+			pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency, BufferUsageBit::kIndirectDraw);
 
-			pass.setWork(1, [this, visOut, viewProjMat = frustum.getViewProjectionMatrix(),
+			pass.setWork(1, [this, visOut, meshletVisOut, viewProjMat = frustum.getViewProjectionMatrix(),
 							 viewMat = frustum.getViewMatrix()](RenderPassWorkContext& rgraphCtx) {
 				ANKI_TRACE_SCOPED_EVENT(ProbeReflections);
 				CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -262,6 +279,11 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				args.m_viewport = UVec4(0, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
 				args.fillMdi(visOut);
 
+				if(meshletVisOut.isFilled())
+				{
+					args.fill(meshletVisOut);
+				}
+
 				getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 			});
 		}
@@ -270,6 +292,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 		const LightComponent* dirLightc = SceneGraph::getSingleton().getDirectionalLight();
 		const Bool doShadows = dirLightc && dirLightc->getShadowEnabled();
 		GpuVisibilityOutput shadowVisOut;
+		GpuMeshletVisibilityOutput shadowMeshletVisOut;
 		Mat4 cascadeViewProjMat;
 		Mat3x4 cascadeViewMat;
 		Mat4 cascadeProjMat;
@@ -290,9 +313,25 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			visIn.m_lodReferencePoint = probeToRefresh->getWorldPosition();
 			visIn.m_lodDistances = lodDistances;
 			visIn.m_rgraph = &rgraph;
-			visIn.m_finalRenderTargetSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
+			visIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
 
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, shadowVisOut);
+
+			if(getRenderer().runSoftwareMeshletRendering())
+			{
+				GpuMeshletVisibilityInput meshIn;
+				meshIn.m_passesName = "Cube refl: Shadows";
+				meshIn.m_technique = RenderingTechnique::kDepth;
+				meshIn.m_viewProjectionMatrix = cascadeViewProjMat;
+				meshIn.m_cameraTransform = cascadeViewMat.getInverseTransformation();
+				meshIn.m_viewportSize = UVec2(m_shadowMapping.m_rtDescr.m_height);
+				meshIn.m_taskShaderIndirectArgsBuffer = shadowVisOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+				meshIn.m_taskShaderPayloadBuffer = shadowVisOut.m_mesh.m_taskShaderPayloadBuffer;
+				meshIn.m_dependency = shadowVisOut.m_dependency;
+				meshIn.m_rgraph = &rgraph;
+
+				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, shadowMeshletVisOut);
+			}
 		}
 
 		// Shadows. Optional
@@ -303,9 +342,10 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			pass.setFramebufferInfo(m_shadowMapping.m_fbDescr, {}, shadowMapRt);
 
 			pass.newTextureDependency(shadowMapRt, TextureUsageBit::kAllFramebuffer, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
-			pass.newBufferDependency(shadowVisOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+			pass.newBufferDependency((shadowMeshletVisOut.isFilled()) ? shadowMeshletVisOut.m_dependency : shadowVisOut.m_dependency,
+									 BufferUsageBit::kIndirectDraw);
 
-			pass.setWork(1, [this, shadowVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
+			pass.setWork(1, [this, shadowVisOut, shadowMeshletVisOut, cascadeViewProjMat, cascadeViewMat](RenderPassWorkContext& rgraphCtx) {
 				ANKI_TRACE_SCOPED_EVENT(ProbeReflections);
 
 				CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -324,6 +364,11 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 				args.m_viewport = UVec4(0, 0, rez, rez);
 				args.fillMdi(shadowVisOut);
 
+				if(shadowMeshletVisOut.isFilled())
+				{
+					args.fill(shadowMeshletVisOut);
+				}
+
 				getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 			});
 		}

+ 1 - 0
AnKi/Renderer/Renderer.cpp

@@ -245,6 +245,7 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 	}
 
 	ANKI_CHECK(m_visibility.init());
+	ANKI_CHECK(m_visibilityMeshlets.init());
 	ANKI_CHECK(m_nonRenderablesVisibility.init());
 	ANKI_CHECK(m_asVisibility.init());
 	ANKI_CHECK(m_hzbGenerator.init());

+ 12 - 0
AnKi/Renderer/Renderer.h

@@ -23,6 +23,7 @@ extern BoolCVar g_vrsLimitTo2x2CVar;
 extern BoolCVar g_preferComputeCVar;
 extern NumericCVar<F32> g_renderScalingCVar;
 extern BoolCVar g_rayTracedShadowsCVar;
+extern BoolCVar g_meshletRenderingCVar;
 extern NumericCVar<U8> g_shadowCascadeCountCVar;
 extern NumericCVar<F32> g_shadowCascade0DistanceCVar;
 extern NumericCVar<F32> g_shadowCascade1DistanceCVar;
@@ -109,6 +110,16 @@ public:
 		return m_visibility;
 	}
 
+	GpuMeshletVisibility& getGpuMeshletVisibility()
+	{
+		return m_visibilityMeshlets;
+	}
+
+	Bool runSoftwareMeshletRendering() const
+	{
+		return g_meshletRenderingCVar.get() && !GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
+	}
+
 	GpuVisibilityNonRenderables& getGpuVisibilityNonRenderables()
 	{
 		return m_nonRenderablesVisibility;
@@ -225,6 +236,7 @@ private:
 
 	RenderableDrawer m_sceneDrawer;
 	GpuVisibility m_visibility;
+	GpuMeshletVisibility m_visibilityMeshlets;
 	GpuVisibilityNonRenderables m_nonRenderablesVisibility;
 	GpuVisibilityAccelerationStructures m_asVisibility;
 	HzbGenerator m_hzbGenerator;

+ 10 - 3
AnKi/Renderer/RendererObject.cpp

@@ -111,9 +111,16 @@ void RendererObject::zeroBuffer(Buffer* buff)
 
 CString RendererObject::computeTempPassName(CString name, U32 index)
 {
-	thread_local static Char buff[128];
-	snprintf(buff, sizeof(buff), "%s #%u", name.cstr(), index);
-	return buff;
+	Char* str = static_cast<Char*>(getRenderer().getFrameMemoryPool().allocate(128, 1)); // Fresh 128-byte buffer from the renderer's frame memory pool; presumably valid until that pool is reset - TODO confirm.
+	snprintf(str, 128, "%s #%u", name.cstr(), index); // Writes "<name> #<index>"; snprintf truncates anything that does not fit in the 128 bytes.
+	return str;
+}
+
+CString RendererObject::computeTempPassName(CString name, U32 index, CString name2, U32 index2) // Two-pair overload: produces "<name> #<index> <name2> #<index2>".
+{
+	Char* str = static_cast<Char*>(getRenderer().getFrameMemoryPool().allocate(128, 1)); // Fresh 128-byte buffer from the renderer's frame memory pool; presumably valid until that pool is reset - TODO confirm.
+	snprintf(str, 128, "%s #%u %s #%u", name.cstr(), index, name2.cstr(), index2); // snprintf truncates anything that does not fit in the 128 bytes.
+	return str;
 }
 
 } // end namespace anki

+ 3 - 1
AnKi/Renderer/RendererObject.h

@@ -101,8 +101,10 @@ protected:
 
 	static void zeroBuffer(Buffer* buff);
 
-	/// Temp pass name. Output lives in global memory so use it before the next computeTempPassName call.
+	/// Temp pass name.
 	static CString computeTempPassName(CString name, U32 index);
+
+	static CString computeTempPassName(CString name, U32 index, CString name2, U32 index2);
 };
 /// @}
 

+ 101 - 96
AnKi/Renderer/ShadowMapping.cpp

@@ -380,8 +380,7 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 				clearTileIndirectArgs = createVetVisibilityPass(computeTempPassName("Shadows: Vet point light", lightIdx), *lightc, visOut, rgraph);
 			}
 
-			// Add the draw pass
-			Array<ViewportDraw, 6> dviewports;
+			// Add additional visibility and draw passes
 			for(U32 face = 0; face < 6; ++face)
 			{
 				Frustum frustum;
@@ -390,13 +389,26 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 				frustum.setWorldTransform(Transform(lightc->getWorldPosition().xyz0(), Frustum::getOmnidirectionalFrustumRotations()[face], 1.0f));
 				frustum.update();
 
-				dviewports[face].m_viewport = atlasViewports[face];
-				dviewports[face].m_viewProjMat = frustum.getViewProjectionMatrix();
-				dviewports[face].m_viewMat = frustum.getViewMatrix();
-				dviewports[face].m_clearTileIndirectArgs = clearTileIndirectArgs;
-			}
+				GpuMeshletVisibilityOutput meshletVisOut;
+				if(getRenderer().runSoftwareMeshletRendering())
+				{
+					GpuMeshletVisibilityInput meshIn;
+					meshIn.m_passesName = computeTempPassName("Shadows point light", lightIdx, "face", face);
+					meshIn.m_technique = RenderingTechnique::kDepth;
+					meshIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
+					meshIn.m_cameraTransform = frustum.getViewMatrix().getInverseTransformation();
+					meshIn.m_viewportSize = atlasViewports[face].zw();
+					meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+					meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+					meshIn.m_dependency = visOut.m_dependency;
+					meshIn.m_rgraph = &rgraph;
+
+					getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+				}
 
-			createMultipleDrawShadowsPass(dviewports, visOut, computeTempPassName("Shadows: Point light", lightIdx), rgraph);
+				createDrawShadowsPass(atlasViewports[face], frustum.getViewProjectionMatrix(), frustum.getViewMatrix(), visOut, meshletVisOut,
+									  clearTileIndirectArgs, {}, computeTempPassName("Shadows: Point light", lightIdx, "face", face), rgraph);
+			}
 		}
 		else
 		{
@@ -443,7 +455,7 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 			visIn.m_rgraph = &rgraph;
 			visIn.m_viewProjectionMatrix = lightc->getSpotLightViewProjectionMatrix();
 			visIn.m_hashVisibles = true;
-			visIn.m_finalRenderTargetSize = atlasViewport.zw();
+			visIn.m_viewportSize = atlasViewport.zw();
 
 			GpuVisibilityOutput visOut;
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
@@ -456,8 +468,26 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 				clearTileIndirectArgs = createVetVisibilityPass(computeTempPassName("Shadows: Vet spot light", lightIdx), *lightc, visOut, rgraph);
 			}
 
+			// Additional visibility
+			GpuMeshletVisibilityOutput meshletVisOut;
+			if(getRenderer().runSoftwareMeshletRendering())
+			{
+				GpuMeshletVisibilityInput meshIn;
+				meshIn.m_passesName = computeTempPassName("Shadows spot light", lightIdx);
+				meshIn.m_technique = RenderingTechnique::kDepth;
+				meshIn.m_viewProjectionMatrix = lightc->getSpotLightViewProjectionMatrix();
+				meshIn.m_cameraTransform = lightc->getSpotLightViewMatrix().getInverseTransformation();
+				meshIn.m_viewportSize = atlasViewport.zw();
+				meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+				meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+				meshIn.m_dependency = visOut.m_dependency;
+				meshIn.m_rgraph = &rgraph;
+
+				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+			}
+
 			// Add draw pass
-			createDrawShadowsPass(atlasViewport, lightc->getSpotLightViewProjectionMatrix(), lightc->getSpotLightViewMatrix(), visOut,
+			createDrawShadowsPass(atlasViewport, lightc->getSpotLightViewProjectionMatrix(), lightc->getSpotLightViewMatrix(), visOut, meshletVisOut,
 								  clearTileIndirectArgs, {}, computeTempPassName("Shadows: Spot light", lightIdx), rgraph);
 		}
 		else
@@ -521,13 +551,31 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 			visIn.m_lodDistances = lodDistances;
 			visIn.m_hzbRt = &hzbGenIn.m_cascades[cascade].m_hzbRt;
 			visIn.m_rgraph = &rgraph;
-			visIn.m_finalRenderTargetSize = dirLightAtlasViewports[cascade].zw();
+			visIn.m_viewportSize = dirLightAtlasViewports[cascade].zw();
 
 			GpuVisibilityOutput visOut;
 			getRenderer().getGpuVisibility().populateRenderGraph(visIn, visOut);
 
+			// Additional visibility
+			GpuMeshletVisibilityOutput meshletVisOut;
+			if(getRenderer().runSoftwareMeshletRendering())
+			{
+				GpuMeshletVisibilityInput meshIn;
+				meshIn.m_passesName = computeTempPassName("Shadows: Dir light cascade", lightIdx);
+				meshIn.m_technique = RenderingTechnique::kDepth;
+				meshIn.m_viewProjectionMatrix = cascadeViewProjMats[cascade];
+				meshIn.m_cameraTransform = cascadeViewMats[cascade].getInverseTransformation();
+				meshIn.m_viewportSize = dirLightAtlasViewports[cascade].zw();
+				meshIn.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+				meshIn.m_taskShaderPayloadBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+				meshIn.m_dependency = visOut.m_dependency;
+				meshIn.m_rgraph = &rgraph;
+
+				getRenderer().getGpuMeshletVisibility().populateRenderGraph(meshIn, meshletVisOut);
+			}
+
 			// Draw
-			createDrawShadowsPass(dirLightAtlasViewports[cascade], cascadeViewProjMats[cascade], cascadeViewMats[cascade], visOut, {},
+			createDrawShadowsPass(dirLightAtlasViewports[cascade], cascadeViewProjMats[cascade], cascadeViewMats[cascade], visOut, meshletVisOut, {},
 								  hzbGenIn.m_cascades[cascade].m_hzbRt, computeTempPassName("Shadows: Dir light cascade", cascade), rgraph);
 
 			// Update the texture matrix to point to the correct region in the atlas
@@ -546,7 +594,7 @@ BufferOffsetRange ShadowMapping::createVetVisibilityPass(CString passName, const
 	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName);
 
 	// The shader doesn't actually write to the handle but have it as a write dependency for the drawer to correctly wait for this pass
-	pass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kUavComputeWrite);
+	pass.newBufferDependency(visOut.m_dependency, BufferUsageBit::kUavComputeWrite);
 
 	pass.setWork([this, &lightc, hashBuff = visOut.m_visiblesHashBuffer, mdiBuff = visOut.m_legacy.m_mdiDrawCountsBuffer, clearTileIndirectArgs,
 				  taskShadersIndirectArgs = visOut.m_mesh.m_taskShaderIndirectArgsBuffer](RenderPassWorkContext& rpass) {
@@ -564,8 +612,6 @@ BufferOffsetRange ShadowMapping::createVetVisibilityPass(CString passName, const
 		cmdb.bindUavBuffer(0, 4, clearTileIndirectArgs);
 		cmdb.bindUavBuffer(0, 5, taskShadersIndirectArgs);
 
-		// TODO add the s/w mesh stuff
-
 		ANKI_ASSERT(RenderStateBucketContainer::getSingleton().getBucketCount(RenderingTechnique::kDepth) <= 64 && "TODO");
 		cmdb.dispatchCompute(1, 1, 1);
 	});
@@ -573,106 +619,65 @@ BufferOffsetRange ShadowMapping::createVetVisibilityPass(CString passName, const
 	return clearTileIndirectArgs;
 }
 
-void ShadowMapping::createMultipleDrawShadowsPass(ConstWeakArray<ViewportDraw> viewports, const GpuVisibilityOutput visOut, CString passName,
-												  RenderGraphDescription& rgraph)
+void ShadowMapping::createDrawShadowsPass(const UVec4& viewport, const Mat4& viewProjMat, const Mat3x4& viewMat, const GpuVisibilityOutput& visOut,
+										  const GpuMeshletVisibilityOutput& meshletVisOut, const BufferOffsetRange& clearTileIndirectArgs,
+										  const RenderTargetHandle hzbRt, CString passName, RenderGraphDescription& rgraph) // Adds one graphics pass that draws the kDepth technique into a single viewport of m_runCtx.m_rt.
 {
-	ANKI_ASSERT(viewports.getSize() > 0);
-
-	const Bool loadFb = (viewports[0].m_clearTileIndirectArgs.m_buffer != nullptr);
-	for(const ViewportDraw& v : viewports)
-	{
-		[[maybe_unused]] const Bool loadFb2 = v.m_clearTileIndirectArgs.m_buffer != nullptr;
-		ANKI_ASSERT(loadFb == loadFb2 && "All draws should be the same for simplicity");
-	}
-
-	// Compute the agregate viewport
-	UVec2 minViewportXy = viewports[0].m_viewport.xy();
-	UVec2 maxViewportXy = viewports[0].m_viewport.xy() + viewports[0].m_viewport.zw();
-	for(U32 i = 1; i < viewports.getSize(); ++i)
-	{
-		minViewportXy = minViewportXy.min(viewports[i].m_viewport.xy());
-		maxViewportXy = maxViewportXy.max(viewports[i].m_viewport.xy() + viewports[i].m_viewport.zw());
-	}
-	const UVec4 totalViewport(minViewportXy, maxViewportXy - minViewportXy);
-
-	// Store the arguments to some permanent memory
-	DynamicArray<ViewportDraw, MemoryPoolPtrWrapper<StackMemoryPool>> dviewports(&getRenderer().getFrameMemoryPool());
-	dviewports.resize(viewports.getSize());
-	for(U32 i = 0; i < viewports.getSize(); ++i)
-	{
-		dviewports[i] = viewports[i];
-	}
-	WeakArray<ViewportDraw> dviewportsArr;
-	dviewports.moveAndReset(dviewportsArr);
+	const Bool loadFb = (clearTileIndirectArgs.m_buffer != nullptr); // A supplied clear-args buffer selects m_loadFbDescr below; without one m_clearFbDescr is used.
 
 	// Create the pass
 	GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(passName);
-	pass.setFramebufferInfo((loadFb) ? m_loadFbDescr : m_clearFbDescr, {}, m_runCtx.m_rt, {}, totalViewport[0], totalViewport[1], totalViewport[2],
-							totalViewport[3]);
+	pass.setFramebufferInfo((loadFb) ? m_loadFbDescr : m_clearFbDescr, {}, m_runCtx.m_rt, {}, viewport[0], viewport[1], viewport[2], viewport[3]);
 
-	pass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+	pass.newBufferDependency((meshletVisOut.isFilled()) ? meshletVisOut.m_dependency : visOut.m_dependency, BufferUsageBit::kIndirectDraw); // Depend on the meshlet visibility output when it is filled, otherwise on the regular visibility output.
 	pass.newTextureDependency(m_runCtx.m_rt, TextureUsageBit::kFramebufferWrite, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
 
-	pass.setWork(1, [this, visOut, viewports = dviewportsArr](RenderPassWorkContext& rgraphCtx) {
+	pass.setWork(1, [this, visOut, meshletVisOut, viewport, clearTileIndirectArgs, viewMat, viewProjMat, hzbRt](RenderPassWorkContext& rgraphCtx) { // All arguments are captured by copy (plus this).
 		ANKI_TRACE_SCOPED_EVENT(ShadowMapping);
 
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
-		for(U32 i = 0; i < viewports.getSize(); ++i)
-		{
-			const ViewportDraw& vp = viewports[i];
+		cmdb.setViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
 
-			cmdb.setViewport(vp.m_viewport[0], vp.m_viewport[1], vp.m_viewport[2], vp.m_viewport[3]);
-
-			if(vp.m_clearTileIndirectArgs.m_buffer)
-			{
-				// Clear the depth buffer using a quad because it needs to be conditional
+		if(clearTileIndirectArgs.m_buffer)
+		{
+			// Clear the depth buffer using a quad because it needs to be conditional
 
-				cmdb.bindShaderProgram(m_clearDepthGrProg.get());
-				cmdb.setDepthCompareOperation(CompareOperation::kAlways);
+			cmdb.bindShaderProgram(m_clearDepthGrProg.get());
+			cmdb.setDepthCompareOperation(CompareOperation::kAlways);
 
-				cmdb.drawIndirect(PrimitiveTopology::kTriangles, 1, vp.m_clearTileIndirectArgs.m_offset, vp.m_clearTileIndirectArgs.m_buffer);
+			cmdb.drawIndirect(PrimitiveTopology::kTriangles, 1, clearTileIndirectArgs.m_offset, clearTileIndirectArgs.m_buffer);
 
-				cmdb.setDepthCompareOperation(CompareOperation::kLess);
-			}
+			cmdb.setDepthCompareOperation(CompareOperation::kLess); // Switch the depth test from kAlways back to kLess for the draws that follow.
+		}
 
-			// Set state
-			cmdb.setPolygonOffset(kShadowsPolygonOffsetFactor, kShadowsPolygonOffsetUnits);
-
-			RenderableDrawerArguments args;
-			args.m_renderingTechinuqe = RenderingTechnique::kDepth;
-			args.m_viewMatrix = vp.m_viewMat;
-			args.m_cameraTransform = vp.m_viewMat.getInverseTransformation();
-			args.m_viewProjectionMatrix = vp.m_viewProjMat;
-			args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care
-			args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
-			args.m_viewport = UVec4(vp.m_viewport[0], vp.m_viewport[1], vp.m_viewport[2], vp.m_viewport[3]);
-			args.fillMdi(visOut);
-
-			TextureViewPtr hzbView;
-			if(vp.m_hzbRt.isValid())
-			{
-				hzbView = rgraphCtx.createTextureView(vp.m_hzbRt);
-				args.m_hzbTexture = hzbView.get();
-			}
+		// Set state
+		cmdb.setPolygonOffset(kShadowsPolygonOffsetFactor, kShadowsPolygonOffsetUnits);
+
+		RenderableDrawerArguments args;
+		args.m_renderingTechinuqe = RenderingTechnique::kDepth;
+		args.m_viewMatrix = viewMat;
+		args.m_cameraTransform = viewMat.getInverseTransformation();
+		args.m_viewProjectionMatrix = viewProjMat;
+		args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care
+		args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
+		args.m_viewport = UVec4(viewport[0], viewport[1], viewport[2], viewport[3]);
+		args.fillMdi(visOut);
+
+		TextureViewPtr hzbView;
+		if(hzbRt.isValid())
+		{
+			hzbView = rgraphCtx.createTextureView(hzbRt);
+			args.m_hzbTexture = hzbView.get();
+		}
 
-			getRenderer().getSceneDrawer().drawMdi(args, cmdb);
+		if(meshletVisOut.isFilled()) // Only a filled meshlet output is handed to the drawer arguments.
+		{
+			args.fill(meshletVisOut);
 		}
-	});
-}
 
-void ShadowMapping::createDrawShadowsPass(const UVec4& viewport, const Mat4& viewProjMat, const Mat3x4& viewMat, const GpuVisibilityOutput visOut,
-										  const BufferOffsetRange& clearTileIndirectArgs, const RenderTargetHandle hzbRt, CString passName,
-										  RenderGraphDescription& rgraph)
-{
-	ViewportDraw vp;
-	vp.m_viewport = viewport;
-	vp.m_viewProjMat = viewProjMat;
-	vp.m_viewMat = viewMat;
-	vp.m_clearTileIndirectArgs = clearTileIndirectArgs;
-	vp.m_hzbRt = hzbRt;
-
-	createMultipleDrawShadowsPass({&vp, 1}, visOut, passName, rgraph);
+		getRenderer().getSceneDrawer().drawMdi(args, cmdb);
+	});
 }
 
 } // end namespace anki

+ 4 - 16
AnKi/Renderer/ShadowMapping.h

@@ -14,6 +14,7 @@ namespace anki {
 
 // Forward
 class GpuVisibilityOutput;
+class GpuMeshletVisibilityOutput;
 extern NumericCVar<U32> g_shadowMappingPcfCVar;
 
 /// @addtogroup renderer
@@ -34,16 +35,6 @@ public:
 	}
 
 private:
-	class ViewportDraw
-	{
-	public:
-		UVec4 m_viewport;
-		Mat4 m_viewProjMat;
-		Mat3x4 m_viewMat;
-		RenderTargetHandle m_hzbRt;
-		BufferOffsetRange m_clearTileIndirectArgs;
-	};
-
 	TileAllocator m_tileAlloc;
 	static constexpr U32 kTileAllocHierarchyCount = 4;
 	static constexpr U32 kPointLightMaxTileAllocHierarchy = 1;
@@ -85,12 +76,9 @@ private:
 	BufferOffsetRange createVetVisibilityPass(CString passName, const LightComponent& lightc, const GpuVisibilityOutput& visOut,
 											  RenderGraphDescription& rgraph) const;
 
-	void createMultipleDrawShadowsPass(ConstWeakArray<ViewportDraw> viewports, const GpuVisibilityOutput visOut, CString passName,
-									   RenderGraphDescription& rgraph);
-
-	void createDrawShadowsPass(const UVec4& viewport, const Mat4& viewProjMat, const Mat3x4& viewMat, const GpuVisibilityOutput visOut,
-							   const BufferOffsetRange& clearTileIndirectArgs, const RenderTargetHandle hzbRt, CString passName,
-							   RenderGraphDescription& rgraph);
+	void createDrawShadowsPass(const UVec4& viewport, const Mat4& viewProjMat, const Mat3x4& viewMat, const GpuVisibilityOutput& visOut,
+							   const GpuMeshletVisibilityOutput& meshletVisOut, const BufferOffsetRange& clearTileIndirectArgs,
+							   const RenderTargetHandle hzbRt, CString passName, RenderGraphDescription& rgraph);
 };
 /// @}
 

+ 2 - 2
AnKi/Renderer/Utils/Drawer.cpp

@@ -159,7 +159,7 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 	U32 legacyGeometryFlowUserCount = 0;
 	PtrSize meshletInstancesBufferOffset = 0;
 	RenderStateBucketContainer::getSingleton().iterateBuckets(
-		args.m_renderingTechinuqe, [&](const RenderStateInfo& state, U32 userCount, U32 meshletGroupCount) {
+		args.m_renderingTechinuqe, [&](const RenderStateInfo& state, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
 			if(userCount == 0)
 			{
 				++bucketCount;
@@ -221,7 +221,7 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 			++bucketCount;
 			allUserCount += userCount;
-			meshletInstancesBufferOffset += sizeof(GpuSceneMeshletInstance) * min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
+			meshletInstancesBufferOffset += sizeof(GpuSceneMeshletInstance) * min(meshletCount, kMaxVisibleMeshletsPerRenderStateBucket);
 		});
 
 	ANKI_ASSERT(bucketCount == RenderStateBucketContainer::getSingleton().getBucketCount(args.m_renderingTechinuqe));

+ 1 - 0
AnKi/Renderer/Utils/Drawer.h

@@ -65,6 +65,7 @@ public:
 
 	void fill(const GpuMeshletVisibilityOutput& visOut)
 	{
+		ANKI_ASSERT(visOut.isFilled()); // Callers must only pass an output for which isFilled() is true.
 		m_softwareMesh.m_meshletInstancesBuffer = visOut.m_meshletInstancesBuffer;
 		m_softwareMesh.m_drawIndirectArgsBuffer = visOut.m_drawIndirectArgsBuffer;
 	}

+ 122 - 111
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -17,6 +17,15 @@
 
 namespace anki {
 
+static StatCounter g_gpuVisMemoryAllocatedStatVar(StatCategory::kRenderer, "GPU visibility mem",
+												  StatFlag::kBytes | StatFlag::kMainThreadUpdates | StatFlag::kZeroEveryFrame); // Renderer-category counter flagged as bytes, main-thread updates and zero-every-frame.
+
+static BufferOffsetRange allocateTransientGpuMem(PtrSize size) // File-local wrapper: allocates from GpuVisibleTransientMemoryPool and records the request in the stat counter.
+{
+	g_gpuVisMemoryAllocatedStatVar.increment(size); // Adds the requested size to the counter before allocating.
+	return GpuVisibleTransientMemoryPool::getSingleton().allocate(size);
+}
+
 GpuVisibilityCommonBase::Counts GpuVisibilityCommonBase::countTechnique(RenderingTechnique t)
 {
 	Counts out = {};
@@ -38,11 +47,12 @@ GpuVisibilityCommonBase::Counts GpuVisibilityCommonBase::countTechnique(Renderin
 
 	out.m_bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(t);
 
-	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount_) {
-		if(meshletGroupCount_)
+	RenderStateBucketContainer::getSingleton().iterateBuckets(t, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
+		if(meshletGroupCount)
 		{
 			out.m_modernGeometryFlowUserCount += userCount;
-			out.m_meshletGroupCount += min(meshletGroupCount_, kMaxMeshletGroupCountPerRenderStateBucket);
+			out.m_meshletGroupCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
+			out.m_meshletCount += min(meshletCount, kMaxVisibleMeshletsPerRenderStateBucket);
 		}
 		else
 		{
@@ -118,7 +128,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		frustumTestData = newInstance<FrustumTestData>(getRenderer().getFrameMemoryPool());
 		const FrustumGpuVisibilityInput& fin = static_cast<FrustumGpuVisibilityInput&>(in);
 		frustumTestData->m_viewProjMat = fin.m_viewProjectionMatrix;
-		frustumTestData->m_finalRenderTargetSize = fin.m_finalRenderTargetSize;
+		frustumTestData->m_finalRenderTargetSize = fin.m_viewportSize;
 	}
 
 	const Counts counts = countTechnique(in.m_technique);
@@ -130,8 +140,8 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 
 	// Allocate memory
-	const Bool firstFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
-	if(firstFrame)
+	const Bool firstCallInFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
+	if(firstCallInFrame)
 	{
 		// Allocate the big buffers once at the beginning of the frame
 
@@ -150,13 +160,14 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		{
 			mem = {};
 
-			mem.m_drawIndexedIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(
-				max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
-			mem.m_renderableInstancesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, maxCounts.m_legacyGeometryFlowUserCount)
-																									 * sizeof(GpuSceneRenderableVertex));
+			mem.m_drawIndexedIndirectArgsBuffer =
+				allocateTransientGpuMem(max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
+			mem.m_renderableInstancesBuffer =
+				allocateTransientGpuMem(max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex));
+
+			mem.m_taskShaderPayloadBuffer = allocateTransientGpuMem(max(1u, maxCounts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload));
 
-			mem.m_taskShaderPayloadBuffer =
-				GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, maxCounts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload));
+			mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_drawIndexedIndirectArgsBuffer);
 		}
 	}
 
@@ -169,22 +180,21 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	out.m_legacy.m_renderableInstancesBuffer = mem.m_renderableInstancesBuffer;
 	out.m_legacy.m_renderableInstancesBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex);
 
-	out.m_legacy.m_mdiDrawCountsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(U32) * counts.m_bucketCount);
+	out.m_legacy.m_mdiDrawCountsBuffer = allocateTransientGpuMem(sizeof(U32) * counts.m_bucketCount);
 
 	out.m_mesh.m_taskShaderPayloadBuffer = mem.m_taskShaderPayloadBuffer;
 	out.m_mesh.m_taskShaderPayloadBuffer.m_range = max(1u, counts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload);
 
-	out.m_mesh.m_taskShaderIndirectArgsBuffer =
-		GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * counts.m_bucketCount);
+	out.m_mesh.m_taskShaderIndirectArgsBuffer = allocateTransientGpuMem(sizeof(DispatchIndirectArgs) * counts.m_bucketCount);
 
 	if(in.m_hashVisibles)
 	{
-		out.m_visiblesHashBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(GpuVisibilityHash));
+		out.m_visiblesHashBuffer = allocateTransientGpuMem(sizeof(GpuVisibilityHash));
 	}
 
 	if(in.m_gatherAabbIndices)
 	{
-		out.m_visibleAaabbIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((counts.m_allUserCount + 1) * sizeof(U32));
+		out.m_visibleAaabbIndicesBuffer = allocateTransientGpuMem((counts.m_allUserCount + 1) * sizeof(U32));
 	}
 
 	// Zero some stuff
@@ -227,11 +237,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 
 	// Set the out dependency. Use one of the big buffers.
-	if(!mem.m_bufferDepedency.isValid())
-	{
-		mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_drawIndexedIndirectArgsBuffer);
-	}
-	out.m_someBufferHandle = mem.m_bufferDepedency;
+	out.m_dependency = mem.m_bufferDepedency;
 
 	// Create the renderpass
 	Array<Char, 128> passName;
@@ -241,7 +247,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kUavComputeRead);
 	pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kUavComputeWrite);
-	pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavComputeWrite);
+	pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavComputeWrite);
 
 	if(!distanceBased && static_cast<FrustumGpuVisibilityInput&>(in).m_hzbRt)
 	{
@@ -296,25 +302,26 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		U32 bucketCount = 0;
 		U32 legacyGeometryFlowDrawCount = 0;
 		U32 taskPayloadCount = 0;
-		RenderStateBucketContainer::getSingleton().iterateBuckets(technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount) {
-			if(userCount == 0)
-			{
-				// Empty bucket
-				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = kMaxU32;
-			}
-			else if(meshletGroupCount)
-			{
-				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = taskPayloadCount;
-				taskPayloadCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
-			}
-			else
-			{
-				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = legacyGeometryFlowDrawCount;
-				legacyGeometryFlowDrawCount += userCount;
-			}
-
-			++bucketCount;
-		});
+		RenderStateBucketContainer::getSingleton().iterateBuckets(
+			technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount, [[maybe_unused]] U32 meshletCount) {
+				if(userCount == 0)
+				{
+					// Empty bucket
+					drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = kMaxU32;
+				}
+				else if(meshletGroupCount)
+				{
+					drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = taskPayloadCount;
+					taskPayloadCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
+				}
+				else
+				{
+					drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = legacyGeometryFlowDrawCount;
+					legacyGeometryFlowDrawCount += userCount;
+				}
+
+				++bucketCount;
+			});
 
 		if(frustumTestData)
 		{
@@ -397,8 +404,8 @@ void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, Gp
 	}
 
 	// Allocate memory
-	const Bool firstFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
-	if(firstFrame)
+	const Bool firstCallInFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
+	if(firstCallInFrame)
 	{
 		// Allocate the big buffers once at the beginning of the frame
 
@@ -417,27 +424,28 @@ void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, Gp
 		{
 			mem = {};
 
-			mem.m_meshletInstancesBuffer =
-				GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, maxCounts.m_meshletGroupCount) * sizeof(UVec4));
+			mem.m_meshletInstancesBuffer = allocateTransientGpuMem(maxCounts.m_meshletCount * sizeof(GpuSceneMeshletInstance));
+
+			mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_meshletInstancesBuffer);
 		}
 	}
 
 	PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphFrameCallCount % m_runCtx.m_persistentMem.getSize()];
 	++m_runCtx.m_populateRenderGraphFrameCallCount;
 
-	out.m_drawIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * counts.m_bucketCount);
+	out.m_drawIndirectArgsBuffer = allocateTransientGpuMem(sizeof(DrawIndirectArgs) * counts.m_bucketCount);
 
 	out.m_meshletInstancesBuffer = mem.m_meshletInstancesBuffer;
-	out.m_meshletInstancesBuffer.m_range = max(1u, counts.m_meshletGroupCount) * sizeof(UVec4);
+	out.m_meshletInstancesBuffer.m_range = counts.m_meshletCount * sizeof(GpuSceneMeshletInstance);
 
 	// Zero some stuff
-	const BufferHandle zeroStuffDependency = rgraph.importBuffer(BufferUsageBit::kNone, out.m_drawIndirectArgsBuffer);
+	const BufferHandle indirectArgsDep = rgraph.importBuffer(BufferUsageBit::kNone, out.m_drawIndirectArgsBuffer);
 	{
 		Array<Char, 128> passName;
 		snprintf(passName.getBegin(), passName.getSizeInBytes(), "GPU meshlet vis zero: %s", in.m_passesName.cstr());
 
 		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
-		pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kTransferDestination);
+		pass.newBufferDependency(indirectArgsDep, BufferUsageBit::kTransferDestination);
 
 		pass.setWork([drawIndirectArgsBuffer = out.m_drawIndirectArgsBuffer](RenderPassWorkContext& rpass) {
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
@@ -448,11 +456,6 @@ void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, Gp
 		});
 	}
 
-	// Set the out dependency. Use one of the big buffers.
-	if(!mem.m_bufferDepedency.isValid())
-	{
-		mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_meshletInstancesBuffer);
-	}
 	out.m_dependency = mem.m_bufferDepedency;
 
 	// Create the renderpass
@@ -461,66 +464,74 @@ void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, Gp
 
 	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
 
-	pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavComputeWrite);
-	pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kUavComputeRead);
+	pass.newBufferDependency(indirectArgsDep, BufferUsageBit::kUavComputeWrite);
+	pass.newBufferDependency(mem.m_bufferDepedency, BufferUsageBit::kUavComputeWrite);
+	pass.newBufferDependency(in.m_dependency, BufferUsageBit::kIndirectCompute);
 
 	pass.setWork([this, technique = in.m_technique, hzbRt = in.m_hzbRt, taskShaderPayloadsBuff = in.m_taskShaderPayloadBuffer,
 				  viewProjMat = in.m_viewProjectionMatrix, camTrf = in.m_cameraTransform, viewportSize = in.m_viewportSize,
-				  out](RenderPassWorkContext& rpass) {
+				  computeIndirectArgs = in.m_taskShaderIndirectArgsBuffer, out](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 		U32 bucketIdx = 0;
 		U32 firstPayload = 0;
-		RenderStateBucketContainer::getSingleton().iterateBuckets(technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount) {
-			if(!meshletGroupCount)
-			{
+		PtrSize instancesBufferOffset = 0;
+		RenderStateBucketContainer::getSingleton().iterateBuckets(
+			technique, [&](const RenderStateInfo&, [[maybe_unused]] U32 userCount, U32 meshletGroupCount, U32 meshletCount) {
+				if(!meshletGroupCount)
+				{
+					++bucketIdx;
+					return;
+				}
+
+				// Create a depedency to a part of the indirect args buffer
+				const BufferOffsetRange drawIndirectArgsBufferChunk = {out.m_drawIndirectArgsBuffer.m_buffer,
+																	   out.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketIdx,
+																	   sizeof(DrawIndirectArgs)};
+
+				const PtrSize instancesBufferSize = min(meshletCount, kMaxVisibleMeshletsPerRenderStateBucket) * sizeof(GpuSceneMeshletInstance);
+				const BufferOffsetRange instancesBuffer = {out.m_meshletInstancesBuffer.m_buffer,
+														   out.m_meshletInstancesBuffer.m_offset + instancesBufferOffset, instancesBufferSize};
+
+				const Bool hasHzb = hzbRt.isValid();
+
+				cmdb.bindShaderProgram(m_meshletCullingGrProgs[hasHzb].get());
+
+				cmdb.bindUavBuffer(0, 0, taskShaderPayloadsBuff);
+				cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+				cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
+				cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
+				cmdb.bindUavBuffer(0, 4, UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
+				cmdb.bindUavBuffer(0, 5, drawIndirectArgsBufferChunk);
+				cmdb.bindUavBuffer(0, 6, instancesBuffer);
+				if(hasHzb)
+				{
+					rpass.bindColorTexture(0, 7, hzbRt);
+					cmdb.bindSampler(0, 8, getRenderer().getSamplers().m_nearestNearestClamp.get());
+				}
+
+				class MaterialGlobalConstants
+				{
+				public:
+					Mat4 m_viewProjectionMatrix;
+					Mat3x4 m_cameraTransform;
+
+					Vec2 m_viewportSizef;
+					U32 m_firstPayload;
+					U32 m_padding;
+				} consts;
+				consts.m_viewProjectionMatrix = viewProjMat;
+				consts.m_cameraTransform = camTrf;
+				consts.m_viewportSizef = Vec2(viewportSize);
+				consts.m_firstPayload = firstPayload;
+				cmdb.setPushConstants(&consts, sizeof(consts));
+
+				cmdb.dispatchComputeIndirect(computeIndirectArgs.m_buffer, computeIndirectArgs.m_offset + bucketIdx * sizeof(DispatchIndirectArgs));
+
+				firstPayload += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
+				instancesBufferOffset += instancesBufferSize;
 				++bucketIdx;
-				return;
-			}
-
-			// Create a depedency to a part of the indirect args buffer
-			const BufferOffsetRange drawIndirectArgsBufferChunk = {out.m_drawIndirectArgsBuffer.m_buffer,
-																   out.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketIdx,
-																   sizeof(DrawIndirectArgs)};
-
-			const Bool hasHzb = hzbRt.isValid();
-
-			cmdb.bindShaderProgram(m_meshletCullingGrProgs[hasHzb].get());
-
-			cmdb.bindUavBuffer(0, 0, taskShaderPayloadsBuff);
-			cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
-			cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
-			cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
-			cmdb.bindUavBuffer(0, 4, UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
-			cmdb.bindUavBuffer(0, 5, out.m_drawIndirectArgsBuffer);
-			cmdb.bindUavBuffer(0, 6, out.m_meshletInstancesBuffer);
-			if(hasHzb)
-			{
-				rpass.bindColorTexture(0, 7, hzbRt);
-				cmdb.bindSampler(0, 8, getRenderer().getSamplers().m_nearestNearestClamp.get());
-			}
-
-			class MaterialGlobalConstants
-			{
-			public:
-				Mat4 m_viewProjectionMatrix;
-				Mat3x4 m_cameraTransform;
-
-				Vec2 m_viewportSizef;
-				U32 m_firstPayload;
-				U32 m_padding;
-			} consts;
-			consts.m_viewProjectionMatrix = viewProjMat;
-			consts.m_cameraTransform = camTrf;
-			consts.m_viewportSizef = Vec2(viewportSize);
-			consts.m_firstPayload = firstPayload;
-			cmdb.setPushConstants(&consts, sizeof(consts));
-
-			cmdb.dispatchCompute(meshletGroupCount, 1, 1);
-
-			firstPayload += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
-			++bucketIdx;
-		});
+			});
 	});
 }
 
@@ -626,7 +637,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 	}
 
 	// Allocate memory for the result
-	out.m_visiblesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((objCount + 1) * sizeof(U32));
+	out.m_visiblesBuffer = allocateTransientGpuMem((objCount + 1) * sizeof(U32));
 	out.m_visiblesBufferHandle = rgraph.importBuffer(BufferUsageBit::kNone, out.m_visiblesBuffer);
 
 	// Create the renderpass
@@ -728,12 +739,12 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 	// Allocate the transient buffers
 	const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
 
-	out.m_instancesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(AccelerationStructureInstance));
+	out.m_instancesBuffer = allocateTransientGpuMem(aabbCount * sizeof(AccelerationStructureInstance));
 	out.m_someBufferHandle = rgraph.importBuffer(BufferUsageBit::kUavComputeWrite, out.m_instancesBuffer);
 
-	out.m_renderableIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((aabbCount + 1) * sizeof(U32));
+	out.m_renderableIndicesBuffer = allocateTransientGpuMem((aabbCount + 1) * sizeof(U32));
 
-	const BufferOffsetRange zeroInstancesDispatchArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs));
+	const BufferOffsetRange zeroInstancesDispatchArgsBuff = allocateTransientGpuMem(sizeof(DispatchIndirectArgs));
 
 	// Create vis pass
 	{

+ 15 - 8
AnKi/Renderer/Utils/GpuVisibility.h

@@ -25,6 +25,7 @@ protected:
 		U32 m_legacyGeometryFlowUserCount;
 		U32 m_modernGeometryFlowUserCount;
 		U32 m_meshletGroupCount;
+		U32 m_meshletCount;
 		U32 m_allUserCount;
 
 		Counts max(const Counts& b) const
@@ -36,6 +37,7 @@ protected:
 			ANKI_MAX(m_legacyGeometryFlowUserCount);
 			ANKI_MAX(m_modernGeometryFlowUserCount);
 			ANKI_MAX(m_meshletGroupCount);
+			ANKI_MAX(m_meshletCount);
 			ANKI_MAX(m_allUserCount);
 #undef ANKI_MAX
 			return out;
@@ -67,8 +69,8 @@ class FrustumGpuVisibilityInput : public BaseGpuVisibilityInput
 public:
 	Mat4 m_viewProjectionMatrix;
 
-	/// The size of the render target the visibility results will be used on. Used to kill objects that don't touch the sampling positions.
-	UVec2 m_finalRenderTargetSize;
+	/// The size of the viewport the visibility results will be used on. Used to kill objects that don't touch the sampling positions.
+	UVec2 m_viewportSize;
 
 	const RenderTargetHandle* m_hzbRt = nullptr; ///< Optional.
 };
@@ -85,7 +87,7 @@ public:
 class GpuVisibilityOutput
 {
 public:
-	BufferHandle m_someBufferHandle; ///< Just expose one handle for depedencies. No need to track all buffers.
+	BufferHandle m_dependency; ///< Just expose one handle for dependencies. No need to track all buffers. Wait on it using indirect draw usage.
 
 	class
 	{
@@ -109,7 +111,7 @@ public:
 
 	Bool containsDrawcalls() const
 	{
-		return m_someBufferHandle.isValid();
+		return m_dependency.isValid();
 	}
 };
 
@@ -124,7 +126,7 @@ public:
 	void populateRenderGraph(FrustumGpuVisibilityInput& in, GpuVisibilityOutput& out)
 	{
 		ANKI_ASSERT(in.m_viewProjectionMatrix != Mat4::getZero());
-		ANKI_ASSERT(in.m_finalRenderTargetSize != UVec2(0u));
+		ANKI_ASSERT(in.m_viewportSize != UVec2(0u));
 		populateRenderGraphInternal(false, in, out);
 	}
 
@@ -173,6 +175,8 @@ class GpuMeshletVisibilityInput
 public:
 	CString m_passesName;
 
+	RenderingTechnique m_technique = RenderingTechnique::kCount;
+
 	Mat4 m_viewProjectionMatrix;
 	Mat3x4 m_cameraTransform;
 
@@ -186,8 +190,6 @@ public:
 
 	RenderGraphDescription* m_rgraph = nullptr;
 
-	RenderingTechnique m_technique = RenderingTechnique::kCount;
-
 	RenderTargetHandle m_hzbRt; ///< Optional.
 };
 
@@ -198,7 +200,12 @@ public:
 	BufferOffsetRange m_meshletInstancesBuffer; ///< Array of UVec4 (encodes GpuSceneMeshletInstance) per instance vertex. One for each meshlet.
 	BufferOffsetRange m_drawIndirectArgsBuffer; ///< Array of DrawIndirectArgs. One for every render state bucket (even those that use that flow).
 
-	BufferHandle m_dependency;
+	BufferHandle m_dependency; ///< Some dependency to wait on. Wait usage is indirect draw.
+
+	Bool isFilled() const
+	{
+		return m_dependency.isValid();
+	}
 };
 
 /// Performs meshlet GPU visibility when the GPU doesn't support mesh shaders.

+ 47 - 29
AnKi/Resource/MaterialResource.cpp

@@ -6,6 +6,7 @@
 #include <AnKi/Resource/MaterialResource.h>
 #include <AnKi/Resource/ResourceManager.h>
 #include <AnKi/Resource/ImageResource.h>
+#include <AnKi/Core/App.h>
 #include <AnKi/Util/Xml.h>
 
 namespace anki {
@@ -167,15 +168,35 @@ Error MaterialResource::parseShaderProgram(XmlElement shaderProgramEl, Bool asyn
 	// Find present techniques
 	for(const ShaderProgramBinaryTechnique& t : m_prog->getBinary().m_techniques)
 	{
-		if(t.m_name.getBegin() == CString("GBuffer") || t.m_name.getBegin() == CString("GBufferMesh")
-		   || t.m_name.getBegin() == CString("GBufferMeshlet"))
+		if(t.m_name.getBegin() == CString("GBufferLegacy"))
 		{
 			m_techniquesMask |= RenderingTechniqueBit::kGBuffer;
+			m_shaderTechniques |= ShaderTechniqueBit::kLegacy;
 		}
-		else if(t.m_name.getBegin() == CString("Shadows") || t.m_name.getBegin() == CString("ShadowsMesh")
-				|| t.m_name.getBegin() == CString("ShadowsMeshlet"))
+		else if(t.m_name.getBegin() == CString("GBufferMeshShaders"))
+		{
+			m_techniquesMask |= RenderingTechniqueBit::kGBuffer;
+			m_shaderTechniques |= ShaderTechniqueBit::kMeshSaders;
+		}
+		else if(t.m_name.getBegin() == CString("GBufferSwMeshletRendering"))
+		{
+			m_techniquesMask |= RenderingTechniqueBit::kGBuffer;
+			m_shaderTechniques |= ShaderTechniqueBit::kSwMeshletRendering;
+		}
+		else if(t.m_name.getBegin() == CString("ShadowsLegacy"))
 		{
 			m_techniquesMask |= RenderingTechniqueBit::kDepth;
+			m_shaderTechniques |= ShaderTechniqueBit::kLegacy;
+		}
+		else if(t.m_name.getBegin() == CString("ShadowsMeshShaders"))
+		{
+			m_techniquesMask |= RenderingTechniqueBit::kDepth;
+			m_shaderTechniques |= ShaderTechniqueBit::kMeshSaders;
+		}
+		else if(t.m_name.getBegin() == CString("ShadowsSwMeshletRendering"))
+		{
+			m_techniquesMask |= RenderingTechniqueBit::kDepth;
+			m_shaderTechniques |= ShaderTechniqueBit::kSwMeshletRendering;
 		}
 		else if(t.m_name.getBegin() == CString("RtShadows"))
 		{
@@ -187,6 +208,7 @@ Error MaterialResource::parseShaderProgram(XmlElement shaderProgramEl, Bool asyn
 		else if(t.m_name.getBegin() == CString("Forward"))
 		{
 			m_techniquesMask |= RenderingTechniqueBit::kForward;
+			m_shaderTechniques |= ShaderTechniqueBit::kLegacy;
 		}
 		else if(t.m_name.getBegin() == CString("CommonTask"))
 		{
@@ -503,25 +525,24 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 		key.setVelocity(false);
 	}
 
-	if(!GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+	const Bool meshShadersSupported = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
+	ANKI_ASSERT(!(key.getMeshletRendering() && (!meshShadersSupported && !g_meshletRenderingCVar.get()))
+				&& "Can't be asking for meshlet rendering if mesh shaders or SW meshlet rendering are not supported/enabled");
+	if(key.getMeshletRendering() && !(m_shaderTechniques & (ShaderTechniqueBit::kMeshSaders | ShaderTechniqueBit::kSwMeshletRendering)))
 	{
-		key.setMeshShaders(false);
+		key.setMeshletRendering(false);
 	}
 
 	ANKI_ASSERT(!key.getSkinned() || !!(m_presentBuildinMutatorMask & U32(1 << BuiltinMutatorId::kBones)));
 	ANKI_ASSERT(!key.getVelocity() || !!(m_presentBuildinMutatorMask & U32(1 << BuiltinMutatorId::kVelocity)));
 
-	MaterialVariant& variant = m_variantMatrix[key.getRenderingTechnique()][key.getSkinned()][key.getVelocity()][key.getMeshShaders()];
+	MaterialVariant& variant = m_variantMatrix[key.getRenderingTechnique()][key.getSkinned()][key.getVelocity()][key.getMeshletRendering()];
 
 	// Check if it's initialized
 	{
 		RLockGuard<RWMutex> lock(m_variantMatrixMtx);
 		if(variant.m_prog.isCreated()) [[likely]]
 		{
-			if(!(RenderingTechniqueBit(1 << key.getRenderingTechnique()) & RenderingTechniqueBit::kAllRt))
-			{
-				ANKI_ASSERT(key.getMeshShaders() == !!(variant.m_prog->getShaderTypes() & ShaderTypeBit::kAllModernGeometry));
-			}
 			return variant;
 		}
 	}
@@ -555,37 +576,38 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 	switch(key.getRenderingTechnique())
 	{
 	case RenderingTechnique::kGBuffer:
-		if(key.getMeshShaders())
+		if(key.getMeshletRendering() && meshShadersSupported)
 		{
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kMesh | ShaderTypeBit::kFragment, "GBufferMesh");
+			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kMesh | ShaderTypeBit::kFragment, "GBufferMeshShaders");
 			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kTask, "CommonTask");
 		}
+		else if(key.getMeshletRendering())
+		{
+			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "GBufferSwMeshletRendering");
+		}
 		else
 		{
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "GBuffer");
+			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "GBufferLegacy");
 		}
 		break;
 	case RenderingTechnique::kDepth:
-		if(key.getMeshShaders())
+		if(key.getMeshletRendering() && meshShadersSupported)
 		{
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kMesh | ShaderTypeBit::kFragment, "ShadowsMesh");
+			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kMesh | ShaderTypeBit::kFragment, "ShadowsMeshShaders");
 			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kTask, "CommonTask");
 		}
-		else
+		else if(key.getMeshletRendering())
 		{
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "Shadows");
-		}
-		break;
-	case RenderingTechnique::kForward:
-		if(key.getMeshShaders())
-		{
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kAllModernGeometry | ShaderTypeBit::kFragment, "Forward");
+			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "ShadowsSwMeshletRendering");
 		}
 		else
 		{
-			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "Forward");
+			initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "ShadowsLegacy");
 		}
 		break;
+	case RenderingTechnique::kForward:
+		initInfo.requestTechniqueAndTypes(ShaderTypeBit::kVertex | ShaderTypeBit::kFragment, "Forward");
+		break;
 	case RenderingTechnique::kRtShadow:
 		initInfo.requestTechniqueAndTypes(ShaderTypeBit::kAllHit, "RtShadows");
 		break;
@@ -607,10 +629,6 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 	{
 		variant.m_rtShaderGroupHandleIndex = progVariant->getShaderGroupHandleIndex();
 	}
-	else
-	{
-		ANKI_ASSERT(key.getMeshShaders() == !!(variant.m_prog->getShaderTypes() & ShaderTypeBit::kAllModernGeometry));
-	}
 
 	return variant;
 }

+ 11 - 1
AnKi/Resource/MaterialResource.h

@@ -227,9 +227,18 @@ private:
 		MutatorValue m_value;
 	};
 
+	enum class ShaderTechniqueBit : U8 ///< Bitmask of the technique flavours found while parsing the shader program's techniques.
+	{
+		kNone = 0,
+		kLegacy = 1 << 0, ///< Set when a "GBufferLegacy", "ShadowsLegacy" or "Forward" technique is present.
+		kMeshSaders = 1 << 1, ///< Set when "GBufferMeshShaders" or "ShadowsMeshShaders" is present. NOTE(review): looks like a typo of kMeshShaders.
+		kSwMeshletRendering = 1 << 2 ///< Set when "GBufferSwMeshletRendering" or "ShadowsSwMeshletRendering" is present.
+	};
+	ANKI_ENUM_ALLOW_NUMERIC_OPERATIONS_FRIEND(ShaderTechniqueBit)
+
 	ShaderProgramResourcePtr m_prog;
 
-	mutable Array4d<MaterialVariant, U(RenderingTechnique::kCount), 2, 2, 2> m_variantMatrix; ///< [technique][skinned][vel][meshShader]
+	mutable Array4d<MaterialVariant, U(RenderingTechnique::kCount), 2, 2, 2> m_variantMatrix; ///< [technique][skinned][vel][meshletRendering]
 	mutable RWMutex m_variantMatrixMtx;
 
 	ResourceDynamicArray<PartialMutation> m_partialMutation; ///< Only with the non-builtins.
@@ -243,6 +252,7 @@ private:
 
 	Bool m_supportsSkinning = false;
 	RenderingTechniqueBit m_techniquesMask = RenderingTechniqueBit::kNone;
+	ShaderTechniqueBit m_shaderTechniques = ShaderTechniqueBit::kNone;
 
 	Error parseMutators(XmlElement mutatorsEl);
 	Error parseShaderProgram(XmlElement techniqueEl, Bool async);

+ 5 - 2
AnKi/Resource/MeshResource.cpp

@@ -12,6 +12,9 @@
 
 namespace anki {
 
+// Forward
+extern BoolCVar g_meshletRenderingCVar;
+
 class MeshResource::LoadContext
 {
 public:
@@ -142,7 +145,7 @@ Error MeshResource::load(const ResourceFilename& filename, Bool async)
 		}
 
 		// Meshlet
-		if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+		if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders || g_meshletRenderingCVar.get())
 		{
 			const PtrSize meshletIndicesSize = header.m_meshletPrimitiveCounts[l] * sizeof(U8Vec4);
 			lod.m_meshletIndices = UnifiedGeometryBuffer::getSingleton().allocate(meshletIndicesSize, sizeof(U8Vec4));
@@ -346,7 +349,7 @@ Error MeshResource::loadAsync(MeshBinaryLoader& loader) const
 				outMeshletBoundingVolume.m_aabbMax = inMeshlet.m_boundingVolume.m_aabbMax;
 				outMeshletBoundingVolume.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm =
 					packSnorm4x8(Vec4(inMeshlet.m_coneDirection, cos(inMeshlet.m_coneAngle / 2.0f)));
-				outMeshletBoundingVolume.m_coneApex_R8G8B8A8_Snorm = packSnorm4x8(inMeshlet.m_coneApex.xyz0());
+				outMeshletBoundingVolume.m_coneApex = inMeshlet.m_coneApex;
 				outMeshletBoundingVolume.m_sphereRadius =
 					((outMeshletBoundingVolume.m_aabbMin + outMeshletBoundingVolume.m_aabbMax) / 2.0f - outMeshletBoundingVolume.m_aabbMax)
 						.getLength();

+ 4 - 1
AnKi/Resource/ModelResource.cpp

@@ -11,6 +11,9 @@
 
 namespace anki {
 
+// Forward
+extern BoolCVar g_meshletRenderingCVar;
+
 void ModelPatch::getGeometryInfo(U32 lod, ModelPatchGeometryInfo& inf) const
 {
 	lod = min<U32>(lod, m_meshLodCount - 1);
@@ -115,7 +118,7 @@ Error ModelPatch::init([[maybe_unused]] ModelResource* model, CString meshFName,
 			}
 		}
 
-		if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+		if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders || g_meshletRenderingCVar.get())
 		{
 			U32 dummy;
 			m_mesh->getMeshletBufferInfo(l, lod.m_meshletBoundingVolumesUgbOffset, lod.m_meshletGometryDescriptorsUgbOffset, dummy);

+ 9 - 9
AnKi/Resource/RenderingKey.h

@@ -40,12 +40,12 @@ ANKI_ENUM_ALLOW_NUMERIC_OPERATIONS(RenderingTechniqueBit)
 class RenderingKey
 {
 public:
-	RenderingKey(RenderingTechnique technique, U32 lod, Bool skinned, Bool velocity, Bool meshShaders)
+	RenderingKey(RenderingTechnique technique, U32 lod, Bool skinned, Bool velocity, Bool meshletRendering)
 		: m_technique(technique)
 		, m_lod(lod & 0b11)
 		, m_skinned(skinned)
 		, m_velocity(velocity)
-		, m_meshShaders(meshShaders)
+		, m_meshletRendering(meshletRendering)
 	{
 		ANKI_ASSERT(lod < kMaxLodCount);
 	}
@@ -56,7 +56,7 @@ public:
 	}
 
 	RenderingKey(const RenderingKey& b)
-		: RenderingKey(b.m_technique, b.m_lod, b.m_skinned, b.m_velocity, b.m_meshShaders)
+		: RenderingKey(b.m_technique, b.m_lod, b.m_skinned, b.m_velocity, b.m_meshletRendering)
 	{
 	}
 
@@ -69,7 +69,7 @@ public:
 	Bool operator==(const RenderingKey& b) const
 	{
 		return m_technique == b.m_technique && m_lod == b.m_lod && m_skinned == b.m_skinned && m_velocity == b.m_velocity
-			   && m_meshShaders == b.m_meshShaders;
+			   && m_meshletRendering == b.m_meshletRendering;
 	}
 
 	RenderingTechnique getRenderingTechnique() const
@@ -113,14 +113,14 @@ public:
 		m_velocity = v;
 	}
 
-	void setMeshShaders(Bool b)
+	void setMeshletRendering(Bool b)
 	{
-		m_meshShaders = b;
+		m_meshletRendering = b;
 	}
 
-	Bool getMeshShaders() const
+	Bool getMeshletRendering() const
 	{
-		return m_meshShaders;
+		return m_meshletRendering;
 	}
 
 private:
@@ -128,7 +128,7 @@ private:
 	U8 m_lod : 2;
 	Bool m_skinned : 1;
 	Bool m_velocity : 1;
-	Bool m_meshShaders : 1;
+	Bool m_meshletRendering : 1;
 
 	static_assert(kMaxLodCount <= 3, "m_lod only reserves 2 bits so make sure all LODs will fit");
 };

+ 1 - 1
AnKi/Resource/ShaderProgramResource.h

@@ -88,7 +88,7 @@ public:
 
 private:
 	static constexpr U32 kMaxMutators = 32;
-	static constexpr U32 kMaxTechniqueNameLength = 23;
+	static constexpr U32 kMaxTechniqueNameLength = 32;
 
 	ShaderProgramResourcePtr m_ptr;
 

+ 3 - 3
AnKi/Scene/Components/ModelComponent.cpp

@@ -11,6 +11,7 @@
 #include <AnKi/Resource/ModelResource.h>
 #include <AnKi/Resource/ResourceManager.h>
 #include <AnKi/Shaders/Include/GpuSceneFunctions.h>
+#include <AnKi/Core/App.h>
 
 namespace anki {
 
@@ -273,7 +274,7 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 				key.setRenderingTechnique(t);
 				key.setSkinned(hasSkin);
 				key.setVelocity(moved);
-				key.setMeshShaders(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders);
+				key.setMeshletRendering(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders || g_meshletRenderingCVar.get());
 
 				const MaterialVariant& mvariant = m_model->getModelPatches()[i].getMaterial()->getOrCreateVariant(key);
 
@@ -284,8 +285,7 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 
 				ModelPatchGeometryInfo inf;
 				m_model->getModelPatches()[i].getGeometryInfo(0, inf);
-				const Bool wantsMesletCount = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders
-											  && !(RenderingTechniqueBit(1 << t) & RenderingTechniqueBit::kAllRt);
+				const Bool wantsMesletCount = key.getMeshletRendering() && !(RenderingTechniqueBit(1 << t) & RenderingTechniqueBit::kAllRt);
 				m_patchInfos[i].m_renderStateBucketIndices[t] =
 					RenderStateBucketContainer::getSingleton().addUser(state, t, (wantsMesletCount) ? inf.m_meshletCount : 0);
 			}

+ 7 - 11
AnKi/Scene/RenderStateBucket.cpp

@@ -24,15 +24,6 @@ RenderStateBucketContainer::~RenderStateBucketContainer()
 
 RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo& state, RenderingTechnique technique, U32 lod0MeshletCount)
 {
-	if(!!(state.m_program->getShaderTypes() & ShaderTypeBit::kAllModernGeometry))
-	{
-		ANKI_ASSERT(lod0MeshletCount > 0);
-	}
-	else
-	{
-		ANKI_ASSERT(lod0MeshletCount == 0);
-	}
-
 	// Compute state hash
 	Array<U64, 3> toHash;
 	toHash[0] = state.m_program->getUuid();
@@ -59,11 +50,12 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 		{
 			++buckets[i].m_userCount;
 			buckets[i].m_meshletGroupCount += meshletGroupCount;
+			buckets[i].m_lod0MeshletCount += lod0MeshletCount;
 
 			if(buckets[i].m_userCount == 1)
 			{
 				ANKI_ASSERT(!buckets[i].m_program.isCreated());
-				ANKI_ASSERT(buckets[i].m_meshletGroupCount == meshletGroupCount);
+				// First user of a previously empty bucket: each running total must equal this user's own contribution.
+				// The meshlet total is checked against m_lod0MeshletCount (not m_meshletGroupCount, which counts groups).
+				ANKI_ASSERT(buckets[i].m_meshletGroupCount == meshletGroupCount && buckets[i].m_lod0MeshletCount == lod0MeshletCount);
 				buckets[i].m_program = state.m_program;
 				++m_activeBucketCount[technique];
 			}
@@ -86,6 +78,7 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	newBucket.m_program = state.m_program;
 	newBucket.m_userCount = 1;
 	newBucket.m_meshletGroupCount = meshletGroupCount;
+	newBucket.m_lod0MeshletCount = lod0MeshletCount;
 
 	++m_activeBucketCount[technique];
 
@@ -104,6 +97,7 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 	const RenderingTechnique technique = bucketIndex.m_technique;
 	const U32 idx = bucketIndex.m_index;
 	// NOTE(review): '/' binds tighter than '+', so this evaluates as m_lod0MeshletCount + ((kMeshletGroupSize - 1) / kMeshletGroupSize),
 	// not as a round-up division ((count + size - 1) / size). Confirm the intent and keep it symmetric with whatever addUser() computes,
 	// otherwise the per-bucket and per-technique group counters will not return to zero.
 	const U32 meshletGroupCount = bucketIndex.m_lod0MeshletCount + (kMeshletGroupSize - 1) / kMeshletGroupSize;
+	const U32 meshletCount = bucketIndex.m_lod0MeshletCount;
 	bucketIndex.invalidate();
 
 	LockGuard lock(m_mtx);
@@ -117,10 +111,12 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 	m_meshletGroupCount[technique] -= meshletGroupCount;
 
 	ExtendedBucket& bucket = m_buckets[technique][idx];
-	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_meshletGroupCount >= meshletGroupCount);
+	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_meshletGroupCount >= meshletGroupCount
+				&& bucket.m_lod0MeshletCount >= meshletCount);
 
 	--bucket.m_userCount;
 	bucket.m_meshletGroupCount -= meshletGroupCount;
+	bucket.m_lod0MeshletCount -= meshletCount;
 
 	if(bucket.m_userCount == 0)
 	{

+ 2 - 1
AnKi/Scene/RenderStateBucket.h

@@ -96,7 +96,7 @@ public:
 	{
 		for(const ExtendedBucket& b : m_buckets[technique])
 		{
-			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_meshletGroupCount);
+			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_meshletGroupCount, b.m_lod0MeshletCount);
 		}
 	}
 
@@ -131,6 +131,7 @@ private:
 		U64 m_hash = 0;
 		U32 m_userCount = 0;
 		U32 m_meshletGroupCount = 0;
+		U32 m_lod0MeshletCount = 0;
 	};
 
 	Array<SceneDynamicArray<ExtendedBucket>, U32(RenderingTechnique::kCount)> m_buckets;

+ 39 - 28
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -46,7 +46,7 @@
 #endif
 
 // General defines
-#define GBUFFER (ANKI_TECHNIQUE_GBuffer || ANKI_TECHNIQUE_GBufferMesh)
+#define GBUFFER (ANKI_TECHNIQUE_GBufferLegacy || ANKI_TECHNIQUE_GBufferMeshShaders || ANKI_TECHNIQUE_GBufferSwMeshletRendering)
 #define REALLY_ALPHA_TEST (ALPHA_TEST && DIFFUSE_TEX)
 #define UVS (GBUFFER || REALLY_ALPHA_TEST)
 #define REALLY_VELOCITY ((ANKI_VELOCITY || ANKI_BONES) && GBUFFER)
@@ -233,7 +233,7 @@ void velocity(Mat3x4 worldTransform, Mat3x4 prevWorldTransform, Vec3 prevLocalPo
 // ===========================================================================
 #if ANKI_VERTEX_SHADER
 
-#	define SW_MESHLETS (ANKI_TECHNIQUE_GBufferMeshlet || ANKI_TECHNIQUE_ShadowsMeshlet)
+#	define SW_MESHLETS (ANKI_TECHNIQUE_GBufferSwMeshletRendering || ANKI_TECHNIQUE_ShadowsSwMeshletRendering)
 
 VertOut main(VertIn input)
 {
@@ -246,11 +246,16 @@ VertOut main(VertIn input)
 	{
 		// Discard the primitive
 		output = (VertOut)0;
+		output.m_svPosition = kNaN;
 		output.m_constantsOffset = instance.m_constantsOffset;
 		return output;
 	}
 
-	UnpackedMeshVertex vert = loadVertex(meshlet, input.m_svVertexId, ANKI_BONES);
+	const U32 primitiveId = input.m_svVertexId / 3u;
+	const UVec3 localIndices = g_unifiedGeom_R8G8B8A8_Uint[meshlet.m_firstPrimitive + primitiveId].xyz;
+	const U32 localIdx = localIndices[input.m_svVertexId % 3u];
+
+	UnpackedMeshVertex vert = loadVertex(meshlet, localIdx, ANKI_BONES);
 #	else
 	const GpuSceneRenderableVertex instance = unpackGpuSceneRenderableVertex(input.m_instanceData);
 	const GpuSceneMeshLod mesh = g_meshLods[instance.m_meshLodIndex];
@@ -532,14 +537,14 @@ void main()
 #	elif !GBUFFER && REALLY_ALPHA_TEST // Shadows WITH alpha
 
 void main(
-#		if ANKI_TECHNIQUE_ShadowsMesh
+#		if ANKI_TECHNIQUE_ShadowsMeshShaders
 	MeshPerVertOut vertInput, MeshPerPrimitiveOut primInput
 #		else
 	VertOut vertInput
 #		endif
 )
 {
-#		if ANKI_TECHNIQUE_ShadowsMesh
+#		if ANKI_TECHNIQUE_ShadowsMeshShaders
 	const U32 constantsOffset = primInput.m_constantsOffset;
 #		else
 	const U32 constantsOffset = vertInput.m_constantsOffset;
@@ -556,14 +561,14 @@ void main(
 #	else // GBUFFER
 
 FragOut main(
-#		if ANKI_TECHNIQUE_GBufferMesh
+#		if ANKI_TECHNIQUE_GBufferMeshShaders
 	MeshPerVertOut vertInput, MeshPerPrimitiveOut primInput
 #		else
 	VertOut vertInput
 #		endif
 )
 {
-#		if ANKI_TECHNIQUE_GBufferMesh
+#		if ANKI_TECHNIQUE_GBufferMeshShaders
 	const U32 constantsOffset = primInput.m_constantsOffset;
 #		else
 	const U32 constantsOffset = vertInput.m_constantsOffset;
@@ -641,7 +646,7 @@ FragOut main(
 	g.m_metallic = metallic;
 	g.m_velocity = velocity;
 
-#		if VISUALIZE_MESHLETS && ANKI_TECHNIQUE_GBufferMesh
+#		if VISUALIZE_MESHLETS && ANKI_TECHNIQUE_GBufferMeshShaders
 	const U32 meshletIdx = primInput.m_meshletIndex % 6u;
 	switch(meshletIdx)
 	{
@@ -735,38 +740,44 @@ FragOut main(
 // ===========================================================================
 // Define the techniques                                                     =
 // ===========================================================================
-#pragma anki technique_start vert GBuffer uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
-#pragma anki technique_end vert GBuffer
+#pragma anki technique_start vert GBufferLegacy uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end vert GBufferLegacy
 
-#pragma anki technique_start vert Shadows uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
-#pragma anki technique_end vert Shadows
+#pragma anki technique_start vert ShadowsLegacy uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end vert ShadowsLegacy
 
-#pragma anki technique_start vert GBufferMeshlet uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
-#pragma anki technique_end vert GBufferMeshlet
+#pragma anki technique_start vert GBufferSwMeshletRendering uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end vert GBufferSwMeshletRendering
 
-#pragma anki technique_start vert ShadowsMeshlet uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
-#pragma anki technique_end vert ShadowsMeshlet
+#pragma anki technique_start vert ShadowsSwMeshletRendering uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end vert ShadowsSwMeshletRendering
 
 #pragma anki technique_start task CommonTask uses_mutators
 #pragma anki technique_end task CommonTask
 
-#pragma anki technique_start mesh GBufferMesh uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
-#pragma anki technique_end mesh GBufferMesh
+#pragma anki technique_start mesh GBufferMeshShaders uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end mesh GBufferMeshShaders
+
+#pragma anki technique_start mesh ShadowsMeshShaders uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end mesh ShadowsMeshShaders
+
+#pragma anki technique_start frag GBufferLegacy
+#pragma anki technique_end frag GBufferLegacy
 
-#pragma anki technique_start mesh ShadowsMesh uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
-#pragma anki technique_end mesh ShadowsMesh
+#pragma anki technique_start frag ShadowsLegacy
+#pragma anki technique_end frag ShadowsLegacy
 
-#pragma anki technique_start frag GBuffer
-#pragma anki technique_end frag GBuffer
+#pragma anki technique_start frag GBufferMeshShaders
+#pragma anki technique_end frag GBufferMeshShaders
 
-#pragma anki technique_start frag Shadows
-#pragma anki technique_end frag Shadows
+#pragma anki technique_start frag ShadowsMeshShaders
+#pragma anki technique_end frag ShadowsMeshShaders
 
-#pragma anki technique_start frag GBufferMesh
-#pragma anki technique_end frag GBufferMesh
+#pragma anki technique_start frag GBufferSwMeshletRendering
+#pragma anki technique_end frag GBufferSwMeshletRendering
 
-#pragma anki technique_start frag ShadowsMesh
-#pragma anki technique_end frag ShadowsMesh
+#pragma anki technique_start frag ShadowsSwMeshletRendering
+#pragma anki technique_end frag ShadowsSwMeshletRendering
 
 #pragma anki technique_start ahit RtShadows uses_mutators ALPHA_TEST DIFFUSE_TEX
 #pragma anki technique_end ahit RtShadows

+ 18 - 14
AnKi/Shaders/GpuVisibilityMeshlet.ankiprog

@@ -10,23 +10,24 @@
 #include <AnKi/Shaders/Common.hlsl>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
+#include <AnKi/Shaders/PackFunctions.hlsl>
 
-#define MESHLET_BACKFACE_CULLING 0
+#define MESHLET_BACKFACE_CULLING 1
 #define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
 #define MESHLET_NO_SAMPLING_POINT_CULLING 1
 #define MESHLET_HZB_CULLING HZB_TEST
 
 #define THREADGROUP_SIZE ANKI_TASK_SHADER_THREADGROUP_SIZE
 
-[[vk::binding(0, 0)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
-[[vk::binding(0, 1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
-[[vk::binding(0, 2)]] StructuredBuffer<GpuSceneMeshLod> g_meshLods;
-[[vk::binding(0, 3)]] ByteAddressBuffer g_gpuScene;
-[[vk::binding(0, 4)]] StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes;
-[[vk::binding(0, 5)]] RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArg;
-[[vk::binding(0, 6)]] RWStructuredBuffer<GpuSceneMeshletInstance> g_drawInstances;
-[[vk::binding(0, 7)]] Texture2D<Vec4> g_hzbTexture;
-[[vk::binding(0, 8)]] SamplerState g_nearestClampSampler;
+[[vk::binding(0)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
+[[vk::binding(1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
+[[vk::binding(2)]] StructuredBuffer<GpuSceneMeshLod> g_meshLods;
+[[vk::binding(3)]] ByteAddressBuffer g_gpuScene;
+[[vk::binding(4)]] StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes;
+[[vk::binding(5)]] RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArg;
+[[vk::binding(6)]] RWStructuredBuffer<GpuSceneMeshletInstance> g_drawInstances;
+[[vk::binding(7)]] Texture2D<Vec4> g_hzbTexture;
+[[vk::binding(8)]] SamplerState g_nearestClampSampler;
 
 struct MaterialGlobalConstants
 {
@@ -53,6 +54,7 @@ struct MaterialGlobalConstants
 	U32 firstMeshletBoundingVolume = meshletGroup * kMeshletGroupSize;
 	const U32 meshletCount = min(kMeshletGroupSize, meshLod.m_meshletCount - firstMeshletBoundingVolume);
 	firstMeshletBoundingVolume += meshLod.m_firstMeshletBoundingVolume;
+	const U32 firstMeshletGeometryDescriptor = meshletGroup * kMeshletGroupSize + meshLod.m_firstMeshletGeometryDescriptor;
 
 	// Meshlet culling
 	if(svGroupIndex < meshletCount)
@@ -63,7 +65,9 @@ struct MaterialGlobalConstants
 		const Mat3x4 worldTransform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
 
 #if MESHLET_BACKFACE_CULLING
-		cull = cullBackfaceMeshlet(meshletBoundingVol, worldTransform, g_consts.m_cameraTransform.getTranslationPart());
+		const Vec4 coneDirAndAng = unpackSnorm4x8(meshletBoundingVol.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
+		cull = cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshletBoundingVol.m_coneApex, worldTransform,
+								   g_consts.m_cameraTransform.getTranslationPart());
 #endif
 
 		const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
@@ -80,8 +84,8 @@ struct MaterialGlobalConstants
 
 #if MESHLET_NO_SAMPLING_POINT_CULLING
 		// Sampling points test
-		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_consts.m_viewportSizef.x;
-		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_consts.m_viewportSizef.y;
+		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_consts.m_viewportSizef;
+		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_consts.m_viewportSizef;
 		cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
 #endif
 
@@ -102,7 +106,7 @@ struct MaterialGlobalConstants
 			GpuSceneMeshletInstance instance;
 			instance.m_worldTransformsOffset = renderable.m_worldTransformsOffset;
 			instance.m_constantsOffset = renderable.m_constantsOffset;
-			instance.m_meshletGeometryDescriptorIndex = meshLod.m_firstMeshletGeometryDescriptor + svGroupIndex;
+			instance.m_meshletGeometryDescriptorIndex = firstMeshletGeometryDescriptor + svGroupIndex;
 			instance.m_boneTransformsOrParticleEmitterOffset =
 				(renderable.m_particleEmitterOffset) ? renderable.m_particleEmitterOffset : renderable.m_boneTransformsOffset;
 

+ 1 - 1
AnKi/Shaders/Include/ClusteredShadingTypes.h

@@ -54,7 +54,7 @@ struct PointLight
 	RVec3 m_diffuseColor;
 	U32 m_padding1;
 
-	F32 m_shadow;
+	U32 m_shadow;
 	F32 m_padding2;
 	U32 m_padding3;
 	U32 m_padding4;

+ 1 - 0
AnKi/Shaders/Include/Common.h

@@ -466,6 +466,7 @@ constexpr F16 kMinF16 = (F16)0.00006104;
 #	endif
 
 constexpr F32 kPi = 3.14159265358979323846f;
+constexpr F32 kNaN = 0.0f / 0.0f;
 
 //! == GLSL ============================================================================================================
 #else

+ 2 - 2
AnKi/Shaders/Include/MeshTypes.h

@@ -90,9 +90,9 @@ struct MeshletBoundingVolume
 	F32 m_sphereRadius;
 
 	Vec3 m_aabbMax;
-	U32 m_coneApex_R8G8B8A8_Snorm;
+	U32 m_padding;
 
-	Vec3 m_padding;
+	Vec3 m_coneApex;
 	U32 m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm;
 };
 

+ 1 - 1
AnKi/Shaders/LightShading.ankiprog

@@ -224,7 +224,7 @@ RVec3 main(Vec4 svPosition : SV_POSITION, Vec2 uv : TEXCOORD) : SV_TARGET0
 
 		LIGHTING_COMMON_BRDF();
 
-		[branch] if(light.m_shadowAtlasTileScale >= 0.0)
+		[branch] if(light.m_shadow)
 		{
 			const RF32 shadow = resolvedSm[resolvedSmIdx++];
 			lambert *= shadow;

+ 3 - 16
AnKi/Shaders/MaterialShadersCommon.hlsl

@@ -10,8 +10,8 @@
 #include <AnKi/Shaders/Include/MaterialTypes.h>
 #include <AnKi/Shaders/Include/MeshTypes.h>
 #include <AnKi/Shaders/Include/GpuSceneFunctions.h>
-#include <AnKi/Shaders/PackFunctions.hlsl>
 #include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
+#include <AnKi/Shaders/PackFunctions.hlsl>
 
 ANKI_BINDLESS_SET(MaterialSet::kBindless)
 
@@ -87,19 +87,6 @@ UnpackedMeshVertex loadVertex(MeshletGeometryDescriptor meshlet, U32 vertexIndex
 
 Bool cullBackfaceMeshlet(MeshletBoundingVolume meshlet, Mat3x4 worldTransform, Vec3 cameraWorldPos) // Convenience overload: unpacks the meshlet's packed cone and forwards it
 {
-	const Vec4 coneData = unpackSnorm4x8(meshlet.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
-
-	Vec3 center = (meshlet.m_aabbMin + meshlet.m_aabbMax) / 2.0f;
-
-	center = mul(worldTransform, Vec4(center, 1.0f));
-	const Vec3 coneAxisWspace = normalize(mul(worldTransform, Vec4(coneData.xyz, 0.0f)));
-
-	// Extract uniform scale
-	const Vec3 xAxis = Vec3(worldTransform.m_row0.x, worldTransform.m_row1.x, worldTransform.m_row2.x);
-	const F32 uniformScale = length(xAxis);
-
-	meshlet.m_sphereRadius *= uniformScale;
-
-	const Vec3 dir = center - cameraWorldPos;
-	return dot(dir, coneAxisWspace) >= coneData.w * length(dir) + meshlet.m_sphereRadius;
+	const Vec4 coneDirAndAng = unpackSnorm4x8(meshlet.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm); // xyz: cone direction, w: cosine of the cone half angle
+	return cullBackfaceMeshlet(coneDirAndAng.xyz, coneDirAndAng.w, meshlet.m_coneApex, worldTransform, cameraWorldPos); // Forward to the overload that takes the cone parameters explicitly
 }

+ 1 - 1
AnKi/Shaders/ShadowmapsResolve.ankiprog

@@ -175,7 +175,7 @@ RVec4 main(Vec2 uv : TEXCOORD) : SV_TARGET0
 	{
 		const PointLight light = g_pointLights[idx];
 
-		[branch] if(light.m_shadowAtlasTileScale >= 0.0)
+		[branch] if(light.m_shadow)
 		{
 			const Vec3 frag2Light = light.m_position - worldPos;
 

+ 10 - 0
AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl

@@ -252,3 +252,13 @@ Bool cullHzb(Vec2 aabbMinNdc, Vec2 aabbMaxNdc, F32 aabbMinDepth, Texture2D<Vec4>
 
 	return (aabbMinDepth > maxDepth);
 }
+
+/// Backface test of a meshlet cone against the camera. Returns true when the normalized camera-to-apex direction and the world space cone axis have a dot product >= coneCosHalfAngle. All cone values in local space.
+Bool cullBackfaceMeshlet(Vec3 coneDirection, F32 coneCosHalfAngle, Vec3 coneApex, Mat3x4 worldTransform, Vec3 cameraWorldPos)
+{
+	const Vec3 apexWSpace = mul(worldTransform, Vec4(coneApex, 1.0f)); // w=1: the apex is transformed as a point
+	const Vec3 coneAxisWSpace = normalize(mul(worldTransform, Vec4(coneDirection, 0.0f))); // w=0: the axis is transformed as a direction, then re-normalized
+
+	const Vec3 dir = normalize(apexWSpace - cameraWorldPos); // Unit vector from the camera towards the world space apex
+	return dot(dir, coneAxisWSpace) >= coneCosHalfAngle; // Both vectors are unit length, so the dot product is the cosine of the angle between them
+}

+ 1 - 1
AnKi/Shaders/VolumetricLightingAccumulation.ankiprog

@@ -126,7 +126,7 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 		factor *= phaseFunction(viewDir, normalize(worldPos - light.m_position), kPhaseFunctionAnisotropy);
 
 #if ENABLE_SHADOWS
-		if(light.m_shadowAtlasTileScale >= 0.0)
+		if(light.m_shadow)
 		{
 			factor *= computeShadowFactorPointLight(light, frag2Light, g_shadowAtlasTex, g_linearAnyClampShadowSampler);
 		}

BIN
Samples/SimpleScene/Assets/Mesh_0_d56f58fc33de003f.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_1_266a0dd9d2092f46.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_2_be53007bec464649.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_3_c026fdb5b74773ed.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_4_4d4aae6c030c4fd5.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_5_629309b27fa549a7.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_6_a078cf217893be6f.ankimesh


BIN
Samples/SimpleScene/Assets/Mesh_7_4b76b132380d8a62.ankimesh