浏览代码

Add the S/W mesh shading in the GPU tests and the drawer

Panagiotis Christopoulos Charitos 1 年之前
父节点
当前提交
9dadc44644

+ 0 - 1
AnKi/Gr/Common.h

@@ -385,7 +385,6 @@ enum class VertexStepRate : U8
 {
 	kVertex,
 	kInstance,
-	kDraw,
 	kCount
 };
 

+ 38 - 29
AnKi/Renderer/ForwardShading.cpp

@@ -45,44 +45,50 @@ void ForwardShading::populateRenderGraph(RenderingContext& ctx)
 void ForwardShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgraphCtx)
 {
 	ANKI_TRACE_SCOPED_EVENT(ForwardShading);
+
 	CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
-	// Set state
-	cmdb.setDepthWrite(false);
-	cmdb.setBlendFactors(0, BlendFactor::kSrcAlpha, BlendFactor::kOneMinusSrcAlpha);
+	if(m_runCtx.m_visOut.containsDrawcalls())
+	{
+		// Set state
+		cmdb.setDepthWrite(false);
+		cmdb.setBlendFactors(0, BlendFactor::kSrcAlpha, BlendFactor::kOneMinusSrcAlpha);
 
-	// Bind stuff
-	const U32 set = U32(MaterialSet::kGlobal);
-	cmdb.bindSampler(set, U32(MaterialBinding::kLinearClampSampler), getRenderer().getSamplers().m_trilinearClamp.get());
-	cmdb.bindSampler(set, U32(MaterialBinding::kShadowSampler), getRenderer().getSamplers().m_trilinearClampShadow.get());
+		// Bind stuff
+		const U32 set = U32(MaterialSet::kGlobal);
+		cmdb.bindSampler(set, U32(MaterialBinding::kLinearClampSampler), getRenderer().getSamplers().m_trilinearClamp.get());
+		cmdb.bindSampler(set, U32(MaterialBinding::kShadowSampler), getRenderer().getSamplers().m_trilinearClampShadow.get());
 
-	rgraphCtx.bindTexture(set, U32(MaterialBinding::kDepthRt), getRenderer().getDepthDownscale().getRt(), DepthDownscale::kQuarterInternalResolution);
-	rgraphCtx.bindColorTexture(set, U32(MaterialBinding::kLightVolume), getRenderer().getVolumetricLightingAccumulation().getRt());
+		rgraphCtx.bindTexture(set, U32(MaterialBinding::kDepthRt), getRenderer().getDepthDownscale().getRt(),
+							  DepthDownscale::kQuarterInternalResolution);
+		rgraphCtx.bindColorTexture(set, U32(MaterialBinding::kLightVolume), getRenderer().getVolumetricLightingAccumulation().getRt());
 
-	cmdb.bindConstantBuffer(set, U32(MaterialBinding::kClusterShadingConstants), getRenderer().getClusterBinning().getClusteredShadingConstants());
+		cmdb.bindConstantBuffer(set, U32(MaterialBinding::kClusterShadingConstants),
+								getRenderer().getClusterBinning().getClusteredShadingConstants());
 
-	cmdb.bindUavBuffer(set, U32(MaterialBinding::kClusterShadingLights),
-					   getRenderer().getClusterBinning().getPackedObjectsBuffer(GpuSceneNonRenderableObjectType::kLight));
+		cmdb.bindUavBuffer(set, U32(MaterialBinding::kClusterShadingLights),
+						   getRenderer().getClusterBinning().getPackedObjectsBuffer(GpuSceneNonRenderableObjectType::kLight));
 
-	rgraphCtx.bindColorTexture(set, U32(MaterialBinding::kClusterShadingLights) + 1, getRenderer().getShadowMapping().getShadowmapRt());
+		rgraphCtx.bindColorTexture(set, U32(MaterialBinding::kClusterShadingLights) + 1, getRenderer().getShadowMapping().getShadowmapRt());
 
-	cmdb.bindUavBuffer(set, U32(MaterialBinding::kClusters), getRenderer().getClusterBinning().getClustersBuffer());
+		cmdb.bindUavBuffer(set, U32(MaterialBinding::kClusters), getRenderer().getClusterBinning().getClustersBuffer());
 
-	// Draw
-	RenderableDrawerArguments args;
-	args.m_viewMatrix = ctx.m_matrices.m_view;
-	args.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
-	args.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjectionJitter;
-	args.m_previousViewProjectionMatrix = ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection;
-	args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
-	args.m_renderingTechinuqe = RenderingTechnique::kForward;
-	args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
-	args.fillMdi(m_runCtx.m_visOut);
-	getRenderer().getSceneDrawer().drawMdi(args, cmdb);
+		// Draw
+		RenderableDrawerArguments args;
+		args.m_viewMatrix = ctx.m_matrices.m_view;
+		args.m_cameraTransform = ctx.m_matrices.m_cameraTransform;
+		args.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjectionJitter;
+		args.m_previousViewProjectionMatrix = ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection;
+		args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
+		args.m_renderingTechinuqe = RenderingTechnique::kForward;
+		args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
+		args.fillMdi(m_runCtx.m_visOut);
+		getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 
-	// Restore state
-	cmdb.setDepthWrite(true);
-	cmdb.setBlendFactors(0, BlendFactor::kOne, BlendFactor::kZero);
+		// Restore state
+		cmdb.setDepthWrite(true);
+		cmdb.setBlendFactors(0, BlendFactor::kOne, BlendFactor::kZero);
+	}
 
 	// Do lens flares
 	getRenderer().getLensFlare().runDrawFlares(ctx, cmdb);
@@ -99,7 +105,10 @@ void ForwardShading::setDependencies(GraphicsRenderPassDescription& pass)
 		pass.newBufferDependency(getRenderer().getLensFlare().getIndirectDrawBuffer(), BufferUsageBit::kIndirectDraw);
 	}
 
-	pass.newBufferDependency(m_runCtx.m_visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+	if(m_runCtx.m_visOut.containsDrawcalls())
+	{
+		pass.newBufferDependency(m_runCtx.m_visOut.m_someBufferHandle, BufferUsageBit::kIndirectDraw);
+	}
 }
 
 } // end namespace anki

+ 4 - 2
AnKi/Renderer/ShadowMapping.cpp

@@ -548,8 +548,8 @@ BufferOffsetRange ShadowMapping::createVetVisibilityPass(CString passName, const
 	// The shader doesn't actually write to the handle but have it as a write dependency for the drawer to correctly wait for this pass
 	pass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kUavComputeWrite);
 
-	pass.setWork([this, &lightc, hashBuff = visOut.m_visiblesHashBuffer, mdiBuff = visOut.m_mdiDrawCountsBuffer, clearTileIndirectArgs,
-				  taskShadersIndirectArgs = visOut.m_taskShaderIndirectArgsBuffer](RenderPassWorkContext& rpass) {
+	pass.setWork([this, &lightc, hashBuff = visOut.m_visiblesHashBuffer, mdiBuff = visOut.m_legacy.m_mdiDrawCountsBuffer, clearTileIndirectArgs,
+				  taskShadersIndirectArgs = visOut.m_mesh.m_taskShaderIndirectArgsBuffer](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 		cmdb.bindShaderProgram(m_vetVisibilityGrProg.get());
@@ -564,6 +564,8 @@ BufferOffsetRange ShadowMapping::createVetVisibilityPass(CString passName, const
 		cmdb.bindUavBuffer(0, 4, clearTileIndirectArgs);
 		cmdb.bindUavBuffer(0, 5, taskShadersIndirectArgs);
 
+		// TODO add the s/w mesh stuff
+
 		ANKI_ASSERT(RenderStateBucketContainer::getSingleton().getBucketCount(RenderingTechnique::kDepth) <= 64 && "TODO");
 		cmdb.dispatchCompute(1, 1, 1);
 	});

+ 52 - 18
AnKi/Renderer/Utils/Drawer.cpp

@@ -63,7 +63,7 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 					   UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kMeshletGeometryDescriptors),
 					   UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
-	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kTaskShaderPayloads), args.m_taskShaderPayloadsBuffer);
+	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kTaskShaderPayloads), args.m_mesh.m_taskShaderPayloadsBuffer);
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kRenderables),
 					   GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kMeshLods), GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
@@ -101,7 +101,9 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 	setState(args, cmdb);
 
-	cmdb.bindVertexBuffer(0, args.m_instanceRateRenderablesBuffer.m_buffer, args.m_instanceRateRenderablesBuffer.m_offset,
+	const Bool meshShaderHwSupport = GrManager::getSingleton().getDeviceCapabilities().m_meshShaders;
+
+	cmdb.bindVertexBuffer(0, args.m_legacy.m_renderableInstancesBuffer.m_buffer, args.m_legacy.m_renderableInstancesBuffer.m_offset,
 						  sizeof(GpuSceneRenderableVertex), VertexStepRate::kInstance);
 
 	// Gather the drawcalls
@@ -119,7 +121,7 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 			PrimitiveTopology m_primitiveTopology;
 		};
 
-		class ModernDraw
+		class MeshDraw
 		{
 		public:
 			U32 m_firstPayload;
@@ -127,10 +129,20 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 			PtrSize m_taskShaderIndirectArgsBufferOffset;
 		};
 
+		class SwMeshDraw
+		{
+		public:
+			Buffer* m_drawIndirectArgsBuffer;
+			PtrSize m_drawIndirectArgsBufferOffset;
+			Buffer* m_instancesBuffer;
+			PtrSize m_instancesBufferOffset;
+		};
+
 		union
 		{
 			LegacyDraw m_legacyDraw;
-			ModernDraw m_modernDraw;
+			MeshDraw m_meshDraw;
+			SwMeshDraw m_swMeshDraw;
 		};
 
 		ShaderProgram* m_program;
@@ -146,6 +158,7 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 	U32 bucketCount = 0;
 	U32 allMeshletGroupCount = 0;
 	U32 legacyGeometryFlowUserCount = 0;
+	PtrSize meshletInstancesBufferOffset = 0;
 	RenderStateBucketContainer::getSingleton().iterateBuckets(
 		args.m_renderingTechinuqe, [&](const RenderStateInfo& state, U32 userCount, U32 meshletGroupCount) {
 			if(userCount == 0)
@@ -160,20 +173,32 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 			cmd.m_shaderBinarySize = U64(state.m_program->getShaderBinarySize(ShaderType::kFragment)) << 32u;
 			cmd.m_hasDiscard = state.m_program->hasDiscard();
 
-			const Bool usesMeshShaders = meshletGroupCount > 0;
+			const Bool meshlets = meshletGroupCount > 0;
 
-			if(usesMeshShaders)
+			if(meshlets && meshShaderHwSupport)
 			{
 				cmd.m_drawType = 2;
 				cmd.m_shaderBinarySize |= state.m_program->getShaderBinarySize(ShaderType::kMesh);
 
-				cmd.m_modernDraw.m_firstPayload = allMeshletGroupCount;
-				cmd.m_modernDraw.m_taskShaderIndirectArgsBuffer = args.m_taskShaderIndirectArgsBuffer.m_buffer;
-				cmd.m_modernDraw.m_taskShaderIndirectArgsBufferOffset =
-					args.m_taskShaderIndirectArgsBuffer.m_offset + sizeof(DispatchIndirectArgs) * bucketCount;
+				cmd.m_meshDraw.m_firstPayload = allMeshletGroupCount;
+				cmd.m_meshDraw.m_taskShaderIndirectArgsBuffer = args.m_mesh.m_taskShaderIndirectArgsBuffer.m_buffer;
+				cmd.m_meshDraw.m_taskShaderIndirectArgsBufferOffset =
+					args.m_mesh.m_taskShaderIndirectArgsBuffer.m_offset + sizeof(DispatchIndirectArgs) * bucketCount;
 
 				allMeshletGroupCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
 			}
+			else if(meshlets)
+			{
+				cmd.m_drawType = 3;
+				cmd.m_shaderBinarySize |= state.m_program->getShaderBinarySize(ShaderType::kVertex);
+
+				cmd.m_swMeshDraw.m_drawIndirectArgsBuffer = args.m_softwareMesh.m_drawIndirectArgsBuffer.m_buffer;
+				cmd.m_swMeshDraw.m_drawIndirectArgsBufferOffset =
+					args.m_softwareMesh.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketCount;
+
+				cmd.m_swMeshDraw.m_instancesBuffer = args.m_softwareMesh.m_meshletInstancesBuffer.m_buffer;
+				cmd.m_swMeshDraw.m_instancesBufferOffset = args.m_softwareMesh.m_meshletInstancesBuffer.m_offset + meshletInstancesBufferOffset;
+			}
 			else
 			{
 				const U32 maxDrawCount = userCount;
@@ -182,18 +207,19 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 				cmd.m_shaderBinarySize |= state.m_program->getShaderBinarySize(ShaderType::kVertex);
 
 				cmd.m_legacyDraw.m_primitiveTopology = state.m_primitiveTopology;
-				cmd.m_legacyDraw.m_drawIndirectArgsBuffer = args.m_drawIndexedIndirectArgsBuffer.m_buffer;
+				cmd.m_legacyDraw.m_drawIndirectArgsBuffer = args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_buffer;
 				cmd.m_legacyDraw.m_drawIndirectArgsBufferOffset =
-					args.m_drawIndexedIndirectArgsBuffer.m_offset + sizeof(DrawIndexedIndirectArgs) * legacyGeometryFlowUserCount;
+					args.m_legacy.m_drawIndexedIndirectArgsBuffer.m_offset + sizeof(DrawIndexedIndirectArgs) * legacyGeometryFlowUserCount;
 				cmd.m_legacyDraw.m_maxDrawCount = maxDrawCount;
-				cmd.m_legacyDraw.m_mdiDrawCountsBuffer = args.m_mdiDrawCountsBuffer.m_buffer;
-				cmd.m_legacyDraw.m_mdiDrawCountsBufferOffset = args.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount;
+				cmd.m_legacyDraw.m_mdiDrawCountsBuffer = args.m_legacy.m_mdiDrawCountsBuffer.m_buffer;
+				cmd.m_legacyDraw.m_mdiDrawCountsBufferOffset = args.m_legacy.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount;
 
 				legacyGeometryFlowUserCount += userCount;
 			}
 
 			++bucketCount;
 			allUserCount += userCount;
+			meshletInstancesBufferOffset += sizeof(UVec4) * min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
 		});
 
 	ANKI_ASSERT(bucketCount == RenderStateBucketContainer::getSingleton().getBucketCount(args.m_renderingTechinuqe));
@@ -230,14 +256,22 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 								   it->m_legacyDraw.m_mdiDrawCountsBuffer, it->m_legacyDraw.m_mdiDrawCountsBufferOffset,
 								   it->m_legacyDraw.m_maxDrawCount);
 		}
+		else if(it->m_drawType == 2)
+		{
+			const UVec4 firstPayload(it->m_meshDraw.m_firstPayload);
+			cmdb.setPushConstants(&firstPayload, sizeof(firstPayload));
+
+			cmdb.drawMeshTasksIndirect(it->m_meshDraw.m_taskShaderIndirectArgsBuffer, it->m_meshDraw.m_taskShaderIndirectArgsBufferOffset);
+		}
 		else
 		{
-			ANKI_ASSERT(it->m_drawType == 2);
+			ANKI_ASSERT(it->m_drawType == 3);
 
-			const UVec4 firstPayload(it->m_modernDraw.m_firstPayload);
-			cmdb.setPushConstants(&firstPayload, sizeof(firstPayload));
+			cmdb.bindVertexBuffer(0, it->m_swMeshDraw.m_instancesBuffer, it->m_swMeshDraw.m_instancesBufferOffset, sizeof(UVec4),
+								  VertexStepRate::kInstance);
 
-			cmdb.drawMeshTasksIndirect(it->m_modernDraw.m_taskShaderIndirectArgsBuffer, it->m_modernDraw.m_taskShaderIndirectArgsBufferOffset);
+			cmdb.drawIndirect(PrimitiveTopology::kTriangles, 1, it->m_swMeshDraw.m_drawIndirectArgsBufferOffset,
+							  it->m_swMeshDraw.m_drawIndirectArgsBuffer);
 		}
 	}
 

+ 31 - 11
AnKi/Renderer/Utils/Drawer.h

@@ -30,23 +30,43 @@ public:
 
 	Sampler* m_sampler = nullptr;
 
-	// For MDI
 	RenderingTechnique m_renderingTechinuqe = RenderingTechnique::kCount;
 
-	BufferOffsetRange m_mdiDrawCountsBuffer;
-	BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
-	BufferOffsetRange m_instanceRateRenderablesBuffer;
+	class
+	{
+	public:
+		BufferOffsetRange m_mdiDrawCountsBuffer;
+		BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
+		BufferOffsetRange m_renderableInstancesBuffer;
+	} m_legacy; ///< Legacy vertex flow
+
+	class
+	{
+	public:
+		BufferOffsetRange m_taskShaderIndirectArgsBuffer;
+		BufferOffsetRange m_taskShaderPayloadsBuffer;
+	} m_mesh;
 
-	BufferOffsetRange m_taskShaderIndirectArgsBuffer;
-	BufferOffsetRange m_taskShaderPayloadsBuffer;
+	class
+	{
+	public:
+		BufferOffsetRange m_meshletInstancesBuffer;
+		BufferOffsetRange m_drawIndirectArgsBuffer;
+	} m_softwareMesh;
 
 	void fillMdi(const GpuVisibilityOutput& visOut)
 	{
-		m_mdiDrawCountsBuffer = visOut.m_mdiDrawCountsBuffer;
-		m_drawIndexedIndirectArgsBuffer = visOut.m_drawIndexedIndirectArgsBuffer;
-		m_instanceRateRenderablesBuffer = visOut.m_instanceRateRenderablesBuffer;
-		m_taskShaderIndirectArgsBuffer = visOut.m_taskShaderIndirectArgsBuffer;
-		m_taskShaderPayloadsBuffer = visOut.m_taskShaderPayloadBuffer;
+		m_legacy.m_mdiDrawCountsBuffer = visOut.m_legacy.m_mdiDrawCountsBuffer;
+		m_legacy.m_drawIndexedIndirectArgsBuffer = visOut.m_legacy.m_drawIndexedIndirectArgsBuffer;
+		m_legacy.m_renderableInstancesBuffer = visOut.m_legacy.m_renderableInstancesBuffer;
+		m_mesh.m_taskShaderIndirectArgsBuffer = visOut.m_mesh.m_taskShaderIndirectArgsBuffer;
+		m_mesh.m_taskShaderPayloadsBuffer = visOut.m_mesh.m_taskShaderPayloadBuffer;
+	}
+
+	void fill(const GpuMeshletVisibilityOutput& visOut)
+	{
+		m_softwareMesh.m_meshletInstancesBuffer = visOut.m_meshletInstancesBuffer;
+		m_softwareMesh.m_drawIndirectArgsBuffer = visOut.m_drawIndirectArgsBuffer;
 	}
 };
 

+ 220 - 92
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -12,45 +12,12 @@
 #include <AnKi/Core/GpuMemory/GpuSceneBuffer.h>
 #include <AnKi/Collision/Functions.h>
 #include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
+#include <AnKi/Core/GpuMemory/UnifiedGeometryBuffer.h>
 #include <AnKi/Core/StatsSet.h>
 
 namespace anki {
 
-Error GpuVisibility::init()
-{
-	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
-	{
-		for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
-		{
-			for(MutatorValue genHash = 0; genHash < 2; ++genHash)
-			{
-				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
-											 {{"HZB_TEST", hzb}, {"DISTANCE_TEST", 0}, {"GATHER_AABBS", gatherAabbs}, {"HASH_VISIBLES", genHash}},
-											 m_prog, m_frustumGrProgs[hzb][gatherAabbs][genHash]));
-			}
-		}
-	}
-
-	for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
-	{
-		for(MutatorValue genHash = 0; genHash < 2; ++genHash)
-		{
-			ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
-										 {{"HZB_TEST", 0}, {"DISTANCE_TEST", 1}, {"GATHER_AABBS", gatherAabbs}, {"HASH_VISIBLES", genHash}}, m_prog,
-										 m_distGrProgs[gatherAabbs][genHash]));
-		}
-	}
-
-	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
-	{
-		ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityMeshlet.ankiprogbin", {{"HZB_TEST", hzb}}, m_meshletCullingProg,
-									 m_meshletCullingGrProgs[hzb]));
-	}
-
-	return Error::kNone;
-}
-
-GpuVisibility::Counts GpuVisibility::countTechnique(RenderingTechnique t)
+GpuVisibilityCommonBase::Counts GpuVisibilityCommonBase::countTechnique(RenderingTechnique t)
 {
 	Counts out = {};
 
@@ -88,9 +55,38 @@ GpuVisibility::Counts GpuVisibility::countTechnique(RenderingTechnique t)
 	return out;
 }
 
+Error GpuVisibility::init()
+{
+	for(MutatorValue hzb = 0; hzb < 2; ++hzb)
+	{
+		for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
+		{
+			for(MutatorValue genHash = 0; genHash < 2; ++genHash)
+			{
+				ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
+											 {{"HZB_TEST", hzb}, {"DISTANCE_TEST", 0}, {"GATHER_AABBS", gatherAabbs}, {"HASH_VISIBLES", genHash}},
+											 m_prog, m_frustumGrProgs[hzb][gatherAabbs][genHash]));
+			}
+		}
+	}
+
+	for(MutatorValue gatherAabbs = 0; gatherAabbs < 2; ++gatherAabbs)
+	{
+		for(MutatorValue genHash = 0; genHash < 2; ++genHash)
+		{
+			ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin",
+										 {{"HZB_TEST", 0}, {"DISTANCE_TEST", 1}, {"GATHER_AABBS", gatherAabbs}, {"HASH_VISIBLES", genHash}}, m_prog,
+										 m_distGrProgs[gatherAabbs][genHash]));
+		}
+	}
+
+	return Error::kNone;
+}
+
 void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out)
 {
 	ANKI_ASSERT(in.m_lodReferencePoint.x() != kMaxF32);
+	RenderGraphDescription& rgraph = *in.m_rgraph;
 
 	class DistanceTestData
 	{
@@ -130,24 +126,6 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	if(counts.m_allUserCount == 0) [[unlikely]]
 	{
 		// Early exit
-
-		out.m_instanceRateRenderablesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(GpuSceneRenderable));
-		out.m_drawIndexedIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(DrawIndexedIndirectArgs));
-		out.m_taskShaderIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(DispatchIndirectArgs));
-		out.m_taskShaderPayloadBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(GpuSceneTaskShaderPayload));
-
-		U32* atomics;
-		out.m_mdiDrawCountsBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame<U32>(1, atomics);
-		atomics[0] = 0;
-		out.m_someBufferHandle = in.m_rgraph->importBuffer(BufferUsageBit::kNone, out.m_mdiDrawCountsBuffer);
-
-		if(in.m_gatherAabbIndices)
-		{
-			U32* atomic;
-			out.m_visibleAaabbIndicesBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame<U32>(1, atomic);
-			atomic[0] = 0;
-		}
-
 		return;
 	}
 
@@ -174,8 +152,8 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 			mem.m_drawIndexedIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(
 				max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
-			mem.m_instanceRateRenderablesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(
-				max(1u, maxCounts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex));
+			mem.m_renderableInstancesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, maxCounts.m_legacyGeometryFlowUserCount)
+																									 * sizeof(GpuSceneRenderableVertex));
 
 			mem.m_taskShaderPayloadBuffer =
 				GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, maxCounts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload));
@@ -185,17 +163,19 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphFrameCallCount % m_runCtx.m_persistentMem.getSize()];
 	++m_runCtx.m_populateRenderGraphFrameCallCount;
 
-	out.m_drawIndexedIndirectArgsBuffer = mem.m_drawIndexedIndirectArgsBuffer;
-	out.m_drawIndexedIndirectArgsBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs);
+	out.m_legacy.m_drawIndexedIndirectArgsBuffer = mem.m_drawIndexedIndirectArgsBuffer;
+	out.m_legacy.m_drawIndexedIndirectArgsBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs);
 
-	out.m_instanceRateRenderablesBuffer = mem.m_instanceRateRenderablesBuffer;
-	out.m_instanceRateRenderablesBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex);
+	out.m_legacy.m_renderableInstancesBuffer = mem.m_renderableInstancesBuffer;
+	out.m_legacy.m_renderableInstancesBuffer.m_range = max(1u, counts.m_legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex);
 
-	out.m_taskShaderPayloadBuffer = mem.m_taskShaderPayloadBuffer;
-	out.m_taskShaderPayloadBuffer.m_range = max(1u, counts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload);
+	out.m_legacy.m_mdiDrawCountsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(U32) * counts.m_bucketCount);
 
-	out.m_taskShaderIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * counts.m_bucketCount);
-	out.m_mdiDrawCountsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(U32) * counts.m_bucketCount);
+	out.m_mesh.m_taskShaderPayloadBuffer = mem.m_taskShaderPayloadBuffer;
+	out.m_mesh.m_taskShaderPayloadBuffer.m_range = max(1u, counts.m_meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload);
+
+	out.m_mesh.m_taskShaderIndirectArgsBuffer =
+		GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * counts.m_bucketCount);
 
 	if(in.m_hashVisibles)
 	{
@@ -208,38 +188,39 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 
 	// Zero some stuff
-	const BufferHandle zeroStuffDependency = in.m_rgraph->importBuffer(BufferUsageBit::kNone, out.m_mdiDrawCountsBuffer);
+	const BufferHandle zeroStuffDependency = rgraph.importBuffer(BufferUsageBit::kNone, out.m_legacy.m_mdiDrawCountsBuffer);
 	{
 		Array<Char, 128> passName;
 		snprintf(passName.getBegin(), passName.getSizeInBytes(), "GPU vis zero: %s", in.m_passesName.cstr());
 
-		ComputeRenderPassDescription& pass = in.m_rgraph->newComputeRenderPass(passName.getBegin());
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
 		pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kTransferDestination);
 
-		pass.setWork([mdiDrawCountsBuffer = out.m_mdiDrawCountsBuffer, taskShaderIndirectArgsBuffer = out.m_taskShaderIndirectArgsBuffer,
-					  visiblesHashBuffer = out.m_visiblesHashBuffer,
-					  visibleAaabbIndicesBuffer = out.m_visibleAaabbIndicesBuffer](RenderPassWorkContext& rpass) {
+		pass.setWork([out](RenderPassWorkContext& rpass) {
 			CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 			cmdb.pushDebugMarker("MDI counts", Vec3(1.0f, 1.0f, 1.0f));
-			cmdb.fillBuffer(mdiDrawCountsBuffer, 0);
+			cmdb.fillBuffer(out.m_legacy.m_mdiDrawCountsBuffer, 0);
 			cmdb.popDebugMarker();
 
-			cmdb.pushDebugMarker("Task shader indirect args", Vec3(1.0f, 1.0f, 1.0f));
-			cmdb.fillBuffer(taskShaderIndirectArgsBuffer, 0);
-			cmdb.popDebugMarker();
+			if(out.m_mesh.m_taskShaderIndirectArgsBuffer.m_buffer)
+			{
+				cmdb.pushDebugMarker("Task shader indirect args", Vec3(1.0f, 1.0f, 1.0f));
+				cmdb.fillBuffer(out.m_mesh.m_taskShaderIndirectArgsBuffer, 0);
+				cmdb.popDebugMarker();
+			}
 
-			if(visiblesHashBuffer.m_buffer)
+			if(out.m_visiblesHashBuffer.m_buffer)
 			{
 				cmdb.pushDebugMarker("Visibles hash", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(visiblesHashBuffer, 0);
+				cmdb.fillBuffer(out.m_visiblesHashBuffer, 0);
 				cmdb.popDebugMarker();
 			}
 
-			if(visibleAaabbIndicesBuffer.m_buffer)
+			if(out.m_visibleAaabbIndicesBuffer.m_buffer)
 			{
 				cmdb.pushDebugMarker("Visible AABB indices", Vec3(1.0f, 1.0f, 1.0f));
-				cmdb.fillBuffer(visibleAaabbIndicesBuffer.m_buffer, visibleAaabbIndicesBuffer.m_offset, sizeof(U32), 0);
+				cmdb.fillBuffer(out.m_visibleAaabbIndicesBuffer.m_buffer, out.m_visibleAaabbIndicesBuffer.m_offset, sizeof(U32), 0);
 				cmdb.popDebugMarker();
 			}
 		});
@@ -248,7 +229,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	// Set the out dependency. Use one of the big buffers.
 	if(!mem.m_bufferDepedency.isValid())
 	{
-		mem.m_bufferDepedency = in.m_rgraph->importBuffer(BufferUsageBit::kNone, mem.m_drawIndexedIndirectArgsBuffer);
+		mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_drawIndexedIndirectArgsBuffer);
 	}
 	out.m_someBufferHandle = mem.m_bufferDepedency;
 
@@ -256,7 +237,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	Array<Char, 128> passName;
 	snprintf(passName.getBegin(), passName.getSizeInBytes(), "GPU vis: %s", in.m_passesName.cstr());
 
-	ComputeRenderPassDescription& pass = in.m_rgraph->newComputeRenderPass(passName.getBegin());
+	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
 
 	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kUavComputeRead);
 	pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kUavComputeWrite);
@@ -269,15 +250,11 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	}
 
 	pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
-				  technique = in.m_technique, mdiDrawCountsBuffer = out.m_mdiDrawCountsBuffer,
-				  instanceRateRenderables = out.m_instanceRateRenderablesBuffer, indirectArgs = out.m_drawIndexedIndirectArgsBuffer,
-				  aabbCount = counts.m_aabbCount, visibleAaabbIndicesBuffer = out.m_visibleAaabbIndicesBuffer, hashBuffer = out.m_visiblesHashBuffer,
-				  taskShaderIndirectArgsBuff = out.m_taskShaderIndirectArgsBuffer,
-				  taskShaderPayloadBuffer = out.m_taskShaderPayloadBuffer](RenderPassWorkContext& rpass) {
+				  technique = in.m_technique, aabbCount = counts.m_aabbCount, out](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
-		const Bool gatherAabbIndices = visibleAaabbIndicesBuffer.m_buffer != nullptr;
-		const Bool genHash = hashBuffer.m_buffer != nullptr;
+		const Bool gatherAabbIndices = out.m_visibleAaabbIndicesBuffer.m_buffer != nullptr;
+		const Bool genHash = out.m_visiblesHashBuffer.m_buffer != nullptr;
 
 		if(frustumTestData)
 		{
@@ -308,11 +285,11 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
 		cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
 		cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
-		cmdb.bindUavBuffer(0, 4, instanceRateRenderables);
-		cmdb.bindUavBuffer(0, 5, indirectArgs);
-		cmdb.bindUavBuffer(0, 6, mdiDrawCountsBuffer);
-		cmdb.bindUavBuffer(0, 7, taskShaderIndirectArgsBuff);
-		cmdb.bindUavBuffer(0, 8, taskShaderPayloadBuffer);
+		cmdb.bindUavBuffer(0, 4, out.m_legacy.m_renderableInstancesBuffer);
+		cmdb.bindUavBuffer(0, 5, out.m_legacy.m_drawIndexedIndirectArgsBuffer);
+		cmdb.bindUavBuffer(0, 6, out.m_legacy.m_mdiDrawCountsBuffer);
+		cmdb.bindUavBuffer(0, 7, out.m_mesh.m_taskShaderIndirectArgsBuffer);
+		cmdb.bindUavBuffer(0, 8, out.m_mesh.m_taskShaderPayloadBuffer);
 
 		U32* drawIndirectArgsIndexOrTaskPayloadIndex =
 			allocateAndBindUav<U32>(cmdb, 0, 9, RenderStateBucketContainer::getSingleton().getBucketCount(technique));
@@ -384,18 +361,169 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 		if(gatherAabbIndices)
 		{
-			cmdb.bindUavBuffer(0, 13, visibleAaabbIndicesBuffer);
+			cmdb.bindUavBuffer(0, 13, out.m_visibleAaabbIndicesBuffer);
 		}
 
 		if(genHash)
 		{
-			cmdb.bindUavBuffer(0, 14, hashBuffer);
+			cmdb.bindUavBuffer(0, 14, out.m_visiblesHashBuffer);
 		}
 
 		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
 	});
 }
 
+/// Load the meshlet culling program and create one shader variant per value of the HZB_TEST mutator (0 and 1).
+/// @return The first error reported by loadShaderProgram() or Error::kNone.
+Error GpuMeshletVisibility::init()
+{
+	const MutatorValue hzbTestVariantCount = 2;
+
+	MutatorValue hzbTest = 0;
+	while(hzbTest < hzbTestVariantCount)
+	{
+		// The variant is stored at the index that matches its mutator value
+		ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityMeshlet.ankiprogbin", {{"HZB_TEST", hzbTest}}, m_meshletCullingProg,
+									 m_meshletCullingGrProgs[hzbTest]));
+		++hzbTest;
+	}
+
+	return Error::kNone;
+}
+
+/// Record the compute passes that cull the meshlets of one rendering technique.
+/// Two passes are added to in.m_rgraph: one that zeroes the DrawIndirectArgs and one that runs the meshlet culling shader once per render
+/// state bucket that has meshlet groups.
+/// @param in  Culling parameters. in.m_rgraph must not be null.
+/// @param out Receives the indirect args buffer, the meshlet instances buffer and a handle to depend on. Left untouched if the technique has
+///            no users.
+/// @note Not thread-safe, it mutates m_runCtx.
+void GpuMeshletVisibility::populateRenderGraph(GpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out)
+{
+	RenderGraphDescription& rgraph = *in.m_rgraph;
+
+	const Counts counts = countTechnique(in.m_technique);
+
+	if(counts.m_allUserCount == 0) [[unlikely]]
+	{
+		// Early exit
+		return;
+	}
+
+	// The culling shader appends one UVec4 per visible meshlet and a meshlet group holds up to kMeshletGroupSize meshlets. Size the buffer for
+	// the case where nothing gets culled. sizeof() goes first so the multiplication happens in size_t
+	const auto computeMeshletInstancesSize = [](U32 meshletGroupCount) {
+		return sizeof(UVec4) * kMeshletGroupSize * max(1u, meshletGroupCount);
+	};
+
+	// Allocate memory
+	const Bool firstCallInFrame = m_runCtx.m_frameIdx != getRenderer().getFrameCount();
+	if(firstCallInFrame)
+	{
+		// Allocate the big buffers once at the beginning of the frame
+
+		m_runCtx.m_frameIdx = getRenderer().getFrameCount();
+		m_runCtx.m_populateRenderGraphFrameCallCount = 0;
+
+		// Find the max counts of all techniques
+		Counts maxCounts = {};
+		for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(RenderingTechniqueBit::kAllRaster))
+		{
+			maxCounts = maxCounts.max((in.m_technique == t) ? counts : countTechnique(t));
+		}
+
+		// Allocate memory
+		for(PersistentMemory& mem : m_runCtx.m_persistentMem)
+		{
+			mem = {};
+
+			mem.m_meshletInstancesBuffer =
+				GpuVisibleTransientMemoryPool::getSingleton().allocate(computeMeshletInstancesSize(maxCounts.m_meshletGroupCount));
+		}
+	}
+
+	// Cycle through the persistent buffers so that consecutive calls in the same frame don't share one
+	PersistentMemory& mem = m_runCtx.m_persistentMem[m_runCtx.m_populateRenderGraphFrameCallCount % m_runCtx.m_persistentMem.getSize()];
+	++m_runCtx.m_populateRenderGraphFrameCallCount;
+
+	out.m_drawIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DrawIndirectArgs) * counts.m_bucketCount);
+
+	// Expose only the part of the big buffer that this technique needs
+	out.m_meshletInstancesBuffer = mem.m_meshletInstancesBuffer;
+	out.m_meshletInstancesBuffer.m_range = computeMeshletInstancesSize(counts.m_meshletGroupCount);
+
+	// Zero some stuff
+	const BufferHandle zeroStuffDependency = rgraph.importBuffer(BufferUsageBit::kNone, out.m_drawIndirectArgsBuffer);
+	{
+		Array<Char, 128> passName;
+		snprintf(passName.getBegin(), passName.getSizeInBytes(), "GPU meshlet vis zero: %s", in.m_passesName.cstr());
+
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
+		pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kTransferDestination);
+
+		pass.setWork([drawIndirectArgsBuffer = out.m_drawIndirectArgsBuffer](RenderPassWorkContext& rpass) {
+			CommandBuffer& cmdb = *rpass.m_commandBuffer;
+
+			cmdb.pushDebugMarker("Draw indirect args", Vec3(1.0f, 1.0f, 1.0f));
+			cmdb.fillBuffer(drawIndirectArgsBuffer, 0);
+			cmdb.popDebugMarker();
+		});
+	}
+
+	// Set the out dependency. Use one of the big buffers.
+	if(!mem.m_bufferDepedency.isValid())
+	{
+		mem.m_bufferDepedency = rgraph.importBuffer(BufferUsageBit::kNone, mem.m_meshletInstancesBuffer);
+	}
+	out.m_dependency = mem.m_bufferDepedency;
+
+	// Create the renderpass
+	Array<Char, 128> passName;
+	snprintf(passName.getBegin(), passName.getSizeInBytes(), "GPU meshlet vis: %s", in.m_passesName.cstr());
+
+	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
+
+	pass.newBufferDependency(out.m_dependency, BufferUsageBit::kUavComputeWrite);
+
+	// The shader atomically increments the instance count and writes the vertex count of the indirect args, so this is a write dependency
+	pass.newBufferDependency(zeroStuffDependency, BufferUsageBit::kUavComputeWrite);
+
+	// The task shader payloads are read by this pass, wait for whoever produced them
+	if(in.m_dependency.isValid())
+	{
+		pass.newBufferDependency(in.m_dependency, BufferUsageBit::kUavComputeRead);
+	}
+
+	pass.setWork([this, technique = in.m_technique, hzbRt = in.m_hzbRt, taskShaderPayloadsBuff = in.m_taskShaderPayloadBuffer,
+				  viewProjMat = in.m_viewProjectionMatrix, camTrf = in.m_cameraTransform, viewportSize = in.m_viewportSize,
+				  out](RenderPassWorkContext& rpass) {
+		CommandBuffer& cmdb = *rpass.m_commandBuffer;
+
+		// Same for all buckets
+		const Bool hasHzb = hzbRt.isValid();
+
+		U32 bucketIdx = 0;
+		U32 firstPayload = 0;
+		RenderStateBucketContainer::getSingleton().iterateBuckets(technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount) {
+			if(!meshletGroupCount)
+			{
+				// Nothing to dispatch but the bucket still owns a slot in the indirect args
+				++bucketIdx;
+				return;
+			}
+
+			// Create a dependency to a part of the indirect args buffer
+			// NOTE(review): This chunk is never bound. Binding 5 below gets the whole buffer, which means that the shader's
+			// g_indirectDrawArg[0] always points to the args of the first bucket. Binding the chunk looks like the intent but its offset
+			// is a multiple of sizeof(DrawIndirectArgs) and that may violate the device's storage buffer offset alignment. Confirm before
+			// changing
+			const BufferOffsetRange drawIndirectArgsBufferChunk = {out.m_drawIndirectArgsBuffer.m_buffer,
+																   out.m_drawIndirectArgsBuffer.m_offset + sizeof(DrawIndirectArgs) * bucketIdx,
+																   sizeof(DrawIndirectArgs)};
+
+			cmdb.bindShaderProgram(m_meshletCullingGrProgs[hasHzb].get());
+
+			cmdb.bindUavBuffer(0, 0, taskShaderPayloadsBuff);
+			cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 2, GpuSceneArrays::MeshLod::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 3, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 4, UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
+			cmdb.bindUavBuffer(0, 5, out.m_drawIndirectArgsBuffer);
+			cmdb.bindUavBuffer(0, 6, out.m_meshletInstancesBuffer);
+			if(hasHzb)
+			{
+				rpass.bindColorTexture(0, 7, hzbRt);
+				cmdb.bindSampler(0, 8, getRenderer().getSamplers().m_nearestNearestClamp.get());
+			}
+
+			// Needs to match the push constants of the shader
+			class MaterialGlobalConstants
+			{
+			public:
+				Mat4 m_viewProjectionMatrix;
+				Mat3x4 m_cameraTransform;
+
+				Vec2 m_viewportSizef;
+				U32 m_firstPayload;
+				U32 m_padding;
+			} consts;
+			consts.m_viewProjectionMatrix = viewProjMat;
+			consts.m_cameraTransform = camTrf;
+			consts.m_viewportSizef = Vec2(viewportSize);
+			consts.m_firstPayload = firstPayload;
+			consts.m_padding = 0; // Don't upload uninitialized memory
+			cmdb.setPushConstants(&consts, sizeof(consts));
+
+			// One threadgroup per meshlet group
+			cmdb.dispatchCompute(meshletGroupCount, 1, 1);
+
+			firstPayload += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
+			++bucketIdx;
+		});
+	});
+}
+
 Error GpuVisibilityNonRenderables::init()
 {
 	ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/GpuVisibilityNonRenderables.ankiprogbin", m_prog));

+ 119 - 33
AnKi/Renderer/Utils/GpuVisibility.h

@@ -14,6 +14,38 @@ namespace anki {
 /// @addtogroup renderer
 /// @{
 
+/// Base of the GPU visibility classes. Holds the per-technique counters and the function that computes them.
+class GpuVisibilityCommonBase : public RendererObject
+{
+protected:
+	/// The counters countTechnique() returns for a rendering technique.
+	class Counts
+	{
+	public:
+		U32 m_aabbCount;
+		U32 m_bucketCount;
+		U32 m_legacyGeometryFlowUserCount;
+		U32 m_modernGeometryFlowUserCount;
+		U32 m_meshletGroupCount;
+		U32 m_allUserCount;
+
+		/// Get the member-wise maximum of this and @a b.
+		Counts max(const Counts& b) const
+		{
+			Counts result;
+			result.m_aabbCount = anki::max(m_aabbCount, b.m_aabbCount);
+			result.m_bucketCount = anki::max(m_bucketCount, b.m_bucketCount);
+			result.m_legacyGeometryFlowUserCount = anki::max(m_legacyGeometryFlowUserCount, b.m_legacyGeometryFlowUserCount);
+			result.m_modernGeometryFlowUserCount = anki::max(m_modernGeometryFlowUserCount, b.m_modernGeometryFlowUserCount);
+			result.m_meshletGroupCount = anki::max(m_meshletGroupCount, b.m_meshletGroupCount);
+			result.m_allUserCount = anki::max(m_allUserCount, b.m_allUserCount);
+			return result;
+		}
+	};
+
+	/// Compute the counters of rendering technique @a t.
+	static Counts countTechnique(RenderingTechnique t);
+};
+
+/// @memberof GpuVisibility
 class BaseGpuVisibilityInput
 {
 public:
@@ -55,21 +87,34 @@ class GpuVisibilityOutput
 public:
 	BufferHandle m_someBufferHandle; ///< Just expose one handle for depedencies. No need to track all buffers.
 
-	BufferOffsetRange m_instanceRateRenderablesBuffer; ///< An array of GpuSceneRenderableVertex.
-	BufferOffsetRange m_drawIndexedIndirectArgsBuffer; ///< An array of DrawIndexedIndirectArgs.
-	BufferOffsetRange m_mdiDrawCountsBuffer; ///< An array of U32, one for each render state bucket (even those that use task/mesh flow).
+	class
+	{
+	public:
+		BufferOffsetRange m_renderableInstancesBuffer; ///< An array of GpuSceneRenderableVertex.
+		BufferOffsetRange m_drawIndexedIndirectArgsBuffer; ///< An array of DrawIndexedIndirectArgs.
+		BufferOffsetRange m_mdiDrawCountsBuffer; ///< An array of U32, one for each render state bucket (even those that use task/mesh flow).
+	} m_legacy; ///< Legacy vertex shading.
 
-	/// An array of DispatchIndirectArgs, one for each render state bucket (even those that use vertex flow).
-	BufferOffsetRange m_taskShaderIndirectArgsBuffer;
-	BufferOffsetRange m_taskShaderPayloadBuffer; ///< The payloads of task shaders. One for each task shader threadgroup.
+	class
+	{
+	public:
+		/// An array of DispatchIndirectArgs, one for each render state bucket (even those that use legacy flow).
+		BufferOffsetRange m_taskShaderIndirectArgsBuffer;
+		BufferOffsetRange m_taskShaderPayloadBuffer; ///< The payloads of task shaders. One for each task shader threadgroup / meshlet group.
+	} m_mesh; ///< S/W meshlets or H/W mesh shading.
 
 	BufferOffsetRange m_visibleAaabbIndicesBuffer; ///< [Optional] Indices to the AABB buffer. The 1st element is the count.
 
 	BufferOffsetRange m_visiblesHashBuffer; ///< [Optional] A hash of the visible objects. Used to conditionaly not perform shadow randering.
+
+	Bool containsDrawcalls() const
+	{
+		return m_someBufferHandle.isValid();
+	}
 };
 
 /// Performs GPU visibility for some pass.
-class GpuVisibility : public RendererObject
+class GpuVisibility : public GpuVisibilityCommonBase
 {
 public:
 	Error init();
@@ -95,15 +140,15 @@ private:
 	Array3d<ShaderProgramPtr, 2, 2, 2> m_frustumGrProgs;
 	Array2d<ShaderProgramPtr, 2, 2> m_distGrProgs;
 
-	ShaderProgramResourcePtr m_meshletCullingProg;
-	Array<ShaderProgramPtr, 2> m_meshletCullingGrProgs;
-
 	// Contains quite large buffer that we want want to reuse muptiple times in a single frame.
 	class PersistentMemory
 	{
 	public:
+		// Legacy
 		BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
-		BufferOffsetRange m_instanceRateRenderablesBuffer;
+		BufferOffsetRange m_renderableInstancesBuffer; ///< Instance rate vertex buffer.
+
+		// Mesh
 		BufferOffsetRange m_taskShaderPayloadBuffer;
 
 		BufferHandle m_bufferDepedency;
@@ -119,34 +164,75 @@ private:
 		Array<PersistentMemory, 4> m_persistentMem;
 	} m_runCtx;
 
-	class Counts
+	void populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out);
+};
+
+/// @memberof GpuMeshletVisibility. The parameters of GpuMeshletVisibility::populateRenderGraph.
+class GpuMeshletVisibilityInput
+{
+public:
+	CString m_passesName; ///< Appended to the names of the render passes that get created.
+
+	Mat4 m_viewProjectionMatrix; ///< Passed to the culling shader as a push constant.
+	Mat3x4 m_cameraTransform; ///< Passed to the culling shader as a push constant.
+
+	/// The size of the viewport the visibility results will be used on. Used to kill objects that don't touch the sampling positions.
+	UVec2 m_viewportSize;
+
+	BufferOffsetRange m_taskShaderIndirectArgsBuffer; ///< Taken from GpuVisibilityOutput.
+	BufferOffsetRange m_taskShaderPayloadBuffer; ///< Taken from GpuVisibilityOutput. The culling shader reads its payloads from here.
+
+	BufferHandle m_dependency; ///< NOTE(review): Presumably the handle that tracks the 2 buffers above (see GpuVisibilityOutput), confirm.
+
+	RenderGraphDescription* m_rgraph = nullptr; ///< The passes are added to this graph. It's dereferenced without a check so it can't be null.
+
+	RenderingTechnique m_technique = RenderingTechnique::kCount; ///< The technique whose render state buckets will be culled.
+
+	RenderTargetHandle m_hzbRt; ///< Optional. If it's valid the HZB_TEST variant of the culling shader is used.
+};
+
+/// @memberof GpuMeshletVisibility. The results of GpuMeshletVisibility::populateRenderGraph.
+class GpuMeshletVisibilityOutput
+{
+public:
+	BufferOffsetRange m_meshletInstancesBuffer; ///< Array of UVec4 (encodes GpuSceneMeshletInstance) per instance vertex. One for each meshlet.
+	BufferOffsetRange m_drawIndirectArgsBuffer; ///< Array of DrawIndirectArgs. One for every render state bucket (even those without meshlets).
+
+	BufferHandle m_dependency; ///< The handle of the buffer the culling pass writes to. Depend on it before consuming the buffers above.
+};
+
+/// Performs meshlet GPU visibility when the GPU doesn't support mesh shaders.
+class GpuMeshletVisibility : public GpuVisibilityCommonBase
+{
+public:
+	Error init(); ///< Loads the variants of the meshlet culling shader program.
+
+	/// Perform meshlet GPU visibility.
+	/// @note Not thread-safe.
+	void populateRenderGraph(GpuMeshletVisibilityInput& in, GpuMeshletVisibilityOutput& out);
+
+private:
+	ShaderProgramResourcePtr m_meshletCullingProg;
+	Array<ShaderProgramPtr, 2> m_meshletCullingGrProgs; ///< Indexed by the value of the HZB_TEST mutator.
+
+	// Contains a quite large buffer that we want to reuse multiple times in a single frame.
+	class PersistentMemory
 	{
 	public:
-		U32 m_aabbCount;
-		U32 m_bucketCount;
-		U32 m_legacyGeometryFlowUserCount;
-		U32 m_modernGeometryFlowUserCount;
-		U32 m_meshletGroupCount;
-		U32 m_allUserCount;
+		BufferOffsetRange m_meshletInstancesBuffer; ///< Instance rate vertex buffer.
 
-		Counts max(const Counts& b) const
-		{
-			Counts out;
-#define ANKI_MAX(member) out.member = anki::max(member, b.member)
-			ANKI_MAX(m_aabbCount);
-			ANKI_MAX(m_bucketCount);
-			ANKI_MAX(m_legacyGeometryFlowUserCount);
-			ANKI_MAX(m_modernGeometryFlowUserCount);
-			ANKI_MAX(m_meshletGroupCount);
-			ANKI_MAX(m_allUserCount);
-#undef ANKI_MAX
-			return out;
-		}
+		BufferHandle m_bufferDepedency; ///< The render graph handle of m_meshletInstancesBuffer. Imported the first time it's needed.
 	};
 
-	Counts countTechnique(RenderingTechnique t);
+	class
+	{
+	public:
+		U64 m_frameIdx = kMaxU64; ///< The frame the persistent memory was allocated for.
+		U32 m_populateRenderGraphFrameCallCount = 0; ///< The number of populateRenderGraph calls in the current frame.
 
-	void populateRenderGraphInternal(Bool distanceBased, BaseGpuVisibilityInput& in, GpuVisibilityOutput& out);
+		/// The more persistent memory there is the more passes will be able to run in parallel but the more memory is used.
+		Array<PersistentMemory, 4> m_persistentMem;
+	} m_runCtx;
 };
 
 /// @memberof GpuVisibilityNonRenderables

+ 4 - 2
AnKi/Resource/MaterialResource.cpp

@@ -167,11 +167,13 @@ Error MaterialResource::parseShaderProgram(XmlElement shaderProgramEl, Bool asyn
 	// Find present techniques
 	for(const ShaderProgramBinaryTechnique& t : m_prog->getBinary().m_techniques)
 	{
-		if(t.m_name.getBegin() == CString("GBuffer") || t.m_name.getBegin() == CString("GBufferMesh"))
+		if(t.m_name.getBegin() == CString("GBuffer") || t.m_name.getBegin() == CString("GBufferMesh")
+		   || t.m_name.getBegin() == CString("GBufferMeshlet"))
 		{
 			m_techniquesMask |= RenderingTechniqueBit::kGBuffer;
 		}
-		else if(t.m_name.getBegin() == CString("Shadows") || t.m_name.getBegin() == CString("ShadowsMesh"))
+		else if(t.m_name.getBegin() == CString("Shadows") || t.m_name.getBegin() == CString("ShadowsMesh")
+				|| t.m_name.getBegin() == CString("ShadowsMeshlet"))
 		{
 			m_techniquesMask |= RenderingTechniqueBit::kDepth;
 		}

+ 30 - 7
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -88,7 +88,7 @@
 struct VertIn
 {
 	U32 m_svVertexId : SV_VERTEXID;
-	[[vk::location(0)]] UVec4 m_gpuSceneRenderable : RENDERABLE;
+	[[vk::location(0)]] UVec4 m_instanceData : INSTANCE_DATA;
 };
 
 struct VertOut
@@ -232,16 +232,33 @@ void velocity(Mat3x4 worldTransform, Mat3x4 prevWorldTransform, Vec3 prevLocalPo
 // Vert                                                                      =
 // ===========================================================================
 #if ANKI_VERTEX_SHADER
+
+#	define SW_MESHLETS (ANKI_TECHNIQUE_GBufferMeshlet || ANKI_TECHNIQUE_ShadowsMeshlet)
+
 VertOut main(VertIn input)
 {
 	VertOut output;
 
-	const GpuSceneRenderableVertex renderable = unpackGpuSceneRenderableVertex(input.m_gpuSceneRenderable);
-	const GpuSceneMeshLod mesh = g_meshLods[renderable.m_meshLodIndex];
+#	if SW_MESHLETS
+	const GpuSceneMeshletInstance instance = unpackGpuSceneMeshletInstance(input.m_instanceData);
+	const MeshletGeometryDescriptor meshlet = g_meshletGeometryDescriptors[instance.m_meshletGeometryDescriptorIndex];
+	if(input.m_svVertexId >= (meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint >> 16u) * 3u)
+	{
+		// Discard the primitive
+		output = (VertOut)0;
+		output.m_constantsOffset = instance.m_constantsOffset;
+		return output;
+	}
+
+	UnpackedMeshVertex vert = loadVertex(meshlet, input.m_svVertexId, ANKI_BONES);
+#	else
+	const GpuSceneRenderableVertex instance = unpackGpuSceneRenderableVertex(input.m_instanceData);
+	const GpuSceneMeshLod mesh = g_meshLods[instance.m_meshLodIndex];
 	UnpackedMeshVertex vert = loadVertex(mesh, input.m_svVertexId, ANKI_BONES);
+#	endif
 
-	const Mat3x4 worldTransform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
-	const Mat3x4 prevWorldTransform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset + sizeof(Mat3x4));
+	const Mat3x4 worldTransform = g_gpuScene.Load<Mat3x4>(instance.m_worldTransformsOffset);
+	const Mat3x4 prevWorldTransform = g_gpuScene.Load<Mat3x4>(instance.m_worldTransformsOffset + sizeof(Mat3x4));
 	ANKI_MAYBE_UNUSED(prevWorldTransform);
 
 #	if UVS
@@ -249,11 +266,11 @@ VertOut main(VertIn input)
 #	endif
 	Vec3 prevPos = vert.m_position;
 	ANKI_MAYBE_UNUSED(prevPos);
-	output.m_constantsOffset = renderable.m_constantsOffset;
+	output.m_constantsOffset = instance.m_constantsOffset;
 
 	// Do stuff
 #	if ANKI_BONES
-	skinning(vert, renderable.m_boneTransformsOrParticleEmitterOffset, vert.m_position, prevPos, vert.m_normal);
+	skinning(vert, instance.m_boneTransformsOrParticleEmitterOffset, vert.m_position, prevPos, vert.m_normal);
 #	endif
 
 	const Vec3 worldPos = mul(worldTransform, Vec4(vert.m_position, 1.0));
@@ -724,6 +741,12 @@ FragOut main(
 #pragma anki technique_start vert Shadows uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
 #pragma anki technique_end vert Shadows
 
+#pragma anki technique_start vert GBufferMeshlet uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX NORMAL_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end vert GBufferMeshlet
+
+#pragma anki technique_start vert ShadowsMeshlet uses_mutators ANKI_VELOCITY ANKI_BONES DIFFUSE_TEX PARALLAX ALPHA_TEST
+#pragma anki technique_end vert ShadowsMeshlet
+
 #pragma anki technique_start task CommonTask uses_mutators
 #pragma anki technique_end task CommonTask
 

+ 36 - 134
AnKi/Shaders/GpuVisibilityMeshlet.ankiprog

@@ -9,56 +9,40 @@
 
 #include <AnKi/Shaders/Common.hlsl>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
-#include <AnKi/Shaders/MaterialShadersCommon.hlsl>
+#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
 
 #define MESHLET_BACKFACE_CULLING 0
 #define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
 #define MESHLET_NO_SAMPLING_POINT_CULLING 1
 #define MESHLET_HZB_CULLING HZB_TEST
-#define PRIMITIVE_BACKFACE_CULLING 1
-#define PRIMITIVE_NO_SAMPLING_POINTS_CULLING 1
-#define PRIMITIVE_ANY_CULLING (PRIMITIVE_BACKFACE_CULLING || PRIMITIVE_NO_SAMPLING_POINTS_CULLING)
 
 #define THREADGROUP_SIZE ANKI_TASK_SHADER_THREADGROUP_SIZE
 
-struct FirstPayload
+[[vk::binding(0, 0)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads; // One payload is read per threadgroup
+[[vk::binding(0, 1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
+[[vk::binding(0, 2)]] StructuredBuffer<GpuSceneMeshLod> g_meshLods;
+[[vk::binding(0, 3)]] ByteAddressBuffer g_gpuScene; // The world transforms are loaded from here
+[[vk::binding(0, 4)]] StructuredBuffer<MeshletBoundingVolume> g_meshletBoundingVolumes;
+[[vk::binding(0, 5)]] RWStructuredBuffer<DrawIndirectArgs> g_indirectDrawArg; // Output. The instance count is incremented atomically
+[[vk::binding(0, 6)]] RWStructuredBuffer<GpuSceneMeshletInstance> g_drawInstances; // Output. One element per meshlet that wasn't culled
+[[vk::binding(0, 7)]] Texture2D<Vec4> g_hzbTexture; // Passed to cullHzb() when MESHLET_HZB_CULLING is enabled
+[[vk::binding(0, 8)]] SamplerState g_nearestClampSampler; // Passed to cullHzb() when MESHLET_HZB_CULLING is enabled
+
+struct MaterialGlobalConstants
 {
-	UVec4 m_val;
-};
-
-[[vk::push_constant]] ConstantBuffer<FirstPayload> g_firstPayload;
-
-[[vk::binding(0, 2)]] RWStructuredBuffer<U32> g_indexBuffer; // 1st element is the count
-
-groupshared U32 s_visibleMeshletCount;
-groupshared U32 s_visibleMeshletIndices[kMeshletGroupSize];
-
-#if PRIMITIVE_ANY_CULLING
-groupshared Vec2 s_windowCoords[kMaxVerticesPerMeshlet];
-groupshared F32 s_clipW[kMaxVerticesPerMeshlet];
-#endif
+	Mat4 m_viewProjectionMatrix;
+	Mat3x4 m_cameraTransform;
 
-U32 encodeMetaIndex(U32 meshLodIndex, U32 meshletIndex, U32 localIndex, U32 primitiveVertex)
-{
-	U32 metaIdx = meshletIndex * kMaxPrimitivesPerMeshlet * 3 + localIndex * 3 + primitiveVertex;
-	metaIdx &= (1u << 17u) - 1u;
-	metaIdx |= meshLodIndex << 17u;
-	return metaIdx;
-}
-
-void decodeMetaIndex(U32 metaIndex, out U32 meshLodIndex, out U32 meshletIndex, out U32 localIndex, out U32 primitiveVertex)
-{
-	meshLodIndex = metaIndex >> 17u;
-	metaIndex &= (1u << 17u) - 1u;
-	primitiveVertex = metaIndex % 3u;
-	localIndex = (metaIndex / 3u) % kMaxPrimitivesPerMeshlet;
-	meshletIndex = metaIndex / (3u * kMaxPrimitivesPerMeshlet);
-}
+	Vec2 m_viewportSizef;
+	U32 m_firstPayload;
+	U32 m_padding;
+};
+[[vk::push_constant]] ConstantBuffer<MaterialGlobalConstants> g_consts;
 
 [numthreads(THREADGROUP_SIZE, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupId : SV_GROUPID,
 											   U32 svGroupIndex : SV_GROUPINDEX)
 {
-	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_firstPayload.m_val.x + svGroupId];
+	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_consts.m_firstPayload + svGroupId];
 
 	const U32 lod = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
 	const U32 renderableIdx = (inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);
@@ -69,14 +53,6 @@ void decodeMetaIndex(U32 metaIndex, out U32 meshLodIndex, out U32 meshletIndex,
 	U32 firstMeshletBoundingVolume = meshletGroup * kMeshletGroupSize;
 	const U32 meshletCount = min(kMeshletGroupSize, meshLod.m_meshletCount - firstMeshletBoundingVolume);
 	firstMeshletBoundingVolume += meshLod.m_firstMeshletBoundingVolume;
-	const U32 firstMeshletGeometryDescriptor = meshletGroup * kMeshletGroupSize + meshLod.m_firstMeshletGeometryDescriptor;
-
-	if(svGroupIndex == 0u)
-	{
-		s_visibleMeshletCount = 0;
-	}
-
-	GroupMemoryBarrierWithGroupSync();
 
 	// Meshlet culling
 	if(svGroupIndex < meshletCount)
@@ -87,11 +63,11 @@ void decodeMetaIndex(U32 metaIndex, out U32 meshLodIndex, out U32 meshletIndex,
 		const Mat3x4 worldTransform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
 
 #if MESHLET_BACKFACE_CULLING
-		cull = cullBackfaceMeshlet(meshletBoundingVol, worldTransform, g_globalConstants.m_cameraTransform.getTranslationPart());
+		cull = cullBackfaceMeshlet(meshletBoundingVol, worldTransform, g_consts.m_cameraTransform.getTranslationPart());
 #endif
 
 		const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
-		const Mat4 mvp = mul(g_globalConstants.m_viewProjectionMatrix, wordTransform4);
+		const Mat4 mvp = mul(g_consts.m_viewProjectionMatrix, wordTransform4);
 
 		Vec2 minNdc, maxNdc;
 		F32 aabbMinDepth;
@@ -104,107 +80,33 @@ void decodeMetaIndex(U32 metaIndex, out U32 meshLodIndex, out U32 meshletIndex,
 
 #if MESHLET_NO_SAMPLING_POINT_CULLING
 		// Sampling points test
-		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_globalConstants.m_viewport.zw;
-		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_globalConstants.m_viewport.zw;
+		// Scale both corners by the whole viewport size (width on X, height on Y). Scaling by a single component would put the 2 corners in
+		// different spaces and the comparison bellow would be meaningless
+		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_consts.m_viewportSizef;
+		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_consts.m_viewportSizef;
 		cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
 #endif
 
 #if MESHLET_HZB_CULLING
-		cull = cull
-			   || (renderable.m_boneTransformsOffset == 0u && g_globalConstants.m_enableHzbTesting == 1u
-				   && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
+		cull = cull || (renderable.m_boneTransformsOffset == 0u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
 #endif
 
 		if(!cull)
 		{
-			U32 idx;
-			InterlockedAdd(s_visibleMeshletCount, 1u, idx);
-
-			InterlockedOr(s_visibleMeshletIndices[idx], firstMeshletGeometryDescriptor);
-		}
-	}
-
-	GroupMemoryBarrierWithGroupSync();
-
-	// Transform positions for culling
-	for(U32 visMeshletIdx = 0; visMeshletIdx < s_visibleMeshletCount; ++visMeshletIdx)
-	{
-		const U32 meshletIndex = s_visibleMeshletIndices[visMeshletIdx];
-		const MeshletGeometryDescriptor meshlet = g_meshletGeometryDescriptors[meshletIndex];
-
-		const U32 primCount = meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint >> 16u;
-		const U32 vertCount = meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint & 0xFFFFu;
-
-		// Compute positions
-#if PRIMITIVE_ANY_CULLING
-		const U32 vertLoopCount = kMaxVerticesPerMeshlet / THREADGROUP_SIZE;
-		[unroll] for(U32 l = 0; l < vertLoopCount; ++l)
-		{
-			const U32 idx = l * THREADGROUP_SIZE + svGroupIndex;
+			U32 instanceIdx;
+			InterlockedAdd(g_indirectDrawArg[0].m_instanceCount, 1u, instanceIdx);
 
-			if(idx < vertCount)
+			if(instanceIdx == 0u)
 			{
-				const UnpackedMeshVertex vert = loadVertex(meshlet, idx, false);
-
-				const Mat3x4 worldTransform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
-
-				const Vec3 worldPos = mul(worldTransform, Vec4(vert.m_position, 1.0));
-				const Vec4 svPosition = mul(g_globalConstants.m_viewProjectionMatrix, Vec4(worldPos, 1.0f));
-				s_windowCoords[idx] = ndcToUv(svPosition.xy / svPosition.w) * g_globalConstants.m_viewport.zw;
-				s_clipW[idx] = svPosition.w;
+				g_indirectDrawArg[0].m_vertexCount = kMaxPrimitivesPerMeshlet * 3u;
 			}
-		}
-
-		GroupMemoryBarrierWithGroupSync();
-#endif
-
-		// Perform primitive culling
-		const U32 primLoopCount = (kMaxPrimitivesPerMeshlet + THREADGROUP_SIZE - 1u) / THREADGROUP_SIZE;
-		[unroll] for(U32 l = 0; l < primLoopCount; ++l)
-		{
-			const U32 idx = l * THREADGROUP_SIZE + svGroupIndex;
 
-			if(idx < primCount)
-			{
-				const UVec3 prim = g_unifiedGeom_R8G8B8A8_Uint[meshlet.m_firstPrimitive + idx].xyz;
-
-#if PRIMITIVE_ANY_CULLING
-				Bool cull = false;
+			// Describe the visible meshlet to the vertex shader
+			GpuSceneMeshletInstance instance;
+			instance.m_worldTransformsOffset = renderable.m_worldTransformsOffset;
+			instance.m_constantsOffset = renderable.m_constantsOffset;
+			// svGroupIndex is local to the meshlet group, skip the meshlets of the groups that come before this one
+			instance.m_meshletGeometryDescriptorIndex = meshLod.m_firstMeshletGeometryDescriptor + meshletGroup * kMeshletGroupSize + svGroupIndex;
+			instance.m_boneTransformsOrParticleEmitterOffset =
+				(renderable.m_particleEmitterOffset) ? renderable.m_particleEmitterOffset : renderable.m_boneTransformsOffset;
 
-				const Vec2 a = s_windowCoords[prim.x];
-				const Vec2 b = s_windowCoords[prim.y];
-				const Vec2 c = s_windowCoords[prim.z];
-#endif
-
-#if PRIMITIVE_BACKFACE_CULLING
-				const Vec2 eb = c - a;
-				const Vec2 ec = b - a;
-
-				cull = cull || (eb.x * ec.y >= eb.y * ec.x);
-#endif
-
-#if PRIMITIVE_NO_SAMPLING_POINTS_CULLING
-				const Vec2 windowCoordsMin = min3(a, b, c);
-				const Vec2 windowCoordsMax = max3(a, b, c);
-
-				cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
-#endif
-
-#if PRIMITIVE_ANY_CULLING
-				// The computations above are only valid if all vertices are in front of perspective plane
-				cull = cull && min3(s_clipW[prim.x], s_clipW[prim.y], s_clipW[prim.z]) > 0.0f;
-#endif
-
-				if(!cull)
-				{
-					U32 firstIndex;
-					InterlockedAdd(g_indexBuffer[0], 3u, firstIndex);
-
-					g_indexBuffer[firstIndex + 1u] = encodeMetaIndex(renderable.m_meshLodsIndex + lod, meshletIndex, prim[0], 0);
-					g_indexBuffer[firstIndex + 2u] = encodeMetaIndex(renderable.m_meshLodsIndex + lod, meshletIndex, prim[1], 1);
-					g_indexBuffer[firstIndex + 3u] = encodeMetaIndex(renderable.m_meshLodsIndex + lod, meshletIndex, prim[2], 2);
-				}
-			}
+			g_drawInstances[instanceIdx] = instance;
 		}
 	}
 }

+ 0 - 3
AnKi/Shaders/Include/Common.h

@@ -798,9 +798,6 @@ static_assert(kMaxVerticesPerMeshlet % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
 constexpr U32 kMaxVisibleMeshletsPerRenderStateBucket = 100000000 / kMaxPrimitivesPerMeshlet;
 constexpr U32 kMaxMeshletGroupCountPerRenderStateBucket = kMaxVisibleMeshletsPerRenderStateBucket / kMeshletGroupSize;
 
-/// We want to have 17 bits to fit the index buffer of
-constexpr U32 kMaxMeshletsPerLod = (1u << 17u) / (3u * kMaxPrimitivesPerMeshlet);
-
 struct DrawIndirectArgs
 {
 	U32 m_vertexCount;

+ 10 - 0
AnKi/Shaders/Include/GpuSceneFunctions.h

@@ -19,6 +19,16 @@ inline GpuSceneRenderableVertex unpackGpuSceneRenderableVertex(UVec4 x)
 	return o;
 }
 
+/// Build a GpuSceneMeshletInstance out of the 4 components of @a x. The order of the components matches the order of the struct's members.
+inline GpuSceneMeshletInstance unpackGpuSceneMeshletInstance(UVec4 x)
+{
+	GpuSceneMeshletInstance instance;
+
+	// Offsets
+	instance.m_worldTransformsOffset = x[0];
+	instance.m_constantsOffset = x[1];
+	instance.m_boneTransformsOrParticleEmitterOffset = x[3];
+
+	// Index
+	instance.m_meshletGeometryDescriptorIndex = x[2];
+
+	return instance;
+}
+
 inline GpuSceneRenderableBoundingVolume initGpuSceneRenderableBoundingVolume(Vec3 aabbMin, Vec3 aabbMax, U32 renderableIndex, U32 renderStateBucket)
 {
 	GpuSceneRenderableBoundingVolume gpuVolume;

+ 9 - 0
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -45,6 +45,15 @@ struct GpuSceneTaskShaderPayload
 };
 static_assert(kMaxLodCount == 3);
 
+/// Minimal data passed to the vertex shaders in the case of meshlet rendering. It's 4 U32s so it can be unpacked from a single UVec4.
+struct GpuSceneMeshletInstance
+{
+	U32 m_worldTransformsOffset; ///< Copied from the GpuSceneRenderable by the meshlet culling shader.
+	U32 m_constantsOffset; ///< Copied from the GpuSceneRenderable by the meshlet culling shader.
+	U32 m_meshletGeometryDescriptorIndex; ///< Index in the UGB.
+	U32 m_boneTransformsOrParticleEmitterOffset; ///< The particle emitter offset of the renderable if it's non-zero, else its bone transforms offset.
+};
+
 /// Used in visibility testing.
 struct GpuSceneRenderableBoundingVolume
 {