Browse Source

Add simple mesh shader drawing (not working yet)

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
d1cb889b26

+ 1 - 1
AnKi/Core/GpuMemory/UnifiedGeometryBuffer.cpp

@@ -25,7 +25,7 @@ void UnifiedGeometryBuffer::init()
 	const Array classes = {1_KB, 8_KB, 32_KB, 128_KB, 512_KB, 4_MB, 8_MB, 16_MB, poolSize};
 
 	BufferUsageBit buffUsage = BufferUsageBit::kVertex | BufferUsageBit::kIndex | BufferUsageBit::kTransferDestination
-							   | (BufferUsageBit::kAllTexture & BufferUsageBit::kAllRead);
+							   | (BufferUsageBit::kAllTexture & BufferUsageBit::kAllRead) | (BufferUsageBit::kAllUav & BufferUsageBit::kAllRead);
 
 	if(GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled)
 	{

+ 5 - 0
AnKi/Core/GpuMemory/UnifiedGeometryBuffer.h

@@ -130,6 +130,11 @@ public:
 		return m_pool.getGpuBuffer();
 	}
 
+	BufferOffsetRange getBufferOffsetRange() const
+	{
+		return {&m_pool.getGpuBuffer(), 0, kMaxPtrSize};
+	}
+
 private:
 	SegregatedListsGpuMemoryPool m_pool;
 

+ 2 - 0
AnKi/Gr/CommandBuffer.h

@@ -310,6 +310,8 @@ public:
 
 	void drawMeshTasks(U32 groupCountX, U32 groupCountY, U32 groupCountZ);
 
+	void drawMeshTasksIndirect(Buffer* argBuffer, PtrSize argBufferOffset);
+
 	void dispatchCompute(U32 groupCountX, U32 groupCountY, U32 groupCountZ);
 
 	void dispatchComputeIndirect(Buffer* argBuffer, PtrSize argBufferOffset);

+ 7 - 0
AnKi/Gr/ShaderProgram.h

@@ -73,6 +73,11 @@ public:
 	/// Same as getShaderGroupHandles but the data live in a GPU buffer.
 	Buffer& getShaderGroupHandlesGpuBuffer() const;
 
+	ShaderTypeBit getShaderTypes() const
+	{
+		return m_shaderTypes;
+	}
+
 protected:
 	/// Construct.
 	ShaderProgram(CString name)
@@ -85,6 +90,8 @@ protected:
 	{
 	}
 
+	ShaderTypeBit m_shaderTypes = ShaderTypeBit::kNone;
+
 private:
 	/// Allocate and initialize a new instance.
 	[[nodiscard]] static ShaderProgram* newInstance(const ShaderProgramInitInfo& init);

+ 6 - 0
AnKi/Gr/Vulkan/CommandBuffer.cpp

@@ -300,6 +300,12 @@ void CommandBuffer::drawMeshTasks(U32 groupCountX, U32 groupCountY, U32 groupCou
 	self.drawMeshTasksInternal(groupCountX, groupCountY, groupCountZ);
 }
 
+void CommandBuffer::drawMeshTasksIndirect(Buffer* argBuffer, PtrSize argBufferOffset)
+{
+	ANKI_VK_SELF(CommandBufferImpl);
+	self.drawMeshTasksIndirectInternal(argBuffer, argBufferOffset);
+}
+
 void CommandBuffer::dispatchCompute(U32 groupCountX, U32 groupCountY, U32 groupCountZ)
 {
 	ANKI_VK_SELF(CommandBufferImpl);

+ 13 - 0
AnKi/Gr/Vulkan/CommandBufferImpl.h

@@ -324,6 +324,19 @@ public:
 		vkCmdDrawMeshTasksEXT(m_handle, groupCountX, groupCountY, groupCountZ);
 	}
 
+	ANKI_FORCE_INLINE void drawMeshTasksIndirectInternal(Buffer* argBuffer, PtrSize argBufferOffset)
+	{
+		ANKI_ASSERT(!!(getGrManagerImpl().getExtensions() & VulkanExtensions::kEXT_mesh_shader));
+		ANKI_ASSERT((argBufferOffset % 4) == 0);
+		const BufferImpl& impl = static_cast<const BufferImpl&>(*argBuffer);
+		ANKI_ASSERT(impl.usageValid(BufferUsageBit::kIndirectDraw));
+		ANKI_ASSERT((argBufferOffset + sizeof(DispatchIndirectArgs)) <= impl.getSize());
+
+		m_state.setPrimitiveTopology(PrimitiveTopology::kTriangles); // Not sure if that's needed
+		drawcallCommon();
+		vkCmdDrawMeshTasksIndirectEXT(m_handle, impl.getHandle(), argBufferOffset, 1, sizeof(DispatchIndirectArgs));
+	}
+
 	ANKI_FORCE_INLINE void drawIndexedIndirectCountInternal(PrimitiveTopology topology, Buffer* argBuffer, PtrSize argBufferOffset,
 															U32 argBufferStride, Buffer* countBuffer, PtrSize countBufferOffset, U32 maxDrawCount)
 	{

+ 2 - 1
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -257,7 +257,8 @@ Error GrManagerImpl::initInstance()
 				ANKI_VK_LOGV("\t%s", layer.layerName);
 				CString layerName = layer.layerName;
 
-				Bool enableLayer = (g_validationCVar.get() || g_debugMarkersCVar.get()) && layerName == "VK_LAYER_KHRONOS_validation";
+				Bool enableLayer =
+					(g_validationCVar.get() || g_debugMarkersCVar.get() || g_debugPrintfCVar.get()) && layerName == "VK_LAYER_KHRONOS_validation";
 				enableLayer = enableLayer || (!g_vkLayers.get().isEmpty() && g_vkLayers.get().find(layerName) != CString::kNpos);
 
 				if(enableLayer)

+ 4 - 4
AnKi/Gr/Vulkan/ShaderProgramImpl.cpp

@@ -105,7 +105,7 @@ Error ShaderProgramImpl::init(const ShaderProgramInitInfo& inf)
 	{
 		for(ShaderPtr& shader : m_shaders)
 		{
-			m_stages |= ShaderTypeBit(1 << shader->getShaderType());
+			m_shaderTypes |= ShaderTypeBit(1 << shader->getShaderType());
 
 			const ShaderImpl& simpl = static_cast<const ShaderImpl&>(*shader);
 
@@ -175,7 +175,7 @@ Error ShaderProgramImpl::init(const ShaderProgramInitInfo& inf)
 
 	// Get some masks
 	//
-	const Bool graphicsProg = !!(m_stages & ShaderTypeBit::kAllGraphics);
+	const Bool graphicsProg = !!(m_shaderTypes & ShaderTypeBit::kAllGraphics);
 	if(graphicsProg)
 	{
 		if(inf.m_graphicsShaders[ShaderType::kVertex])
@@ -225,7 +225,7 @@ Error ShaderProgramImpl::init(const ShaderProgramInitInfo& inf)
 
 	// Create the pipeline if compute
 	//
-	if(!!(m_stages & ShaderTypeBit::kCompute))
+	if(!!(m_shaderTypes & ShaderTypeBit::kCompute))
 	{
 		const ShaderImpl& shaderImpl = static_cast<const ShaderImpl&>(*m_shaders[0]);
 
@@ -252,7 +252,7 @@ Error ShaderProgramImpl::init(const ShaderProgramInitInfo& inf)
 
 	// Create the RT pipeline
 	//
-	if(!!(m_stages & ShaderTypeBit::kAllRayTracing))
+	if(!!(m_shaderTypes & ShaderTypeBit::kAllRayTracing))
 	{
 		// Create shaders
 		GrDynamicArray<VkPipelineShaderStageCreateInfo> stages;

+ 3 - 4
AnKi/Gr/Vulkan/ShaderProgramImpl.h

@@ -41,7 +41,7 @@ public:
 
 	Bool isGraphics() const
 	{
-		return !!(m_stages & ShaderTypeBit::kAllGraphics);
+		return !!(m_shaderTypes & ShaderTypeBit::kAllGraphics);
 	}
 
 	const VkPipelineShaderStageCreateInfo* getShaderCreateInfos(U32& count) const
@@ -88,8 +88,8 @@ public:
 
 	ShaderTypeBit getStages() const
 	{
-		ANKI_ASSERT(!!m_stages);
-		return m_stages;
+		ANKI_ASSERT(!!m_shaderTypes);
+		return m_shaderTypes;
 	}
 
 	U32 getMissShaderCount() const
@@ -111,7 +111,6 @@ public:
 
 private:
 	GrDynamicArray<ShaderPtr> m_shaders;
-	ShaderTypeBit m_stages = ShaderTypeBit::kNone;
 
 	PipelineLayout m_pplineLayout = {};
 	Array<DescriptorSetLayout, kMaxDescriptorSets> m_descriptorSetLayouts;

+ 4 - 4
AnKi/Importer/GltfImporterMesh.cpp

@@ -113,7 +113,7 @@ public:
 };
 static_assert(sizeof(TempVertex) == 5 * sizeof(Vec4), "Will be hashed");
 
-class Meshlet
+class ImporterMeshlet
 {
 public:
 	U32 m_firstVertex = 0;
@@ -135,7 +135,7 @@ public:
 	ImporterDynamicArray<TempVertex> m_verts;
 	ImporterDynamicArray<U32> m_indices;
 
-	ImporterDynamicArray<Meshlet> m_meshlets;
+	ImporterDynamicArray<ImporterMeshlet> m_meshlets;
 	ImporterDynamicArray<U8> m_localIndices;
 
 	Vec3 m_aabbMin = Vec3(kMaxF32);
@@ -474,7 +474,7 @@ static void generateMeshlets(SubMesh& submesh)
 	for(U32 meshletIdx = 0; meshletIdx < meshletCount; ++meshletIdx)
 	{
 		const meshopt_Meshlet& inMeshlet = meshlets[meshletIdx];
-		Meshlet& outMeshlet = submesh.m_meshlets[meshletIdx];
+		ImporterMeshlet& outMeshlet = submesh.m_meshlets[meshletIdx];
 
 		outMeshlet.m_firstLocalIndex = submesh.m_localIndices.getSize();
 		outMeshlet.m_localIndexCount = inMeshlet.triangle_count * 3;
@@ -983,7 +983,7 @@ Error GltfImporter::writeMesh(const cgltf_mesh& mesh) const
 			for(U32 v = 0; v < submesh.m_meshlets.getSize(); ++v)
 			{
 				MeshBinaryMeshlet& out = meshlets[v];
-				const Meshlet& in = submesh.m_meshlets[v];
+				const ImporterMeshlet& in = submesh.m_meshlets[v];
 
 				out.m_firstPrimitive = primitiveCount;
 				out.m_primitiveCount = in.m_localIndexCount / 3;

+ 42 - 17
AnKi/Renderer/Utils/Drawer.cpp

@@ -70,6 +70,11 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 								   &UnifiedGeometryBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize, Format::k##fmt);
 #include <AnKi/Shaders/Include/UnifiedGeometryTypes.defs.h>
 
+	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kMeshlets), UnifiedGeometryBuffer::getSingleton().getBufferOffsetRange());
+	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kTaskShaderPayloads), args.m_taskShaderPayloadsBuffer);
+	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kRenderables),
+					   GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+
 	// Misc
 	cmdb.setVertexAttribute(0, 0, Format::kR32G32B32A32_Uint, 0);
 	cmdb.bindIndexBuffer(&UnifiedGeometryBuffer::getSingleton().getBuffer(), 0, IndexType::kU16);
@@ -148,11 +153,11 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 		DynamicArray<U32, MemoryPoolPtrWrapper<StackMemoryPool>> offsets(&getRenderer().getFrameMemoryPool());
 		U32 allUserCount = 0;
-		RenderStateBucketContainer::getSingleton().iterateBuckets(args.m_renderingTechinuqe,
-																  [&]([[maybe_unused]] const RenderStateInfo& state, U32 userCount) {
-																	  offsets.emplaceBack(allUserCount);
-																	  allUserCount += userCount;
-																  });
+		RenderStateBucketContainer::getSingleton().iterateBuckets(
+			args.m_renderingTechinuqe, [&]([[maybe_unused]] const RenderStateInfo& state, U32 userCount, [[maybe_unused]] U32 meshletGroupCount) {
+				offsets.emplaceBack(allUserCount);
+				allUserCount += userCount;
+			});
 		U32* firstDrawArgIndices;
 		BufferOffsetRange firstDrawArgIndicesBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame(offsets.getSize(), firstDrawArgIndices);
 		memcpy(firstDrawArgIndices, &offsets[0], offsets.getSizeInBytes());
@@ -176,7 +181,10 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 	U32 allUserCount = 0;
 	U32 bucketCount = 0;
-	RenderStateBucketContainer::getSingleton().iterateBuckets(args.m_renderingTechinuqe, [&](const RenderStateInfo& state, U32 userCount) {
+	U32 allMeshletGroupCount = 0;
+	U32 legacyGeometryFlowUserCount = 0;
+	RenderStateBucketContainer::getSingleton().iterateBuckets(args.m_renderingTechinuqe, [&](const RenderStateInfo& state, U32 userCount,
+																							 U32 meshletGroupCount) {
 		if(userCount == 0)
 		{
 			++bucketCount;
@@ -186,26 +194,43 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 		ShaderProgramPtr prog = state.m_program;
 		cmdb.bindShaderProgram(prog.get());
 
-		const U32 maxDrawCount = userCount;
+		const Bool usesMeshShaders = meshletGroupCount > 0;
 
-		if(state.m_indexedDrawcall)
+		if(usesMeshShaders)
 		{
-			cmdb.drawIndexedIndirectCount(state.m_primitiveTopology, args.m_drawIndexedIndirectArgsBuffer.m_buffer,
-										  args.m_drawIndexedIndirectArgsBuffer.m_offset + sizeof(DrawIndexedIndirectArgs) * allUserCount,
-										  sizeof(DrawIndexedIndirectArgs), args.m_mdiDrawCountsBuffer.m_buffer,
-										  args.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount, maxDrawCount);
+			const UVec4 firstPayload(allMeshletGroupCount);
+			cmdb.setPushConstants(&firstPayload, sizeof(firstPayload));
+
+			cmdb.drawMeshTasksIndirect(args.m_taskShaderIndirectArgsBuffer.m_buffer,
+									   args.m_taskShaderIndirectArgsBuffer.m_offset + sizeof(DispatchIndirectArgs) * bucketCount);
 		}
 		else
 		{
-			// Yes, the DrawIndexedIndirectArgs is intentional
-			cmdb.drawIndirectCount(state.m_primitiveTopology, args.m_drawIndexedIndirectArgsBuffer.m_buffer,
-								   args.m_drawIndexedIndirectArgsBuffer.m_offset + sizeof(DrawIndexedIndirectArgs) * allUserCount,
-								   sizeof(DrawIndexedIndirectArgs), args.m_mdiDrawCountsBuffer.m_buffer,
-								   args.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount, maxDrawCount);
+			const U32 maxDrawCount = userCount;
+
+			if(state.m_indexedDrawcall)
+			{
+				cmdb.drawIndexedIndirectCount(state.m_primitiveTopology, args.m_drawIndexedIndirectArgsBuffer.m_buffer,
+											  args.m_drawIndexedIndirectArgsBuffer.m_offset
+												  + sizeof(DrawIndexedIndirectArgs) * legacyGeometryFlowUserCount,
+											  sizeof(DrawIndexedIndirectArgs), args.m_mdiDrawCountsBuffer.m_buffer,
+											  args.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount, maxDrawCount);
+			}
+			else
+			{
+				// Yes, the DrawIndexedIndirectArgs is intentional
+				cmdb.drawIndirectCount(state.m_primitiveTopology, args.m_drawIndexedIndirectArgsBuffer.m_buffer,
+									   args.m_drawIndexedIndirectArgsBuffer.m_offset + sizeof(DrawIndexedIndirectArgs) * legacyGeometryFlowUserCount,
+									   sizeof(DrawIndexedIndirectArgs), args.m_mdiDrawCountsBuffer.m_buffer,
+									   args.m_mdiDrawCountsBuffer.m_offset + sizeof(U32) * bucketCount, maxDrawCount);
+			}
+
+			legacyGeometryFlowUserCount += userCount;
 		}
 
 		++bucketCount;
 		allUserCount += userCount;
+		allMeshletGroupCount += meshletGroupCount;
 	});
 
 	ANKI_ASSERT(bucketCount == RenderStateBucketContainer::getSingleton().getBucketCount(args.m_renderingTechinuqe));

+ 5 - 0
AnKi/Renderer/Utils/Drawer.h

@@ -35,11 +35,16 @@ public:
 	BufferOffsetRange m_drawIndexedIndirectArgsBuffer;
 	BufferOffsetRange m_instanceRateRenderablesBuffer;
 
+	BufferOffsetRange m_taskShaderIndirectArgsBuffer;
+	BufferOffsetRange m_taskShaderPayloadsBuffer;
+
 	void fillMdi(const GpuVisibilityOutput& visOut)
 	{
 		m_mdiDrawCountsBuffer = visOut.m_mdiDrawCountsBuffer;
 		m_drawIndexedIndirectArgsBuffer = visOut.m_drawIndexedIndirectArgsBuffer;
 		m_instanceRateRenderablesBuffer = visOut.m_instanceRateRenderablesBuffer;
+		m_taskShaderIndirectArgsBuffer = visOut.m_taskShaderIndirectArgsBuffer;
+		m_taskShaderPayloadsBuffer = visOut.m_taskShaderPayloadBuffer;
 	}
 };
 

+ 73 - 28
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -103,6 +103,8 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 		out.m_instanceRateRenderablesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(GpuSceneRenderable));
 		out.m_drawIndexedIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(DrawIndexedIndirectArgs));
+		out.m_taskShaderIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(DispatchIndirectArgs));
+		out.m_taskShaderPayloadBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(1 * sizeof(GpuSceneTaskShaderPayload));
 
 		U32* atomics;
 		out.m_mdiDrawCountsBuffer = RebarTransientMemoryPool::getSingleton().allocateFrame<U32>(1, atomics);
@@ -121,28 +123,53 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 	const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique);
 
-	// Allocate memory for the indirect commands
-	out.m_drawIndexedIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(DrawIndexedIndirectArgs));
-	out.m_instanceRateRenderablesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(GpuSceneRenderableVertex));
+	U32 legacyGeometryFlowUserCount = 0;
+	U32 modernGeometryFlowUserCount = 0;
+	U32 meshletGroupCount = 0;
+	RenderStateBucketContainer::getSingleton().iterateBuckets(in.m_technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount_) {
+		if(meshletGroupCount_)
+		{
+			modernGeometryFlowUserCount += userCount;
+			meshletGroupCount += meshletGroupCount_;
+		}
+		else
+		{
+			legacyGeometryFlowUserCount += userCount;
+		}
+	});
+	const U32 allUserCount = aabbCount;
+
+	// Allocate memory
+	out.m_drawIndexedIndirectArgsBuffer =
+		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
+	out.m_instanceRateRenderablesBuffer =
+		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex));
+
+	out.m_taskShaderPayloadBuffer =
+		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload));
 
-	// Allocate memory for AABB indices
 	if(in.m_gatherAabbIndices)
 	{
-		out.m_visibleAaabbIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((aabbCount + 1) * sizeof(U32));
+		out.m_visibleAaabbIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((allUserCount + 1) * sizeof(U32));
 	}
 
-	// Allocate memory for counters
+	// Allocate memory for things that will be zeroed
 	PtrSize counterMemory = 0;
 	if(in.m_hashVisibles)
 	{
-		counterMemory += sizeof(GpuVisibilityHash);
-		alignRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment, counterMemory);
+		counterMemory = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment,
+										  counterMemory + sizeof(GpuVisibilityHash));
 	}
 
 	const PtrSize mdiBufferOffset = counterMemory;
 	const PtrSize mdiBufferSize = sizeof(U32) * bucketCount;
-	counterMemory += mdiBufferSize;
-	alignRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment, counterMemory);
+	counterMemory =
+		getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment, counterMemory + mdiBufferSize);
+
+	const PtrSize taskShaderIndirectArgsOffset = counterMemory;
+	const PtrSize taskShaderIndirectArgsSize = sizeof(DispatchIndirectArgs) * bucketCount;
+	counterMemory = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment,
+									  counterMemory + taskShaderIndirectArgsSize);
 
 	const BufferOffsetRange counterBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(counterMemory);
 	const BufferHandle counterBufferHandle = in.m_rgraph->importBuffer(BufferUsageBit::kNone, counterBuffer);
@@ -153,6 +180,9 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		out.m_visiblesHashBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset, sizeof(GpuVisibilityHash)};
 	}
 
+	out.m_mdiDrawCountsBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset + mdiBufferOffset, mdiBufferSize};
+	out.m_taskShaderIndirectArgsBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset + taskShaderIndirectArgsOffset, taskShaderIndirectArgsSize};
+
 	// Zero some stuff
 	{
 		ComputeRenderPassDescription& pass = in.m_rgraph->newComputeRenderPass("GPU visibility: Zero stuff");
@@ -168,9 +198,6 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		});
 	}
 
-	// Set the MDI count buffer
-	out.m_mdiDrawCountsBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset + mdiBufferOffset, mdiBufferSize};
-
 	// Create the renderpass
 	ComputeRenderPassDescription& pass = in.m_rgraph->newComputeRenderPass(in.m_passesName);
 
@@ -185,8 +212,10 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 	pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
 				  technique = in.m_technique, mdiDrawCountsBuffer = out.m_mdiDrawCountsBuffer,
-				  instanceRateRenderables = out.m_instanceRateRenderablesBuffer, indirectArgs = out.m_drawIndexedIndirectArgsBuffer, aabbCount,
-				  visibleAabbsBuffer = out.m_visibleAaabbIndicesBuffer, hashBuffer = out.m_visiblesHashBuffer](RenderPassWorkContext& rpass) {
+				  instanceRateRenderables = out.m_instanceRateRenderablesBuffer, indirectArgs = out.m_drawIndexedIndirectArgsBuffer, allUserCount,
+				  visibleAabbsBuffer = out.m_visibleAaabbIndicesBuffer, hashBuffer = out.m_visiblesHashBuffer,
+				  taskShaderIndirectArgsBuff = out.m_taskShaderIndirectArgsBuffer,
+				  taskShaderPayloadBuffer = out.m_taskShaderPayloadBuffer](RenderPassWorkContext& rpass) {
 		CommandBuffer& cmdb = *rpass.m_commandBuffer;
 
 		const Bool gatherAabbIndices = visibleAabbsBuffer.m_buffer != nullptr;
@@ -219,25 +248,41 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 		cmdb.bindUavBuffer(0, 0, aabbsBuffer);
 		cmdb.bindUavBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
-		cmdb.bindUavBuffer(0, 2, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
+		cmdb.bindUavBuffer(0, 2, GpuSceneBuffer::getSingleton().getBufferOffsetRange());
 		cmdb.bindUavBuffer(0, 3, instanceRateRenderables);
 		cmdb.bindUavBuffer(0, 4, indirectArgs);
+		cmdb.bindUavBuffer(0, 5, mdiDrawCountsBuffer);
+		cmdb.bindUavBuffer(0, 6, taskShaderIndirectArgsBuff);
+		cmdb.bindUavBuffer(0, 7, taskShaderPayloadBuffer);
 
-		U32* offsets = allocateAndBindUav<U32>(cmdb, 0, 5, RenderStateBucketContainer::getSingleton().getBucketCount(technique));
+		U32* drawIndirectArgsIndexOrTaskPayloadIndex =
+			allocateAndBindUav<U32>(cmdb, 0, 8, RenderStateBucketContainer::getSingleton().getBucketCount(technique));
 		U32 bucketCount = 0;
-		U32 userCount = 0;
-		RenderStateBucketContainer::getSingleton().iterateBuckets(technique, [&](const RenderStateInfo&, U32 userCount_) {
-			offsets[bucketCount] = userCount;
-			userCount += userCount_;
+		U32 legacyGeometryFlowDrawCount = 0;
+		U32 taskPayloadCount = 0;
+		RenderStateBucketContainer::getSingleton().iterateBuckets(technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount) {
+			if(userCount == 0)
+			{
+				// Empty bucket
+				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = kMaxU32;
+			}
+			else if(meshletGroupCount)
+			{
+				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = taskPayloadCount;
+				taskPayloadCount += meshletGroupCount;
+			}
+			else
+			{
+				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = legacyGeometryFlowDrawCount;
+				legacyGeometryFlowDrawCount += userCount;
+			}
+
 			++bucketCount;
 		});
-		ANKI_ASSERT(userCount == RenderStateBucketContainer::getSingleton().getBucketsItemCount(technique));
-
-		cmdb.bindUavBuffer(0, 6, mdiDrawCountsBuffer);
 
 		if(frustumTestData)
 		{
-			FrustumGpuVisibilityConstants* unis = allocateAndBindConstants<FrustumGpuVisibilityConstants>(cmdb, 0, 7);
+			FrustumGpuVisibilityConstants* unis = allocateAndBindConstants<FrustumGpuVisibilityConstants>(cmdb, 0, 9);
 
 			Array<Plane, 6> planes;
 			extractClipPlanes(frustumTestData->m_viewProjMat, planes);
@@ -257,8 +302,8 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 			if(frustumTestData->m_hzbRt.isValid())
 			{
-				rpass.bindColorTexture(0, 8, frustumTestData->m_hzbRt);
-				cmdb.bindSampler(0, 9, getRenderer().getSamplers().m_nearestNearestClamp.get());
+				rpass.bindColorTexture(0, 10, frustumTestData->m_hzbRt);
+				cmdb.bindSampler(0, 11, getRenderer().getSamplers().m_nearestNearestClamp.get());
 			}
 		}
 		else
@@ -287,7 +332,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			cmdb.bindUavBuffer(0, 13, hashBuffer);
 		}
 
-		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
+		dispatchPPCompute(cmdb, 64, 1, allUserCount, 1);
 	});
 }
 

+ 5 - 1
AnKi/Renderer/Utils/GpuVisibility.h

@@ -53,7 +53,11 @@ public:
 
 	BufferOffsetRange m_instanceRateRenderablesBuffer; ///< An array of GpuSceneRenderableVertex.
 	BufferOffsetRange m_drawIndexedIndirectArgsBuffer; ///< An array of DrawIndexedIndirectArgs.
-	BufferOffsetRange m_mdiDrawCountsBuffer; ///< An array of U32, one for each render state bucket.
+	BufferOffsetRange m_mdiDrawCountsBuffer; ///< An array of U32, one for each render state bucket (even those that use task/mesh flow).
+
+	/// An array of DispatchIndirectArgs, one for each render state bucket (even those that use vertex flow).
+	BufferOffsetRange m_taskShaderIndirectArgsBuffer;
+	BufferOffsetRange m_taskShaderPayloadBuffer; ///< The payloads of task shaders. One for each task shader threadgroup.
 
 	BufferOffsetRange m_visibleAaabbIndicesBuffer; ///< Optional.
 

+ 15 - 3
AnKi/Resource/MaterialResource.cpp

@@ -70,7 +70,7 @@ class MaterialResource::Program
 public:
 	ShaderProgramResourcePtr m_prog;
 
-	mutable Array3d<MaterialVariant, U(RenderingTechnique::kCount), 2, 2> m_variantMatrix;
+	mutable Array4d<MaterialVariant, U(RenderingTechnique::kCount), 2, 2, 2> m_variantMatrix;
 	mutable RWMutex m_variantMatrixMtx;
 
 	ResourceDynamicArray<PartialMutation> m_partialMutation; ///< Only with the non-builtins.
@@ -100,7 +100,10 @@ public:
 			{
 				for(U32 vel = 0; vel < 2; ++vel)
 				{
-					m_variantMatrix[t][skin][vel] = std::move(b.m_variantMatrix[t][skin][vel]);
+					for(U32 mesh = 0; mesh < 2; ++mesh)
+					{
+						m_variantMatrix[t][skin][vel][mesh] = std::move(b.m_variantMatrix[t][skin][vel][mesh]);
+					}
 				}
 			}
 		}
@@ -723,16 +726,22 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 		key.setVelocity(false);
 	}
 
+	if(!GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+	{
+		key.setMeshShaders(false);
+	}
+
 	ANKI_ASSERT(!key.getSkinned() || !!(prog.m_presentBuildinMutators & U32(1 << BuiltinMutatorId::kBones)));
 	ANKI_ASSERT(!key.getVelocity() || !!(prog.m_presentBuildinMutators & U32(1 << BuiltinMutatorId::kVelocity)));
 
-	MaterialVariant& variant = prog.m_variantMatrix[key.getRenderingTechnique()][key.getSkinned()][key.getVelocity()];
+	MaterialVariant& variant = prog.m_variantMatrix[key.getRenderingTechnique()][key.getSkinned()][key.getVelocity()][key.getMeshShaders()];
 
 	// Check if it's initialized
 	{
 		RLockGuard<RWMutex> lock(prog.m_variantMatrixMtx);
 		if(variant.m_prog.isCreated()) [[likely]]
 		{
+			ANKI_ASSERT(key.getMeshShaders() == !!(variant.m_prog->getShaderTypes() & ShaderTypeBit::kAllModernGeometry));
 			return variant;
 		}
 	}
@@ -765,6 +774,8 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 		initInfo.addMutation(kBuiltinMutatorNames[BuiltinMutatorId::kVelocity], MutatorValue(key.getVelocity()));
 	}
 
+	initInfo.requestMeshShaders(key.getMeshShaders());
+
 	const ShaderProgramResourceVariant* progVariant = nullptr;
 	prog.m_prog->getOrCreateVariant(initInfo, progVariant);
 
@@ -774,6 +785,7 @@ const MaterialVariant& MaterialResource::getOrCreateVariant(const RenderingKey&
 	}
 
 	variant.m_prog.reset(&progVariant->getProgram());
+	ANKI_ASSERT(key.getMeshShaders() == !!(variant.m_prog->getShaderTypes() & ShaderTypeBit::kAllModernGeometry));
 
 	if(!!(RenderingTechniqueBit(1 << key.getRenderingTechnique()) & RenderingTechniqueBit::kAllRt))
 	{

+ 4 - 0
AnKi/Resource/MeshBinaryLoader.cpp

@@ -331,6 +331,8 @@ Error MeshBinaryLoader::storeMeshletIndicesBuffer(U32 lod, void* ptr, PtrSize si
 		seek += getLodBuffersSize(l);
 	}
 
+	seek += getIndexBufferSize(lod);
+
 	for(U32 i = 0; i < m_header.m_vertexBuffers.getSize(); ++i)
 	{
 		seek += getVertexBufferSize(lod, i);
@@ -357,6 +359,8 @@ Error MeshBinaryLoader::storeMeshletBuffer(U32 lod, WeakArray<MeshBinaryMeshlet>
 		seek += getLodBuffersSize(l);
 	}
 
+	seek += getIndexBufferSize(lod);
+
 	for(U32 i = 0; i < m_header.m_vertexBuffers.getSize(); ++i)
 	{
 		seek += getVertexBufferSize(lod, i);

+ 2 - 2
AnKi/Resource/MeshResource.cpp

@@ -322,8 +322,8 @@ Error MeshResource::loadAsync(MeshBinaryLoader& loader) const
 				}
 
 				outMeshlet.m_firstPrimitive =
-					lod.m_meshletIndices.getOffset() + inMeshlet.m_firstPrimitive * getFormatInfo(kMeshletPrimitiveFormat).m_texelSize;
-				outMeshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint = (inMeshlet.m_primitiveCount << 16u) & inMeshlet.m_vertexCount;
+					lod.m_meshletIndices.getOffset() / getFormatInfo(kMeshletPrimitiveFormat).m_texelSize + inMeshlet.m_firstPrimitive;
+				outMeshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint = (inMeshlet.m_primitiveCount << 16u) | inMeshlet.m_vertexCount;
 				outMeshlet.m_sphereCenter = inMeshlet.m_boundingVolume.m_sphereCenter;
 				outMeshlet.m_sphereRadius = inMeshlet.m_boundingVolume.m_sphereRadius;
 				outMeshlet.m_coneApex = inMeshlet.m_coneApex;

+ 17 - 4
AnKi/Resource/RenderingKey.h

@@ -39,22 +39,23 @@ ANKI_ENUM_ALLOW_NUMERIC_OPERATIONS(RenderingTechniqueBit)
 class RenderingKey
 {
 public:
-	RenderingKey(RenderingTechnique technique, U32 lod, Bool skinned, Bool velocity)
+	RenderingKey(RenderingTechnique technique, U32 lod, Bool skinned, Bool velocity, Bool meshShaders)
 		: m_technique(technique)
 		, m_lod(lod & 0b11)
 		, m_skinned(skinned)
 		, m_velocity(velocity)
+		, m_meshShaders(meshShaders)
 	{
 		ANKI_ASSERT(lod < kMaxLodCount);
 	}
 
 	RenderingKey()
-		: RenderingKey(RenderingTechnique::kFirst, 0, false, false)
+		: RenderingKey(RenderingTechnique::kFirst, 0, false, false, false)
 	{
 	}
 
 	RenderingKey(const RenderingKey& b)
-		: RenderingKey(b.m_technique, b.m_lod, b.m_skinned, b.m_velocity)
+		: RenderingKey(b.m_technique, b.m_lod, b.m_skinned, b.m_velocity, b.m_meshShaders)
 	{
 	}
 
@@ -66,7 +67,8 @@ public:
 
 	Bool operator==(const RenderingKey& b) const
 	{
-		return m_technique == b.m_technique && m_lod == b.m_lod && m_skinned == b.m_skinned && m_velocity == b.m_velocity;
+		return m_technique == b.m_technique && m_lod == b.m_lod && m_skinned == b.m_skinned && m_velocity == b.m_velocity
+			   && m_meshShaders == b.m_meshShaders;
 	}
 
 	RenderingTechnique getRenderingTechnique() const
@@ -110,11 +112,22 @@ public:
 		m_velocity = v;
 	}
 
+	void setMeshShaders(Bool b)
+	{
+		m_meshShaders = b;
+	}
+
+	Bool getMeshShaders() const
+	{
+		return m_meshShaders;
+	}
+
 private:
 	RenderingTechnique m_technique;
 	U8 m_lod : 2;
 	Bool m_skinned : 1;
 	Bool m_velocity : 1;
+	Bool m_meshShaders : 1;
 
 	static_assert(kMaxLodCount <= 3, "m_lod only reserves 2 bits so make sure all LODs will fit");
 };

+ 12 - 2
AnKi/Resource/ShaderProgramResource.cpp

@@ -198,12 +198,20 @@ void ShaderProgramResource::getOrCreateVariant(const ShaderProgramResourceVarian
 	// Sanity checks
 	ANKI_ASSERT(info.m_setMutators.getSetBitCount() == m_mutators.getSize());
 	ANKI_ASSERT(info.m_setConstants.getSetBitCount() == m_consts.getSize());
+	if(info.m_meshShaders)
+	{
+		ANKI_ASSERT(!!(m_shaderStages & ShaderTypeBit::kAllModernGeometry));
+	}
+	else if(!!(m_shaderStages & ShaderTypeBit::kAllGraphics))
+	{
+		ANKI_ASSERT(!!(m_shaderStages & ShaderTypeBit::kAllLegacyGeometry));
+	}
 
 	// Compute variant hash
-	U64 hash = info.m_meshShaders * 0xBAD;
+	U64 hash = 0xBAD << U32(info.m_meshShaders);
 	if(m_mutators.getSize())
 	{
-		hash = computeHash(info.m_mutation.getBegin(), m_mutators.getSize() * sizeof(info.m_mutation[0]));
+		hash = appendHash(info.m_mutation.getBegin(), m_mutators.getSize() * sizeof(info.m_mutation[0]), hash);
 	}
 
 	if(m_consts.getSize())
@@ -220,6 +228,7 @@ void ShaderProgramResource::getOrCreateVariant(const ShaderProgramResourceVarian
 		{
 			// Done
 			variant = *it;
+			ANKI_ASSERT(!info.m_meshShaders || !!(variant->m_prog->getShaderTypes() & ShaderTypeBit::kAllModernGeometry));
 			return;
 		}
 	}
@@ -243,6 +252,7 @@ void ShaderProgramResource::getOrCreateVariant(const ShaderProgramResourceVarian
 		m_variants.emplace(hash, v);
 	}
 	variant = v;
+	ANKI_ASSERT(!info.m_meshShaders || !!(variant->m_prog->getShaderTypes() & ShaderTypeBit::kAllModernGeometry));
 }
 
 ShaderProgramResourceVariant* ShaderProgramResource::createNewVariant(const ShaderProgramResourceVariantInitInfo& info) const

+ 1 - 1
AnKi/Resource/ShaderProgramResource.h

@@ -313,7 +313,7 @@ inline ShaderProgramResourceVariantInitInfo& ShaderProgramResourceVariantInitInf
 
 inline void ShaderProgramResourceVariantInitInfo::requestMeshShaders(Bool request)
 {
-	ANKI_ASSERT(!!(m_ptr->getStages() & ShaderTypeBit::kAllModernGeometry));
+	ANKI_ASSERT(!request || !!(m_ptr->getStages() & ShaderTypeBit::kAllModernGeometry));
 	m_meshShaders = request;
 }
 /// @}

+ 6 - 2
AnKi/Scene/Components/ModelComponent.cpp

@@ -205,7 +205,7 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 			gpuRenderable.m_boneTransformsOffset = (hasSkin) ? m_skinComponent->getBoneTransformsGpuSceneOffset() : 0;
 			if(!!(mtl.getRenderingTechniques() & RenderingTechniqueBit::kRtShadow))
 			{
-				const RenderingKey key(RenderingTechnique::kRtShadow, 0, false, false);
+				const RenderingKey key(RenderingTechnique::kRtShadow, 0, false, false, false);
 				const MaterialVariant& variant = mtl.getOrCreateVariant(key);
 				gpuRenderable.m_rtShadowsShaderHandleIndex = variant.getRtShaderGroupHandleIndex();
 			}
@@ -271,6 +271,7 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 				key.setRenderingTechnique(t);
 				key.setSkinned(hasSkin);
 				key.setVelocity(moved);
+				key.setMeshShaders(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders);
 
 				const MaterialVariant& mvariant = m_model->getModelPatches()[i].getMaterial()->getOrCreateVariant(key);
 
@@ -279,7 +280,10 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 				state.m_indexedDrawcall = true;
 				state.m_program = mvariant.getShaderProgram();
 
-				m_patchInfos[i].m_renderStateBucketIndices[t] = RenderStateBucketContainer::getSingleton().addUser(state, t);
+				ModelPatchGeometryInfo inf;
+				m_model->getModelPatches()[i].getGeometryInfo(0, inf);
+				m_patchInfos[i].m_renderStateBucketIndices[t] = RenderStateBucketContainer::getSingleton().addUser(
+					state, t, (GrManager::getSingleton().getDeviceCapabilities().m_meshShaders) ? inf.m_meshletCount : 0);
 			}
 		}
 	}

+ 1 - 1
AnKi/Scene/Components/ParticleEmitterComponent.cpp

@@ -321,7 +321,7 @@ void ParticleEmitterComponent::loadParticleEmitterResource(CString filename)
 		state.m_program = prog;
 		state.m_primitiveTopology = PrimitiveTopology::kTriangles;
 		state.m_indexedDrawcall = false;
-		m_renderStateBuckets[t] = RenderStateBucketContainer::getSingleton().addUser(state, t);
+		m_renderStateBuckets[t] = RenderStateBucketContainer::getSingleton().addUser(state, t, 0);
 	}
 }
 

+ 30 - 6
AnKi/Scene/RenderStateBucket.cpp

@@ -13,16 +13,26 @@ RenderStateBucketContainer::~RenderStateBucketContainer()
 	{
 		for([[maybe_unused]] ExtendedBucket& b : m_buckets[t])
 		{
-			ANKI_ASSERT(!b.m_program.isCreated() && b.m_userCount == 0);
+			ANKI_ASSERT(!b.m_program.isCreated() && b.m_userCount == 0 && b.m_meshletGroupCount == 0);
 		}
 
-		ANKI_ASSERT(m_bucketItemCount[t] == 0);
+		ANKI_ASSERT(m_bucketUserCount[t] == 0);
 		ANKI_ASSERT(m_activeBucketCount[t] == 0);
+		ANKI_ASSERT(m_meshletGroupCount[t] == 0);
 	}
 }
 
-RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo& state, RenderingTechnique technique)
+RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo& state, RenderingTechnique technique, U32 lod0MeshletCount)
 {
+	if(!!(state.m_program->getShaderTypes() & ShaderTypeBit::kAllModernGeometry))
+	{
+		ANKI_ASSERT(lod0MeshletCount > 0);
+	}
+	else
+	{
+		ANKI_ASSERT(lod0MeshletCount == 0);
+	}
+
 	// Compute state hash
 	Array<U64, 3> toHash;
 	toHash[0] = state.m_program->getUuid();
@@ -30,6 +40,8 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	toHash[2] = state.m_indexedDrawcall;
 	const U64 hash = computeHash(toHash.getBegin(), toHash.getSizeInBytes());
 
+	const U32 meshletGroupCount = (lod0MeshletCount + kMaxMeshletsPerTaskShaderPayload - 1) / kMaxMeshletsPerTaskShaderPayload;
+
 	SceneDynamicArray<ExtendedBucket>& buckets = m_buckets[technique];
 
 	RenderStateBucketIndex out;
@@ -37,7 +49,8 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 
 	LockGuard lock(m_mtx);
 
-	++m_bucketItemCount[technique];
+	++m_bucketUserCount[technique];
+	m_meshletGroupCount[technique] += meshletGroupCount;
 
 	// Search bucket
 	for(U32 i = 0; i < buckets.getSize(); ++i)
@@ -45,10 +58,12 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 		if(buckets[i].m_hash == hash)
 		{
 			++buckets[i].m_userCount;
+			buckets[i].m_meshletGroupCount += meshletGroupCount;
 
 			if(buckets[i].m_userCount == 1)
 			{
 				ANKI_ASSERT(!buckets[i].m_program.isCreated());
+				ANKI_ASSERT(buckets[i].m_meshletGroupCount == meshletGroupCount);
 				buckets[i].m_program = state.m_program;
 				++m_activeBucketCount[technique];
 			}
@@ -58,6 +73,7 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 			}
 
 			out.m_index = i;
+			out.m_lod0MeshletCount = lod0MeshletCount;
 			return out;
 		}
 	}
@@ -69,10 +85,12 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	newBucket.m_primitiveTopology = state.m_primitiveTopology;
 	newBucket.m_program = state.m_program;
 	newBucket.m_userCount = 1;
+	newBucket.m_meshletGroupCount = meshletGroupCount;
 
 	++m_activeBucketCount[technique];
 
 	out.m_index = buckets.getSize() - 1;
+	out.m_lod0MeshletCount = lod0MeshletCount;
 	return out;
 }
 
@@ -85,18 +103,24 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 
 	const RenderingTechnique technique = bucketIndex.m_technique;
 	const U32 idx = bucketIndex.m_index;
+	const U32 meshletGroupCount = (bucketIndex.m_lod0MeshletCount + kMaxMeshletsPerTaskShaderPayload - 1) / kMaxMeshletsPerTaskShaderPayload;
 	bucketIndex.invalidate();
 
 	LockGuard lock(m_mtx);
 
 	ANKI_ASSERT(idx < m_buckets[technique].getSize());
 
-	--m_bucketItemCount[technique];
+	ANKI_ASSERT(m_bucketUserCount[technique] > 0);
+	--m_bucketUserCount[technique];
+
+	ANKI_ASSERT(m_meshletGroupCount[technique] >= meshletGroupCount);
+	m_meshletGroupCount[technique] -= meshletGroupCount;
 
 	ExtendedBucket& bucket = m_buckets[technique][idx];
-	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated());
+	ANKI_ASSERT(bucket.m_userCount > 0 && bucket.m_program.isCreated() && bucket.m_meshletGroupCount >= meshletGroupCount);
 
 	--bucket.m_userCount;
+	bucket.m_meshletGroupCount -= meshletGroupCount;
 
 	if(bucket.m_userCount == 0)
 	{

+ 16 - 6
AnKi/Scene/RenderStateBucket.h

@@ -21,7 +21,6 @@ public:
 	ShaderProgramPtr m_program;
 	PrimitiveTopology m_primitiveTopology = PrimitiveTopology::kTriangles;
 	Bool m_indexedDrawcall = true;
-	Bool m_meshShaders = false;
 };
 
 class RenderStateBucketIndex
@@ -47,6 +46,7 @@ public:
 		ANKI_ASSERT(!isValid() && "Forgot to delete");
 		m_index = b.m_index;
 		m_technique = b.m_technique;
+		m_lod0MeshletCount = b.m_lod0MeshletCount;
 		b.invalidate();
 		return *this;
 	}
@@ -64,11 +64,13 @@ public:
 
 private:
 	U32 m_index = kMaxU32;
+	U32 m_lod0MeshletCount = kMaxU32;
 	RenderingTechnique m_technique = RenderingTechnique::kCount;
 
 	void invalidate()
 	{
 		m_index = kMaxU32;
+		m_lod0MeshletCount = kMaxU32;
 		m_technique = RenderingTechnique::kCount;
 	}
 };
@@ -82,7 +84,7 @@ class RenderStateBucketContainer : public MakeSingleton<RenderStateBucketContain
 public:
 	/// Add a new user for a specific render state and rendering technique.
 	/// @note It's thread-safe against addUser and removeUser
-	RenderStateBucketIndex addUser(const RenderStateInfo& state, RenderingTechnique technique);
+	RenderStateBucketIndex addUser(const RenderStateInfo& state, RenderingTechnique technique, U32 lod0MeshletCount);
 
 	/// Remove the user.
 	/// @note It's thread-safe against addUser and removeUser
@@ -94,14 +96,20 @@ public:
 	{
 		for(const ExtendedBucket& b : m_buckets[technique])
 		{
-			func(static_cast<const RenderStateInfo&>(b), b.m_userCount);
+			func(static_cast<const RenderStateInfo&>(b), b.m_userCount, b.m_meshletGroupCount);
 		}
 	}
 
 	/// Get the number of renderables of all the buckets of a specific rendering technique.
-	U32 getBucketsItemCount(RenderingTechnique technique) const
+	U32 getBucketsUserCount(RenderingTechnique technique) const
 	{
-		return m_bucketItemCount[technique];
+		return m_bucketUserCount[technique];
+	}
+
+	/// Get the number of meshlet groups of a technique.
+	U32 getBucketsMeshletGroupCount(RenderingTechnique technique) const
+	{
+		return m_meshletGroupCount[technique];
 	}
 
 	/// Get number of empty and non-empty buckets.
@@ -122,10 +130,12 @@ private:
 	public:
 		U64 m_hash = 0;
 		U32 m_userCount = 0;
+		U32 m_meshletGroupCount = 0;
 	};
 
 	Array<SceneDynamicArray<ExtendedBucket>, U32(RenderingTechnique::kCount)> m_buckets;
-	Array<U32, U32(RenderingTechnique::kCount)> m_bucketItemCount = {};
+	Array<U32, U32(RenderingTechnique::kCount)> m_bucketUserCount = {};
+	Array<U32, U32(RenderingTechnique::kCount)> m_meshletGroupCount = {};
 	Array<U32, U32(RenderingTechnique::kCount)> m_activeBucketCount = {};
 
 	Mutex m_mtx;

+ 20 - 6
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -222,21 +222,35 @@ VertOut main(VertIn input)
 
 groupshared MeshShaderPayload s_payload;
 
+struct FirstPayload
+{
+	UVec4 m_val;
+};
+
+[[vk::push_constant]] ConstantBuffer<FirstPayload> g_firstPayload;
+
 [numthreads(ANKI_TASK_SHADER_THREADGROUP_SIZE, 1, 1)] void main(U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX)
 {
-	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[svGroupId];
+	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_firstPayload.m_val.x + svGroupId];
 
 	const U32 meshletCount = (inPayload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit & 63u) + 1u;
 	const U32 firstMeshlet = inPayload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit >> 6u;
 
 	if(svGroupIndex < meshletCount)
 	{
+		const GpuSceneRenderable renderable = g_renderables[inPayload.m_renderableIndex];
+		const GpuSceneMeshLod meshLod = g_gpuScene.Load<GpuSceneMeshLod>(renderable.m_meshLodsOffset);
+
 		s_payload.m_meshletIndices[svGroupIndex] = firstMeshlet + svGroupIndex;
-		s_payload.m_worldTransformsOffset = inPayload.m_worldTransformsOffset;
-		s_payload.m_constantsOffset = inPayload.m_constantsOffset;
-		s_payload.m_boneTransformsOrParticleEmitterOffset = inPayload.m_boneTransformsOrParticleEmitterOffset;
-		s_payload.m_positionScale = inPayload.m_positionScale;
-		s_payload.m_positionTranslation = inPayload.m_positionTranslation;
+
+		if(svGroupIndex == 0u)
+		{
+			s_payload.m_worldTransformsOffset = renderable.m_worldTransformsOffset;
+			s_payload.m_constantsOffset = renderable.m_constantsOffset;
+			s_payload.m_boneTransformsOrParticleEmitterOffset = renderable.m_boneTransformsOffset;
+			s_payload.m_positionScale = meshLod.m_positionScale;
+			s_payload.m_positionTranslation = meshLod.m_positionTranslation;
+		}
 	}
 
 	GroupMemoryBarrierWithGroupSync();

+ 77 - 43
AnKi/Shaders/GpuVisibility.ankiprog

@@ -31,25 +31,30 @@ struct DrawIndirectArgsWithPadding
 [[vk::binding(1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
 [[vk::binding(2)]] ByteAddressBuffer g_gpuScene;
 
-// These 2 have the same size
+// These 3 have the same size
 [[vk::binding(3)]] RWStructuredBuffer<UVec4> g_instanceRateRenderables;
 [[vk::binding(4)]] RWStructuredBuffer<DrawIndexedIndirectArgs> g_drawIndexedIndirectArgs;
 [[vk::binding(4)]] RWStructuredBuffer<DrawIndirectArgsWithPadding> g_drawIndirectArgs;
 
-// Index pointing to the above arrays. One for each render state bucket
-[[vk::binding(5)]] StructuredBuffer<U32> g_drawIndirectArgsOffsets;
 // The MDI counts. One for each render state bucket
-[[vk::binding(6)]] RWStructuredBuffer<U32> g_mdiDrawCounts;
+[[vk::binding(5)]] RWStructuredBuffer<U32> g_mdiDrawCounts;
+
+// For mesh shading
+[[vk::binding(6)]] RWStructuredBuffer<DispatchIndirectArgs> g_taskShaderIndirectArgs;
+[[vk::binding(7)]] RWStructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
+
+// One for each render state bucket. It's either the index of the next indirect args or the index to the next task payload
+[[vk::binding(8)]] StructuredBuffer<U32> g_drawIndirectArgsIndexOrTaskPayloadIndex;
 
 #if DISTANCE_TEST == 0
-[[vk::binding(7)]] ConstantBuffer<FrustumGpuVisibilityConstants> g_consts;
+[[vk::binding(9)]] ConstantBuffer<FrustumGpuVisibilityConstants> g_consts;
 #else
 [[vk::push_constant]] ConstantBuffer<DistanceGpuVisibilityConstants> g_consts;
 #endif
 
 #if HZB_TEST
-[[vk::binding(8)]] Texture2D<Vec4> g_hzbTex;
-[[vk::binding(9)]] SamplerState g_nearestAnyClampSampler;
+[[vk::binding(10)]] Texture2D<Vec4> g_hzbTex;
+[[vk::binding(11)]] SamplerState g_nearestAnyClampSampler;
 #endif
 
 #if GATHER_AABBS
@@ -177,52 +182,81 @@ struct DrawIndirectArgsWithPadding
 	const U32 renderStateBucket = bvolume.m_renderableIndexAndRenderStateBucket & ((1u << 12u) - 1u);
 	const U32 renderableIdx = bvolume.m_renderableIndexAndRenderStateBucket >> 12u;
 
-	U32 indirectIdx;
-	InterlockedAdd(g_mdiDrawCounts[renderStateBucket], 1, indirectIdx);
-	indirectIdx += g_drawIndirectArgsOffsets[renderStateBucket];
-
 	const GpuSceneRenderable renderable = g_renderables[renderableIdx];
 	const U32 meshLodOffset = renderable.m_meshLodsOffset + sizeof(GpuSceneMeshLod) * lod;
 	const GpuSceneMeshLod meshLod = g_gpuScene.Load<GpuSceneMeshLod>(meshLodOffset);
 
 	const Bool isParticleEmitter = renderable.m_particleEmitterOffset != 0;
+	const Bool usesMeshShaders = meshLod.m_meshletCount != 0u;
 
-	if(!isParticleEmitter)
+	if(usesMeshShaders)
 	{
-		// Regular renderables are always indexed
-
-		DrawIndexedIndirectArgs indirect;
-		indirect.m_indexCount = meshLod.m_indexCount;
-		indirect.m_instanceCount = 1;
-		indirect.m_firstIndex = meshLod.m_firstIndex;
-		indirect.m_vertexOffset = 0;
-		indirect.m_firstInstance = indirectIdx;
-		g_drawIndexedIndirectArgs[indirectIdx] = indirect;
-
-		UVec4 instanceVertex;
-		instanceVertex.x = renderable.m_worldTransformsOffset;
-		instanceVertex.y = renderable.m_constantsOffset;
-		instanceVertex.z = meshLodOffset;
-		instanceVertex.w = renderable.m_boneTransformsOffset;
-		g_instanceRateRenderables[indirectIdx] = instanceVertex;
+		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMaxMeshletsPerTaskShaderPayload - 1)) / kMaxMeshletsPerTaskShaderPayload;
+
+		U32 payloadIdx;
+		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, payloadIdx);
+		payloadIdx += g_drawIndirectArgsIndexOrTaskPayloadIndex[renderStateBucket];
+
+		g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 1u;
+		g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountZ = 1u;
+
+		// Divide the mesh into meshlet groups and add them as task payloads
+		GpuSceneTaskShaderPayload payload;
+		payload.m_renderableIndex = renderableIdx;
+
+		for(U32 i = 0; i < meshletGroupCount; ++i)
+		{
+			const U32 firstMeshlet = meshLod.m_firstMeshlet + kMaxMeshletsPerTaskShaderPayload * i;
+			const U32 meshletCount = min(kMaxMeshletsPerTaskShaderPayload, meshLod.m_meshletCount - i * kMaxMeshletsPerTaskShaderPayload);
+
+			payload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit = (firstMeshlet << 6u) | (meshletCount - 1u);
+
+			g_taskShaderPayloads[payloadIdx + i] = payload;
+		}
 	}
 	else
 	{
-		const GpuSceneParticleEmitter emitter = g_gpuScene.Load<GpuSceneParticleEmitter>(renderable.m_particleEmitterOffset);
-
-		DrawIndirectArgsWithPadding indirect;
-		indirect.m_vertexCount = emitter.m_aliveParticleCount * meshLod.m_indexCount;
-		indirect.m_instanceCount = 1;
-		indirect.m_firstVertex = 0;
-		indirect.m_firstInstance = indirectIdx;
-		g_drawIndirectArgs[indirectIdx] = indirect;
-
-		UVec4 instanceVertex;
-		instanceVertex.x = renderable.m_worldTransformsOffset;
-		instanceVertex.y = renderable.m_constantsOffset;
-		instanceVertex.z = meshLodOffset;
-		instanceVertex.w = renderable.m_particleEmitterOffset;
-		g_instanceRateRenderables[indirectIdx] = instanceVertex;
+		U32 indirectIdx;
+		InterlockedAdd(g_mdiDrawCounts[renderStateBucket], 1, indirectIdx);
+		indirectIdx += g_drawIndirectArgsIndexOrTaskPayloadIndex[renderStateBucket];
+
+		if(!isParticleEmitter)
+		{
+			// Regular renderables are always indexed
+
+			DrawIndexedIndirectArgs indirect;
+			indirect.m_indexCount = meshLod.m_indexCount;
+			indirect.m_instanceCount = 1;
+			indirect.m_firstIndex = meshLod.m_firstIndex;
+			indirect.m_vertexOffset = 0;
+			indirect.m_firstInstance = indirectIdx;
+			g_drawIndexedIndirectArgs[indirectIdx] = indirect;
+
+			UVec4 instanceVertex;
+			instanceVertex.x = renderable.m_worldTransformsOffset;
+			instanceVertex.y = renderable.m_constantsOffset;
+			instanceVertex.z = meshLodOffset;
+			instanceVertex.w = renderable.m_boneTransformsOffset;
+			g_instanceRateRenderables[indirectIdx] = instanceVertex;
+		}
+		else
+		{
+			const GpuSceneParticleEmitter emitter = g_gpuScene.Load<GpuSceneParticleEmitter>(renderable.m_particleEmitterOffset);
+
+			DrawIndirectArgsWithPadding indirect;
+			indirect.m_vertexCount = emitter.m_aliveParticleCount * meshLod.m_indexCount;
+			indirect.m_instanceCount = 1;
+			indirect.m_firstVertex = 0;
+			indirect.m_firstInstance = indirectIdx;
+			g_drawIndirectArgs[indirectIdx] = indirect;
+
+			UVec4 instanceVertex;
+			instanceVertex.x = renderable.m_worldTransformsOffset;
+			instanceVertex.y = renderable.m_constantsOffset;
+			instanceVertex.z = meshLodOffset;
+			instanceVertex.w = renderable.m_particleEmitterOffset;
+			g_instanceRateRenderables[indirectIdx] = instanceVertex;
+		}
 	}
 
 #if HASH_VISIBLES

+ 2 - 0
AnKi/Shaders/Include/Common.h

@@ -768,6 +768,8 @@ constexpr U32 kMaxMipsSinglePassDownsamplerCanProduce = 12u;
 constexpr U32 kMaxPrimitivesPerMeshlet = 64;
 constexpr U32 kMaxVerticesPerMeshlet = 64;
 #define ANKI_TASK_SHADER_THREADGROUP_SIZE 64u
+constexpr U32 kMaxMeshletsPerTaskShaderPayload = ANKI_TASK_SHADER_THREADGROUP_SIZE;
+
 #define ANKI_MESH_SHADER_THREADGROUP_SIZE 64u
 static_assert(ANKI_MESH_SHADER_THREADGROUP_SIZE == max(kMaxPrimitivesPerMeshlet, kMaxVerticesPerMeshlet));
 

+ 1 - 6
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -42,12 +42,7 @@ static_assert(sizeof(GpuSceneRenderableVertex) == sizeof(UVec4));
 struct GpuSceneTaskShaderPayload
 {
 	U32 m_firstMeshlet_26bit_meshletCountMinusOne_6bit;
-	U32 m_worldTransformsOffset;
-	U32 m_constantsOffset;
-	U32 m_boneTransformsOrParticleEmitterOffset;
-
-	Vec3 m_positionTranslation;
-	F32 m_positionScale;
+	U32 m_renderableIndex;
 };
 static_assert(ANKI_TASK_SHADER_THREADGROUP_SIZE == 2u << (6u - 1u)); // Need to fit to 6bit
 

+ 1 - 0
AnKi/Shaders/Include/MaterialTypes.h

@@ -39,6 +39,7 @@ enum class MaterialBinding : U32
 
 	kMeshlets, // Pointing to the unified geom buffer
 	kTaskShaderPayloads,
+	kRenderables,
 
 	// For FW shading:
 	kLinearClampSampler,

+ 1 - 0
AnKi/Shaders/MaterialShadersCommon.hlsl

@@ -24,6 +24,7 @@ ANKI_BINDLESS_SET(MaterialSet::kBindless)
 
 [[vk::binding(MaterialBinding::kMeshlets, MaterialSet::kGlobal)]] StructuredBuffer<Meshlet> g_meshlets;
 [[vk::binding(MaterialBinding::kTaskShaderPayloads, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
+[[vk::binding(MaterialBinding::kRenderables, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
 
 // FW shading specific
 #if defined(FORWARD_SHADING)