
Decrease task payload size even further

Panagiotis Christopoulos Charitos committed 2 years ago
parent commit f8fc2dfce2

+ 2 - 2
AnKi/Scene/RenderStateBucket.cpp

@@ -40,7 +40,7 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	toHash[2] = state.m_indexedDrawcall;
 	const U64 hash = computeHash(toHash.getBegin(), toHash.getSizeInBytes());
 
-	const U32 meshletGroupCount = lod0MeshletCount + (kMaxMeshletsPerTaskShaderThreadgroup - 1) / kMaxMeshletsPerTaskShaderThreadgroup;
+	const U32 meshletGroupCount = (lod0MeshletCount + kMeshletGroupSize - 1) / kMeshletGroupSize;
 
 	SceneDynamicArray<ExtendedBucket>& buckets = m_buckets[technique];
 
@@ -103,7 +103,7 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)
 
 	const RenderingTechnique technique = bucketIndex.m_technique;
 	const U32 idx = bucketIndex.m_index;
-	const U32 meshletGroupCount = bucketIndex.m_lod0MeshletCount + (kMaxMeshletsPerTaskShaderThreadgroup - 1) / kMaxMeshletsPerTaskShaderThreadgroup;
+	const U32 meshletGroupCount = (bucketIndex.m_lod0MeshletCount + kMeshletGroupSize - 1) / kMeshletGroupSize;
 	bucketIndex.invalidate();
 
 	LockGuard lock(m_mtx);
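
The group count above is a ceiling division: every kMeshletGroupSize (128) meshlets of LOD 0 occupy one task-shader threadgroup. A minimal standalone C++ sketch of that computation, with the constant copied from Common.h and a hypothetical helper name (the engine computes this inline here and in GpuVisibility.ankiprog):

    #include <cstdint>

    using U32 = std::uint32_t;
    constexpr U32 kMeshletGroupSize = 128u; // ANKI_TASK_SHADER_THREADGROUP_SIZE

    // Hypothetical helper, not engine code: ceiling division of meshlets into groups.
    constexpr U32 computeMeshletGroupCount(U32 meshletCount)
    {
        return (meshletCount + kMeshletGroupSize - 1u) / kMeshletGroupSize;
    }

    static_assert(computeMeshletGroupCount(1u) == 1u);
    static_assert(computeMeshletGroupCount(128u) == 1u);
    static_assert(computeMeshletGroupCount(129u) == 2u);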

+ 16 - 15
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -233,23 +233,24 @@ struct FirstPayload
 {
 	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_firstPayload.m_val.x + svGroupId];
 
-	const U32 meshletCount = (inPayload.m_firstMeshlet_25bit_meshletCountMinusOne_7bit & 127u) + 1u;
-	const U32 firstMeshlet = inPayload.m_firstMeshlet_25bit_meshletCountMinusOne_7bit >> 7u;
+	const U32 lod = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 30u;
+	const U32 renderableIdx = (inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit >> 9u) & ((1u << 21u) - 1u);
+	const U32 meshletGroup = inPayload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit & ((1u << 9u) - 1u);
 
-	if(svGroupIndex < meshletCount)
-	{
-		const GpuSceneRenderable renderable = g_renderables[inPayload.m_renderableIndex];
-		const GpuSceneMeshLod meshLod = g_gpuScene.Load<GpuSceneMeshLod>(renderable.m_meshLodsOffset);
+	const GpuSceneRenderable renderable = g_renderables[renderableIdx];
+	const GpuSceneMeshLod meshLod = g_gpuScene.Load<GpuSceneMeshLod>(renderable.m_meshLodsOffset + sizeof(GpuSceneMeshLod) * lod);
+	U32 firstMeshlet = meshletGroup * kMeshletGroupSize;
+	const U32 meshletCount = min(kMeshletGroupSize, meshLod.m_meshletCount - firstMeshlet);
+	firstMeshlet += meshLod.m_firstMeshlet;
 
-		if(svGroupIndex == 0u)
-		{
-			s_payload.m_firstMeshletIndex = firstMeshlet;
-			s_payload.m_worldTransformsOffset = renderable.m_worldTransformsOffset;
-			s_payload.m_constantsOffset = renderable.m_constantsOffset;
-			s_payload.m_boneTransformsOrParticleEmitterOffset = renderable.m_boneTransformsOffset;
-			s_payload.m_positionScale = meshLod.m_positionScale;
-			s_payload.m_positionTranslation = meshLod.m_positionTranslation;
-		}
+	if(svGroupIndex == 0u)
+	{
+		s_payload.m_firstMeshletIndex = firstMeshlet;
+		s_payload.m_worldTransformsOffset = renderable.m_worldTransformsOffset;
+		s_payload.m_constantsOffset = renderable.m_constantsOffset;
+		s_payload.m_boneTransformsOrParticleEmitterOffset = renderable.m_boneTransformsOffset;
+		s_payload.m_positionScale = meshLod.m_positionScale;
+		s_payload.m_positionTranslation = meshLod.m_positionTranslation;
 	}
 
 	DispatchMesh(meshletCount, 1, 1, s_payload);
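
The task shader now recovers the LOD, the renderable index and the meshlet group from a single packed U32. A hedged C++ mirror of that unpacking, using illustrative names for the decoded struct and helper; the bit layout is the 2 + 21 + 9 split implied by the field name:

    #include <cstdint>

    using U32 = std::uint32_t;

    struct DecodedTaskPayload // illustrative name, not an engine type
    {
        U32 m_lod;           // bits 30..31 (2 bits)
        U32 m_renderableIdx; // bits 9..29  (21 bits)
        U32 m_meshletGroup;  // bits 0..8   (9 bits)
    };

    inline DecodedTaskPayload decodeTaskPayload(U32 packed)
    {
        DecodedTaskPayload out;
        out.m_lod = packed >> 30u;
        out.m_renderableIdx = (packed >> 9u) & ((1u << 21u) - 1u);
        out.m_meshletGroup = packed & ((1u << 9u) - 1u);
        return out;
    }

The group index then selects a window of at most kMeshletGroupSize meshlets inside the chosen LOD, offset by meshLod.m_firstMeshlet before dispatch.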

+ 12 - 14
AnKi/Shaders/GpuVisibility.ankiprog

@@ -191,37 +191,35 @@ struct DrawIndirectArgsWithPadding
 
 	if(usesMeshShaders)
 	{
-		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMaxMeshletsPerTaskShaderThreadgroup - 1)) / kMaxMeshletsPerTaskShaderThreadgroup;
+		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMeshletGroupSize - 1u)) / kMeshletGroupSize;
 
 		U32 payloadIdx;
 		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, payloadIdx);
 
-		if(payloadIdx >= kMaxMeshletGroupCountPerRenderStateBucket)
+		if(payloadIdx == 0u)
 		{
-			// Reached a memory limit, start over to force rendering from the beginning
+			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 1u;
+			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountZ = 1u;
+		}
+		else if(payloadIdx >= kMaxMeshletGroupCountPerRenderStateBucket)
+		{
+			// Reached a memory limit, cancel the job
 			ANKI_ASSERT(0);
-			U32 dummy;
-			InterlockedExchange(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, 1, dummy);
 			payloadIdx = 0;
+			g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 0u;
 		}
 
 		payloadIdx += g_drawIndirectArgsIndexOrTaskPayloadIndex[renderStateBucket];
 
-		g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 1u;
-		g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountZ = 1u;
-
 		// Divide the mesh into meshlet groups and add them as task payloads
 		GpuSceneTaskShaderPayload payload;
-		payload.m_renderableIndex = renderableIdx;
+		payload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit = (lod << 30u) | (renderableIdx << 9u);
 
 		for(U32 i = 0; i < meshletGroupCount; ++i)
 		{
-			const U32 firstMeshlet = meshLod.m_firstMeshlet + kMaxMeshletsPerTaskShaderThreadgroup * i;
-			const U32 meshletCount = min(kMaxMeshletsPerTaskShaderThreadgroup, meshLod.m_meshletCount - i * kMaxMeshletsPerTaskShaderThreadgroup);
-
-			payload.m_firstMeshlet_25bit_meshletCountMinusOne_7bit = (firstMeshlet << 7u) | ((meshletCount - 1u) & 127u);
-
 			g_taskShaderPayloads[payloadIdx + i] = payload;
+
+			++payload.m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
 		}
 	}
 	else
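
The visibility pass packs the same three fields into one word; because the meshlet group occupies the low 9 bits, the loop only has to increment the packed value once per group. A hedged C++ sketch of that packing (the helper name and the asserts are illustrative, not engine code):

    #include <cassert>
    #include <cstdint>

    using U32 = std::uint32_t;

    // Illustrative helper mirroring the packing written into g_taskShaderPayloads.
    inline U32 packTaskPayload(U32 lod, U32 renderableIdx, U32 meshletGroup)
    {
        assert(lod < (1u << 2u));
        assert(renderableIdx < (1u << 21u));
        assert(meshletGroup < (1u << 9u));
        return (lod << 30u) | (renderableIdx << 9u) | meshletGroup;
    }

The plain increment stays inside the 9-bit group field as long as a single renderable never produces more than 512 meshlet groups, i.e. 65,536 meshlets at a group size of 128.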

+ 2 - 2
AnKi/Shaders/Include/Common.h

@@ -773,7 +773,7 @@ constexpr U32 kMaxMipsSinglePassDownsamplerCanProduce = 12u;
 constexpr U32 kMaxPrimitivesPerMeshlet = 128;
 constexpr U32 kMaxVerticesPerMeshlet = 128;
 #define ANKI_TASK_SHADER_THREADGROUP_SIZE 128u
-constexpr U32 kMaxMeshletsPerTaskShaderThreadgroup = ANKI_TASK_SHADER_THREADGROUP_SIZE;
+constexpr U32 kMeshletGroupSize = ANKI_TASK_SHADER_THREADGROUP_SIZE;
 
 #define ANKI_MESH_SHADER_THREADGROUP_SIZE 32u
 static_assert(kMaxPrimitivesPerMeshlet % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
@@ -781,7 +781,7 @@ static_assert(kMaxVerticesPerMeshlet % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
 
 /// Assume that a render state bucket can't go beyond 100M triangles. This helps ground some memory allocations.
 constexpr U32 kMaxVisibleMeshletsPerRenderStateBucket = 100000000 / kMaxPrimitivesPerMeshlet;
-constexpr U32 kMaxMeshletGroupCountPerRenderStateBucket = kMaxVisibleMeshletsPerRenderStateBucket / kMaxMeshletsPerTaskShaderThreadgroup;
+constexpr U32 kMaxMeshletGroupCountPerRenderStateBucket = kMaxVisibleMeshletsPerRenderStateBucket / kMeshletGroupSize;
 
 struct DrawIndirectArgs
 {
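
With kMaxPrimitivesPerMeshlet = 128 and kMeshletGroupSize = 128, the 100M-triangle budget evaluates to 781,250 visible meshlets and 6,103 meshlet groups per render state bucket. A small compile-time check of that arithmetic, assuming those constants:

    constexpr unsigned kMaxVisibleMeshletsPerRenderStateBucket = 100000000u / 128u;
    constexpr unsigned kMaxMeshletGroupCountPerRenderStateBucket = kMaxVisibleMeshletsPerRenderStateBucket / 128u;
    static_assert(kMaxVisibleMeshletsPerRenderStateBucket == 781250u);
    static_assert(kMaxMeshletGroupCountPerRenderStateBucket == 6103u); // integer division truncates 6103.5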

+ 2 - 3
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -41,10 +41,9 @@ static_assert(sizeof(GpuSceneRenderableVertex) == sizeof(UVec4));
 /// Input to a single task shader threadgroup. Something similar to GpuSceneRenderableVertex but for mesh shading.
 struct GpuSceneTaskShaderPayload
 {
-	U32 m_firstMeshlet_25bit_meshletCountMinusOne_7bit;
-	U32 m_renderableIndex;
+	U32 m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
 };
-static_assert(ANKI_TASK_SHADER_THREADGROUP_SIZE == 2u << (7u - 1u)); // Need to fit to 6bit
+static_assert(kMaxLodCount == 3);
 
 /// Used in visibility testing.
 struct GpuSceneRenderableBoundingVolume
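
The net effect, and what the commit title refers to, is that GpuSceneTaskShaderPayload shrinks from two U32s (8 bytes) to a single packed U32 (4 bytes). A hedged C++ sketch of the new struct with compile-time checks on the bit budget (the *Bits constants are illustrative, derived from the field name):

    #include <cstdint>

    using U32 = std::uint32_t;

    struct GpuSceneTaskShaderPayload
    {
        U32 m_lod_2bit_renderableIdx_21bit_meshletGroup_9bit;
    };
    static_assert(sizeof(GpuSceneTaskShaderPayload) == 4u); // was 8 bytes before this commit

    // Illustrative bit-budget checks, not engine code.
    constexpr U32 kLodBits = 2u;
    constexpr U32 kRenderableIdxBits = 21u;
    constexpr U32 kMeshletGroupBits = 9u;
    static_assert(kLodBits + kRenderableIdxBits + kMeshletGroupBits == 32u);
    static_assert(3u <= (1u << kLodBits)); // kMaxLodCount == 3 fits in the 2-bit lod field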