Browse Source

Minor improvements

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
eb242209ea

+ 16 - 29
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -144,7 +144,9 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
 	out.m_instanceRateRenderablesBuffer =
 		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, legacyGeometryFlowUserCount) * sizeof(GpuSceneRenderableVertex));
+	out.m_mdiDrawCountsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(U32) * bucketCount);
 
+	out.m_taskShaderIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * bucketCount);
 	out.m_taskShaderPayloadBuffer =
 		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, meshletGroupCount) * sizeof(GpuSceneTaskShaderPayload));
 
@@ -153,43 +155,28 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		out.m_visibleAaabbIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((allUserCount + 1) * sizeof(U32));
 	}
 
-	// Allocate memory for things that will be zeroed
-	PtrSize counterMemory = 0;
 	if(in.m_hashVisibles)
 	{
-		counterMemory = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment,
-										  counterMemory + sizeof(GpuVisibilityHash));
+		out.m_visiblesHashBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(GpuVisibilityHash));
 	}
 
-	const PtrSize mdiBufferOffset = counterMemory;
-	const PtrSize mdiBufferSize = sizeof(U32) * bucketCount;
-	counterMemory =
-		getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment, counterMemory + mdiBufferSize);
-
-	const PtrSize taskShaderIndirectArgsOffset = counterMemory;
-	const PtrSize taskShaderIndirectArgsSize = sizeof(DispatchIndirectArgs) * bucketCount;
-	counterMemory = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_uavBufferBindOffsetAlignment,
-									  counterMemory + taskShaderIndirectArgsSize);
-
-	const BufferOffsetRange counterBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(counterMemory);
-	const BufferHandle counterBufferHandle = in.m_rgraph->importBuffer(BufferUsageBit::kNone, counterBuffer);
-	out.m_someBufferHandle = counterBufferHandle;
-
-	if(in.m_hashVisibles)
-	{
-		out.m_visiblesHashBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset, sizeof(GpuVisibilityHash)};
-	}
-
-	out.m_mdiDrawCountsBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset + mdiBufferOffset, mdiBufferSize};
-	out.m_taskShaderIndirectArgsBuffer = {counterBuffer.m_buffer, counterBuffer.m_offset + taskShaderIndirectArgsOffset, taskShaderIndirectArgsSize};
+	out.m_someBufferHandle = in.m_rgraph->importBuffer(BufferUsageBit::kNone, out.m_mdiDrawCountsBuffer);
 
 	// Zero some stuff
 	{
 		ComputeRenderPassDescription& pass = in.m_rgraph->newComputeRenderPass("GPU visibility: Zero stuff");
-		pass.newBufferDependency(counterBufferHandle, BufferUsageBit::kTransferDestination);
+		pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kTransferDestination);
+
+		pass.setWork([mdiDrawCountsBuffer = out.m_mdiDrawCountsBuffer, taskShaderIndirectArgsBuffer = out.m_taskShaderIndirectArgsBuffer,
+					  visibleAaabbIndicesBuffer = out.m_visibleAaabbIndicesBuffer,
+					  visiblesHashBuffer = out.m_visiblesHashBuffer](RenderPassWorkContext& rpass) {
+			rpass.m_commandBuffer->fillBuffer(mdiDrawCountsBuffer, 0);
+			rpass.m_commandBuffer->fillBuffer(taskShaderIndirectArgsBuffer, 0);
 
-		pass.setWork([counterBuffer, visibleAaabbIndicesBuffer = out.m_visibleAaabbIndicesBuffer](RenderPassWorkContext& rpass) {
-			rpass.m_commandBuffer->fillBuffer(counterBuffer.m_buffer, counterBuffer.m_offset, counterBuffer.m_range, 0);
+			if(visiblesHashBuffer.m_buffer)
+			{
+				rpass.m_commandBuffer->fillBuffer(visiblesHashBuffer, 0);
+			}
 
 			if(visibleAaabbIndicesBuffer.m_buffer)
 			{
@@ -202,7 +189,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 	ComputeRenderPassDescription& pass = in.m_rgraph->newComputeRenderPass(in.m_passesName);
 
 	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kUavComputeRead);
-	pass.newBufferDependency(counterBufferHandle, BufferUsageBit::kUavComputeWrite);
+	pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavComputeWrite);
 
 	if(!distanceBased && static_cast<FrustumGpuVisibilityInput&>(in).m_hzbRt)
 	{

+ 2 - 2
AnKi/Scene/RenderStateBucket.cpp

@@ -40,7 +40,7 @@ RenderStateBucketIndex RenderStateBucketContainer::addUser(const RenderStateInfo
 	toHash[2] = state.m_indexedDrawcall;
 	const U64 hash = computeHash(toHash.getBegin(), toHash.getSizeInBytes());
 
-	const U32 meshletGroupCount = lod0MeshletCount + (kMaxMeshletsPerTaskShaderPayload - 1) / kMaxMeshletsPerTaskShaderPayload;
+	const U32 meshletGroupCount = (lod0MeshletCount + (kMaxMeshletsPerTaskShaderThreadgroup - 1)) / kMaxMeshletsPerTaskShaderThreadgroup; // Round up

 	SceneDynamicArray<ExtendedBucket>& buckets = m_buckets[technique];

@@ -103,7 +103,7 @@ void RenderStateBucketContainer::removeUser(RenderStateBucketIndex& bucketIndex)

 	const RenderingTechnique technique = bucketIndex.m_technique;
 	const U32 idx = bucketIndex.m_index;
-	const U32 meshletGroupCount = bucketIndex.m_lod0MeshletCount + (kMaxMeshletsPerTaskShaderPayload - 1) / kMaxMeshletsPerTaskShaderPayload;
+	const U32 meshletGroupCount = (bucketIndex.m_lod0MeshletCount + (kMaxMeshletsPerTaskShaderThreadgroup - 1)) / kMaxMeshletsPerTaskShaderThreadgroup; // Round up
 	bucketIndex.invalidate();
 
 	LockGuard lock(m_mtx);

+ 3 - 5
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -101,7 +101,7 @@ struct FragOut
 
 struct MeshShaderPayload
 {
-	U32 m_meshletIndices[ANKI_TASK_SHADER_THREADGROUP_SIZE];
+	U32 m_firstMeshletIndex;
 	U32 m_worldTransformsOffset;
 	U32 m_constantsOffset;
 	U32 m_boneTransformsOrParticleEmitterOffset;
@@ -241,10 +241,9 @@ struct FirstPayload
 		const GpuSceneRenderable renderable = g_renderables[inPayload.m_renderableIndex];
 		const GpuSceneMeshLod meshLod = g_gpuScene.Load<GpuSceneMeshLod>(renderable.m_meshLodsOffset);
 
-		s_payload.m_meshletIndices[svGroupIndex] = firstMeshlet + svGroupIndex;
-
 		if(svGroupIndex == 0u)
 		{
+			s_payload.m_firstMeshletIndex = firstMeshlet;
 			s_payload.m_worldTransformsOffset = renderable.m_worldTransformsOffset;
 			s_payload.m_constantsOffset = renderable.m_constantsOffset;
 			s_payload.m_boneTransformsOrParticleEmitterOffset = renderable.m_boneTransformsOffset;
@@ -253,7 +252,6 @@ struct FirstPayload
 		}
 	}
 
-	GroupMemoryBarrierWithGroupSync();
 	DispatchMesh(meshletCount, 1, 1, s_payload);
 }
 
@@ -267,7 +265,7 @@ constexpr U32 g_dummy = 0; // The formater is getting confused so add this
 main(in payload MeshShaderPayload payload, out vertices VertOut verts[kMaxVerticesPerMeshlet], out indices UVec3 indices[kMaxPrimitivesPerMeshlet],
 	 U32 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX)
 {
-	const Meshlet meshlet = g_meshlets[payload.m_meshletIndices[svGroupId]];
+	const Meshlet meshlet = g_meshlets[payload.m_firstMeshletIndex + svGroupId];
 	const U32 primCount = meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint >> 16u;
 	const U32 vertCount = meshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint & 0xFFFFu;
 

+ 3 - 3
AnKi/Shaders/GpuVisibility.ankiprog

@@ -191,7 +191,7 @@ struct DrawIndirectArgsWithPadding
 
 	if(usesMeshShaders)
 	{
-		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMaxMeshletsPerTaskShaderPayload - 1)) / kMaxMeshletsPerTaskShaderPayload;
+		const U32 meshletGroupCount = (meshLod.m_meshletCount + (kMaxMeshletsPerTaskShaderThreadgroup - 1)) / kMaxMeshletsPerTaskShaderThreadgroup;
 
 		U32 payloadIdx;
 		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, payloadIdx);
@@ -206,8 +206,8 @@ struct DrawIndirectArgsWithPadding
 
 		for(U32 i = 0; i < meshletGroupCount; ++i)
 		{
-			const U32 firstMeshlet = meshLod.m_firstMeshlet + kMaxMeshletsPerTaskShaderPayload * i;
-			const U32 meshletCount = min(kMaxMeshletsPerTaskShaderPayload, meshLod.m_meshletCount - i * kMaxMeshletsPerTaskShaderPayload);
+			const U32 firstMeshlet = meshLod.m_firstMeshlet + kMaxMeshletsPerTaskShaderThreadgroup * i;
+			const U32 meshletCount = min(kMaxMeshletsPerTaskShaderThreadgroup, meshLod.m_meshletCount - i * kMaxMeshletsPerTaskShaderThreadgroup);
 
 			payload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit = (firstMeshlet << 6u) | (meshletCount - 1u);
 

+ 1 - 1
AnKi/Shaders/Include/Common.h

@@ -768,7 +768,7 @@ constexpr U32 kMaxMipsSinglePassDownsamplerCanProduce = 12u;
 constexpr U32 kMaxPrimitivesPerMeshlet = 64;
 constexpr U32 kMaxVerticesPerMeshlet = 64;
 #define ANKI_TASK_SHADER_THREADGROUP_SIZE 64u
-constexpr U32 kMaxMeshletsPerTaskShaderPayload = ANKI_TASK_SHADER_THREADGROUP_SIZE;
+constexpr U32 kMaxMeshletsPerTaskShaderThreadgroup = ANKI_TASK_SHADER_THREADGROUP_SIZE;
 
 #define ANKI_MESH_SHADER_THREADGROUP_SIZE 32u
 static_assert(max(kMaxPrimitivesPerMeshlet, kMaxVerticesPerMeshlet) % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);