Browse Source

Increase meshlet size

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
516d4068d7

+ 2 - 1
AnKi/Core/CMakeLists.txt

@@ -7,7 +7,8 @@ set(sources
 	GpuMemory/UnifiedGeometryBuffer.cpp
 	GpuMemory/GpuSceneBuffer.cpp
 	GpuMemory/RebarTransientMemoryPool.cpp
-	GpuMemory/GpuReadbackMemoryPool.cpp)
+	GpuMemory/GpuReadbackMemoryPool.cpp
+	GpuMemory/GpuVisibleTransientMemoryPool.cpp)
 
 set(headers
 	App.h

+ 26 - 0
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.cpp

@@ -0,0 +1,26 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
+#include <AnKi/Core/StatsSet.h>
+
+namespace anki {
+
+static StatCounter g_gpuVisibleTransientMemoryStatVar(StatCategory::kGpuMem, "GPU visible transient mem",
+													  StatFlag::kBytes | StatFlag::kMainThreadUpdates);
+
+void GpuVisibleTransientMemoryPool::endFrame()
+{
+	g_gpuVisibleTransientMemoryStatVar.set(m_pool.getAllocatedMemory());
+
+	if(m_frame == 0)
+	{
+		m_pool.reset();
+	}
+
+	m_frame = (m_frame + 1) % kMaxFramesInFlight;
+}
+
+} // end namespace anki

+ 1 - 9
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -66,15 +66,7 @@ public:
 		return out;
 	}
 
-	void endFrame()
-	{
-		if(m_frame == 0)
-		{
-			m_pool.reset();
-		}
-
-		m_frame = (m_frame + 1) % kMaxFramesInFlight;
-	}
+	void endFrame();
 
 private:
 	StackGpuMemoryPool m_pool;

+ 11 - 0
AnKi/Gr/Utils/StackGpuMemoryPool.cpp

@@ -28,6 +28,7 @@ public:
 	PtrSize m_initialSize = 0;
 	F64 m_scale = 0.0;
 	PtrSize m_bias = 0;
+	PtrSize m_allocatedMemory = 0;
 	GrString m_bufferName;
 	U32 m_alignment = 0;
 	BufferUsageBit m_bufferUsage = BufferUsageBit::kNone;
@@ -77,6 +78,8 @@ public:
 		buffInit.m_mapAccess = m_bufferMap;
 		chunk->m_buffer = GrManager::getSingleton().newBuffer(buffInit);
 
+		m_allocatedMemory += size;
+
 		if(!!m_bufferMap)
 		{
 			chunk->m_mappedMemory = static_cast<U8*>(chunk->m_buffer->map(0, size, m_bufferMap));
@@ -95,6 +98,9 @@ public:
 			chunk->m_buffer->unmap();
 		}
 
+		ANKI_ASSERT(m_allocatedMemory >= chunk->m_buffer->getSize());
+		m_allocatedMemory -= chunk->m_buffer->getSize();
+
 		deleteInstance(GrMemoryPool::getSingleton(), chunk);
 	}
 
@@ -159,4 +165,9 @@ void StackGpuMemoryPool::allocate(PtrSize size, PtrSize& outOffset, Buffer*& buf
 	}
 }
 
+PtrSize StackGpuMemoryPool::getAllocatedMemory() const
+{
+	return m_builder->getInterface().m_allocatedMemory;
+}
+
 } // end namespace anki

+ 2 - 0
AnKi/Gr/Utils/StackGpuMemoryPool.h

@@ -40,6 +40,8 @@ public:
 
 	void reset();
 
+	PtrSize getAllocatedMemory() const;
+
 private:
 	class Chunk;
 	class BuilderInterface;

+ 2 - 2
AnKi/Importer/GltfImporterMesh.cpp

@@ -543,8 +543,8 @@ static void generateMeshlets(SubMesh& submesh)
 		outMeshlet.m_aabb = computeBoundingAabb(&newVertexBuffer[outMeshlet.m_firstVertex].m_position, outMeshlet.m_vertexCount, sizeof(TempVertex));
 	}
 
-	ANKI_IMPORTER_LOGV("After meshletization the mesh has %f more vertices",
-					   (F32(newVertexBuffer.getSize()) - F32(submesh.m_verts.getSize())) / F32(submesh.m_verts.getSize()) * 100.0f);
+	ANKI_IMPORTER_LOGV("Meshletization stats: %f%% more vertices, %u meshlets",
+					   (F32(newVertexBuffer.getSize()) - F32(submesh.m_verts.getSize())) / F32(submesh.m_verts.getSize()) * 100.0f, meshletCount);
 
 	submesh.m_indices = std::move(newIndexBuffer);
 	submesh.m_verts = std::move(newVertexBuffer);

+ 2 - 2
AnKi/Renderer/IndirectDiffuseProbes.cpp

@@ -235,7 +235,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 			frustum.setWorldTransform(Transform(cellCenter.xyz0(), Frustum::getOmnidirectionalFrustumRotations()[i], 1.0f));
 			frustum.update();
 
-			Array<F32, kMaxLodCount - 1> lodDistances = {1000.0f, 1001.0f}; // Something far to force detailed LODs
+			Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 			FrustumGpuVisibilityInput visIn;
 			visIn.m_passesName = "GI GBuffer visibility";
@@ -315,7 +315,7 @@ void IndirectDiffuseProbes::populateRenderGraph(RenderingContext& rctx)
 
 				cascadeViewProjMats[i] = cascadeProjMats[i] * Mat4(cascadeViewMats[i], Vec4(0.0f, 0.0f, 0.0f, 1.0f));
 
-				Array<F32, kMaxLodCount - 1> lodDistances = {1000.0f, 1001.0f}; // Something far to force detailed LODs
+				Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 				FrustumGpuVisibilityInput visIn;
 				visIn.m_passesName = "GI shadows visibility";

+ 2 - 2
AnKi/Renderer/ProbeReflections.cpp

@@ -358,7 +358,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 		frustum.setWorldTransform(Transform(probeToRefresh->getWorldPosition().xyz0(), Frustum::getOmnidirectionalFrustumRotations()[i], 1.0f));
 		frustum.update();
 
-		Array<F32, kMaxLodCount - 1> lodDistances = {1000.0f, 1001.0f}; // Something far to force detailed LODs
+		Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 		FrustumGpuVisibilityInput visIn;
 		visIn.m_passesName = "Cube refl GBuffer visibility";
@@ -430,7 +430,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 
 			cascadeViewProjMats[i] = cascadeProjMats[i] * Mat4(cascadeViewMats[i], Vec4(0.0f, 0.0f, 0.0f, 1.0f));
 
-			Array<F32, kMaxLodCount - 1> lodDistances = {1000.0f, 1001.0f}; // Something far to force detailed LODs
+			Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
 
 			FrustumGpuVisibilityInput visIn;
 			visIn.m_passesName = "Cube refl shadows visibility";

+ 2 - 1
AnKi/Renderer/Utils/Drawer.cpp

@@ -203,6 +203,8 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 			cmdb.drawMeshTasksIndirect(args.m_taskShaderIndirectArgsBuffer.m_buffer,
 									   args.m_taskShaderIndirectArgsBuffer.m_offset + sizeof(DispatchIndirectArgs) * bucketCount);
+
+			allMeshletGroupCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
 		}
 		else
 		{
@@ -230,7 +232,6 @@ void RenderableDrawer::drawMdi(const RenderableDrawerArguments& args, CommandBuf
 
 		++bucketCount;
 		allUserCount += userCount;
-		allMeshletGroupCount += meshletGroupCount;
 	});
 
 	ANKI_ASSERT(bucketCount == RenderStateBucketContainer::getSingleton().getBucketCount(args.m_renderingTechinuqe));

+ 22 - 22
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -97,7 +97,25 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		ANKI_ASSERT(0);
 	}
 
-	if(aabbCount == 0) [[unlikely]]
+	const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique);
+
+	U32 legacyGeometryFlowUserCount = 0;
+	U32 modernGeometryFlowUserCount = 0;
+	U32 meshletGroupCount = 0;
+	RenderStateBucketContainer::getSingleton().iterateBuckets(in.m_technique, [&](const RenderStateInfo& s, U32 userCount, U32 meshletGroupCount_) {
+		if(meshletGroupCount_)
+		{
+			modernGeometryFlowUserCount += userCount;
+			meshletGroupCount += min(meshletGroupCount_, kMaxMeshletGroupCountPerRenderStateBucket);
+		}
+		else
+		{
+			legacyGeometryFlowUserCount += userCount;
+		}
+	});
+	const U32 allUserCount = legacyGeometryFlowUserCount + modernGeometryFlowUserCount;
+
+	if(allUserCount == 0) [[unlikely]]
 	{
 		// Early exit
 
@@ -121,24 +139,6 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 		return;
 	}
 
-	const U32 bucketCount = RenderStateBucketContainer::getSingleton().getBucketCount(in.m_technique);
-
-	U32 legacyGeometryFlowUserCount = 0;
-	U32 modernGeometryFlowUserCount = 0;
-	U32 meshletGroupCount = 0;
-	RenderStateBucketContainer::getSingleton().iterateBuckets(in.m_technique, [&](const RenderStateInfo&, U32 userCount, U32 meshletGroupCount_) {
-		if(meshletGroupCount_)
-		{
-			modernGeometryFlowUserCount += userCount;
-			meshletGroupCount += meshletGroupCount_;
-		}
-		else
-		{
-			legacyGeometryFlowUserCount += userCount;
-		}
-	});
-	const U32 allUserCount = aabbCount;
-
 	// Allocate memory
 	out.m_drawIndexedIndirectArgsBuffer =
 		GpuVisibleTransientMemoryPool::getSingleton().allocate(max(1u, legacyGeometryFlowUserCount) * sizeof(DrawIndexedIndirectArgs));
@@ -199,7 +199,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 
 	pass.setWork([this, frustumTestData, distTestData, lodReferencePoint = in.m_lodReferencePoint, lodDistances = in.m_lodDistances,
 				  technique = in.m_technique, mdiDrawCountsBuffer = out.m_mdiDrawCountsBuffer,
-				  instanceRateRenderables = out.m_instanceRateRenderablesBuffer, indirectArgs = out.m_drawIndexedIndirectArgsBuffer, allUserCount,
+				  instanceRateRenderables = out.m_instanceRateRenderablesBuffer, indirectArgs = out.m_drawIndexedIndirectArgsBuffer, aabbCount,
 				  visibleAabbsBuffer = out.m_visibleAaabbIndicesBuffer, hashBuffer = out.m_visiblesHashBuffer,
 				  taskShaderIndirectArgsBuff = out.m_taskShaderIndirectArgsBuffer,
 				  taskShaderPayloadBuffer = out.m_taskShaderPayloadBuffer](RenderPassWorkContext& rpass) {
@@ -256,7 +256,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			else if(meshletGroupCount)
 			{
 				drawIndirectArgsIndexOrTaskPayloadIndex[bucketCount] = taskPayloadCount;
-				taskPayloadCount += meshletGroupCount;
+				taskPayloadCount += min(meshletGroupCount, kMaxMeshletGroupCountPerRenderStateBucket);
 			}
 			else
 			{
@@ -319,7 +319,7 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			cmdb.bindUavBuffer(0, 13, hashBuffer);
 		}
 
-		dispatchPPCompute(cmdb, 64, 1, allUserCount, 1);
+		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
 	});
 }
 

+ 1 - 0
AnKi/Scene/GpuSceneArray.h

@@ -108,6 +108,7 @@ public:
 		return getElementCount() * getElementSize();
 	}
 
+	/// This count contains elements that may be inactive after a free. Frees in the middle of the array will not rearrange other elements.
 	/// @note Thread-safe
 	U32 getElementCount() const
 	{

+ 11 - 6
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -233,8 +233,8 @@ struct FirstPayload
 {
 	const GpuSceneTaskShaderPayload inPayload = g_taskShaderPayloads[g_firstPayload.m_val.x + svGroupId];
 
-	const U32 meshletCount = (inPayload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit & 63u) + 1u;
-	const U32 firstMeshlet = inPayload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit >> 6u;
+	const U32 meshletCount = (inPayload.m_firstMeshlet_25bit_meshletCountMinusOne_7bit & 127u) + 1u;
+	const U32 firstMeshlet = inPayload.m_firstMeshlet_25bit_meshletCountMinusOne_7bit >> 7u;
 
 	if(svGroupIndex < meshletCount)
 	{
@@ -271,10 +271,9 @@ main(in payload MeshShaderPayload payload, out vertices VertOut verts[kMaxVertic
 
 	SetMeshOutputCounts(vertCount, primCount);
 
-	const U32 loopCount = max(kMaxPrimitivesPerMeshlet, kMaxVerticesPerMeshlet) / ANKI_MESH_SHADER_THREADGROUP_SIZE;
-
 	// Write the verts
-	[unroll] for(U32 l = 0; l < loopCount; ++l)
+	const U32 vertLoopCount = kMaxVerticesPerMeshlet / ANKI_MESH_SHADER_THREADGROUP_SIZE;
+	[unroll] for(U32 l = 0; l < vertLoopCount; ++l)
 	{
 		const U32 idx = l * ANKI_MESH_SHADER_THREADGROUP_SIZE + svGroupIndex;
 
@@ -315,8 +314,14 @@ main(in payload MeshShaderPayload payload, out vertices VertOut verts[kMaxVertic
 
 			verts[idx] = output;
 		}
+	}
+
+	// Write the indices
+	const U32 primLoopCount = kMaxPrimitivesPerMeshlet / ANKI_MESH_SHADER_THREADGROUP_SIZE;
+	[unroll] for(U32 l = 0; l < primLoopCount; ++l)
+	{
+		const U32 idx = l * ANKI_MESH_SHADER_THREADGROUP_SIZE + svGroupIndex;
 
-		// Write the indices
 		if(idx < primCount)
 		{
 			indices[idx] = g_unifiedGeom_R8G8B8A8_Uint[meshlet.m_firstPrimitive + idx].xyz;

+ 11 - 1
AnKi/Shaders/GpuVisibility.ankiprog

@@ -195,6 +195,16 @@ struct DrawIndirectArgsWithPadding
 
 		U32 payloadIdx;
 		InterlockedAdd(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, meshletGroupCount, payloadIdx);
+
+		if(payloadIdx >= kMaxMeshletGroupCountPerRenderStateBucket)
+		{
+			// Reached a memory limit, start over to force rendering from the beginning
+			ANKI_ASSERT(0);
+			U32 dummy;
+			InterlockedExchange(g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountX, 1, dummy);
+			payloadIdx = 0;
+		}
+
 		payloadIdx += g_drawIndirectArgsIndexOrTaskPayloadIndex[renderStateBucket];
 
 		g_taskShaderIndirectArgs[renderStateBucket].m_threadGroupCountY = 1u;
@@ -209,7 +219,7 @@ struct DrawIndirectArgsWithPadding
 			const U32 firstMeshlet = meshLod.m_firstMeshlet + kMaxMeshletsPerTaskShaderThreadgroup * i;
 			const U32 meshletCount = min(kMaxMeshletsPerTaskShaderThreadgroup, meshLod.m_meshletCount - i * kMaxMeshletsPerTaskShaderThreadgroup);
 
-			payload.m_firstMeshlet_26bit_meshletCountMinusOne_6bit = (firstMeshlet << 6u) | (meshletCount - 1u);
+			payload.m_firstMeshlet_25bit_meshletCountMinusOne_7bit = (firstMeshlet << 7u) | ((meshletCount - 1u) & 127u);
 
 			g_taskShaderPayloads[payloadIdx + i] = payload;
 		}

+ 16 - 6
AnKi/Shaders/Include/Common.h

@@ -48,7 +48,6 @@ ANKI_END_NAMESPACE
 #	define ANKI_BEGIN_NAMESPACE
 #	define ANKI_END_NAMESPACE
 #	define inline
-#	define ANKI_ASSERT(x)
 
 #	define ANKI_ARRAY(type, size, name) type name[(U32)size]
 
@@ -58,7 +57,13 @@ ANKI_END_NAMESPACE
 
 #	define constexpr static const
 
-#	define ANKI_ASSERT(x)
+#	if defined(ANKI_ASSERTIONS_ENABLED) && ANKI_ASSERTIONS_ENABLED == 1
+#		define ANKI_ASSERT(x) \
+			if(!(x)) \
+			printf("Assertion failed. Line %i", __LINE__)
+#	else
+#		define ANKI_ASSERT(x)
+#	endif
 
 template<typename T>
 void maybeUnused(T a)
@@ -765,13 +770,18 @@ constexpr F32 kShadowsPolygonOffsetUnits = 2.75f;
 
 constexpr U32 kMaxMipsSinglePassDownsamplerCanProduce = 12u;
 
-constexpr U32 kMaxPrimitivesPerMeshlet = 64;
-constexpr U32 kMaxVerticesPerMeshlet = 64;
-#define ANKI_TASK_SHADER_THREADGROUP_SIZE 64u
+constexpr U32 kMaxPrimitivesPerMeshlet = 128;
+constexpr U32 kMaxVerticesPerMeshlet = 128;
+#define ANKI_TASK_SHADER_THREADGROUP_SIZE 128u
 constexpr U32 kMaxMeshletsPerTaskShaderThreadgroup = ANKI_TASK_SHADER_THREADGROUP_SIZE;
 
 #define ANKI_MESH_SHADER_THREADGROUP_SIZE 32u
-static_assert(max(kMaxPrimitivesPerMeshlet, kMaxVerticesPerMeshlet) % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
+static_assert(kMaxPrimitivesPerMeshlet % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
+static_assert(kMaxVerticesPerMeshlet % ANKI_MESH_SHADER_THREADGROUP_SIZE == 0);
+
+/// Assume that a render state bucket can't go beyond 100M triangles. This helps bound some memory allocations.
+constexpr U32 kMaxVisibleMeshletsPerRenderStateBucket = 100000000 / kMaxPrimitivesPerMeshlet;
+constexpr U32 kMaxMeshletGroupCountPerRenderStateBucket = kMaxVisibleMeshletsPerRenderStateBucket / kMaxMeshletsPerTaskShaderThreadgroup;
 
 struct DrawIndirectArgs
 {

+ 2 - 2
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -41,10 +41,10 @@ static_assert(sizeof(GpuSceneRenderableVertex) == sizeof(UVec4));
 /// Input to a single task shader threadgroup. Something similar to GpuSceneRenderableVertex but for mesh shading.
 struct GpuSceneTaskShaderPayload
 {
-	U32 m_firstMeshlet_26bit_meshletCountMinusOne_6bit;
+	U32 m_firstMeshlet_25bit_meshletCountMinusOne_7bit;
 	U32 m_renderableIndex;
 };
-static_assert(ANKI_TASK_SHADER_THREADGROUP_SIZE == 2u << (6u - 1u)); // Need to fit to 6bit
+static_assert(ANKI_TASK_SHADER_THREADGROUP_SIZE == 2u << (7u - 1u)); // Need to fit in 7 bits
 
 /// Used in visibility testing.
 struct GpuSceneRenderableBoundingVolume

+ 1 - 0
AnKi/Shaders/ShadowMappingVetVisibility.ankiprog

@@ -59,6 +59,7 @@ groupshared U32 s_renderLight;
 
 		U32 renderStateBucketCount, unused;
 		g_mdiDrawCounts.GetDimensions(renderStateBucketCount, unused);
+		ANKI_ASSERT(renderStateBucketCount <= 64);
 
 		if(svGroupIndex < renderStateBucketCount)
 		{