Browse Source

Refactor the GPU vis of non-renderables

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
67e17c5c47

+ 1 - 1
AnKi/Gr/Common.h

@@ -61,7 +61,7 @@ constexpr U32 kMaxColorRenderTargets = 4;
 constexpr U32 kMaxDescriptorSets = 3; ///< Groups that can be bound at the same time.
 constexpr U32 kMaxBindingsPerDescriptorSet = 32;
 constexpr U32 kMaxFramesInFlight = 3; ///< Triple buffering.
-constexpr U32 kMaxGrObjectNameLength = 31;
+constexpr U32 kMaxGrObjectNameLength = 61;
 constexpr U32 kMaxBindlessTextures = 512;
 constexpr U32 kMaxBindlessReadonlyTextureBuffers = 512;
 

+ 1 - 1
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -202,7 +202,7 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 
 		if(!m_capabilities.m_unalignedBbpTextureFormats)
 		{
-			ANKI_VK_LOGI("R8G8B8, R16G16B16 and R32G32B32 image formats are not supported");
+			ANKI_VK_LOGV("R8G8B8, R16G16B16 and R32G32B32 image formats are not supported");
 		}
 	}
 

+ 21 - 27
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -247,10 +247,13 @@ Error GpuVisibilityNonRenderables::init()
 		cmdbInit.m_flags |= CommandBufferFlag::kSmallBatch;
 		CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(cmdbInit);
 
-		for(U32 i = 0; i < kMaxFeedbackRequestsPerFrame; ++i)
+		for(U32 i = 0; i < kMaxRenderGraphAccelerationStructures; ++i)
 		{
-			BufferInitInfo buffInit("GpuVisibilityNonRenderablesFeedbackCounters");
-			buffInit.m_size = 2 * sizeof(U32);
+			RendererString name;
+			name.sprintf("GpuVisibilityNonRenderablesCounters#%u", i);
+
+			BufferInitInfo buffInit(name);
+			buffInit.m_size = 3 * sizeof(U32);
 			buffInit.m_usage = BufferUsageBit::kStorageComputeWrite | BufferUsageBit::kTransferDestination;
 
 			m_counterBuffers[i] = GrManager::getSingleton().newBuffer(buffInit);
@@ -280,28 +283,22 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 		ANKI_ASSERT(in.m_cpuFeedbackBuffer.m_range == sizeof(U32) * (objCount + 1));
 	}
 
-	// Find the counter buffer required for feedback
+	// Find the counter buffer
 	U32 counterBufferIdx = kMaxU32;
-	if(in.m_cpuFeedbackBuffer.m_buffer)
+	if(m_lastFrameIdx != getRenderer().getFrameCount())
 	{
-		if(m_lastFrameIdx != getRenderer().getFrameCount())
-		{
-			m_lastFrameIdx = getRenderer().getFrameCount();
-			m_feedbackRequestCountThisFrame = 0;
-		}
-
-		counterBufferIdx = m_feedbackRequestCountThisFrame++;
-		m_counterIdx[counterBufferIdx] = (m_counterIdx[counterBufferIdx] + 1) & 1;
+		m_lastFrameIdx = getRenderer().getFrameCount();
+		m_runIdx = 0;
 	}
 
+	counterBufferIdx = m_runIdx++;
+
 	// Allocate memory for the result
-	RebarAllocation visibleIndicesAlloc;
-	U32* indices = RebarTransientMemoryPool::getSingleton().allocateFrame<U32>(objCount + 1, visibleIndicesAlloc);
-	indices[0] = 0;
+	GpuVisibleTransientMemoryAllocation visibleIndicesAlloc = GpuVisibleTransientMemoryPool::getSingleton().allocate((objCount + 1) * sizeof(U32));
 
-	out.m_visiblesBuffer.m_buffer = &RebarTransientMemoryPool::getSingleton().getBuffer();
+	out.m_visiblesBuffer.m_buffer = visibleIndicesAlloc.m_buffer;
 	out.m_visiblesBuffer.m_offset = visibleIndicesAlloc.m_offset;
-	out.m_visiblesBuffer.m_range = visibleIndicesAlloc.m_range;
+	out.m_visiblesBuffer.m_range = visibleIndicesAlloc.m_size;
 
 	// Import buffers
 	RenderGraphDescription& rgraph = *in.m_rgraph;
@@ -320,8 +317,7 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 	}
 
 	pass.setWork([this, objType = in.m_objectType, feedbackBuffer = in.m_cpuFeedbackBuffer, viewProjectionMat = in.m_viewProjectionMat,
-				  visibleIndicesBuffHandle = out.m_bufferHandle, counterBufferIdx,
-				  counterIdx = m_counterIdx[counterBufferIdx]](RenderPassWorkContext& rgraph) {
+				  visibleIndicesBuffHandle = out.m_bufferHandle, counterBufferIdx](RenderPassWorkContext& rgraph) {
 		CommandBuffer& cmdb = *rgraph.m_commandBuffer;
 		const GpuSceneContiguousArrayType arrayType = gpuSceneNonRenderableObjectTypeToGpuSceneContiguousArrayType(objType);
 		const U32 objCount = GpuSceneContiguousArrays::getSingleton().getElementCount(arrayType);
@@ -333,23 +329,21 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 		cmdb.bindStorageBuffer(0, 0, &GpuSceneBuffer::getSingleton().getBuffer(), cArrays.getArrayBaseOffset(arrayType),
 							   cArrays.getElementSize(arrayType) * cArrays.getElementCount(arrayType), 0);
 
-		GpuVisibilityNonRenderableUniforms* unis =
-			allocateAndBindUniforms<GpuVisibilityNonRenderableUniforms*>(sizeof(GpuVisibilityNonRenderableUniforms), cmdb, 0, 1);
+		GpuVisibilityNonRenderableUniforms unis;
 		Array<Plane, 6> planes;
 		extractClipPlanes(viewProjectionMat, planes);
 		for(U32 i = 0; i < 6; ++i)
 		{
-			unis->m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
+			unis.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
 		}
+		cmdb.setPushConstants(&unis, sizeof(unis));
 
-		unis->m_feedbackCounterIdx = counterIdx;
-
-		rgraph.bindStorageBuffer(0, 2, visibleIndicesBuffHandle);
+		rgraph.bindStorageBuffer(0, 1, visibleIndicesBuffHandle);
+		cmdb.bindStorageBuffer(0, 2, m_counterBuffers[counterBufferIdx].get(), 0, kMaxPtrSize);
 
 		if(needsFeedback)
 		{
 			cmdb.bindStorageBuffer(0, 3, feedbackBuffer.m_buffer, feedbackBuffer.m_offset, feedbackBuffer.m_range);
-			cmdb.bindStorageBuffer(0, 4, m_counterBuffers[counterBufferIdx].get(), 0, kMaxPtrSize);
 		}
 
 		dispatchPPCompute(cmdb, 64, 1, objCount, 1);

+ 3 - 4
AnKi/Renderer/Utils/GpuVisibility.h

@@ -86,12 +86,11 @@ private:
 	ShaderProgramResourcePtr m_prog;
 	Array3d<ShaderProgramPtr, 2, U32(GpuSceneNonRenderableObjectType::kCount), 2> m_grProgs;
 
-	static constexpr U32 kMaxFeedbackRequestsPerFrame = 6;
+	static constexpr U32 kMaxPopulateRenderGraphPerFrame = 32; ///< Max times the populateRenderGraph() will be called per frame.
 
-	Array<BufferPtr, kMaxFeedbackRequestsPerFrame> m_counterBuffers; ///< A buffer containing multiple counters for atomic operations.
-	Array<U8, kMaxFeedbackRequestsPerFrame> m_counterIdx = {};
+	Array<BufferPtr, kMaxPopulateRenderGraphPerFrame> m_counterBuffers; ///< A buffer containing multiple counters for atomic operations.
 	U64 m_lastFrameIdx = kMaxU64;
-	U32 m_feedbackRequestCountThisFrame = 0;
+	U32 m_runIdx = 0;
 };
 /// @}
 

+ 2 - 5
AnKi/Script/ScriptManager.cpp

@@ -11,26 +11,23 @@ namespace anki {
 
 ScriptManager::PoolInit::PoolInit(AllocAlignedCallback allocCb, void* allocCbData)
 {
-	ANKI_SCRIPT_LOGI("Initializing scripting engine...");
-
 	ScriptMemoryPool::allocateSingleton(allocCb, allocCbData);
 }
 
 ScriptManager::PoolInit ::~PoolInit()
 {
-	ANKI_SCRIPT_LOGI("Destroying scripting engine...");
-
 	ScriptMemoryPool::freeSingleton();
 }
 
 ScriptManager::ScriptManager(AllocAlignedCallback allocCb, void* allocCbData)
 	: m_poolInit(allocCb, allocCbData)
 {
-	ANKI_SCRIPT_LOGI("Initializing scripting engine...");
+	ANKI_SCRIPT_LOGI("Initializing scripting");
 }
 
 ScriptManager::~ScriptManager()
 {
+	ANKI_SCRIPT_LOGI("Destroying scripting");
 }
 
 } // end namespace anki

+ 1 - 0
AnKi/ShaderCompiler/Dxc.cpp

@@ -90,6 +90,7 @@ Error compileHlslToSpirv(CString src, ShaderType shaderType, Bool compileWith16b
 	dxcArgs.emplaceBack(profile(shaderType));
 	dxcArgs.emplaceBack("-spirv");
 	dxcArgs.emplaceBack("-fspv-target-env=vulkan1.1spirv1.4");
+	// dxcArgs.emplaceBack("-Zi"); // Debug info
 	dxcArgs.emplaceBack(hlslFilename);
 
 	if(compileWith16bitTypes)

+ 50 - 31
AnKi/Shaders/GpuVisibilityNonRenderables.ankiprog

@@ -30,15 +30,17 @@ typedef GpuSceneGlobalIlluminationProbe ObjectType;
 #endif
 
 [[vk::binding(0)]] StructuredBuffer<ObjectType> g_objects;
-[[vk::binding(1)]] ConstantBuffer<GpuVisibilityNonRenderableUniforms> g_unis;
-[[vk::binding(2)]] RWStructuredBuffer<U32> g_visibleIndices; // 1st element is the count. What follows is indices
+[[vk::push_constant]] ConstantBuffer<GpuVisibilityNonRenderableUniforms> g_unis;
+[[vk::binding(1)]] RWStructuredBuffer<U32> g_visibleIndices; // 1st element is the count. What follows is indices
+
+constexpr U32 kVisibleObjCounterIdx = 1;
+constexpr U32 kThreadgroupCounterIdx = 0;
+constexpr U32 kFeedbackCounterIdx = 2;
+[[vk::binding(2)]] RWStructuredBuffer<U32> g_counterBuffer; // 2 counters per dispatch with an optional 3rd for feedback
 
 #if CPU_FEEDBACK
 // 1st element is a count. What follows is an array of UUIDs.
 [[vk::binding(3)]] RWStructuredBuffer<U32> g_cpuFeedbackBuffer;
-
-// Contains 2 U32s. One that it gets cleared and another that will be incremented.
-[[vk::binding(4)]] RWStructuredBuffer<U32> g_counterBuffer;
 #endif
 
 Vec4 getSphere(GpuSceneLight l)
@@ -79,52 +81,69 @@ Vec4 getSphere(GpuSceneGlobalIlluminationProbe l)
 	return Vec4(center, radius);
 }
 
-[numthreads(64, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
+#define NUMTHREADS 64
+[numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX)
 {
+	Bool skip = false;
+
 	U32 objectCount, unused;
 	g_objects.GetDimensions(objectCount, unused);
-
-#if CPU_FEEDBACK
-	// Zero feedback counters of the next frame
-	if(svDispatchThreadId.x == 0)
-	{
-		g_counterBuffer[!g_unis.m_feedbackCounterIdx] = 0u;
-	}
-#endif
-
-	if(svDispatchThreadId.x >= objectCount)
-	{
-		return;
-	}
+	skip = (svDispatchThreadId >= objectCount);
 
 	// Frustum test
 	//
-	const ObjectType obj = g_objects[svDispatchThreadId.x];
-	const Vec4 sphere = getSphere(obj);
-	if(!frustumTest(g_unis.m_clipPlanes, sphere.xyz, sphere.w))
+	if(!skip)
 	{
-		return;
+		const Vec4 sphere = getSphere(g_objects[svDispatchThreadId]);
+		skip = !frustumTest(g_unis.m_clipPlanes, sphere.xyz, sphere.w);
 	}
 
 	// Add the object
 	//
-	U32 idx;
-	InterlockedAdd(g_visibleIndices[0], 1, idx);
-	g_visibleIndices[idx + 1] = svDispatchThreadId.x;
+	if(!skip)
+	{
+		U32 idx;
+		InterlockedAdd(g_counterBuffer[kVisibleObjCounterIdx], 1, idx);
+
+		g_visibleIndices[idx + 1] = svDispatchThreadId;
+	}
 
 	// Give feedback to the CPU
 	//
 #if CPU_FEEDBACK
-	if(obj.m_uuid != 0)
+	if(!skip && g_objects[svDispatchThreadId].m_uuid != 0)
 	{
 		U32 idx;
-		InterlockedAdd(g_counterBuffer[g_unis.m_feedbackCounterIdx], 1, idx);
+		InterlockedAdd(g_counterBuffer[kFeedbackCounterIdx], 1, idx);
+
+		g_cpuFeedbackBuffer[idx + 1] = g_objects[svDispatchThreadId].m_uuid;
+	}
+#endif
 
-		U32 dummy;
-		InterlockedExchange(g_cpuFeedbackBuffer[0], idx + 1, dummy);
-		g_cpuFeedbackBuffer[idx + 1] = obj.m_uuid;
+	// Store the counters to the actual buffers
+	//
+	Bool lastThreadgroupExecuting = false;
+	if(svGroupIndex == 0)
+	{
+		U32 threadgroupIdx;
+		InterlockedAdd(g_counterBuffer[kThreadgroupCounterIdx], 1, threadgroupIdx);
+		const U32 threadgroupCount = (objectCount + NUMTHREADS - 1) / NUMTHREADS;
+		lastThreadgroupExecuting = (threadgroupIdx == threadgroupCount);
 	}
+
+	// Sync to make sure all the atomic ops have finished before the following code reads them
+	GroupMemoryBarrier();
+
+	if(lastThreadgroupExecuting)
+	{
+		g_visibleIndices[0] = g_counterBuffer[kVisibleObjCounterIdx];
+		g_counterBuffer[kVisibleObjCounterIdx] = 0;
+#if CPU_FEEDBACK
+		g_cpuFeedbackBuffer[0] = g_counterBuffer[kFeedbackCounterIdx];
+		g_counterBuffer[kFeedbackCounterIdx] = 0;
 #endif
+		g_counterBuffer[kThreadgroupCounterIdx] = 0;
+	}
 }
 
 #pragma anki end

+ 0 - 5
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -29,11 +29,6 @@ struct GpuVisibilityUniforms
 struct GpuVisibilityNonRenderableUniforms
 {
 	Vec4 m_clipPlanes[6u];
-
-	U32 m_feedbackCounterIdx;
-	U32 m_padding0;
-	U32 m_padding1;
-	U32 m_padding2;
 };
 
 struct PointLightRendererCacheEntry