Browse Source

Fix bugs in the light binning and the indirect diffuse

Panagiotis Christopoulos Charitos 4 months ago
parent
commit
9535403840

+ 15 - 0
AnKi/Renderer/AccelerationStructureBuilder.cpp

@@ -61,6 +61,21 @@ void AccelerationStructureBuilder::populateRenderGraph(RenderingContext& ctx)
 			rgraphCtx.m_commandBuffer->buildAccelerationStructure(m_runCtx.m_tlas.get(), scratchBuff);
 		});
 	}
+
+	// Light visibility: fill in the local light grid inputs and let GpuVisibilityLocalLights populate the render graph
+	{
+		GpuVisibilityLocalLightsInput in;
+		in.m_cellCounts = UVec3(g_lightGridSizeXYCVar, g_lightGridSizeXYCVar, g_lightGridSizeZCVar); // One CVar drives both the X and the Y cell count
+		in.m_cellSize = Vec3(g_lightGridCellSizeXYCVar, g_lightGridCellSizeXYCVar, g_lightGridCellSizeZCVar); // One CVar drives both the X and the Y cell size
+		in.m_cameraPosition = ctx.m_matrices.m_cameraTransform.getTranslationPart().xyz();
+		in.m_lookDirection = -ctx.m_matrices.m_cameraTransform.getRotationPart().getZAxis(); // Negated Z axis of the camera rotation
+		in.m_lightIndexListSize = g_lightIndexListSizeCVar;
+		in.m_rgraph = &ctx.m_renderGraphDescr;
+
+		GpuVisibilityLocalLightsOutput out;
+
+		getGpuVisibilityLocalLights().populateRenderGraph(in, out); // NOTE(review): `out` is not read after this call in the visible code - confirm that is intended
+	}
 }
 
 } // end namespace anki

+ 6 - 0
AnKi/Renderer/AccelerationStructureBuilder.h

@@ -16,6 +16,12 @@ inline NumericCVar<F32>
 	g_rayTracingExtendedFrustumDistanceCVar("R", "RayTracingExtendedFrustumDistance", 200.0f, 10.0f, 10000.0f,
 											"Every object that its distance from the camera is bellow that value will take part in ray tracing");
 
+inline NumericCVar<U32> g_lightGridSizeXYCVar("R", "LightGridSizeXY", 128, 1, 1024, "The number of cells in the X and Y axis");
+inline NumericCVar<U32> g_lightGridSizeZCVar("R", "LightGridSizeZ", 4, 1, 1024, "The number of cells in the Z axis");
+inline NumericCVar<F32> g_lightGridCellSizeXYCVar("R", "LightGridCellSizeXY", 2.0f, 0.5f, 1000.0f, "The cell size in the X and Y dimensions");
+inline NumericCVar<F32> g_lightGridCellSizeZCVar("R", "LightGridCellSizeZ", 25.0f, 0.5f, 1000.0f, "The cell size in the Z dimension");
+inline NumericCVar<U32> g_lightIndexListSizeCVar("R", "LightIndexListSize", 64 * 1024, 128, 256 * 1024, "The light index list size");
+
 /// Build acceleration structures.
 class AccelerationStructureBuilder : public RendererObject
 {

+ 93 - 2
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -1223,20 +1223,34 @@ void GpuVisibilityLocalLights::populateRenderGraph(GpuVisibilityLocalLightsInput
 	const BufferView lightIndexCountsPerCellBuff = allocateStructuredBuffer<U32>(cellCount);
 	const BufferView lightIndexOffsetsPerCellBuff = allocateStructuredBuffer<U32>(cellCount);
 	const BufferView lightIndexCountBuff = allocateStructuredBuffer<U32>(1);
+	const BufferView lightIndexListBuff = allocateStructuredBuffer<U32>(in.m_lightIndexListSize);
 
 	const BufferHandle dep = rgraph.importBuffer(lightIndexCountBuff, BufferUsageBit::kNone);
 
+	out.m_dependency = dep;
+	out.m_lightIndexListBuffer = lightIndexListBuff;
+	out.m_lightIndexCountsPerCellBuffer = lightIndexCountsPerCellBuff;
+	out.m_lightIndexOffsetsPerCellBuffer = lightIndexOffsetsPerCellBuff;
+
+	GpuVisibilityLocalLightsConsts consts;
+	consts.m_cellSize = in.m_cellSize;
+	consts.m_maxLightIndices = in.m_lightIndexListSize;
+	consts.m_gridVolumeMin = out.m_lightGridMin;
+	consts.m_gridVolumeMax = out.m_lightGridMax;
+	consts.m_cellCounts = Vec3(in.m_cellCounts);
+
 	// Setup
 	{
-		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Setup: %s", in.m_passesName.cstr()));
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU local light vis setup: %s", in.m_passesName.cstr()));
 
 		pass.newBufferDependency(dep, BufferUsageBit::kUavCompute);
-		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
 
 		pass.setWork([this, lightIndexCountsPerCellBuff, lightIndexCountBuff, cellCount](RenderPassWorkContext& rgraph) {
 			ANKI_TRACE_SCOPED_EVENT(GpuVisibilityLocalLightsSetup);
 			CommandBuffer& cmdb = *rgraph.m_commandBuffer;
 
+			cmdb.bindShaderProgram(m_setupGrProg.get());
+
 			cmdb.bindUav(0, 0, lightIndexCountsPerCellBuff);
 			cmdb.bindUav(1, 0, lightIndexCountBuff);
 
@@ -1245,7 +1259,84 @@ void GpuVisibilityLocalLights::populateRenderGraph(GpuVisibilityLocalLightsInput
 	}
 
 	// Count
+	const GpuSceneArrays::Light& lights = GpuSceneArrays::Light::getSingleton(); // Used below only to decide whether the per-light passes get recorded
+	if(lights.getElementCount()) // The count pass is not created at all when there are no lights
+	{
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU local light vis count: %s", in.m_passesName.cstr()));
+
+		pass.newBufferDependency(dep, BufferUsageBit::kUavCompute); // `dep` is the one handle the passes of this function use to order themselves
+		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute); // The lights buffer is bound as an SRV below
+
+		pass.setWork([this, lightIndexCountsPerCellBuff, lightIndexCountBuff, consts](RenderPassWorkContext& rgraph) { // `consts` is captured by value
+			ANKI_TRACE_SCOPED_EVENT(GpuVisibilityLocalLightsCount);
+
+			const GpuSceneArrays::Light& lights = GpuSceneArrays::Light::getSingleton(); // Re-fetched here; the outer `lights` is not captured
+
+			CommandBuffer& cmdb = *rgraph.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_countGrProg.get());
+
+			cmdb.bindSrv(0, 0, lights.getBufferView());
+
+			cmdb.bindUav(0, 0, lightIndexCountsPerCellBuff);
+			cmdb.bindUav(1, 0, lightIndexCountBuff);
+
+			cmdb.setFastConstants(&consts, sizeof(consts));
+
+			dispatchPPCompute(cmdb, 64, 1, lights.getElementCount(), 1); // NOTE(review): presumably 64-wide groups with one thread per light - confirm against dispatchPPCompute
+		});
+	}
+
+	// PrefixSum: recorded unconditionally, it is not gated on the light count
+	{
+		NonGraphicsRenderPass& pass =
+			rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU local light vis prefix sum: %s", in.m_passesName.cstr()));
+
+		pass.newBufferDependency(dep, BufferUsageBit::kUavCompute); // Only `dep` is declared; no SRV is bound by this pass
+
+		pass.setWork([this, lightIndexCountsPerCellBuff, lightIndexOffsetsPerCellBuff, lightIndexCountBuff, consts](RenderPassWorkContext& rgraph) {
+			ANKI_TRACE_SCOPED_EVENT(GpuVisibilityLocalLightsPrefixSum);
+			CommandBuffer& cmdb = *rgraph.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_prefixSumGrProg.get());
+
+			cmdb.bindUav(0, 0, lightIndexCountsPerCellBuff); // All three buffers are bound as UAVs
+			cmdb.bindUav(1, 0, lightIndexOffsetsPerCellBuff);
+			cmdb.bindUav(2, 0, lightIndexCountBuff);
+
+			cmdb.setFastConstants(&consts, sizeof(consts));
+
+			cmdb.dispatchCompute(1, 1, 1); // A single workgroup is dispatched
+		});
+	}
+
+	// Fill: writes the light index list. Like the count pass it is only recorded when there are lights
+	if(lights.getElementCount())
 	{
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("GPU local light vis fill: %s", in.m_passesName.cstr()));
+
+		pass.newBufferDependency(dep, BufferUsageBit::kUavCompute); // NOTE(review): unlike the count pass, no GPU scene buffer dependency is declared although the lights SRV is bound below - confirm this is intentional
+
+		pass.setWork([this, lightIndexCountsPerCellBuff, lightIndexOffsetsPerCellBuff, lightIndexCountBuff, consts,
+					  lightIndexListBuff](RenderPassWorkContext& rgraph) {
+			ANKI_TRACE_SCOPED_EVENT(GpuVisibilityLocalLightsFill); // Own event name so the fill work is not reported as the prefix sum
+			const GpuSceneArrays::Light& lights = GpuSceneArrays::Light::getSingleton(); // Re-fetched here; the outer `lights` is not captured
+
+			CommandBuffer& cmdb = *rgraph.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_fillGrProg.get());
+
+			cmdb.bindSrv(0, 0, lights.getBufferView());
+			cmdb.bindSrv(1, 0, lightIndexOffsetsPerCellBuff); // The offsets are bound read-only (SRV) in this pass
+
+			cmdb.bindUav(0, 0, lightIndexCountBuff);
+			cmdb.bindUav(1, 0, lightIndexCountsPerCellBuff); // The per-cell counts are bound as a UAV in this pass
+			cmdb.bindUav(2, 0, lightIndexListBuff);
+
+			cmdb.setFastConstants(&consts, sizeof(consts));
+
+			dispatchPPCompute(cmdb, 64, 1, lights.getElementCount(), 1); // Same dispatch arguments as the count pass
+		});
 	}
 }
 

+ 2 - 2
AnKi/Renderer/Utils/GpuVisibility.h

@@ -340,8 +340,8 @@ public:
 	BufferHandle m_dependency; ///< Some handle to track dependencies. No need to track every buffer.
 
 	BufferView m_lightIndexOffsetsPerCellBuffer; ///< One offset to the m_lightIndexBuffer. One offset per cell.
-	BufferView m_lightIndexCountPerCellBuffer; ///< Number of lights per cell.
-	BufferView m_lightIndexBuffer; ///< Contains indexes to the GPU scene lights array.
+	BufferView m_lightIndexCountsPerCellBuffer; ///< Number of lights per cell.
+	BufferView m_lightIndexListBuffer; ///< Contains indexes to the GPU scene lights array.
 
 	/// @{
 	/// The volume of the grid.

+ 14 - 9
AnKi/Shaders/GpuVisibilityLocalLights.ankiprog

@@ -156,7 +156,7 @@ struct Func
 constexpr U32 kThreadCount = 1024; // Common for most GPUs
 constexpr U32 kMaxElementCountPerIteration = kThreadCount * 2;
 
-RWStructuredBuffer<U32> g_inputElements : register(u0); // It's the g_lightIndexCountsPerCell
+RWStructuredBuffer<U32> g_inputElements : register(u0); // It's the g_lightIndexCountsPerCell. RW because we want to zero it at the end
 
 RWStructuredBuffer<U32> g_outputElements : register(u1);
 
@@ -184,7 +184,7 @@ groupshared U32 g_valueSum;
 		const U32 firstElement = it * kMaxElementCountPerIteration;
 		const U32 endElement = min((it + 1) * kMaxElementCountPerIteration, elementCount);
 
-		// load input into shared memory
+		// Load input into shared memory
 		const U32 inIdx1 = 2 * tid + firstElement;
 		const U32 value1 = (inIdx1 < endElement) ? SBUFF(g_inputElements, inIdx1) : 0;
 		g_tmp[2 * tid] = value1;
@@ -209,9 +209,6 @@ groupshared U32 g_valueSum;
 			offset *= 2;
 		}
 
-		// Update the g_valueSum now that enough barriers have happened
-		InterlockedAdd(g_valueSum, value1 + value2);
-
 		// Clear the last element
 		if(tid == 0)
 		{
@@ -235,17 +232,26 @@ groupshared U32 g_valueSum;
 			}
 		}
 
+		// Good time to read it
+		const U32 valueSum = g_valueSum;
+
 		GroupMemoryBarrierWithGroupSync();
 
 		// Write to output buffer
 		if(inIdx1 < endElement)
 		{
-			SBUFF(g_outputElements, inIdx1) = g_tmp[2 * tid] + g_valueSum;
+			SBUFF(g_outputElements, inIdx1) = g_tmp[2 * tid] + valueSum;
 		}
 
 		if(inIdx2 < endElement)
 		{
-			SBUFF(g_outputElements, inIdx2) = g_tmp[2 * tid + 1] + g_valueSum;
+			SBUFF(g_outputElements, inIdx2) = g_tmp[2 * tid + 1] + valueSum;
+		}
+
+		// Good time to update it
+		if(value1 + value2 > 0)
+		{
+			InterlockedAdd(g_valueSum, value1 + value2);
 		}
 	}
 
@@ -281,7 +287,6 @@ groupshared U32 g_valueSum;
 #if NOT_ZERO(ANKI_TECHNIQUE_Fill)
 
 StructuredBuffer<GpuSceneLight> g_lights : register(t0);
-
 StructuredBuffer<U32> g_lightIndexListOffsets : register(t1); // Basically the prefix sum. One per cell
 
 RWStructuredBuffer<U32> g_lightIndexCount : register(u0);
@@ -295,7 +300,7 @@ struct Func
 	void operator()(U32 clusterIdx, U32 lightIdx)
 	{
 		U32 offset;
-		InterlockedAdd(SBUFF(g_lightIndexCountsPerCell, clusterIdx), offset);
+		InterlockedAdd(SBUFF(g_lightIndexCountsPerCell, clusterIdx), 1, offset);
 
 		offset += SBUFF(g_lightIndexListOffsets, clusterIdx);
 

+ 1 - 1
AnKi/Shaders/Include/Common.h

@@ -66,7 +66,7 @@ ANKI_END_NAMESPACE
 #	if defined(ANKI_ASSERTIONS_ENABLED) && ANKI_ASSERTIONS_ENABLED == 1 && ANKI_GR_BACKEND_VULKAN
 #		define ANKI_ASSERT(x) \
 			if(!(x)) \
-			printf("Assertion failed. Line %i", __LINE__)
+			printf("Assertion failed. (" __FILE__ ":%i)", __LINE__)
 #	else
 #		define ANKI_ASSERT(x)
 #	endif

+ 2 - 5
AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog

@@ -20,8 +20,6 @@
 #pragma anki technique BilateralDenoise comp mutators
 #pragma anki technique VisualizeProbes vert pixel mutators
 
-#define ANKI_ASSERTIONS_ENABLED 1
-
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/Functions.hlsl>
 #include <AnKi/Shaders/Include/MiscRendererTypes.h>
@@ -643,8 +641,7 @@ void oneIn4Reconstruct(IVec2 svDispatchThreadId)
 				maxLumaPixel = IVec2(x, y);
 			}
 
-			coord = quarterCoord + IVec2(x, y);
-			coord *= 2;
+			coord = quarterCoord * 2 + IVec2(x, y);
 			sampleDepths[x][y] = TEX(g_depthTex, coord);
 		}
 	}
@@ -849,7 +846,7 @@ RWTexture2D<Vec4> g_outTex : register(u0);
 			}
 
 			IVec2 newCoord = coord + IVec2(x, y);
-			newCoord.x = clamp(newCoord.x, 0, viewport.x - 1);
+			newCoord = clamp(newCoord, 0, viewport - 1);
 
 			const Vec3 sampleColor = TEX(g_inTex, newCoord);
 			const F32 sampleDepth = TEX(g_depthTex, newCoord);