Browse Source

Local light visibility (WIP)

Panagiotis Christopoulos Charitos 4 months ago
parent
commit
bbbbb09d7d

+ 6 - 0
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -598,6 +598,12 @@ Error GrManagerImpl::initInstance()
 	m_capabilities.m_minWaveSize = props13.minSubgroupSize;
 	m_capabilities.m_maxWaveSize = props13.maxSubgroupSize;
 
+	if(props2.properties.limits.maxComputeWorkGroupInvocations < 1024)
+	{
+		ANKI_VK_LOGE("GPU doesn't support at least 1024 workgroup invocations");
+		return Error::kFunctionFailed;
+	}
+
 	// Find vendor
 	switch(props2.properties.vendorID)
 	{

+ 3 - 1
AnKi/Renderer/RendererObject.def.h

@@ -50,7 +50,9 @@ ANKI_RENDERER_OBJECT_DEF(IndirectDiffuse, indirectDiffuse,
 ANKI_RENDERER_OBJECT_DEF(RenderableDrawer, drawer, 1)
 ANKI_RENDERER_OBJECT_DEF(GpuVisibility, gpuVisibility, 1)
 ANKI_RENDERER_OBJECT_DEF(GpuVisibilityNonRenderables, gpuVisibilityNonRenderables, 1)
-ANKI_RENDERER_OBJECT_DEF(GpuVisibilityAccelerationStructures, gpuVisibilityAccelerationStructures, 1)
+ANKI_RENDERER_OBJECT_DEF(GpuVisibilityAccelerationStructures, gpuVisibilityAccelerationStructures,
+						 GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled)
+ANKI_RENDERER_OBJECT_DEF(GpuVisibilityLocalLights, gpuVisibilityLocalLights, GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled)
 ANKI_RENDERER_OBJECT_DEF(HzbGenerator, hzbGenerator, 1)
 ANKI_RENDERER_OBJECT_DEF(ReadbackManager, readbackManager, 1)
 ANKI_RENDERER_OBJECT_DEF(MipmapGenerator, mipmapGenerator, 1)

+ 54 - 0
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -1195,4 +1195,58 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 	}
 }
 
+/// Requests the four compute techniques (Setup, Count, PrefixSum, Fill) of the GpuVisibilityLocalLights program binary.
+/// @return Error::kNone when every loadShaderProgram call succeeds. ANKI_CHECK is expected to bail out on the first failure.
+Error GpuVisibilityLocalLights::init()
+{
+	const CString programFilename = "ShaderBinaries/GpuVisibilityLocalLights.ankiprogbin";
+
+	// Pairs every technique name with the member that receives its program. Order matches the shader's pipeline
+	struct TechniqueSlot
+	{
+		ShaderProgramPtr* m_grProg;
+		const char* m_technique;
+	};
+
+	const TechniqueSlot slots[] = {{&m_setupGrProg, "Setup"}, {&m_countGrProg, "Count"}, {&m_prefixSumGrProg, "PrefixSum"}, {&m_fillGrProg, "Fill"}};
+
+	for(const TechniqueSlot& slot : slots)
+	{
+		ANKI_CHECK(loadShaderProgram(programFilename, {}, m_visibilityProg, *slot.m_grProg, slot.m_technique));
+	}
+
+	return Error::kNone;
+}
+
+/// Records the render graph work that bins the local lights into a grid of cells placed around the camera.
+/// Only the Setup pass is recorded at the moment; Count, PrefixSum and Fill are still to be added.
+/// @param in Grid description (cell counts and size, camera position and look direction), pass name and the render graph builder.
+///           in.m_rgraph is dereferenced unconditionally.
+/// @param[out] out Receives the grid bounds, the dependency handle and the per-cell buffers. m_lightIndexBuffer is not written yet.
+void GpuVisibilityLocalLights::populateRenderGraph(GpuVisibilityLocalLightsInput& in, GpuVisibilityLocalLightsOutput& out)
+{
+	RenderGraphBuilder& rgraph = *in.m_rgraph;
+
+	// Compute the bounds. The grid is centered on the camera position pushed by kForwardBias along the look direction
+	{
+		const Vec3 newCamPos = in.m_cameraPosition + in.m_lookDirection * kForwardBias;
+		const Vec3 gridSize = Vec3(in.m_cellCounts) * in.m_cellSize;
+
+		out.m_lightGridMin = newCamPos - gridSize / 2.0f;
+		out.m_lightGridMax = out.m_lightGridMin + gridSize;
+	}
+
+	const U32 cellCount = in.m_cellCounts.x() * in.m_cellCounts.y() * in.m_cellCounts.z();
+
+	const BufferView lightIndexCountsPerCellBuff = allocateStructuredBuffer<U32>(cellCount);
+	const BufferView lightIndexOffsetsPerCellBuff = allocateStructuredBuffer<U32>(cellCount);
+	const BufferView lightIndexCountBuff = allocateStructuredBuffer<U32>(1);
+
+	// A single imported buffer is used to order all the passes
+	const BufferHandle dep = rgraph.importBuffer(lightIndexCountBuff, BufferUsageBit::kNone);
+
+	// Publish what exists so far so the caller doesn't read stale handles
+	// TODO: Populate out.m_lightIndexBuffer once the Fill pass is implemented
+	out.m_dependency = dep;
+	out.m_lightIndexOffsetsPerCellBuffer = lightIndexOffsetsPerCellBuff;
+	out.m_lightIndexCountPerCellBuffer = lightIndexCountsPerCellBuff;
+
+	// Setup
+	{
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass(generateTempPassName("Setup: %s", in.m_passesName.cstr()));
+
+		pass.newBufferDependency(dep, BufferUsageBit::kUavCompute);
+		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
+
+		pass.setWork([this, lightIndexCountsPerCellBuff, lightIndexCountBuff, cellCount](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(GpuVisibilityLocalLightsSetup);
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			// The dispatch needs the Setup technique bound, otherwise it runs whatever program was bound before
+			cmdb.bindShaderProgram(m_setupGrProg.get());
+
+			cmdb.bindUav(0, 0, lightIndexCountsPerCellBuff);
+			cmdb.bindUav(1, 0, lightIndexCountBuff);
+
+			// One thread per cell, 64 threads per group to match the shader's numthreads
+			dispatchPPCompute(cmdb, 64, 1, cellCount, 1);
+		});
+	}
+
+	// Count
+	{
+		// TODO: Not implemented yet
+	}
+}
+
 } // end namespace anki

+ 53 - 0
AnKi/Renderer/Utils/GpuVisibility.h

@@ -315,6 +315,59 @@ private:
 	U64 m_lastFrameIdx = kMaxU64;
 #endif
 };
+
+/// @memberof GpuVisibilityLocalLights Input of GpuVisibilityLocalLights::populateRenderGraph.
+class GpuVisibilityLocalLightsInput
+{
+public:
+	UVec3 m_cellCounts; ///< Number of cells per axis. The product of the 3 components is the total cell count.
+	Vec3 m_cellSize; ///< Size of a cell per axis. Multiplied with m_cellCounts to get the size of the grid.
+
+	Vec3 m_cameraPosition; ///< The grid is centered around this position after it is pushed along m_lookDirection.
+	Vec3 m_lookDirection; ///< Direction the grid center is biased towards. Presumably normalized, confirm with the caller.
+
+	U32 m_lightIndexListSize = 0; ///< The number of light indices to store.
+
+	CString m_passesName = "GpuVisibilityLocalLights"; ///< Becomes part of the render graph pass names.
+
+	RenderGraphBuilder* m_rgraph = nullptr; ///< The builder that receives the passes. Dereferenced without a null check.
+};
+
+/// @memberof GpuVisibilityLocalLights Output of GpuVisibilityLocalLights::populateRenderGraph.
+class GpuVisibilityLocalLightsOutput
+{
+public:
+	BufferHandle m_dependency; ///< Some handle to track dependencies. No need to track every buffer.
+
+	BufferView m_lightIndexOffsetsPerCellBuffer; ///< One offset to the m_lightIndexBuffer. One offset per cell.
+	BufferView m_lightIndexCountPerCellBuffer; ///< Number of lights per cell.
+	BufferView m_lightIndexBuffer; ///< Contains indexes to the GPU scene lights array.
+
+	/// @{
+	/// The volume of the grid.
+	Vec3 m_lightGridMin; ///< Biased camera position minus half the grid size.
+	Vec3 m_lightGridMax; ///< m_lightGridMin plus the grid size.
+	/// @}
+};
+
+/// Gathers the local lights around the camera to a grid. The GPU work is split in 4 compute techniques: Setup, Count, PrefixSum and Fill.
+class GpuVisibilityLocalLights : public RendererObject
+{
+public:
+	Error init(); ///< Loads the shader program and its 4 techniques.
+
+	void populateRenderGraph(GpuVisibilityLocalLightsInput& in, GpuVisibilityLocalLightsOutput& out); ///< Computes the grid bounds and records the passes.
+
+private:
+	static constexpr F32 kForwardBias = 4.0f; ///< How far the grid center is pushed from the camera along the look direction.
+
+	ShaderProgramResourcePtr m_visibilityProg; ///< The resource all the techniques bellow are fetched from.
+
+	ShaderProgramPtr m_setupGrProg; ///< "Setup" technique.
+	ShaderProgramPtr m_countGrProg; ///< "Count" technique.
+	ShaderProgramPtr m_prefixSumGrProg; ///< "PrefixSum" technique.
+	ShaderProgramPtr m_fillGrProg; ///< "Fill" technique.
+};
 /// @}
 
 } // end namespace anki

+ 312 - 0
AnKi/Shaders/GpuVisibilityLocalLights.ankiprog

@@ -0,0 +1,312 @@
+// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+// Terminology:
+// - Grid: The volume we are looking to gather lights for
+// - Cell: The grid is divided into cells
+// - Light index list: An array of indices that point to the GPU scene lights. Each cell points to a part of this list
+
+#pragma anki technique Setup comp
+#pragma anki technique Count comp
+#pragma anki technique PrefixSum comp
+#pragma anki technique Fill comp
+
+#include <AnKi/Shaders/Common.hlsl>
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+#include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
+#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
+
+/// Tests one light against the grid and calls binLightToCellFunc(cellIdx, lightIdx) for every cell the light's AABB overlaps.
+/// The Count and Fill techniques share this traversal and only differ in the functor they pass.
+/// @param lights The lights to read from.
+/// @param lightIdx The light to test. Indices past the end of the lights buffer are ignored.
+/// @param consts Grid description: cell size, cell counts, grid bounds and the light index budget.
+/// @param lightIndexCount Single element counter of the (cell, light) pairs visited so far by all threads.
+/// @param binLightToCellFunc Callable that takes (U32 cellIdx, U32 lightIdx).
+template<typename TFunc>
+void lightVsCellVisibility(StructuredBuffer<GpuSceneLight> lights, U32 lightIdx, GpuVisibilityLocalLightsConsts consts,
+						   RWStructuredBuffer<U32> lightIndexCount, TFunc binLightToCellFunc)
+{
+	const U32 lightCount = getStructuredBufferElementCount(lights);
+	if(lightIdx >= lightCount)
+	{
+		return;
+	}
+
+	const GpuSceneLight light = SBUFF(lights, lightIdx);
+
+	// Get the light bounds
+	Vec3 worldLightAabbMin;
+	Vec3 worldLightAabbMax;
+	if((U32)light.m_flags & (U32)GpuSceneLightFlag::kPointLight)
+	{
+		worldLightAabbMin = light.m_position - light.m_radius;
+		worldLightAabbMax = light.m_position + light.m_radius;
+	}
+	else
+	{
+		// Not a point light: bound the light's position and its 4 edge points
+		worldLightAabbMin = light.m_position;
+		worldLightAabbMax = light.m_position;
+
+		[unroll] for(U32 i = 0; i < 4; ++i)
+		{
+			worldLightAabbMin = min(worldLightAabbMin, light.m_edgePoints[i]);
+			worldLightAabbMax = max(worldLightAabbMax, light.m_edgePoints[i]);
+		}
+	}
+
+	// Move the bounds to grid-local space and clamp them to the grid. The upper limit must be the extent of the grid, not its world-space
+	// max corner, because the bounds are already relative to m_gridVolumeMin
+	const Vec3 localGridMax = (consts.m_gridVolumeMax - consts.m_gridVolumeMin) - kEpsilonF32;
+
+	Vec3 localLightAabbMin = worldLightAabbMin - consts.m_gridVolumeMin;
+	localLightAabbMin = clamp(localLightAabbMin, 0.0, localGridMax);
+
+	Vec3 localLightAabbMax = worldLightAabbMax - consts.m_gridVolumeMin;
+	localLightAabbMax = clamp(localLightAabbMax, 0.0, localGridMax);
+
+	if(any(localLightAabbMin == localLightAabbMax))
+	{
+		// Outside the volume
+		return;
+	}
+
+	const Vec3 localLightFirstCell = floor(localLightAabbMin / consts.m_cellSize);
+	const Vec3 localLightEndCell = ceil(localLightAabbMax / consts.m_cellSize);
+
+	// Do the index math in integers to avoid losing precision on big grids
+	const U32 cellCountX = U32(consts.m_cellCounts.x);
+	const U32 cellCountY = U32(consts.m_cellCounts.y);
+
+	for(F32 x = localLightFirstCell.x; x < localLightEndCell.x; x += 1.0)
+	{
+		for(F32 y = localLightFirstCell.y; y < localLightEndCell.y; y += 1.0)
+		{
+			for(F32 z = localLightFirstCell.z; z < localLightEndCell.z; z += 1.0)
+			{
+				U32 count;
+				InterlockedAdd(SBUFF(lightIndexCount, 0), 1, count);
+				++count;
+
+				if(count > consts.m_maxLightIndices)
+				{
+					// Light index list is too small. Stop the whole traversal, a break would only leave the inner loop and the outer loops
+					// would keep bumping the counter
+					return;
+				}
+
+				const U32 cellIdx = U32(z) * cellCountY * cellCountX + U32(y) * cellCountX + U32(x);
+
+				binLightToCellFunc(cellIdx, lightIdx);
+			}
+		}
+	}
+}
+
+// ===========================================================================
+// Setup                                                                     =
+// ===========================================================================
+#if NOT_ZERO(ANKI_TECHNIQUE_Setup)
+
+// Zeroes the per-cell counters and the global counter. One thread per cell
+
+RWStructuredBuffer<U32> g_lightIndexCountsPerCell : register(u0);
+RWStructuredBuffer<U32> g_lightIndexCount : register(u1);
+
+ANKI_FAST_CONSTANTS(GpuVisibilityLocalLightsConsts, g_consts)
+
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
+{
+	const U32 cellIdx = svDispatchThreadId.x;
+
+	// Threads past the end of the buffer have no cell to reset
+	if(cellIdx < getStructuredBufferElementCount(g_lightIndexCountsPerCell))
+	{
+		SBUFF(g_lightIndexCountsPerCell, cellIdx) = 0;
+	}
+
+	// The very first thread also resets the global counter
+	if(cellIdx == 0)
+	{
+		SBUFF(g_lightIndexCount, 0) = 0;
+	}
+}
+#endif
+
+// ===========================================================================
+// Count                                                                     =
+// ===========================================================================
+
+// Counts the light indices per cell
+
+#if NOT_ZERO(ANKI_TECHNIQUE_Count)
+
+StructuredBuffer<GpuSceneLight> g_lights : register(t0); // The lights to bin. One thread per light
+
+RWStructuredBuffer<U32> g_lightIndexCountsPerCell : register(u0); // Incremented once for every light that touches the cell
+RWStructuredBuffer<U32> g_lightIndexCount : register(u1); // Global counter that lightVsCellVisibility increments per (cell, light) pair
+
+ANKI_FAST_CONSTANTS(GpuVisibilityLocalLightsConsts, g_consts)
+
+struct Func // The functor lightVsCellVisibility calls for every overlapped cell
+{
+	void operator()(U32 cellIdx, U32 lightIdx)
+	{
+		InterlockedAdd(SBUFF(g_lightIndexCountsPerCell, cellIdx), 1); // Only count, lightIdx is not stored by this technique
+	}
+};
+
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
+{
+	Func func;
+	lightVsCellVisibility(g_lights, svDispatchThreadId.x, g_consts, g_lightIndexCount, func); // svDispatchThreadId.x is the light index
+}
+#endif
+
+// ===========================================================================
+// PrefixSum                                                                 =
+// ===========================================================================
+
+// Parallel prefix based on: https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
+// But it runs multiple iterations to support bigger arrays
+
+#if NOT_ZERO(ANKI_TECHNIQUE_PrefixSum)
+
+// Computes the exclusive prefix sum of g_inputElements into g_outputElements. The input is processed in chunks of
+// kMaxElementCountPerIteration elements by the same threads, so this presumably expects to be dispatched as a single workgroup
+
+constexpr U32 kThreadCount = 1024; // Common for most GPUs
+constexpr U32 kMaxElementCountPerIteration = kThreadCount * 2; // Every thread handles 2 elements per chunk
+
+RWStructuredBuffer<U32> g_inputElements : register(u0); // It's the g_lightIndexCountsPerCell
+
+RWStructuredBuffer<U32> g_outputElements : register(u1);
+
+// Some stuff to zero
+RWStructuredBuffer<U32> g_lightIndexCount : register(u2);
+
+ANKI_FAST_CONSTANTS(GpuVisibilityLocalLightsConsts, g_consts)
+
+groupshared U32 g_tmp[kMaxElementCountPerIteration]; // Scratch space for the scan of the current chunk
+groupshared U32 g_valueSum; // The sum of all the elements of the chunks processed so far
+
+[numthreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
+{
+	const U32 elementCount = U32(g_consts.m_cellCounts.x) * U32(g_consts.m_cellCounts.y) * U32(g_consts.m_cellCounts.z);
+	const U32 iterationCount = (elementCount + kMaxElementCountPerIteration - 1) / kMaxElementCountPerIteration;
+
+	const U32 tid = svGroupIndex;
+
+	g_valueSum = 0; // No need for barrier, there are plenty below
+
+	for(U32 it = 0; it < iterationCount; ++it)
+	{
+		GroupMemoryBarrierWithGroupSync(); // Barrier because of the loop
+
+		// Snapshot the sum of the previous chunks before this chunk's total is added to it. The write to g_valueSum happens after the
+		// barriers of the reduction so all threads read the same value here
+		const U32 previousChunksSum = g_valueSum;
+
+		const U32 firstElement = it * kMaxElementCountPerIteration;
+		const U32 endElement = min((it + 1) * kMaxElementCountPerIteration, elementCount);
+
+		// load input into shared memory. Out of bounds elements count as zero
+		const U32 inIdx1 = 2 * tid + firstElement;
+		const U32 value1 = (inIdx1 < endElement) ? SBUFF(g_inputElements, inIdx1) : 0;
+		g_tmp[2 * tid] = value1;
+
+		const U32 inIdx2 = 2 * tid + 1 + firstElement;
+		const U32 value2 = (inIdx2 < endElement) ? SBUFF(g_inputElements, inIdx2) : 0;
+		g_tmp[2 * tid + 1] = value2;
+
+		// Perform reduction
+		U32 offset = 1;
+		for(U32 d = kMaxElementCountPerIteration >> 1; d > 0; d >>= 1)
+		{
+			GroupMemoryBarrierWithGroupSync();
+
+			if(tid < d)
+			{
+				const U32 ai = offset * (2 * tid + 1) - 1;
+				const U32 bi = offset * (2 * tid + 2) - 1;
+				g_tmp[bi] += g_tmp[ai];
+			}
+
+			offset *= 2;
+		}
+
+		if(tid == 0)
+		{
+			// The last element now holds the total of this chunk and it was written by this very thread in the last reduction step.
+			// Accumulate it for the next chunks. The outputs of this chunk use previousChunksSum so they are not affected
+			g_valueSum += g_tmp[kMaxElementCountPerIteration - 1];
+
+			// Clear the last element
+			g_tmp[kMaxElementCountPerIteration - 1] = 0;
+		}
+
+		// Perform downsweep and build scan
+		for(U32 d = 1; d < kMaxElementCountPerIteration; d *= 2)
+		{
+			offset >>= 1;
+
+			GroupMemoryBarrierWithGroupSync();
+
+			if(tid < d)
+			{
+				const U32 ai = offset * (2 * tid + 1) - 1;
+				const U32 bi = offset * (2 * tid + 2) - 1;
+				const U32 t = g_tmp[ai];
+				g_tmp[ai] = g_tmp[bi];
+				g_tmp[bi] += t;
+			}
+		}
+
+		GroupMemoryBarrierWithGroupSync();
+
+		// Write to output buffer. Offset the chunk-local scan by the sum of the chunks that came before it
+		if(inIdx1 < endElement)
+		{
+			SBUFF(g_outputElements, inIdx1) = g_tmp[2 * tid] + previousChunksSum;
+		}
+
+		if(inIdx2 < endElement)
+		{
+			SBUFF(g_outputElements, inIdx2) = g_tmp[2 * tid + 1] + previousChunksSum;
+		}
+	}
+
+	// Abuse this compute job to also reset some buffers
+	if(tid == 0)
+	{
+		SBUFF(g_lightIndexCount, 0) = 0;
+	}
+
+	{
+		// Every thread zeroes a contiguous range of the input
+		const U32 elementsPerThread = (elementCount + kThreadCount - 1) / kThreadCount;
+
+		for(U32 i = 0; i < elementsPerThread; ++i)
+		{
+			const U32 idx = tid * elementsPerThread + i;
+			if(idx >= elementCount)
+			{
+				break;
+			}
+
+			SBUFF(g_inputElements, idx) = 0;
+		}
+	}
+}
+#endif
+
+// ===========================================================================
+// Fill                                                                      =
+// ===========================================================================
+
+// After the prefix sum is complete this job can store the results
+
+#if NOT_ZERO(ANKI_TECHNIQUE_Fill)
+
+StructuredBuffer<GpuSceneLight> g_lights : register(t0);
+
+StructuredBuffer<U32> g_lightIndexListOffsets : register(t1); // Basically the prefix sum. One per cell
+
+RWStructuredBuffer<U32> g_lightIndexCount : register(u0);
+RWStructuredBuffer<U32> g_lightIndexCountsPerCell : register(u1); // Used as a per-cell write cursor in this technique
+RWStructuredBuffer<U32> g_lightIndexList : register(u2);
+
+ANKI_FAST_CONSTANTS(GpuVisibilityLocalLightsConsts, g_consts)
+
+// The functor lightVsCellVisibility calls for every overlapped cell. Stores the light index to the cell's part of the light index list
+struct Func
+{
+	void operator()(U32 cellIdx, U32 lightIdx)
+	{
+		// Reserve a slot inside the cell. The 3 argument form is needed to get the value before the add, the 2 argument form would add
+		// the uninitialized offset and return nothing
+		U32 offset;
+		InterlockedAdd(SBUFF(g_lightIndexCountsPerCell, cellIdx), 1, offset);
+
+		// Move to the part of the list that belongs to the cell
+		offset += SBUFF(g_lightIndexListOffsets, cellIdx);
+
+		SBUFF(g_lightIndexList, offset) = lightIdx;
+	}
+};
+
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
+{
+	Func func;
+	lightVsCellVisibility(g_lights, svDispatchThreadId.x, g_consts, g_lightIndexCount, func);
+}
+
+#endif

+ 15 - 0
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -104,4 +104,19 @@ struct GpuVisibilityNonRenderablesCounters
 	U32 m_feedbackObjectCount; ///< Counts the visbile objects that need feedback
 };
 
+struct GpuVisibilityLocalLightsConsts // Constants of the GpuVisibilityLocalLights techniques
+{
+	Vec3 m_cellSize; ///< Size of a cell per axis. Grid-local positions are divided by it to get cell coordinates.
+	U32 m_maxLightIndices; ///< Binning stops when the global light index counter exceeds this.
+
+	Vec3 m_gridVolumeMin; ///< Min corner of the grid. Subtracted from the light bounds to make them grid-local.
+	F32 m_padding2;
+
+	Vec3 m_gridVolumeMax; ///< Max corner of the grid. Presumably in the same space as m_gridVolumeMin, confirm with the C++ side.
+	F32 m_padding3;
+
+	Vec3 m_cellCounts; ///< Number of cells per axis stored as floats. The shaders convert the products to U32.
+	F32 m_padding4;
+};
+
 ANKI_END_NAMESPACE