Răsfoiți Sursa

Improve fog

Panagiotis Christopoulos Charitos 3 săptămâni în urmă
părinte
comite
ee83ac1a72

+ 0 - 4
AnKi/Config.h.cmake

@@ -7,9 +7,6 @@
 
 #pragma once
 
-/// @addtogroup config
-/// @{
-
 #define _ANKI_STR_HELPER(x) #x
 #define _ANKI_STR(x) _ANKI_STR_HELPER(x)
 
@@ -350,4 +347,3 @@ void cleanupGetAndroidCommandLineArguments(void* ptr);
 		return exitCode; \
 	}
 #endif
-/// @}

+ 27 - 8
AnKi/Renderer/ClusterBinning.cpp

@@ -38,6 +38,8 @@ Error ClusterBinning::init()
 									 m_packingGrProgs[type], "PackVisibles"));
 	}
 
+	m_tileCounts = (getRenderer().getInternalResolution() + kClusteredShadingTileSize - 1) / kClusteredShadingTileSize;
+
 	return Error::kNone;
 }
 
@@ -49,8 +51,10 @@ void ClusterBinning::populateRenderGraph()
 
 	// Allocate the clusters buffer
 	{
-		const U32 clusterCount = getRenderer().getTileCounts().x * getRenderer().getTileCounts().y + getRenderer().getZSplitCount();
-		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(clusterCount);
+		const U32 clusterCount = m_tileCounts.x * m_tileCounts.y + g_cvarRenderClustererZSplitCount;
+
+		// Allocate +1 which is used as a zero cluster when a point is outside the clusterer
+		m_runCtx.m_clustersBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<Cluster>(clusterCount + 1);
 		m_runCtx.m_dep = rgraph.importBuffer(m_runCtx.m_clustersBuffer, BufferUsageBit::kNone);
 	}
 
@@ -78,7 +82,7 @@ void ClusterBinning::populateRenderGraph()
 
 			cmdb.bindShaderProgram(m_jobSetupGrProg.get());
 
-			const UVec4 consts(getRenderer().getTileCounts().x * getRenderer().getTileCounts().y);
+			const UVec4 consts(m_tileCounts.x * m_tileCounts.y);
 			cmdb.setFastConstants(&consts, sizeof(consts));
 
 			for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
@@ -162,7 +166,7 @@ void ClusterBinning::populateRenderGraph()
 				struct ClusterBinningConstants
 				{
 					Vec3 m_cameraOrigin;
-					F32 m_zSplitCountOverFrustumLength;
+					F32 m_zSplitCountOverClustererLength;
 
 					Vec2 m_renderingSize;
 					U32 m_tileCountX;
@@ -180,16 +184,16 @@ void ClusterBinning::populateRenderGraph()
 
 				RenderingContext& ctx = getRenderingContext();
 				consts.m_cameraOrigin = ctx.m_matrices.m_cameraTransform.getTranslationPart().xyz;
-				consts.m_zSplitCountOverFrustumLength = F32(getRenderer().getZSplitCount()) / (ctx.m_matrices.m_far - ctx.m_matrices.m_near);
+				consts.m_zSplitCountOverClustererLength = F32(g_cvarRenderClustererZSplitCount) / (computeClustererFar() - ctx.m_matrices.m_near);
 				consts.m_renderingSize = Vec2(getRenderer().getInternalResolution());
-				consts.m_tileCountX = getRenderer().getTileCounts().x;
-				consts.m_tileCount = getRenderer().getTileCounts().x * getRenderer().getTileCounts().y;
+				consts.m_tileCountX = m_tileCounts.x;
+				consts.m_tileCount = m_tileCounts.x * m_tileCounts.y;
 
 				Plane nearPlane;
 				extractClipPlane(ctx.m_matrices.m_viewProjection, FrustumPlaneType::kNear, nearPlane);
 				consts.m_nearPlaneWorld = Vec4(nearPlane.getNormal().xyz, nearPlane.getOffset());
 
-				consts.m_zSplitCountMinusOne = getRenderer().getZSplitCount() - 1;
+				consts.m_zSplitCountMinusOne = g_cvarRenderClustererZSplitCount - 1;
 
 				consts.m_invertedViewProjMat = ctx.m_matrices.m_invertedViewProjectionJitter;
 
@@ -280,4 +284,19 @@ void ClusterBinning::populateRenderGraph()
 	}
 }
 
+void ClusterBinning::fillClustererConstants(ClustererConstants& consts) const // Fills the clusterer-related constants into "consts"
+{
+	const F32 clustererFar = computeClustererFar();
+	const F32 zSplitCount = F32(g_cvarRenderClustererZSplitCount);
+	const F32 n = getRenderingContext().m_matrices.m_near;
+	const F32 f = getRenderingContext().m_matrices.m_far;
+
+	consts.m_zSplitMagic.x = (clustererFar - n) / (-n * zSplitCount); // The "a" term of the shader's computeZSplitClusterIndex
+	consts.m_zSplitMagic.y = f * (clustererFar - n) / (n * (f - n) * zSplitCount); // The "b" term of computeZSplitClusterIndex
+	consts.m_tileCounts = m_tileCounts;
+	consts.m_clustererFar = clustererFar;
+	consts.m_clusterCount = consts.m_tileCounts.x * consts.m_tileCounts.y + U32(zSplitCount); // Tiles + Z splits, same as the buffer layout
+	consts.m_zSplitCount = U32(zSplitCount);
+}
+
 } // end namespace anki

+ 18 - 0
AnKi/Renderer/ClusterBinning.h

@@ -9,6 +9,9 @@
 
 namespace anki {
 
+ANKI_CVAR2(NumericCVar<U32>, Render, Clusterer, ZSplitCount, 64, 8, kMaxZsplitCount, "Clusterer number of Z splits")
+ANKI_CVAR2(NumericCVar<F32>, Render, Clusterer, Far, 512.0f, 32.0f, 10.0f * 1000.0f, "The extend of the clusterer in meters")
+
 // Bins clusterer objects to the clusterer.
 class ClusterBinning : public RendererObject
 {
@@ -36,12 +39,27 @@ public:
 		return m_runCtx.m_dep;
 	}
 
+	// Returns the length of the cluster frustum. It's less or equal to camera far.
+	F32 computeClustererFar() const
+	{
+		return min<F32>(getRenderingContext().m_matrices.m_far, g_cvarRenderClustererFar);
+	}
+
+	const UVec2& getTileCounts() const
+	{
+		return m_tileCounts;
+	}
+
+	void fillClustererConstants(ClustererConstants& consts) const;
+
 private:
 	ShaderProgramResourcePtr m_prog;
 	ShaderProgramPtr m_jobSetupGrProg;
 	Array<ShaderProgramPtr, U32(GpuSceneNonRenderableObjectType::kCount)> m_binningGrProgs;
 	Array<ShaderProgramPtr, U32(GpuSceneNonRenderableObjectType::kCount)> m_packingGrProgs;
 
+	UVec2 m_tileCounts = UVec2(0u);
+
 	class
 	{
 	public:

+ 2 - 16
AnKi/Renderer/LightShading.cpp

@@ -176,26 +176,12 @@ void LightShading::run(RenderPassWorkContext& rgraphCtx)
 		cmdb.bindShaderProgram(m_applyFog.m_grProg.get());
 
 		// Bind all
-		cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
-		cmdb.bindSampler(1, 0, getRenderer().getSamplers().m_trilinearClamp.get());
+		cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
 
 		rgraphCtx.bindSrv(0, 0, getGBuffer().getDepthRt());
 		rgraphCtx.bindSrv(1, 0, getRenderer().getVolumetricFog().getRt());
 
-		class Consts
-		{
-		public:
-			F32 m_zSplitCount;
-			F32 m_finalZSplit;
-			F32 m_near;
-			F32 m_far;
-		} consts;
-		consts.m_zSplitCount = F32(getRenderer().getZSplitCount());
-		consts.m_finalZSplit = F32(getRenderer().getVolumetricFog().getFinalClusterInZ());
-		consts.m_near = getRenderingContext().m_matrices.m_near;
-		consts.m_far = getRenderingContext().m_matrices.m_far;
-
-		cmdb.setFastConstants(&consts, sizeof(consts));
+		cmdb.bindConstantBuffer(0, 0, getRenderingContext().m_globalRenderingConstantsBuffer);
 
 		// finalPixelColor = pixelWithoutFog * transmitance + inScattering (see the shader)
 		cmdb.setBlendFactors(0, BlendFactor::kOne, BlendFactor::kSrcAlpha);

+ 4 - 12
AnKi/Renderer/Renderer.cpp

@@ -163,10 +163,6 @@ Error Renderer::initInternal(const RendererInitInfo& inf)
 	ANKI_R_LOGI("Initializing offscreen renderer. Resolution %ux%u. Internal resolution %ux%u", m_postProcessResolution.x, m_postProcessResolution.y,
 				m_internalResolution.x, m_internalResolution.y);
 
-	m_tileCounts.x = (m_internalResolution.x + kClusteredShadingTileSize - 1) / kClusteredShadingTileSize;
-	m_tileCounts.y = (m_internalResolution.y + kClusteredShadingTileSize - 1) / kClusteredShadingTileSize;
-	m_zSplitCount = g_cvarRenderZSplitCount;
-
 	if(g_cvarCoreMeshletRendering && !GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
 	{
 		m_meshletRenderingType = MeshletRenderingType::kSoftware;
@@ -342,7 +338,7 @@ Error Renderer::populateRenderGraph()
 	m_historyLength->populateRenderGraph();
 	m_depthDownscale->populateRenderGraph();
 	m_shadowMapping->populateRenderGraph();
-	m_clusterBinning2->populateRenderGraph();
+	m_clusterBinning->populateRenderGraph();
 	m_generatedSky->populateRenderGraph();
 	if(m_indirectDiffuseProbes)
 	{
@@ -392,7 +388,7 @@ void Renderer::writeGlobalRendererConstants(GlobalRendererConstants& outConsts)
 
 	RenderingContext& ctx = getRenderingContext();
 	GlobalRendererConstants consts;
-	memset(&consts, 0, sizeof(consts));
+	zeroMemory(consts);
 
 	consts.m_renderingSize = Vec2(F32(m_internalResolution.x), F32(m_internalResolution.y));
 
@@ -404,12 +400,8 @@ void Renderer::writeGlobalRendererConstants(GlobalRendererConstants& outConsts)
 	consts.m_nearPlaneWSpace = Vec4(nearPlane.getNormal().xyz, nearPlane.getOffset());
 	consts.m_cameraPosition = ctx.m_matrices.m_cameraTransform.getTranslationPart().xyz;
 
-	consts.m_tileCounts = m_tileCounts;
-	consts.m_zSplitCount = m_zSplitCount;
-	consts.m_zSplitCountOverFrustumLength = F32(m_zSplitCount) / (ctx.m_matrices.m_far - ctx.m_matrices.m_near);
-	consts.m_zSplitMagic.x = (ctx.m_matrices.m_near - ctx.m_matrices.m_far) / (ctx.m_matrices.m_near * F32(m_zSplitCount));
-	consts.m_zSplitMagic.y = ctx.m_matrices.m_far / (ctx.m_matrices.m_near * F32(m_zSplitCount));
-	consts.m_lightVolumeLastZSplit = min(g_cvarRenderVolumetricLightingAccumulationFinalZSplit - 1, m_zSplitCount);
+	m_clusterBinning->fillClustererConstants(consts.m_clusterer);
+	m_volumetricLightingAccumulation->fillClustererConstants(consts.m_clusterer);
 
 	consts.m_reflectionProbesMipCount = F32(m_probeReflections->getReflectionTextureMipmapCount());
 

+ 0 - 14
AnKi/Renderer/Renderer.h

@@ -25,7 +25,6 @@ ANKI_CVAR(
 		return (value > 0.1f && value <= 8.0f) || value == 540.0f || value == 720.0f || value == 1080.0f || value == 1440.0f || value == 2160.0f;
 	},
 	"A factor over the requested swapchain resolution. Applies to post-processing and UI")
-ANKI_CVAR(NumericCVar<U32>, Render, ZSplitCount, 64, 8, kMaxZsplitCount, "Clusterer number of Z splits")
 ANKI_CVAR(NumericCVar<U8>, Render, TextureAnisotropy, (ANKI_PLATFORM_MOBILE) ? 1 : 16, 1, 16, "Texture anisotropy for the main passes")
 ANKI_CVAR(BoolCVar, Render, PreferCompute, !ANKI_PLATFORM_MOBILE, "Prefer compute shaders")
 ANKI_CVAR(BoolCVar, Render, HighQualityHdr, !ANKI_PLATFORM_MOBILE, "If true use R16G16B16 for HDR images. Alternatively use B10G11R11")
@@ -153,16 +152,6 @@ public:
 		return m_samplers;
 	}
 
-	const UVec2& getTileCounts() const
-	{
-		return m_tileCounts;
-	}
-
-	U32 getZSplitCount() const
-	{
-		return m_zSplitCount;
-	}
-
 	Format getHdrFormat() const;
 	Format getDepthNoStencilFormat() const;
 
@@ -275,9 +264,6 @@ private:
 
 	RenderGraphPtr m_rgraph;
 
-	UVec2 m_tileCounts = UVec2(0u);
-	U32 m_zSplitCount = 0;
-
 	class
 	{
 	public:

+ 1 - 1
AnKi/Renderer/RendererObject.def.h

@@ -15,6 +15,7 @@ ANKI_RENDERER_OBJECT_DEF(Tonemapping, tonemapping, 1)
 ANKI_RENDERER_OBJECT_DEF(FinalComposite, finalComposite, 1)
 ANKI_RENDERER_OBJECT_DEF(Dbg, dbg, 1)
 ANKI_RENDERER_OBJECT_DEF(ProbeReflections, probeReflections, 1)
+ANKI_RENDERER_OBJECT_DEF(ClusterBinning, clusterBinning, 1)
 ANKI_RENDERER_OBJECT_DEF(VolumetricFog, volumetricFog, 1)
 ANKI_RENDERER_OBJECT_DEF(DepthDownscale, depthDownscale, 1)
 ANKI_RENDERER_OBJECT_DEF(TemporalAA, temporalAA, 1)
@@ -34,7 +35,6 @@ ANKI_RENDERER_OBJECT_DEF(MotionVectors, motionVectors, 1)
 ANKI_RENDERER_OBJECT_DEF(TemporalUpscaler, temporalUpscaler, 1)
 ANKI_RENDERER_OBJECT_DEF(VrsSriGeneration, vrsSriGeneration, 1)
 ANKI_RENDERER_OBJECT_DEF(PrimaryNonRenderableVisibility, primaryNonRenderableVisibility, 1)
-ANKI_RENDERER_OBJECT_DEF(ClusterBinning, clusterBinning2, 1)
 ANKI_RENDERER_OBJECT_DEF(Ssao, ssao, 1)
 ANKI_RENDERER_OBJECT_DEF(GeneratedSky, generatedSky, 1)
 ANKI_RENDERER_OBJECT_DEF(MotionBlur, motionBlur, 1)

+ 10 - 14
AnKi/Renderer/VolumetricFog.cpp

@@ -8,6 +8,7 @@
 #include <AnKi/Renderer/DepthDownscale.h>
 #include <AnKi/Renderer/ShadowMapping.h>
 #include <AnKi/Renderer/LightShading.h>
+#include <AnKi/Renderer/ClusterBinning.h>
 #include <AnKi/Renderer/VolumetricLightingAccumulation.h>
 #include <AnKi/Util/CVarSet.h>
 #include <AnKi/Scene/Components/SkyboxComponent.h>
@@ -18,20 +19,17 @@ namespace anki {
 Error VolumetricFog::init()
 {
 	// Misc
-	const F32 qualityXY = g_cvarRenderVolumetricLightingAccumulationQualityXY;
-	const F32 qualityZ = g_cvarRenderVolumetricLightingAccumulationQualityZ;
-	m_finalZSplit = min<U32>(getRenderer().getZSplitCount() - 1, g_cvarRenderVolumetricLightingAccumulationFinalZSplit);
+	const U32 zSplitCount = min<U32>(g_cvarRenderClustererZSplitCount, g_cvarRenderVolumetricLightingAccumulationFinalZSplit + 1);
 
-	m_volumeSize[0] = U32(F32(getRenderer().getTileCounts().x) * qualityXY);
-	m_volumeSize[1] = U32(F32(getRenderer().getTileCounts().y) * qualityXY);
-	m_volumeSize[2] = U32(F32(m_finalZSplit + 1) * qualityZ);
+	m_volumeSize.xy = getClusterBinning().getTileCounts() << g_cvarRenderVolumetricLightingAccumulationSubdivisionXY;
+	m_volumeSize.z = zSplitCount << g_cvarRenderVolumetricLightingAccumulationSubdivisionZ;
 
 	// Shaders
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/VolumetricFogAccumulation.ankiprogbin", m_prog, m_grProg));
 
 	// RT descr
-	m_rtDescr = getRenderer().create2DRenderTargetDescription(m_volumeSize[0], m_volumeSize[1], Format::kR16G16B16A16_Sfloat, "Fog");
-	m_rtDescr.m_depth = m_volumeSize[2];
+	m_rtDescr = getRenderer().create2DRenderTargetDescription(m_volumeSize.x, m_volumeSize.y, Format::kR16G16B16A16_Sfloat, "Fog");
+	m_rtDescr.m_depth = m_volumeSize.z;
 	m_rtDescr.m_type = TextureType::k3D;
 	m_rtDescr.bake();
 
@@ -67,15 +65,13 @@ void VolumetricFog::populateRenderGraph()
 		consts.m_fogDiffuse = (sky) ? sky->getFogDiffuseColor() : Vec3(0.0f);
 		consts.m_fogScatteringCoeff = (sky) ? sky->getFogScatteringCoefficient() : 0.0f;
 		consts.m_fogAbsorptionCoeff = (sky) ? sky->getFogAbsorptionCoefficient() : 0.0f;
-		consts.m_near = getRenderingContext().m_matrices.m_near;
-		consts.m_far = getRenderingContext().m_matrices.m_far;
-		consts.m_zSplitCountf = F32(getRenderer().getZSplitCount());
-		consts.m_volumeSize = UVec3(m_volumeSize);
-		consts.m_maxZSplitsToProcessf = F32(m_finalZSplit + 1);
+		consts.m_zSplitThickness = (getClusterBinning().computeClustererFar() - getRenderingContext().m_matrices.m_near)
+								   / F32(g_cvarRenderClustererZSplitCount << g_cvarRenderVolumetricLightingAccumulationSubdivisionZ);
+		consts.m_volumeSize = m_volumeSize;
 
 		cmdb.setFastConstants(&consts, sizeof(consts));
 
-		dispatchPPCompute(cmdb, 8, 8, m_volumeSize[0], m_volumeSize[1]);
+		dispatchPPCompute(cmdb, 8, 8, m_volumeSize.x, m_volumeSize.y);
 	});
 }
 

+ 1 - 14
AnKi/Renderer/VolumetricFog.h

@@ -22,26 +22,13 @@ public:
 		return m_runCtx.m_rt;
 	}
 
-	const Array<U32, 3>& getVolumeSize() const
-	{
-		return m_volumeSize;
-	}
-
-	// Get the last cluster split in Z axis that will be affected by lighting.
-	U32 getFinalClusterInZ() const
-	{
-		return m_finalZSplit;
-	}
-
 private:
 	ShaderProgramResourcePtr m_prog;
 	ShaderProgramPtr m_grProg;
 
 	RenderTargetDesc m_rtDescr;
 
-	U32 m_finalZSplit = 0;
-
-	Array<U32, 3> m_volumeSize;
+	UVec3 m_volumeSize;
 
 	class
 	{

+ 86 - 17
AnKi/Renderer/VolumetricLightingAccumulation.cpp

@@ -7,6 +7,7 @@
 #include <AnKi/Renderer/ShadowMapping.h>
 #include <AnKi/Renderer/IndirectDiffuseProbes.h>
 #include <AnKi/Renderer/IndirectDiffuseClipmaps.h>
+#include <AnKi/Renderer/GBuffer.h>
 #include <AnKi/Renderer/Renderer.h>
 #include <AnKi/Renderer/ClusterBinning.h>
 #include <AnKi/Resource/ImageResource.h>
@@ -19,16 +20,13 @@ namespace anki {
 Error VolumetricLightingAccumulation::init()
 {
 	// Misc
-	const F32 qualityXY = g_cvarRenderVolumetricLightingAccumulationQualityXY;
-	const F32 qualityZ = g_cvarRenderVolumetricLightingAccumulationQualityZ;
-	const U32 finalZSplit = min<U32>(getRenderer().getZSplitCount() - 1, g_cvarRenderVolumetricLightingAccumulationFinalZSplit);
+	const U32 finalZSplit = min<U32>(g_cvarRenderClustererZSplitCount - 1, g_cvarRenderVolumetricLightingAccumulationFinalZSplit);
 
-	m_volumeSize[0] = U32(F32(getRenderer().getTileCounts().x) * qualityXY);
-	m_volumeSize[1] = U32(F32(getRenderer().getTileCounts().y) * qualityXY);
-	m_volumeSize[2] = U32(F32(finalZSplit + 1) * qualityZ);
+	m_volumeSize.xy = getClusterBinning().getTileCounts() << g_cvarRenderVolumetricLightingAccumulationSubdivisionXY;
+	m_volumeSize.z = (finalZSplit + 1) << g_cvarRenderVolumetricLightingAccumulationSubdivisionZ;
 
-	if(!isAligned(getRenderer().getTileCounts().x, m_volumeSize[0]) || !isAligned(getRenderer().getTileCounts().y, m_volumeSize[1])
-	   || m_volumeSize[0] == 0 || m_volumeSize[1] == 0 || m_volumeSize[2] == 0)
+	if(!isAligned(getClusterBinning().getTileCounts().x, m_volumeSize.x) || !isAligned(getClusterBinning().getTileCounts().y, m_volumeSize.y)
+	   || m_volumeSize.x == 0 || m_volumeSize.y == 0 || m_volumeSize.z == 0)
 	{
 		ANKI_R_LOGE("Wrong input");
 		return Error::kUserData;
@@ -37,18 +35,25 @@ Error VolumetricLightingAccumulation::init()
 	ANKI_CHECK(ResourceManager::getSingleton().loadResource("EngineAssets/BlueNoise_Rgba8_64x64.png", m_noiseImage));
 
 	// Shaders
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/VolumetricLightingAccumulation.ankiprogbin",
-								 {{"ENABLE_SHADOWS", 1}, {"CLIPMAP_DIFFUSE_INDIRECT", isIndirectDiffuseClipmapsEnabled()}}, m_prog, m_grProg));
+	const Array<SubMutation, 2> mutation = {{{"ENABLE_SHADOWS", 1}, {"CLIPMAP_DIFFUSE_INDIRECT", isIndirectDiffuseClipmapsEnabled()}}};
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/VolumetricLightingAccumulation.ankiprogbin", mutation, m_prog, m_grProg, "Accumulate"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/VolumetricLightingAccumulation.ankiprogbin", mutation, m_prog, m_debugGrProg, "Debug"));
 
 	// Create RTs
 	TextureInitInfo texinit = getRenderer().create2DRenderTargetInitInfo(
-		m_volumeSize[0], m_volumeSize[1], Format::kR16G16B16A16_Sfloat,
+		m_volumeSize.x, m_volumeSize.y, Format::kR16G16B16A16_Sfloat,
 		TextureUsageBit::kUavCompute | TextureUsageBit::kSrvPixel | TextureUsageBit::kSrvCompute, "VolLight");
-	texinit.m_depth = m_volumeSize[2];
+	texinit.m_depth = m_volumeSize.z;
 	texinit.m_type = TextureType::k3D;
 	m_rtTextures[0] = getRenderer().createAndClearRenderTarget(texinit, TextureUsageBit::kSrvPixel);
 	m_rtTextures[1] = getRenderer().createAndClearRenderTarget(texinit, TextureUsageBit::kSrvPixel);
 
+	m_debugRtDesc = getRenderer().create2DRenderTargetDescription(getRenderer().getInternalResolution().x, getRenderer().getInternalResolution().y,
+																  Format::kR16G16B16A16_Sfloat);
+	m_debugRtDesc.bake();
+
+	m_debugResult = g_cvarRenderVolumetricLightingAccumulationDebug;
+
 	return Error::kNone;
 }
 
@@ -80,6 +85,8 @@ void VolumetricLightingAccumulation::populateRenderGraph()
 		getIndirectDiffuseClipmaps().setDependencies(pass, TextureUsageBit::kSrvCompute);
 	}
 
+	pass.newTextureDependency(getGBuffer().getDepthRt(), TextureUsageBit::kSrvCompute);
+
 	pass.setWork([this](RenderPassWorkContext& rgraphCtx) {
 		ANKI_TRACE_SCOPED_EVENT(VolumetricLightingAccumulation);
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -103,6 +110,7 @@ void VolumetricLightingAccumulation::populateRenderGraph()
 		rgraphCtx.bindSrv(srv++, 0, getShadowMapping().getShadowmapRt());
 		cmdb.bindSrv(srv++, 0, getClusterBinning().getPackedObjectsBuffer(GpuSceneNonRenderableObjectType::kFogDensityVolume));
 		cmdb.bindSrv(srv++, 0, getClusterBinning().getClustersBuffer());
+		rgraphCtx.bindSrv(srv++, 0, getGBuffer().getDepthRt());
 
 		if(isIndirectDiffuseProbesEnabled())
 		{
@@ -133,15 +141,76 @@ void VolumetricLightingAccumulation::populateRenderGraph()
 			consts.m_densityAtMinHeight = sky->getMaxFogDensity();
 			consts.m_densityAtMaxHeight = sky->getMinFogDensity();
 		}
-		consts.m_volumeSize = UVec3(m_volumeSize);
-
-		const U32 finalZSplit = min<U32>(getRenderer().getZSplitCount() - 1, g_cvarRenderVolumetricLightingAccumulationFinalZSplit);
-		consts.m_maxZSplitsToProcessf = F32(finalZSplit + 1);
+		consts.m_volumeSize = m_volumeSize;
+		consts.m_subZSplitThickness = (getClusterBinning().computeClustererFar() - getRenderingContext().m_matrices.m_near)
+									  / F32(g_cvarRenderClustererZSplitCount << g_cvarRenderVolumetricLightingAccumulationSubdivisionZ);
+		consts.m_clusterSubdivision =
+			UVec3(g_cvarRenderVolumetricLightingAccumulationSubdivisionXY, g_cvarRenderVolumetricLightingAccumulationSubdivisionXY,
+				  g_cvarRenderVolumetricLightingAccumulationSubdivisionZ);
 
 		cmdb.setFastConstants(&consts, sizeof(consts));
 
-		dispatchPPCompute(cmdb, 8, 8, 8, m_volumeSize[0], m_volumeSize[1], m_volumeSize[2]);
+		dispatchPPCompute(cmdb, 8, 8, 8, m_volumeSize.x, m_volumeSize.y, m_volumeSize.z);
 	});
+
+	if(m_debugResult)
+	{
+		m_runCtx.m_debugRt = rgraph.newRenderTarget(m_debugRtDesc);
+
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("Vol debug");
+
+		pass.newTextureDependency(getRt(), TextureUsageBit::kSrvCompute);
+		pass.newTextureDependency(getGBuffer().getDepthRt(), TextureUsageBit::kSrvCompute);
+		pass.newTextureDependency(m_runCtx.m_debugRt, TextureUsageBit::kUavCompute);
+
+		pass.setWork([this](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(VolumetricLightingAccumulationDebug);
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_debugGrProg.get());
+
+			rgraphCtx.bindSrv(0, 0, getRt());
+			rgraphCtx.bindSrv(1, 0, getGBuffer().getDepthRt());
+
+			rgraphCtx.bindUav(0, 0, m_runCtx.m_debugRt);
+
+			cmdb.bindConstantBuffer(0, 0, getRenderingContext().m_globalRenderingConstantsBuffer);
+
+			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_nearestNearestClamp.get());
+
+			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x, getRenderer().getInternalResolution().y);
+		});
+	}
+	else
+	{
+		m_runCtx.m_debugRt = {};
+	}
+}
+
+void VolumetricLightingAccumulation::fillClustererConstants(ClustererConstants& consts)
+{
+	const U32 lightZSplitCount = min<U32>(g_cvarRenderClustererZSplitCount, g_cvarRenderVolumetricLightingAccumulationFinalZSplit + 1);
+	const F32 clustererFar = getClusterBinning().computeClustererFar() / F32(g_cvarRenderClustererZSplitCount) * F32(lightZSplitCount);
+	const F32 n = getRenderingContext().m_matrices.m_near;
+	const F32 f = getRenderingContext().m_matrices.m_far;
+
+	consts.m_lightVolumeWMagic.x = (clustererFar - n) / (-n);
+	consts.m_lightVolumeWMagic.y = f * (clustererFar - n) / (n * (f - n));
+}
+
+void VolumetricLightingAccumulation::getDebugRenderTarget([[maybe_unused]] CString rtName,
+														  Array<RenderTargetHandle, U32(DebugRenderTargetRegister::kCount)>& handles,
+														  DebugRenderTargetDrawStyle& drawStyle) const
+{
+	if(m_runCtx.m_debugRt.isValid())
+	{
+		handles[0] = m_runCtx.m_debugRt;
+		drawStyle = DebugRenderTargetDrawStyle::kTonemap;
+	}
+	else
+	{
+		ANKI_R_LOGW("Need to enable debug drawing of volumetrics else nothing will happen");
+	}
 }
 
 } // end namespace anki

+ 30 - 3
AnKi/Renderer/VolumetricLightingAccumulation.h

@@ -9,15 +9,22 @@
 
 namespace anki {
 
-ANKI_CVAR2(NumericCVar<F32>, Render, VolumetricLightingAccumulation, QualityXY, 4.0f, 1.0f, 16.0f, "Quality of XY dimensions of volumetric lights")
-ANKI_CVAR2(NumericCVar<F32>, Render, VolumetricLightingAccumulation, QualityZ, 4.0f, 1.0f, 16.0f, "Quality of Z dimension of volumetric lights")
+ANKI_CVAR2(NumericCVar<U32>, Render, VolumetricLightingAccumulation, SubdivisionXY, 2u, 1u, 16u,
+		   "The original clusters will be split using this CVar")
+ANKI_CVAR2(NumericCVar<U32>, Render, VolumetricLightingAccumulation, SubdivisionZ, 2u, 1u, 16u, "The original clusters will be split using this CVar")
 ANKI_CVAR2(NumericCVar<U32>, Render, VolumetricLightingAccumulation, FinalZSplit, 26, 1, 256,
 		   "Final cluster split that will recieve volumetric lights")
+ANKI_CVAR2(BoolCVar, Render, VolumetricLightingAccumulation, Debug, false, "Enable debugging of volumetrics")
 
 // Volumetric lighting. It accumulates lighting in a volume texture.
 class VolumetricLightingAccumulation : public RendererObject
 {
 public:
+	VolumetricLightingAccumulation()
+	{
+		registerDebugRenderTarget("Volumetric Lighting");
+	}
+
 	Error init();
 
 	void populateRenderGraph();
@@ -27,19 +34,39 @@ public:
 		return m_runCtx.m_rts[1];
 	}
 
+	void fillClustererConstants(ClustererConstants& consts);
+
+	void setEnableDebuggingView(Bool enable)
+	{
+		m_debugResult = enable;
+	}
+
+	Bool getDebuggingView() const
+	{
+		return m_debugResult;
+	}
+
+	void getDebugRenderTarget(CString rtName, Array<RenderTargetHandle, U32(DebugRenderTargetRegister::kCount)>& handles,
+							  DebugRenderTargetDrawStyle& drawStyle) const override;
+
 private:
 	ShaderProgramResourcePtr m_prog;
 	ShaderProgramPtr m_grProg;
+	ShaderProgramPtr m_debugGrProg;
 
 	Array<TexturePtr, 2> m_rtTextures;
 	ImageResourcePtr m_noiseImage;
 
-	Array<U32, 3> m_volumeSize;
+	RenderTargetDesc m_debugRtDesc;
+
+	UVec3 m_volumeSize;
+	Bool m_debugResult = false;
 
 	class
 	{
 	public:
 		Array<RenderTargetHandle, 2> m_rts;
+		RenderTargetHandle m_debugRt;
 	} m_runCtx; // Runtime context.
 };
 

+ 16 - 16
AnKi/Shaders/ClusterBinning.ankiprog

@@ -102,7 +102,7 @@ constexpr U32 kPackVisiblesThreadgroupSize = 64;
 struct ClusterBinningConstants
 {
 	Vec3 m_cameraOrigin;
-	F32 m_zSplitCountOverFrustumLength;
+	F32 m_zSplitCountOverClustererLength; // = clustererFar - near
 
 	Vec2 m_renderingSize;
 	U32 m_tileCountX;
@@ -233,20 +233,20 @@ constexpr UVec2 kSampleLocations[kSampleCount] = {LOCATION(1, -3), LOCATION(-1,
 #	if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
 		if(obj.m_isPointLight)
 		{
-			InterlockedOr(g_clusters[tileIdx].m_pointLightsMask[maskArrayIdx], mask);
+			InterlockedOr(SBUFF(g_clusters, tileIdx).m_pointLightsMask[maskArrayIdx], mask);
 		}
 		else
 		{
-			InterlockedOr(g_clusters[tileIdx].m_spotLightsMask[maskArrayIdx], mask);
+			InterlockedOr(SBUFF(g_clusters, tileIdx).m_spotLightsMask[maskArrayIdx], mask);
 		}
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
-		InterlockedOr(g_clusters[tileIdx].m_decalsMask[maskArrayIdx], mask);
+		InterlockedOr(SBUFF(g_clusters, tileIdx).m_decalsMask[maskArrayIdx], mask);
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
-		InterlockedOr(g_clusters[tileIdx].m_fogDensityVolumesMask, mask);
+		InterlockedOr(SBUFF(g_clusters, tileIdx).m_fogDensityVolumesMask, mask);
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE
-		InterlockedOr(g_clusters[tileIdx].m_reflectionProbesMask, mask);
+		InterlockedOr(SBUFF(g_clusters, tileIdx).m_reflectionProbesMask, mask);
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
-		InterlockedOr(g_clusters[tileIdx].m_giProbesMask, mask);
+		InterlockedOr(SBUFF(g_clusters, tileIdx).m_giProbesMask, mask);
 #	else
 #		error See file
 #	endif
@@ -270,27 +270,27 @@ constexpr UVec2 kSampleLocations[kSampleCount] = {LOCATION(1, -3), LOCATION(-1,
 			maxDistFromNearPlane = distFromNearPlaneA;
 		}
 
-		const I32 startZSplit = max(I32(minDistFromNearPlane * g_consts.m_zSplitCountOverFrustumLength), 0);
-		const I32 endZSplit = clamp(I32(maxDistFromNearPlane * g_consts.m_zSplitCountOverFrustumLength), 0, g_consts.m_zSplitCountMinusOne);
+		const I32 startZSplit = max(I32(floor(minDistFromNearPlane * g_consts.m_zSplitCountOverClustererLength)), 0);
+		const I32 endZSplit = clamp(I32(ceil(maxDistFromNearPlane * g_consts.m_zSplitCountOverClustererLength)), 0, g_consts.m_zSplitCountMinusOne);
 		for(I32 i = startZSplit; i <= endZSplit; ++i)
 		{
 #	if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
 			if(obj.m_isPointLight)
 			{
-				InterlockedOr(g_clusters[g_consts.m_tileCount + i].m_pointLightsMask[maskArrayIdx], mask);
+				InterlockedOr(SBUFF(g_clusters, g_consts.m_tileCount + i).m_pointLightsMask[maskArrayIdx], mask);
 			}
 			else
 			{
-				InterlockedOr(g_clusters[g_consts.m_tileCount + i].m_spotLightsMask[maskArrayIdx], mask);
+				InterlockedOr(SBUFF(g_clusters, g_consts.m_tileCount + i).m_spotLightsMask[maskArrayIdx], mask);
 			}
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
-			InterlockedOr(g_clusters[g_consts.m_tileCount + i].m_decalsMask[maskArrayIdx], mask);
+			InterlockedOr(SBUFF(g_clusters, g_consts.m_tileCount + i).m_decalsMask[maskArrayIdx], mask);
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
-			InterlockedOr(g_clusters[g_consts.m_tileCount + i].m_fogDensityVolumesMask, mask);
+			InterlockedOr(SBUFF(g_clusters, g_consts.m_tileCount + i).m_fogDensityVolumesMask, mask);
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE
-			InterlockedOr(g_clusters[g_consts.m_tileCount + i].m_reflectionProbesMask, mask);
+			InterlockedOr(SBUFF(g_clusters, g_consts.m_tileCount + i).m_reflectionProbesMask, mask);
 #	elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
-			InterlockedOr(g_clusters[g_consts.m_tileCount + i].m_giProbesMask, mask);
+			InterlockedOr(SBUFF(g_clusters, g_consts.m_tileCount + i).m_giProbesMask, mask);
 #	else
 #		error See file
 #	endif
@@ -319,6 +319,6 @@ StructuredBuffer<U32> g_visibles : register(t1);
 		return;
 	}
 
-	g_outBuffer[idxOut] = g_inBuffer[g_visibles[idxOut + 1]];
+	SBUFF(g_outBuffer, idxOut) = SBUFF(g_inBuffer, SBUFF(g_visibles, idxOut + 1));
 }
 #endif

+ 33 - 18
AnKi/Shaders/ClusteredShadingFunctions.hlsl

@@ -53,22 +53,32 @@ Vec3 clusterHeatmap(Cluster cluster, U32 objectTypeMask, U32 maxObjectOverride =
 	return heatmap(factor);
 }
 
-/// Returns the index of the zSplit or linearizeDepth(n, f, depth)*zSplitCount
-/// Simplifying this equation is 1/(a+b/depth) where a=(n-f)/(n*zSplitCount) and b=f/(n*zSplitCount)
-U32 computeZSplitClusterIndex(F32 depth, U32 zSplitCount, F32 a, F32 b)
+// Returns the index of the zSplit. Calculated as linearizeDepth(depth, n, f)*(f-n)/(clustererFar-n)*zSplitCount
+// Simplifying this equation is 1/(a+b/depth) where a=(clustererFar-n)/(-n*zSplitCount) and b=f*(clustererFar-n)/(n*(f-n)*zSplitCount)
+// If the depth is outside the clusterer's range then the return value will be greater than or equal to zSplitCount
+U32 computeZSplitClusterIndex(F32 depth, F32 a, F32 b)
 {
-	const F32 fSplitIdx = 1.0 / (a + b / depth);
-	return min(zSplitCount - 1u, (U32)fSplitIdx);
+	const F32 splitIdxf = 1.0 / (a + b / depth); // It's fine if depth is zero. The splitIdxf will become 0.0
+	return (U32)splitIdxf;
 }
 
-/// Return the tile index.
+// It's similar to computeZSplitClusterIndex but instead of an index it returns a tex coordinate for the w coord of a 3D texture that covers the
+// clusterer. Calculated as linearizeDepth(depth, n, f)*(f-n)/(clustererFar-n).
+// Simplifying this equation is 1/(a+b/depth) where a=(clustererFar-n)/(-n) and b=f*(clustererFar-n)/(n*(f-n))
+// If the depth is outside the clusterer's range then the return value will be greater than or equal to 1.0
+F32 computeVolumeWTexCoord(F32 depth, F32 a, F32 b)
+{
+	return 1.0 / (a + b / depth); // It's fine if depth is zero. The expression will become 0.0
+}
+
+// Return the tile index.
 U32 computeTileClusterIndexFragCoord(Vec2 fragCoord, U32 tileCountX)
 {
 	const UVec2 tileXY = UVec2(fragCoord / F32(kClusteredShadingTileSize));
 	return tileXY.y * tileCountX + tileXY.x;
 }
 
-/// Merge the tiles with z splits into a single cluster.
+// Merge the tiles with z splits into a single cluster.
 template<Bool kDynamicallyUniform = false>
 Cluster mergeClusters(Cluster tileCluster, Cluster zCluster)
 {
@@ -78,18 +88,18 @@ Cluster mergeClusters(Cluster tileCluster, Cluster zCluster)
 	{
 		[unroll] for(U32 i = 0; i < kMaxVisibleLights / 32; ++i)
 		{
-			outCluster.m_pointLightsMask[i] = WaveActiveBitOr(tileCluster.m_pointLightsMask[i] & zCluster.m_pointLightsMask[i]);
-			outCluster.m_spotLightsMask[i] = WaveActiveBitOr(tileCluster.m_spotLightsMask[i] & zCluster.m_spotLightsMask[i]);
+			outCluster.m_pointLightsMask[i] = WaveActiveBitAnd(tileCluster.m_pointLightsMask[i] & zCluster.m_pointLightsMask[i]);
+			outCluster.m_spotLightsMask[i] = WaveActiveBitAnd(tileCluster.m_spotLightsMask[i] & zCluster.m_spotLightsMask[i]);
 		}
 
 		[unroll] for(U32 i = 0; i < kMaxVisibleDecals / 32; ++i)
 		{
-			outCluster.m_decalsMask[i] = WaveActiveBitOr(tileCluster.m_decalsMask[i] & zCluster.m_decalsMask[i]);
+			outCluster.m_decalsMask[i] = WaveActiveBitAnd(tileCluster.m_decalsMask[i] & zCluster.m_decalsMask[i]);
 		}
 
-		outCluster.m_fogDensityVolumesMask = WaveActiveBitOr(tileCluster.m_fogDensityVolumesMask & zCluster.m_fogDensityVolumesMask);
-		outCluster.m_reflectionProbesMask = WaveActiveBitOr(tileCluster.m_reflectionProbesMask & zCluster.m_reflectionProbesMask);
-		outCluster.m_giProbesMask = WaveActiveBitOr(tileCluster.m_giProbesMask & zCluster.m_giProbesMask);
+		outCluster.m_fogDensityVolumesMask = WaveActiveBitAnd(tileCluster.m_fogDensityVolumesMask & zCluster.m_fogDensityVolumesMask);
+		outCluster.m_reflectionProbesMask = WaveActiveBitAnd(tileCluster.m_reflectionProbesMask & zCluster.m_reflectionProbesMask);
+		outCluster.m_giProbesMask = WaveActiveBitAnd(tileCluster.m_giProbesMask & zCluster.m_giProbesMask);
 	}
 	else
 	{
@@ -112,13 +122,18 @@ Cluster mergeClusters(Cluster tileCluster, Cluster zCluster)
 	return outCluster;
 }
 
-/// Get the final cluster after ORing and ANDing the masks.
+// Get the final cluster after ORing and ANDing the masks: reads the tile cluster and the Z split cluster of fragCoord and merges them with mergeClusters().
 template<Bool kDynamicallyUniform = false>
-Cluster getClusterFragCoord(StructuredBuffer<Cluster> clusters, GlobalRendererConstants consts, Vec3 fragCoord)
+Cluster getClusterFragCoord(StructuredBuffer<Cluster> clusters, ClustererConstants consts, Vec3 fragCoord)
 {
-	const Cluster tileCluster = clusters[computeTileClusterIndexFragCoord(fragCoord.xy, consts.m_tileCounts.x)];
-	const Cluster zCluster = clusters[computeZSplitClusterIndex(fragCoord.z, consts.m_zSplitCount, consts.m_zSplitMagic.x, consts.m_zSplitMagic.y)
-									  + consts.m_tileCounts.x * consts.m_tileCounts.y];
+	U32 idx = computeTileClusterIndexFragCoord(fragCoord.xy, consts.m_tileCounts.x); // The tile cluster is selected by the XY of the frag coord
+	const Cluster tileCluster = SBUFF(clusters, idx);
+
+	idx = computeZSplitClusterIndex(fragCoord.z, consts.m_zSplitMagic.x, consts.m_zSplitMagic.y);
+	idx += consts.m_tileCounts.x * consts.m_tileCounts.y; // The Z split clusters are indexed after the tile clusters
+	idx = min(idx, consts.m_clusterCount); // The "consts.m_clusterCount" is intentional. There is a hidden cluster at the end that is all zeroes
+	const Cluster zCluster = SBUFF(clusters, idx);
+
 	return mergeClusters<kDynamicallyUniform>(tileCluster, zCluster);
 }
 

+ 5 - 7
AnKi/Shaders/ForwardShadingCommon.hlsl

@@ -46,7 +46,7 @@ Vec3 computeLightColorHigh(Vec3 diffCol, Vec3 worldPos, Vec4 svPosition)
 	Vec3 outColor = Vec3(0.0, 0.0, 0.0);
 
 	// Find the cluster and then the light counts
-	Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants, svPosition.xyz);
+	Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants.m_clusterer, svPosition.xyz);
 
 	// Point lights
 	U32 idx = 0;
@@ -97,12 +97,10 @@ Vec3 computeLightColorHigh(Vec3 diffCol, Vec3 worldPos, Vec4 svPosition)
 // Just read the light color from the vol texture
 Vec3 computeLightColorLow(Vec3 diffCol, Vec3 worldPos, Vec4 svPosition)
 {
-	ANKI_MAYBE_UNUSED(worldPos);
-
-	const Vec2 uv = svPosition.xy / g_globalRendererConstants.m_renderingSize;
-	const F32 linearDepth = linearizeDepth(svPosition.z, g_globalRendererConstants.m_matrices.m_near, g_globalRendererConstants.m_matrices.m_far);
-	const F32 w = linearDepth * (F32(g_globalRendererConstants.m_zSplitCount) / F32(g_globalRendererConstants.m_lightVolumeLastZSplit + 1u));
-	const Vec3 uvw = Vec3(uv, w);
+	Vec3 uvw;
+	uvw.xy = svPosition.xy / g_globalRendererConstants.m_renderingSize;
+	uvw.z = computeVolumeWTexCoord(svPosition.z, g_globalRendererConstants.m_clusterer.m_lightVolumeWMagic.x,
+								   g_globalRendererConstants.m_clusterer.m_lightVolumeWMagic.y);
 
 	const Vec3 light = g_lightVol.SampleLevel(g_trilinearClampSampler, uvw, 0.0).rgb;
 	return diffuseLobe(diffCol) * light;

+ 1 - 1
AnKi/Shaders/GBufferPost.ankiprog

@@ -38,7 +38,7 @@ SamplerState g_linearAnyClampSampler : register(s0);
 	const Vec3 worldPos = worldPos4.xyz / worldPos4.w;
 
 	// Get the cluster. Make sure it's dynamically uniform because we are accessing bindless textures later on
-	Cluster cluster = getClusterFragCoord<true>(g_clusters, g_globalConstants, Vec3(svDispatchThreadId, depth));
+	Cluster cluster = getClusterFragCoord<true>(g_clusters, g_globalConstants.m_clusterer, Vec3(svDispatchThreadId, depth));
 
 	// Make the decalsMask uniform across the wave because we are accessing bindless textures later on
 	U32 decalsMask = cluster.m_decalsMask[0];

+ 22 - 13
AnKi/Shaders/Include/MiscRendererTypes.h

@@ -124,6 +124,18 @@ struct LocalLightsGridConstants
 	F32 m_padding4;
 };
 
+struct ClustererConstants
+{
+	Vec2 m_zSplitMagic; // It's the "a" and "b" of computeZSplitClusterIndex(). See there for details.
+	UVec2 m_tileCounts; // Tile counts. getClusterFragCoord() uses x to index a tile cluster and x*y as the offset of the Z split clusters
+
+	Vec2 m_lightVolumeWMagic; // The "a" and "b" of computeVolumeWTexCoord(). See there for details.
+	F32 m_clustererFar; // NOTE(review): presumably the max distance covered by the clusterer. Confirm against the code that fills it
+	U32 m_clusterCount : 16; // getClusterFragCoord() clamps the cluster index to this value, which points to the extra all-zeroes cluster
+	U32 m_zSplitCount : 16;
+};
+static_assert(sizeof(ClustererConstants) % sizeof(Vec4) == 0); // The size needs to be a multiple of sizeof(Vec4)
+
 // Common constants for all passes.
 struct GlobalRendererConstants
 {
@@ -136,13 +148,7 @@ struct GlobalRendererConstants
 	Vec3 m_cameraPosition;
 	F32 m_reflectionProbesMipCount;
 
-	UVec2 m_tileCounts;
-	U32 m_zSplitCount;
-	F32 m_zSplitCountOverFrustumLength; ///< m_zSplitCount/(far-near)
-
-	Vec2 m_zSplitMagic; ///< It's the "a" and "b" of computeZSplitClusterIndex(). See there for details.
-	U32 m_lightVolumeLastZSplit;
-	U32 m_padding1;
+	ClustererConstants m_clusterer;
 
 	DirectionalLight m_directionalLight;
 
@@ -200,13 +206,13 @@ struct VolumetricFogConstants
 	Vec3 m_fogDiffuse;
 	F32 m_fogScatteringCoeff;
 
+	UVec3 m_volumeSize;
 	F32 m_fogAbsorptionCoeff;
-	F32 m_near;
-	F32 m_far;
-	F32 m_zSplitCountf;
 
-	UVec3 m_volumeSize;
-	F32 m_maxZSplitsToProcessf;
+	F32 m_zSplitThickness;
+	U32 m_padding1;
+	U32 m_padding2;
+	U32 m_padding3;
 };
 
 // Vol lighting
@@ -218,7 +224,10 @@ struct VolumetricLightingConstants
 	F32 m_oneOverMaxMinusMinHeight; // 1 / (maxHeight - minHeight)
 
 	UVec3 m_volumeSize;
-	F32 m_maxZSplitsToProcessf;
+	F32 m_subZSplitThickness;
+
+	UVec3 m_clusterSubdivision;
+	F32 m_padding;
 };
 
 // SSAO

+ 1 - 1
AnKi/Shaders/LightShading.ankiprog

@@ -63,7 +63,7 @@ Vec4 main(VertOut input) : SV_TARGET0
 	const HVec3 viewDir = normalize(g_globalConstants.m_cameraPosition - worldPos);
 
 	// Get the cluster
-	Cluster cluster = getClusterFragCoord(g_clusters, g_globalConstants, Vec3(input.m_svPosition.xy, depth));
+	Cluster cluster = getClusterFragCoord(g_clusters, g_globalConstants.m_clusterer, Vec3(input.m_svPosition.xy, depth));
 
 	// return clusterHeatmap(cluster, 1u << (U32)GpuSceneNonRenderableObjectType::kLight, 3);
 

+ 19 - 17
AnKi/Shaders/LightShadingApplyFog.ankiprog

@@ -9,33 +9,35 @@
 
 #if ANKI_PIXEL_SHADER
 #	include <AnKi/Shaders/Functions.hlsl>
+#	include <AnKi/Shaders/ClusteredShadingFunctions.hlsl>
+#	include <AnKi/Shaders/Include/MiscRendererTypes.h>
+#	include <AnKi/Shaders/ImportanceSampling.hlsl>
+
+SamplerState g_linearAnyClampSampler : register(s0);
 
-SamplerState g_nearestAnyClampSampler : register(s0);
-SamplerState g_linearAnyClampSampler : register(s1);
 Texture2D g_depthRt : register(t0);
 Texture3D<Vec4> g_fogVolume : register(t1);
 
-struct Constants
-{
-	F32 m_zSplitCount;
-	F32 m_finalZSplit;
-	F32 m_near;
-	F32 m_far;
-};
-ANKI_FAST_CONSTANTS(Constants, g_consts)
+ConstantBuffer<GlobalRendererConstants> g_consts : register(b0);
 
 Vec4 main(VertOut input) : SV_TARGET0
 {
-	const Vec2 uv = input.m_uv;
-	Vec3 uvw;
+	Vec3 texSize;
+	g_fogVolume.GetDimensions(texSize.x, texSize.y, texSize.z);
+	const Vec3 texelSize = 1.0 / texSize;
 
-	// Compute W coordinate
-	const F32 depth = g_depthRt.SampleLevel(g_nearestAnyClampSampler, uv, 0.0).r;
-	const F32 linearDepth = linearizeDepth(depth, g_consts.m_near, g_consts.m_far);
-	uvw.z = linearDepth * (g_consts.m_zSplitCount / (g_consts.m_finalZSplit + 1.0f));
+	// Random
+	const UVec3 seed = rand3DPCG16(UVec3(input.m_svPosition.xy, g_consts.m_frame % 8u));
+	const Vec2 random = hammersleyRandom16(g_consts.m_frame % 16, 16, seed);
+	const Vec2 urandom = random * 2.0 - 1.0;
 
 	// Compute UV coordinates
-	uvw.xy = uv;
+	const F32 depth = TEX(g_depthRt, input.m_svPosition.xy).r;
+	Vec3 uvw;
+	uvw.xy = input.m_uv;
+	uvw.xy += texelSize * urandom;
+	uvw.z = computeVolumeWTexCoord(depth, g_consts.m_clusterer.m_lightVolumeWMagic.x, g_consts.m_clusterer.m_lightVolumeWMagic.y);
+	uvw.z += texelSize / 2.0 * urandom.x;
 
 	// Read the volume
 	const Vec4 fogVals = g_fogVolume.SampleLevel(g_linearAnyClampSampler, uvw, 0.0);

+ 4 - 4
AnKi/Shaders/Reflections.ankiprog

@@ -220,7 +220,7 @@ Vec3 doLightShading(Vec3 worldPos, Vec3 viewPos, UVec2 coord, F32 depth)
 
 	Vec3 outColor = gbuffer.m_emission;
 
-	Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants, Vec3(coord.xy + 0.5, depth));
+	Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants.m_clusterer, Vec3(coord.xy + 0.5, depth));
 
 	// GI
 #	if INDIRECT_DIFFUSE_CLIPMAPS
@@ -416,7 +416,7 @@ void bestCandidateToHallucinate(IVec2 svGroupThreadId, IVec2 offset, F32 depth,
 		const Vec3 col = sampleClipmapRadiance(worldPos, reflDir, g_globalRendererConstants.m_cameraPosition,
 											   g_globalRendererConstants.m_indirectDiffuseClipmaps, g_linearAnyRepeatSampler, kSampleClipmapFlags);
 #	else
-		Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants, Vec3(logicalCoord.xy + 0.5, depth));
+		Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants.m_clusterer, Vec3(logicalCoord.xy + 0.5, depth));
 		const Vec3 col = sampleGiProbes<F32>(cluster, g_giProbes, reflDir, worldPos.xyz, g_trilinearClampSampler);
 #	endif
 
@@ -457,7 +457,7 @@ void bestCandidateToHallucinate(IVec2 svGroupThreadId, IVec2 offset, F32 depth,
 	{
 		viewReflDir = reflect(-viewDir, viewNormal);
 
-		Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants, Vec3(logicalCoord.xy + 0.5, depth));
+		Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants.m_clusterer, Vec3(logicalCoord.xy + 0.5, depth));
 
 		const Vec3 woldReflDir = mul(g_globalRendererConstants.m_matrices.m_cameraTransform, Vec4(viewReflDir, 0.0));
 
@@ -628,7 +628,7 @@ RWTexture2D<Vec4> g_hitPosAndDepthTex : register(u1);
 	const Vec4 v4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(ndc, depth, 1.0));
 	const Vec3 worldPos = v4.xyz / v4.w;
 
-	Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants, Vec3(logicalCoord.xy + 0.5, depth));
+	Cluster cluster = getClusterFragCoord(g_clusters, g_globalRendererConstants.m_clusterer, Vec3(logicalCoord.xy + 0.5, depth));
 
 	const F32 reflLod = (g_globalRendererConstants.m_reflectionProbesMipCount - 1.0f) * roughness;
 	Vec3 probeColor = sampleReflectionProbes<F32>(cluster, g_reflectionProbes, reflDir, worldPos, reflLod, g_trilinearClampSampler);

+ 1 - 1
AnKi/Shaders/ShadowmapsResolve.ankiprog

@@ -102,7 +102,7 @@ Vec4 main(VertOut input) : SV_TARGET0
 
 	// Cluster
 	const Vec2 fragCoord = uv * g_globalConstants.m_renderingSize;
-	Cluster cluster = getClusterFragCoord(g_clusters, g_globalConstants, Vec3(fragCoord, depth));
+	Cluster cluster = getClusterFragCoord(g_clusters, g_globalConstants.m_clusterer, Vec3(fragCoord, depth));
 
 	// Layers
 	U32 shadowCasterCountPerFragment = 0u;

+ 4 - 20
AnKi/Shaders/VolumetricFogAccumulation.ankiprog

@@ -26,29 +26,13 @@ ANKI_FAST_CONSTANTS(VolumetricFogConstants, g_consts)
 	Vec4 colorAndDensityFront = 0.0;
 	[loop] for(U32 i = 0u; i < g_consts.m_volumeSize.z; ++i)
 	{
-		const F32 fi = F32(i);
-
-		// Compute the linear depth
-		const F32 maxLinearDepth = g_consts.m_maxZSplitsToProcessf / g_consts.m_zSplitCountf;
-		const F32 linearDepthFraction = maxLinearDepth / F32(g_consts.m_volumeSize.z);
-		const F32 linearDepthNear = fi * linearDepthFraction;
-		const F32 linearDepthFar = (fi + 1.0) * linearDepthFraction;
-
-		// Compute the min and max Z in view space if this cluster fragment
-		const F32 zVSpaceNear = -linearDepthNear * (g_consts.m_far - g_consts.m_near) + g_consts.m_near;
-		const F32 zVSpaceFar = -linearDepthFar * (g_consts.m_far - g_consts.m_near) + g_consts.m_near;
-
-		// Compute the thikness of this fragment
-		const F32 layerThinkness = abs(zVSpaceNear - zVSpaceFar);
-
 		// Read the light value and the fog density from the fog volumes
-		const F32 w = (fi + 0.5) / F32(g_consts.m_volumeSize.z);
-		Vec4 lightAndFogDensity = g_lightVolume.SampleLevel(g_linearAnyClampSampler, Vec3(uv, w), 0.0);
+		Vec4 lightAndFogDensity = TEX(g_lightVolume, UVec3(svDispatchThreadId.xy, i));
 		lightAndFogDensity.xyz *= g_consts.m_fogDiffuse / kPi;
 
 		// Scattering & absorption
-		const F32 scattering = lightAndFogDensity.w * g_consts.m_fogScatteringCoeff * layerThinkness;
-		const F32 absorption = lightAndFogDensity.w * g_consts.m_fogAbsorptionCoeff * layerThinkness;
+		const F32 scattering = lightAndFogDensity.w * g_consts.m_fogScatteringCoeff * g_consts.m_zSplitThickness;
+		const F32 absorption = lightAndFogDensity.w * g_consts.m_fogAbsorptionCoeff * g_consts.m_zSplitThickness;
 
 		// Integrate
 		const Vec4 colorAndDensityBack = Vec4(lightAndFogDensity.xyz * scattering, scattering + absorption);
@@ -58,6 +42,6 @@ ANKI_FAST_CONSTANTS(VolumetricFogConstants, g_consts)
 
 		// Write the value
 		const Vec4 valToWrite = Vec4(colorAndDensityFront.rgb, saturate(exp(-colorAndDensityFront.a)));
-		g_fogVolume[UVec3(svDispatchThreadId.xy, i)] = valToWrite;
+		TEX(g_fogVolume, UVec3(svDispatchThreadId.xy, i)) = valToWrite;
 	}
 }

+ 120 - 67
AnKi/Shaders/VolumetricLightingAccumulation.ankiprog

@@ -8,13 +8,19 @@
 #pragma anki mutator ENABLE_SHADOWS 0 1
 #pragma anki mutator CLIPMAP_DIFFUSE_INDIRECT 0 1
 
-#pragma anki technique comp
+#pragma anki technique Accumulate comp
+#pragma anki technique Debug comp
 
 #include <AnKi/Shaders/Include/MiscRendererTypes.h>
 #include <AnKi/Shaders/ClusteredShadingFunctions.hlsl>
 #include <AnKi/Shaders/IndirectDiffuseClipmaps.hlsl>
 #include <AnKi/Shaders/ImportanceSampling.hlsl>
 
+// ===========================================================================
+// Accumulate                                                                =
+// ===========================================================================
+#if ANKI_TECHNIQUE_Accumulate
+
 constexpr F32 kPhaseFunctionAnisotropy = 0.3;
 
 SamplerState g_linearAnyRepeatSampler : register(s0);
@@ -29,46 +35,15 @@ StructuredBuffer<GpuSceneLight> g_lights : register(t2);
 Texture2D<Vec4> g_shadowAtlasTex : register(t3);
 StructuredBuffer<GpuSceneFogDensityVolume> g_fogDensityVolumes : register(t4);
 StructuredBuffer<Cluster> g_clusters : register(t5);
-#if !CLIPMAP_DIFFUSE_INDIRECT
-StructuredBuffer<GpuSceneGlobalIlluminationProbe> g_giProbes : register(t6);
-#endif
+Texture2D g_depthMap : register(t6);
+#	if !CLIPMAP_DIFFUSE_INDIRECT
+StructuredBuffer<GpuSceneGlobalIlluminationProbe> g_giProbes : register(t7);
+#	endif
 
 ConstantBuffer<GlobalRendererConstants> g_globalConstants : register(b0);
 
 ANKI_FAST_CONSTANTS(VolumetricLightingConstants, g_consts)
 
-Vec3 worldPosInsideClusterAndZViewSpace(Vec3 relativePos, Vec3 clusterIdf, out F32 negativeZViewSpace, out Vec3 uvw)
-{
-	// XY UV
-	uvw.xy = lerp(clusterIdf.xy, clusterIdf.xy + 1.0, relativePos.xy) / Vec2(g_consts.m_volumeSize.xy);
-
-	// Compute the linear depth
-	const F32 maxLinearDepth = g_consts.m_maxZSplitsToProcessf / F32(g_globalConstants.m_zSplitCount);
-	const F32 linearDepthFraction = maxLinearDepth / F32(g_consts.m_volumeSize.z);
-	const F32 linearDepthNear = clusterIdf.z * linearDepthFraction;
-	const F32 linearDepthFar = (clusterIdf.z + 1.0) * linearDepthFraction;
-	const F32 linearDepth = lerp(linearDepthNear, linearDepthFar, relativePos.z);
-	uvw.z = linearDepth;
-
-	// View space
-	negativeZViewSpace =
-		linearDepth * (g_globalConstants.m_matrices.m_far - g_globalConstants.m_matrices.m_near) + g_globalConstants.m_matrices.m_near;
-	const F32 zViewSpace = -negativeZViewSpace;
-	const Vec2 xyViewSpace = uvToNdc(uvw.xy) * g_globalConstants.m_matrices.m_unprojectionParameters.xy * zViewSpace;
-
-	// Get the final world pos
-	const Vec3 worldPos = mul(g_globalConstants.m_matrices.m_cameraTransform, Vec4(xyViewSpace, zViewSpace, 1.0));
-
-	return worldPos;
-}
-
-Vec3 worldPosInsideCluster(Vec3 relativePos, Vec3 clusterIdf)
-{
-	F32 unused;
-	Vec3 unused1;
-	return worldPosInsideClusterAndZViewSpace(relativePos, clusterIdf, unused, unused1);
-}
-
 // https://developer.nvidia.com/gpugems/GPUGems2/gpugems2_chapter16.html
 F32 phaseFunction2(Vec3 pont2CameraDir, Vec3 point2LightDir, F32 g)
 {
@@ -101,7 +76,7 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 	{
 		F32 factor = phaseFunction(viewDir, -dirLight.m_direction, kPhaseFunctionAnisotropy);
 
-#if ENABLE_SHADOWS
+#	if ENABLE_SHADOWS
 		const U32 shadowCascadeCount = dirLight.m_shadowCascadeCount;
 
 		if(shadowCascadeCount > 0u && negativeZViewSpace < dirLight.m_shadowCascadeDistances[shadowCascadeCount - 1u])
@@ -110,7 +85,7 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 
 			factor *= computeShadowFactorDirLight<F32>(dirLight, cascadeIdx, worldPos, g_shadowAtlasTex, g_linearAnyClampShadowSampler);
 		}
-#endif
+#	endif
 
 		color += dirLight.m_diffuseColor * factor;
 	}
@@ -126,12 +101,12 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 
 		factor *= phaseFunction(viewDir, normalize(frag2Light), kPhaseFunctionAnisotropy);
 
-#if ENABLE_SHADOWS
+#	if ENABLE_SHADOWS
 		if(light.m_shadow)
 		{
 			factor *= computeShadowFactorPointLight<F32>(light, frag2Light, g_shadowAtlasTex, g_linearAnyClampShadowSampler);
 		}
-#endif
+#	endif
 
 		color += light.m_diffuseColor * factor;
 	}
@@ -150,18 +125,18 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 
 		factor *= phaseFunction(viewDir, -light.m_direction, kPhaseFunctionAnisotropy);
 
-#if ENABLE_SHADOWS
+#	if ENABLE_SHADOWS
 		if(light.m_shadow)
 		{
 			factor *= computeShadowFactorSpotLight<F32>(light, worldPos, g_shadowAtlasTex, g_linearAnyClampShadowSampler);
 		}
-#endif
+#	endif
 
 		color += light.m_diffuseColor * factor;
 	}
 
 	// Indirect diffuse GI
-#if CLIPMAP_DIFFUSE_INDIRECT
+#	if CLIPMAP_DIFFUSE_INDIRECT
 	{
 		const SampleClipmapFlag flags = kSampleClipmapFlagNone;
 		const Vec3 irradiance = sampleClipmapAvgIrradiance(worldPos, 0.0, g_globalConstants.m_cameraPosition,
@@ -169,14 +144,14 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 
 		color += irradiance * kPi; // Not sure why the multiplication with Pi but it looks more correct
 	}
-#else
+#	else
 	{
 		Vec3 diffIndirect = sampleGiProbes<F32>(cluster, g_giProbes, viewDir, worldPos, g_linearAnyClampSampler);
 		diffIndirect *= kPi; // Irradiance is pre-divided with PI so fix it
 
 		color += diffIndirect;
 	}
-#endif
+#	endif
 
 	// Fog density
 	F32 fogDensity = 0.0;
@@ -214,52 +189,130 @@ Vec4 accumulateLightsAndFog(Cluster cluster, Vec3 worldPos, F32 negativeZViewSpa
 
 [numthreads(8, 8, 8)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID) // One thread per texel (sub-cluster) of g_volume
 {
-	const UVec3 clusterId = svDispatchThreadId;
-	if(any(clusterId >= g_consts.m_volumeSize))
+	const UVec3 subClusterId = svDispatchThreadId;
+	if(any(subClusterId >= g_consts.m_volumeSize))
 	{
 		return;
 	}
 
+	const ClustererConstants clustererConsts = g_globalConstants.m_clusterer;
+
 	// Find a random pos inside the cluster
-	Vec3 random = TEX(g_noiseTex, (clusterId.xy + clusterId.z) % 64).rgb;
-	random = animateBlueNoise(random, g_globalConstants.m_frame);
+#	if 1
+	Vec3 randomFactor = TEX(g_noiseTex, subClusterId.xy % 64).rgb;
+	randomFactor = animateBlueNoise(randomFactor, g_globalConstants.m_frame + subClusterId.z);
+#	else
+	const UVec3 seed = rand3DPCG16(UVec3(subClusterId.xy, (g_globalConstants.m_frame + subClusterId.z) % 8u));
+	const Vec2 random2 = hammersleyRandom16(g_globalConstants.m_frame % 16, 16, seed);
+	Vec3 randomFactor = random2.xyx;
+#	endif
+
+	const Vec3 subClusterIdf = subClusterId;
+	const Vec3 uvw = (subClusterIdf + randomFactor) / Vec3(g_consts.m_volumeSize);
 
-	F32 negativeZViewSpace;
-	Vec3 uvw;
-	const Vec3 worldPos = worldPosInsideClusterAndZViewSpace(random, clusterId, negativeZViewSpace, uvw);
+	// View space
+	const F32 negativeZViewSpace = g_consts.m_subZSplitThickness * (subClusterIdf.z + randomFactor.z) + g_globalConstants.m_matrices.m_near;
+	const Vec3 viewPos = Vec3(uvToNdc(uvw.xy) * g_globalConstants.m_matrices.m_unprojectionParameters.xy * -negativeZViewSpace, -negativeZViewSpace);
+
+	// Compute depth of sample point
+	const Vec4 v4 = mul(g_globalConstants.m_matrices.m_projection, Vec4(viewPos, 1.0));
+	const F32 depthSample = v4.z / v4.w;
+	const F32 depth = g_depthMap.SampleLevel(g_linearAnyClampSampler, uvw.xy, 0.0).x;
+
+	Bool validSample;
+	Vec4 lightAndFog;
+	if(depthSample < depth)
+	{
+		// Valid sample, do lighting
+
+		validSample = true;
+
+		// Get the final world pos
+		const Vec3 worldPos = mul(g_globalConstants.m_matrices.m_cameraTransform, Vec4(viewPos, 1.0));
 
-	// Get the cluster
-	const UVec2 tileIdxXY = UVec2(uvw.xy * Vec2(g_globalConstants.m_tileCounts));
-	const U32 tileIdx = tileIdxXY.y * g_globalConstants.m_tileCounts.x + tileIdxXY.x;
-	Cluster cluster = g_clusters[tileIdx];
+		// Get the cluster
+		const UVec3 clusterId = subClusterId >> g_consts.m_clusterSubdivision; // NOTE(review): assumes m_clusterSubdivision holds per-axis log2 factors. Confirm
+		const U32 tileIdx = clusterId.y * clustererConsts.m_tileCounts.x + clusterId.x;
+		Cluster cluster = SBUFF(g_clusters, tileIdx);
 
-	const U32 zSplitIdx = U32(uvw.z * F32(g_globalConstants.m_zSplitCount));
-	const Cluster split = g_clusters[g_globalConstants.m_tileCounts.x * g_globalConstants.m_tileCounts.y + zSplitIdx];
+		const Cluster split = SBUFF(g_clusters, clustererConsts.m_tileCounts.x * clustererConsts.m_tileCounts.y + clusterId.z);
 
-	cluster = mergeClusters(cluster, split);
+		cluster = mergeClusters(cluster, split);
 
-	// Get lighting
-	Vec4 lightAndFog = accumulateLightsAndFog(cluster, worldPos, negativeZViewSpace, random.x);
+		// Do lighting
+		lightAndFog = accumulateLightsAndFog(cluster, worldPos, negativeZViewSpace, randomFactor.x);
+	}
+	else
+	{
+		// Invalid sample, reject it
+
+		validSample = false;
+		lightAndFog = 0.0;
+	}
 
 	// Read the prev result
 	{
 		// Better get a new world pos in the center of the cluster. Using worldPos creates noisy results
-		const Vec3 midWPos = worldPosInsideCluster(Vec3(0.5, 0.5, 0.5), clusterId);
+		const Vec2 uv = (subClusterIdf.xy + 0.5) / g_consts.m_volumeSize.xy;
+		const F32 negativeZViewSpace = g_consts.m_subZSplitThickness * (subClusterIdf.z + 0.5) + g_globalConstants.m_matrices.m_near;
+		const Vec3 viewPos = Vec3(uvToNdc(uv) * g_globalConstants.m_matrices.m_unprojectionParameters.xy * -negativeZViewSpace, -negativeZViewSpace);
+		const Vec3 midWPos = mul(g_globalConstants.m_matrices.m_cameraTransform, Vec4(viewPos, 1.0));
 
 		// Project
 		const Vec4 prevClipPos4 = mul(g_globalConstants.m_previousMatrices.m_viewProjection, Vec4(midWPos, 1.0));
 		const Vec3 prevClipPos = prevClipPos4.xyz / prevClipPos4.w;
 
 		// Read prev
-		if(all(prevClipPos.xy > -1.0) && all(prevClipPos.xy < 1.0))
+		if(all(abs(prevClipPos.xy) <= 1.0) && prevClipPos.z > 0.0)
 		{
-			const F32 linearDepth = linearizeDepth(prevClipPos.z, g_globalConstants.m_matrices.m_near, g_globalConstants.m_matrices.m_far);
-			const Vec3 uvw = Vec3(ndcToUv(prevClipPos.xy), linearDepth * (F32(g_globalConstants.m_zSplitCount) / g_consts.m_maxZSplitsToProcessf));
+			Vec3 uvw;
+			uvw.xy = ndcToUv(prevClipPos.xy);
+			uvw.z = computeVolumeWTexCoord(prevClipPos.z, clustererConsts.m_lightVolumeWMagic.x, clustererConsts.m_lightVolumeWMagic.y);
+
 			const Vec4 history = g_prevVolume.SampleLevel(g_linearAnyClampSampler, uvw, 0.0);
-			lightAndFog = lerp(history, lightAndFog, 1.0 / 16.0);
+			lightAndFog = lerp(history, lightAndFog, (validSample) ? 1.0 / 16.0 : 0.0); // A rejected sample has a blend factor of 0 so the history is kept as is
 		}
 	}
 
 	// Write result
-	TEX(g_volume, clusterId) = lightAndFog;
+	TEX(g_volume, subClusterId) = lightAndFog;
 }
+
+#endif
+
+// ===========================================================================
+// Debug                                                                     =
+// ===========================================================================
+#if ANKI_TECHNIQUE_Debug
+
+Texture3D g_lightVolume : register(t0);
+Texture2D g_depthBuffer : register(t1);
+
+RWTexture2D<Vec4> g_outTex : register(u0);
+
+ConstantBuffer<GlobalRendererConstants> g_consts : register(b0);
+
+SamplerState g_linearAnyClampSampler : register(s0);
+
+[numthreads(8, 8, 1)] void main(COMPUTE_ARGS) // Debug technique: writes to g_outTex the light volume value that corresponds to each pixel's depth
+{
+	UVec2 outTexSize;
+	g_outTex.GetDimensions(outTexSize.x, outTexSize.y);
+
+	const UVec2 coord = svDispatchThreadId.xy;
+	if(any(coord >= outTexSize)) // Skip the threads that fall outside the output texture
+	{
+		return;
+	}
+
+	const F32 depth = TEX(g_depthBuffer, coord).x;
+
+	Vec3 uvw;
+	uvw.xy = (coord + 0.5) / outTexSize; // UV of the center of the output texel
+	uvw.z = computeVolumeWTexCoord(depth, g_consts.m_clusterer.m_lightVolumeWMagic.x, g_consts.m_clusterer.m_lightVolumeWMagic.y);
+
+	const Vec3 light = g_lightVolume.SampleLevel(g_linearAnyClampSampler, uvw, 0.0).rgb;
+
+	TEX(g_outTex, coord) = Vec4(light, 0.0); // Only the RGB of the light volume is shown, the alpha is zeroed
+}
+#endif