Browse Source

Finalize HZB for sun light. Not working correctly

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
c0fe5cdfdb

+ 1 - 1
AnKi/Gr/CommandBuffer.h

@@ -77,7 +77,7 @@ public:
 	Array<TextureUsageBit, kMaxColorRenderTargets> m_colorAttachmentUsages = {};
 	TextureUsageBit m_depthStencilAttachmentUsage = TextureUsageBit::kNone;
 
-	CommandBufferFlag m_flags = CommandBufferFlag::kNone;
+	CommandBufferFlag m_flags = CommandBufferFlag::kGeneralWork;
 
 	CommandBufferInitInfo(CString name = {})
 		: GrBaseInitInfo(name)

+ 2 - 7
AnKi/Renderer/GBuffer.cpp

@@ -22,11 +22,6 @@ Error GBuffer::init()
 {
 	Error err = initInternal();
 
-	if(!err)
-	{
-		err = m_hzb.init();
-	}
-
 	if(err)
 	{
 		ANKI_R_LOGE("Failed to initialize g-buffer pass");
@@ -232,8 +227,8 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 	pass.newBufferDependency(visOut.m_mdiDrawCountsHandle, BufferUsageBit::kIndirectDraw);
 
 	// HZB generation for the next frame
-	m_hzb.populateRenderGraph(m_runCtx.m_crntFrameDepthRt, getRenderer().getInternalResolution(), m_runCtx.m_hzbRt,
-							  UVec2(m_hzbRt->getWidth(), m_hzbRt->getHeight()), ctx);
+	getRenderer().getHzbHelper().populateRenderGraph(m_runCtx.m_crntFrameDepthRt, getRenderer().getInternalResolution(), m_runCtx.m_hzbRt,
+													 UVec2(m_hzbRt->getWidth(), m_hzbRt->getHeight()), rgraph);
 }
 
 } // end namespace anki

+ 0 - 3
AnKi/Renderer/GBuffer.h

@@ -6,7 +6,6 @@
 #pragma once
 
 #include <AnKi/Renderer/RendererObject.h>
-#include <AnKi/Renderer/HzbHelper.h>
 #include <AnKi/Gr.h>
 
 namespace anki {
@@ -79,8 +78,6 @@ private:
 	TexturePtr m_hzbRt;
 	FramebufferDescription m_fbDescr;
 
-	HzbHelper m_hzb;
-
 	class
 	{
 	public:

+ 159 - 27
AnKi/Renderer/HzbHelper.cpp

@@ -25,6 +25,14 @@
 
 namespace anki {
 
+//   7 +----+ 6
+//    /|   /|
+// 3 +----+2|
+//   | *--| + 5
+//   |/4  |/
+// 0 +----+  1
+static constexpr U16 kBoxIndices[] = {1, 5, 2, 2, 5, 6, 0, 3, 4, 4, 3, 7, 3, 2, 7, 7, 2, 6, 0, 4, 1, 1, 4, 5, 0, 1, 3, 3, 1, 2, 4, 7, 5, 5, 7, 6};
+
 Error HzbHelper::init()
 {
 	if(GrManager::getSingleton().getDeviceCapabilities().m_samplingFilterMinMax)
@@ -36,55 +44,81 @@ Error HzbHelper::init()
 		m_maxSampler = GrManager::getSingleton().newSampler(sinit);
 	}
 
-	ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/HzbGenPyramid.ankiprogbin", m_prog));
+	{
+		ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/HzbGenPyramid.ankiprogbin", m_genPyramidProg));
+
+		ShaderProgramResourceVariantInitInfo variantInit(m_genPyramidProg);
+		variantInit.addMutation("REDUCTION_TYPE", 1);
+		variantInit.addMutation("REVERSE_1_TO_0", 0);
+		variantInit.addMutation("MIN_MAX_SAMPLER", m_maxSampler.isCreated());
+		const ShaderProgramResourceVariant* variant;
+		m_genPyramidProg->getOrCreateVariant(variantInit, variant);
+		m_genPyramidMainCameraGrProg.reset(&variant->getProgram());
 
-	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
-	variantInit.addMutation("REDUCTION_TYPE", 1);
-	variantInit.addMutation("MIN_MAX_SAMPLER", m_maxSampler.isCreated());
-	const ShaderProgramResourceVariant* variant;
-	m_prog->getOrCreateVariant(variantInit, variant);
-	m_grProg.reset(&variant->getProgram());
+		variantInit.addMutation("REVERSE_1_TO_0", 1);
+		variantInit.addMutation("MIN_MAX_SAMPLER", 0);
+		m_genPyramidProg->getOrCreateVariant(variantInit, variant);
+		m_genPyramidShadowGrProg.reset(&variant->getProgram());
+	}
+
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/HzbMinMaxDepth.ankiprogbin", m_minMaxDepthProg, m_minMaxDepthGrProg));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/HzbMinMaxBox.ankiprogbin", m_minMaxBoxProg, m_minMaxBoxGrProg));
 
 	BufferInitInfo buffInit("HzbCounterBuffer");
 	buffInit.m_size = sizeof(U32);
 	buffInit.m_usage = BufferUsageBit::kStorageComputeWrite | BufferUsageBit::kTransferDestination;
 	m_counterBuffer = GrManager::getSingleton().newBuffer(buffInit);
 
+	// Zero counter buffer
+	{
+		CommandBufferInitInfo cmdbInit;
+		cmdbInit.m_flags |= CommandBufferFlag::kSmallBatch;
+		CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(cmdbInit);
+
+		cmdb->fillBuffer(m_counterBuffer.get(), 0, kMaxPtrSize, 0);
+
+		FencePtr fence;
+		cmdb->flush({}, &fence);
+
+		fence->clientWait(6.0_sec);
+	}
+
+	buffInit = BufferInitInfo("HzbBoxIndices");
+	buffInit.m_size = sizeof(kBoxIndices);
+	buffInit.m_usage = BufferUsageBit::kIndex;
+	buffInit.m_mapAccess = BufferMapAccessBit::kWrite;
+	m_boxIndexBuffer = GrManager::getSingleton().newBuffer(buffInit);
+
+	void* mappedMem = m_boxIndexBuffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite);
+	memcpy(mappedMem, kBoxIndices, sizeof(kBoxIndices));
+	m_boxIndexBuffer->unmap();
+
+	m_fbDescr.m_depthStencilAttachment.m_aspect = DepthStencilAspectBit::kDepth;
+	m_fbDescr.m_depthStencilAttachment.m_clearValue.m_depthStencil.m_depth = 1.0f;
+	m_fbDescr.m_depthStencilAttachment.m_loadOperation = AttachmentLoadOperation::kClear;
+	m_fbDescr.bake();
+
 	return Error::kNone;
 }
 
-void HzbHelper::populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
-									RenderingContext& ctx)
+void HzbHelper::populateRenderGraphInternal(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
+											RenderGraphDescription& rgraph, CString customName, ShaderProgram* prog, Sampler* sampler) const
 {
-	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
 	TextureSubresourceInfo firstMipSubresource;
 
-	constexpr U32 kMaxSpdMips = 12;
 	const U32 hzbMipCount = min(kMaxSpdMips, computeMaxMipmapCount2d(dstHzbRtSize.x(), dstHzbRtSize.y()));
 
-	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("HZB generation");
+	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass((customName.isEmpty()) ? "HZB generation" : customName);
 
 	pass.newTextureDependency(srcDepthRt, TextureUsageBit::kSampledCompute, firstMipSubresource);
 	pass.newTextureDependency(dstHzbRt, TextureUsageBit::kImageComputeWrite);
 
-	pass.setWork([this, hzbMipCount, srcDepthRt, srcDepthRtSize, dstHzbRt, dstHzbRtSize](RenderPassWorkContext& rgraphCtx) {
+	pass.setWork([this, hzbMipCount, srcDepthRt, srcDepthRtSize, dstHzbRt, dstHzbRtSize, prog, sampler](RenderPassWorkContext& rgraphCtx) {
 		CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 		const U32 mipsToCompute = hzbMipCount;
 
-		// Zero the counter buffer once before everything else
-		if(!m_counterBufferZeroed) [[unlikely]]
-		{
-			m_counterBufferZeroed = true;
-
-			cmdb.fillBuffer(m_counterBuffer.get(), 0, kMaxPtrSize, 0);
-
-			const BufferBarrierInfo barrier = {m_counterBuffer.get(), BufferUsageBit::kTransferDestination, BufferUsageBit::kStorageComputeWrite, 0,
-											   kMaxPtrSize};
-			cmdb.setPipelineBarrier({}, {&barrier, 1}, {});
-		}
-
-		cmdb.bindShaderProgram(m_grProg.get());
+		cmdb.bindShaderProgram(prog);
 
 		varAU2(dispatchThreadGroupCountXY);
 		varAU2(workGroupOffset); // needed if Left and Top are not 0,0
@@ -122,10 +156,108 @@ void HzbHelper::populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDept
 
 		cmdb.bindStorageBuffer(0, 1, m_counterBuffer.get(), 0, kMaxPtrSize);
 		rgraphCtx.bindTexture(0, 2, srcDepthRt, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
-		cmdb.bindSampler(0, 3, m_maxSampler.isCreated() ? m_maxSampler.get() : getRenderer().getSamplers().m_trilinearClamp.get());
+		cmdb.bindSampler(0, 3, sampler);
 
 		cmdb.dispatchCompute(dispatchThreadGroupCountXY[0], dispatchThreadGroupCountXY[1], 1);
 	});
 }
 
+/// Schedule the HZB pyramid generation for the main camera.
+/// @param srcDepthRt The depth buffer to reduce.
+/// @param srcDepthRtSize Size of @a srcDepthRt in texels.
+/// @param dstHzbRt The HZB that will be written.
+/// @param dstHzbRtSize Size of the first mip of @a dstHzbRt.
+/// @param rgraph The render graph description that receives the compute pass.
+/// @param customName Optional pass name. An empty string selects the default name.
+void HzbHelper::populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
+									RenderGraphDescription& rgraph, CString customName) const
+{
+	// Use the dedicated reduction sampler if init() managed to create one, otherwise fall back to the renderer's trilinear clamp sampler
+	Sampler* sampler = nullptr;
+	if(m_maxSampler.isCreated())
+	{
+		sampler = m_maxSampler.get();
+	}
+	else
+	{
+		sampler = getRenderer().getSamplers().m_trilinearClamp.get();
+	}
+
+	ShaderProgram* const prog = m_genPyramidMainCameraGrProg.get();
+
+	populateRenderGraphInternal(srcDepthRt, srcDepthRtSize, dstHzbRt, dstHzbRtSize, rgraph, customName, prog, sampler);
+}
+
+/// Build one HZB per shadow cascade of the directional light out of the main camera's depth buffer.
+///
+/// Three stages are appended to the render graph:
+/// 1. A compute pass that reduces every kTileSize x kTileSize tile of @a srcDepthRt to its min and max depth.
+/// 2. One depth-only graphics pass per cascade that draws a box per tile, transformed by (cascade view-projection * @a invViewProjMat).
+/// 3. One HZB pyramid generation per cascade that consumes the depth written by stage 2.
+///
+/// @param srcDepthRt The main camera's depth buffer.
+/// @param srcDepthRtSize Size of @a srcDepthRt in texels.
+/// @param dstHzbRts The HZB to write for each cascade. Its size defines the cascade count, so pass a view that holds the active cascades only.
+/// @param dstViewProjectionMats The view-projection matrix of each cascade. Must have the same size as @a dstHzbRts.
+/// @param dstHzbSizes The size of the first mip of each HZB. Must have the same size as @a dstHzbRts.
+/// @param invViewProjMat The inverted view-projection matrix that matches @a srcDepthRt.
+/// @param rgraph The render graph description that receives the passes.
+void HzbHelper::populateRenderGraphDirectionalLight(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, ConstWeakArray<RenderTargetHandle> dstHzbRts,
+													ConstWeakArray<Mat4> dstViewProjectionMats, ConstWeakArray<UVec2> dstHzbSizes,
+													const Mat4& invViewProjMat, RenderGraphDescription& rgraph) const
+{
+	RenderTargetHandle minMaxRt;
+	constexpr U32 kTileSize = 64; // Has to agree with the tile size hardcoded in HzbMinMaxDepth and HzbMinMaxBox
+	const UVec2 minMaxRtSize((srcDepthRtSize.x() + kTileSize - 1) / kTileSize, (srcDepthRtSize.y() + kTileSize - 1) / kTileSize);
+	const U32 cascadeCount = dstHzbRts.getSize();
+
+	// depthRts bellow is a fixed size array and the 3 input arrays are indexed in lockstep. Validate before touching any of them
+	ANKI_ASSERT(cascadeCount > 0 && cascadeCount <= kMaxShadowCascades);
+	ANKI_ASSERT(dstViewProjectionMats.getSize() == cascadeCount && dstHzbSizes.getSize() == cascadeCount);
+
+	// Generate a temp RT with the min&max depth of each 64x64 tile of the depth buffer
+	{
+		RenderTargetDescription minMaxRtDescr("HZB min/max depth");
+		minMaxRtDescr.m_width = minMaxRtSize.x();
+		minMaxRtDescr.m_height = minMaxRtSize.y();
+		minMaxRtDescr.m_format = Format::kR32G32_Sfloat;
+		minMaxRtDescr.bake();
+		minMaxRt = rgraph.newRenderTarget(minMaxRtDescr);
+
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("HZB min/max depth");
+
+		pass.newTextureDependency(srcDepthRt, TextureUsageBit::kSampledCompute, DepthStencilAspectBit::kDepth);
+		pass.newTextureDependency(minMaxRt, TextureUsageBit::kImageComputeWrite);
+
+		pass.setWork([this, srcDepthRt, minMaxRt, minMaxRtSize](RenderPassWorkContext& rgraphCtx) {
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			rgraphCtx.bindTexture(0, 0, srcDepthRt, TextureSubresourceInfo(DepthStencilAspectBit::kDepth));
+			cmdb.bindSampler(0, 1, getRenderer().getSamplers().m_nearestNearestClamp.get());
+			rgraphCtx.bindImage(0, 2, minMaxRt);
+
+			cmdb.bindShaderProgram(m_minMaxDepthGrProg.get());
+
+			// One threadgroup per tile
+			cmdb.dispatchCompute(minMaxRtSize.x(), minMaxRtSize.y(), 1);
+		});
+	}
+
+	// Project a box for each tile on each cascade's HZB
+	Array<RenderTargetHandle, kMaxShadowCascades> depthRts;
+	for(U32 i = 0; i < cascadeCount; ++i)
+	{
+		// The boxes are rasterized at twice the size of the HZB's first mip
+		RenderTargetDescription depthRtDescr("HZB min/max boxes depth");
+		depthRtDescr.m_width = dstHzbSizes[i].x() * 2;
+		depthRtDescr.m_height = dstHzbSizes[i].y() * 2;
+		depthRtDescr.m_format = Format::kD16_Unorm;
+		depthRtDescr.bake();
+		depthRts[i] = rgraph.newRenderTarget(depthRtDescr);
+
+		GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass("HZB min/max boxes");
+
+		pass.setFramebufferInfo(m_fbDescr, {}, depthRts[i]);
+
+		pass.newTextureDependency(minMaxRt, TextureUsageBit::kSampledFragment);
+		pass.newTextureDependency(depthRts[i], TextureUsageBit::kFramebufferWrite, DepthStencilAspectBit::kDepth);
+
+		// The matrices are captured by value because the work callback runs after this function has returned
+		pass.setWork([this, minMaxRt, invViewProjMat, lightViewProjMat = dstViewProjectionMats[i], viewport = dstHzbSizes[i] * 2, minMaxRtSize,
+					  srcDepthRtSize](RenderPassWorkContext& rgraphCtx) {
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.setViewport(0, 0, viewport.x(), viewport.y());
+
+			cmdb.bindShaderProgram(m_minMaxBoxGrProg.get());
+
+			rgraphCtx.bindColorTexture(0, 0, minMaxRt);
+
+			// Has to match the Uniforms struct of HzbMinMaxBox
+			struct Uniforms
+			{
+				Mat4 m_reprojectionMat;
+				UVec2 m_mainCameraDepthBufferSize;
+				UVec2 m_padding;
+			} unis;
+
+			unis.m_reprojectionMat = lightViewProjMat * invViewProjMat;
+			unis.m_mainCameraDepthBufferSize = srcDepthRtSize;
+
+			cmdb.setPushConstants(&unis, sizeof(unis));
+
+			cmdb.bindIndexBuffer(m_boxIndexBuffer.get(), 0, IndexType::kU16);
+
+			// One box instance per tile. The index count comes from the same table that filled m_boxIndexBuffer
+			constexpr U32 kBoxIndexCount = U32(sizeof(kBoxIndices) / sizeof(kBoxIndices[0]));
+			cmdb.drawIndexed(PrimitiveTopology::kTriangles, kBoxIndexCount, minMaxRtSize.x() * minMaxRtSize.y());
+		});
+	}
+
+	// Generate the HZBs
+	for(U32 i = 0; i < cascadeCount; ++i)
+	{
+		populateRenderGraphInternal(depthRts[i], dstHzbSizes[i] * 2, dstHzbRts[i], dstHzbSizes[i], rgraph, "HZB generation cascade",
+									m_genPyramidShadowGrProg.get(), getRenderer().getSamplers().m_trilinearClamp.get());
+	}
+}
+
 } // end namespace anki

+ 22 - 6
AnKi/Renderer/HzbHelper.h

@@ -19,19 +19,35 @@ public:
 	Error init();
 
 	void populateRenderGraph(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
-							 RenderingContext& ctx);
+							 RenderGraphDescription& rgraph, CString customName = {}) const;
 
-	void populateRenderGraphDirectionalLight(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, WeakArray<RenderTargetHandle> dstHzbRts,
-											 WeakArray<Mat4> viewProjectionMatrices, RenderingContext& ctx);
+	void populateRenderGraphDirectionalLight(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, ConstWeakArray<RenderTargetHandle> dstHzbRts,
+											 ConstWeakArray<Mat4> dstViewProjectionMats, ConstWeakArray<UVec2> dstHzbSizes,
+											 const Mat4& invViewProjMat, RenderGraphDescription& rgraph) const;
 
 private:
-	ShaderProgramResourcePtr m_prog;
-	ShaderProgramPtr m_grProg;
+	static constexpr U32 kMaxSpdMips = 12;
+
+	ShaderProgramResourcePtr m_genPyramidProg;
+	ShaderProgramPtr m_genPyramidMainCameraGrProg;
+	ShaderProgramPtr m_genPyramidShadowGrProg;
+
+	ShaderProgramResourcePtr m_minMaxDepthProg;
+	ShaderProgramPtr m_minMaxDepthGrProg;
+
+	ShaderProgramResourcePtr m_minMaxBoxProg;
+	ShaderProgramPtr m_minMaxBoxGrProg;
 
 	SamplerPtr m_maxSampler;
 
 	BufferPtr m_counterBuffer;
-	Bool m_counterBufferZeroed = false;
+
+	BufferPtr m_boxIndexBuffer;
+
+	FramebufferDescription m_fbDescr;
+
+	void populateRenderGraphInternal(RenderTargetHandle srcDepthRt, UVec2 srcDepthRtSize, RenderTargetHandle dstHzbRt, UVec2 dstHzbRtSize,
+									 RenderGraphDescription& rgraph, CString customName, ShaderProgram* prog, Sampler* sampler) const;
 };
 /// @}
 

+ 1 - 0
AnKi/Renderer/Renderer.cpp

@@ -291,6 +291,7 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 	}
 
 	ANKI_CHECK(m_visibility.init());
+	ANKI_CHECK(m_hzbHelper.init());
 
 	return Error::kNone;
 }

+ 7 - 0
AnKi/Renderer/Renderer.h

@@ -8,6 +8,7 @@
 #include <AnKi/Renderer/Common.h>
 #include <AnKi/Renderer/Drawer.h>
 #include <AnKi/Renderer/GpuVisibility.h>
+#include <AnKi/Renderer/HzbHelper.h>
 #include <AnKi/Math.h>
 #include <AnKi/Gr.h>
 #include <AnKi/Resource/Forward.h>
@@ -94,6 +95,11 @@ public:
 		return m_visibility;
 	}
 
+	/// Get the HZB helper that is owned by the renderer. It's returned as const since the populateRenderGraphXXX methods of the helper are const.
+	const HzbHelper& getHzbHelper() const
+	{
+		return m_hzbHelper;
+	}
+
 	/// Create the init info for a 2D texture that will be used as a render target.
 	[[nodiscard]] TextureInitInfo create2DRenderTargetInitInfo(U32 w, U32 h, Format format, TextureUsageBit usage, CString name = {});
 
@@ -188,6 +194,7 @@ private:
 
 	RenderableDrawer m_sceneDrawer;
 	GpuVisibility m_visibility;
+	HzbHelper m_hzbHelper;
 
 	U64 m_frameCount; ///< Frame number
 

+ 34 - 3
AnKi/Renderer/ShadowMapping.cpp

@@ -5,6 +5,7 @@
 
 #include <AnKi/Renderer/ShadowMapping.h>
 #include <AnKi/Renderer/Renderer.h>
+#include <AnKi/Renderer/GBuffer.h>
 #include <AnKi/Renderer/RenderQueue.h>
 #include <AnKi/Core/ConfigSet.h>
 #include <AnKi/Util/ThreadHive.h>
@@ -64,6 +65,20 @@ Error ShadowMapping::initInternal()
 	m_clearDepthProg->getOrCreateVariant(variant);
 	m_clearDepthGrProg.reset(&variant->getProgram());
 
+	for(U32 i = 0; i < kMaxShadowCascades; ++i)
+	{
+		RendererString name;
+		name.sprintf("DirLight HZB #%d", i);
+
+		UVec2 size(m_tileResolution >> chooseDirectionalLightShadowCascadeDetail(i),
+				   m_tileResolution >> chooseDirectionalLightShadowCascadeDetail(i));
+		size /= 2;
+
+		m_cascadeHzbRtDescrs[i] = getRenderer().create2DRenderTargetDescription(size.x(), size.y(), Format::kR32_Sfloat, name);
+		m_cascadeHzbRtDescrs[i].m_mipmapCount = U8(computeMaxMipmapCount2d(m_cascadeHzbRtDescrs[i].m_width, m_cascadeHzbRtDescrs[i].m_height));
+		m_cascadeHzbRtDescrs[i].bake();
+	}
+
 	return Error::kNone;
 }
 
@@ -229,14 +244,14 @@ Bool ShadowMapping::allocateAtlasTiles(U64 lightUuid, U32 faceCount, const U64*
 
 template<typename TMemoryPool>
 void ShadowMapping::newWorkItem(const UVec4& atlasViewport, const RenderQueue& queue, RenderGraphDescription& rgraph,
-								DynamicArray<ViewportWorkItem, TMemoryPool>& workItems)
+								DynamicArray<ViewportWorkItem, TMemoryPool>& workItems, RenderTargetHandle* hzbRt)
 {
 	ViewportWorkItem& work = *workItems.emplaceBack();
 
 	const Array<F32, kMaxLodCount - 1> lodDistances = {ConfigSet::getSingleton().getLod0MaxDistance(),
 													   ConfigSet::getSingleton().getLod1MaxDistance()};
 	getRenderer().getGpuVisibility().populateRenderGraph("Shadowmapping visibility", RenderingTechnique::kDepth, queue.m_viewProjectionMatrix,
-														 queue.m_cameraTransform.getTranslationPart().xyz(), lodDistances, nullptr, rgraph,
+														 queue.m_cameraTransform.getTranslationPart().xyz(), lodDistances, hzbRt, rgraph,
 														 work.m_visOut);
 
 	work.m_viewport = atlasViewport;
@@ -284,13 +299,29 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 
 		if(!allocationFailed)
 		{
+			// HZB generation
+			Array<RenderTargetHandle, kMaxShadowCascades> hzbRts;
+			Array<UVec2, kMaxShadowCascades> hzbSizes;
+			Array<Mat4, kMaxShadowCascades> dstViewProjectionMats;
+
+			for(U cascade = 0; cascade < light.m_shadowCascadeCount; ++cascade)
+			{
+				hzbRts[cascade] = rgraph.newRenderTarget(m_cascadeHzbRtDescrs[cascade]);
+				hzbSizes[cascade] = UVec2(m_cascadeHzbRtDescrs[cascade].m_width, m_cascadeHzbRtDescrs[cascade].m_height);
+				dstViewProjectionMats[cascade] = ctx.m_renderQueue->m_directionalLight.m_shadowRenderQueues[cascade]->m_viewProjectionMatrix;
+			}
+
+			getRenderer().getHzbHelper().populateRenderGraphDirectionalLight(getRenderer().getGBuffer().getDepthRt(),
+																			 getRenderer().getInternalResolution(), hzbRts, dstViewProjectionMats,
+																			 hzbSizes, ctx.m_matrices.m_invertedViewProjection, rgraph);
+
 			for(U cascade = 0; cascade < light.m_shadowCascadeCount; ++cascade)
 			{
 				// Update the texture matrix to point to the correct region in the atlas
 				light.m_textureMatrices[cascade] = createSpotLightTextureMatrix(atlasViewports[cascade]) * light.m_textureMatrices[cascade];
 
 				// Push work
-				newWorkItem(atlasViewports[cascade], *light.m_shadowRenderQueues[cascade], rgraph, workItems);
+				newWorkItem(atlasViewports[cascade], *light.m_shadowRenderQueues[cascade], rgraph, workItems, &hzbRts[cascade]);
 			}
 		}
 		else

+ 3 - 1
AnKi/Renderer/ShadowMapping.h

@@ -52,6 +52,8 @@ private:
 	ShaderProgramResourcePtr m_clearDepthProg;
 	ShaderProgramPtr m_clearDepthGrProg;
 
+	Array<RenderTargetDescription, kMaxShadowCascades> m_cascadeHzbRtDescrs;
+
 	class
 	{
 	public:
@@ -76,7 +78,7 @@ private:
 
 	template<typename TMemoryPool>
 	void newWorkItem(const UVec4& atlasViewport, const RenderQueue& queue, RenderGraphDescription& rgraph,
-					 DynamicArray<ViewportWorkItem, TMemoryPool>& workItems);
+					 DynamicArray<ViewportWorkItem, TMemoryPool>& workItems, RenderTargetHandle* hzbRt = nullptr);
 
 	void runShadowMapping(RenderPassWorkContext& rgraphCtx);
 };

+ 5 - 1
AnKi/Shaders/Common.hlsl

@@ -7,7 +7,11 @@
 
 #pragma once
 
-#include <AnKi/Shaders/Include/Common.h>
+#if defined(__INTELLISENSE__)
+#	include <AnKi/Shaders/Intellisense.hlsl>
+#else
+#	include <AnKi/Shaders/Include/Common.h>
+#endif
 
 template<typename T>
 T uvToNdc(T x)

+ 2 - 2
AnKi/Shaders/Functions.hlsl

@@ -684,8 +684,8 @@ F32 fastCos(F32 x)
 }
 
 #if defined(ANKI_COMPUTE_SHADER)
-/// HLSL doesn't have SubgroupID so compute it. It's a macro because we can't have functions that InterlockedAdd on
-/// local variables (the compiler can't see it's groupshared).
+/// HLSL doesn't have SubgroupID so compute it. It's a macro because we can't have functions that InterlockedAdd on local variables (the compiler
+/// can't see it's groupshared).
 /// @param svGroupIndex Self explanatory.
 /// @param tmpGroupsharedU32Var A U32 groupshared variable that will help with the calculation.
 /// @param waveIndexInsideThreadgroup The SubgroupID.

+ 1 - 0
AnKi/Shaders/GpuVisibility.ankiprog

@@ -7,6 +7,7 @@
 
 #pragma anki start comp
 
+#include <AnKi/Shaders/Common.hlsl>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/Include/MiscRendererTypes.h>
 #include <AnKi/Shaders/CollisionFunctions.hlsl>

+ 10 - 1
AnKi/Shaders/HzbGenPyramid.ankiprog

@@ -5,6 +5,7 @@
 
 #pragma anki mutator REDUCTION_TYPE 0 1 // 0: min 1: max
 #pragma anki mutator MIN_MAX_SAMPLER 0 1
+#pragma anki mutator REVERSE_1_TO_0 0 1 // Used in shadows where we want to move the far (1.0) close to the camera to cull
 
 #pragma anki start comp
 #include <AnKi/Shaders/Common.hlsl>
@@ -44,7 +45,15 @@ AF4 SpdLoadSourceImage(AU2 p, AU1 slice)
 #if MIN_MAX_SAMPLER
 	const F32 f = g_srcTex.SampleLevel(g_minMaxAnyClampSampler, uv, 0.0).r;
 #else
-	const Vec4 samples = g_srcTex.GatherRed(g_linearAnyClampSampler, uv);
+	Vec4 samples = g_srcTex.GatherRed(g_linearAnyClampSampler, uv);
+
+#	if REVERSE_1_TO_0
+	[unroll] for(U32 i = 0; i < 4; ++i)
+	{
+		samples[i] = (samples[i] == 1.0f) ? 0.0f : samples[i];
+	}
+#	endif
+
 #	if REDUCTION_TYPE == 0
 	const F32 f = min4(samples);
 #	else

+ 73 - 0
AnKi/Shaders/HzbMinMaxBox.ankiprog

@@ -0,0 +1,73 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+// This shader draws tile aligned boxes in order to fill the HZB buffer for cascaded shadows
+
+#include <AnKi/Shaders/Functions.hlsl>
+
+#pragma anki start vert
+
+constexpr U32 kTileSize = 64;
+
+[[vk::binding(0)]] Texture2D<Vec4> g_minMaxRt;
+
+// Push constants. The layout has to match the Uniforms struct that HzbHelper.cpp fills before the draw
+struct Uniforms
+{
+	Mat4 m_reprojectionMat; // On the CPU side this is set to (light view-projection * inverted main camera view-projection)
+	UVec2 m_mainCameraDepthBufferSize; // In texels
+	UVec2 m_padding; // Not read by this shader
+};
+
+[[vk::push_constant]] ConstantBuffer<Uniforms> g_unis;
+
+// Emits one corner of the box that bounds a tile of the main camera's depth buffer. The instance ID selects the tile and the vertex ID selects which
+// of the 8 corners to produce.
+Vec4 main(U32 svVertexId : SV_VERTEXID, U32 svInstanceId : SV_INSTANCEID) : SV_POSITION
+{
+	// Instances are laid out row by row over the min/max texture, one instance per texel
+	UVec2 tileCounts;
+	g_minMaxRt.GetDimensions(tileCounts.x, tileCounts.y);
+	const UVec2 tile = UVec2(svInstanceId % tileCounts.x, svInstanceId / tileCounts.x);
+
+	// x: min depth of the tile, y: max depth of the tile
+	const Vec2 minMaxDepth = g_minMaxRt[tile].xy;
+
+	// Classify the corner
+	const bool minDepthFace = svVertexId <= 3;
+	const bool rightSide = svVertexId == 1 || svVertexId == 2 || svVertexId == 5 || svVertexId == 6;
+	const bool topSide = svVertexId == 3 || svVertexId == 2 || svVertexId == 7 || svVertexId == 6;
+
+	// Position of the corner in texels of the main camera's depth buffer
+	Vec2 texel = Vec2(F32(tile.x * kTileSize), F32(tile.y * kTileSize));
+	if(rightSide)
+	{
+		texel.x += kTileSize;
+	}
+	if(topSide)
+	{
+		texel.y += kTileSize;
+	}
+
+	// Convert to UV. The tile count is rounded up so the tiles at the border may extend outside the depth buffer, thus the saturate
+	const Vec2 uv = saturate(texel / Vec2(g_unis.m_mainCameraDepthBufferSize));
+
+	const Vec3 ndc = Vec3(uvToNdc(uv), (minDepthFace) ? minMaxDepth.x : minMaxDepth.y);
+
+	// Unproject and project
+	return mul(g_unis.m_reprojectionMat, Vec4(ndc, 1.0));
+}
+
+#pragma anki end
+
+#pragma anki start frag
+
+void main()
+{
+	// Intentionally empty. The pass that uses this program has a depth attachment only so there are no color outputs to write
+}
+
+#pragma anki end

+ 74 - 0
AnKi/Shaders/HzbMinMaxDepth.ankiprog

@@ -0,0 +1,74 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#include <AnKi/Shaders/Functions.hlsl>
+
+#pragma anki start comp
+
+[[vk::binding(0)]] Texture2D<Vec4> g_depthRt;
+[[vk::binding(1)]] SamplerState g_nearestAnyClampSampler;
+[[vk::binding(2)]] RWTexture2D<Vec4> g_minMaxDepthUav;
+
+#define TILE_SIZE 64
+#define THREADGROUP_SIZE_XY 16
+#define MIN_POSSIBLE_WAVE_SIZE 8
+constexpr U32 kSharedMemoryEntries = THREADGROUP_SIZE_XY * THREADGROUP_SIZE_XY / MIN_POSSIBLE_WAVE_SIZE;
+
+groupshared F32 s_minDepths[kSharedMemoryEntries];
+groupshared F32 s_maxDepths[kSharedMemoryEntries];
+groupshared U32 s_waveIndexInsideThreadGroup;
+
+// Reduces one TILE_SIZE x TILE_SIZE tile of the depth buffer to a single texel of g_minMaxDepthUav: x holds the min depth of the tile and y the max
+// depth. It expects one threadgroup per tile. Every thread folds its own block of texels, then the values are reduced inside each wave and finally
+// across the waves of the threadgroup using groupshared memory.
+[numthreads(THREADGROUP_SIZE_XY, THREADGROUP_SIZE_XY, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX,
+																	UVec2 svGroupId : SV_GROUPID)
+{
+	Vec2 depthRtSize;
+	g_depthRt.GetDimensions(depthRtSize.x, depthRtSize.y);
+
+	// Every thread covers a (TILE_SIZE / THREADGROUP_SIZE_XY)^2 block of texels
+	const U32 pixelsPerAxisPerThread = TILE_SIZE / THREADGROUP_SIZE_XY;
+
+	// UV of the center of the first texel of this thread's block
+	const Vec2 uv = (Vec2(svDispatchThreadId * pixelsPerAxisPerThread) + 0.5) / depthRtSize;
+
+	// Start outside the [0, 1] depth range so the first sample always replaces both values
+	F32 minDepth = 2.0f;
+	F32 maxDepth = -1.0f;
+	[unroll] for(U32 x = 0; x < pixelsPerAxisPerThread; x++)
+	{
+		[unroll] for(U32 y = 0; y < pixelsPerAxisPerThread; y++)
+		{
+			// Only the red channel carries the depth, be explicit instead of relying on an implicit Vec4 to F32 truncation
+			const F32 depth = g_depthRt.SampleLevel(g_nearestAnyClampSampler, uv, 0.0f, IVec2(x, y)).r;
+			minDepth = min(minDepth, depth);
+			maxDepth = max(maxDepth, depth);
+		}
+	}
+
+	U32 wavesPerThreadGroup;
+	U32 waveIndexInsideThreadGroup;
+	ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP(svGroupIndex, s_waveIndexInsideThreadGroup, waveIndexInsideThreadGroup, wavesPerThreadGroup);
+
+	// Reduce inside the wave
+	const F32 waveMinDepth = WaveActiveMin(minDepth);
+	const F32 waveMaxDepth = WaveActiveMax(maxDepth);
+
+	if(WaveIsFirstLane())
+	{
+		s_minDepths[waveIndexInsideThreadGroup] = waveMinDepth;
+		s_maxDepths[waveIndexInsideThreadGroup] = waveMaxDepth;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// Reduce across the waves. A single thread is enough, there are at most kSharedMemoryEntries values
+	if(svGroupIndex == 0)
+	{
+		F32 threadGroupMin = s_minDepths[0];
+		F32 threadGroupMax = s_maxDepths[0];
+		for(U32 i = 1; i < wavesPerThreadGroup; ++i)
+		{
+			threadGroupMin = min(threadGroupMin, s_minDepths[i]);
+			threadGroupMax = max(threadGroupMax, s_maxDepths[i]); // Has to be max(). Using min() here collapses the tile's depth range
+		}
+
+		g_minMaxDepthUav[svGroupId] = Vec4(threadGroupMin, threadGroupMax, 0.0, 0.0);
+	}
+}
+
+#pragma anki end

+ 158 - 0
AnKi/Shaders/Intellisense.hlsl

@@ -0,0 +1,158 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma once
+
+#define groupshared
+#define globallycoherent
+#define SV_DISPATCHTHREADID
+#define SV_GROUPINDEX
+#define SV_GROUPID
+#define SV_VERTEXID
+#define SV_POSITION
+#define SV_INSTANCEID
+#define numthreads(x, y, z) [nodiscard]
+#define unroll [nodiscard]
+#define loop [nodiscard]
+
+#define ANKI_BEGIN_NAMESPACE
+#define ANKI_END_NAMESPACE
+#define ANKI_HLSL 1
+
+using I8 = int;
+using I16 = int;
+using I32 = int;
+using U8 = unsigned int;
+using U16 = unsigned int;
+using U32 = unsigned int;
+using F32 = float;
+
+struct UVec2
+{
+	U32 x;
+	U32 y;
+};
+
+struct UVec3
+{
+	U32 x;
+	U32 y;
+	U32 z;
+};
+
+struct UVec4
+{
+	U32 x;
+	U32 y;
+	U32 z;
+	U32 w;
+};
+
+struct IVec2
+{
+	I32 x;
+	I32 y;
+};
+
+struct IVec3
+{
+	I32 x;
+	I32 y;
+	I32 z;
+};
+
+struct IVec4
+{
+	I32 x;
+	I32 y;
+	I32 z;
+	I32 w;
+};
+
+struct Vec2
+{
+	F32 x;
+	F32 y;
+};
+
+struct Vec3
+{
+	F32 x;
+	F32 y;
+	F32 z;
+};
+
+struct Vec4
+{
+	F32 x;
+	F32 y;
+	F32 z;
+	F32 w;
+};
+
+struct Mat4
+{
+	F32 arr[16];
+};
+
+struct SamplerState
+{
+};
+
+// Stand-in for HLSL's Texture2D that only exists to keep IntelliSense happy. It declares the handful of members the shaders currently use and has no
+// implementation.
+template<typename T>
+struct Texture2D
+{
+	// Integer flavour of GetDimensions
+	void GetDimensions(U32& width, U32& height);
+
+	// Floating point flavour of GetDimensions
+	void GetDimensions(F32& width, F32& height);
+
+	// The offset is optional, just like in the HLSL overloads
+	T SampleLevel(SamplerState sampler, Vec2 uvs, F32 lod, IVec2 offset = {});
+
+	// Texel access. Returns a reference so that the RWTexture2D alias can also be written through it
+	T& operator[](UVec2 coords);
+};
+
+template<typename T>
+using RWTexture2D = Texture2D<T>;
+
+template<typename T>
+struct StructuredBuffer
+{
+	T& operator[](U32 index);
+};
+
+template<typename T>
+using RWStructuredBuffer = StructuredBuffer<T>;
+
+template<typename T>
+struct ConstantBuffer : public T
+{
+};
+
+struct ByteAddressBuffer
+{
+	template<typename T>
+	T& Load(U32 offset);
+};
+
+// Basic functions
+
+template<typename T>
+T min(T a, T b);
+
+template<typename T>
+T max(T a, T b);
+
+template<typename T>
+T saturate(T a);
+
+// Wave ops
+
+template<typename T>
+T WaveActiveMin(T value);
+
+template<typename T>
+T WaveActiveMax(T value);
+
+bool WaveIsFirstLane();