Browse Source

Add VRS in IndirectDiffuse

Panagiotis Christopoulos Charitos 3 years ago
parent
commit
7a24211ce8

+ 121 - 4
AnKi/Renderer/IndirectDiffuse.cpp

@@ -49,8 +49,41 @@ Error IndirectDiffuse::initInternal()
 	texInit.setName("IndirectDiffuse #2");
 	texInit.setName("IndirectDiffuse #2");
 	m_rts[1] = m_r->createAndClearRenderTarget(texInit);
 	m_rts[1] = m_r->createAndClearRenderTarget(texInit);
 
 
-	m_fbDescr.m_colorAttachmentCount = 1;
-	m_fbDescr.bake();
+	// Init VRS SRI generation
+	{
+		m_main.m_fbDescr.m_colorAttachmentCount = 1;
+		m_main.m_fbDescr.bake();
+
+		const UVec2 rez = (size + m_vrs.m_sriTexelDimension - 1) / m_vrs.m_sriTexelDimension;
+		m_vrs.m_rtHandle =
+			m_r->create2DRenderTargetDescription(rez.x(), rez.y(), Format::R8_UINT, "IndirectDiffuse VRS SRI");
+		m_vrs.m_rtHandle.bake();
+
+		ANKI_CHECK(getResourceManager().loadResource("Shaders/IndirectDiffuseVrsSriGeneration.ankiprog", m_vrs.m_prog));
+
+		ShaderProgramResourceVariantInitInfo variantInit(m_vrs.m_prog);
+		variantInit.addMutation("SRI_TEXEL_DIMENSION", m_vrs.m_sriTexelDimension);
+
+		if(m_vrs.m_sriTexelDimension == 16 && getGrManager().getDeviceCapabilities().m_minSubgroupSize >= 32)
+		{
+			// Algorithm's workgroup size is 32, GPU's subgroup size is min 32 -> each workgroup has 1 subgroup -> No
+			// need for shared mem
+			variantInit.addMutation("SHARED_MEMORY", 0);
+		}
+		else
+		{
+			variantInit.addMutation("SHARED_MEMORY", 1);
+		}
+
+		const ShaderProgramResourceVariant* variant;
+		m_vrs.m_prog->getOrCreateVariant(variantInit, variant);
+		m_vrs.m_grProg = variant->getProgram();
+
+		ANKI_CHECK(getResourceManager().loadResource("AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog",
+													 m_vrs.m_visualizeProg));
+		m_vrs.m_visualizeProg->getOrCreateVariant(variant);
+		m_vrs.m_visualizeGrProg = variant->getProgram();
+	}
 
 
 	// Init SSGI+probes pass
 	// Init SSGI+probes pass
 	{
 	{
@@ -65,6 +98,9 @@ Error IndirectDiffuse::initInternal()
 
 
 	// Init denoise
 	// Init denoise
 	{
 	{
+		m_denoise.m_fbDescr.m_colorAttachmentCount = 1;
+		m_denoise.m_fbDescr.bake();
+
 		ANKI_CHECK(getResourceManager().loadResource((preferCompute) ? "Shaders/IndirectDiffuseDenoiseCompute.ankiprog"
 		ANKI_CHECK(getResourceManager().loadResource((preferCompute) ? "Shaders/IndirectDiffuseDenoiseCompute.ankiprog"
 																	 : "Shaders/IndirectDiffuseDenoiseRaster.ankiprog",
 																	 : "Shaders/IndirectDiffuseDenoiseRaster.ankiprog",
 													 m_denoise.m_prog));
 													 m_denoise.m_prog));
@@ -87,6 +123,64 @@ void IndirectDiffuse::populateRenderGraph(RenderingContext& ctx)
 {
 {
 	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
 	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
 	const Bool preferCompute = getConfig().getRPreferCompute();
 	const Bool preferCompute = getConfig().getRPreferCompute();
+	const Bool enableVrs = getGrManager().getDeviceCapabilities().m_vrs && getConfig().getRVrs() && !preferCompute;
+	const Bool fbDescrHasVrs = m_main.m_fbDescr.m_shadingRateAttachmentTexelWidth > 0;
+
+	if(!preferCompute && enableVrs != fbDescrHasVrs)
+	{
+		// Re-bake the FB descriptor if the VRS state has changed
+
+		if(enableVrs)
+		{
+			m_main.m_fbDescr.m_shadingRateAttachmentTexelWidth = m_vrs.m_sriTexelDimension;
+			m_main.m_fbDescr.m_shadingRateAttachmentTexelHeight = m_vrs.m_sriTexelDimension;
+		}
+		else
+		{
+			m_main.m_fbDescr.m_shadingRateAttachmentTexelWidth = 0;
+			m_main.m_fbDescr.m_shadingRateAttachmentTexelHeight = 0;
+		}
+
+		m_main.m_fbDescr.bake();
+	}
+
+	// VRS SRI
+	if(enableVrs)
+	{
+		m_runCtx.m_sriRt = rgraph.newRenderTarget(m_vrs.m_rtHandle);
+
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("VRS SRI generation");
+
+		pass.newDependency(RenderPassDependency(m_runCtx.m_sriRt, TextureUsageBit::IMAGE_COMPUTE_WRITE));
+		pass.newDependency(RenderPassDependency(m_r->getDepthDownscale().getHiZRt(), TextureUsageBit::SAMPLED_COMPUTE,
+												HIZ_HALF_DEPTH));
+
+		pass.setWork([this, &ctx](RenderPassWorkContext& rgraphCtx) {
+			const UVec2 viewport = m_r->getInternalResolution() / 2u;
+
+			CommandBufferPtr& cmdb = rgraphCtx.m_commandBuffer;
+
+			cmdb->bindShaderProgram(m_vrs.m_grProg);
+
+			rgraphCtx.bindTexture(0, 0, m_r->getDepthDownscale().getHiZRt(), HIZ_HALF_DEPTH);
+			cmdb->bindSampler(0, 1, m_r->getSamplers().m_nearestNearestClamp);
+			rgraphCtx.bindImage(0, 2, m_runCtx.m_sriRt);
+
+			class
+			{
+			public:
+				Vec4 m_v4;
+				Mat4 m_invertedViewProjectionJitter;
+			} pc;
+
+			pc.m_v4 = Vec4(1.0f / Vec2(viewport), 0.01f, 0.0f);
+			pc.m_invertedViewProjectionJitter = ctx.m_matrices.m_invertedViewProjectionJitter;
+
+			cmdb->setPushConstants(&pc, sizeof(pc));
+
+			dispatchPPCompute(cmdb, m_vrs.m_sriTexelDimension, m_vrs.m_sriTexelDimension, viewport.x(), viewport.y());
+		});
+	}
 
 
 	// SSGI+probes
 	// SSGI+probes
 	{
 	{
@@ -119,10 +213,17 @@ void IndirectDiffuse::populateRenderGraph(RenderingContext& ctx)
 		else
 		else
 		{
 		{
 			GraphicsRenderPassDescription& rpass = rgraph.newGraphicsRenderPass("IndirectDiffuse");
 			GraphicsRenderPassDescription& rpass = rgraph.newGraphicsRenderPass("IndirectDiffuse");
-			rpass.setFramebufferInfo(m_fbDescr, {m_runCtx.m_mainRtHandles[WRITE]});
+			rpass.setFramebufferInfo(m_main.m_fbDescr, {m_runCtx.m_mainRtHandles[WRITE]}, {},
+									 (enableVrs) ? m_runCtx.m_sriRt : RenderTargetHandle());
 			readUsage = TextureUsageBit::SAMPLED_FRAGMENT;
 			readUsage = TextureUsageBit::SAMPLED_FRAGMENT;
 			writeUsage = TextureUsageBit::FRAMEBUFFER_ATTACHMENT_WRITE;
 			writeUsage = TextureUsageBit::FRAMEBUFFER_ATTACHMENT_WRITE;
 			prpass = &rpass;
 			prpass = &rpass;
+
+			if(enableVrs)
+			{
+				prpass->newDependency(
+					RenderPassDependency(m_runCtx.m_sriRt, TextureUsageBit::FRAMEBUFFER_SHADING_RATE));
+			}
 		}
 		}
 
 
 		prpass->newDependency(RenderPassDependency(m_runCtx.m_mainRtHandles[WRITE], writeUsage));
 		prpass->newDependency(RenderPassDependency(m_runCtx.m_mainRtHandles[WRITE], writeUsage));
@@ -183,6 +284,7 @@ void IndirectDiffuse::populateRenderGraph(RenderingContext& ctx)
 			else
 			else
 			{
 			{
 				cmdb->setViewport(0, 0, unis.m_viewportSize.x(), unis.m_viewportSize.y());
 				cmdb->setViewport(0, 0, unis.m_viewportSize.x(), unis.m_viewportSize.y());
+				cmdb->setVrsRate(VrsRate::_1x1);
 
 
 				cmdb->drawArrays(PrimitiveTopology::TRIANGLES, 3);
 				cmdb->drawArrays(PrimitiveTopology::TRIANGLES, 3);
 			}
 			}
@@ -209,7 +311,7 @@ void IndirectDiffuse::populateRenderGraph(RenderingContext& ctx)
 		{
 		{
 			GraphicsRenderPassDescription& rpass =
 			GraphicsRenderPassDescription& rpass =
 				rgraph.newGraphicsRenderPass((dir == 0) ? "IndirectDiffuseDenoiseH" : "IndirectDiffuseDenoiseV");
 				rgraph.newGraphicsRenderPass((dir == 0) ? "IndirectDiffuseDenoiseH" : "IndirectDiffuseDenoiseV");
-			rpass.setFramebufferInfo(m_fbDescr, {m_runCtx.m_mainRtHandles[!readIdx]});
+			rpass.setFramebufferInfo(m_denoise.m_fbDescr, {m_runCtx.m_mainRtHandles[!readIdx]});
 			readUsage = TextureUsageBit::SAMPLED_FRAGMENT;
 			readUsage = TextureUsageBit::SAMPLED_FRAGMENT;
 			writeUsage = TextureUsageBit::FRAMEBUFFER_ATTACHMENT_WRITE;
 			writeUsage = TextureUsageBit::FRAMEBUFFER_ATTACHMENT_WRITE;
 			prpass = &rpass;
 			prpass = &rpass;
@@ -260,4 +362,19 @@ void IndirectDiffuse::populateRenderGraph(RenderingContext& ctx)
 	}
 	}
 }
 }
 
 
+/// Resolve one of the debug render targets registered by this pass.
+/// @param rtName Either "IndirectDiffuse" or "IndirectDiffuseVrsSri". Any other name trips the assertion.
+/// @param[out] handle The render target handle that corresponds to rtName.
+/// @param[out] optionalShaderProgram Set only for the VRS SRI target, to the program used to visualize it.
+void IndirectDiffuse::getDebugRenderTarget(CString rtName, RenderTargetHandle& handle,
+										   ShaderProgramPtr& optionalShaderProgram) const
+{
+	// The main output is returned as-is; optionalShaderProgram is left untouched.
+	if(rtName == "IndirectDiffuse")
+	{
+		handle = m_runCtx.m_mainRtHandles[WRITE];
+		return;
+	}
+
+	// The only other registered name is the shading rate image, which comes with its own visualization program.
+	ANKI_ASSERT(rtName == "IndirectDiffuseVrsSri");
+	optionalShaderProgram = m_vrs.m_visualizeGrProg;
+	handle = m_runCtx.m_sriRt;
+}
+
 } // end namespace anki
 } // end namespace anki

+ 18 - 6
AnKi/Renderer/IndirectDiffuse.h

@@ -22,6 +22,7 @@ public:
 		: RendererObject(r)
 		: RendererObject(r)
 	{
 	{
 		registerDebugRenderTarget("IndirectDiffuse");
 		registerDebugRenderTarget("IndirectDiffuse");
+		registerDebugRenderTarget("IndirectDiffuseVrsSri");
 	}
 	}
 
 
 	~IndirectDiffuse();
 	~IndirectDiffuse();
@@ -31,11 +32,7 @@ public:
 	void populateRenderGraph(RenderingContext& ctx);
 	void populateRenderGraph(RenderingContext& ctx);
 
 
 	void getDebugRenderTarget(CString rtName, RenderTargetHandle& handle,
 	void getDebugRenderTarget(CString rtName, RenderTargetHandle& handle,
-							  ShaderProgramPtr& optionalShaderProgram) const override
-	{
-		ANKI_ASSERT(rtName == "IndirectDiffuse");
-		handle = m_runCtx.m_mainRtHandles[WRITE];
-	}
+							  ShaderProgramPtr& optionalShaderProgram) const override;
 
 
 	RenderTargetHandle getRt() const
 	RenderTargetHandle getRt() const
 	{
 	{
@@ -44,7 +41,6 @@ public:
 
 
 private:
 private:
 	Array<TexturePtr, 2> m_rts;
 	Array<TexturePtr, 2> m_rts;
-	FramebufferDescription m_fbDescr;
 	Bool m_rtsImportedOnce = false;
 	Bool m_rtsImportedOnce = false;
 
 
 	static constexpr U32 READ = 0;
 	static constexpr U32 READ = 0;
@@ -55,6 +51,20 @@ private:
 	public:
 	public:
 		ShaderProgramResourcePtr m_prog;
 		ShaderProgramResourcePtr m_prog;
 		ShaderProgramPtr m_grProg;
 		ShaderProgramPtr m_grProg;
+		RenderTargetDescription m_rtHandle;
+
+		ShaderProgramResourcePtr m_visualizeProg;
+		ShaderProgramPtr m_visualizeGrProg;
+
+		U32 m_sriTexelDimension = 16;
+	} m_vrs;
+
+	class
+	{
+	public:
+		ShaderProgramResourcePtr m_prog;
+		ShaderProgramPtr m_grProg;
+		FramebufferDescription m_fbDescr;
 	} m_main;
 	} m_main;
 
 
 	class
 	class
@@ -62,11 +72,13 @@ private:
 	public:
 	public:
 		ShaderProgramResourcePtr m_prog;
 		ShaderProgramResourcePtr m_prog;
 		Array<ShaderProgramPtr, 2> m_grProgs;
 		Array<ShaderProgramPtr, 2> m_grProgs;
+		FramebufferDescription m_fbDescr;
 	} m_denoise;
 	} m_denoise;
 
 
 	class
 	class
 	{
 	{
 	public:
 	public:
+		RenderTargetHandle m_sriRt;
 		Array<RenderTargetHandle, 2> m_mainRtHandles;
 		Array<RenderTargetHandle, 2> m_mainRtHandles;
 	} m_runCtx;
 	} m_runCtx;
 
 

+ 1 - 0
AnKi/Renderer/LightShading.cpp

@@ -310,6 +310,7 @@ void LightShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgrap
 
 
 	if(enableVrs)
 	if(enableVrs)
 	{
 	{
+		// Restore
 		cmdb->setVrsRate(VrsRate::_1x1);
 		cmdb->setVrsRate(VrsRate::_1x1);
 	}
 	}
 }
 }

+ 6 - 0
AnKi/ShaderCompiler/ShaderProgramParser.cpp

@@ -923,6 +923,12 @@ Error ShaderProgramParser::parse()
 
 
 	// Checks
 	// Checks
 	{
 	{
+		if(!m_shaderTypes)
+		{
+			ANKI_SHADER_COMPILER_LOGE("Haven't found any shader types");
+			return Error::USER_DATA;
+		}
+
 		if(!!(m_shaderTypes & ShaderTypeBit::COMPUTE))
 		if(!!(m_shaderTypes & ShaderTypeBit::COMPUTE))
 		{
 		{
 			if(m_shaderTypes != ShaderTypeBit::COMPUTE)
 			if(m_shaderTypes != ShaderTypeBit::COMPUTE)

+ 15 - 0
AnKi/Shaders/Functions.glsl

@@ -618,6 +618,21 @@ U32 encodeVrsRate(UVec2 rateXY)
 	return (rateXY.y >> 1u) | ((rateXY.x << 1u) & 12u);
 	return (rateXY.y >> 1u) | ((rateXY.x << 1u) & 12u);
 }
 }
 
 
+/// Same as encodeVrsRate() but first replaces the 1x4 and 4x1 rates with 1x2 and 2x1 respectively.
+U32 encodeAndSanitizeVrsRate(UVec2 rate)
+{
+	// 1x4 and 4x1 shading rates don't exist. Lower the coarse axis from 4 to 2 and leave every other rate alone.
+	if(rate.x == 1u && rate.y == 4u)
+	{
+		rate.y = 2u;
+	}
+	else if(rate.x == 4u && rate.y == 1u)
+	{
+		rate.x = 2u;
+	}
+
+	return encodeVrsRate(rate);
+}
+
 /// Decodes a number produced by encodeVrsRate(). Returns the shading rates.
 /// Decodes a number produced by encodeVrsRate(). Returns the shading rates.
 UVec2 decodeVrsRate(U32 texel)
 UVec2 decodeVrsRate(U32 texel)
 {
 {

+ 123 - 0
AnKi/Shaders/IndirectDiffuseVrsSriGeneration.ankiprog

@@ -0,0 +1,123 @@
+// Copyright (C) 2009-2022, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki mutator SRI_TEXEL_DIMENSION 8 16
+#pragma anki mutator SHARED_MEMORY 0 1
+
+#pragma anki start comp
+
+#include <AnKi/Shaders/Functions.glsl>
+
+layout(set = 0, binding = 0) uniform texture2D u_inputTex;
+layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
+
+const UVec2 REGION_SIZE = UVec2(2u, 4u);
+
+const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION) / REGION_SIZE;
+layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
+
+layout(set = 0, binding = 2) uniform writeonly uimage2D u_sriImg;
+
+layout(push_constant, std430) uniform b_pc
+{
+	Vec2 u_oneOverViewportSize;
+	F32 u_thresholdMeters;
+	F32 u_padding0;
+	Mat4 u_invertedViewProjectionJitter;
+};
+
+#if SHARED_MEMORY
+// Ideally, we'd be able to calculate the min/max/average using subgroup operations, but there's no guarantee
+// subgroupSize is large enough so we need shared memory as a fallback. We need gl_NumSubgroups entries, but it is not a
+// constant, so estimate it assuming a subgroupSize of at least 8.
+const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u;
+shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
+#endif
+
+/// Unproject an NDC position and a depth value using u_invertedViewProjectionJitter and return the Z of the result
+/// after the perspective divide.
+/// NOTE(review): The name says "view space" but the matrix is an inverted view-projection. Confirm which space the
+/// returned Z actually lives in.
+F32 toViewSpace(Vec2 ndc, F32 depth)
+{
+	const Vec4 unprojected = u_invertedViewProjectionJitter * Vec4(ndc, depth, 1.0);
+	const F32 z = unprojected.z / unprojected.w;
+	return z;
+}
+
+#define sampleWorldPositionZ(offsetX, offsetY) \
+	toViewSpace(ndc, textureLodOffset(sampler2D(u_inputTex, u_nearestClampSampler), uv, 0.0, IVec2(offsetX, offsetY)).x)
+
+// Every invocation inspects a REGION_SIZE (2x4) block of the input texture, computes the largest absolute difference
+// of the unprojected Z between horizontally and vertically neighbouring texels and reduces that across the workgroup.
+// The first invocation of the workgroup then converts the 2 maxima to a shading rate and writes a single texel of
+// u_sriImg.
+void main()
+{
+	// Point at the center of the region's first texel. Without the half texel offset the UV sits exactly on a texel
+	// corner and the nearest sampler may round to the neighbouring texel.
+	// NOTE(review): Assumes u_oneOverViewportSize matches the size of the sampled mip. Confirm against the caller.
+	const Vec2 uv = (Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) + 0.5) * u_oneOverViewportSize;
+	const Vec2 ndc = UV_TO_NDC(uv);
+
+	// Get positions
+	// l1.z  l1.w
+	// l1.x  l1.y
+	// l0.z  l0.w
+	// l0.x  l0.y
+	Vec4 l0;
+	l0.x = sampleWorldPositionZ(0, 0);
+	l0.y = sampleWorldPositionZ(1, 0);
+	l0.z = sampleWorldPositionZ(0, 1);
+	l0.w = sampleWorldPositionZ(1, 1);
+
+	Vec4 l1;
+	l1.x = sampleWorldPositionZ(0, 2);
+	l1.y = sampleWorldPositionZ(1, 2);
+	l1.z = sampleWorldPositionZ(0, 3);
+	l1.w = sampleWorldPositionZ(1, 3);
+
+	// Calculate derivatives. The sign of each difference doesn't matter because of the abs().
+	Vec4 a = Vec4(l0.y, l0.z, l1.y, l1.z);
+	Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.w);
+	const Vec4 dx = abs(a - b);
+
+	a = Vec4(l0.z, l0.w, l1.z, l1.y);
+	b = Vec4(l0.x, l0.y, l1.x, l1.w);
+	const Vec4 dy = abs(a - b);
+
+	// Max of this invocation's region and then max across the subgroup.
+	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
+	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
+	maxDerivativeX = subgroupMax(maxDerivativeX);
+	maxDerivativeY = subgroupMax(maxDerivativeY);
+
+#if SHARED_MEMORY
+	// Store results in shared memory. One entry per subgroup.
+	ANKI_BRANCH if(subgroupElect())
+	{
+		s_maxDerivative[gl_SubgroupID] = Vec2(maxDerivativeX, maxDerivativeY);
+	}
+
+	memoryBarrierShared();
+	barrier();
+#endif
+
+	// Write the result
+	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
+	{
+		// Get max across all subgroups.
+#if SHARED_MEMORY
+		Vec2 maxDerivative = s_maxDerivative[0];
+
+		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
+		{
+			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
+		}
+#else
+		const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
+#endif
+
+		// Determine shading rate. Above threshold1 -> 1, above threshold2 -> 2, otherwise 4.
+		const F32 threshold1 = u_thresholdMeters;
+		const F32 threshold2 = threshold1 * 0.4;
+
+		UVec2 rate;
+		rate.x = (maxDerivative.x > threshold1) ? 1u : ((maxDerivative.x > threshold2) ? 2u : 4u);
+		rate.y = (maxDerivative.y > threshold1) ? 1u : ((maxDerivative.y > threshold2) ? 2u : 4u);
+
+		// One SRI texel per workgroup.
+		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
+		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeAndSanitizeVrsRate(rate)));
+	}
+}
+
+#pragma anki end

+ 1 - 11
AnKi/Shaders/VrsSriGeneration.glsl

@@ -127,17 +127,7 @@ void main()
 		rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
 		rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
 		rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
 		rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
 
 
-		// 1x4 and 4x1 shading rates don't exist.
-		if(rate == UVec2(1u, 4u))
-		{
-			rate = UVec2(1u, 2u);
-		}
-		else if(rate == UVec2(4u, 1u))
-		{
-			rate = UVec2(2u, 1u);
-		}
-
 		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
 		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
-		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeVrsRate(rate)));
+		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeAndSanitizeVrsRate(rate)));
 	}
 	}
 }
 }

+ 15 - 1
Samples/Common/SampleApp.cpp

@@ -86,7 +86,21 @@ Error SampleApp::userMainLoop(Bool& quit, Second elapsedTime)
 
 
 	if(in.getKey(KeyCode::P) == 1)
 	if(in.getKey(KeyCode::P) == 1)
 	{
 	{
-		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "VRS") ? "" : "VRS");
+		static U32 idx = 0;
+		++idx;
+		idx %= 3;
+		if(idx == 0)
+		{
+			renderer.setCurrentDebugRenderTarget("IndirectDiffuseVrsSri");
+		}
+		else if(idx == 1)
+		{
+			renderer.setCurrentDebugRenderTarget("VRS");
+		}
+		else
+		{
+			renderer.setCurrentDebugRenderTarget("");
+		}
 	}
 	}
 
 
 	if(in.getKey(KeyCode::L) == 1)
 	if(in.getKey(KeyCode::L) == 1)