Browse Source

Checkerboard rendering optimizations

Panagiotis Christopoulos Charitos 6 months ago
parent
commit
579947c846

+ 49 - 18
AnKi/Renderer/IndirectDiffuseClipmaps.cpp

@@ -33,10 +33,14 @@ Error IndirectDiffuseClipmaps::init()
 {
 	ANKI_CHECK(RtMaterialFetchRendererObject::init());
 
-	m_appliedGiRtDesc =
-		getRenderer().create2DRenderTargetDescription(getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(),
-													  getRenderer().getHdrFormat(), "IndirectDiffuseClipmap: Final");
-	m_appliedGiRtDesc.bake();
+	m_halfRtDesc =
+		getRenderer().create2DRenderTargetDescription(getRenderer().getInternalResolution().x() / 2, getRenderer().getInternalResolution().y(),
+													  getRenderer().getHdrFormat(), "IndirectDiffuseClipmap: Half");
+	m_halfRtDesc.bake();
+
+	m_fullRtDesc = getRenderer().create2DRenderTargetDescription(getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(),
+																 getRenderer().getHdrFormat(), "IndirectDiffuseClipmap: Full");
+	m_fullRtDesc.bake();
 
 	m_consts.m_probeCounts = UVec3(g_indirectDiffuseClipmapProbesXZCVar, g_indirectDiffuseClipmapProbesYCVar, g_indirectDiffuseClipmapProbesXZCVar);
 	m_consts.m_totalProbeCount = m_consts.m_probeCounts.x() * m_consts.m_probeCounts.y() * m_consts.m_probeCounts.z();
@@ -120,6 +124,9 @@ Error IndirectDiffuseClipmaps::init()
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_populateCachesGrProg, "PopulateCaches"));
 	ANKI_CHECK(
 		loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_computeIrradianceGrProg, "ComputeIrradiance"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_temporalDenoiseGrProg, "TemporalDenoise"));
+	ANKI_CHECK(
+		loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_spatialReconstructGrProg, "SpatialReconstruct"));
 
 	for(MutatorValue rtMaterialFetchClipmap = 0; rtMaterialFetchClipmap < 2; ++rtMaterialFetchClipmap)
 	{
@@ -177,6 +184,8 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 {
 	ANKI_TRACE_SCOPED_EVENT(IndirectDiffuse);
 
+	const Bool firstBounceRt = g_indirectDiffuseClipmapFirstBounceRayDistance > 0.0f;
+
 	for(U32 i = 0; i < kIndirectDiffuseClipmapCount; ++i)
 	{
 		m_consts.m_previousFrameAabbMins[i] = m_consts.m_aabbMins[i];
@@ -188,9 +197,9 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 	RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
 	const RenderTargetHandle rtResultHandle = rgraph.newRenderTarget(m_rtResultRtDesc);
-	m_runCtx.m_handles.m_appliedIrradiance = rgraph.newRenderTarget(m_appliedGiRtDesc);
+	const RenderTargetHandle halfHandle = rgraph.newRenderTarget(m_halfRtDesc);
+	const RenderTargetHandle fullHandle = rgraph.newRenderTarget(m_fullRtDesc);
 
-	RenderTargetHandle& appliedGiRt = m_runCtx.m_handles.m_appliedIrradiance;
 	Array<RenderTargetHandle, kIndirectDiffuseClipmapCount>& radianceVolumes = m_runCtx.m_handles.m_radianceVolumes;
 	Array<RenderTargetHandle, kIndirectDiffuseClipmapCount>& irradianceVolumes = m_runCtx.m_handles.m_irradianceVolumes;
 	Array<RenderTargetHandle, kIndirectDiffuseClipmapCount>& distanceMomentsVolumes = m_runCtx.m_handles.m_distanceMomentsVolumes;
@@ -388,7 +397,7 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 	}
 
 	// Apply GI
-	if(1)
+	if(firstBounceRt)
 	{
 		patchShaderBindingTablePass("IndirectDiffuseClipmaps: Patch SBT", m_rtLibraryGrProg.get(), m_rayGenShaderGroupIndices[0],
 									m_missShaderGroupIdx, m_sbtRecordSize, rgraph, sbtHandle, sbtBuffer);
@@ -413,9 +422,9 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 			pass.newTextureDependency(distanceMomentsVolumes[clipmap], TextureUsageBit::kSrvTraceRays);
 		}
 
-		pass.newTextureDependency(appliedGiRt, TextureUsageBit::kUavTraceRays);
+		pass.newTextureDependency(halfHandle, TextureUsageBit::kUavTraceRays);
 
-		pass.setWork([this, &ctx, sbtBuffer](RenderPassWorkContext& rgraphCtx) {
+		pass.setWork([this, &ctx, sbtBuffer, halfHandle](RenderPassWorkContext& rgraphCtx) {
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 			cmdb.bindShaderProgram(m_rtLibraryGrProg.get());
@@ -469,20 +478,19 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 			cmdb.bindSampler(1, 2, getRenderer().getSamplers().m_trilinearClampShadow.get());
 			cmdb.bindSampler(2, 2, getRenderer().getSamplers().m_trilinearRepeat.get());
 
-			rgraphCtx.bindUav(0, 2, m_runCtx.m_handles.m_appliedIrradiance);
+			rgraphCtx.bindUav(0, 2, halfHandle);
 			cmdb.bindUav(1, 2, TextureView(getDummyGpuResources().m_texture2DUav.get(), TextureSubresourceDesc::firstSurface()));
 
-			const F32 rayTMax = 10.0f; // TODO
-			const Vec4 consts(rayTMax);
+			const Vec4 consts(g_indirectDiffuseClipmapFirstBounceRayDistance);
 			cmdb.setFastConstants(&consts, sizeof(consts));
 
 			cmdb.traceRays(sbtBuffer, m_sbtRecordSize, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount(), 1,
-						   getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), 1);
+						   getRenderer().getInternalResolution().x() / 2, getRenderer().getInternalResolution().y(), 1);
 		});
 	}
 	else
 	{
-		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("IndirectDiffuseClipmaps composite");
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("IndirectDiffuseClipmaps: Apply irradiance");
 
 		pass.newTextureDependency(getGBuffer().getDepthRt(), TextureUsageBit::kSrvCompute);
 		pass.newTextureDependency(getGBuffer().getColorRt(2), TextureUsageBit::kSrvCompute);
@@ -493,9 +501,9 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 			pass.newTextureDependency(distanceMomentsVolumes[i], TextureUsageBit::kSrvCompute);
 			pass.newTextureDependency(avgIrradianceVolumes[i], TextureUsageBit::kSrvCompute);
 		}
-		pass.newTextureDependency(appliedGiRt, TextureUsageBit::kUavCompute);
+		pass.newTextureDependency(halfHandle, TextureUsageBit::kUavCompute);
 
-		pass.setWork([this, &ctx](RenderPassWorkContext& rgraphCtx) {
+		pass.setWork([this, &ctx, halfHandle](RenderPassWorkContext& rgraphCtx) {
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 			cmdb.bindShaderProgram(m_applyGiGrProg.get());
@@ -504,15 +512,38 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 			rgraphCtx.bindSrv(1, 0, getGBuffer().getColorRt(2));
 			cmdb.bindSrv(2, 0, TextureView(&m_blueNoiseImg->getTexture(), TextureSubresourceDesc::firstSurface()));
 
-			rgraphCtx.bindUav(0, 0, m_runCtx.m_handles.m_appliedIrradiance);
+			rgraphCtx.bindUav(0, 0, halfHandle);
 
 			cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
 
 			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearRepeat.get());
 
-			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
+			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x() / 2, getRenderer().getInternalResolution().y());
 		});
 	}
+
+	// Spatial reconstruct
+	{
+		NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("IndirectDiffuseClipmaps: Spatial reconstruct");
+
+		pass.newTextureDependency(getGBuffer().getDepthRt(), TextureUsageBit::kSrvCompute);
+		pass.newTextureDependency(halfHandle, TextureUsageBit::kSrvCompute);
+		pass.newTextureDependency(fullHandle, TextureUsageBit::kUavCompute);
+
+		pass.setWork([this, &ctx, halfHandle, fullHandle](RenderPassWorkContext& rgraphCtx) {
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_spatialReconstructGrProg.get());
+
+			rgraphCtx.bindSrv(0, 0, halfHandle);
+			rgraphCtx.bindSrv(1, 0, getGBuffer().getDepthRt());
+			rgraphCtx.bindUav(0, 0, fullHandle);
+
+			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x() / 2, getRenderer().getInternalResolution().y());
+		});
+	}
+
+	m_runCtx.m_handles.m_appliedIrradiance = fullHandle;
 }
 
 void IndirectDiffuseClipmaps::drawDebugProbes(const RenderingContext& ctx, RenderPassWorkContext& rgraphCtx) const

+ 7 - 1
AnKi/Renderer/IndirectDiffuseClipmaps.h

@@ -57,6 +57,9 @@ inline NumericCVar<U32> g_indirectDiffuseClipmapRadianceOctMapSize(
 inline NumericCVar<U32> g_indirectDiffuseClipmapIrradianceOctMapSize("R", "IndirectDiffuseClipmapIrradianceOctMapSize", 5, 4, 20,
 																	 "Size of the octahedral for the irradiance");
 
+inline NumericCVar<F32> g_indirectDiffuseClipmapFirstBounceRayDistance("R", "IndirectDiffuseClipmapFirstBounceRayDistance", 0.0f, 0.0f, 10000.0f,
+																	   "For the 1st bounce shoot rays instead of sampling the clipmaps");
+
 /// @memberof IndirectDiffuseClipmaps
 class IndirectDiffuseClipmapsRenderTargetHandles
 {
@@ -108,7 +111,8 @@ private:
 	Array<TexturePtr, kIndirectDiffuseClipmapCount> m_avgIrradianceVolumes;
 
 	RenderTargetDesc m_rtResultRtDesc;
-	RenderTargetDesc m_appliedGiRtDesc;
+	RenderTargetDesc m_halfRtDesc;
+	RenderTargetDesc m_fullRtDesc;
 
 	IndirectDiffuseClipmapConstants m_consts;
 
@@ -119,6 +123,8 @@ private:
 	ShaderProgramPtr m_computeIrradianceGrProg;
 	ShaderProgramPtr m_applyGiGrProg;
 	ShaderProgramPtr m_visProbesGrProg;
+	ShaderProgramPtr m_temporalDenoiseGrProg;
+	ShaderProgramPtr m_spatialReconstructGrProg;
 
 	ImageResourcePtr m_blueNoiseImg;
 

+ 2 - 0
AnKi/Shaders/Common.hlsl

@@ -92,6 +92,8 @@ constexpr F32 k2Pi = 2.0 * kPi;
 constexpr F32 kHalfPi = kPi / 2.0;
 constexpr F32 kNaN = 0.0 / 0.0;
 
+constexpr F32 kMaxHistoryLength = 16.0;
+
 struct Barycentrics
 {
 	Vec2 m_value;

+ 40 - 0
AnKi/Shaders/Functions.hlsl

@@ -855,6 +855,46 @@ U32 octahedronBorder(IVec2 texSize, IVec2 texCoord, out IVec2 borderTexOffsets[3
 	return borderCount;
 }
 
+/// See octahedronBorder. Calls func once for every border texel that has to replicate octCoord; func receives a coordinate in the same space as octCoord whose components may be -1 or octSize (i.e. one texel outside the map).
+template<typename TStoreFunc>
+void storeOctahedronBorder(IVec2 octSize, IVec2 octCoord, TStoreFunc func)
+{
+	if(all(octCoord == 0)) // Corner (0, 0): func receives (octSize.x, octSize.y)
+	{
+		func(octCoord + octSize);
+	}
+	else if(octCoord.x == 0 && octCoord.y == octSize.y - 1) // Corner (0, octSize.y - 1): func receives (octSize.x, -1)
+	{
+		func(octCoord + IVec2(octSize.x, -octSize.y));
+	}
+	else if(all(octCoord == octSize - 1)) // Corner (octSize.x - 1, octSize.y - 1): func receives (-1, -1)
+	{
+		func(octCoord - octSize);
+	}
+	else if(octCoord.x == octSize.x - 1 && octCoord.y == 0) // Corner (octSize.x - 1, 0): func receives (-1, octSize.y)
+	{
+		func(octCoord + IVec2(-octSize.x, octSize.y));
+	}
+
+	if(octCoord.y == 0) // Row y == 0 (corners included): x is mirrored to (octSize.x - 1 - x), y becomes -1
+	{
+		func(octCoord + IVec2((octSize.x - 1) - 2 * octCoord.x, -1));
+	}
+	else if(octCoord.y == octSize.y - 1) // Last row (corners included): x is mirrored, y becomes octSize.y
+	{
+		func(octCoord + IVec2((octSize.x - 1) - 2 * octCoord.x, 1));
+	}
+
+	if(octCoord.x == 0) // Column x == 0 (corners included): y is mirrored to (octSize.y - 1 - y), x becomes -1
+	{
+		func(octCoord + IVec2(-1, (octSize.y - 1) - 2 * octCoord.y));
+	}
+	else if(octCoord.x == octSize.x - 1) // Last column (corners included): y is mirrored, x becomes octSize.x
+	{
+		func(octCoord + IVec2(1, (octSize.y - 1) - 2 * octCoord.y));
+	}
+}
+
 /// Manual texture sampling of a 3D texture.
 template<typename T, U32 kComp>
 vector<T, kComp> linearTextureSampling(Texture3D<Vec4> sam, Vec3 uv)

+ 2 - 3
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -38,7 +38,7 @@
 #pragma anki technique RtShadows ahit mutators ALPHA_TEST DIFFUSE_TEX
 #pragma anki technique RtShadows chit mutators
 
-#pragma anki technique RtMaterialFetch chit mutators DIFFUSE_TEX EMISSIVE_TEX ROUGHNESS_METALNESS_TEX
+#pragma anki technique RtMaterialFetch chit mutators DIFFUSE_TEX EMISSIVE_TEX ROUGHNESS_METALNESS_TEX ALPHA_TEST
 
 #include <AnKi/Shaders/Include/MaterialTypes.h>
 #include <AnKi/Shaders/Include/GpuSceneFunctions.h>
@@ -64,7 +64,6 @@
 #endif
 
 #if ANKI_TECHNIQUE_RtMaterialFetch
-#	define ALPHA_TEST 0
 #	define ANKI_VELOCITY 0
 #	define ANKI_BONES 0
 #	define PARALLAX 0
@@ -770,7 +769,7 @@ GBufferPixelOut main(
 	payload.m_worldNormal = normal;
 
 	// Ray T
-	const Bool backfacing = HitKind() == HIT_KIND_TRIANGLE_FRONT_FACE;
+	const Bool backfacing = !ALPHA_TEST && HitKind() == HIT_KIND_TRIANGLE_FRONT_FACE;
 	payload.m_rayT = RayTCurrent() * (backfacing ? -1.0 : 1.0);
 }
 #	endif

+ 0 - 1
AnKi/Shaders/HistoryLength.ankiprog

@@ -14,7 +14,6 @@
 #include <AnKi/Shaders/QuadVert.hlsl>
 
 constexpr F32 kZDistanceLimit = 0.05; // In meters
-constexpr F32 kMaxHistoryLength = 16.0;
 
 Texture2D<F32> g_depthTex : register(t0);
 Texture2D<F32> g_historyDepthTex : register(t1);

+ 155 - 36
AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog

@@ -14,6 +14,8 @@
 #pragma anki technique PopulateCaches comp mutators RADIANCE_OCTAHEDRON_MAP_SIZE
 #pragma anki technique ComputeIrradiance comp mutators GPU_WAVE_SIZE RADIANCE_OCTAHEDRON_MAP_SIZE IRRADIANCE_OCTAHEDRON_MAP_SIZE
 #pragma anki technique Apply comp mutators
+#pragma anki technique SpatialReconstruct comp mutators
+#pragma anki technique TemporalDenoise comp mutators
 #pragma anki technique VisualizeProbes vert pixel mutators
 
 #define ANKI_ASSERTIONS_ENABLED 1
@@ -103,10 +105,11 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 	const F32 kMaxDist = 1000.0; // Chose something small and make sure its square doesn't overflow F16
 	TEX(g_lightResultTex, UVec2(probeIdx, outPixelIdx + raysPerProbePerFrame * g_consts.m_clipmapIdx)) = HVec4(radiance, min(rayT, kMaxDist));
 }
-#	else // RT_MATERIAL_FETCH_CLIPMAP
-
-// RT based apply of indirect
 
+// ===========================================================================
+// RtMaterialFetch (Apply)                                                   =
+// ===========================================================================
+#	else // RT_MATERIAL_FETCH_CLIPMAP
 struct Consts
 {
 	F32 m_rayMax;
@@ -118,11 +121,11 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 
 [Shader("raygeneration")] void main()
 {
-	const UVec2 coord = DispatchRaysIndex().xy;
-	const Vec2 uv = Vec2(coord) / DispatchRaysDimensions().xy;
+	const UVec2 fullCoord = UVec2(DispatchRaysIndex().x * 2u + (DispatchRaysIndex().y & 1u), DispatchRaysIndex().y);
+	const Vec2 uv = (fullCoord + 0.5) / (DispatchRaysDimensions().xy * UVec2(2, 1));
 
-	const F32 depth = g_depthTex[coord].x;
-	const Vec4 rt2 = g_gbufferRt2[coord];
+	const F32 depth = TEX(g_depthTex, fullCoord).x;
+	const Vec4 rt2 = TEX(g_gbufferRt2, fullCoord);
 	const Vec3 worldNormal = unpackNormalFromGBuffer(rt2);
 
 	const Vec4 v4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(uvToNdc(uv), depth, 1.0));
@@ -132,7 +135,7 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 	const Vec3 biasedWorldPos = worldPos + biasDir * 0.1;
 
 	// Rand
-	const UVec3 seed = rand3DPCG16(UVec3(coord, g_globalRendererConstants.m_frame % 8u));
+	const UVec3 seed = rand3DPCG16(UVec3(fullCoord, g_globalRendererConstants.m_frame % 8u));
 	const Vec2 randFactors = hammersleyRandom16(g_globalRendererConstants.m_frame % 64u, 64u, seed);
 
 	const Mat3 tbn = rotationFromDirection(worldNormal);
@@ -182,7 +185,7 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 		final = irradiance;
 	}
 
-	TEX(g_colorAndPdfTex, coord).xyz = final;
+	TEX(g_colorAndPdfTex, DispatchRaysIndex().xy) = Vec4(final, 0.0);
 }
 #	endif // RT_MATERIAL_FETCH_CLIPMAP
 #endif
@@ -337,15 +340,15 @@ groupshared Vec3 g_avgIrradiance[kThreadCount];
 
 void InterlockedAddColor(U32 x, U32 y, Vec3 color)
 {
-	[unroll] for(U32 i = 0; i < 3; ++i)
-	{
-		const F32 fracPart = frac(color[i]);
-		const F32 intPart = color[i] - fracPart;
+	const Vec3 fracPart = frac(color);
+	const Vec3 intPart = color - fracPart;
 
-		U64 val = U64(intPart) << U64(32);
-		val |= U64(fracPart * 10000.0);
-		InterlockedAdd(g_irradianceResults[y][x][i], val);
-	}
+	U64Vec3 val = U64Vec3(intPart) << U64(32);
+	val |= U64Vec3(fracPart * 10000.0);
+
+	InterlockedAdd(g_irradianceResults[y][x][0], val[0]);
+	InterlockedAdd(g_irradianceResults[y][x][1], val[1]);
+	InterlockedAdd(g_irradianceResults[y][x][2], val[2]);
 }
 
 Vec3 decodeAtomicColor(U32 x, U32 y)
@@ -361,6 +364,19 @@ Vec3 decodeAtomicColor(U32 x, U32 y)
 	return output;
 }
 
+struct StoreBorderFunc // Callable handed to storeOctahedronBorder; writes m_value into one texel of an irradiance volume
+{
+	IVec3 m_startOfOctCoord; // Added to the incoming 2D coordinate to form the 3D texel coordinate
+	Vec3 m_value; // Stored in xyz; w is written as 0.0
+	U32 m_clipmapIdx; // Index into g_irradianceVolumes
+
+	void operator()(IVec2 offset) // offset is relative to m_startOfOctCoord; z is left untouched
+	{
+		const IVec3 coord = m_startOfOctCoord + IVec3(offset, 0);
+		TEX(g_irradianceVolumes[m_clipmapIdx], coord) = Vec4(m_value, 0.0);
+	}
+};
+
 // The group services a single probe. Every thread reads a radiance value and bins it to the appropriate irradiance pixel
 [NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
 {
@@ -447,19 +463,15 @@ Vec3 decodeAtomicColor(U32 x, U32 y)
 
 		threadAvgIrradiance += irradiance / irradianceTexelCount;
 
-		TEX(g_irradianceVolumes[clipmapIdx], irradianceTexelCoord).xyz = irradiance;
+		TEX(g_irradianceVolumes[clipmapIdx], irradianceTexelCoord) = Vec4(irradiance, 0.0);
 
 		// Write the borders
-		IVec2 borders[3];
+		StoreBorderFunc func;
+		func.m_clipmapIdx = clipmapIdx;
+		func.m_startOfOctCoord = irradianceTexelCoordStart;
+		func.m_value = irradiance;
 		const IVec2 octCoord = IVec2(x, y);
-		const U32 borderCount = octahedronBorder(IRRADIANCE_OCTAHEDRON_MAP_SIZE, octCoord, borders);
-		for(U32 i = 0; i < borderCount; ++i)
-		{
-			IVec3 actualVolumeTexCoord = irradianceTexelCoordStart;
-			actualVolumeTexCoord.xy += octCoord + borders[i];
-
-			TEX(g_irradianceVolumes[clipmapIdx], actualVolumeTexCoord).xyz = irradiance;
-		}
+		storeOctahedronBorder(IRRADIANCE_OCTAHEDRON_MAP_SIZE, octCoord, func);
 	}
 
 	g_avgIrradiance[svGroupIndex] = threadAvgIrradiance;
@@ -503,28 +515,32 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 
 SamplerState g_linearAnyRepeatSampler : register(s0);
 
-[NumThreads(8, 8, 1)] void main(COMPUTE_ARGS)
+[NumThreads(64, 1, 1)] void main(COMPUTE_ARGS)
 {
-	UVec2 viewportSize;
-	g_outTex.GetDimensions(viewportSize.x, viewportSize.y);
+	Vec2 halfViewportSize;
+	g_outTex.GetDimensions(halfViewportSize.x, halfViewportSize.y);
+	const Vec2 fullViewportSize = halfViewportSize * Vec2(2.0, 1.0);
+
+	const UVec2 realSvDispatchThreadId = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
+	const Vec2 coord = Vec2(realSvDispatchThreadId.x * 2u + (realSvDispatchThreadId.y & 1u), realSvDispatchThreadId.y);
 
-	if(any(svDispatchThreadId >= viewportSize))
+	if(any(coord >= fullViewportSize))
 	{
 		return;
 	}
 
-	const F32 depth = g_depthTex[svDispatchThreadId.xy].r;
-	const Vec2 uv = Vec2(svDispatchThreadId.xy) / Vec2(viewportSize);
+	const F32 depth = TEX(g_depthTex, coord).r;
+	const Vec2 uv = (coord + 0.5) / fullViewportSize;
 	const Vec2 ndc = uvToNdc(uv);
 	const Vec4 worldPos4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(ndc, depth, 1.0));
 	const Vec3 worldPos = worldPos4.xyz / worldPos4.w;
 
-	const Vec3 normal = unpackNormalFromGBuffer(g_gbufferRt2[svDispatchThreadId.xy]);
+	const Vec3 normal = unpackNormalFromGBuffer(TEX(g_gbufferRt2, coord));
 
 	// Rand
 	UVec2 noiseTexSize;
 	g_blueNoiseTex.GetDimensions(noiseTexSize.x, noiseTexSize.y);
-	Vec3 noise3 = g_blueNoiseTex[svDispatchThreadId % noiseTexSize];
+	Vec3 noise3 = TEX(g_blueNoiseTex, realSvDispatchThreadId % noiseTexSize);
 	noise3 = animateBlueNoise(noise3, g_globalRendererConstants.m_frame);
 	const F32 noise = noise3.x;
 
@@ -543,7 +559,110 @@ SamplerState g_linearAnyRepeatSampler : register(s0);
 												g_globalRendererConstants.m_indirectDiffuseClipmaps, g_linearAnyRepeatSampler, flags, noise);
 	}
 
-	TEX(g_outTex, svDispatchThreadId.xy).xyz = irradiance;
+	TEX(g_outTex, realSvDispatchThreadId) = Vec4(irradiance, 0.0);
+}
+#endif
+
+// ===========================================================================
+// SpatialReconstruct                                                        =
+// ===========================================================================
+#if NOT_ZERO(ANKI_TECHNIQUE_SpatialReconstruct)
+#	include <AnKi/Shaders/BilateralFilter.hlsl>
+
+Texture2D<Vec3> g_inTex : register(t0);
+Texture2D<F32> g_depthTex : register(t1);
+
+RWTexture2D<Vec4> g_outTex : register(u0);
+
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS) // Each thread handles one horizontal pair of full-res texels: it copies the texel present in g_inTex and reconstructs the other one
+{
+	IVec2 viewportSize;
+	g_outTex.GetDimensions(viewportSize.x, viewportSize.y); // Size of the full-width output
+
+	const IVec2 realSvDispatchThreadId = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
+
+	const IVec2 filledCoord = IVec2(realSvDispatchThreadId.x * 2 + (realSvDispatchThreadId.y & 1), realSvDispatchThreadId.y); // Column within the pair alternates with the row's parity
+	const IVec2 toBeFilledCoord = IVec2(realSvDispatchThreadId.x * 2 + ((realSvDispatchThreadId.y + 1) & 1), realSvDispatchThreadId.y); // The other texel of the same pair
+
+	const F32 refDepth = TEX(g_depthTex, toBeFilledCoord);
+
+	Vec3 toBeFilledColor = 0.0;
+	F32 weightSum = 0.0;
+	const IVec2 offsets[4] = {IVec2(-1, 0), IVec2(1, 0), IVec2(0, -1), IVec2(0, 1)}; // 4-neighbourhood: x-1, x+1, y-1, y+1
+	[unroll] for(U32 i = 0; i < 4; ++i)
+	{
+		const IVec2 sampleCoord = toBeFilledCoord + offsets[i];
+		if(all(sampleCoord >= 0) && all(sampleCoord < viewportSize)) // Skip neighbours outside the output
+		{
+			const F32 sampleDepth = TEX(g_depthTex, sampleCoord);
+
+			const Vec3 sample = TEX(g_inTex, IVec2(sampleCoord.x / 2, sampleCoord.y)); // Input is addressed with half the x coordinate
+
+			const F32 weight = calculateBilateralWeightDepth<F32>(refDepth, sampleDepth, 1.0); // Weight depends on the depth of the neighbour vs the texel being filled
+			weightSum += weight;
+
+			toBeFilledColor += weight * sample;
+
+			if(all(sampleCoord == filledCoord)) // This neighbour is the pair's own filled texel: pass it through unchanged
+			{
+				TEX(g_outTex, filledCoord) = Vec4(sample, 0.0);
+			}
+		}
+	}
+
+	if(weightSum > kEpsilonF32 * 10.0) // Only normalize when the accumulated weight is not (close to) zero
+	{
+		toBeFilledColor /= weightSum;
+	}
+	else
+	{
+		toBeFilledColor = 0.0;
+	}
+
+	TEX(g_outTex, toBeFilledCoord) = Vec4(toBeFilledColor, 0.0);
+}
+#endif
+
+// ===========================================================================
+// TemporalDenoise                                                           =
+// ===========================================================================
+#if NOT_ZERO(ANKI_TECHNIQUE_TemporalDenoise)
+Texture2D<F32> g_historyLengthTex : register(t0);
+Texture2D<Vec2> g_motionVectorsTex : register(t1);
+Texture2D<Vec3> g_historyTex : register(t2);
+Texture2D<Vec3> g_currentTex : register(t3);
+
+RWTexture2D<Vec3> g_outTex : register(u0);
+
+SamplerState g_linearAnyClampSampler : register(s0);
+
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS) // Blends g_currentTex with a motion-vector-offset sample of g_historyTex, weighted by history length
+{
+	const Vec2 coord = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
+	Vec2 viewport;
+	g_historyLengthTex.GetDimensions(viewport.x, viewport.y);
+
+	const F32 historyLen = TEX(g_historyLengthTex, coord) / kMaxHistoryLength; // NOTE(review): presumably maps the stored length to [0, 1] - confirm against the history length pass
+
+	F32 blendFactor = historyLen / 1.0; // Division by 1.0 leaves the value unchanged
+	blendFactor = lerp(1.0, 0.05, blendFactor); // Weight of the current frame: 1.0 at historyLen 0, 0.05 at historyLen 1
+
+	Vec3 outColor = TEX(g_currentTex, coord);
+	if(blendFactor > 0.9)
+	{
+		// Don't accumulate
+	}
+	else
+	{
+		const Vec2 uv = (coord + 0.5) / viewport; // UV of the texel center
+		const Vec2 historyUv = uv + TEX(g_motionVectorsTex, coord); // Motion vector is added directly to the UV
+
+		const Vec3 history = g_historyTex.SampleLevel(g_linearAnyClampSampler, historyUv, 0.0);
+
+		outColor = lerp(history, outColor, blendFactor);
+	}
+
+	TEX(g_outTex, coord) = outColor;
+}
 #endif