Browse Source

RT reflections are working but without optimizations

Panagiotis Christopoulos Charitos 1 year ago
parent
commit
5e77945ce4

+ 22 - 2
AnKi/Renderer/LightShading.cpp

@@ -18,6 +18,7 @@
 #include <AnKi/Renderer/ClusterBinning.h>
 #include <AnKi/Renderer/Ssao.h>
 #include <AnKi/Renderer/Ssr.h>
+#include <AnKi/Renderer/RtReflections.h>
 #include <AnKi/Util/CVarSet.h>
 #include <AnKi/Util/Tracer.h>
 #include <AnKi/Scene/Components/SkyboxComponent.h>
@@ -94,7 +95,16 @@ void LightShading::run(const RenderingContext& ctx, RenderPassWorkContext& rgrap
 		rgraphCtx.bindSrv(8, 0, getRenderer().getGBuffer().getDepthRt());
 		rgraphCtx.bindSrv(9, 0, getRenderer().getShadowmapsResolve().getRt());
 		rgraphCtx.bindSrv(10, 0, getRenderer().getSsao().getRt());
-		rgraphCtx.bindSrv(11, 0, getRenderer().getSsr().getRt());
+
+		const Bool rtReflections = g_rtReflectionsCVar && GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled;
+		if(rtReflections)
+		{
+			rgraphCtx.bindSrv(11, 0, getRenderer().getRtReflections().getRt());
+		}
+		else
+		{
+			rgraphCtx.bindSrv(11, 0, getRenderer().getSsr().getRt());
+		}
 		cmdb.bindSrv(12, 0, TextureView(&getRenderer().getProbeReflections().getIntegrationLut(), TextureSubresourceDesc::all()));
 
 		// Draw
@@ -254,6 +264,8 @@ void LightShading::populateRenderGraph(RenderingContext& ctx)
 	}
 
 	// Light shading
+	const Bool rtReflections = g_rtReflectionsCVar && GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled;
+
 	pass.newTextureDependency(m_runCtx.m_rt, TextureUsageBit::kRtvDsvWrite);
 	pass.newTextureDependency(getRenderer().getGBuffer().getColorRt(0), readUsage);
 	pass.newTextureDependency(getRenderer().getGBuffer().getColorRt(1), readUsage);
@@ -264,7 +276,15 @@ void LightShading::populateRenderGraph(RenderingContext& ctx)
 	pass.newBufferDependency(getRenderer().getClusterBinning().getPackedObjectsBufferHandle(GpuSceneNonRenderableObjectType::kLight),
 							 BufferUsageBit::kSrvPixel);
 	pass.newTextureDependency(getRenderer().getSsao().getRt(), readUsage);
-	pass.newTextureDependency(getRenderer().getSsr().getRt(), readUsage);
+
+	if(rtReflections)
+	{
+		pass.newTextureDependency(getRenderer().getRtReflections().getRt(), readUsage);
+	}
+	else
+	{
+		pass.newTextureDependency(getRenderer().getSsr().getRt(), readUsage);
+	}
 
 	if(getRenderer().getProbeReflections().getHasCurrentlyRefreshedReflectionRt())
 	{

+ 14 - 2
AnKi/Renderer/RtMaterialFetchDbg.cpp

@@ -55,11 +55,12 @@ void RtMaterialFetchDbg::populateRenderGraph(RenderingContext& ctx)
 	BufferView sbtBuildIndirectArgsBuffer;
 	{
 		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(1);
-		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kUavCompute);
+		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kNone);
 
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtMaterialFetch setup build SBT");
 
-		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kAccelerationStructureBuild);
+		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kUavCompute);
+		rpass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kSrvCompute);
 
 		rpass.setWork([this, sbtBuildIndirectArgsBuffer](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtMaterialFetch);
@@ -103,6 +104,7 @@ void RtMaterialFetchDbg::populateRenderGraph(RenderingContext& ctx)
 
 		rpass.newBufferDependency(visibilityHandle, BufferUsageBit::kSrvCompute);
 		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kIndirectCompute);
+		rpass.newBufferDependency(sbtHandle, BufferUsageBit::kUavCompute);
 
 		rpass.setWork([this, sbtBuildIndirectArgsBuffer, sbtBuffer, visibleRenderableIndicesBuff](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtShadows);
@@ -148,6 +150,7 @@ void RtMaterialFetchDbg::populateRenderGraph(RenderingContext& ctx)
 			cmdb.bindSampler(ANKI_MATERIAL_REGISTER_TILINEAR_REPEAT_SAMPLER, 0, getRenderer().getSamplers().m_trilinearRepeat.get());
 			cmdb.bindSrv(ANKI_MATERIAL_REGISTER_GPU_SCENE, 0, GpuSceneBuffer::getSingleton().getBufferView());
 			cmdb.bindSrv(ANKI_MATERIAL_REGISTER_MESH_LODS, 0, GpuSceneArrays::MeshLod::getSingleton().getBufferView());
+			cmdb.bindSrv(ANKI_MATERIAL_REGISTER_TRANSFORMS, 0, GpuSceneArrays::Transform::getSingleton().getBufferView());
 
 #define ANKI_UNIFIED_GEOM_FORMAT(fmt, shaderType, reg) \
 	cmdb.bindSrv( \
@@ -158,9 +161,18 @@ void RtMaterialFetchDbg::populateRenderGraph(RenderingContext& ctx)
 #include <AnKi/Shaders/Include/UnifiedGeometryTypes.def.h>
 
 			cmdb.bindConstantBuffer(0, 2, ctx.m_globalRenderingConstantsBuffer);
+
 			rgraphCtx.bindSrv(0, 2, getRenderer().getAccelerationStructureBuilder().getAccelerationStructureHandle());
+
 			rgraphCtx.bindUav(0, 2, m_runCtx.m_rt);
 
+			// Fill the rest of the interface resources
+			cmdb.bindSrv(1, 2, TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::all()));
+			cmdb.bindSrv(2, 2, TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::all()));
+			cmdb.bindSrv(3, 2, TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::all()));
+			cmdb.bindSrv(4, 2, TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::all()));
+			cmdb.bindUav(1, 2, TextureView(&getRenderer().getDummyTexture2d(), TextureSubresourceDesc::firstSurface()));
+
 			cmdb.traceRays(sbtBuffer, m_sbtRecordSize, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount(), 1,
 						   getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), 1);
 		});

+ 178 - 20
AnKi/Renderer/RtReflections.cpp

@@ -7,6 +7,8 @@
 #include <AnKi/Renderer/Renderer.h>
 #include <AnKi/Renderer/AccelerationStructureBuilder.h>
 #include <AnKi/Renderer/GBuffer.h>
+#include <AnKi/Renderer/MotionVectors.h>
+#include <AnKi/Renderer/Sky.h>
 #include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
 #include <AnKi/Core/GpuMemory/UnifiedGeometryBuffer.h>
 #include <AnKi/Util/Tracer.h>
@@ -37,14 +39,36 @@ Error RtReflections::init()
 		m_missShaderGroupIdx = variant->getShaderGroupHandleIndex();
 	}
 
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_spatialDenoisingGrProg, "SpatialDenoise"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_temporalDenoisingGrProg, "TemporalDenoise"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_bilateralDenoisingGrProg, "BilateralDenoise"));
+
 	m_sbtRecordSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment,
 										GrManager::getSingleton().getDeviceCapabilities().m_shaderGroupHandleSize + U32(sizeof(UVec4)));
 
-	m_rtDesc = getRenderer().create2DRenderTargetDescription(getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(),
-															 getRenderer().getHdrFormat(), "RtReflections");
-	m_rtDesc.bake();
+	m_transientRtDesc1 = getRenderer().create2DRenderTargetDescription(
+		getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), Format::kR16G16B16A16_Sfloat, "RtReflections #1");
+	m_transientRtDesc1.bake();
+
+	m_transientRtDesc2 = getRenderer().create2DRenderTargetDescription(
+		getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), Format::kR16G16B16A16_Sfloat, "RtReflections #2");
+	m_transientRtDesc2.bake();
+
+	m_hitPosAndDepthRtDesc = getRenderer().create2DRenderTargetDescription(
+		getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), Format::kR32G32B32A32_Sfloat, "HitPosAndDepth");
+	m_hitPosAndDepthRtDesc.bake();
+
+	TextureInitInfo texInit = getRenderer().create2DRenderTargetDescription(
+		getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), getRenderer().getHdrFormat(), "RtReflectionsMain");
+	texInit.m_usage = TextureUsageBit::kAllShaderResource | TextureUsageBit::kAllUav;
+	m_tex = getRenderer().createAndClearRenderTarget(texInit, TextureUsageBit::kSrvCompute);
 
-	ANKI_CHECK(ResourceManager::getSingleton().loadResource("EngineAssets/BlueNoise_Rgba8_64x64.png", m_blueNoiseImg));
+	texInit = getRenderer().create2DRenderTargetDescription(getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(),
+															Format::kR32G32_Sfloat, "RtReflectionsMoments #1");
+	texInit.m_usage = TextureUsageBit::kAllShaderResource | TextureUsageBit::kAllUav;
+	m_momentsTextures[0] = getRenderer().createAndClearRenderTarget(texInit, TextureUsageBit::kSrvCompute);
+	texInit.setName("RtReflectionsMoments #2");
+	m_momentsTextures[1] = getRenderer().createAndClearRenderTarget(texInit, TextureUsageBit::kSrvCompute);
 
 	return Error::kNone;
 }
@@ -53,24 +77,51 @@ void RtReflections::populateRenderGraph(RenderingContext& ctx)
 {
 	RenderGraphBuilder& rgraph = ctx.m_renderGraphDescr;
 
+	// Create or import render targets
+	RenderTargetHandle mainRt;
+	RenderTargetHandle readMomentsRt;
+	RenderTargetHandle writeMomentsRt;
+	if(m_texImportedOnce)
+	{
+		mainRt = rgraph.importRenderTarget(m_tex.get());
+		readMomentsRt = rgraph.importRenderTarget(m_momentsTextures[getRenderer().getFrameCount() & 1].get());
+		writeMomentsRt = rgraph.importRenderTarget(m_momentsTextures[(getRenderer().getFrameCount() + 1) & 1].get());
+	}
+	else
+	{
+		mainRt = rgraph.importRenderTarget(m_tex.get(), TextureUsageBit::kAllCompute);
+		readMomentsRt = rgraph.importRenderTarget(m_momentsTextures[getRenderer().getFrameCount() & 1].get(), TextureUsageBit::kAllCompute);
+		writeMomentsRt = rgraph.importRenderTarget(m_momentsTextures[(getRenderer().getFrameCount() + 1) & 1].get(), TextureUsageBit::kAllCompute);
+		m_texImportedOnce = true;
+	}
+
+	const RenderTargetHandle transientRt1 = rgraph.newRenderTarget(m_transientRtDesc1);
+	const RenderTargetHandle transientRt2 = rgraph.newRenderTarget(m_transientRtDesc2);
+	const RenderTargetHandle hitPosAndDepthRt = rgraph.newRenderTarget(m_hitPosAndDepthRtDesc);
+
+	BufferHandle visibilityHandle;
+	BufferView visibleRenderableIndicesBuff;
+	getRenderer().getAccelerationStructureBuilder().getVisibilityInfo(visibilityHandle, visibleRenderableIndicesBuff);
+
 	// SBT build setup
 	BufferHandle sbtBuildIndirectArgsHandle;
 	BufferView sbtBuildIndirectArgsBuffer;
 	{
 		sbtBuildIndirectArgsBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocateStructuredBuffer<DispatchIndirectArgs>(1);
-		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kUavCompute);
+		sbtBuildIndirectArgsHandle = rgraph.importBuffer(sbtBuildIndirectArgsBuffer, BufferUsageBit::kNone);
 
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflections setup build SBT");
 
-		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kAccelerationStructureBuild);
+		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kUavCompute);
+		rpass.newBufferDependency(visibilityHandle, BufferUsageBit::kSrvCompute);
 
-		rpass.setWork([this, sbtBuildIndirectArgsBuffer](RenderPassWorkContext& rgraphCtx) {
+		rpass.setWork([this, sbtBuildIndirectArgsBuffer, visibleRenderableIndicesBuff](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtReflections);
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 			cmdb.bindShaderProgram(m_sbtBuildSetupGrProg.get());
 
-			cmdb.bindSrv(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferView());
+			cmdb.bindSrv(0, 0, visibleRenderableIndicesBuff);
 			cmdb.bindUav(0, 0, sbtBuildIndirectArgsBuffer);
 
 			cmdb.dispatchCompute(1, 1, 1);
@@ -89,7 +140,7 @@ void RtReflections::populateRenderGraph(RenderingContext& ctx)
 		U8* sbtMem;
 		sbtBuffer = RebarTransientMemoryPool::getSingleton().allocate(
 			(GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount() + 2) * m_sbtRecordSize, sbtAlignment, sbtMem);
-		sbtHandle = rgraph.importBuffer(sbtBuffer, BufferUsageBit::kUavCompute);
+		sbtHandle = rgraph.importBuffer(sbtBuffer, BufferUsageBit::kNone);
 
 		// Write the first 2 entries of the SBT
 		ConstWeakArray<U8> shaderGroupHandles = m_libraryGrProg->getShaderGroupHandles();
@@ -100,12 +151,8 @@ void RtReflections::populateRenderGraph(RenderingContext& ctx)
 		// Create the pass
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflections build SBT");
 
-		BufferHandle visibilityHandle;
-		BufferView visibleRenderableIndicesBuff;
-		getRenderer().getAccelerationStructureBuilder().getVisibilityInfo(visibilityHandle, visibleRenderableIndicesBuff);
-
-		rpass.newBufferDependency(visibilityHandle, BufferUsageBit::kSrvCompute);
 		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kIndirectCompute);
+		rpass.newBufferDependency(sbtHandle, BufferUsageBit::kUavCompute);
 
 		rpass.setWork([this, sbtBuildIndirectArgsBuffer, sbtBuffer, visibleRenderableIndicesBuff](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtShadows);
@@ -132,19 +179,19 @@ void RtReflections::populateRenderGraph(RenderingContext& ctx)
 
 	// Ray gen
 	{
-		m_runCtx.m_rt = rgraph.newRenderTarget(m_rtDesc);
-
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflections");
 
 		rpass.newBufferDependency(sbtHandle, BufferUsageBit::kShaderBindingTable);
-		rpass.newTextureDependency(m_runCtx.m_rt, TextureUsageBit::kUavTraceRays);
+		rpass.newTextureDependency(transientRt1, TextureUsageBit::kUavTraceRays);
+		rpass.newTextureDependency(hitPosAndDepthRt, TextureUsageBit::kUavTraceRays);
 		rpass.newTextureDependency(getRenderer().getGBuffer().getDepthRt(), TextureUsageBit::kSrvTraceRays);
 		rpass.newTextureDependency(getRenderer().getGBuffer().getColorRt(1), TextureUsageBit::kSrvTraceRays);
 		rpass.newTextureDependency(getRenderer().getGBuffer().getColorRt(2), TextureUsageBit::kSrvTraceRays);
+		rpass.newTextureDependency(getRenderer().getSky().getEnvironmentMapRt(), TextureUsageBit::kSrvTraceRays);
 		rpass.newAccelerationStructureDependency(getRenderer().getAccelerationStructureBuilder().getAccelerationStructureHandle(),
 												 AccelerationStructureUsageBit::kTraceRaysSrv);
 
-		rpass.setWork([this, sbtBuffer, &ctx](RenderPassWorkContext& rgraphCtx) {
+		rpass.setWork([this, sbtBuffer, &ctx, transientRt1, hitPosAndDepthRt](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtShadows);
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
@@ -170,14 +217,125 @@ void RtReflections::populateRenderGraph(RenderingContext& ctx)
 			rgraphCtx.bindSrv(1, 2, getRenderer().getGBuffer().getDepthRt());
 			rgraphCtx.bindSrv(2, 2, getRenderer().getGBuffer().getColorRt(1));
 			rgraphCtx.bindSrv(3, 2, getRenderer().getGBuffer().getColorRt(2));
-			cmdb.bindSrv(4, 2, TextureView(&m_blueNoiseImg->getTexture(), TextureSubresourceDesc::all()));
+			rgraphCtx.bindSrv(4, 2, getRenderer().getSky().getEnvironmentMapRt());
+
+			rgraphCtx.bindUav(0, 2, transientRt1);
+			rgraphCtx.bindUav(1, 2, hitPosAndDepthRt);
 
-			rgraphCtx.bindUav(0, 2, m_runCtx.m_rt);
+			cmdb.bindSampler(0, 2, getRenderer().getSamplers().m_trilinearClamp.get());
+
+			const Vec4 consts(g_rtReflectionsMaxRayDistanceCVar);
+			cmdb.setFastConstants(&consts, sizeof(consts));
 
 			cmdb.traceRays(sbtBuffer, m_sbtRecordSize, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount(), 1,
 						   getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), 1);
 		});
 	}
+
+	// Spatial denoising: reads the ray gen output (transientRt1), hitPosAndDepthRt and the GBuffer and writes transientRt2
+	{
+		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflectionsSpatialDenoise");
+
+		rpass.newTextureDependency(transientRt1, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(hitPosAndDepthRt, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(getRenderer().getGBuffer().getDepthRt(), TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(getRenderer().getGBuffer().getColorRt(1), TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(getRenderer().getGBuffer().getColorRt(2), TextureUsageBit::kSrvCompute);
+
+		rpass.newTextureDependency(transientRt2, TextureUsageBit::kUavCompute);
+
+		rpass.setWork([this, &ctx, transientRt1, transientRt2, hitPosAndDepthRt](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(RtReflections); // Label the event after this renderer object, not after RtShadows
+
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_spatialDenoisingGrProg.get());
+
+			rgraphCtx.bindSrv(0, 0, transientRt1);
+			rgraphCtx.bindSrv(1, 0, hitPosAndDepthRt);
+			rgraphCtx.bindSrv(2, 0, getRenderer().getGBuffer().getDepthRt());
+			rgraphCtx.bindSrv(3, 0, getRenderer().getGBuffer().getColorRt(1));
+			rgraphCtx.bindSrv(4, 0, getRenderer().getGBuffer().getColorRt(2));
+
+			rgraphCtx.bindUav(0, 0, transientRt2);
+
+			cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
+
+			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y()); // 8x8 workgroups over the internal resolution
+		});
+	}
+
+	// m_runCtx.m_rt = transientRt2;
+	// return;
+
+	// Temporal denoising: reads transientRt2, mainRt, readMomentsRt, the motion vectors and hitPosAndDepthRt and writes transientRt1 and writeMomentsRt
+	{
+		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflectionsTemporalDenoise");
+
+		rpass.newTextureDependency(transientRt2, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(mainRt, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(readMomentsRt, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(getRenderer().getMotionVectors().getMotionVectorsRt(), TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(hitPosAndDepthRt, TextureUsageBit::kSrvCompute);
+
+		rpass.newTextureDependency(transientRt1, TextureUsageBit::kUavCompute);
+		rpass.newTextureDependency(writeMomentsRt, TextureUsageBit::kUavCompute);
+
+		rpass.setWork(
+			[this, &ctx, transientRt1, transientRt2, mainRt, readMomentsRt, writeMomentsRt, hitPosAndDepthRt](RenderPassWorkContext& rgraphCtx) {
+				ANKI_TRACE_SCOPED_EVENT(RtReflections); // Label the event after this renderer object, not after RtShadows
+
+				CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+				cmdb.bindShaderProgram(m_temporalDenoisingGrProg.get());
+
+				cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
+
+				rgraphCtx.bindSrv(0, 0, transientRt2);
+				rgraphCtx.bindSrv(1, 0, mainRt); // The persistent texture that the bilateral pass writes
+				rgraphCtx.bindSrv(2, 0, readMomentsRt);
+				rgraphCtx.bindSrv(3, 0, getRenderer().getMotionVectors().getMotionVectorsRt());
+				rgraphCtx.bindSrv(4, 0, hitPosAndDepthRt);
+
+				rgraphCtx.bindUav(0, 0, transientRt1); // transientRt1 is reused as an output, the spatial pass has already consumed it
+				rgraphCtx.bindUav(1, 0, writeMomentsRt);
+
+				cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
+
+				dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
+			});
+	}
+
+	// Bilateral filter: reads transientRt1, writeMomentsRt and GBuffer color RT 1 and writes mainRt
+	{
+		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflectionsBilateral");
+
+		rpass.newTextureDependency(transientRt1, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(writeMomentsRt, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(getRenderer().getGBuffer().getColorRt(1), TextureUsageBit::kSrvCompute);
+
+		rpass.newTextureDependency(mainRt, TextureUsageBit::kUavCompute);
+
+		rpass.setWork([this, &ctx, transientRt1, mainRt, writeMomentsRt](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(RtReflections); // Label the event after this renderer object, not after RtShadows
+
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_bilateralDenoisingGrProg.get());
+
+			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
+
+			rgraphCtx.bindSrv(0, 0, transientRt1);
+			rgraphCtx.bindSrv(1, 0, writeMomentsRt); // The moments that the temporal pass wrote this frame
+			rgraphCtx.bindSrv(2, 0, getRenderer().getGBuffer().getColorRt(1));
+
+			rgraphCtx.bindUav(0, 0, mainRt);
+
+			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
+		});
+	}
+
+	m_runCtx.m_rt = mainRt;
 }
 
 } // end namespace anki

+ 16 - 2
AnKi/Renderer/RtReflections.h

@@ -13,6 +13,8 @@ namespace anki {
 /// @{
 
 inline BoolCVar g_rtReflectionsCVar("R", "RtReflections", false, "Enable RT reflections");
+inline NumericCVar<F32> g_rtReflectionsMaxRayDistanceCVar("R", "RtReflectionsMaxRayDistance", 100.0f, 1.0f, 10000.0f,
+														  "Max RT reflections ray distance");
 
 class RtReflections : public RendererObject
 {
@@ -32,16 +34,28 @@ public:
 		handles[0] = m_runCtx.m_rt;
 	}
 
+	RenderTargetHandle getRt() const // The handle that populateRenderGraph() stored in m_runCtx.m_rt
+	{
+		return m_runCtx.m_rt;
+	}
+
 public:
 	ShaderProgramResourcePtr m_sbtProg;
 	ShaderProgramResourcePtr m_rtProg;
 	ShaderProgramPtr m_sbtBuildSetupGrProg;
 	ShaderProgramPtr m_sbtBuildGrProg;
 	ShaderProgramPtr m_libraryGrProg;
+	ShaderProgramPtr m_spatialDenoisingGrProg;
+	ShaderProgramPtr m_temporalDenoisingGrProg;
+	ShaderProgramPtr m_bilateralDenoisingGrProg;
 
-	RenderTargetDesc m_rtDesc;
+	RenderTargetDesc m_transientRtDesc1;
+	RenderTargetDesc m_transientRtDesc2;
+	RenderTargetDesc m_hitPosAndDepthRtDesc;
 
-	ImageResourcePtr m_blueNoiseImg;
+	TexturePtr m_tex;
+	Array<TexturePtr, 2> m_momentsTextures;
+	Bool m_texImportedOnce = false;
 
 	U32 m_sbtRecordSize = 0;
 	U32 m_rayGenShaderGroupIdx = 0;

+ 32 - 0
AnKi/Renderer/Sky.cpp

@@ -17,6 +17,7 @@ Error Sky::init()
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Sky.ankiprogbin", {}, m_prog, m_multipleScatteringLutGrProg, "SkyMultipleScatteringLut"));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Sky.ankiprogbin", {}, m_prog, m_skyLutGrProg, "SkyLut"));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Sky.ankiprogbin", {}, m_prog, m_computeSunColorGrProg, "ComputeSunColor"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Sky.ankiprogbin", {}, m_prog, m_computeEnvMapGrProg, "ComputeEnvMap"));
 
 	const TextureUsageBit usage = TextureUsageBit::kAllCompute;
 	const TextureUsageBit initialUsage = TextureUsageBit::kSrvCompute;
@@ -36,6 +37,11 @@ Error Sky::init()
 		getRenderer().create2DRenderTargetInitInfo(kSkyLutSize.x(), kSkyLutSize.y(), formatB, usage | TextureUsageBit::kSrvPixel, "SkyLut"),
 		initialUsage);
 
+	m_envMap = getRenderer().createAndClearRenderTarget(getRenderer().create2DRenderTargetInitInfo(kEnvMapSize.x(), kEnvMapSize.y(),
+																								   getRenderer().getHdrFormat(),
+																								   usage | TextureUsageBit::kAllSrv, "SkyEnvMap"),
+														initialUsage);
+
 	return Error::kNone;
 }
 
@@ -78,10 +84,12 @@ void Sky::populateRenderGraph(RenderingContext& ctx)
 	if(m_skyLutImportedOnce) [[likely]]
 	{
 		m_runCtx.m_skyLutRt = rgraph.importRenderTarget(m_skyLut.get());
+		m_runCtx.m_envMapRt = rgraph.importRenderTarget(m_envMap.get());
 	}
 	else
 	{
 		m_runCtx.m_skyLutRt = rgraph.importRenderTarget(m_skyLut.get(), TextureUsageBit::kSrvCompute);
+		m_runCtx.m_envMapRt = rgraph.importRenderTarget(m_envMap.get(), TextureUsageBit::kSrvCompute);
 		m_skyLutImportedOnce = true;
 	}
 
@@ -154,6 +162,30 @@ void Sky::populateRenderGraph(RenderingContext& ctx)
 		});
 	}
 
+	// Sky env map. Gated by renderSkyLut: reads the sky LUT and writes the env map with a compute pass
+	if(renderSkyLut)
+	{
+		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("SkyLutEnvMap");
+
+		rpass.newTextureDependency(m_runCtx.m_skyLutRt, TextureUsageBit::kSrvCompute);
+		rpass.newTextureDependency(m_runCtx.m_envMapRt, TextureUsageBit::kUavCompute);
+
+		rpass.setWork([this, &ctx](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(SkyLut);
+
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_computeEnvMapGrProg.get());
+
+			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
+			rgraphCtx.bindSrv(0, 0, m_runCtx.m_skyLutRt);
+			rgraphCtx.bindUav(0, 0, m_runCtx.m_envMapRt);
+			cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
+
+			dispatchPPCompute(cmdb, 8, 8, kEnvMapSize.x(), kEnvMapSize.y()); // 8x8 workgroups that cover the whole env map
+		});
+	}
+
 	// Compute sun color always
 	{
 		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("ComputeSunColor");

+ 11 - 1
AnKi/Renderer/Sky.h

@@ -30,7 +30,7 @@ public:
 	void getDebugRenderTarget([[maybe_unused]] CString rtName, Array<RenderTargetHandle, kMaxDebugRenderTargets>& handles,
 							  [[maybe_unused]] ShaderProgramPtr& optionalShaderProgram) const override
 	{
-		handles[0] = m_runCtx.m_skyLutRt;
+		handles[0] = m_runCtx.m_envMapRt;
 	}
 
 	RenderTargetHandle getSkyLutRt() const
@@ -39,6 +39,12 @@ public:
 		return m_runCtx.m_skyLutRt;
 	}
 
+	RenderTargetHandle getEnvironmentMapRt() const // The env map handle imported by populateRenderGraph(). Asserts that the sky is enabled
+	{
+		ANKI_ASSERT(isEnabled());
+		return m_runCtx.m_envMapRt;
+	}
+
 	ANKI_PURE Bool isEnabled() const;
 
 public:
@@ -47,14 +53,17 @@ public:
 	ShaderProgramPtr m_multipleScatteringLutGrProg;
 	ShaderProgramPtr m_skyLutGrProg;
 	ShaderProgramPtr m_computeSunColorGrProg;
+	ShaderProgramPtr m_computeEnvMapGrProg;
 
 	static constexpr UVec2 kTransmittanceLutSize{256, 64};
 	static constexpr UVec2 kMultipleScatteringLutSize{32, 32};
 	static constexpr UVec2 kSkyLutSize{256, 256};
+	static constexpr UVec2 kEnvMapSize{64, 64};
 
 	TexturePtr m_transmittanceLut;
 	TexturePtr m_multipleScatteringLut;
 	TexturePtr m_skyLut;
+	TexturePtr m_envMap;
 
 	Vec3 m_sunDir = Vec3(0.0f);
 	F32 m_sunPower = -100.0f;
@@ -66,6 +75,7 @@ public:
 	{
 	public:
 		RenderTargetHandle m_skyLutRt;
+		RenderTargetHandle m_envMapRt;
 	} m_runCtx;
 };
 /// @}

+ 5 - 4
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -1080,9 +1080,10 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 
 Error GpuVisibilityAccelerationStructures::init()
 {
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", m_visibilityProg, m_visibilityGrProg));
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructuresZeroRemainingInstances.ankiprogbin", m_zeroRemainingInstancesProg,
-								 m_zeroRemainingInstancesGrProg));
+	ANKI_CHECK(
+		loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", {}, m_visibilityProg, m_visibilityGrProg, "Visibility"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", {}, m_visibilityProg,
+								 m_zeroRemainingInstancesGrProg, "ZeroRemaining"));
 
 	BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
 	inf.m_size = sizeof(U32) * 2;
@@ -1167,7 +1168,7 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 		NonGraphicsRenderPass& pass =
 			rgraph.newNonGraphicsRenderPass(generateTempPassName("Accel vis zero remaining instances: %s", in.m_passesName.cstr()));
 
-		pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavCompute);
+		pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kUavCompute | BufferUsageBit::kIndirectCompute);
 
 		pass.setWork([this, zeroInstancesDispatchArgsBuff, instancesBuff = out.m_instancesBuffer,
 					  visRenderablesBuff = out.m_renderablesBuffer](RenderPassWorkContext& rgraph) {

+ 0 - 2
AnKi/Renderer/Utils/GpuVisibility.h

@@ -305,8 +305,6 @@ public:
 private:
 	ShaderProgramResourcePtr m_visibilityProg;
 	ShaderProgramPtr m_visibilityGrProg;
-
-	ShaderProgramResourcePtr m_zeroRemainingInstancesProg;
 	ShaderProgramPtr m_zeroRemainingInstancesGrProg;
 
 	BufferPtr m_counterBuffer; ///< A buffer containing multiple counters for atomic operations.

+ 1 - 0
AnKi/Shaders/BilateralFilter.hlsl

@@ -10,6 +10,7 @@
 #include <AnKi/Shaders/Common.hlsl>
 
 // https://cs.dartmouth.edu/~wjarosz/publications/mara17towards.html
+// phi can be equal to 1
 F32 calculateBilateralWeightDepth(F32 depthCenter, F32 depthTap, F32 phi)
 {
 	const F32 diff = abs(depthTap - depthCenter);

+ 33 - 3
AnKi/Shaders/Functions.hlsl

@@ -539,12 +539,21 @@ UVec2 getOptimalGlobalInvocationId8x8Nvidia()
 }
 #endif
 
-// Gaussian distrubution function
+// Gaussian distribution function. Play with the values here https://www.desmos.com/calculator/7oxmohg3ta
+// s is the sigma and x is a factor where abs(x) is in [0, 1]
 template<typename T>
 T gaussianWeight(T s, T x)
 {
-	T p = T(1.0) / (s * sqrt(T(2.0) * kPi));
-	p *= exp((x * x) / (T(-2.0) * s * s));
+	T p = T(1) / (s * sqrt(T(2) * kPi)); // Normalization factor: 1 / (s * sqrt(2 * pi))
+	p *= exp((x * x) / (T(-2) * s * s)); // Falloff: exp(-x^2 / (2 * s^2))
+	return p;
+}
+
+template<typename T>
+T gaussianWeight2d(T s, T x, T y) // 2D counterpart of gaussianWeight: s is the sigma, x and y are the offsets on the 2 axes
+{
+	T p = T(1) / (T(2) * kPi * s * s); // Normalization factor: 1 / (2 * pi * s^2)
+	p *= exp((x * x + y * y) / (T(-2) * s * s)); // Falloff: exp(-(x^2 + y^2) / (2 * s^2))
 	return p;
 }
 
@@ -776,3 +785,24 @@ Bool dither4x4(Vec2 svPosition, F32 factor)
 	const F32 limit = (F32(ditherMatrix[index]) + 1.0) / (1.0 + axisSize * axisSize);
 	return (factor < limit) ? true : false;
 }
+
+// Encode a normal to octahedron UV coordinates. The returned UV is remapped to [0, 1]
+Vec2 octahedronEncode(Vec3 n)
+{
+	n /= (abs(n.x) + abs(n.y) + abs(n.z)); // Divide by the L1 norm so that abs(x) + abs(y) + abs(z) == 1
+	const Vec2 octWrap = (1.0 - abs(n.yx)) * select(n.xy >= 0.0, 1.0, -1.0); // The folded XY that is used when n.z is negative
+	n.xy = select(n.z >= 0.0, n.xy, octWrap); // Keep XY as-is when n.z >= 0, use the folded XY otherwise
+	n.xy = n.xy * 0.5 + 0.5; // Remap from [-1, 1] to [0, 1]
+	return n.xy;
+}
+
+// The reverse of octahedronEncode. The input UV is remapped from [0, 1] to [-1, 1] before decoding and the result is normalized
+// https://twitter.com/Stubbesaurus/status/937994790553227264
+Vec3 octahedronDecode(Vec2 f)
+{
+	f = f * 2.0 - 1.0; // Remap from [0, 1] to [-1, 1]
+	Vec3 n = Vec3(f.x, f.y, 1.0 - abs(f.x) - abs(f.y)); // Z comes from abs(x) + abs(y) + abs(z) == 1 and goes negative past the fold
+	const F32 t = saturate(-n.z); // Non-zero only when Z is negative
+	n.xy += select(n.xy >= 0.0, -t, t); // Shift each of X and Y by t against its own sign
+	return normalize(n);
+}

+ 36 - 3
AnKi/Shaders/GpuVisibilityAccelerationStructures.ankiprog

@@ -3,7 +3,8 @@
 // Code licensed under the BSD License.
 // http://www.anki3d.org/LICENSE
 
-#pragma anki technique comp
+#pragma anki technique Visibility comp
+#pragma anki technique ZeroRemaining comp
 
 #include <AnKi/Shaders/Common.hlsl>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
@@ -11,6 +12,11 @@
 #include <AnKi/Shaders/Include/MiscRendererTypes.h>
 #include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
 
+// ===========================================================================
+// Visibility                                                                =
+// ===========================================================================
+#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_Visibility
+
 // Buffers that point to the GPU scene
 StructuredBuffer<GpuSceneRenderableBoundingVolume> g_renderableBoundingVolumes : register(t0);
 StructuredBuffer<GpuSceneRenderable> g_renderables : register(t1);
@@ -26,7 +32,7 @@ RWStructuredBuffer<DispatchIndirectArgs> g_nextDispatchIndirectArgs : register(u
 
 ANKI_FAST_CONSTANTS(GpuVisibilityAccelerationStructuresConstants, g_consts)
 
-#define NUMTHREADS 64
+#	define NUMTHREADS 64
 
 [numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX)
 {
@@ -39,7 +45,7 @@ ANKI_FAST_CONSTANTS(GpuVisibilityAccelerationStructuresConstants, g_consts)
 
 	// Sphere test
 	GpuSceneRenderableBoundingVolume bvolume;
-	Vec3 sphereCenter;
+	Vec3 sphereCenter = 0.0;
 	if(visible)
 	{
 		bvolume = SBUFF(g_renderableBoundingVolumes, bvolumeIdx);
@@ -147,3 +153,30 @@ ANKI_FAST_CONSTANTS(GpuVisibilityAccelerationStructuresConstants, g_consts)
 		}
 	}
 }
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_Visibility
+
+// ===========================================================================
+// ZeroRemaining                                                             =
+// ===========================================================================
+#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_ZeroRemaining
+
+StructuredBuffer<U32> g_visibleRenderableIndices : register(t0); // 1st element is the count
+RWStructuredBuffer<AccelerationStructureInstance> g_instances : register(u0);
+
+#	define NUMTHREADS 64
+
+[numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID) // Zeroes the instances that follow the visible ones in g_instances
+{
+	const U32 visibleInstances = g_visibleRenderableIndices[0]; // The 1st element of the buffer is the count
+
+	const U32 maxInstances = getStructuredBufferElementCount(g_instances);
+
+	ANKI_ASSERT(maxInstances >= visibleInstances);
+	const U32 remainingInstances = maxInstances - visibleInstances; // How many trailing instances need to be zeroed
+
+	if(svDispatchThreadId < remainingInstances) // One thread per remaining instance, any extra threads do nothing
+	{
+		SBUFF(g_instances, visibleInstances + svDispatchThreadId) = (AccelerationStructureInstance)0;
+	}
+}
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_ZeroRemaining

+ 0 - 29
AnKi/Shaders/GpuVisibilityAccelerationStructuresZeroRemainingInstances.ankiprog

@@ -1,29 +0,0 @@
-// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
-// All rights reserved.
-// Code licensed under the BSD License.
-// http://www.anki3d.org/LICENSE
-
-#pragma anki technique comp
-
-#include <AnKi/Shaders/Common.hlsl>
-#include <AnKi/Shaders/Include/GpuSceneTypes.h>
-
-StructuredBuffer<U32> g_visibleRenderableIndices : register(t0); // 1st element is the count
-RWStructuredBuffer<AccelerationStructureInstance> g_instances : register(u0);
-
-#define NUMTHREADS 64
-
-[numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID)
-{
-	const U32 visibleInstances = g_visibleRenderableIndices[0];
-
-	const U32 maxInstances = getStructuredBufferElementCount(g_instances);
-
-	ANKI_ASSERT(maxInstances >= visibleInstances);
-	const U32 remainingInstances = maxInstances - visibleInstances;
-
-	if(svDispatchThreadId < remainingInstances)
-	{
-		g_instances[visibleInstances + svDispatchThreadId] = (AccelerationStructureInstance)0;
-	}
-}

+ 11 - 11
AnKi/Shaders/LightFunctions.hlsl

@@ -587,7 +587,7 @@ RVec3 sampleGlobalIllumination(const Vec3 worldPos, const Vec3 normal, const Glo
 
 /// To play with it use https://www.shadertoy.com/view/sttSDf
 /// http://jcgt.org/published/0007/04/01/paper.pdf by Eric Heitz
-/// Input v: view direction
+/// Input v: view direction (camPos - pos)
 /// Input alphaX, alphaY: roughness parameters
 /// Input u1, u2: uniform random numbers
 /// Output: normal sampled with PDF D_Ve(nE) = G1(v) * max(0, dot(v, nE)) * D(nE) / v.z
@@ -672,10 +672,10 @@ Vec3 sampleVndfIsotropic(Vec2 randFactors, Vec3 viewDir, F32 alpha, Vec3 normal)
 }
 
 // The PDF of sampleVndfIsotropic
-F32 pdfVndfIsotropic(Vec3 nE, Vec3 viewDir, F32 alpha, Vec3 normal)
+F32 pdfVndfIsotropic(Vec3 reflectedDir, Vec3 viewDir, F32 alpha, Vec3 normal)
 {
 	const F32 alphaSquare = alpha * alpha;
-	const Vec3 wm = normalize(nE + viewDir);
+	const Vec3 wm = normalize(reflectedDir + viewDir);
 	const F32 zm = dot(wm, normal);
 	const F32 zi = dot(viewDir, normal);
 	const F32 nrm = rsqrt((zi * zi) * (1.0f - alphaSquare) + alphaSquare);
@@ -687,25 +687,26 @@ F32 pdfVndfIsotropic(Vec3 nE, Vec3 viewDir, F32 alpha, Vec3 normal)
 
 /// Calculate the reflection vector based on roughness. Sometimes the refl vector is below the normal so this func will try again to get a new one.
 /// viewDir is camPos-worldPos
-Vec3 sampleReflectionVector(Vec3 viewDir, Vec3 normal, F32 roughness, Vec2 randFactors, U32 tryCount, out F32 pdf)
+Vec3 sampleReflectionVectorAnisotropic(Vec3 viewDir, Vec3 normal, F32 roughnessX, F32 roughnessY, Vec2 randFactors, U32 tryCount, out F32 pdf)
 {
 	pdf = 0.0;
 	const Mat3 tbn = rotationFromDirection(normal);
 	const Mat3 tbnT = transpose(tbn);
 	const Vec3 viewDirTbn = mul(tbnT, viewDir);
 
-	const F32 alpha = pow(roughness, 2.0);
+	const F32 alphaX = roughnessX * roughnessX;
+	const F32 alphaY = roughnessY * roughnessY;
 
 	Vec3 reflectedDirTbn;
 	do
 	{
-		const Vec3 sampledNormalTbn = sampleGgxVndf(viewDirTbn, alpha, alpha, randFactors.x, randFactors.y);
+		const Vec3 sampledNormalTbn = sampleGgxVndf(viewDirTbn, alphaX, alphaY, randFactors.x, randFactors.y);
 		reflectedDirTbn = reflect(-viewDirTbn, sampledNormalTbn);
 
 		if(dot(reflectedDirTbn, Vec3(0.0, 0.0, 1.0)) > cos(kPi / 2.0 * 0.9))
 		{
 			// Angle between the refl vec and the normal is less than 90 degr. We are good to go
-			pdf = pdfGgxVndf(sampledNormalTbn, viewDirTbn, alpha, alpha);
+			pdf = pdfGgxVndf(sampledNormalTbn, viewDirTbn, alphaX, alphaY);
 			break;
 		}
 		else
@@ -723,21 +724,19 @@ Vec3 sampleReflectionVector(Vec3 viewDir, Vec3 normal, F32 roughness, Vec2 randF
 }
 
 // Another version of sampleReflectionVector. Possibly faster
-Vec3 sampleReflectionVector2(Vec3 viewDir, Vec3 normal, F32 roughness, Vec2 randFactors, U32 tryCount, out F32 pdf)
+Vec3 sampleReflectionVectorIsotropic(Vec3 viewDir, Vec3 normal, F32 roughness, Vec2 randFactors, U32 tryCount, out F32 pdf)
 {
-	pdf = 0.0;
 	const F32 alpha = roughness * roughness;
 
 	Vec3 reflDir = normal;
 	do
 	{
 		const Vec3 nE = sampleVndfIsotropic(randFactors, viewDir, alpha, normal);
-		const Vec3 reflDir = reflect(-viewDir, nE);
+		reflDir = reflect(-viewDir, nE);
 
 		if(dot(reflDir, normal) > cos(kPi / 2.0 * 0.9))
 		{
 			// Angle between the refl vec and the normal is less than 90 degr. We are good to go
-			pdf = pdfVndfIsotropic(nE, viewDir, alpha, normal);
 			break;
 		}
 		else
@@ -748,6 +747,7 @@ Vec3 sampleReflectionVector2(Vec3 viewDir, Vec3 normal, F32 roughness, Vec2 rand
 		}
 	} while(--tryCount);
 
+	pdf = pdfVndfIsotropic(reflDir, viewDir, alpha, normal);
 	return reflDir;
 }
 

+ 11 - 2
AnKi/Shaders/PackFunctions.hlsl

@@ -194,13 +194,22 @@ vector<T, 3> unpackNormalFromGBuffer(vector<T, 4> rt2)
 }
 
 template<typename T>
-T unpackRoughnessFromGBuffer(vector<T, 4> rt1)
+T unpackRoughnessFromGBuffer(vector<T, 4> rt1, T minRoughness)
 {
 	T r = rt1.x;
-	r = r * (T(1) - T(kMinRoughness)) + T(kMinRoughness);
+	if(minRoughness > 0.0)
+	{
+		r = r * (T(1) - T(minRoughness)) + T(minRoughness);
+	}
 	return r;
 }
 
+template<typename T>
+T unpackRoughnessFromGBuffer(vector<T, 4> rt1)
+{
+	return unpackRoughnessFromGBuffer<T>(rt1, kMinRoughness);
+}
+
 // Read part of the G-buffer
 template<typename T>
 void unpackGBufferNoVelocity(vector<T, 4> rt0, vector<T, 4> rt1, vector<T, 4> rt2, out GbufferInfo<T> g)

+ 18 - 0
AnKi/Shaders/RtMaterialFetch.hlsl

@@ -15,3 +15,21 @@ struct [raypayload] RtMaterialFetchRayPayload
 	Vec3 m_emission : write(closesthit, miss): read(caller);
 	F32 m_rayT : write(closesthit, miss): read(caller);
 };
+
+// Have a common resource interface for all shaders
+#if ANKI_RAY_GEN_SHADER
+#	define SPACE space2
+
+ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0, SPACE);
+
+RaytracingAccelerationStructure g_tlas : register(t0, SPACE);
+Texture2D<Vec4> g_depthTex : register(t1, SPACE);
+Texture2D<Vec4> g_gbufferRt1 : register(t2, SPACE);
+Texture2D<Vec4> g_gbufferRt2 : register(t3, SPACE);
+Texture2D<Vec4> g_envMap : register(t4, SPACE);
+
+RWTexture2D<Vec4> g_colorAndPdfTex : register(u0, SPACE);
+RWTexture2D<Vec4> g_hitPosAndDepthTex : register(u1, SPACE);
+
+SamplerState g_linearClampAnySampler : register(s0, SPACE);
+#endif

+ 2 - 11
AnKi/Shaders/RtMaterialFetchDbg.ankiprog

@@ -12,19 +12,10 @@
 // RayGen                                                                    =
 // ===========================================================================
 #if ANKI_RAY_GEN_SHADER
-
-#	define SPACE space2
-
-ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0, SPACE);
-
-RaytracingAccelerationStructure g_tlas : register(t0, SPACE);
-
-RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
-
 [shader("raygeneration")] void main()
 {
 	Vec2 outSize;
-	g_outTex.GetDimensions(outSize.x, outSize.y);
+	g_colorAndPdfTex.GetDimensions(outSize.x, outSize.y);
 	const Vec2 uv = Vec2(DispatchRaysIndex().xy) / outSize;
 
 	const Vec4 v4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(uvToNdc(uv), 1.0, 1.0));
@@ -48,7 +39,7 @@ RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
 	ray.TMax = 100.0; // TODO
 	TraceRay(g_tlas, flags, cullMask, sbtRecordOffset, sbtRecordStride, missIndex, ray, payload);
 
-	g_outTex[DispatchRaysIndex().xy] =
+	g_colorAndPdfTex[DispatchRaysIndex().xy] =
 		Vec4(payload.m_diffuseColor + payload.m_worldNormal * 0.0 + payload.m_rayT * 0.0 + payload.m_emission * 0.0, 0.0);
 }
 #endif // ANKI_RAY_GEN_SHADER

+ 373 - 36
AnKi/Shaders/RtReflections.ankiprog

@@ -4,72 +4,75 @@
 // http://www.anki3d.org/LICENSE
 
 #pragma anki technique RtMaterialFetch rgen miss
+#pragma anki technique SpatialDenoise comp
+#pragma anki technique TemporalDenoise comp
+#pragma anki technique BilateralDenoise comp
 
 #include <AnKi/Shaders/RtMaterialFetch.hlsl>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/PackFunctions.hlsl>
 #include <AnKi/Shaders/LightFunctions.hlsl>
 #include <AnKi/Shaders/ImportanceSampling.hlsl>
+#include <AnKi/Shaders/BilateralFilter.hlsl>
+
+// Config
+constexpr F32 kSpatialUpscalingPcfTexelOffset = 8.0;
+#define SPATIAL_UPSCALING_POISON_KERNEL kPoissonDisk4
 
 // ===========================================================================
 // RayGen                                                                    =
 // ===========================================================================
 #if ANKI_RAY_GEN_SHADER
 
-#	define SPACE space2
-
-ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0, SPACE);
-
-RaytracingAccelerationStructure g_tlas : register(t0, SPACE);
-Texture2D<Vec4> g_depthTex : register(t1, SPACE);
-Texture2D<Vec4> g_gbufferRt1 : register(t2, SPACE);
-Texture2D<Vec4> g_gbufferRt2 : register(t3, SPACE);
-Texture2D<Vec4> g_blueNoiseTex : register(t4, SPACE);
-
-RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
+struct Consts
+{
+	F32 m_maxRayT; // Used as the TMax of the traced rays and as the hit distance when the reflection ray misses
+	F32 m_padding0; // Not read by this shader. Presumably pads the struct to 16 bytes (TODO confirm)
+	F32 m_padding1; // Not read by this shader
+	F32 m_padding2; // Not read by this shader
+};
+ANKI_FAST_CONSTANTS(Consts, g_consts)
 
 [shader("raygeneration")] void main()
 {
 	UVec2 outSize;
-	g_outTex.GetDimensions(outSize.x, outSize.y);
+	g_colorAndPdfTex.GetDimensions(outSize.x, outSize.y);
+
+	const UVec2 coord = min(DispatchRaysIndex().xy, outSize - 1u);
 
-	const F32 depth = g_depthTex[DispatchRaysIndex().xy].x;
-	const Vec4 rt1 = g_gbufferRt1[DispatchRaysIndex().xy];
-	const Vec4 rt2 = g_gbufferRt2[DispatchRaysIndex().xy];
+	const F32 depth = g_depthTex[coord].x;
+	if(depth == 1.0)
+	{
+		g_colorAndPdfTex[coord] = 0.0;
+		g_hitPosAndDepthTex[coord] = 0.0;
+		return;
+	}
+
+	const Vec4 rt1 = g_gbufferRt1[coord];
+	const Vec4 rt2 = g_gbufferRt2[coord];
 
 	const Vec3 worldNormal = unpackNormalFromGBuffer(rt2);
 	const F32 roughness = unpackRoughnessFromGBuffer(rt1);
 
-	const Vec2 ndc = uvToNdc(Vec2(DispatchRaysIndex().xy) / Vec2(outSize));
+	const Vec2 ndc = uvToNdc((Vec2(coord) + 0.5) / Vec2(outSize));
 	const Vec4 v4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(ndc, depth, 1.0));
 	const Vec3 worldPos = v4.xyz / v4.w;
 
 	const DirectionalLight dirLight = g_globalRendererConstants.m_directionalLight;
 
 	// Noise
-	Vec2 randFactors;
-	if(true)
-	{
-		UVec2 noiseTexSize;
-		g_blueNoiseTex.GetDimensions(noiseTexSize.x, noiseTexSize.y);
-
-		Vec3 random = g_blueNoiseTex[DispatchRaysIndex().xy % noiseTexSize].rgb;
-		random = animateBlueNoise(random, g_globalRendererConstants.m_frame % 16u);
-		randFactors = random.xy;
-	}
-	else
-	{
-		randFactors = spatioTemporalNoise(DispatchRaysIndex().xy, g_globalRendererConstants.m_frame);
-	}
+	const UVec3 seed = rand3DPCG16(UVec3(coord, g_globalRendererConstants.m_frame % 8u));
+	const Vec2 randFactors = hammersleyRandom16(g_globalRendererConstants.m_frame % 64u, 64u, seed);
 
 	// Compute refl
 	const Vec3 viewDir = normalize(g_globalRendererConstants.m_cameraPosition - worldPos);
 #	if 1
 	F32 pdf;
-	const Vec3 reflDir = sampleReflectionVector(viewDir, worldNormal, roughness, randFactors, 4, pdf);
+	const Vec3 reflDir = sampleReflectionVectorIsotropic(viewDir, worldNormal, roughness, randFactors, 4, pdf);
 #	else
 	ANKI_MAYBE_UNUSED(roughness);
 	const Vec3 reflDir = reflect(-viewDir, worldNormal);
+	const F32 pdf = 1.0;
 #	endif
 
 	// Trace
@@ -84,7 +87,7 @@ RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
 	ray.Origin = worldPos;
 	ray.TMin = 0.1;
 	ray.Direction = reflDir;
-	ray.TMax = 100.0; // TODO
+	ray.TMax = g_consts.m_maxRayT;
 	TraceRay(g_tlas, flags, cullMask, sbtRecordOffset, sbtRecordStride, missIndex, ray, payload);
 
 	// Trace shadow
@@ -97,14 +100,19 @@ RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
 		ray.Origin = worldPos + reflDir * (payload.m_rayT - 0.01);
 		ray.TMin = 0.1;
 		ray.Direction = -dirLight.m_direction;
-		ray.TMax = 100.0; // TODO
+		ray.TMax = g_consts.m_maxRayT;
 		q.TraceRayInline(g_tlas, qFlags, cullMask, ray);
 		q.Proceed();
 		shadow = (q.CommittedStatus() == COMMITTED_TRIANGLE_HIT) ? 0.0 : 1.0;
 	}
 	else
 	{
+		// Skybox
 		shadow = 1.0;
+		payload.m_rayT = g_consts.m_maxRayT;
+
+		const Vec2 uv = octahedronEncode(worldNormal);
+		payload.m_emission = g_envMap.SampleLevel(g_linearClampAnySampler, uv, 0.0).xyz;
 	}
 
 	// Do simple light shading
@@ -115,8 +123,8 @@ RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
 	const Vec3 diffC = diffuseLobe(payload.m_diffuseColor);
 	outColor += diffC * dirLight.m_diffuseColor * lambert * shadow;
 
-	// g_outTex[DispatchRaysIndex().xy] = Vec4(outColor, 0.0);
-	g_outTex[DispatchRaysIndex().xy] = lerp(Vec4(outColor, 0.0), g_outTex[DispatchRaysIndex().xy], 0.95);
+	g_colorAndPdfTex[coord] = Vec4(outColor, max(0.0, pdf));
+	g_hitPosAndDepthTex[coord] = Vec4(worldPos + reflDir * payload.m_rayT, depth);
 }
 #endif // ANKI_RAY_GEN_SHADER
 
@@ -128,7 +136,336 @@ RWTexture2D<Vec4> g_outTex : register(u0, SPACE);
 {
 	payload.m_diffuseColor = 0.0;
 	payload.m_worldNormal = 0.0;
-	payload.m_emission = Vec3(0.0, 0.0, 0.5); // TODO
+	payload.m_emission = 0.0;
 	payload.m_rayT = -1.0;
 }
 #endif // ANKI_MISS_SHADER
+
+// ===========================================================================
+// SpatialDenoise                                                            =
+// ===========================================================================
+#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_SpatialDenoise
+Texture2D<Vec4> g_colorAndPdfTex : register(t0);
+Texture2D<Vec4> g_hitPosAndDepthTex : register(t1);
+Texture2D<Vec4> g_depthTex : register(t2);
+Texture2D<Vec4> g_gbufferRt1 : register(t3);
+Texture2D<Vec4> g_gbufferRt2 : register(t4);
+
+RWTexture2D<Vec4> g_denoisedTex : register(u0);
+
+ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
+
+#	define NUM_THREADS 64u
+
+// Spatial denoise. Averages the traced color of a pixel with a randomly rotated disk of neighbours. Every neighbour is weighted by the PDF
+// that its hit direction would have if it was sampled from this pixel, multiplied by a depth based bilateral weight.
+// The result goes to g_denoisedTex: color in xyz and reversed depth in w
+[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID, UVec2 svGroupThreadId : SV_GROUPTHREADID,
+								U32 svGroupIndex : SV_GROUPINDEX)
+{
+	UVec2 outSize;
+	g_colorAndPdfTex.GetDimensions(outSize.x, outSize.y);
+
+	const UVec2 coord = min(svDispatchThreadId, outSize - 1);
+
+	const Vec4 colorAndPdf = g_colorAndPdfTex[coord];
+	const Vec3 color = colorAndPdf.xyz;
+	const F32 pdf = colorAndPdf.w;
+
+	// The depth texture is declared as Vec4, pick the channel explicitly instead of relying on implicit truncation
+	const F32 depth = g_depthTex[coord].x;
+
+	const Vec2 ndc = uvToNdc((Vec2(coord) + 0.5) / Vec2(outSize));
+	const Vec4 v4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(ndc, depth, 1.0));
+	const Vec3 worldPos = v4.xyz / v4.w;
+
+	const Vec3 viewDir = normalize(g_globalRendererConstants.m_cameraPosition - worldPos);
+
+	const Vec4 rt1 = g_gbufferRt1[coord];
+	const Vec4 rt2 = g_gbufferRt2[coord];
+	const Vec3 worldNormal = unpackNormalFromGBuffer(rt2);
+	const F32 roughness = unpackRoughnessFromGBuffer(rt1);
+	const F32 alpha = pow2(roughness);
+
+	Vec3 outColor = 0.0;
+
+	if(roughness <= kMinRoughness + kEpsilonF32)
+	{
+		// Smoothest possible surface, keep the traced color as is
+		outColor = color;
+	}
+	else
+	{
+		const UVec3 seed = rand3DPCG16(UVec3(svDispatchThreadId, g_globalRendererConstants.m_frame % 8u));
+		const Vec2 randFactors = hammersleyRandom16(g_globalRendererConstants.m_frame % 64u, 64u, seed);
+
+		// Random rotation of the disk kernel
+		const F32 sinTheta = sin(randFactors.x * 2.0 * kPi);
+		const F32 cosTheta = cos(randFactors.x * 2.0 * kPi);
+
+		const F32 sampleCount = ARRAY_SIZE(SPATIAL_UPSCALING_POISON_KERNEL) + 1.0;
+		F32 avgLuma = computeLuminance(color) / sampleCount;
+
+		// The center sample takes part in the weighted average with its own PDF as weight. Its color needs to be scaled by the same
+		// weight that is added to weightSum, otherwise the average is not normalized
+		outColor = color * pdf;
+		F32 weightSum = pdf;
+		for(U32 i = 0u; i < ARRAY_SIZE(SPATIAL_UPSCALING_POISON_KERNEL); ++i)
+		{
+			const Vec2 diskPoint = SPATIAL_UPSCALING_POISON_KERNEL[i];
+
+			// Rotate the disk point
+			Vec2 rotatedDiskPoint;
+			rotatedDiskPoint.x = diskPoint.x * cosTheta - diskPoint.y * sinTheta;
+			rotatedDiskPoint.y = diskPoint.y * cosTheta + diskPoint.x * sinTheta;
+
+			// Offset calculation
+			const IVec2 newCoord = clamp(IVec2(coord) + rotatedDiskPoint * kSpatialUpscalingPcfTexelOffset, 0, outSize - 1);
+
+			const Vec4 hitPosAndDepth = g_hitPosAndDepthTex[newCoord];
+			const Vec3 hitPos = hitPosAndDepth.xyz;
+			const F32 sampleDepth = hitPosAndDepth.w;
+
+			// PDF of the neighbour's reflected direction if it was generated by this pixel
+			const Vec3 reflectedDir = normalize(hitPos - worldPos);
+			const F32 samplePdf = pdfVndfIsotropic(reflectedDir, viewDir, alpha, worldNormal);
+
+			const Vec3 sampleColor = g_colorAndPdfTex[newCoord].xyz;
+
+			const F32 weight = samplePdf * calculateBilateralWeightDepth(depth, sampleDepth, 1.0);
+
+			outColor += sampleColor * weight;
+			weightSum += weight;
+			avgLuma += computeLuminance(sampleColor) / sampleCount;
+		}
+
+		// All the weights can be zero. Fall back to the traced color instead of dividing by zero
+		outColor = (weightSum > kEpsilonF32) ? outColor / weightSum : color;
+
+		// Remove fireflies
+		const F32 luma = computeLuminance(outColor);
+		if(luma > avgLuma && luma > 0.001)
+		{
+			outColor *= avgLuma / luma;
+		}
+	}
+
+	g_denoisedTex[svDispatchThreadId] = Vec4(outColor, 1.0 - depth); // Store depth in reverse for better precision
+}
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_SpatialDenoise
+
+// ===========================================================================
+// TemporalDenoise                                                           =
+// ===========================================================================
+#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_TemporalDenoise
+SamplerState g_linearAnyClampSampler : register(s0);
+
+Texture2D<Vec4> g_colorAndDepth : register(t0);
+Texture2D<Vec4> g_historyTex : register(t1);
+Texture2D<Vec4> g_momentsHistoryTex : register(t2);
+Texture2D<Vec4> g_motionVectorsTex : register(t3);
+Texture2D<Vec4> g_hitPosTex : register(t4);
+
+RWTexture2D<Vec4> g_outTex : register(u0);
+RWTexture2D<Vec4> g_momentsTex : register(u1);
+
+ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
+
+// Custom history UV calculation that decreases the parallax reprojection effect. It blends a UV derived by reprojecting the reflection hit
+// point with a UV derived from the motion vectors
+Vec2 computeHistoryUv(UVec2 coords, Vec2 uv)
+{
+	// Project the hit point using the current and the previous frame's matrices
+	const Vec4 hitPoint = Vec4(g_hitPosTex[coords].xyz, 1.0);
+
+	const Vec4 crntClip = mul(g_globalRendererConstants.m_matrices.m_viewProjection, hitPoint);
+	const Vec2 crntNdc = crntClip.xy / crntClip.w;
+
+	const Vec4 prevClip = mul(g_globalRendererConstants.m_previousMatrices.m_viewProjection, hitPoint);
+	const Vec2 prevNdc = prevClip.xy / prevClip.w;
+
+	// 1st candidate: offset the UV by the screen space movement of the hit point
+	const Vec2 hitHistoryUv = uv + (ndcToUv(prevNdc) - ndcToUv(crntNdc));
+
+	// 2nd candidate: offset the UV by the motion vectors
+	const Vec2 motionHistoryUv = uv + g_motionVectorsTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0f).xy;
+
+	// The closer the projected hit point is to the center of the view the more the 1st candidate is trusted
+	const F32 distFromCenter = min(max(abs(crntNdc.x), abs(crntNdc.y)), 1.0);
+	const F32 hitFactor = 1 - pow(distFromCenter, 8.0);
+
+	return lerp(motionHistoryUv, hitHistoryUv, hitFactor);
+}
+
+// Temporal denoise. Blurs the current color with its 3x3 neighbourhood, clamps the reprojected history inside the color statistics of that
+// neighbourhood and blends the two. It also accumulates the 1st and 2nd moments of the luminance into g_momentsTex
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	UVec2 textureSize;
+	g_colorAndDepth.GetDimensions(textureSize.x, textureSize.y);
+
+	const UVec2 coord = min(svDispatchThreadId, textureSize - 1);
+	const Vec2 uv = (Vec2(coord) + 0.5f) / textureSize;
+
+	// Keep the clamping of the neighbour coords in signed space. Mixing IVec2 and UVec2 in clamp() may promote the negative coords to
+	// unsigned and make the left/top border read from the opposite side
+	const IVec2 maxCoord = IVec2(textureSize) - 1;
+
+	// Read crnt
+	const Vec4 rgba = g_colorAndDepth[coord];
+	const F32 depth = rgba.w;
+	Vec3 sourceSample = rgba.xyz;
+	F32 weightSum = 1.0;
+	Vec3 m1 = sourceSample;
+	Vec3 m2 = sourceSample * sourceSample;
+	constexpr F32 sampleCount = 9.0;
+	for(I32 x = -1; x <= 1; ++x)
+	{
+		for(I32 y = -1; y <= 1; ++y)
+		{
+			if(x == 0 && y == 0)
+			{
+				continue;
+			}
+
+			const IVec2 newCoords = clamp(IVec2(coord) + IVec2(x, y), IVec2(0, 0), maxCoord);
+
+			const Vec3 neighbor = g_colorAndDepth[newCoords].xyz;
+
+			const F32 weight = 0.5;
+			sourceSample += neighbor * weight;
+			weightSum += weight;
+
+			// Moments for the variance clipping of the history
+			m1 += neighbor;
+			m2 += neighbor * neighbor;
+		}
+	}
+
+	sourceSample /= weightSum;
+
+	// Read history
+	const Vec2 historyUv = computeHistoryUv(coord, uv);
+	Vec3 history = g_historyTex.SampleLevel(g_linearAnyClampSampler, historyUv, 0.0f).xyz;
+
+	// Fix history by clamping it around the mean of the neighbourhood
+	const F32 gamma = 1.0;
+	const Vec3 mu = m1 / sampleCount;
+	const Vec3 sigma = sqrt(abs((m2 / sampleCount) - (mu * mu)));
+	const Vec3 minc = mu - gamma * sigma;
+	const Vec3 maxc = mu + gamma * sigma;
+
+	history = clamp(history, minc, maxc);
+
+	// Blend history and current. The weights are scaled down by the luminance to suppress bright outliers
+	const Vec3 compressedSource = sourceSample * rcp(max3(sourceSample) + 1.0);
+	const Vec3 compressedHistory = history * rcp(max3(history) + 1.0);
+	const F32 luminanceSource = computeLuminance(compressedSource);
+	const F32 luminanceHistory = computeLuminance(compressedHistory);
+
+	F32 sourceWeight = 0.1;
+	F32 historyWeight = 1.0 - sourceWeight;
+	sourceWeight *= 1.0 / (1.0 + luminanceSource);
+	historyWeight *= 1.0 / (1.0 + luminanceHistory);
+
+	const Vec3 finalVal = (sourceSample * sourceWeight + history * historyWeight) / max(sourceWeight + historyWeight, 0.00001);
+
+	// Temporal variance
+	const Vec2 momentsHistory = g_momentsHistoryTex.SampleLevel(g_linearAnyClampSampler, historyUv, 0.0f).xy;
+	Vec2 crntMoments;
+	crntMoments.x = luminanceSource;
+	crntMoments.y = crntMoments.x * crntMoments.x;
+	const Vec2 moments = lerp(crntMoments, momentsHistory, 0.25);
+
+	// Write value
+	g_outTex[svDispatchThreadId] = Vec4(finalVal, depth);
+	g_momentsTex[svDispatchThreadId] = Vec4(moments, 0.0, 0.0);
+}
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_TemporalDenoise
+
+// ===========================================================================
+// BilateralDenoise                                                          =
+// ===========================================================================
+#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_BilateralDenoise
+SamplerState g_linearAnyClampSampler : register(s0);
+Texture2D<Vec4> g_colorAndDepth : register(t0);
+Texture2D<Vec4> g_momentsTex : register(t1);
+Texture2D<Vec4> g_gbufferRt1 : register(t2);
+
+RWTexture2D<Vec4> g_outTex : register(u0);
+
+// Computes the luminance variance around coord as |E[x^2] - E[x]^2|. The moments are read from g_momentsTex (x: 1st moment, y: 2nd moment)
+// and filtered with a 3x3 kernel whose weights sum to 1
+F32 computeVarianceCenter(IVec2 coord, UVec2 textureSize)
+{
+#	if 1
+	const F32 kernel[2][2] = {{1.0 / 4.0, 1.0 / 8.0}, {1.0 / 8.0, 1.0 / 16.0}};
+	const I32 radius = 1;
+
+	// Clamp in signed space. Mixing IVec2 and UVec2 in clamp() may promote the negative coords to unsigned and make the left/top border
+	// read from the opposite side
+	const IVec2 maxCoord = IVec2(textureSize) - 1;
+
+	Vec2 sumMoments = 0.0f;
+	for(I32 yy = -radius; yy <= radius; yy++)
+	{
+		for(I32 xx = -radius; xx <= radius; xx++)
+		{
+			const IVec2 newCoord = clamp(coord + IVec2(xx, yy), IVec2(0, 0), maxCoord);
+
+			const F32 k = kernel[abs(xx)][abs(yy)];
+			sumMoments += g_momentsTex[newCoord].xy * k;
+		}
+	}
+
+	return abs(sumMoments.y - sumMoments.x * sumMoments.x);
+#	else
+	// Unfiltered version
+	Vec2 sumMoments = g_momentsTex[coord].xy;
+	return abs(sumMoments.y - sumMoments.x * sumMoments.x);
+#	endif
+}
+
+// Bilateral denoise. Blurs the temporally denoised color with a gaussian kernel that also respects depth. The kernel radius grows with the
+// roughness and the luminance variance of the pixel and it can be zero, in which case the color passes through untouched
+[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+{
+	UVec2 outSize;
+	g_outTex.GetDimensions(outSize.x, outSize.y);
+
+	const UVec2 coord = min(svDispatchThreadId, outSize - 1);
+	const Vec4 centerRgba = g_colorAndDepth[coord];
+	const F32 refDepth = centerRgba.w;
+	const Vec3 centerColor = centerRgba.xyz;
+
+	// Derive the UV from the clamped coord. The result is written to g_outTex[coord], so a thread that falls outside the texture needs
+	// to compute exactly what the edge thread computes, otherwise the 2 threads race with different values
+	const Vec2 uv = (Vec2(coord) + 0.5) / outSize;
+	const Vec2 texelSize = 1.0 / outSize;
+	const Vec2 halfTexelSize = texelSize / 2.0;
+
+	const F32 variance = sqrt(computeVarianceCenter(coord, outSize)) * 100.0;
+
+	const Vec4 rt1 = g_gbufferRt1[coord];
+	const F32 roughness = unpackRoughnessFromGBuffer<F32>(rt1, 0.0);
+	const F32 sqRoughness = sqrt(roughness);
+
+	constexpr F32 kSamples = 5.0;
+	constexpr F32 kGaussianSigma = 0.55;
+
+	const F32 lerpFactor = sqRoughness * min(1.0, max(sqRoughness, variance));
+
+	// Kernel radius in texels
+	const F32 sampleCount = round(lerp(0, kSamples, lerpFactor));
+
+	Vec3 colorSum = centerColor;
+	F32 weightSum = gaussianWeight2d<F32>(kGaussianSigma, 0.0, 0.0);
+	for(F32 x = -sampleCount; x <= sampleCount; x += 1.0)
+	{
+		for(F32 y = -sampleCount; y <= sampleCount; y += 1.0)
+		{
+			// The center is already in the sums. This also skips the only iteration when sampleCount is zero
+			if(x == 0.0 && y == 0.0)
+			{
+				continue;
+			}
+
+			// The half texel offset places the sample between texels so the linear sampler averages them
+			const Vec2 suv = uv + Vec2(x, y) * texelSize + Vec2(sign(x), sign(y)) * halfTexelSize;
+
+			const Vec4 sampleRgba = g_colorAndDepth.SampleLevel(g_linearAnyClampSampler, suv, 0.0);
+			const F32 sampleDepth = sampleRgba.w;
+			const Vec3 sampleColor = sampleRgba.xyz;
+
+			const F32 gaussianWeight = gaussianWeight2d<F32>(kGaussianSigma, x / sampleCount, y / sampleCount);
+			const F32 depthWeight = calculateBilateralWeightDepth(refDepth, sampleDepth, 1.0);
+			const F32 weight = gaussianWeight * depthWeight;
+
+			colorSum += sampleColor * weight;
+			weightSum += weight;
+		}
+	}
+
+	colorSum /= weightSum;
+
+	g_outTex[coord] = Vec4(colorSum, 1.0);
+}
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_BilateralDenoise

+ 40 - 2
AnKi/Shaders/Sky.ankiprog

@@ -9,6 +9,7 @@
 #pragma anki technique SkyMultipleScatteringLut comp
 #pragma anki technique SkyLut comp
 #pragma anki technique ComputeSunColor comp
+#pragma anki technique ComputeEnvMap comp
 
 #include <AnKi/Shaders/Sky.hlsl>
 
@@ -426,7 +427,6 @@ Vec3 raymarchScattering(Vec3 pos, Vec3 rayDir, Vec3 dirToSun, F32 tMax, F32 numS
 // Compute sun color                                                         =
 // ===========================================================================
 #if ANKI_TECHNIQUE_ComputeSunColor
-
 #	include <AnKi/Shaders/Include/MiscRendererTypes.h>
 #	include <AnKi/Shaders/TonemappingFunctions.hlsl>
 
@@ -440,5 +440,43 @@ globallycoherent RWStructuredBuffer<GlobalRendererConstants> g_globalConstants :
 
 	g_globalConstants[0].m_directionalLight.m_diffuseColor = Vec4(sunPower * sunTransmittance, 0.0f);
 }
-
 #endif // ANKI_TECHNIQUE_ComputeSunColor
+
+// ===========================================================================
+// Compute a cheap env map                                                   =
+// ===========================================================================
+#if ANKI_TECHNIQUE_ComputeEnvMap
+#	include <AnKi/Shaders/Functions.hlsl>
+#	include <AnKi/Shaders/Include/MiscRendererTypes.h>
+
+SamplerState g_linearAnyClampSampler : register(s0);
+
+Texture2D<Vec4> g_skyLut : register(t0);
+
+RWTexture2D<Vec4> g_envMap : register(u0);
+
+ConstantBuffer<GlobalRendererConstants> g_consts : register(b0);
+
+// Fills one texel of the env map. The texel's UV is octahedron decoded to a direction and the sky color of that direction is stored
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+{
+	Vec2 envMapSize;
+	g_envMap.GetDimensions(envMapSize.x, envMapSize.y);
+
+	// Only the threads that map to a texel do work
+	const Vec2 texel = Vec2(svDispatchThreadId);
+	if(all(texel < envMapSize))
+	{
+		const Vec3 eyeToFrag = octahedronDecode((texel + 0.5) / envMapSize);
+
+		const Vec3 skyColor = computeSkyColor(g_skyLut, g_linearAnyClampSampler, eyeToFrag, -g_consts.m_directionalLight.m_direction,
+											  g_consts.m_directionalLight.m_power, true);
+
+		g_envMap[svDispatchThreadId] = Vec4(skyColor, 0.0);
+	}
+}
+
+#endif // ANKI_TECHNIQUE_ComputeEnvMap

+ 1 - 1
AnKi/Shaders/TonemappingFunctions.hlsl

@@ -15,7 +15,7 @@ T log10(T x)
 }
 
 template<typename T>
-vector<T, 3> computeLuminance(vector<T, 3> color)
+T computeLuminance(vector<T, 3> color)
 {
 	return max(dot(vector<T, 3>(0.30, 0.59, 0.11), color), getEpsilon<T>());
 }

+ 1 - 0
AnKi/Shaders/VisualizeRenderTarget.ankiprog

@@ -16,6 +16,7 @@ Texture2D g_inTex : register(t0);
 Vec3 main(VertOut input) : SV_TARGET0
 {
 	const Vec4 rgba = g_inTex.SampleLevel(g_nearestAnyClampSampler, input.m_uv, 0.0);
+	// return (input.m_uv.x > 0.5) ? rgba.xyz : rgba.aaa * 1.0;
 	return rgba.xyz;
 }
 #endif // ANKI_PIXEL_SHADER

+ 3 - 3
Samples/Sponza/Assets/floor_71cbd2644e53ab8c.ankimtl

@@ -14,16 +14,16 @@
 	</shaderProgram>
 
 	<inputs>
-		
+
 		<input name="m_diffuseTex" value="Assets/sponza_floor_a_diff.ankitex"/>
 		<input name="m_diffuseScale" value="1.000000 1.000000 1.000000 1.000000"/>
 		<input name="m_specularScale" value="0.040000 0.040000 0.040000"/>
 		<input name="m_roughnessMetalnessTex" value="Assets/Sponza_Floor_roughness.ankitex"/>
-		<input name="m_roughnessScale" value="1.000000"/>
+		<input name="m_roughnessScale" value="0.150000"/>
 		<input name="m_metalnessScale" value="0.000000"/>
 		<input name="m_normalTex" value="Assets/Sponza_Floor_normal.ankitex"/>
 		<input name="m_emissionScale" value="0.000000 0.000000 0.000000"/>
 		<input name="m_subsurface" value="0.000000"/>
-		
+
 	</inputs>
 </material>