Răsfoiți Sursa

Some SSAO optimizations

Panagiotis Christopoulos Charitos 4 luni în urmă
părinte
comite
9b1da8a106

+ 82 - 35
AnKi/Renderer/Ssao.cpp

@@ -8,6 +8,7 @@
 #include <AnKi/Renderer/GBuffer.h>
 #include <AnKi/Renderer/MotionVectors.h>
 #include <AnKi/Renderer/DepthDownscale.h>
+#include <AnKi/Renderer/HistoryLength.h>
 #include <AnKi/Util/Tracer.h>
 
 namespace anki {
@@ -32,12 +33,14 @@ Error Ssao::init()
 		getRenderer().create2DRenderTargetDescription(rez.x(), rez.y(), Format::kR8G8B8A8_Snorm, "Bent normals + SSAO temp");
 	m_bentNormalsAndSsaoRtDescr.bake();
 
+	const Array<SubMutation, 2> mutation = {
+		{{"SPATIAL_DENOISE_SAMPLE_COUNT", g_ssaoSpatialDenoiseSampleCountCVar}, {"DENOISING_QUARTER_RESOLUTION", g_ssaoQuarterRezCVar}}};
+
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", mutation, m_prog, m_grProg, "Ssao"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", mutation, m_prog, m_spatialDenoiseVerticalGrProg, "SsaoSpatialDenoiseVertical"));
 	ANKI_CHECK(
-		loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", {{"SPATIAL_DENOISE_QUALITY", g_ssaoSpatialQualityCVar}}, m_prog, m_grProg, "Ssao"));
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", {{"SPATIAL_DENOISE_QUALITY", g_ssaoSpatialQualityCVar}}, m_prog,
-								 m_spatialDenoiseGrProg, "SsaoSpatialDenoise"));
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", {{"SPATIAL_DENOISE_QUALITY", g_ssaoSpatialQualityCVar}}, m_prog,
-								 m_tempralDenoiseGrProg, "SsaoTemporalDenoise"));
+		loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", mutation, m_prog, m_spatialDenoiseHorizontalGrProg, "SsaoSpatialDenoiseHorizontal"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/Ssao.ankiprogbin", mutation, m_prog, m_tempralDenoiseGrProg, "SsaoTemporalDenoise"));
 
 	ANKI_CHECK(ResourceManager::getSingleton().loadResource("EngineAssets/BlueNoise_Rgba8_64x64.png", m_noiseImage));
 
@@ -69,6 +72,7 @@ void Ssao::populateRenderGraph(RenderingContext& ctx)
 	}
 
 	m_runCtx.m_finalRt = finalRt;
+	const RenderTargetHandle depthRt = (g_ssaoQuarterRezCVar) ? getDepthDownscale().getRt() : getGBuffer().getDepthRt();
 
 	const RenderTargetHandle bentNormalsAndSsaoTempRt = rgraph.newRenderTarget(m_bentNormalsAndSsaoRtDescr);
 
@@ -101,7 +105,7 @@ void Ssao::populateRenderGraph(RenderingContext& ctx)
 		}
 
 		ppass->newTextureDependency(getGBuffer().getColorRt(2), readUsage);
-		ppass->newTextureDependency((g_ssaoQuarterRezCVar) ? getDepthDownscale().getRt() : getGBuffer().getDepthRt(), readUsage);
+		ppass->newTextureDependency(getDepthDownscale().getRt(), readUsage);
 		ppass->newTextureDependency(finalRt, writeUsage);
 
 		ppass->setWork([this, &ctx, finalRt](RenderPassWorkContext& rgraphCtx) {
@@ -111,15 +115,15 @@ void Ssao::populateRenderGraph(RenderingContext& ctx)
 			cmdb.bindShaderProgram(m_grProg.get());
 
 			rgraphCtx.bindSrv(0, 0, getGBuffer().getColorRt(2));
-			rgraphCtx.bindSrv(1, 0, (g_ssaoQuarterRezCVar) ? getDepthDownscale().getRt() : getGBuffer().getDepthRt());
+			rgraphCtx.bindSrv(1, 0, getDepthDownscale().getRt());
 
 			cmdb.bindSrv(2, 0, TextureView(&m_noiseImage->getTexture(), TextureSubresourceDesc::all()));
 			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearRepeat.get());
 			cmdb.bindSampler(1, 0, getRenderer().getSamplers().m_trilinearClamp.get());
 
-			const UVec2 rez = (g_ssaoQuarterRezCVar) ? getRenderer().getInternalResolution() / 2u : getRenderer().getInternalResolution();
+			const UVec2 rez = getRenderer().getInternalResolution() / 2u;
 
-			SsaoConstants consts;
+			SsaoConstants& consts = *allocateAndBindConstants<SsaoConstants>(cmdb, 0, 0);
 			consts.m_radius = g_ssaoRadiusCVar;
 			consts.m_sampleCount = g_ssaoSampleCountCVar;
 			consts.m_viewportSizef = Vec2(rez);
@@ -131,7 +135,7 @@ void Ssao::populateRenderGraph(RenderingContext& ctx)
 			consts.m_frameCount = getRenderer().getFrameCount() % kMaxU32;
 			consts.m_ssaoPower = g_ssaoPowerCVar;
 			consts.m_viewMat = ctx.m_matrices.m_view;
-			cmdb.setFastConstants(&consts, sizeof(consts));
+			consts.m_viewToWorldMat = ctx.m_matrices.m_cameraTransform;
 
 			if(g_preferComputeCVar)
 			{
@@ -148,43 +152,43 @@ void Ssao::populateRenderGraph(RenderingContext& ctx)
 		});
 	}
 
-	// Spatial denoise
+	// Temporal denoise
 	{
 		RenderPassBase* ppass;
 
 		if(preferCompute)
 		{
-			NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("SSAO spatial denoise");
+			NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("SSAO temporal denoise");
 			ppass = &pass;
 		}
 		else
 		{
-			GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass("SSAO spatial denoise");
+			GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass("SSAO temporal denoise");
 			pass.setRenderpassInfo({GraphicsRenderPassTargetDesc(bentNormalsAndSsaoTempRt)});
 			ppass = &pass;
 		}
 
 		ppass->newTextureDependency(finalRt, readUsage);
-		ppass->newTextureDependency(getGBuffer().getDepthRt(), readUsage);
+		ppass->newTextureDependency(historyRt, readUsage);
+		ppass->newTextureDependency(getMotionVectors().getMotionVectorsRt(), readUsage);
 		ppass->newTextureDependency(bentNormalsAndSsaoTempRt, writeUsage);
+		ppass->newTextureDependency(getHistoryLength().getRt(), readUsage);
 
-		ppass->setWork([this, finalRt, bentNormalsAndSsaoTempRt, &ctx](RenderPassWorkContext& rgraphCtx) {
-			ANKI_TRACE_SCOPED_EVENT(SsaoSpatialDenoise);
+		ppass->setWork([this, bentNormalsAndSsaoTempRt, finalRt, historyRt, &ctx](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(SsaoTemporalDenoise);
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
-			cmdb.bindShaderProgram(m_spatialDenoiseGrProg.get());
+			cmdb.bindShaderProgram(m_tempralDenoiseGrProg.get());
 
 			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
 			rgraphCtx.bindSrv(0, 0, finalRt);
-			rgraphCtx.bindSrv(1, 0, getGBuffer().getDepthRt());
+			rgraphCtx.bindSrv(1, 0, historyRt);
+			rgraphCtx.bindSrv(2, 0, getMotionVectors().getMotionVectorsRt());
+			rgraphCtx.bindSrv(3, 0, getHistoryLength().getRt());
 
-			const UVec2 rez = (g_ssaoQuarterRezCVar) ? getRenderer().getInternalResolution() / 2u : getRenderer().getInternalResolution();
+			cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
 
-			SsaoSpatialDenoiseConstants consts;
-			computeLinearizeDepthOptimal(ctx.m_matrices.m_near, ctx.m_matrices.m_far, consts.m_linearizeDepthParams.x(),
-										 consts.m_linearizeDepthParams.y());
-			consts.m_viewToWorldMat = ctx.m_matrices.m_cameraTransform;
-			cmdb.setFastConstants(&consts, sizeof(consts));
+			const UVec2 rez = (g_ssaoQuarterRezCVar) ? getRenderer().getInternalResolution() / 2u : getRenderer().getInternalResolution();
 
 			if(g_preferComputeCVar)
 			{
@@ -199,37 +203,80 @@ void Ssao::populateRenderGraph(RenderingContext& ctx)
 		});
 	}
 
-	// Temporal denoise
+	// Spatial denoise vertical
 	{
 		RenderPassBase* ppass;
 
 		if(preferCompute)
 		{
-			NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("SSAO temporal denoise");
+			NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("SSAO spatial denoise vertical");
 			ppass = &pass;
 		}
 		else
 		{
-			GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass("SSAO temporal denoise");
-			pass.setRenderpassInfo({GraphicsRenderPassTargetDesc(finalRt)});
+			GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass("SSAO spatial denoise vertical");
+			pass.setRenderpassInfo({GraphicsRenderPassTargetDesc(historyRt)});
 			ppass = &pass;
 		}
 
 		ppass->newTextureDependency(bentNormalsAndSsaoTempRt, readUsage);
+		ppass->newTextureDependency(depthRt, readUsage);
+		ppass->newTextureDependency(historyRt, writeUsage);
+
+		ppass->setWork([this, historyRt, bentNormalsAndSsaoTempRt, depthRt](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(SsaoSpatialDenoise);
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_spatialDenoiseVerticalGrProg.get());
+
+			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
+			rgraphCtx.bindSrv(0, 0, bentNormalsAndSsaoTempRt);
+			rgraphCtx.bindSrv(1, 0, depthRt);
+
+			const UVec2 rez = (g_ssaoQuarterRezCVar) ? getRenderer().getInternalResolution() / 2u : getRenderer().getInternalResolution();
+
+			if(g_preferComputeCVar)
+			{
+				rgraphCtx.bindUav(0, 0, historyRt);
+				dispatchPPCompute(cmdb, 8, 8, rez.x(), rez.y());
+			}
+			else
+			{
+				cmdb.setViewport(0, 0, rez.x(), rez.y());
+				drawQuad(cmdb);
+			}
+		});
+	}
+
+	// Spatial denoise horizontal
+	{
+		RenderPassBase* ppass;
+
+		if(preferCompute)
+		{
+			NonGraphicsRenderPass& pass = rgraph.newNonGraphicsRenderPass("SSAO spatial denoise horizontal");
+			ppass = &pass;
+		}
+		else
+		{
+			GraphicsRenderPass& pass = rgraph.newGraphicsRenderPass("SSAO spatial denoise horizontal");
+			pass.setRenderpassInfo({GraphicsRenderPassTargetDesc(finalRt)});
+			ppass = &pass;
+		}
+
 		ppass->newTextureDependency(historyRt, readUsage);
-		ppass->newTextureDependency(getRenderer().getMotionVectors().getMotionVectorsRt(), readUsage);
+		ppass->newTextureDependency(depthRt, readUsage);
 		ppass->newTextureDependency(finalRt, writeUsage);
 
-		ppass->setWork([this, bentNormalsAndSsaoTempRt, finalRt, historyRt](RenderPassWorkContext& rgraphCtx) {
-			ANKI_TRACE_SCOPED_EVENT(SsaoTemporalDenoise);
+		ppass->setWork([this, historyRt, finalRt, depthRt](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(SsaoSpatialDenoise);
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
-			cmdb.bindShaderProgram(m_tempralDenoiseGrProg.get());
+			cmdb.bindShaderProgram(m_spatialDenoiseHorizontalGrProg.get());
 
 			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
-			rgraphCtx.bindSrv(0, 0, bentNormalsAndSsaoTempRt);
-			rgraphCtx.bindSrv(1, 0, historyRt);
-			rgraphCtx.bindSrv(2, 0, getRenderer().getMotionVectors().getMotionVectorsRt());
+			rgraphCtx.bindSrv(0, 0, historyRt);
+			rgraphCtx.bindSrv(1, 0, depthRt);
 
 			const UVec2 rez = (g_ssaoQuarterRezCVar) ? getRenderer().getInternalResolution() / 2u : getRenderer().getInternalResolution();
 

+ 8 - 2
AnKi/Renderer/Ssao.h

@@ -18,7 +18,12 @@ inline NumericCVar<U32> g_ssaoSampleCountCVar("R", "SsaoSampleCount", 4, 1, 1024
 inline NumericCVar<F32> g_ssaoRadiusCVar("R", "SsaoRadius", 2.0f, 0.1f, 100.0f, "SSAO radius in meters");
 inline BoolCVar g_ssaoQuarterRezCVar("R", "SsaoQuarterResolution", ANKI_PLATFORM_MOBILE, "Render SSAO in quarter rez");
 inline NumericCVar<F32> g_ssaoPowerCVar("R", "SsaoPower", 1.5f, 0.1f, 100.0f, "SSAO power");
-inline NumericCVar<U8> g_ssaoSpatialQualityCVar("R", "SsaoSpatialQuality", (ANKI_PLATFORM_MOBILE) ? 0 : 1, 0, 1, "SSAO spatial denoise quality");
+inline NumericCVar<U8> g_ssaoSpatialDenoiseSampleCountCVar( // Value is forwarded to the SPATIAL_DENOISE_SAMPLE_COUNT shader mutator when the SSAO programs are loaded
+	"R", "SsaoSpatialDenoiseSampleCount", (ANKI_PLATFORM_MOBILE) ? 3 : 9, // Default: 3 on mobile platforms, 9 otherwise
+	[](U8 val) {
+		return val == 3 || val == 5 || val == 7 || val == 9; // Same set as the mutator values declared in Ssao.ankiprog (3 5 7 9); presumably used by NumericCVar to validate new values - confirm
+	},
+	"SSAO spatial denoise quality");
 
 /// Screen space ambient occlusion.
 class Ssao : public RendererObject
@@ -48,7 +53,8 @@ public:
 public:
 	ShaderProgramResourcePtr m_prog;
 	ShaderProgramPtr m_grProg;
-	ShaderProgramPtr m_spatialDenoiseGrProg;
+	ShaderProgramPtr m_spatialDenoiseVerticalGrProg;
+	ShaderProgramPtr m_spatialDenoiseHorizontalGrProg;
 	ShaderProgramPtr m_tempralDenoiseGrProg;
 
 	RenderTargetDesc m_bentNormalsAndSsaoRtDescr;

+ 9 - 2
AnKi/Shaders/Functions.hlsl

@@ -405,9 +405,16 @@ Vec3 colorPerCubeFace(const U32 dir)
 	return color;
 }
 
-Bool incorrectColor(const Vec3 c)
+template<I32 kVecSize>
+Bool isInfOrNan(const vector<F32, kVecSize> c) // Returns true if at least one of the kVecSize components of c is NaN or infinite
 {
-	return isnan(c.x) || isnan(c.y) || isnan(c.z) || isinf(c.x) || isinf(c.y) || isinf(c.z);
+	Bool incorrect = false;
+	[unroll] for(I32 i = 0; i < kVecSize; ++i) // kVecSize is a template constant, so the trip count is known at compile time
+	{
+		incorrect = incorrect || (isnan(c[i]) || isinf(c[i])); // Sticky flag: once set it stays set
+	}
+
+	return incorrect;
 }
 
 F32 areaElement(const F32 x, const F32 y)

+ 0 - 6
AnKi/Shaders/Include/MiscRendererTypes.h

@@ -225,14 +225,8 @@ struct SsaoConstants
 	U32 m_frameCount;
 
 	Mat3x4 m_viewMat;
-};
 
-struct SsaoSpatialDenoiseConstants
-{
 	Mat3x4 m_viewToWorldMat;
-
-	Vec2 m_linearizeDepthParams;
-	Vec2 m_padding;
 };
 
 struct LodAndRenderableIndex

+ 1 - 1
AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog

@@ -828,7 +828,7 @@ RWTexture2D<Vec4> g_outTex : register(u0);
 
 	const F32 refDepth = TEX(g_depthTex, coord);
 
-	F32 weightSum = kEpsilonF32;
+	F32 weightSum = calculateBilateralWeightDepth<F32>(0.0, 0.0, 1.0); // Highest weight that this function can give
 
 	const Vec4 rgba = TEX(g_inTex, coord);
 

+ 173 - 193
AnKi/Shaders/Ssao.ankiprog

@@ -5,10 +5,14 @@
 
 // Ground truth ambient occlusion
 
-#pragma anki mutator SPATIAL_DENOISE_QUALITY 0 1
+#pragma anki 16bit
+
+#pragma anki mutator SPATIAL_DENOISE_SAMPLE_COUNT 3 5 7 9
+#pragma anki mutator DENOISING_QUARTER_RESOLUTION 0 1
 
 #pragma anki technique Ssao vert pixel comp mutators
-#pragma anki technique SsaoSpatialDenoise vert pixel comp
+#pragma anki technique SsaoSpatialDenoiseHorizontal vert pixel comp
+#pragma anki technique SsaoSpatialDenoiseVertical vert pixel comp
 #pragma anki technique SsaoTemporalDenoise vert pixel comp
 
 #include <AnKi/Shaders/QuadVert.hlsl>
@@ -26,15 +30,15 @@
 Texture2D<Vec4> g_gbufferRt2 : register(t0);
 Texture2D<Vec4> g_depthTex : register(t1);
 
-Texture2D<RVec4> g_noiseTex : register(t2);
+Texture2D<Vec4> g_noiseTex : register(t2);
 SamplerState g_trilinearRepeatSampler : register(s0);
 SamplerState g_linearAnyClampSampler : register(s1);
 
 #	if ANKI_COMPUTE_SHADER
-RWTexture2D<RVec4> g_bentNormalsAndSsaoStorageTex : register(u0);
+RWTexture2D<Vec4> g_bentNormalsAndSsaoStorageTex : register(u0);
 #	endif
 
-ANKI_FAST_CONSTANTS(SsaoConstants, g_consts)
+ConstantBuffer<SsaoConstants> g_consts : register(b0);
 
 Vec3 unproject(Vec2 ndc)
 {
@@ -48,105 +52,92 @@ Vec4 project(Vec4 p)
 									  p);
 }
 
-RF32 computeFalloff(RF32 len)
+F16 computeFalloff(F16 len) // Distance falloff: 1 at len == 0, dropping to 0 once len reaches g_consts.m_radius (assumes len >= 0)
 {
-	return sqrt(1.0f - min(1.0f, len / g_consts.m_radius));
+	return sqrt(1.0 - min(1.0, len / F16(g_consts.m_radius))); // min() caps the ratio at 1 so the sqrt argument never goes negative
 }
 
-#	if ANKI_COMPUTE_SHADER
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
-#	else
-RVec4 main(VertOut input) : SV_TARGET0
-#	endif
+HVec4 doWork(Vec2 coord)
 {
-#	if ANKI_COMPUTE_SHADER
-	const Vec2 uv = (Vec2(svDispatchThreadId) + 0.5) / g_consts.m_viewportSizef;
-#	else
-	const UVec2 svDispatchThreadId = input.m_svPosition;
-	ANKI_MAYBE_UNUSED(svDispatchThreadId);
-	const Vec2 uv = input.m_uv;
-#	endif
+	const Vec2 uv = (coord + 0.5) / g_consts.m_viewportSizef;
 
 	const Vec2 ndc = uvToNdc(uv);
 	const F32 depth = g_depthTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0).r;
 	const Vec3 Pc = cheapPerspectiveUnprojection(g_consts.m_unprojectionParameters, ndc, depth);
-	const RVec3 V = normalize(-Pc); // View vector
+	const HVec3 V = normalize(-Pc); // View vector
 
 	// Get noise
-#	if 0
-	Vec2 noiseTexSize;
-	g_noiseTex.GetDimensions(noiseTexSize.x, noiseTexSize.y);
-	const RVec2 noiseUv = Vec2(g_consts.m_viewportSizef) / noiseTexSize * uv;
-	const RVec2 noise2 = animateBlueNoise(g_noiseTex.SampleLevel(g_trilinearRepeatSampler, noiseUv, 0.0).xyz, g_consts.m_frameCount).yx;
+#	if 1
+	const HVec2 noise2 = animateBlueNoise(g_noiseTex[UVec2(coord) % 64].xyz, g_consts.m_frameCount).yx;
 #	else
-	const RVec2 noise2 = spatioTemporalNoise(svDispatchThreadId, g_consts.m_frameCount);
+	const HVec2 noise2 = spatioTemporalNoise(coord, g_consts.m_frameCount);
 #	endif
 
 	// Rand slice direction
-	const RF32 randAng = noise2.x * kPi;
+	const F16 randAng = noise2.x * kPi;
 #	if 0
-	const RF32 aspect = g_consts.m_viewportSizef.x / g_consts.m_viewportSizef.y;
-	const RVec2 dir2d = normalize(Vec2(cos(randAng), sin(randAng)) * Vec2(1.0f, aspect));
+	const F16 aspect = g_consts.m_viewportSizef.x / g_consts.m_viewportSizef.y;
+	const HVec2 dir2d = normalize(Vec2(cos(randAng), sin(randAng)) * Vec2(1.0, aspect));
 #	else
-	const RVec2 dir2d = Vec2(cos(randAng), sin(randAng));
+	const HVec2 dir2d = Vec2(cos(randAng), sin(randAng));
 #	endif
 
 	// Project the view normal to the slice
 	const Vec3 worldNormal = unpackNormalFromGBuffer(g_gbufferRt2.SampleLevel(g_linearAnyClampSampler, uv, 0.0));
-	const RVec3 viewNormal = mul(g_consts.m_viewMat, Vec4(worldNormal, 0.0));
+	const HVec3 viewNormal = mul(g_consts.m_viewMat, Vec4(worldNormal, 0.0));
 
-	const RVec3 directionVec = RVec3(dir2d, 0.0f);
-	const RVec3 orthoDirectionVec = directionVec - (dot(directionVec, V) * V);
-	const RVec3 axisVec = normalize(cross(orthoDirectionVec, V));
-	const RVec3 projectedNormalVec = viewNormal - axisVec * dot(viewNormal, axisVec);
-	const RF32 signNorm = (F32)sign(dot(orthoDirectionVec, projectedNormalVec));
-	const RF32 projectedNormalVecLength = length(projectedNormalVec);
-	const RF32 cosNorm = saturate(dot(projectedNormalVec, V) / projectedNormalVecLength);
-	const RF32 n = -signNorm * fastAcos(cosNorm);
+	const HVec3 directionVec = HVec3(dir2d, 0.0);
+	const HVec3 orthoDirectionVec = directionVec - (dot(directionVec, V) * V);
+	const HVec3 axisVec = normalize(cross(orthoDirectionVec, V));
+	const HVec3 projectedNormalVec = viewNormal - axisVec * dot(viewNormal, axisVec);
+	const F16 signNorm = (F32)sign(dot(orthoDirectionVec, projectedNormalVec));
+	const F16 projectedNormalVecLength = length(projectedNormalVec);
+	const F16 cosNorm = saturate(dot(projectedNormalVec, V) / projectedNormalVecLength);
+	const F16 n = -signNorm * fastAcos(cosNorm);
 
 	// Find the projected radius
 	const Vec3 sphereLimit = Pc + Vec3(g_consts.m_radius, 0.0, 0.0);
 	const Vec4 projSphereLimit = project(Vec4(sphereLimit, 1.0));
 	const Vec2 projSphereLimit2 = projSphereLimit.xy / projSphereLimit.w;
-	const RF32 projRadius = length(projSphereLimit2 - ndc);
+	const F16 projRadius = length(projSphereLimit2 - ndc);
 
 	// Compute the inner integral (Slide 54)
 	const U32 stepCount = max(1u, g_consts.m_sampleCount / 2u);
 
-	const RF32 lowHorizonCos1 = cos(n - kPi / 2.0f);
-	const RF32 lowHorizonCos2 = cos(n + kPi / 2.0f);
+	const F16 lowHorizonCos1 = cos(n - kPi / 2.0);
+	const F16 lowHorizonCos2 = cos(n + kPi / 2.0);
 
-	RF32 cosH1 = lowHorizonCos1;
-	RF32 cosH2 = lowHorizonCos2;
+	F16 cosH1 = lowHorizonCos1;
+	F16 cosH2 = lowHorizonCos2;
 
 	for(U32 i = 0u; i < stepCount; ++i)
 	{
-		const RF32 stepBaseNoise = RF32(i * stepCount) * 0.6180339887498948482;
-		const RF32 stepNoise = frac(noise2.y + stepBaseNoise);
-		RF32 s = (i + stepNoise) / RF32(stepCount);
+		const F16 stepBaseNoise = F16(i * stepCount) * 0.6180339887498948482;
+		const F16 stepNoise = frac(noise2.y + stepBaseNoise);
+		F16 s = (i + stepNoise) / F16(stepCount);
 		s *= s;
 		const Vec2 sampleOffset = dir2d * projRadius * s;
 
 		// h1
 		const Vec3 Ps = unproject(ndc + sampleOffset);
 		const Vec3 Ds = Ps - Pc;
-		const RF32 DsLen = length(Ds);
+		const F16 DsLen = length(Ds);
 		cosH1 = max(cosH1, lerp(lowHorizonCos1, dot(V, Ds) / DsLen, computeFalloff(DsLen)));
 
 		// h2
 		const Vec3 Pt = unproject(ndc - sampleOffset);
 		const Vec3 Dt = Pt - Pc;
-		const RF32 DtLen = length(Dt);
+		const F16 DtLen = length(Dt);
 		cosH2 = max(cosH2, lerp(lowHorizonCos2, dot(V, Dt) / DtLen, computeFalloff(DtLen)));
 	}
 
 	// Compute the h1 and h2
-	const RF32 h1 = n + max(-fastAcos(cosH1) - n, -kPi / 2);
-	const RF32 h2 = n + min(fastAcos(cosH2) - n, kPi / 2);
+	const F16 h1 = n + max(-fastAcos(cosH1) - n, -kPi / 2.0);
+	const F16 h2 = n + min(fastAcos(cosH2) - n, kPi / 2.0);
 
 	// Compute the final value (Slide 61)
-	RF32 Vd = -cos(2.0f * h1 - n) + cos(n) + 2.0f * h1 * sin(n);
-	Vd += -cos(2.0f * h2 - n) + cos(n) + 2.0f * h2 * sin(n);
+	F16 Vd = -cos(2.0 * h1 - n) + cos(n) + 2.0 * h1 * sin(n);
+	Vd += -cos(2.0 * h2 - n) + cos(n) + 2.0 * h2 * sin(n);
 	Vd *= 0.25;
 	Vd *= projectedNormalVecLength;
 
@@ -154,197 +145,186 @@ RVec4 main(VertOut input) : SV_TARGET0
 	Vd = pow(Vd, g_consts.m_ssaoPower);
 
 	// Compute bent normal: see "Algorithm 2 Extension that computes bent normals b."
-	const RF32 t0 =
-		(6.0f * sin(h1 - n) - sin(3.0f * h1 - n) + 6.0f * sin(h2 - n) - sin(3.0f * h2 - n) + 16.0f * sin(n) - 3.0f * (sin(h1 + n) + sin(h2 + n)))
-		/ 12.0f;
-	const RF32 t1 = (-cos(3.0f * h1 - n) - cos(3.0f * h2 - n) + 8.0f * cos(n) - 3.0f * (cos(h1 + n) + cos(h2 + n))) / 12.0f;
-	RVec3 bentNormal = RVec3(-dir2d.x * t0, -dir2d.y * t0, t1);
+	const F16 t0 =
+		(6.0 * sin(h1 - n) - sin(3.0 * h1 - n) + 6.0 * sin(h2 - n) - sin(3.0 * h2 - n) + 16.0 * sin(n) - 3.0 * (sin(h1 + n) + sin(h2 + n))) / 12.0;
+	const F16 t1 = (-cos(3.0 * h1 - n) - cos(3.0 * h2 - n) + 8.0 * cos(n) - 3.0 * (cos(h1 + n) + cos(h2 + n))) / 12.0;
+	HVec3 bentNormal = HVec3(-dir2d.x * t0, -dir2d.y * t0, t1);
 	bentNormal = normalize(bentNormal);
+	bentNormal = mul(g_consts.m_viewToWorldMat, Vec4(bentNormal, 0.0));
+
+	return HVec4(bentNormal, Vd);
+}
 
 #	if ANKI_COMPUTE_SHADER
-	g_bentNormalsAndSsaoStorageTex[svDispatchThreadId] = RVec4(bentNormal, Vd);
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
+{
+	const Vec2 coord = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
+	g_bentNormalsAndSsaoStorageTex[coord] = doWork(coord);
+}
 #	else
-	return RVec4(bentNormal, Vd);
-#	endif
+Vec4 main(VertOut input) : SV_TARGET0
+{
+	const Vec2 coord = floor(input.m_svPosition.xy);
+	return doWork(coord);
 }
-#endif // ANKI_TECHNIQUE_Ssao && (ANKI_COMPUTE_SHADER || ANKI_PIXEL_SHADER)
+#	endif
+#endif
 
 // ===========================================================================
-// SSAO spatial denoise                                                      =
+// SSAO temporal denoise                                                     =
 // ===========================================================================
-#if ANKI_TECHNIQUE_SsaoSpatialDenoise && (ANKI_COMPUTE_SHADER || ANKI_PIXEL_SHADER)
-#	include <AnKi/Shaders/BilateralFilter.hlsl>
-#	include <AnKi/Shaders/Include/MiscRendererTypes.h>
+#if ANKI_TECHNIQUE_SsaoTemporalDenoise && (ANKI_COMPUTE_SHADER || ANKI_PIXEL_SHADER)
 #	include <AnKi/Shaders/Functions.hlsl>
+#	include <AnKi/Shaders/TonemappingFunctions.hlsl>
+#	include <AnKi/Shaders/TonemappingFunctions.hlsl>
+#	include <AnKi/Shaders/Include/MiscRendererTypes.h>
 
 SamplerState g_linearAnyClampSampler : register(s0);
-Texture2D<RVec4> g_bentNormalsAndSsaoTex : register(t0);
-Texture2D<Vec4> g_depthTex : register(t1);
+
+Texture2D<Vec4> g_bentNormalsAndSsaoTex : register(t0);
+Texture2D<Vec4> g_historyBentNormalsAndSsaoTex : register(t1);
+Texture2D<Vec4> g_motionVectorsTex : register(t2);
+Texture2D<Vec4> g_historyLengthTex : register(t3);
+
+ConstantBuffer<GlobalRendererConstants> g_globalRendererConsts : register(b0);
 
 #	if ANKI_COMPUTE_SHADER
-RWTexture2D<RVec4> g_bentNormalsAndSsaoStorageTex : register(u0);
+RWTexture2D<Vec4> g_bentNormalsAndSsaoStorageTex : register(u0);
 #	endif
 
-ANKI_FAST_CONSTANTS(SsaoSpatialDenoiseConstants, g_consts)
-
-F32 computeWeight(F32 depth, F32 refDepth)
+HVec4 doWork(Vec2 coord)
 {
-	const F32 diff = abs(depth - refDepth);
-	return sqrt(1.0 / (0.0003 + diff));
-}
+	Vec2 viewport;
+	g_bentNormalsAndSsaoTex.GetDimensions(viewport.x, viewport.y);
+	const Vec2 uv = (coord + 0.5) / viewport;
 
-void sampleTex(Vec2 uv, IVec2 offset, F32 refDepth, inout RF32 ssao, inout RVec3 bentNormal, inout F32 weight)
-{
-	const F32 linearDepth = linearizeDepthOptimal(g_depthTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0, offset).x,
-												  g_consts.m_linearizeDepthParams.x, g_consts.m_linearizeDepthParams.y);
-	const RVec4 bentNormalAndSsao = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0, offset);
-	const F32 w = computeWeight(refDepth, linearDepth);
-	ssao += bentNormalAndSsao.w * w;
-	bentNormal += bentNormalAndSsao.xyz * w;
-	weight += w;
-}
+	const F16 minBlendFactor = 0.1;
+	const F16 maxBlendFactor = 0.9;
 
-#	if ANKI_COMPUTE_SHADER
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
-#	else
-RVec4 main(VertOut input) : SV_TARGET0
-#	endif
-{
-// Set UVs
-#	if ANKI_COMPUTE_SHADER
-	Vec2 textureSize;
-	U32 mipCount;
-	g_bentNormalsAndSsaoTex.GetDimensions(0, textureSize.x, textureSize.y, mipCount);
-	const Vec2 uv = (Vec2(svDispatchThreadId) + 0.5f) / textureSize;
-#	else
-	const Vec2 uv = input.m_uv;
-#	endif
+	const F16 historyLen = g_historyLengthTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0).x * kMaxHistoryLength;
 
-	// Sample ref
-	const RVec4 refBentNormalAndSsao = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0);
-	RF32 ssao = refBentNormalAndSsao.w;
-	RVec3 bentNormal = refBentNormalAndSsao.xyz;
-	const F32 refDepth = linearizeDepthOptimal(g_depthTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0).x, g_consts.m_linearizeDepthParams.x,
-											   g_consts.m_linearizeDepthParams.y);
-	F32 weight = computeWeight(0.0f, 0.0f); // Highest weight that this function can give
-
-	// Sample taps
-	sampleTex(uv, IVec2(1, 1), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(0, 1), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(-1, 1), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(-1, 0), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(-1, -1), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(0, -1), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(1, -1), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(1, 0), refDepth, ssao, bentNormal, weight);
-
-#	if SPATIAL_DENOISE_QUALITY == 1
-	sampleTex(uv, IVec2(2, 2), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(0, 2), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(-2, 2), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(-2, 0), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(-2, -2), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(0, -2), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(2, -2), refDepth, ssao, bentNormal, weight);
-	sampleTex(uv, IVec2(2, 0), refDepth, ssao, bentNormal, weight);
+	F16 blendFactor = min(1.0, historyLen / 1.0);
+	blendFactor = lerp(maxBlendFactor, minBlendFactor, blendFactor);
+
+	Vec2 uv2 = uv;
+#	if !DENOISING_QUARTER_RESOLUTION
+	uv2 /= 2.0;
 #	endif
+	HVec4 outColor = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv2, 0.0);
+
+	if(blendFactor > maxBlendFactor * 0.9)
+	{
+		// Don't accumulate
+	}
+	else
+	{
+		const Vec2 mv = g_motionVectorsTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0);
+		const Vec2 historyUv =
+			uv + mv
+			+ (g_globalRendererConsts.m_previousMatrices.m_jitterOffsetNdc - g_globalRendererConsts.m_matrices.m_jitterOffsetNdc) / Vec2(2.0, -2.0);
 
-	ssao /= weight;
-	ssao = saturate(ssao);
+		const HVec4 history = g_historyBentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, historyUv, 0.0);
 
-	bentNormal /= weight;
-	bentNormal = normalize(bentNormal);
-	bentNormal = mul(g_consts.m_viewToWorldMat, Vec4(bentNormal, 0.0f));
+		outColor = lerp(history, outColor, blendFactor);
+		outColor.xyz = normalize(outColor.xyz);
+	}
+
+	return outColor;
+}
 
-	// Write value
 #	if ANKI_COMPUTE_SHADER
-	g_bentNormalsAndSsaoStorageTex[svDispatchThreadId] = RVec4(bentNormal, ssao);
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
+{
+	const Vec2 coord = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
+	g_bentNormalsAndSsaoStorageTex[coord] = doWork(coord);
+}
 #	else
-	return RVec4(bentNormal, ssao);
-#	endif
+Vec4 main(VertOut input) : SV_TARGET0
+{
+	const Vec2 coord = floor(input.m_svPosition.xy);
+	return doWork(coord);
 }
+#	endif
 #endif
 
 // ===========================================================================
-// SSAO temporal denoise                                                     =
+// SSAO spatial denoise                                                      =
 // ===========================================================================
-#if ANKI_TECHNIQUE_SsaoTemporalDenoise && (ANKI_COMPUTE_SHADER || ANKI_PIXEL_SHADER)
+#if(ANKI_TECHNIQUE_SsaoSpatialDenoiseHorizontal || ANKI_TECHNIQUE_SsaoSpatialDenoiseVertical) && (ANKI_COMPUTE_SHADER || ANKI_PIXEL_SHADER)
+#	include <AnKi/Shaders/BilateralFilter.hlsl>
+#	include <AnKi/Shaders/Include/MiscRendererTypes.h>
 #	include <AnKi/Shaders/Functions.hlsl>
-#	include <AnKi/Shaders/TonemappingFunctions.hlsl>
+#	include <AnKi/Shaders/BilateralFilter.hlsl>
 
 SamplerState g_linearAnyClampSampler : register(s0);
-Texture2D<RVec4> g_bentNormalsAndSsaoTex : register(t0);
-Texture2D<RVec4> g_historyBentNormalsAndSsaoTex : register(t1);
-Texture2D<Vec4> g_motionVectorsTex : register(t2);
+Texture2D<Vec4> g_bentNormalsAndSsaoTex : register(t0);
+Texture2D<Vec4> g_depthTex : register(t1);
 
 #	if ANKI_COMPUTE_SHADER
-RWTexture2D<RVec4> g_bentNormalsAndSsaoStorageTex : register(u0);
+RWTexture2D<Vec4> g_bentNormalsAndSsaoStorageTex : register(u0);
 #	endif
 
-#	if ANKI_COMPUTE_SHADER
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
-#	else
-RVec4 main(VertOut input) : SV_TARGET0
-#	endif
+F32 depthWeight(F32 refDepth, F32 sampleDepth) // Weight of a tap given the reference depth and the tap's depth
 {
-#	if ANKI_COMPUTE_SHADER
-	Vec2 textureSize;
-	U32 mipCount;
-	g_bentNormalsAndSsaoTex.GetDimensions(0, textureSize.x, textureSize.y, mipCount);
-	const Vec2 uv = (Vec2(svDispatchThreadId) + 0.5f) / textureSize;
-#	else
-	const Vec2 uv = input.m_uv;
-#	endif
+	return calculateBilateralWeightDepth<F32>(refDepth, sampleDepth, 1.0); // Thin wrapper that pins the third argument to 1.0
+}
 
-	const Vec2 historyUv = uv + g_motionVectorsTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0f).xy;
+HVec4 doWork(Vec2 coord, Bool horizontal) // One 1D pass of the depth-weighted blur over bent normal (xyz) + SSAO (w); horizontal picks the X axis, otherwise Y
+{
+	Vec2 viewport;
+	g_bentNormalsAndSsaoTex.GetDimensions(viewport.x, viewport.y); // Input size in texels, used below to clamp tap coordinates
 
-	// Read textures
-	RVec4 history = g_historyBentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, historyUv, 0.0f);
-	RVec4 current = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0.0f);
+	// Sample ref
+	const F32 refDepth = g_depthTex[coord].x;
+	if(refDepth == 1.0) // NOTE(review): depth of exactly 1.0 presumably marks far-plane/sky texels - confirm
+	{
+		return 0.0; // Such texels get an all-zero result and are not filtered
+	}
 
-	// Remove ghosting by clamping the history color to neighbour's AABB
-	const RVec4 near0 = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0, IVec2(1, 0));
-	const RVec4 near1 = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0, IVec2(0, 1));
-	const RVec4 near2 = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0, IVec2(-1, 0));
-	const RVec4 near3 = g_bentNormalsAndSsaoTex.SampleLevel(g_linearAnyClampSampler, uv, 0, IVec2(0, -1));
+	F32 weightSum = depthWeight(0.0, 0.0); // Highest weight that this function can give
 
-#	if 0
-	const RVec4 boxMin = min(current, min4(near0, near1, near2, near3));
-	const RVec4 boxMax = max(current, max4(near0, near1, near2, near3));
-#	else
-	const RVec4 m1 = current + near0 + near1 + near2 + near3;
-	const RVec4 m2 = current * current + near0 * near0 + near1 * near1 + near2 * near2 + near3 * near3;
+	HVec4 bentNormalAndSsao = g_bentNormalsAndSsaoTex[coord];
+	bentNormalAndSsao *= weightSum; // The center texel enters the weighted sum with the maximum weight
 
-	const RVec4 mu = m1 / 5.0;
-	const RVec4 sigma = sqrt(m2 / 5.0 - mu * mu);
+	const F32 halfSampleCount = SPATIAL_DENOISE_SAMPLE_COUNT / 2; // Integer division if the mutator expands to an integer literal, e.g. 9 / 2 == 4 -> offsets in [-4, 4]
+	for(F32 i = -halfSampleCount; i <= halfSampleCount; i += 1.0)
+	{
+		if(i == 0.0) // The center texel was already accumulated before the loop
+		{
+			continue;
+		}
 
-	const F32 varianceClippingGamma = 1.2f;
-	const RVec4 boxMin = mu - varianceClippingGamma * sigma;
-	const RVec4 boxMax = mu + varianceClippingGamma * sigma;
-#	endif
+		Vec2 newCoord = coord + ((horizontal) ? Vec2(i, 0.0) : Vec2(0.0, i)); // Step along the axis selected by horizontal
+		newCoord = clamp(newCoord, 0.0, viewport - 1.0); // Keep the tap inside the texture
 
-	history = clamp(history, boxMin, boxMax);
+		const HVec4 sampleColor = g_bentNormalsAndSsaoTex[newCoord];
+		const F32 sampleDepth = g_depthTex[newCoord].x;
 
-	// Final
-	const RF32 kBlendFactor = 0.1f;
+		const F32 weight = depthWeight(refDepth, sampleDepth);
 
-	const F32 lum0 = computeLuminance(current.xyz) * current.w;
-	const F32 lum1 = computeLuminance(history.xyz) * history.w;
-	const F32 maxLum = 1.0;
+		bentNormalAndSsao += sampleColor * weight;
+		weightSum += weight;
+	}
 
-	RF32 diff = abs(lum0 - lum1) / max(lum0, max(lum1, maxLum + kEpsilonF32));
-	diff = 1.0 - diff;
-	diff = diff * diff;
-	const RF32 feedback = lerp(0.0, kBlendFactor, diff);
+	bentNormalAndSsao /= weightSum; // Normalize by the total accumulated weight
 
-	RVec4 finalVal = lerp(history, current, feedback);
-	finalVal.xyz = normalize(finalVal.xyz);
+	bentNormalAndSsao.w = saturate(bentNormalAndSsao.w); // Clamp the SSAO term to [0, 1]
+	bentNormalAndSsao.xyz = normalize(bentNormalAndSsao.xyz); // The weighted average of normals is generally not unit length
+
+	return bentNormalAndSsao;
+}
 
-	// Write value
 #	if ANKI_COMPUTE_SHADER
-	g_bentNormalsAndSsaoStorageTex[svDispatchThreadId] = finalVal;
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
+{
+	const Vec2 coord = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
+	g_bentNormalsAndSsaoStorageTex[coord] = doWork(coord, ANKI_TECHNIQUE_SsaoSpatialDenoiseHorizontal);
+}
 #	else
-	return finalVal;
-#	endif
+Vec4 main(VertOut input) : SV_TARGET0
+{
+	const Vec2 coord = floor(input.m_svPosition.xy);
+	return doWork(coord, ANKI_TECHNIQUE_SsaoSpatialDenoiseHorizontal);
 }
+#	endif
 #endif

+ 1 - 1
Samples/Common/SampleApp.cpp

@@ -71,7 +71,7 @@ Error SampleApp::userMainLoop(Bool& quit, Second elapsedTime)
 
 	if(in.getKey(KeyCode::kI) == 1)
 	{
-		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "IndirectDiffuse") ? "" : "IndirectDiffuse");
+		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "Ssao") ? "" : "Ssao");
 	}
 
 	if(in.getKey(KeyCode::kO) == 1)

+ 1 - 1
Samples/PhysicsPlayground/Main.cpp

@@ -212,7 +212,7 @@ Error MyApp::userMainLoop(Bool& quit, [[maybe_unused]] Second elapsedTime)
 
 	if(in.getKey(KeyCode::kI) == 1)
 	{
-		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "IndirectDiffuse") ? "" : "IndirectDiffuse");
+		renderer.setCurrentDebugRenderTarget((renderer.getCurrentDebugRenderTarget() == "Ssao") ? "" : "Ssao");
 	}
 
 	if(in.getKey(KeyCode::kO) == 1)