Sfoglia il codice sorgente

RT reflection optimizations

Panagiotis Christopoulos Charitos 1 anno fa
parent
commit
bd618bbd3b

+ 33 - 9
AnKi/Renderer/RtReflections.cpp

@@ -41,7 +41,10 @@ Error RtReflections::init()
 
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_spatialDenoisingGrProg, "SpatialDenoise"));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_temporalDenoisingGrProg, "TemporalDenoise"));
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_bilateralDenoisingGrProg, "BilateralDenoise"));
+	ANKI_CHECK(
+		loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_verticalBilateralDenoisingGrProg, "BilateralDenoiseVertical"));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/RtReflections.ankiprogbin", {}, m_rtProg, m_horizontalBilateralDenoisingGrProg,
+								 "BilateralDenoiseHorizontal"));
 
 	m_sbtRecordSize = getAlignedRoundUp(GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment,
 										GrManager::getSingleton().getDeviceCapabilities().m_shaderGroupHandleSize + U32(sizeof(UVec4)));
@@ -55,7 +58,7 @@ Error RtReflections::init()
 	m_transientRtDesc2.bake();
 
 	m_hitPosAndDepthRtDesc = getRenderer().create2DRenderTargetDescription(
-		getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), Format::kR32G32B32A32_Sfloat, "HitPosAndDepth");
+		getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y(), Format::kR16G16B16A16_Sfloat, "HitPosAndDepth");
 	m_hitPosAndDepthRtDesc.bake();
 
 	TextureInitInfo texInit = getRenderer().create2DRenderTargetDescription(
@@ -306,29 +309,50 @@ void RtReflections::populateRenderGraph(RenderingContext& ctx)
 			});
 	}
 
-	// Bilateral filter
+	// Hotizontal bilateral filter
 	{
-		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflectionsBilateral");
+		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflectionsHorizBilateral");
 
 		rpass.newTextureDependency(transientRt1, TextureUsageBit::kSrvCompute);
 		rpass.newTextureDependency(writeMomentsRt, TextureUsageBit::kSrvCompute);
 		rpass.newTextureDependency(getRenderer().getGBuffer().getColorRt(1), TextureUsageBit::kSrvCompute);
 
-		rpass.newTextureDependency(mainRt, TextureUsageBit::kUavCompute);
+		rpass.newTextureDependency(transientRt2, TextureUsageBit::kUavCompute);
 
-		rpass.setWork([this, &ctx, transientRt1, mainRt, writeMomentsRt](RenderPassWorkContext& rgraphCtx) {
+		rpass.setWork([this, &ctx, transientRt1, transientRt2, writeMomentsRt](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtShadows);
 
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
-			cmdb.bindShaderProgram(m_bilateralDenoisingGrProg.get());
-
-			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearClamp.get());
+			cmdb.bindShaderProgram(m_horizontalBilateralDenoisingGrProg.get());
 
 			rgraphCtx.bindSrv(0, 0, transientRt1);
 			rgraphCtx.bindSrv(1, 0, writeMomentsRt);
 			rgraphCtx.bindSrv(2, 0, getRenderer().getGBuffer().getColorRt(1));
 
+			rgraphCtx.bindUav(0, 0, transientRt2);
+
+			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());
+		});
+	}
+
+	// Vertical bilateral filter
+	{
+		NonGraphicsRenderPass& rpass = rgraph.newNonGraphicsRenderPass("RtReflectionsVertBilateral");
+
+		rpass.newTextureDependency(transientRt2, TextureUsageBit::kSrvCompute);
+
+		rpass.newTextureDependency(mainRt, TextureUsageBit::kUavCompute);
+
+		rpass.setWork([this, &ctx, transientRt2, mainRt](RenderPassWorkContext& rgraphCtx) {
+			ANKI_TRACE_SCOPED_EVENT(RtShadows);
+
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_verticalBilateralDenoisingGrProg.get());
+
+			rgraphCtx.bindSrv(0, 0, transientRt2);
+
 			rgraphCtx.bindUav(0, 0, mainRt);
 
 			dispatchPPCompute(cmdb, 8, 8, getRenderer().getInternalResolution().x(), getRenderer().getInternalResolution().y());

+ 2 - 1
AnKi/Renderer/RtReflections.h

@@ -47,7 +47,8 @@ public:
 	ShaderProgramPtr m_libraryGrProg;
 	ShaderProgramPtr m_spatialDenoisingGrProg;
 	ShaderProgramPtr m_temporalDenoisingGrProg;
-	ShaderProgramPtr m_bilateralDenoisingGrProg;
+	ShaderProgramPtr m_verticalBilateralDenoisingGrProg;
+	ShaderProgramPtr m_horizontalBilateralDenoisingGrProg;
 
 	RenderTargetDesc m_transientRtDesc1;
 	RenderTargetDesc m_transientRtDesc2;

+ 7 - 0
AnKi/Shaders/Common.hlsl

@@ -225,6 +225,13 @@ DEFINE_COMPARISON2(max)
 #undef DEFINE_COMPARISON2
 #undef DEFINE_COMPARISON
 
+// Trick intellisense
+#if defined(__INTELLISENSE__)
+#	define NOT_ZERO(exr) (1)
+#else
+#	define NOT_ZERO(exr) ((exr) != 0)
+#endif
+
 template<typename T>
 T pow2(T x)
 {

+ 206 - 76
AnKi/Shaders/RtReflections.ankiprog

@@ -6,7 +6,8 @@
 #pragma anki technique RtMaterialFetch rgen miss
 #pragma anki technique SpatialDenoise comp
 #pragma anki technique TemporalDenoise comp
-#pragma anki technique BilateralDenoise comp
+#pragma anki technique BilateralDenoiseVertical comp
+#pragma anki technique BilateralDenoiseHorizontal comp
 
 #include <AnKi/Shaders/RtMaterialFetch.hlsl>
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
@@ -18,6 +19,8 @@
 // Config
 constexpr F32 kSpatialUpscalingPcfTexelOffset = 8.0;
 #define SPATIAL_UPSCALING_POISON_KERNEL kPoissonDisk4
+constexpr F32 kMaxBilateralSamples = 5.0;
+constexpr F32 kGaussianSigma = 0.55;
 
 // ===========================================================================
 // RayGen                                                                    =
@@ -71,6 +74,7 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 	const Vec3 reflDir = sampleReflectionVectorIsotropic(viewDir, worldNormal, roughness, randFactors, 4, pdf);
 #	else
 	ANKI_MAYBE_UNUSED(roughness);
+	ANKI_MAYBE_UNUSED(randFactors);
 	const Vec3 reflDir = reflect(-viewDir, worldNormal);
 	const F32 pdf = 1.0;
 #	endif
@@ -124,7 +128,11 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 	outColor += diffC * dirLight.m_diffuseColor * lambert * shadow;
 
 	g_colorAndPdfTex[coord] = Vec4(outColor, max(0.0, pdf));
-	g_hitPosAndDepthTex[coord] = Vec4(worldPos + reflDir * payload.m_rayT, depth);
+
+	Vec3 hitPos = worldPos + reflDir * payload.m_rayT;
+	hitPos -= g_globalRendererConstants.m_cameraPosition; // Move it with camera to avoid precision issues since it's stored in fp16
+
+	g_hitPosAndDepthTex[coord] = Vec4(hitPos, 1.0 - depth); // Store depth in reverse for better precision
 }
 #endif // ANKI_RAY_GEN_SHADER
 
@@ -144,7 +152,7 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 // ===========================================================================
 // SpatialDenoise                                                            =
 // ===========================================================================
-#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_SpatialDenoise
+#if ANKI_COMPUTE_SHADER && NOT_ZERO(ANKI_TECHNIQUE_SpatialDenoise)
 Texture2D<Vec4> g_colorAndPdfTex : register(t0);
 Texture2D<Vec4> g_hitPosAndDepthTex : register(t1);
 Texture2D<Vec4> g_depthTex : register(t2);
@@ -170,6 +178,11 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 	const F32 pdf = rgba.w;
 
 	const F32 depth = g_depthTex[coord];
+	if(depth == 1.0)
+	{
+		g_denoisedTex[svDispatchThreadId] = 0.0;
+		return;
+	}
 
 	const Vec2 ndc = uvToNdc((Vec2(coord) + 0.5) / Vec2(outSize));
 	const Vec4 v4 = mul(g_globalRendererConstants.m_matrices.m_invertedViewProjectionJitter, Vec4(ndc, depth, 1.0));
@@ -178,8 +191,6 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 	const Vec3 viewDir = normalize(g_globalRendererConstants.m_cameraPosition - worldPos);
 
 	const Vec4 rt1 = g_gbufferRt1[coord];
-	const Vec4 rt2 = g_gbufferRt2[coord];
-	const Vec3 worldNormal = unpackNormalFromGBuffer(rt2);
 	const F32 roughness = unpackRoughnessFromGBuffer(rt1);
 	const F32 alpha = pow2(roughness);
 
@@ -191,6 +202,9 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 	}
 	else
 	{
+		const Vec4 rt2 = g_gbufferRt2[coord];
+		const Vec3 worldNormal = unpackNormalFromGBuffer(rt2);
+
 		const UVec3 seed = rand3DPCG16(UVec3(svDispatchThreadId, g_globalRendererConstants.m_frame % 8u));
 		const Vec2 randFactors = hammersleyRandom16(g_globalRendererConstants.m_frame % 64u, 64u, seed);
 
@@ -214,19 +228,22 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 			const IVec2 newCoord = clamp(IVec2(coord) + rotatedDiskPoint * kSpatialUpscalingPcfTexelOffset, 0, outSize - 1);
 
 			rgba = g_hitPosAndDepthTex[newCoord];
-			const F32 sampleDepth = rgba.w;
-			const Vec3 hitPos = rgba.xyz;
+			const F32 sampleDepth = 1.0 - rgba.w;
+			const Vec3 hitPos = rgba.xyz + g_globalRendererConstants.m_cameraPosition;
 
 			const Vec3 reflectedDir = normalize(hitPos - worldPos);
 			const F32 pdf = pdfVndfIsotropic(reflectedDir, viewDir, alpha, worldNormal);
 
-			const Vec3 sampleColor = g_colorAndPdfTex[newCoord].xyz;
-
 			const F32 weight = pdf * calculateBilateralWeightDepth(depth, sampleDepth, 1.0);
 
-			outColor += sampleColor * weight;
-			weightSum += weight;
-			avgLuma += computeLuminance(sampleColor) / sampleCount;
+			if(weight > 0.001)
+			{
+				const Vec3 sampleColor = g_colorAndPdfTex[newCoord].xyz;
+
+				outColor += sampleColor * weight;
+				weightSum += weight;
+				avgLuma += computeLuminance(sampleColor) / sampleCount;
+			}
 		}
 
 		outColor = outColor / weightSum;
@@ -246,7 +263,7 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 // ===========================================================================
 // TemporalDenoise                                                           =
 // ===========================================================================
-#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_TemporalDenoise
+#if ANKI_COMPUTE_SHADER && NOT_ZERO(ANKI_TECHNIQUE_TemporalDenoise)
 SamplerState g_linearAnyClampSampler : register(s0);
 
 Texture2D<Vec4> g_colorAndDepth : register(t0);
@@ -264,12 +281,12 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 Vec2 computeHistoryUv(UVec2 coords, Vec2 uv)
 {
 	// Compute the history UV by reprojecting the hit point
-	const Vec3 worldPos = g_hitPosTex[coords].xyz;
+	const Vec3 hitWorldPos = g_hitPosTex[coords].xyz + g_globalRendererConstants.m_cameraPosition;
 
-	Vec4 clipPos = mul(g_globalRendererConstants.m_matrices.m_viewProjection, Vec4(worldPos, 1.0));
+	Vec4 clipPos = mul(g_globalRendererConstants.m_matrices.m_viewProjection, Vec4(hitWorldPos, 1.0));
 	clipPos.xy /= clipPos.w;
 
-	Vec4 prevClipPos = mul(g_globalRendererConstants.m_previousMatrices.m_viewProjection, Vec4(worldPos, 1.0));
+	Vec4 prevClipPos = mul(g_globalRendererConstants.m_previousMatrices.m_viewProjection, Vec4(hitWorldPos, 1.0));
 	prevClipPos.xy /= prevClipPos.w;
 
 	const Vec2 diff = ndcToUv(prevClipPos.xy) - ndcToUv(clipPos.xy);
@@ -289,41 +306,20 @@ Vec2 computeHistoryUv(UVec2 coords, Vec2 uv)
 	return historyUv;
 }
 
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
+void accumulateSourceColor(Vec2 newUv, Vec4 texelWeights, inout Vec3 m1, inout Vec3 m2, inout Vec3 sourceSample, inout Vec3 neighboorMin,
+						   inout Vec3 neighboorMax)
 {
-	UVec2 textureSize;
-	g_colorAndDepth.GetDimensions(textureSize.x, textureSize.y);
+	const Vec4 red = g_colorAndDepth.GatherRed(g_linearAnyClampSampler, newUv);
+	const Vec4 green = g_colorAndDepth.GatherGreen(g_linearAnyClampSampler, newUv);
+	const Vec4 blue = g_colorAndDepth.GatherBlue(g_linearAnyClampSampler, newUv);
 
-	const UVec2 coord = min(svDispatchThreadId, textureSize - 1);
-	const Vec2 uv = (Vec2(coord) + 0.5f) / textureSize;
-
-	// Read crnt
-	Vec4 rgba = g_colorAndDepth[coord];
-	const F32 depth = rgba.w;
-	Vec3 sourceSample = rgba.xyz;
-	Vec3 neighboorMin = sourceSample;
-	Vec3 neighboorMax = sourceSample;
-	F32 weightSum = 1.0;
-	Vec3 m1 = sourceSample;
-	Vec3 m2 = sourceSample * sourceSample;
-	constexpr F32 sampleCount = 9.0;
-	for(I32 x = -1; x <= 1; ++x)
+	[unroll] for(U32 c = 0; c < 4; ++c)
 	{
-		for(I32 y = -1; y <= 1; ++y)
+		if(texelWeights[c] > 0.0)
 		{
-			if(x == 0 && y == 0)
-			{
-				continue;
-			}
-
-			IVec2 newCoords = IVec2(coord) + IVec2(x, y);
-			newCoords = clamp(newCoords, 0, textureSize - 1);
-
-			const Vec3 neighbor = g_colorAndDepth[newCoords].xyz;
+			const Vec3 neighbor = Vec3(red[c], green[c], blue[c]);
 
-			const F32 weight = 0.5;
-			sourceSample += neighbor * weight;
-			weightSum += weight;
+			sourceSample += neighbor * texelWeights[c];
 
 			neighboorMin = min(neighboorMin, neighbor);
 			neighboorMax = max(neighboorMax, neighbor);
@@ -332,14 +328,88 @@ Vec2 computeHistoryUv(UVec2 coords, Vec2 uv)
 			m2 += neighbor * neighbor;
 		}
 	}
+}
+
+void accumulateSourceColor(IVec2 coord, IVec2 textureSize, F32 weight, inout Vec3 m1, inout Vec3 m2, inout Vec3 sourceSample, inout Vec3 neighboorMin,
+						   inout Vec3 neighboorMax)
+{
+	coord = clamp(coord, 0, textureSize - 1);
+
+	const Vec3 neighbor = g_colorAndDepth[coord].xyz;
+
+	sourceSample += neighbor * weight;
+
+	neighboorMin = min(neighboorMin, neighbor);
+	neighboorMax = max(neighboorMax, neighbor);
+
+	m1 += neighbor;
+	m2 += neighbor * neighbor;
+}
+
+void computeSourceColor(Vec2 uv, IVec2 coord, IVec2 textureSize, out Vec3 m1, out Vec3 m2, out Vec3 sourceSample, out Vec3 neighboorMin,
+						out Vec3 neighboorMax)
+{
+	sourceSample = 0.0;
+	neighboorMin = 1000.0;
+	neighboorMax = -1000.0;
+	m1 = 0.0;
+	m2 = 0.0;
+
+	const Vec2 texelSize = 1.0 / textureSize;
+	const Vec2 halfTexelSize = texelSize / 2.0;
+
+	// Positioning mentioned bellow is in screen space (bottom left is in the bottom left of the screen)
+	// Alogithm wants to sample 9 taps of this:
+	// +-+-+-+
+	// |6|7|8|
+	// +-+-+-+
+	// |3|4|5|
+	// +-+-+-+
+	// |0|1|2|
+	// +-+-+-+
+	// "uv" points to the middle of 4
+
+	// Bottom left (0, 1, 4, 3)
+	Vec2 newUv = uv + Vec2(-halfTexelSize.x, +halfTexelSize.y);
+	accumulateSourceColor(newUv, Vec4(0.5, 0.5, 1.0, 0.5), m1, m2, sourceSample, neighboorMin, neighboorMax);
+
+	// Top right (4, 5, 8, 7)
+	newUv = uv + Vec2(+halfTexelSize.x, -halfTexelSize.y);
+	accumulateSourceColor(newUv, Vec4(0.0, 0.5, 0.5, 0.5), m1, m2, sourceSample, neighboorMin, neighboorMax);
+
+	// Top left
+	accumulateSourceColor(coord + IVec2(-1, -1), textureSize, 0.5, m1, m2, sourceSample, neighboorMin, neighboorMax);
+
+	// Bottom right
+	accumulateSourceColor(coord + IVec2(+1, +1), textureSize, 0.5, m1, m2, sourceSample, neighboorMin, neighboorMax);
+
+	// Misc
+	sourceSample /= 1.0 + 0.5 * 8.0;
+}
+
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	UVec2 textureSize;
+	g_colorAndDepth.GetDimensions(textureSize.x, textureSize.y);
+
+	const UVec2 coord = min(svDispatchThreadId, textureSize - 1);
+	const Vec2 uv = (Vec2(coord) + 0.5f) / textureSize;
 
-	sourceSample /= weightSum;
+	// Read crnt
+	const F32 depth = g_colorAndDepth[coord].w;
+	Vec3 sourceSample = 0.0;
+	Vec3 neighboorMin = 0.0;
+	Vec3 neighboorMax = 0.0;
+	Vec3 m1 = 0.0;
+	Vec3 m2 = 0.0;
+	computeSourceColor(uv, coord, textureSize, m1, m2, sourceSample, neighboorMin, neighboorMax);
 
 	// Read history
 	const Vec2 historyUv = computeHistoryUv(coord, uv);
 	Vec3 history = g_historyTex.SampleLevel(g_linearAnyClampSampler, historyUv, 0.0f);
 
 	// Fix history
+	constexpr F32 sampleCount = 9.0;
 	const F32 gamma = 1.0;
 	const Vec3 mu = m1 / sampleCount;
 	const Vec3 sigma = sqrt(abs((m2 / sampleCount) - (mu * mu)));
@@ -369,16 +439,15 @@ Vec2 computeHistoryUv(UVec2 coords, Vec2 uv)
 	const Vec2 moments = lerp(crntMoments, momentsHistory, 0.25);
 
 	// Write value
-	g_outTex[svDispatchThreadId] = Vec4(finalVal, depth);
-	g_momentsTex[svDispatchThreadId] = Vec4(moments, 0.0, 0.0);
+	g_outTex[coord] = Vec4(finalVal, depth);
+	g_momentsTex[coord] = Vec4(moments, 0.0, 0.0);
 }
 #endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_TemporalDenoise
 
 // ===========================================================================
-// BilateralDenoise                                                          =
+// BilateralDenoiseHorizontal                                                =
 // ===========================================================================
-#if ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_BilateralDenoise
-SamplerState g_linearAnyClampSampler : register(s0);
+#if ANKI_COMPUTE_SHADER && NOT_ZERO(ANKI_TECHNIQUE_BilateralDenoiseHorizontal)
 Texture2D<Vec4> g_colorAndDepth : register(t0);
 Texture2D<Vec4> g_momentsTex : register(t1);
 Texture2D<Vec4> g_gbufferRt1 : register(t2);
@@ -421,51 +490,112 @@ F32 computeVarianceCenter(IVec2 coord, UVec2 textureSize)
 	const F32 refDepth = rgba.w;
 	const Vec3 centerColor = rgba.xyz;
 
-	const Vec2 uv = (Vec2(svDispatchThreadId) + 0.5) / outSize;
-	const Vec2 texelSize = 1.0 / outSize;
-	const Vec2 halfTexelSize = texelSize / 2.0;
-
 	const F32 variance = sqrt(computeVarianceCenter(coord, outSize)) * 100.0;
 
 	const Vec4 rt1 = g_gbufferRt1[coord];
 	const F32 roughness = unpackRoughnessFromGBuffer<F32>(rt1, 0.0);
 	const F32 sqRoughness = sqrt(roughness);
 
-	constexpr F32 kSamples = 5.0;
-	constexpr F32 kGaussianSigma = 0.55;
-
 	const F32 lerpFactor = sqRoughness * min(1.0, max(sqRoughness, variance));
 
-	const F32 sampleCount = round(lerp(0, kSamples, lerpFactor));
+	const F32 sampleCount = round(lerp(0, kMaxBilateralSamples, lerpFactor));
 
-	Vec3 colorSum = centerColor;
 	F32 weightSum = gaussianWeight2d<F32>(kGaussianSigma, 0.0, 0.0);
+	Vec3 colorSum = centerColor * weightSum;
 	for(F32 x = -sampleCount; x <= sampleCount; x += 1.0)
 	{
-		for(F32 y = -sampleCount; y <= sampleCount; y += 1.0)
+		if(x == 0.0)
 		{
-			if(x == 0.0 && y == 0.0)
-			{
-				continue;
-			}
+			continue;
+		}
+
+		IVec2 newCoord = coord + IVec2(x, 0);
+		newCoord.x = clamp(newCoord.x, 0, outSize.x - 1);
+
+		rgba = g_colorAndDepth[newCoord];
+		const F32 sampleDepth = rgba.w;
+		const Vec3 sampleColor = rgba.xyz;
+
+		const F32 gWeight = gaussianWeight<F32>(kGaussianSigma, x / sampleCount);
+		const F32 depthWeight = calculateBilateralWeightDepth(refDepth, sampleDepth, 1.0);
+		const F32 weight = gWeight * depthWeight;
+
+		colorSum += sampleColor * weight;
+		weightSum += weight;
+	}
+
+	colorSum /= weightSum;
+
+	// Encode the step count in the signs of the out color
+	const U32 sampleCountu = sampleCount;
+	Vec4 signs;
+	[unroll] for(U32 i = 0; i < 4; i++)
+	{
+		signs[i] = (sampleCountu & (1u << i)) ? 1.0 : -1.0;
+	}
+
+	g_outTex[coord] = Vec4(colorSum, refDepth) * signs;
+}
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_BilateralDenoiseHorizontal
 
-			const Vec2 suv = uv + Vec2(x, y) * texelSize + Vec2(sign(x), sign(y)) * halfTexelSize;
+// ===========================================================================
+// BilateralDenoiseVertical                                                  =
+// ===========================================================================
+#if ANKI_COMPUTE_SHADER && NOT_ZERO(ANKI_TECHNIQUE_BilateralDenoiseVertical)
+Texture2D<Vec4> g_colorAndDepthAndSampleCount : register(t0);
+
+RWTexture2D<Vec4> g_outTex : register(u0);
 
-			rgba = g_colorAndDepth.SampleLevel(g_linearAnyClampSampler, suv, 0.0);
-			const F32 sampleDepth = rgba.w;
-			const Vec3 sampleColor = rgba.xyz;
+F32 decodeSampleCount(Vec4 rgba)
+{
+	U32 sampleCountu = 0;
+	[unroll] for(U32 i = 0; i < 4; ++i)
+	{
+		sampleCountu |= (sign(rgba[i]) > 0.0) ? (1u << i) : 0u;
+	}
 
-			const F32 gaussianWeight = gaussianWeight2d<F32>(kGaussianSigma, x / sampleCount, y / sampleCount);
-			const F32 depthWeight = calculateBilateralWeightDepth(refDepth, sampleDepth, 1.0);
-			const F32 weight = gaussianWeight * depthWeight;
+	return sampleCountu;
+}
 
-			colorSum += sampleColor * weight;
-			weightSum += weight;
+[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+{
+	UVec2 outSize;
+	g_outTex.GetDimensions(outSize.x, outSize.y);
+
+	const UVec2 coord = min(svDispatchThreadId, outSize - 1);
+
+	Vec4 rgba = g_colorAndDepthAndSampleCount[coord];
+	const F32 sampleCount = decodeSampleCount(rgba);
+	rgba = abs(rgba);
+	const F32 refDepth = rgba.w;
+	const Vec3 refColor = rgba.xyz;
+
+	F32 weightSum = gaussianWeight<F32>(kGaussianSigma, 0.0);
+	Vec3 colorSum = refColor * weightSum;
+	for(F32 y = -sampleCount; y <= sampleCount; y += 1.0)
+	{
+		if(y == 0.0)
+		{
+			continue;
 		}
+
+		IVec2 newCoord = coord + IVec2(0.0, y);
+		newCoord.y = clamp(newCoord.y, 0, outSize.y - 1);
+
+		rgba = abs(g_colorAndDepthAndSampleCount[newCoord]);
+		const F32 sampleDepth = rgba.w;
+		const Vec3 sampleColor = rgba.xyz;
+
+		const F32 gWeight = gaussianWeight<F32>(kGaussianSigma, y / sampleCount);
+		const F32 depthWeight = calculateBilateralWeightDepth(refDepth, sampleDepth, 1.0);
+		const F32 weight = gWeight * depthWeight;
+
+		colorSum += sampleColor * weight;
+		weightSum += weight;
 	}
 
 	colorSum /= weightSum;
 
 	g_outTex[coord] = Vec4(colorSum, 1.0);
 }
-#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_BilateralDenoise
+#endif // ANKI_COMPUTE_SHADER && ANKI_TECHNIQUE_BilateralDenoiseVertical