Browse Source

Enable HiZ testing in GPU culling. The HiZ test is not working correctly yet.

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
9ec1b41b45

+ 7 - 5
AnKi/Renderer/GpuVisibility.cpp

@@ -78,12 +78,13 @@ void GpuVisibility::populateRenderGraph(RenderingContext& ctx)
 		cmdb.bindStorageBuffer(0, 2, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
 
 		rpass.bindColorTexture(0, 3, getRenderer().getHiZ().getHiZRt());
+		cmdb.bindSampler(0, 4, getRenderer().getSamplers().m_nearestNearestClamp.get());
 
-		rpass.bindStorageBuffer(0, 4, m_runCtx.m_instanceRateRenderables);
-		rpass.bindStorageBuffer(0, 5, m_runCtx.m_drawIndexedIndirectArgs);
+		rpass.bindStorageBuffer(0, 5, m_runCtx.m_instanceRateRenderables);
+		rpass.bindStorageBuffer(0, 6, m_runCtx.m_drawIndexedIndirectArgs);
 
 		U32* offsets = allocateAndBindStorage<U32*>(
-			sizeof(U32) * RenderStateBucketContainer::getSingleton().getBucketCount(RenderingTechnique::kGBuffer), cmdb, 0, 6);
+			sizeof(U32) * RenderStateBucketContainer::getSingleton().getBucketCount(RenderingTechnique::kGBuffer), cmdb, 0, 7);
 		U32 bucketCount = 0;
 		U32 userCount = 0;
 		RenderStateBucketContainer::getSingleton().iterateBuckets(RenderingTechnique::kGBuffer, [&](const RenderStateInfo&, U32 userCount_) {
@@ -93,9 +94,9 @@ void GpuVisibility::populateRenderGraph(RenderingContext& ctx)
 		});
 		ANKI_ASSERT(userCount == RenderStateBucketContainer::getSingleton().getBucketsItemCount(RenderingTechnique::kGBuffer));
 
-		rpass.bindStorageBuffer(0, 7, m_runCtx.m_mdiDrawCounts);
+		rpass.bindStorageBuffer(0, 8, m_runCtx.m_mdiDrawCounts);
 
-		GpuVisibilityUniforms* unis = allocateAndBindUniforms<GpuVisibilityUniforms*>(sizeof(GpuVisibilityUniforms), cmdb, 0, 8);
+		GpuVisibilityUniforms* unis = allocateAndBindUniforms<GpuVisibilityUniforms*>(sizeof(GpuVisibilityUniforms), cmdb, 0, 9);
 
 		Array<Plane, 6> planes;
 		extractClipPlanes(ctx.m_matrices.m_viewProjection, planes);
@@ -115,6 +116,7 @@ void GpuVisibility::populateRenderGraph(RenderingContext& ctx)
 		unis->m_maxLodDistances[3] = kMaxF32;
 
 		unis->m_cameraOrigin = ctx.m_matrices.m_cameraTransform.getTranslationPart().xyz();
+		unis->m_viewProjectionMat = ctx.m_matrices.m_viewProjection;
 
 		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
 	});

+ 3 - 18
AnKi/Renderer/HiZ.cpp

@@ -50,7 +50,7 @@ Error HiZ::init()
 
 	m_hiZRtDescr = getRenderer().create2DRenderTargetDescription(ConfigSet::getSingleton().getRHiZWidth(), ConfigSet::getSingleton().getRHiZHeight(),
 																 Format::kR32_Uint, "HiZ U32");
-	m_hiZRtDescr.m_mipmapCount = U8(computeMaxMipmapCount2d(m_hiZRtDescr.m_width, m_hiZRtDescr.m_height, 16));
+	m_hiZRtDescr.m_mipmapCount = U8(computeMaxMipmapCount2d(m_hiZRtDescr.m_width, m_hiZRtDescr.m_height, 1));
 	m_hiZRtDescr.bake();
 
 	BufferInitInfo buffInit("HiZCounterBuffer");
@@ -177,23 +177,8 @@ void HiZ::populateRenderGraph(RenderingContext& ctx)
 				rgraphCtx.bindImage(0, 0, m_runCtx.m_hiZRt, subresource, mip);
 			}
 
-			if(mipsToCompute >= 5)
-			{
-				TextureSubresourceInfo subresource;
-				subresource.m_firstMipmap = 4;
-				rgraphCtx.bindImage(0, 1, m_runCtx.m_hiZRt, subresource);
-			}
-			else
-			{
-				// Bind something random that is not the 1st mip
-				TextureSubresourceInfo subresource;
-				subresource.m_firstMipmap = 1;
-				rgraphCtx.bindImage(0, 1, m_runCtx.m_hiZRt, subresource);
-			}
-
-			cmdb.bindStorageBuffer(0, 2, m_mipmapping.m_counterBuffer.get(), 0, kMaxPtrSize);
-
-			rgraphCtx.bindTexture(0, 3, m_runCtx.m_hiZRt, firstMipSubresource);
+			cmdb.bindStorageBuffer(0, 1, m_mipmapping.m_counterBuffer.get(), 0, kMaxPtrSize);
+			rgraphCtx.bindTexture(0, 2, m_runCtx.m_hiZRt, firstMipSubresource);
 
 			cmdb.dispatchCompute(dispatchThreadGroupCountXY[0], dispatchThreadGroupCountXY[1], 1);
 		});

+ 73 - 6
AnKi/Shaders/GpuVisibility.ankiprog

@@ -15,17 +15,18 @@
 [[vk::binding(2)]] ByteAddressBuffer g_gpuScene;
 
 [[vk::binding(3)]] Texture2D<U32> g_hiZTex;
+[[vk::binding(4)]] SamplerState g_nearestAnyClampSampler;
 
 // These 2 have the same size
-[[vk::binding(4)]] RWStructuredBuffer<GpuSceneRenderable> g_instanceRateRenderables;
-[[vk::binding(5)]] RWStructuredBuffer<DrawIndexedIndirectArgs> g_drawIndexedIndirectArgs;
+[[vk::binding(5)]] RWStructuredBuffer<GpuSceneRenderable> g_instanceRateRenderables;
+[[vk::binding(6)]] RWStructuredBuffer<DrawIndexedIndirectArgs> g_drawIndexedIndirectArgs;
 
 // Index pointing to the above arrays. One for each render state bucket
-[[vk::binding(6)]] StructuredBuffer<U32> g_drawIndirectArgsOffsets;
+[[vk::binding(7)]] StructuredBuffer<U32> g_drawIndirectArgsOffsets;
 // The MDI counts. One for each render state bucket
-[[vk::binding(7)]] RWStructuredBuffer<U32> g_mdiDrawCounts;
+[[vk::binding(8)]] RWStructuredBuffer<U32> g_mdiDrawCounts;
 
-[[vk::binding(8)]] ConstantBuffer<GpuVisibilityUniforms> g_unis;
+[[vk::binding(9)]] ConstantBuffer<GpuVisibilityUniforms> g_unis;
 
 [numthreads(64, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
@@ -51,7 +52,72 @@
 		return;
 	}
 
-	// TODO HiZ testing
+	// Screen-space AABB calculation and checking
+	//
+	const Vec3 A = aabb.m_sphereCenter - aabb.m_aabbExtend;
+	const Vec3 B = aabb.m_sphereCenter + aabb.m_aabbExtend;
+	const Vec3 aabbEdges[8u] = {Vec3(A.x, A.y, A.z), Vec3(B.x, A.y, A.z), Vec3(A.x, B.y, A.z), Vec3(A.x, A.y, B.z),
+								Vec3(B.x, B.y, A.z), Vec3(B.x, A.y, B.z), Vec3(A.x, B.y, B.z), Vec3(B.x, B.y, B.z)};
+
+	F32 aabbMinDepth = 1.0f;
+	Vec2 minNdc = 1000.0f;
+	Vec2 maxNdc = -1000.0f;
+	[unroll] for(U32 i = 0; i < 8; ++i)
+	{
+		Vec4 p = mul(g_unis.m_viewProjectionMat, Vec4(aabbEdges[i], 1.0f));
+
+		p.z = max(p.z, 0.0f);
+		p.xyz /= p.w;
+
+		minNdc = min(minNdc, p.xy);
+		maxNdc = max(maxNdc, p.xy);
+		aabbMinDepth = min(aabbMinDepth, p.z);
+	}
+
+	aabbMinDepth = saturate(aabbMinDepth);
+	if(any(minNdc > 1.0f) || any(maxNdc < -1.0f))
+	{
+		return;
+	}
+
+	// HiZ culling
+	//
+
+	// Compute the mip
+	Vec2 texSize;
+	F32 mipCount;
+	g_hiZTex.GetDimensions(0, texSize.x, texSize.y, mipCount);
+
+	const Vec2 minUv = saturate(ndcToUv(minNdc));
+	const Vec2 maxUv = saturate(ndcToUv(maxNdc));
+	const Vec2 sizeXY = (maxUv - minUv) * texSize;
+	F32 mip = ceil(log2(max(sizeXY.x, sizeXY.y)));
+	mip = clamp(mip, 0.0, mipCount - 1.0);
+
+	const F32 levelLower = max(mip - 1.0, 0.0);
+	const Vec2 scale = exp2(-levelLower);
+	const Vec2 a = floor(minUv * scale);
+	const Vec2 b = ceil(maxUv * scale);
+	const Vec2 dims = b - a;
+
+	if(dims.x <= 2.0 && dims.y <= 2.0)
+	{
+		mip = levelLower;
+	}
+
+	// Sample mip
+	Vec4 depths;
+	depths[0] = asfloat(g_hiZTex.SampleLevel(g_nearestAnyClampSampler, minUv, mip));
+	depths[1] = asfloat(g_hiZTex.SampleLevel(g_nearestAnyClampSampler, maxUv, mip));
+	depths[2] = asfloat(g_hiZTex.SampleLevel(g_nearestAnyClampSampler, Vec2(minUv.x, maxUv.y), mip));
+	depths[3] = asfloat(g_hiZTex.SampleLevel(g_nearestAnyClampSampler, Vec2(maxUv.x, minUv.y), mip));
+
+	const F32 maxDepth = max(depths[0], max(depths[1], max(depths[2], depths[3])));
+
+	if(aabbMinDepth > maxDepth)
+	{
+		return;
+	}
 
 	// Compute the LOD
 	//
@@ -71,6 +137,7 @@
 	{
 		lod = 2u;
 	}
+
 	// Add the drawcall
 	//
 	const U32 renderStateBucket = aabb.m_renderableIndexAndRenderStateBucket & ((1u << 12u) - 1u);

+ 5 - 15
AnKi/Shaders/HiZGenPyramid.ankiprog

@@ -18,10 +18,9 @@ struct Uniforms
 
 [[vk::push_constant]] ConstantBuffer<Uniforms> g_uniforms;
 
-[[vk::binding(0)]] RWTexture2D<UVec4> g_dstUavs[12u];
-[[vk::binding(1)]] globallycoherent RWTexture2D<UVec4> g_dstUav5;
-[[vk::binding(2)]] globallycoherent RWStructuredBuffer<U32> g_spdCounter;
-[[vk::binding(3)]] Texture2D<UVec4> g_srcTex;
+[[vk::binding(0)]] globallycoherent RWTexture2D<UVec4> g_dstUavs[12u];
+[[vk::binding(1)]] globallycoherent RWStructuredBuffer<U32> g_spdCounter;
+[[vk::binding(2)]] Texture2D<UVec4> g_srcTex;
 
 // Include SPD
 #define A_GPU 1
@@ -41,22 +40,14 @@ AF4 SpdLoadSourceImage(AU2 p, AU1 slice)
 AF4 SpdLoad(AU2 p, AU1 slice)
 {
 	ANKI_MAYBE_UNUSED(slice);
-	const U32 u = g_dstUav5[p].r;
+	const U32 u = g_dstUavs[5][p].r;
 	return AF4(asfloat(u), 0.0, 0.0, 0.0);
 }
 
 void SpdStore(AU2 p, AF4 value, AU1 mip, AU1 slice)
 {
 	ANKI_MAYBE_UNUSED(slice);
-
-	if(mip == 5u)
-	{
-		g_dstUav5[p] = Vec4(asuint(value.x), 0.0, 0.0, 0.0);
-	}
-	else
-	{
-		g_dstUavs[mip][p] = Vec4(asuint(value.x), 0.0, 0.0, 0.0);
-	}
+	g_dstUavs[mip][p] = Vec4(asuint(value.x), 0.0, 0.0, 0.0);
 }
 
 void SpdIncreaseAtomicCounter(AU1 slice)
@@ -92,7 +83,6 @@ AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3)
 	return AF4(maxDepth, 0.0, 0.0, 0.0);
 }
 
-#define SPD_LINEAR_SAMPLER 1
 #include <ThirdParty/FidelityFX/ffx_spd.h>
 
 [numthreads(256, 1, 1)] void main(UVec3 svGroupId : SV_GROUPID, U32 svGroupIndex : SV_GROUPINDEX)

+ 2 - 0
AnKi/Shaders/Include/MiscRendererTypes.h

@@ -158,6 +158,8 @@ struct GpuVisibilityUniforms
 
 	Vec3 m_cameraOrigin;
 	F32 m_padding2;
+
+	Mat4 m_viewProjectionMat;
 };
 
 ANKI_END_NAMESPACE