Переглянути джерело

Add all culling tests to task shader

Panagiotis Christopoulos Charitos 2 роки тому
батько
коміт
cd1238191b

+ 1 - 1
AnKi/Importer/GltfImporterMesh.cpp

@@ -370,7 +370,7 @@ static void generateMeshlets(SubMesh& submesh)
 	meshlets.resize(maxMeshlets);
 
 	// Meshletize
-	constexpr F32 coneWeight = 0.7f;
+	constexpr F32 coneWeight = 0.0f;
 	const U32 meshletCount =
 		U32(meshopt_buildMeshlets(meshlets.getBegin(), indicesToVertexBuffer.getBegin(), localIndices.getBegin(), submesh.m_indices.getBegin(),
 								  submesh.m_indices.getSize(), &submesh.m_verts[0].m_position.x(), submesh.m_verts.getSize(), sizeof(TempVertex),

+ 14 - 1
AnKi/Renderer/GBuffer.cpp

@@ -60,7 +60,7 @@ Error GBuffer::initInternal()
 	}
 
 	{
-		const TextureUsageBit usage = TextureUsageBit::kSampledCompute | TextureUsageBit::kUavComputeWrite;
+		const TextureUsageBit usage = TextureUsageBit::kSampledCompute | TextureUsageBit::kUavComputeWrite | TextureUsageBit::kSampledGeometry;
 
 		TextureInitInfo texinit =
 			getRenderer().create2DRenderTargetInitInfo(g_hzbWidthCVar.get(), g_hzbHeightCVar.get(), Format::kR32_Sfloat, usage, "GBuffer HZB");
@@ -128,6 +128,14 @@ void GBuffer::runInThread(const RenderingContext& ctx, const GpuVisibilityOutput
 	args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAnisoResolutionScalingBias.get();
 	args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
 	args.m_viewport = UVec4(0, 0, getRenderer().getInternalResolution());
+
+	TextureViewPtr hzbView;
+	if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+	{
+		hzbView = rgraphCtx.createTextureView(m_runCtx.m_hzbRt);
+		args.m_hzbTexture = hzbView.get();
+	}
+
 	args.fillMdi(visOut);
 
 	cmdb.setDepthCompareOperation(CompareOperation::kLessEqual);
@@ -241,6 +249,11 @@ void GBuffer::populateRenderGraph(RenderingContext& ctx)
 		pass.newTextureDependency(sriRt, TextureUsageBit::kFramebufferShadingRate);
 	}
 
+	if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+	{
+		pass.newTextureDependency(m_runCtx.m_hzbRt, TextureUsageBit::kSampledGeometry);
+	}
+
 	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kUavGeometryRead | BufferUsageBit::kUavFragmentRead);
 
 	// Only add one depedency to the GPU visibility. No need to track all buffers

+ 13 - 0
AnKi/Renderer/ShadowMapping.cpp

@@ -73,6 +73,8 @@ public:
 	GpuVisibilityOutput m_visOut;
 
 	BufferOffsetRange m_clearTileIndirectArgs;
+
+	RenderTargetHandle m_hzbRt;
 };
 
 Error ShadowMapping::init()
@@ -386,6 +388,10 @@ void ShadowMapping::processLights(RenderingContext& ctx)
 			work.m_viewProjMat = cascadeViewProjMats[cascade];
 			work.m_viewMat = cascadeViewMats[cascade];
 			work.m_viewport = atlasViewports[cascade];
+			if(GrManager::getSingleton().getDeviceCapabilities().m_meshShaders)
+			{
+				work.m_hzbRt = hzbGenIn.m_cascades[cascade].m_hzbRt;
+			}
 
 			// Vis testing
 			const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
@@ -625,6 +631,13 @@ void ShadowMapping::runShadowMapping(RenderPassWorkContext& rgraphCtx)
 		args.m_viewport = UVec4(work.m_viewport[0], work.m_viewport[1], work.m_viewport[2], work.m_viewport[3]);
 		args.fillMdi(work.m_visOut);
 
+		TextureViewPtr hzbView;
+		if(work.m_hzbRt.isValid())
+		{
+			hzbView = rgraphCtx.createTextureView(work.m_hzbRt);
+			args.m_hzbTexture = hzbView.get();
+		}
+
 		getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 	}
 }

+ 8 - 0
AnKi/Renderer/Utils/Drawer.cpp

@@ -41,6 +41,11 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 		static_assert(sizeof(globalUniforms->m_cameraTransform) == sizeof(args.m_cameraTransform));
 		memcpy(&globalUniforms->m_cameraTransform, &args.m_cameraTransform, sizeof(args.m_cameraTransform));
 
+		ANKI_ASSERT(args.m_viewport != UVec4(0u));
+		globalUniforms->m_viewport = Vec4(args.m_viewport);
+
+		globalUniforms->m_enableHzbTesting = args.m_hzbTexture != nullptr;
+
 		cmdb.bindConstantBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kGlobalConstants), globalUniformsToken);
 	}
 
@@ -58,6 +63,9 @@ void RenderableDrawer::setState(const RenderableDrawerArguments& args, CommandBu
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kTaskShaderPayloads), args.m_taskShaderPayloadsBuffer);
 	cmdb.bindUavBuffer(U32(MaterialSet::kGlobal), U32(MaterialBinding::kRenderables),
 					   GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+	cmdb.bindTexture(U32(MaterialSet::kGlobal), U32(MaterialBinding::kHzbTexture),
+					 (args.m_hzbTexture) ? args.m_hzbTexture : &getRenderer().getDummyTextureView2d());
+	cmdb.bindSampler(U32(MaterialSet::kGlobal), U32(MaterialBinding::kNearestClampSampler), getRenderer().getSamplers().m_nearestNearestClamp.get());
 
 	// Misc
 	cmdb.setVertexAttribute(0, 0, Format::kR32G32B32A32_Uint, 0);

+ 3 - 1
AnKi/Renderer/Utils/Drawer.h

@@ -24,7 +24,9 @@ public:
 	Mat4 m_viewProjectionMatrix;
 	Mat4 m_previousViewProjectionMatrix;
 
-	UVec4 m_viewport; ///< Only used for information purposes.
+	UVec4 m_viewport;
+
+	TextureView* m_hzbTexture = nullptr; ///< Optional.
 
 	Sampler* m_sampler = nullptr;
 

+ 4 - 3
AnKi/Resource/MeshResource.cpp

@@ -326,11 +326,12 @@ Error MeshResource::loadAsync(MeshBinaryLoader& loader) const
 				outMeshlet.m_firstPrimitive =
 					lod.m_meshletIndices.getOffset() / getFormatInfo(kMeshletPrimitiveFormat).m_texelSize + inMeshlet.m_firstPrimitive;
 				outMeshlet.m_primitiveCount_R16_Uint_vertexCount_R16_Uint = (inMeshlet.m_primitiveCount << 16u) | inMeshlet.m_vertexCount;
-				outMeshlet.m_sphereCenter = inMeshlet.m_boundingVolume.m_sphereCenter;
-				outMeshlet.m_sphereRadius = inMeshlet.m_boundingVolume.m_sphereRadius;
+				outMeshlet.m_aabbMin = inMeshlet.m_boundingVolume.m_aabbMin;
+				outMeshlet.m_aabbMax = inMeshlet.m_boundingVolume.m_aabbMax;
 				outMeshlet.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm =
 					packSnorm4x8(Vec4(inMeshlet.m_coneDirection, cos(inMeshlet.m_coneAngle / 2.0f)));
-				outMeshlet.m_coneApex = inMeshlet.m_coneApex;
+				outMeshlet.m_coneApex_R8G8B8A8_Snorm = packSnorm4x8(inMeshlet.m_coneApex.xyz0());
+				outMeshlet.m_sphereRadius = ((outMeshlet.m_aabbMin + outMeshlet.m_aabbMax) / 2.0f - outMeshlet.m_aabbMax).getLength();
 			}
 
 			cmdb->copyBufferToBuffer(&handle2.getBuffer(), handle2.getOffset(), unifiedGeometryBuffer, lod.m_meshlets.getOffset(),

+ 63 - 3
AnKi/Shaders/GBufferGeneric.ankiprog

@@ -24,7 +24,11 @@
 #define REALLY_VELOCITY ((ANKI_VELOCITY || ANKI_BONES) && ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER)
 #define REALLY_USING_PARALLAX (PARALLAX == 1 && ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER && ALPHA_TEST == 0)
 
-#define MESHLET_BACKFACE_CULLING 1
+#define VISUALIZE_MESHLETS (0 && ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER)
+#define MESHLET_BACKFACE_CULLING 0
+#define MESHLET_OUTSIDE_OF_SCREEN_CULLING 1
+#define MESHLET_NO_SAMPLING_POINT_CULLING 1
+#define MESHLET_HZB_CULLING 1
 
 #include <AnKi/Shaders/Include/MaterialTypes.h>
 #include <AnKi/Shaders/Include/GpuSceneFunctions.h>
@@ -84,6 +88,10 @@ struct VertOut
 #endif
 
 	nointerpolation U32 m_constantsOffset : UNIS_OFFSET;
+
+#if VISUALIZE_MESHLETS
+	nointerpolation U32 m_meshletIndex : MESHLET_INDEX;
+#endif
 };
 
 #if ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER
@@ -262,11 +270,34 @@ struct FirstPayload
 	{
 		Bool cull = false;
 
-#if MESHLET_BACKFACE_CULLING
 		const Meshlet meshlet = g_meshlets[firstMeshlet + svGroupIndex];
 		const Mat3x4 worldTransform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
 
-		cull = cull || cullBackfaceMeshlet(meshlet, worldTransform, g_globalConstants.m_cameraTransform.getTranslationPart());
+#if MESHLET_BACKFACE_CULLING
+		cull = cullBackfaceMeshlet(meshlet, worldTransform, g_globalConstants.m_cameraTransform.getTranslationPart());
+#endif
+
+		const Mat4 wordTransform4 = {worldTransform.m_row0, worldTransform.m_row1, worldTransform.m_row2, Vec4(0.0f, 0.0f, 0.0f, 1.0f)};
+		const Mat4 mvp = mul(g_globalConstants.m_viewProjectionMatrix, wordTransform4);
+
+		Vec2 minNdc, maxNdc;
+		F32 aabbMinDepth;
+		projectAabb(meshlet.m_aabbMin, meshlet.m_aabbMax, mvp, minNdc, maxNdc, aabbMinDepth);
+
+#if MESHLET_OUTSIDE_OF_SCREEN_CULLING
+		// Outside of the screen. Results are OR-ed into cull so a positive verdict from an earlier test is never cleared
+		cull = cull || (any(minNdc > 1.0f) || any(maxNdc < -1.0f));
+#endif

+#if MESHLET_NO_SAMPLING_POINT_CULLING
+		// Sampling points test: cull when the projected bounds round to the same window coordinate in X or in Y
+		const Vec2 windowCoordsMin = ndcToUv(minNdc) * g_globalConstants.m_viewport.zw;
+		const Vec2 windowCoordsMax = ndcToUv(maxNdc) * g_globalConstants.m_viewport.zw;
+		cull = cull || any(round(windowCoordsMin) == round(windowCoordsMax));
+#endif

+#if MESHLET_HZB_CULLING
+		cull = cull || (g_globalConstants.m_enableHzbTesting == 1u && cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTexture, g_nearestClampSampler));
+#endif
 
 		if(!cull)
@@ -345,6 +376,10 @@ main(in payload MeshShaderPayload payload, out vertices VertOut verts[kMaxVertic
 			velocity(worldTransform, prevWorldTransform, prevPos, output);
 #endif
 
+#if VISUALIZE_MESHLETS
+			output.m_meshletIndex = relativeMeshletIdx;
+#endif
+
 			verts[idx] = output;
 		}
 	}
@@ -463,6 +498,31 @@ FragOut main(VertOut input)
 	g.m_metallic = metallic;
 	g.m_velocity = velocity;
 
+#	if VISUALIZE_MESHLETS
+	const U32 meshletIdx = input.m_meshletIndex % 6u;
+	switch(meshletIdx)
+	{
+	case 0:
+		g.m_diffuse = Vec3(1.0f, 0.0f, 0.0f);
+		break;
+	case 1:
+		g.m_diffuse = Vec3(0.0f, 1.0f, 0.0f);
+		break;
+	case 2:
+		g.m_diffuse = Vec3(0.0f, 0.0f, 1.0f);
+		break;
+	case 3:
+		g.m_diffuse = Vec3(1.0f, 1.0f, 0.0f);
+		break;
+	case 4:
+		g.m_diffuse = Vec3(1.0f, 0.0f, 1.0f);
+		break;
+	case 5:
+		g.m_diffuse = Vec3(0.0f, 1.0f, 1.0f);
+		break;
+	}
+#	endif
+
 	FragOut output;
 	packGBuffer(g, output.m_color0, output.m_color1, output.m_color2, output.m_color3);
 	return output;

+ 2 - 32
AnKi/Shaders/GpuVisibility.ankiprog

@@ -111,43 +111,13 @@ struct DrawIndirectArgsWithPadding
 
 	// HiZ culling
 	//
-
 #	if HZB_TEST
-	// Compute the mip
-	Vec2 texSize;
-	F32 mipCount;
-	g_hzbTex.GetDimensions(0, texSize.x, texSize.y, mipCount);
-
-	const Vec2 minUv = saturate(ndcToUv(minNdc));
-	const Vec2 maxUv = saturate(ndcToUv(maxNdc));
-	const Vec2 sizeXY = (maxUv - minUv) * texSize;
-	F32 mip = ceil(log2(max(sizeXY.x, sizeXY.y)));
-
-	// Try to use a more detailed mip if you can
-	const F32 levelLower = max(mip - 1.0, 0.0);
-	const Vec2 mipSize = texSize / pow(2.0f, levelLower);
-	const Vec2 a = floor(minUv * mipSize);
-	const Vec2 b = ceil(maxUv * mipSize);
-	const Vec2 dims = b - a;
-
-	if(dims.x <= 2.0 && dims.y <= 2.0)
-	{
-		mip = levelLower;
-	}
-
-	// Sample mip
-	Vec4 depths;
-	depths[0] = g_hzbTex.SampleLevel(g_nearestAnyClampSampler, minUv, mip);
-	depths[1] = g_hzbTex.SampleLevel(g_nearestAnyClampSampler, maxUv, mip);
-	depths[2] = g_hzbTex.SampleLevel(g_nearestAnyClampSampler, Vec2(minUv.x, maxUv.y), mip);
-	depths[3] = g_hzbTex.SampleLevel(g_nearestAnyClampSampler, Vec2(maxUv.x, minUv.y), mip);
-	const F32 maxDepth = max4(depths);
-
-	if(aabbMinDepth > maxDepth)
+	if(cullHzb(minNdc, maxNdc, aabbMinDepth, g_hzbTex, g_nearestAnyClampSampler))
 	{
 		return;
 	}
 #	endif // HZB_TEST
+
 #else // DISTANCE_TEST == 1
 	if(!testSphereSphereCollision(sphereCenter, sphereRadius, g_consts.m_pointOfTest, g_consts.m_testRadius))
 	{

+ 1 - 1
AnKi/Shaders/Include/Common.h

@@ -788,7 +788,7 @@ constexpr U32 kMaxMipsSinglePassDownsamplerCanProduce = 12u;
 
 constexpr U32 kMaxPrimitivesPerMeshlet = 128;
 constexpr U32 kMaxVerticesPerMeshlet = 128;
-#define ANKI_TASK_SHADER_THREADGROUP_SIZE 128u
+#define ANKI_TASK_SHADER_THREADGROUP_SIZE 64u
 constexpr U32 kMeshletGroupSize = ANKI_TASK_SHADER_THREADGROUP_SIZE;
 
 #define ANKI_MESH_SHADER_THREADGROUP_SIZE 32u

+ 12 - 5
AnKi/Shaders/Include/MaterialTypes.h

@@ -16,8 +16,15 @@ struct MaterialGlobalConstants
 	Mat4 m_previousViewProjectionMatrix;
 	Mat3x4 m_viewTransform;
 	Mat3x4 m_cameraTransform;
+
+	Vec4 m_viewport;
+
+	U32 m_enableHzbTesting;
+	U32 m_padding0;
+	U32 m_padding1;
+	U32 m_padding2;
 };
-static_assert(sizeof(MaterialGlobalConstants) == 14 * sizeof(Vec4));
+static_assert(sizeof(MaterialGlobalConstants) == 16 * sizeof(Vec4));
 
 /// @brief
 enum class MaterialSet : U32
@@ -37,9 +44,12 @@ enum class MaterialBinding : U32
 #define ANKI_UNIFIED_GEOM_FORMAT(fmt, shaderType) kUnifiedGeometry_##fmt,
 #include <AnKi/Shaders/Include/UnifiedGeometryTypes.defs.h>
 
-	kMeshlets, // Pointing to the unified geom buffer
+	// For mesh shading
+	kMeshlets, ///< Points to the unified geom buffer
 	kTaskShaderPayloads,
 	kRenderables,
+	kHzbTexture,
+	kNearestClampSampler,
 
 	// For FW shading:
 	kLinearClampSampler,
@@ -49,9 +59,6 @@ enum class MaterialBinding : U32
 	kClusterShadingConstants,
 	kClusterShadingLights,
 	kClusters = kClusterShadingLights + 2,
-
-	kCount,
-	kFirst = 0
 };
 
 // Techniques

+ 4 - 4
AnKi/Shaders/Include/MeshTypes.h

@@ -90,11 +90,11 @@ struct Meshlet
 	U32 m_primitiveCount_R16_Uint_vertexCount_R16_Uint;
 	U32 m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm;
 
-	Vec3 m_sphereCenter;
-	F32 m_sphereRadius;
+	Vec3 m_aabbMin;
+	U32 m_coneApex_R8G8B8A8_Snorm;
 
-	Vec3 m_coneApex;
-	F32 m_padding;
+	Vec3 m_aabbMax;
+	F32 m_sphereRadius;
 };
 // Power of 2 because the sizeof will be used as allocation alignment and allocation alignments need to be power of 2
 static_assert(isPowerOfTwo(sizeof(Meshlet)));

+ 7 - 2
AnKi/Shaders/MaterialShadersCommon.hlsl

@@ -11,6 +11,7 @@
 #include <AnKi/Shaders/Include/MeshTypes.h>
 #include <AnKi/Shaders/Include/GpuSceneFunctions.h>
 #include <AnKi/Shaders/PackFunctions.hlsl>
+#include <AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl>
 
 ANKI_BINDLESS_SET(MaterialSet::kBindless)
 
@@ -26,6 +27,8 @@ ANKI_BINDLESS_SET(MaterialSet::kBindless)
 [[vk::binding(MaterialBinding::kMeshlets, MaterialSet::kGlobal)]] StructuredBuffer<Meshlet> g_meshlets;
 [[vk::binding(MaterialBinding::kTaskShaderPayloads, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneTaskShaderPayload> g_taskShaderPayloads;
 [[vk::binding(MaterialBinding::kRenderables, MaterialSet::kGlobal)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
+[[vk::binding(MaterialBinding::kHzbTexture, MaterialSet::kGlobal)]] Texture2D<Vec4> g_hzbTexture;
+[[vk::binding(MaterialBinding::kNearestClampSampler, MaterialSet::kGlobal)]] SamplerState g_nearestClampSampler;
 
 // FW shading specific
 #if defined(FORWARD_SHADING)
@@ -83,7 +86,9 @@ Bool cullBackfaceMeshlet(Meshlet meshlet, Mat3x4 worldTransform, Vec3 cameraWorl
 {
 	const Vec4 coneData = unpackSnorm4x8(meshlet.m_coneDirection_R8G8B8_Snorm_cosHalfAngle_R8_Snorm);
 
-	meshlet.m_sphereCenter = mul(worldTransform, Vec4(meshlet.m_sphereCenter, 1.0f));
+	Vec3 center = (meshlet.m_aabbMin + meshlet.m_aabbMax) / 2.0f;
+
+	center = mul(worldTransform, Vec4(center, 1.0f));
 	const Vec3 coneAxisWspace = normalize(mul(worldTransform, Vec4(coneData.xyz, 0.0f)));
 
 	// Extract uniform scale
@@ -92,6 +97,6 @@ Bool cullBackfaceMeshlet(Meshlet meshlet, Mat3x4 worldTransform, Vec3 cameraWorl
 
 	meshlet.m_sphereRadius *= uniformScale;
 
-	const Vec3 dir = meshlet.m_sphereCenter - cameraWorldPos;
+	const Vec3 dir = center - cameraWorldPos;
 	return dot(dir, coneAxisWspace) >= coneData.w * length(dir) + meshlet.m_sphereRadius;
 }

+ 34 - 0
AnKi/Shaders/VisibilityAndCollisionFunctions.hlsl

@@ -218,3 +218,37 @@ void projectAabb(Vec3 aabbMin, Vec3 aabbMax, Mat4 viewProjMat, out Vec2 minNdc,
 
 	aabbMinDepth = saturate(aabbMinDepth);
 }
+
+Bool cullHzb(Vec2 aabbMinNdc, Vec2 aabbMaxNdc, F32 aabbMinDepth, Texture2D<Vec4> hzb, SamplerState nearestAnyClampSampler)
+{
+	// Tests a projected AABB against the given HZB texture. Returns true when aabbMinDepth is greater than the sampled HZB depth
+	Vec2 hzbSize;
+	F32 hzbMipCount; // Not used further, GetDimensions() just needs somewhere to write it
+	hzb.GetDimensions(0, hzbSize.x, hzbSize.y, hzbMipCount);
+
+	// Screen rect of the AABB in UV space, clamped to [0, 1]
+	const Vec2 uvMin = saturate(ndcToUv(aabbMinNdc));
+	const Vec2 uvMax = saturate(ndcToUv(aabbMaxNdc));
+
+	// Coarse pick: the mip at which the longest side of the rect spans at most one texel
+	const Vec2 rectTexels = (uvMax - uvMin) * hzbSize;
+	const F32 coarseMip = ceil(log2(max(rectTexels.x, rectTexels.y)));
+
+	// Prefer the next more detailed mip when the rect touches no more than 2x2 of its texels
+	const F32 finerMip = max(coarseMip - 1.0, 0.0);
+	const Vec2 finerMipSize = hzbSize / pow(2.0f, finerMip);
+	const Vec2 firstTexel = floor(uvMin * finerMipSize);
+	const Vec2 endTexel = ceil(uvMax * finerMipSize);
+	const Vec2 footprint = endTexel - firstTexel;
+	const F32 mip = (footprint.x <= 2.0 && footprint.y <= 2.0) ? finerMip : coarseMip;
+
+	// Fetch the HZB at the four corners of the rect and reduce the results with max4()
+	Vec4 cornerDepths;
+	cornerDepths[0] = hzb.SampleLevel(nearestAnyClampSampler, uvMin, mip);
+	cornerDepths[1] = hzb.SampleLevel(nearestAnyClampSampler, uvMax, mip);
+	cornerDepths[2] = hzb.SampleLevel(nearestAnyClampSampler, Vec2(uvMin.x, uvMax.y), mip);
+	cornerDepths[3] = hzb.SampleLevel(nearestAnyClampSampler, Vec2(uvMax.x, uvMin.y), mip);
+	const F32 hzbDepth = max4(cornerDepths);
+
+	return aabbMinDepth > hzbDepth;
+}