Browse Source

Add GPU visibility for top-level acceleration structure building

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
38ebdef1a5

+ 2 - 0
AnKi/Gr/AccelerationStructure.h

@@ -140,6 +140,8 @@ public:
 		return m_scratchBufferSize;
 	}
 
+	/// Get the GPU address of this acceleration structure.
+	U64 getGpuAddress() const;
+
 protected:
 	PtrSize m_scratchBufferSize = 0;
 	AccelerationStructureType m_type = AccelerationStructureType::kCount;

+ 6 - 0
AnKi/Gr/Vulkan/AccelerationStructure.cpp

@@ -21,4 +21,10 @@ AccelerationStructure* AccelerationStructure::newInstance(const AccelerationStru
 	return impl;
 }
 
+/// Return the GPU address of this acceleration structure by forwarding to the Vulkan implementation object.
+U64 AccelerationStructure::getGpuAddress() const
+{
+	// NOTE(review): ANKI_VK_SELF_CONST presumably declares `self` as a const reference to the backend implementation; confirm.
+	ANKI_VK_SELF_CONST(AccelerationStructureImpl);
+	return self.getAsDeviceAddress();
+}
+
 } // end namespace anki

+ 6 - 0
AnKi/Gr/Vulkan/AccelerationStructureImpl.h

@@ -38,6 +38,12 @@ public:
 		return m_topLevelInfo.m_maxInstanceCount;
 	}
 
+	/// Get the device address stored for this acceleration structure. Asserts that the address is non-zero.
+	VkDeviceAddress getAsDeviceAddress() const
+	{
+		ANKI_ASSERT(m_deviceAddress);
+		return m_deviceAddress;
+	}
+
 	void generateBuildInfo(U64 scratchBufferAddress, VkAccelerationStructureBuildGeometryInfoKHR& buildInfo,
 						   VkAccelerationStructureBuildRangeInfoKHR& rangeInfo) const
 	{

+ 35 - 33
AnKi/Renderer/AccelerationStructureBuilder.cpp

@@ -7,59 +7,61 @@
 #include <AnKi/Renderer/RenderQueue.h>
 #include <AnKi/Renderer/Renderer.h>
 #include <AnKi/Util/Tracer.h>
+#include <AnKi/Core/App.h>
+#include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
 
 namespace anki {
 
+/// Radius of the test that selects which renderables end up in the main TLAS. It is applied around the camera position.
+/// NOTE(review): the 3 numbers are presumably default, min and max; confirm against NumericCVar.
+static NumericCVar<F32>
+	g_rayTracingExtendedFrustumDistanceCVar(CVarSubsystem::kRenderer, "RayTracingExtendedFrustumDistance", 100.0f, 10.0f, 10000.0f,
+											"Every object that its distance from the camera is bellow that value will take part in ray tracing");
+
+/// Register the passes that produce the main TLAS: a GPU visibility pass that fills the instances buffer and a compute pass
+/// that performs an indirect TLAS build out of that buffer.
 void AccelerationStructureBuilder::populateRenderGraph(RenderingContext& ctx)
 {
 	ANKI_TRACE_SCOPED_EVENT(RTlas);
 
-	// Get some things
-	ANKI_ASSERT(ctx.m_renderQueue->m_rayTracingQueue);
-	ConstWeakArray<RayTracingInstanceQueueElement> instanceElements = ctx.m_renderQueue->m_rayTracingQueue->m_rayTracingInstances;
-	const U32 instanceCount = instanceElements.getSize();
-	ANKI_ASSERT(instanceCount > 0);
-
-	// Create the instances. Allocate but not construct to save some CPU time
-	void* instancesMem =
-		ctx.m_tempPool->allocate(sizeof(AccelerationStructureInstanceInfo) * instanceCount, alignof(AccelerationStructureInstanceInfo));
-	WeakArray<AccelerationStructureInstanceInfo> instances(static_cast<AccelerationStructureInstanceInfo*>(instancesMem), instanceCount);
-
-	for(U32 instanceIdx = 0; instanceIdx < instanceCount; ++instanceIdx)
+	// Do visibility
+	GpuVisibilityAccelerationStructuresOutput visOut;
 	{
-		const RayTracingInstanceQueueElement& element = instanceElements[instanceIdx];
+		const Array<F32, kMaxLodCount - 1> lodDistances = {g_lod0MaxDistanceCVar.get(), g_lod1MaxDistanceCVar.get()};
+
+		GpuVisibilityAccelerationStructuresInput in;
+		in.m_passesName = "Main TLAS visiblity";
+		in.m_technique = RenderingTechnique::kGBuffer;
+		in.m_lodReferencePoint = ctx.m_matrices.m_cameraTransform.getTranslationPart().xyz();
+		in.m_lodDistances = lodDistances;
+		in.m_pointOfTest = in.m_lodReferencePoint;
+		in.m_testRadius = g_rayTracingExtendedFrustumDistanceCVar.get();
+		in.m_viewProjectionMatrix = ctx.m_matrices.m_viewProjection;
+		in.m_rgraph = &ctx.m_renderGraphDescr;
 
-		// Init instance
-		AccelerationStructureInstanceInfo& out = instances[instanceIdx];
-		::new(&out) AccelerationStructureInstanceInfo();
-		out.m_bottomLevel.reset(element.m_bottomLevelAccelerationStructure);
-		memcpy(&out.m_transform, &element.m_transform, sizeof(out.m_transform));
-		out.m_hitgroupSbtRecordIndex = instanceIdx;
-		out.m_mask = 0xFF;
+		getRenderer().getGpuVisibilityAccelerationStructures().pupulateRenderGraph(in, visOut);
 	}
 
 	// Create the TLAS
-	AccelerationStructureInitInfo initInf("MainTlas");
+	AccelerationStructureInitInfo initInf("Main TLAS");
 	initInf.m_type = AccelerationStructureType::kTopLevel;
-	initInf.m_topLevel.m_directArgs.m_instances = instances;
+	// The instances get written by the GPU so only an upper bound of their count (all the GBuffer AABBs) is given here
+	initInf.m_topLevel.m_indirectArgs.m_maxInstanceCount = GpuSceneArrays::RenderableAabbGBuffer::getSingleton().getElementCount();
+	initInf.m_topLevel.m_indirectArgs.m_instancesBuffer = visOut.m_instancesBuffer.m_buffer;
+	initInf.m_topLevel.m_indirectArgs.m_instancesBufferOffset = visOut.m_instancesBuffer.m_offset;
 	m_runCtx.m_tlas = GrManager::getSingleton().newAccelerationStructure(initInf);
 
-	// Need a cleanup
-	for(U32 instanceIdx = 0; instanceIdx < instanceCount; ++instanceIdx)
-	{
-		instances[instanceIdx].m_bottomLevel.reset(nullptr);
-	}
-
 	// Build the job
 	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
+
+	// Scratch memory for the build. The TLAS reports how much it needs
+	const BufferOffsetRange scratchBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(m_runCtx.m_tlas->getBuildScratchBufferSize());
+
 	m_runCtx.m_tlasHandle = rgraph.importAccelerationStructure(m_runCtx.m_tlas.get(), AccelerationStructureUsageBit::kNone);
-	ComputeRenderPassDescription& rpass = rgraph.newComputeRenderPass("BuildTlas");
-	rpass.setWork([this](RenderPassWorkContext& rgraphCtx) {
-		ANKI_TRACE_SCOPED_EVENT(RTlas);
-		rgraphCtx.m_commandBuffer->buildAccelerationStructure(m_runCtx.m_tlas.get());
-	});
 
+	ComputeRenderPassDescription& rpass = rgraph.newComputeRenderPass("Build TLAS");
 	rpass.newAccelerationStructureDependency(m_runCtx.m_tlasHandle, AccelerationStructureUsageBit::kBuild);
+	// Order the build after the visibility pass that writes the buffers
+	rpass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kAccelerationStructureBuild);
+
+	rpass.setWork([this, scratchBuff, rangeBuff = visOut.m_rangeBuffer](RenderPassWorkContext& rgraphCtx) {
+		ANKI_TRACE_SCOPED_EVENT(RTlas);
+		rgraphCtx.m_commandBuffer->buildAccelerationStructureIndirect(m_runCtx.m_tlas.get(), scratchBuff.m_buffer, scratchBuff.m_offset,
+																	  rangeBuff.m_buffer, rangeBuff.m_offset);
+	});
 }
 
 } // end namespace anki

+ 1 - 0
AnKi/Renderer/Renderer.cpp

@@ -309,6 +309,7 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 
 	ANKI_CHECK(m_visibility.init());
 	ANKI_CHECK(m_nonRenderablesVisibility.init());
+	ANKI_CHECK(m_asVisibility.init());
 	ANKI_CHECK(m_hzbGenerator.init());
 
 	return Error::kNone;

+ 6 - 0
AnKi/Renderer/Renderer.h

@@ -107,6 +107,11 @@ public:
 		return m_nonRenderablesVisibility;
 	}
 
+	/// Access the renderer's GpuVisibilityAccelerationStructures instance.
+	GpuVisibilityAccelerationStructures& getGpuVisibilityAccelerationStructures()
+	{
+		return m_asVisibility;
+	}
+
 	const HzbGenerator& getHzbGenerator() const
 	{
 		return m_hzbGenerator;
@@ -212,6 +217,7 @@ private:
 	RenderableDrawer m_sceneDrawer;
 	GpuVisibility m_visibility;
 	GpuVisibilityNonRenderables m_nonRenderablesVisibility;
+	GpuVisibilityAccelerationStructures m_asVisibility;
 	HzbGenerator m_hzbGenerator;
 	ReadbackManager m_readbaks;
 

+ 143 - 14
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -211,33 +211,25 @@ void GpuVisibility::populateRenderGraphInternal(Bool distanceBased, BaseGpuVisib
 			cmdb.bindShaderProgram(m_distGrProgs[gatherAabbIndices].get());
 		}
 
+		BufferOffsetRange aabbsBuffer;
 		switch(technique)
 		{
 		case RenderingTechnique::kGBuffer:
-			cmdb.bindStorageBuffer(0, 0, &GpuSceneBuffer::getSingleton().getBuffer(),
-								   GpuSceneArrays::RenderableAabbGBuffer::getSingleton().getGpuSceneOffsetOfArrayBase(),
-								   GpuSceneArrays::RenderableAabbGBuffer::getSingleton().getBufferRange());
+			aabbsBuffer = GpuSceneArrays::RenderableAabbGBuffer::getSingleton().getBufferOffsetRange();
 			break;
 		case RenderingTechnique::kDepth:
-			cmdb.bindStorageBuffer(0, 0, &GpuSceneBuffer::getSingleton().getBuffer(),
-								   GpuSceneArrays::RenderableAabbDepth::getSingleton().getGpuSceneOffsetOfArrayBase(),
-								   GpuSceneArrays::RenderableAabbDepth::getSingleton().getBufferRange());
+			aabbsBuffer = GpuSceneArrays::RenderableAabbDepth::getSingleton().getBufferOffsetRange();
 			break;
 		case RenderingTechnique::kForward:
-			cmdb.bindStorageBuffer(0, 0, &GpuSceneBuffer::getSingleton().getBuffer(),
-								   GpuSceneArrays::RenderableAabbForward::getSingleton().getGpuSceneOffsetOfArrayBase(),
-								   GpuSceneArrays::RenderableAabbForward::getSingleton().getBufferRange());
+			aabbsBuffer = GpuSceneArrays::RenderableAabbForward::getSingleton().getBufferOffsetRange();
 			break;
 		default:
 			ANKI_ASSERT(0);
 		}
 
-		cmdb.bindStorageBuffer(0, 1, &GpuSceneBuffer::getSingleton().getBuffer(),
-							   GpuSceneArrays::Renderable::getSingleton().getGpuSceneOffsetOfArrayBase(),
-							   GpuSceneArrays::Renderable::getSingleton().getBufferRange());
-
+		cmdb.bindStorageBuffer(0, 0, aabbsBuffer);
+		cmdb.bindStorageBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
 		cmdb.bindStorageBuffer(0, 2, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
-
 		cmdb.bindStorageBuffer(0, 3, instanceRateRenderables);
 		cmdb.bindStorageBuffer(0, 4, indirectArgs);
 
@@ -491,4 +483,141 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 	});
 }
 
+/// Load the shader program that the visibility pass binds.
+/// @return Error::kNone on success or the error of the program loading.
+Error GpuVisibilityAccelerationStructures::init()
+{
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", m_prog, m_grProg));
+
+	return Error::kNone;
+}
+
+/// Register the compute pass that tests the renderable AABBs of a technique and writes, for those that pass, the instances
+/// that an indirect TLAS build consumes.
+/// @param in  What to test and against what. It's validated on entry.
+/// @param out The transient buffers the pass writes to and a handle that can be used to depend on the pass.
+void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccelerationStructuresInput& in,
+															  GpuVisibilityAccelerationStructuresOutput& out)
+{
+	in.validate();
+	RenderGraphDescription& rgraph = *in.m_rgraph;
+
+	const Bool firstRunInFrame = m_lastFrameIdx != getRenderer().getFrameCount();
+	if(firstRunInFrame)
+	{
+		// 1st run in this frame, do some bookkeeping
+		m_lastFrameIdx = getRenderer().getFrameCount();
+		m_currentCounterBufferOffset = 0;
+
+		m_counterBufferHandle = {};
+	}
+
+	// Maybe create the counter buffer. Every call in a frame uses its own slice of it (2 counters) and the slices need to respect
+	// the bind offset alignment of storage buffers
+	const U32 counterBufferElementSize =
+		max<U32>(2 * sizeof(U32), GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
+	if(!m_counterBuffer.isCreated() || m_currentCounterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
+	{
+		// Either there is no buffer yet or this frame needs more slices than the buffer has. In the 2nd case double its size
+		BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
+		inf.m_size = (!m_counterBuffer.isCreated()) ? kInitialCounterBufferElementCount * counterBufferElementSize : m_counterBuffer->getSize() * 2;
+		inf.m_usage = BufferUsageBit::kStorageComputeWrite | BufferUsageBit::kStorageComputeRead | BufferUsageBit::kTransferDestination;
+		m_counterBuffer = GrManager::getSingleton().newBuffer(inf);
+
+		m_counterBufferHandle = rgraph.importBuffer(m_counterBuffer.get(), BufferUsageBit::kTransferDestination);
+
+		// The shader expects the counters to start from zero. Name the pass after this class and not after GpuVisibilityNonRenderables
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("GpuVisibilityAccelerationStructuresClearCounterBuffer");
+
+		pass.newBufferDependency(m_counterBufferHandle, BufferUsageBit::kTransferDestination);
+
+		pass.setWork([counterBuffer = m_counterBuffer](RenderPassWorkContext& rgraph) {
+			rgraph.m_commandBuffer->fillBuffer(counterBuffer.get(), 0, kMaxPtrSize, 0);
+		});
+
+		m_currentCounterBufferOffset = 0;
+	}
+
+	// Allocate the transient buffers. Size them for the worst case where every AABB passes the test
+	U32 aabbCount = 0;
+	switch(in.m_technique)
+	{
+	case RenderingTechnique::kGBuffer:
+		aabbCount = GpuSceneArrays::RenderableAabbGBuffer::getSingleton().getElementCount();
+		break;
+	case RenderingTechnique::kDepth:
+		aabbCount = GpuSceneArrays::RenderableAabbDepth::getSingleton().getElementCount();
+		break;
+	case RenderingTechnique::kForward:
+		aabbCount = GpuSceneArrays::RenderableAabbForward::getSingleton().getElementCount();
+		break;
+	default:
+		ANKI_ASSERT(0);
+	}
+
+	out.m_instancesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(AccelerationStructureInstance));
+	out.m_someBufferHandle = rgraph.importBuffer(BufferUsageBit::kStorageComputeWrite, out.m_instancesBuffer);
+
+	// One more element because the 1st one holds the count
+	out.m_renderableIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((aabbCount + 1) * sizeof(U32));
+
+	out.m_rangeBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(AccelerationStructureBuildRangeInfo));
+
+	// Create the compute pass
+	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(in.m_passesName);
+
+	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
+	pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kStorageComputeWrite);
+	if(m_counterBufferHandle.isValid())
+	{
+		// The handle is valid only if the counter buffer was (re)created in this frame. In that case wait for the clear
+		pass.newBufferDependency(m_counterBufferHandle, BufferUsageBit::kStorageComputeWrite);
+	}
+
+	// NOTE(review): the work reads m_counterBuffer through "this" when it executes. If a later call in the same frame recreates the
+	// buffer this pass will bind the new one. Confirm that this is intended
+	pass.setWork([this, viewProjMat = in.m_viewProjectionMatrix, lodDistances = in.m_lodDistances, pointOfTest = in.m_pointOfTest,
+				  testRadius = in.m_testRadius, technique = in.m_technique, instancesBuff = out.m_instancesBuffer,
+				  indicesBuff = out.m_renderableIndicesBuffer, rangeBuff = out.m_rangeBuffer, counterBufferOffset = m_currentCounterBufferOffset,
+				  aabbCount](RenderPassWorkContext& rgraph) {
+		CommandBuffer& cmdb = *rgraph.m_commandBuffer;
+
+		cmdb.bindShaderProgram(m_grProg.get());
+
+		GpuVisibilityAccelerationStructuresUniforms unis;
+		Array<Plane, 6> planes;
+		extractClipPlanes(viewProjMat, planes);
+		for(U32 i = 0; i < 6; ++i)
+		{
+			unis.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
+		}
+
+		unis.m_pointOfTest = pointOfTest;
+		unis.m_testRadius = testRadius;
+
+		// The code bellow fills exactly 2 distances. It's a compile time constant so check it at compile time
+		static_assert(kMaxLodCount == 3);
+		unis.m_maxLodDistances[0] = lodDistances[0];
+		unis.m_maxLodDistances[1] = lodDistances[1];
+		unis.m_maxLodDistances[2] = kMaxF32;
+		unis.m_maxLodDistances[3] = kMaxF32;
+
+		cmdb.setPushConstants(&unis, sizeof(unis));
+
+		BufferOffsetRange aabbsBuffer;
+		switch(technique)
+		{
+		case RenderingTechnique::kGBuffer:
+			aabbsBuffer = GpuSceneArrays::RenderableAabbGBuffer::getSingleton().getBufferOffsetRange();
+			break;
+		case RenderingTechnique::kDepth:
+			aabbsBuffer = GpuSceneArrays::RenderableAabbDepth::getSingleton().getBufferOffsetRange();
+			break;
+		case RenderingTechnique::kForward:
+			aabbsBuffer = GpuSceneArrays::RenderableAabbForward::getSingleton().getBufferOffsetRange();
+			break;
+		default:
+			ANKI_ASSERT(0);
+		}
+
+		cmdb.bindStorageBuffer(0, 0, aabbsBuffer);
+		cmdb.bindStorageBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+		cmdb.bindStorageBuffer(0, 2, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
+		cmdb.bindStorageBuffer(0, 3, instancesBuff);
+		cmdb.bindStorageBuffer(0, 4, indicesBuff);
+		cmdb.bindStorageBuffer(0, 5, rangeBuff);
+		cmdb.bindStorageBuffer(0, 6, m_counterBuffer.get(), counterBufferOffset, sizeof(U32) * 2);
+
+		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
+	});
+
+	// The next call in this frame gets the next slice of the counter buffer
+	m_currentCounterBufferOffset += counterBufferElementSize;
+}
+
 } // end namespace anki

+ 60 - 0
AnKi/Renderer/Utils/GpuVisibility.h

@@ -129,6 +129,66 @@ private:
 	U64 m_lastFrameIdx = kMaxU64;
 	U32 m_counterBufferOffset = 0;
 };
+
+/// @memberof GpuVisibilityAccelerationStructures
+/// The input of GpuVisibilityAccelerationStructures::pupulateRenderGraph. validate() asserts that the mandatory members are set.
+class GpuVisibilityAccelerationStructuresInput
+{
+public:
+	CString m_passesName; ///< The name of the visibility compute pass.
+	RenderingTechnique m_technique = RenderingTechnique::kCount; ///< Selects which array of renderable AABBs will be tested.
+
+	Vec3 m_lodReferencePoint = Vec3(kMaxF32); ///< For now it should be equal to m_pointOfTest.
+	Array<F32, kMaxLodCount - 1> m_lodDistances = {}; ///< Max distances of the first LODs. Forwarded to the shader.
+
+	Vec3 m_pointOfTest = Vec3(kMaxF32); ///< Forwarded to the shader's uniforms.
+	F32 m_testRadius = kMaxF32; ///< Forwarded to the shader's uniforms.
+
+	Mat4 m_viewProjectionMatrix; ///< The clip planes that the shader gets are extracted from this.
+
+	RenderGraphDescription* m_rgraph = nullptr; ///< The render graph that will get the new passes.
+
+	/// Assert that the members have been changed from their defaults.
+	void validate() const
+	{
+		ANKI_ASSERT(m_passesName.getLength() > 0);
+		ANKI_ASSERT(m_technique != RenderingTechnique::kCount);
+		ANKI_ASSERT(m_lodReferencePoint.x() != kMaxF32);
+		ANKI_ASSERT(m_lodReferencePoint == m_pointOfTest && "For now these should be the same");
+		ANKI_ASSERT(m_testRadius != kMaxF32);
+		ANKI_ASSERT(m_viewProjectionMatrix != Mat4());
+		ANKI_ASSERT(m_rgraph);
+	}
+};
+
+/// @memberof GpuVisibilityAccelerationStructures
+/// The output of GpuVisibilityAccelerationStructures::pupulateRenderGraph.
+class GpuVisibilityAccelerationStructuresOutput
+{
+public:
+	/// Some handle to track dependencies. No need to track every buffer. It's the handle of the imported m_instancesBuffer.
+	BufferHandle m_someBufferHandle;
+
+	BufferOffsetRange m_rangeBuffer; ///< Points to a single AccelerationStructureBuildRangeInfo. The m_primitiveCount holds the instance count.
+	BufferOffsetRange m_instancesBuffer; ///< Points to AccelerationStructureBuildRangeInfo::m_primitiveCount number of AccelerationStructureInstance.
+	BufferOffsetRange m_renderableIndicesBuffer; ///< AccelerationStructureBuildRangeInfo::m_primitiveCount number of indices to renderables.
+};
+
+/// Performs visibility to gather bottom-level acceleration structures in a buffer that can be used to build a TLAS.
+class GpuVisibilityAccelerationStructures : public RendererObject
+{
+public:
+	/// Load the shader program.
+	Error init();
+
+	/// Add the visibility pass to the render graph of the input and fill the output with the buffers the pass will write.
+	void pupulateRenderGraph(GpuVisibilityAccelerationStructuresInput& in, GpuVisibilityAccelerationStructuresOutput& out);
+
+private:
+	ShaderProgramResourcePtr m_prog;
+	ShaderProgramPtr m_grProg; ///< The program that the visibility pass binds.
+
+	static constexpr U32 kInitialCounterBufferElementCount = 3; ///< How many counter slices the counter buffer starts with.
+	BufferPtr m_counterBuffer; ///< A buffer containing multiple counters for atomic operations.
+	U64 m_lastFrameIdx = kMaxU64; ///< The frame of the last pupulateRenderGraph call. Used to detect the 1st call in a frame.
+	U32 m_currentCounterBufferOffset = 0; ///< Offset in m_counterBuffer of the slice that the next call will use.
+
+	BufferHandle m_counterBufferHandle; ///< Reset at the start of a frame and set when m_counterBuffer is (re)created.
+};
 /// @}
 
 } // end namespace anki

+ 5 - 0
AnKi/Resource/ModelResource.cpp

@@ -46,6 +46,11 @@ void ModelPatch::getGeometryInfo(U32 lod, ModelPatchGeometryInfo& inf) const
 	{
 		inf.m_vertexBufferOffsets[stream] = m_lodInfos[lod].m_vertexBufferOffsets[stream];
 	}
+
+	if(!!(m_mtl->getRenderingTechniques() & RenderingTechniqueBit::kAllRt))
+	{
+		inf.m_blas = m_mesh->getBottomLevelAccelerationStructure(lod);
+	}
 }
 
 void ModelPatch::getRayTracingInfo(const RenderingKey& key, ModelRayTracingInfo& info) const

+ 2 - 0
AnKi/Resource/ModelResource.h

@@ -55,6 +55,8 @@ public:
 
 	/// Offset to the vertex buffer or kMaxPtrSize if stream is not present.
 	Array<PtrSize, U32(VertexStreamId::kMeshRelatedCount)> m_vertexBufferOffsets;
+
+	AccelerationStructurePtr m_blas;
 };
 
 /// Model patch class. Its very important class and it binds a material with a mesh.

+ 0 - 40
AnKi/Scene/Components/CameraComponent.cpp

@@ -24,9 +24,6 @@ static NumericCVar<F32> g_shadowCascade3DistanceCVar(CVarSubsystem::kScene, "Sha
 static NumericCVar<F32> g_earyZDistanceCVar(CVarSubsystem::kScene, "EarlyZDistance", (ANKI_PLATFORM_MOBILE) ? 0.0f : 10.0f, 0.0f, kMaxF32,
 											"Objects with distance lower than that will be used in early Z");
 BoolCVar g_rayTracedShadowsCVar(CVarSubsystem::kScene, "RayTracedShadows", true, "Enable or not ray traced shadows. Ignored if RT is not supported");
-static NumericCVar<F32>
-	g_rayTracingExtendedFrustumDistanceCVar(CVarSubsystem::kScene, "RayTracingExtendedFrustumDistance", 100.0f, 10.0f, 10000.0f,
-											"Every object that its distance from the camera is bellow that value will take part in ray tracing");
 
 CameraComponent::CameraComponent(SceneNode* node)
 	: SceneComponent(node, kClassType)
@@ -49,21 +46,6 @@ CameraComponent::CameraComponent(SceneNode* node)
 	m_frustum.setEarlyZDistance(g_earyZDistanceCVar.get());
 
 	m_frustum.update();
-
-	// Init extended frustum
-	m_usesExtendedFrustum = GrManager::getSingleton().getDeviceCapabilities().m_rayTracingEnabled && g_rayTracedShadowsCVar.get();
-
-	if(m_usesExtendedFrustum)
-	{
-		m_extendedFrustum.init(FrustumType::kOrthographic);
-
-		const F32 dist = g_rayTracingExtendedFrustumDistanceCVar.get();
-
-		m_extendedFrustum.setOrthographic(0.1f, dist * 2.0f, dist, -dist, dist, -dist);
-		m_extendedFrustum.setWorldTransform(computeExtendedFrustumTransform(node->getWorldTransform()));
-
-		m_extendedFrustum.update();
-	}
 }
 
 CameraComponent::~CameraComponent()
@@ -75,33 +57,11 @@ Error CameraComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 	if(info.m_node->movedThisFrame())
 	{
 		m_frustum.setWorldTransform(info.m_node->getWorldTransform());
-
-		if(m_usesExtendedFrustum)
-		{
-			m_extendedFrustum.setWorldTransform(computeExtendedFrustumTransform(info.m_node->getWorldTransform()));
-		}
 	}
 
 	updated = m_frustum.update();
 
-	if(m_usesExtendedFrustum)
-	{
-		const Bool extendedUpdated = m_extendedFrustum.update();
-		updated = updated || extendedUpdated;
-	}
-
 	return Error::kNone;
 }
 
-Transform CameraComponent::computeExtendedFrustumTransform(const Transform& cameraTransform) const
-{
-	const F32 far = m_extendedFrustum.getFar();
-	Transform extendedFrustumTransform = Transform::getIdentity();
-	Vec3 newOrigin = cameraTransform.getOrigin().xyz();
-	newOrigin.z() += far / 2.0f;
-	extendedFrustumTransform.setOrigin(newOrigin.xyz0());
-
-	return extendedFrustumTransform;
-}
-
 } // end namespace anki

+ 0 - 22
AnKi/Scene/Components/CameraComponent.h

@@ -119,32 +119,10 @@ public:
 		return m_frustum;
 	}
 
-	ANKI_INTERNAL Bool getHasExtendedFrustum() const
-	{
-		return m_usesExtendedFrustum;
-	}
-
-	ANKI_INTERNAL const Frustum& getExtendedFrustum() const
-	{
-		ANKI_ASSERT(m_usesExtendedFrustum);
-		return m_extendedFrustum;
-	}
-
-	ANKI_INTERNAL Frustum& getExtendedFrustum()
-	{
-		ANKI_ASSERT(m_usesExtendedFrustum);
-		return m_extendedFrustum;
-	}
-
 private:
 	Frustum m_frustum;
-	Frustum m_extendedFrustum; ///< For ray tracing.
-
-	Bool m_usesExtendedFrustum : 1 = false;
 
 	Error update(SceneComponentUpdateInfo& info, Bool& updated) override;
-
-	Transform computeExtendedFrustumTransform(const Transform& cameraTransform) const;
 };
 /// @}
 

+ 6 - 66
AnKi/Scene/Components/ModelComponent.cpp

@@ -174,6 +174,12 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 						meshLod.m_vertexOffsets[U32(stream)] = kMaxU32;
 					}
 				}
+
+				if(inf.m_blas)
+				{
+					const U64 address = inf.m_blas->getGpuAddress();
+					memcpy(&meshLod.m_blasAddress, &address, sizeof(meshLod.m_blasAddress));
+				}
 			}
 
 			// Copy the last LOD to the rest just in case
@@ -301,72 +307,6 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 	return Error::kNone;
 }
 
-void ModelComponent::setupRayTracingInstanceQueueElements(U32 lod, RenderingTechnique technique,
-														  WeakArray<RayTracingInstanceQueueElement>& outInstances) const
-{
-	ANKI_ASSERT(isEnabled());
-
-	outInstances.setArray(nullptr, 0);
-
-	const RenderingTechniqueBit requestedRenderingTechniqueMask = RenderingTechniqueBit(1 << technique);
-	if(!(m_presentRenderingTechniques & requestedRenderingTechniqueMask))
-	{
-		return;
-	}
-
-	// Allocate instances
-	U32 instanceCount = 0;
-	for(U32 i = 0; i < m_patchInfos.getSize(); ++i)
-	{
-		instanceCount += !!(m_patchInfos[i].m_techniques & requestedRenderingTechniqueMask);
-	}
-
-	if(instanceCount == 0)
-	{
-		return;
-	}
-
-	RayTracingInstanceQueueElement* instances = static_cast<RayTracingInstanceQueueElement*>(SceneGraph::getSingleton().getFrameMemoryPool().allocate(
-		sizeof(RayTracingInstanceQueueElement) * instanceCount, alignof(RayTracingInstanceQueueElement)));
-
-	outInstances.setArray(instances, instanceCount);
-
-	RenderingKey key;
-	key.setLod(lod);
-	key.setRenderingTechnique(technique);
-
-	instanceCount = 0;
-	for(U32 i = 0; i < m_patchInfos.getSize(); ++i)
-	{
-		if(!(m_patchInfos[i].m_techniques & requestedRenderingTechniqueMask))
-		{
-			continue;
-		}
-
-		RayTracingInstanceQueueElement& queueElem = instances[instanceCount];
-
-		const ModelPatch& patch = m_model->getModelPatches()[i];
-
-		ModelRayTracingInfo modelInf;
-		patch.getRayTracingInfo(key, modelInf);
-
-		queueElem.m_bottomLevelAccelerationStructure = modelInf.m_bottomLevelAccelerationStructure.get();
-		queueElem.m_shaderGroupHandleIndex = modelInf.m_shaderGroupHandleIndex;
-		queueElem.m_worldTransformsOffset = m_gpuSceneTransforms.getGpuSceneOffset();
-		queueElem.m_uniformsOffset = m_patchInfos[i].m_gpuSceneUniformsOffset;
-		queueElem.m_geometryOffset =
-			U32(m_patchInfos[i].m_gpuSceneMeshLods.getIndex() * sizeof(GpuSceneMeshLod) * kMaxLodCount + lod * sizeof(GpuSceneMeshLod));
-		queueElem.m_geometryOffset += U32(GpuSceneArrays::MeshLod::getSingleton().getGpuSceneOffsetOfArrayBase());
-		queueElem.m_indexBufferOffset = U32(modelInf.m_indexBufferOffset);
-
-		const Transform positionTransform(patch.getMesh()->getPositionsTranslation().xyz0(), Mat3x4::getIdentity(),
-										  patch.getMesh()->getPositionsScale());
-		queueElem.m_transform = Mat3x4(m_node->getWorldTransform()).combineTransformations(Mat3x4(positionTransform));
-
-		++instanceCount;
-	}
-}
-
 void ModelComponent::onOtherComponentRemovedOrAdded(SceneComponent* other, Bool added)
 {
 	ANKI_ASSERT(other);

+ 0 - 2
AnKi/Scene/Components/ModelComponent.h

@@ -44,8 +44,6 @@ public:
 		return m_castsShadow;
 	}
 
-	void setupRayTracingInstanceQueueElements(U32 lod, RenderingTechnique technique, WeakArray<RayTracingInstanceQueueElement>& outRenderables) const;
-
 private:
 	class PatchInfo
 	{

+ 0 - 10
AnKi/Scene/Visibility.cpp

@@ -641,16 +641,6 @@ void SceneGraph::doVisibilityTests(SceneNode& camera, SceneGraph& scene, RenderQ
 	static_cast<FrustumFlags&>(visFrustum) = getCameraFrustumFlags();
 	ctx.submitNewWork(visFrustum, visFrustum, rqueue, hive);
 
-	if(camerac.getHasExtendedFrustum())
-	{
-		VisibilityFrustum evisFrustum;
-		evisFrustum.m_frustum = &camerac.getExtendedFrustum();
-		static_cast<FrustumFlags&>(evisFrustum) = getCameraExtendedFrustumFlags();
-
-		rqueue.m_rayTracingQueue = newInstance<RenderQueue>(scene.getFrameMemoryPool());
-		ctx.submitNewWork(evisFrustum, visFrustum, *rqueue.m_rayTracingQueue, hive);
-	}
-
 	hive.waitAllTasks();
 }
 

+ 8 - 0
AnKi/Shaders/CollisionFunctions.hlsl

@@ -57,6 +57,14 @@ Bool testAabbAabb(Vec3 aMin, Vec3 aMax, Vec3 bMin, Vec3 bMax)
 	return all(aMin < bMax) && all(bMin < aMax);
 }
 
+/// Check if 2 spheres overlap. The squared distance of their centers should be strictly less than the squared sum of their radii.
+Bool testSphereSphereCollision(Vec3 sphereCenterA, F32 sphereRadiusA, Vec3 sphereCenterB, F32 sphereRadiusB)
+{
+	const F32 radiiSum = sphereRadiusA + sphereRadiusB;
+	const Vec3 centerDelta = sphereCenterA - sphereCenterB;
+	return dot(centerDelta, centerDelta) < radiiSum * radiiSum;
+}
+
 /// Intersect a ray against an AABB. The ray is inside the AABB. The function returns the distance 'a' where the
 /// intersection point is rayOrigin + rayDir * a
 /// https://community.arm.com/graphics/b/blog/posts/reflections-based-on-local-cubemaps-in-unity

+ 1 - 4
AnKi/Shaders/GpuVisibility.ankiprog

@@ -158,10 +158,7 @@ struct DrawIndirectArgsWithPadding
 	}
 #	endif // HZB_TEST
 #else // DISTANCE_TEST == 1
-	const Vec3 vec = aabb.m_sphereCenter - g_unis.m_pointOfTest;
-	const F32 distFromTestPointSquared = dot(vec, vec);
-	const F32 maxDist = g_unis.m_testRadius + aabb.m_sphereRadius;
-	if(distFromTestPointSquared >= maxDist * maxDist)
+	// Test the bounding sphere of the AABB against the sphere of interest, which has a radius of m_testRadius
+	if(!testSphereSphereCollision(aabb.m_sphereCenter, aabb.m_sphereRadius, g_unis.m_pointOfTest, g_unis.m_testRadius))
 	{
 		return;
 	}

+ 130 - 0
AnKi/Shaders/GpuVisibilityAccelerationStructures.ankiprog

@@ -0,0 +1,130 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki start comp
+
+#include <AnKi/Shaders/Common.hlsl>
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+#include <AnKi/Shaders/Include/GpuVisibilityTypes.h>
+#include <AnKi/Shaders/CollisionFunctions.hlsl>
+
+// Buffers that point to the GPU scene
+[[vk::binding(0)]] StructuredBuffer<GpuSceneRenderableAabb> g_aabbs;
+[[vk::binding(1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
+[[vk::binding(2)]] ByteAddressBuffer g_gpuScene;
+
+[[vk::binding(3)]] RWStructuredBuffer<AccelerationStructureInstance> g_visibleInstances;
+[[vk::binding(4)]] RWStructuredBuffer<U32> g_visibleRenderableIndices; // 1st element is the count
+[[vk::binding(5)]] RWStructuredBuffer<AccelerationStructureBuildRangeInfo> g_range;
+
+[[vk::binding(6)]] globallycoherent RWStructuredBuffer<U32> g_counterBuffer; // 2 counters per dispatch
+
+[[vk::push_constant]] ConstantBuffer<GpuVisibilityAccelerationStructuresUniforms> g_unis;
+
+#define NUMTHREADS 64
+
+// One thread per AABB. Every AABB that passes the distance test and has a BLAS appends an instance to g_visibleInstances. The thread
+// of the last threadgroup to finish publishes the final count to g_range and g_visibleRenderableIndices and zeroes the counters
+[numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX)
+{
+	// Skip remaining threads
+	const U32 aabbIdx = svDispatchThreadId;
+	U32 aabbCount;
+	U32 unused;
+	g_aabbs.GetDimensions(aabbCount, unused);
+	Bool visible = (aabbIdx < aabbCount);
+
+	// Sphere test. The AABB's bounding sphere against the sphere of interest, which has a radius of m_testRadius
+	GpuSceneRenderableAabb aabb;
+	if(visible)
+	{
+		aabb = g_aabbs[aabbIdx];
+		visible = testSphereSphereCollision(aabb.m_sphereCenter, aabb.m_sphereRadius, g_unis.m_pointOfTest, g_unis.m_testRadius);
+	}
+
+	// All good, write the instance
+	if(visible)
+	{
+		// LOD selection
+		U32 lod;
+		const Bool insideCameraFrustum = frustumTest(g_unis.m_clipPlanes, aabb.m_sphereCenter, aabb.m_sphereRadius);
+		if(insideCameraFrustum)
+		{
+			// Visible by the camera, need to match the camera LODs
+			const F32 distFromLodPoint = length(aabb.m_sphereCenter - g_unis.m_pointOfTest) - aabb.m_sphereRadius;
+			if(distFromLodPoint < g_unis.m_maxLodDistances[0])
+			{
+				lod = 0u;
+			}
+			else if(distFromLodPoint < g_unis.m_maxLodDistances[1])
+			{
+				lod = 1u;
+			}
+			else
+			{
+				lod = 2u;
+			}
+		}
+		else
+		{
+			// Not visible by the main camera, lowest LOD
+			lod = 2u;
+		}
+
+		const U32 renderableIdx = aabb.m_renderableIndexAndRenderStateBucket >> 12u;
+		const GpuSceneRenderable renderable = g_renderables[renderableIdx];
+
+		const U32 meshLodOffset = renderable.m_meshLodsOffset + sizeof(GpuSceneMeshLod) * lod;
+		const GpuSceneMeshLod meshLod = g_gpuScene.Load<GpuSceneMeshLod>(meshLodOffset);
+
+		if(meshLod.m_blasAddress.x != 0 || meshLod.m_blasAddress.y != 0)
+		{
+			// It has a BLAS, append an instance
+			U32 instanceIdx;
+			InterlockedAdd(g_counterBuffer[0], 1, instanceIdx);
+
+			// The instance mirrors VkAccelerationStructureInstanceKHR. There, the 24bit fields occupy the 24 least significant bits of
+			// their word and the 8bit fields the 8 most significant
+			const U32 flags = kAccellerationStructureFlagTriangleFrontCounterlockwise | kAccellerationStructureFlagTriangleFacingCullDisable;
+
+			AccelerationStructureInstance instance;
+			instance.m_transform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
+			instance.m_instanceCustomIndex24_mask8 = (instanceIdx & 0x00FFFFFFu) | ((meshLod.m_tlasInstanceMask & 0xFFu) << 24u);
+			instance.m_instanceShaderBindingTableRecordOffset24_flags8 = (instanceIdx & 0x00FFFFFFu) | ((flags & 0xFFu) << 24u);
+			instance.m_accelerationStructureAddress = meshLod.m_blasAddress;
+			g_visibleInstances[instanceIdx] = instance;
+
+			// +1 because the 1st element is the count
+			g_visibleRenderableIndices[instanceIdx + 1] = renderableIdx;
+		}
+	}
+
+	// Store the counters to the actual buffers
+	{
+		// Sync before this threadgroup reports itself as done. Without it, the 1st thread could bump the threadgroup counter while
+		// other threads of the same threadgroup are still appending instances and the last threadgroup would read a partial count
+		AllMemoryBarrierWithGroupSync();
+
+		Bool lastThreadgroupExecuting = false;
+		if(svGroupIndex == 0)
+		{
+			U32 threadgroupIdx;
+			InterlockedAdd(g_counterBuffer[1], 1, threadgroupIdx);
+			const U32 threadgroupCount = (aabbCount + NUMTHREADS - 1) / NUMTHREADS;
+			lastThreadgroupExecuting = (threadgroupIdx + 1 == threadgroupCount);
+		}
+
+		// Only the 1st thread of the last threadgroup can enter
+		if(lastThreadgroupExecuting)
+		{
+			AccelerationStructureBuildRangeInfo range;
+			range.m_primitiveCount = g_counterBuffer[0];
+			range.m_primitiveOffset = 0;
+			range.m_firstVertex = 0;
+			range.m_transformOffset = 0;
+			g_range[0] = range;
+
+			g_visibleRenderableIndices[0] = g_counterBuffer[0];
+
+			// Leave the counters zeroed for the next dispatch that will use them
+			g_counterBuffer[0] = 0;
+			g_counterBuffer[1] = 0;
+		}
+	}
+}
+
+#pragma anki end

+ 6 - 5
AnKi/Shaders/Include/Common.h

@@ -800,12 +800,13 @@ struct AccelerationStructureBuildRangeInfo
 };
 
 /// Mirrors VkGeometryInstanceFlagBitsKHR
-enum class AccellerationStructureFlag : U32
+// Unscoped and with prefixed enumerants. NOTE(review): presumably so that shader code can OR the values into U32s; confirm.
+enum AccellerationStructureFlag : U32
 {
-	kTriangleFacingCullDisable = 1 << 0,
-	kFlipFacing = 1 << 1,
-	kForceOpaque = 1 << 2,
-	kForceNoOpaque = 1 << 3
+	kAccellerationStructureFlagTriangleFacingCullDisable = 1 << 0,
+	kAccellerationStructureFlagFlipFacing = 1 << 1,
+	kAccellerationStructureFlagForceOpaque = 1 << 2,
+	kAccellerationStructureFlagForceNoOpaque = 1 << 3,
+	// An alias of the flip facing bit, same as in VkGeometryInstanceFlagBitsKHR
+	kAccellerationStructureFlagTriangleFrontCounterlockwise = kAccellerationStructureFlagFlipFacing
 };
 
 /// Mirrors VkAccelerationStructureInstanceKHR.

+ 5 - 1
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -56,8 +56,12 @@ struct GpuSceneMeshLod
 
 	Vec3 m_positionTranslation;
 	F32 m_positionScale;
+
+	UVec2 m_blasAddress;
+	U32 m_tlasInstanceMask; ///< Mask that goes to AccelerationStructureInstance::m_instanceCustomIndex24_mask8
+	U32 m_padding;
 };
-static_assert(sizeof(GpuSceneMeshLod) == sizeof(Vec4) * 3);
+static_assert(sizeof(GpuSceneMeshLod) == sizeof(Vec4) * 4);
 
 struct GpuSceneParticleEmitter
 {

+ 5 - 15
AnKi/Shaders/Include/GpuVisibilityTypes.h

@@ -37,24 +37,14 @@ struct GpuVisibilityNonRenderableUniforms
 	Vec4 m_clipPlanes[6u];
 };
 
-struct PointLightRendererCacheEntry
+/// The push constants of the GpuVisibilityAccelerationStructures shader.
+struct GpuVisibilityAccelerationStructuresUniforms
 {
-	U32 m_shadowLayer; ///< Shadow layer used in RT shadows. Also used to show that it doesn't cast shadow.
-	F32 m_shadowAtlasTileScale; ///< UV scale for all tiles.
-	U32 m_uuid;
-	F32 m_padding0;
-
-	Vec4 m_shadowAtlasTileOffsets[6u]; ///< It's a array of Vec2 but because of padding round it up.
-};
+	/// The clip planes of the camera. xyz is the normal and w the offset.
+	Vec4 m_clipPlanes[6u];
 
-struct SpotLightRendererCacheEntry
-{
-	U32 m_shadowLayer; ///< Shadow layer used in RT shadows. Also used to show that it doesn't cast shadow.
-	U32 m_uuid;
-	U32 m_padding0;
-	U32 m_padding1;
+	Vec3 m_pointOfTest;
+	F32 m_testRadius;
 
-	Mat4 m_textureMatrix;
+	/// x and y are the max distances of LOD 0 and LOD 1. The CPU sets the rest to kMaxF32.
+	Vec4 m_maxLodDistances;
 };
 
 ANKI_END_NAMESPACE

+ 3 - 0
AnKi/Shaders/Intellisense.hlsl

@@ -18,6 +18,9 @@
 #define numthreads(x, y, z) [nodiscard]
 #define unroll [nodiscard]
 #define loop [nodiscard]
+#define out
+#define in
+#define inout
 
 #define ANKI_BEGIN_NAMESPACE
 #define ANKI_END_NAMESPACE