
Fix raytracing bugs

Panagiotis Christopoulos Charitos, 2 years ago
commit d78dcc52d4

+ 1 - 0
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -85,6 +85,7 @@ private:
 		U32 alignment = GrManager::getSingleton().getDeviceCapabilities().m_uniformBufferBindOffsetAlignment;
 		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
 		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
+		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_accelerationStructureBuildScratchOffsetAlignment);
 
 		BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
 								   | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kTransferDestination;
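
Note: the pool sub-allocates from a single buffer, so its base alignment must satisfy every consumer; with AS-build scratch now coming from this pool, its alignment joins the max. In practice all of these Vulkan limits are powers of two, so aligning to the largest one satisfies the rest, and the new capability defaults to 0 (see Common.h below), which keeps the extra max() a no-op on devices without ray tracing. A minimal standalone sketch of that property (plain C++, not engine code; the numeric limits are hypothetical):

	#include <cassert>
	#include <cstdint>

	// Round value up to a multiple of alignment (alignment assumed to be a power of two).
	static uint64_t alignUp(uint64_t value, uint64_t alignment)
	{
		return (value + alignment - 1) / alignment * alignment;
	}

	int main()
	{
		// Hypothetical limits: uniform offset, SBT record, AS-build scratch.
		const uint64_t alignments[] = {16, 64, 256};
		uint64_t maxAlignment = 1;
		for(uint64_t a : alignments)
			maxAlignment = (a > maxAlignment) ? a : maxAlignment;

		const uint64_t offset = alignUp(1000, maxAlignment); // 1024
		for(uint64_t a : alignments)
			assert(offset % a == 0); // one alignment satisfies every consumer
		return 0;
	}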

+ 3 - 9
AnKi/Gr/CommandBuffer.h

@@ -404,16 +404,10 @@ public:
 	void copyBufferToBuffer(Buffer* src, Buffer* dst, ConstWeakArray<CopyBufferToBufferInfo> copies);
 
 	/// Build the acceleration structure.
-	void buildAccelerationStructure(AccelerationStructure* as);
-
-	/// Build an acceleration stracture indirectly. Only valid for TLASes.
 	/// @param as The AS to build.
 	/// @param scratchBuffer A scratch buffer. Ask the AS for size.
 	/// @param scratchBufferOffset Scratch buffer offset.
-	/// @param rangeBuffer Points to a single AccelerationStructureBuildRangeInfo.
-	/// @param rangeBufferOffsset Offset in rangeBuffer.
-	void buildAccelerationStructureIndirect(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset, Buffer* rangeBuffer,
-											PtrSize rangeBufferOffsset);
+	void buildAccelerationStructure(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset);
 
 	/// Do upscaling by an external upscaler
 	/// @param[in] upscaler the upscaler to use for upscaling
@@ -424,8 +418,8 @@ public:
 	/// @param[in] exposure 1x1 Texture containing exposure
 	/// @param[in] resetAccumulation Whether to clean or not any temporal history
 	/// @param[in] jitterOffset Jittering offset that was applied during the generation of sourceTexture
-	/// @param[in] motionVectorsScale Any scale factor that might need to be applied to the motionVectorsTexture (i.e UV
-	///                               space to Pixel space conversion)
+	/// @param[in] motionVectorsScale Any scale factor that might need to be applied to the motionVectorsTexture (i.e UV space to Pixel space
+	///                               conversion)
 	void upscale(GrUpscaler* upscaler, TextureView* inColor, TextureView* outUpscaledColor, TextureView* motionVectors, TextureView* depth,
 				 TextureView* exposure, Bool resetAccumulation, const Vec2& jitterOffset, const Vec2& motionVectorsScale);
 	/// @}
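
A hedged usage sketch of the consolidated entry point: the caller now always provides the scratch memory, sized via the AS and offset-aligned to the new m_accelerationStructureBuildScratchOffsetAlignment capability. The types and calls below are the ones touched by this commit; the standalone-buffer setup mirrors the MeshResource.cpp change further down and is illustrative, not the only intended pattern (buildBlas is a made-up helper name):

	// Sketch, assuming cmdb is a command buffer that is already being recorded.
	void buildBlas(CommandBuffer& cmdb, AccelerationStructure& as)
	{
		BufferInitInfo scratchInit("ASBuildScratch");
		scratchInit.m_size = as.getBuildScratchBufferSize();
		scratchInit.m_usage = BufferUsageBit::kAccelerationStructureBuildScratch;
		BufferPtr scratch = GrManager::getSingleton().newBuffer(scratchInit);

		// Offset 0 trivially satisfies the scratch alignment; a non-zero offset must be
		// a multiple of m_accelerationStructureBuildScratchOffsetAlignment.
		cmdb.buildAccelerationStructure(&as, scratch.get(), 0);
	}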

+ 3 - 0
AnKi/Gr/Common.h

@@ -167,6 +167,9 @@ public:
 	/// The max combined size of shared variables (with paddings) in compute shaders.
 	PtrSize m_computeSharedMemorySize = 16_KB;
 
+	/// Alignment of the scratch buffer used in AS building.
+	U32 m_accelerationStructureBuildScratchOffsetAlignment = 0;
+
 	/// Each SBT record should be a multiple of this.
 	U32 m_sbtRecordAlignment = kMaxU32;
 

+ 3 - 0
AnKi/Gr/RenderGraph.cpp

@@ -1335,7 +1335,10 @@ void RenderGraph::run() const
 			inf.m_nextUsage = barrier.m_usageAfter;
 			inf.m_as = m_ctx->m_as[barrier.m_idx].m_as.get();
 		}
+
+		cmdb.pushDebugMarker("Barrier", Vec3(1.0f, 0.0f, 0.0f));
 		cmdb.setPipelineBarrier(texBarriers, buffBarriers, asBarriers);
+		cmdb.popDebugMarker();
 
 		// Call the passes
 		for(U32 passIdx : batch.m_passIndices)

+ 1 - 1
AnKi/Gr/Vulkan/AccelerationStructureImpl.cpp

@@ -185,7 +185,7 @@ Error AccelerationStructureImpl::init(const AccelerationStructureInitInfo& inf)
 		buildInfo.dstAccelerationStructure = m_handle;
 
 		// Range info
-		m_rangeInfo.primitiveCount = inf.m_topLevel.m_directArgs.m_instances.getSize();
+		m_rangeInfo.primitiveCount = instanceCount;
 	}
 
 	return Error::kNone;

+ 2 - 9
AnKi/Gr/Vulkan/CommandBuffer.cpp

@@ -359,17 +359,10 @@ void CommandBuffer::copyBufferToBuffer(Buffer* src, Buffer* dst, ConstWeakArray<
 	self.copyBufferToBufferInternal(src, dst, copies);
 }
 
-void CommandBuffer::buildAccelerationStructure(AccelerationStructure* as)
+void CommandBuffer::buildAccelerationStructure(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset)
 {
 	ANKI_VK_SELF(CommandBufferImpl);
-	self.buildAccelerationStructureInternal(as);
-}
-
-void CommandBuffer::buildAccelerationStructureIndirect(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset,
-													   Buffer* rangeBuffer, PtrSize rangeBufferOffsset)
-{
-	ANKI_VK_SELF(CommandBufferImpl);
-	self.buildAccelerationStructureIndirectInternal(as, scratchBuffer, scratchBufferOffset, rangeBuffer, rangeBufferOffsset);
+	self.buildAccelerationStructureInternal(as, scratchBuffer, scratchBufferOffset);
 }
 
 void CommandBuffer::upscale(GrUpscaler* upscaler, TextureView* inColor, TextureView* outUpscaledColor, TextureView* motionVectors, TextureView* depth,

+ 5 - 38
AnKi/Gr/Vulkan/CommandBufferImpl.cpp

@@ -486,35 +486,10 @@ void CommandBufferImpl::rebindDynamicState()
 	}
 }
 
-void CommandBufferImpl::buildAccelerationStructureInternal(AccelerationStructure* as)
+void CommandBufferImpl::buildAccelerationStructureInternal(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset)
 {
-	commandCommon();
-
-	// Get objects
-	const AccelerationStructureImpl& asImpl = static_cast<AccelerationStructureImpl&>(*as);
-
-	// Create the scrach buffer
-	BufferInitInfo bufferInit;
-	bufferInit.m_usage = BufferUsageBit::kAccelerationStructureBuildScratch;
-	bufferInit.m_size = asImpl.getBuildScratchBufferSize();
-	BufferPtr scratchBuff = getGrManagerImpl().newBuffer(bufferInit);
-
-	// Create the build info
-	VkAccelerationStructureBuildGeometryInfoKHR buildInfo;
-	VkAccelerationStructureBuildRangeInfoKHR rangeInfo;
-	asImpl.generateBuildInfo(scratchBuff->getGpuAddress(), buildInfo, rangeInfo);
-
-	// Run the command
-	Array<const VkAccelerationStructureBuildRangeInfoKHR*, 1> pRangeInfos = {&rangeInfo};
-	vkCmdBuildAccelerationStructuresKHR(m_handle, 1, &buildInfo, &pRangeInfos[0]);
-}
-
-void CommandBufferImpl::buildAccelerationStructureIndirectInternal(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset,
-																   Buffer* rangeBuffer, PtrSize rangeBufferOffsset)
-{
-	ANKI_ASSERT(as && scratchBuffer && rangeBuffer);
+	ANKI_ASSERT(as && scratchBuffer);
 	ANKI_ASSERT(as->getBuildScratchBufferSize() + scratchBufferOffset <= scratchBuffer->getSize());
-	ANKI_ASSERT(rangeBufferOffsset + sizeof(AccelerationStructureBuildRangeInfo) <= rangeBuffer->getSize());
 
 	commandCommon();
 
@@ -523,20 +498,12 @@ void CommandBufferImpl::buildAccelerationStructureIndirectInternal(AccelerationS
 
 	// Create the build info
 	VkAccelerationStructureBuildGeometryInfoKHR buildInfo;
-	[[maybe_unused]] VkAccelerationStructureBuildRangeInfoKHR rangeInfo;
+	VkAccelerationStructureBuildRangeInfoKHR rangeInfo;
 	asImpl.generateBuildInfo(scratchBuffer->getGpuAddress() + scratchBufferOffset, buildInfo, rangeInfo);
 
 	// Run the command
-	constexpr U32 kASCount = 1;
-	constexpr U32 kGeometryCount = 1;
-
-	Array<VkDeviceAddress, kASCount> rangeAddr = {rangeBuffer->getGpuAddress() + rangeBufferOffsset};
-	Array<U32, kASCount> strides = {sizeof(AccelerationStructureBuildRangeInfo)};
-
-	Array<U32, kGeometryCount> maxPrimitives = {asImpl.getMaxInstanceCount()};
-	Array<const U32*, kASCount> maxPrimitiveCountArr = {&maxPrimitives[0]};
-
-	vkCmdBuildAccelerationStructuresIndirectKHR(m_handle, kASCount, &buildInfo, &rangeAddr[0], &strides[0], &maxPrimitiveCountArr[0]);
+	Array<const VkAccelerationStructureBuildRangeInfoKHR*, 1> pRangeInfos = {&rangeInfo};
+	vkCmdBuildAccelerationStructuresKHR(m_handle, 1, &buildInfo, &pRangeInfos[0]);
 }
 
 #if ANKI_DLSS
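
For context, Vulkan requires the scratch address handed to vkCmdBuildAccelerationStructuresKHR to be a multiple of VkPhysicalDeviceAccelerationStructurePropertiesKHR::minAccelerationStructureScratchOffsetAlignment, which is why this commit also exposes that limit as a capability and feeds it into the transient pool's alignment. Illustrative fragment (raw Vulkan with recent headers, not engine code):

	#include <vulkan/vulkan.h>
	#include <cassert>

	// Validate and set the scratch address on a build-geometry info (illustrative helper).
	void setScratch(VkAccelerationStructureBuildGeometryInfoKHR& buildInfo, VkDeviceAddress bufferBase,
					VkDeviceSize offset, const VkPhysicalDeviceAccelerationStructurePropertiesKHR& props)
	{
		const VkDeviceAddress addr = bufferBase + offset;
		// The scratch address must be a multiple of minAccelerationStructureScratchOffsetAlignment.
		assert(addr % props.minAccelerationStructureScratchOffsetAlignment == 0);
		buildInfo.scratchData.deviceAddress = addr;
	}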

+ 1 - 4
AnKi/Gr/Vulkan/CommandBufferImpl.h

@@ -424,10 +424,7 @@ public:
 
 	void copyBufferToBufferInternal(Buffer* src, Buffer* dst, ConstWeakArray<CopyBufferToBufferInfo> copies);
 
-	void buildAccelerationStructureInternal(AccelerationStructure* as);
-
-	void buildAccelerationStructureIndirectInternal(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset,
-													Buffer* rangeBuffer, PtrSize rangeBufferOffsset);
+	void buildAccelerationStructureInternal(AccelerationStructure* as, Buffer* scratchBuffer, PtrSize scratchBufferOffset);
 
 	void upscaleInternal(GrUpscaler* upscaler, TextureView* inColor, TextureView* outUpscaledColor, TextureView* motionVectors, TextureView* depth,
 						 TextureView* exposure, const Bool resetAccumulation, const Vec2& jitterOffset, const Vec2& motionVectorsScale);

+ 7 - 5
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -991,6 +991,12 @@ Error GrManagerImpl::initDevice()
 		ANKI_ASSERT(m_accelerationStructureFeatures.pNext == nullptr);
 		m_accelerationStructureFeatures.pNext = const_cast<void*>(ci.pNext);
 		ci.pNext = &m_rtPipelineFeatures;
+
+		// Get some more stuff
+		VkPhysicalDeviceAccelerationStructurePropertiesKHR props = {};
+		props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR;
+		getPhysicalDeviceProperties2(props);
+		m_capabilities.m_accelerationStructureBuildScratchOffsetAlignment = props.minAccelerationStructureScratchOffsetAlignment;
 	}
 
 	// Pipeline features
@@ -1076,11 +1082,7 @@ Error GrManagerImpl::initDevice()
 		{
 			VkPhysicalDeviceFragmentShadingRatePropertiesKHR fragmentShadingRateProperties = {};
 			fragmentShadingRateProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR;
-
-			VkPhysicalDeviceProperties2 properties = {};
-			properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
-			properties.pNext = &fragmentShadingRateProperties;
-			vkGetPhysicalDeviceProperties2(m_physicalDevice, &properties);
+			getPhysicalDeviceProperties2(fragmentShadingRateProperties);
 
 			if(fragmentShadingRateProperties.minFragmentShadingRateAttachmentTexelSize.width > 16
 			   || fragmentShadingRateProperties.minFragmentShadingRateAttachmentTexelSize.height > 16

+ 9 - 0
AnKi/Gr/Vulkan/GrManagerImpl.h

@@ -324,6 +324,15 @@ private:
 										   const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, void* pUserData);
 
 	Error printPipelineShaderInfoInternal(VkPipeline ppline, CString name, U64 hash) const;
+
+	template<typename TProps>
+	void getPhysicalDeviceProperties2(TProps& props) const
+	{
+		VkPhysicalDeviceProperties2 properties = {};
+		properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+		properties.pNext = &props;
+		vkGetPhysicalDeviceProperties2(m_physicalDevice, &properties);
+	}
 };
 /// @}
 

+ 1 - 1
AnKi/Math/Mat.h

@@ -1462,7 +1462,7 @@ public:
 				{
 					fmt = "%f ";
 				}
-				str += String().sprintf(fmt, m_arr2[j][i]);
+				str += String().sprintf(fmt.cstr(), m_arr2[j][i]);
 			}
 		}
 		return str;

+ 2 - 3
AnKi/Renderer/AccelerationStructureBuilder.cpp

@@ -59,10 +59,9 @@ void AccelerationStructureBuilder::populateRenderGraph(RenderingContext& ctx)
 		rpass.newAccelerationStructureDependency(m_runCtx.m_tlasHandle, AccelerationStructureUsageBit::kBuild);
 		rpass.newBufferDependency(visOut.m_someBufferHandle, BufferUsageBit::kAccelerationStructureBuild);
 
-		rpass.setWork([this, scratchBuff, rangeBuff = visOut.m_rangeBuffer](RenderPassWorkContext& rgraphCtx) {
+		rpass.setWork([this, scratchBuff](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(ASBuilder);
-			rgraphCtx.m_commandBuffer->buildAccelerationStructureIndirect(m_runCtx.m_tlas.get(), scratchBuff.m_buffer, scratchBuff.m_offset,
-																		  rangeBuff.m_buffer, rangeBuff.m_offset);
+			rgraphCtx.m_commandBuffer->buildAccelerationStructure(m_runCtx.m_tlas.get(), scratchBuff.m_buffer, scratchBuff.m_offset);
 		});
 	}
 }

+ 3 - 1
AnKi/Renderer/RtShadows.cpp

@@ -226,6 +226,8 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 
 		ComputeRenderPassDescription& rpass = rgraph.newComputeRenderPass("RtShadows setup build SBT");
 
+		rpass.newBufferDependency(sbtBuildIndirectArgsHandle, BufferUsageBit::kAccelerationStructureBuild);
+
 		rpass.setWork([this, sbtBuildIndirectArgsBuffer](RenderPassWorkContext& rgraphCtx) {
 			ANKI_TRACE_SCOPED_EVENT(RtShadows);
 			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -271,7 +273,7 @@ void RtShadows::populateRenderGraph(RenderingContext& ctx)
 
 			cmdb.bindShaderProgram(m_buildSbtGrProg.get());
 
-			cmdb.bindStorageBuffer(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferOffsetRange());
+			cmdb.bindStorageBuffer(0, 0, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
 			cmdb.bindStorageBuffer(0, 1, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
 			cmdb.bindStorageBuffer(0, 2, visibleRenderableIndicesBuff);
 			cmdb.bindStorageBuffer(0, 3, &m_rtLibraryGrProg->getShaderGroupHandlesGpuBuffer(), 0, kMaxPtrSize);

+ 71 - 72
AnKi/Renderer/Utils/GpuVisibility.cpp

@@ -480,7 +480,16 @@ void GpuVisibilityNonRenderables::populateRenderGraph(GpuVisibilityNonRenderable
 
 Error GpuVisibilityAccelerationStructures::init()
 {
-	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", m_prog, m_grProg));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructures.ankiprogbin", m_visibilityProg, m_visibilityGrProg));
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibilityAccelerationStructuresZeroRemainingInstances.ankiprogbin", m_zeroRemainingInstancesProg,
+								 m_zeroRemainingInstancesGrProg));
+
+	BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
+	inf.m_size = sizeof(U32) * 2;
+	inf.m_usage = BufferUsageBit::kStorageComputeWrite | BufferUsageBit::kStorageComputeRead | BufferUsageBit::kTransferDestination;
+	m_counterBuffer = GrManager::getSingleton().newBuffer(inf);
+
+	zeroBuffer(m_counterBuffer.get());
 
 	return Error::kNone;
 }
@@ -491,98 +500,88 @@ void GpuVisibilityAccelerationStructures::pupulateRenderGraph(GpuVisibilityAccel
 	in.validate();
 	RenderGraphDescription& rgraph = *in.m_rgraph;
 
-	const Bool firstRunInFrame = m_lastFrameIdx != getRenderer().getFrameCount();
-	if(firstRunInFrame)
-	{
-		// 1st run in this frame, do some bookkeeping
-		m_lastFrameIdx = getRenderer().getFrameCount();
-		m_currentCounterBufferOffset = 0;
-
-		m_counterBufferHandle = {};
-	}
+#if ANKI_ASSERTIONS_ENABLED
+	ANKI_ASSERT(m_lastFrameIdx != getRenderer().getFrameCount());
+	m_lastFrameIdx = getRenderer().getFrameCount();
+#endif
 
-	// Maybe create the counter buffer
-	const U32 counterBufferElementSize =
-		max<U32>(2 * sizeof(U32), GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
-	if(!m_counterBuffer.isCreated() || m_currentCounterBufferOffset + counterBufferElementSize > m_counterBuffer->getSize()) [[unlikely]]
-	{
-		BufferInitInfo inf("GpuVisibilityAccelerationStructuresCounters");
-		inf.m_size = (!m_counterBuffer.isCreated()) ? kInitialCounterBufferElementCount * counterBufferElementSize : m_counterBuffer->getSize() * 2;
-		inf.m_usage = BufferUsageBit::kStorageComputeWrite | BufferUsageBit::kStorageComputeRead | BufferUsageBit::kTransferDestination;
-		m_counterBuffer = GrManager::getSingleton().newBuffer(inf);
+	// Allocate the transient buffers
+	const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
 
-		m_counterBufferHandle = rgraph.importBuffer(m_counterBuffer.get(), BufferUsageBit::kTransferDestination);
+	out.m_instancesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(AccelerationStructureInstance));
+	out.m_someBufferHandle = rgraph.importBuffer(BufferUsageBit::kStorageComputeWrite, out.m_instancesBuffer);
 
-		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("GpuVisibilityNonRenderablesClearCounterBuffer");
+	out.m_renderableIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((aabbCount + 1) * sizeof(U32));
 
-		pass.newBufferDependency(m_counterBufferHandle, BufferUsageBit::kTransferDestination);
+	const BufferOffsetRange zeroInstancesDispatchArgsBuff = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs));
 
-		pass.setWork([counterBuffer = m_counterBuffer](RenderPassWorkContext& rgraph) {
-			rgraph.m_commandBuffer->fillBuffer(counterBuffer.get(), 0, kMaxPtrSize, 0);
-		});
+	// Create vis pass
+	{
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(in.m_passesName);
 
-		m_currentCounterBufferOffset = 0;
-	}
+		pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
+		pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kStorageComputeWrite);
 
-	// Allocate the transient buffers
-	const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
+		pass.setWork([this, viewProjMat = in.m_viewProjectionMatrix, lodDistances = in.m_lodDistances, pointOfTest = in.m_pointOfTest,
+					  testRadius = in.m_testRadius, instancesBuff = out.m_instancesBuffer, indicesBuff = out.m_renderableIndicesBuffer,
+					  zeroInstancesDispatchArgsBuff](RenderPassWorkContext& rgraph) {
+			CommandBuffer& cmdb = *rgraph.m_commandBuffer;
 
-	out.m_instancesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(AccelerationStructureInstance));
-	out.m_someBufferHandle = rgraph.importBuffer(BufferUsageBit::kStorageComputeWrite, out.m_instancesBuffer);
+			cmdb.bindShaderProgram(m_visibilityGrProg.get());
 
-	out.m_renderableIndicesBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate((aabbCount + 1) * sizeof(U32));
+			GpuVisibilityAccelerationStructuresUniforms unis;
+			Array<Plane, 6> planes;
+			extractClipPlanes(viewProjMat, planes);
+			for(U32 i = 0; i < 6; ++i)
+			{
+				unis.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
+			}
 
-	out.m_rangeBuffer = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(AccelerationStructureBuildRangeInfo));
+			unis.m_pointOfTest = pointOfTest;
+			unis.m_testRadius = testRadius;
 
-	// Create the compute pass
-	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(in.m_passesName);
+			ANKI_ASSERT(kMaxLodCount == 3);
+			unis.m_maxLodDistances[0] = lodDistances[0];
+			unis.m_maxLodDistances[1] = lodDistances[1];
+			unis.m_maxLodDistances[2] = kMaxF32;
+			unis.m_maxLodDistances[3] = kMaxF32;
 
-	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), BufferUsageBit::kStorageComputeRead);
-	pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kStorageComputeWrite);
-	if(m_counterBufferHandle.isValid())
-	{
-		pass.newBufferDependency(m_counterBufferHandle, BufferUsageBit::kStorageComputeWrite);
-	}
+			cmdb.setPushConstants(&unis, sizeof(unis));
 
-	pass.setWork([this, viewProjMat = in.m_viewProjectionMatrix, lodDistances = in.m_lodDistances, pointOfTest = in.m_pointOfTest,
-				  testRadius = in.m_testRadius, instancesBuff = out.m_instancesBuffer, indicesBuff = out.m_renderableIndicesBuffer,
-				  rangeBuff = out.m_rangeBuffer, counterBufferOffset = m_currentCounterBufferOffset](RenderPassWorkContext& rgraph) {
-		CommandBuffer& cmdb = *rgraph.m_commandBuffer;
+			cmdb.bindStorageBuffer(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferOffsetRange());
+			cmdb.bindStorageBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
+			cmdb.bindStorageBuffer(0, 2, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
+			cmdb.bindStorageBuffer(0, 3, instancesBuff);
+			cmdb.bindStorageBuffer(0, 4, indicesBuff);
+			cmdb.bindStorageBuffer(0, 5, m_counterBuffer.get(), 0, sizeof(U32) * 2);
+			cmdb.bindStorageBuffer(0, 6, zeroInstancesDispatchArgsBuff);
 
-		cmdb.bindShaderProgram(m_grProg.get());
+			const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
+			dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
+		});
+	}
 
-		GpuVisibilityAccelerationStructuresUniforms unis;
-		Array<Plane, 6> planes;
-		extractClipPlanes(viewProjMat, planes);
-		for(U32 i = 0; i < 6; ++i)
-		{
-			unis.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
-		}
+	// Zero remaining instances
+	{
+		Array<Char, 64> passName;
+		snprintf(passName.getBegin(), sizeof(passName), "%s: Zero remaining instances", in.m_passesName.cstr());
 
-		unis.m_pointOfTest = pointOfTest;
-		unis.m_testRadius = testRadius;
+		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passName.getBegin());
 
-		ANKI_ASSERT(kMaxLodCount == 3);
-		unis.m_maxLodDistances[0] = lodDistances[0];
-		unis.m_maxLodDistances[1] = lodDistances[1];
-		unis.m_maxLodDistances[2] = kMaxF32;
-		unis.m_maxLodDistances[3] = kMaxF32;
+		pass.newBufferDependency(out.m_someBufferHandle, BufferUsageBit::kStorageComputeWrite);
 
-		cmdb.setPushConstants(&unis, sizeof(unis));
+		pass.setWork([this, zeroInstancesDispatchArgsBuff, instancesBuff = out.m_instancesBuffer,
+					  indicesBuff = out.m_renderableIndicesBuffer](RenderPassWorkContext& rgraph) {
+			CommandBuffer& cmdb = *rgraph.m_commandBuffer;
 
-		cmdb.bindStorageBuffer(0, 0, GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getBufferOffsetRange());
-		cmdb.bindStorageBuffer(0, 1, GpuSceneArrays::Renderable::getSingleton().getBufferOffsetRange());
-		cmdb.bindStorageBuffer(0, 2, &GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
-		cmdb.bindStorageBuffer(0, 3, instancesBuff);
-		cmdb.bindStorageBuffer(0, 4, indicesBuff);
-		cmdb.bindStorageBuffer(0, 5, rangeBuff);
-		cmdb.bindStorageBuffer(0, 6, m_counterBuffer.get(), counterBufferOffset, sizeof(U32) * 2);
+			cmdb.bindShaderProgram(m_zeroRemainingInstancesGrProg.get());
 
-		const U32 aabbCount = GpuSceneArrays::RenderableBoundingVolumeRt::getSingleton().getElementCount();
-		dispatchPPCompute(cmdb, 64, 1, aabbCount, 1);
-	});
+			cmdb.bindStorageBuffer(0, 0, indicesBuff);
+			cmdb.bindStorageBuffer(0, 1, instancesBuff);
 
-	m_currentCounterBufferOffset += counterBufferElementSize;
+			cmdb.dispatchComputeIndirect(zeroInstancesDispatchArgsBuff.m_buffer, zeroInstancesDispatchArgsBuff.m_offset);
+		});
+	}
 }
 
 } // end namespace anki
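
The restructured visibility now runs in two steps: the visibility dispatch appends visible instances through the persistent two-counter buffer (which the shader resets to zero at the end of each dispatch, so the buffer is created and cleared once in init()) and writes DispatchIndirectArgs covering the culled remainder; the follow-up pass then zeroes the instance slots from the visible count up to the total AABB count. Zero-filled instances have an acceleration structure reference of VK_NULL_HANDLE, which Vulkan treats as inactive, so the TLAS can be built over the full, fixed-size instance array and the GPU-written AccelerationStructureBuildRangeInfo plus the indirect build path are no longer needed. A tiny illustration of the inactive-instance rule (raw Vulkan struct, not engine code):

	#include <vulkan/vulkan.h>
	#include <cassert>
	#include <cstring>

	int main()
	{
		// An all-zero instance is "inactive": its accelerationStructureReference is
		// VK_NULL_HANDLE, so the TLAS build skips it. Zero-filling the tail of the
		// instance array therefore keeps a build over the whole array valid.
		VkAccelerationStructureInstanceKHR inactive;
		std::memset(&inactive, 0, sizeof(inactive));
		assert(inactive.accelerationStructureReference == 0);
		return 0;
	}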

+ 8 - 7
AnKi/Renderer/Utils/GpuVisibility.h

@@ -163,7 +163,6 @@ class GpuVisibilityAccelerationStructuresOutput
 public:
 	BufferHandle m_someBufferHandle; ///< Some handle to track dependencies. No need to track every buffer.
 
-	BufferOffsetRange m_rangeBuffer; ///< Points to a single AccelerationStructureBuildRangeInfo. The m_primitiveCount holds the instance count.
 	BufferOffsetRange m_instancesBuffer; ///< Points to AccelerationStructureBuildRangeInfo::m_primitiveCount number of AccelerationStructureInstance.
 	BufferOffsetRange m_renderableIndicesBuffer; ///< AccelerationStructureBuildRangeInfo::m_primitiveCount number of indices to renderables.
 };
@@ -177,15 +176,17 @@ public:
 	void pupulateRenderGraph(GpuVisibilityAccelerationStructuresInput& in, GpuVisibilityAccelerationStructuresOutput& out);
 
 private:
-	ShaderProgramResourcePtr m_prog;
-	ShaderProgramPtr m_grProg;
+	ShaderProgramResourcePtr m_visibilityProg;
+	ShaderProgramPtr m_visibilityGrProg;
+
+	ShaderProgramResourcePtr m_zeroRemainingInstancesProg;
+	ShaderProgramPtr m_zeroRemainingInstancesGrProg;
 
-	static constexpr U32 kInitialCounterBufferElementCount = 3;
 	BufferPtr m_counterBuffer; ///< A buffer containing multiple counters for atomic operations.
-	U64 m_lastFrameIdx = kMaxU64;
-	U32 m_currentCounterBufferOffset = 0;
 
-	BufferHandle m_counterBufferHandle;
+#if ANKI_ASSERTIONS_ENABLED
+	U64 m_lastFrameIdx = kMaxU64;
+#endif
 };
 /// @}
 

+ 7 - 1
AnKi/Resource/MeshResource.cpp

@@ -289,7 +289,13 @@ Error MeshResource::loadAsync(MeshBinaryLoader& loader) const
 		// Build BLASes
 		for(U32 lodIdx = 0; lodIdx < m_lods.getSize(); ++lodIdx)
 		{
-			cmdb->buildAccelerationStructure(m_lods[lodIdx].m_blas.get());
+			// TODO find a temp buffer
+			BufferInitInfo buffInit("BLAS scratch");
+			buffInit.m_size = m_lods[lodIdx].m_blas->getBuildScratchBufferSize();
+			buffInit.m_usage = BufferUsageBit::kAccelerationStructureBuildScratch;
+			BufferPtr scratchBuff = GrManager::getSingleton().newBuffer(buffInit);
+
+			cmdb->buildAccelerationStructure(m_lods[lodIdx].m_blas.get(), scratchBuff.get(), 0);
 		}
 
 		// Barriers again

+ 25 - 12
AnKi/Shaders/GpuVisibilityAccelerationStructures.ankiprog

@@ -17,9 +17,10 @@
 
 [[vk::binding(3)]] RWStructuredBuffer<AccelerationStructureInstance> g_visibleInstances;
 [[vk::binding(4)]] RWStructuredBuffer<U32> g_visibleRenderableIndices; // 1st element is the count
-[[vk::binding(5)]] RWStructuredBuffer<AccelerationStructureBuildRangeInfo> g_range;
 
-[[vk::binding(6)]] globallycoherent RWStructuredBuffer<U32> g_counterBuffer; // 2 counters per dispatch
+[[vk::binding(5)]] globallycoherent RWStructuredBuffer<U32> g_counterBuffer; // 2 counters per dispatch
+
+[[vk::binding(6)]] RWStructuredBuffer<DispatchIndirectArgs> g_nextDispatchIndirectArgs;
 
 [[vk::push_constant]] ConstantBuffer<GpuVisibilityAccelerationStructuresUniforms> g_unis;
 
@@ -39,7 +40,7 @@
 	if(visible)
 	{
 		bvolume = g_renderableBoundingVolumes[bvolumeIdx];
-		visible = testSphereSphereCollision(bvolume.m_sphereCenter, bvolume.m_sphereRadius, g_unis.m_pointOfTest, bvolume.m_sphereRadius);
+		visible = testSphereSphereCollision(bvolume.m_sphereCenter, bvolume.m_sphereRadius, g_unis.m_pointOfTest, g_unis.m_testRadius);
 	}
 
 	// All good, write the instance
@@ -80,11 +81,19 @@
 		if(meshLod.m_blasAddress.x != 0 || meshLod.m_blasAddress.y != 0)
 		{
 			// It has a BLAS, write what is to write
+
+			const Mat3x4 transform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
+			Mat3x4 meshQuantizationTransform;
+			meshQuantizationTransform.m_row0 = Vec4(meshLod.m_positionScale, 0.0f, 0.0f, meshLod.m_positionTranslation.x);
+			meshQuantizationTransform.m_row1 = Vec4(0.0f, meshLod.m_positionScale, 0.0f, meshLod.m_positionTranslation.y);
+			meshQuantizationTransform.m_row2 = Vec4(0.0f, 0.0f, meshLod.m_positionScale, meshLod.m_positionTranslation.z);
+			const Mat3x4 finalTrf = combineTransformations(transform, meshQuantizationTransform);
+
 			U32 instanceIdx;
 			InterlockedAdd(g_counterBuffer[0], 1, instanceIdx);
 
 			AccelerationStructureInstance instance;
-			instance.m_transform = g_gpuScene.Load<Mat3x4>(renderable.m_worldTransformsOffset);
+			instance.m_transform = finalTrf;
 			instance.m_instanceCustomIndex24_mask8 = (instanceIdx << 8u) | (meshLod.m_tlasInstanceMask & 0xFFu);
 			instance.m_instanceShaderBindingTableRecordOffset24_flags8 =
 				(instanceIdx << 8u)
@@ -112,17 +121,21 @@
 
 		if(lastThreadgroupExecuting)
 		{
-			AccelerationStructureBuildRangeInfo range;
-			range.m_primitiveCount = g_counterBuffer[0];
-			range.m_primitiveOffset = 0;
-			range.m_firstVertex = 0;
-			range.m_transformOffset = 0;
-			g_range[0] = range;
-
-			g_visibleRenderableIndices[0] = g_counterBuffer[0];
+			const U32 visible = g_counterBuffer[0];
+			g_visibleRenderableIndices[0] = visible;
 
 			g_counterBuffer[0] = 0;
 			g_counterBuffer[1] = 0;
+
+			// Update indirect args of some next job
+			U32 total, unused;
+			g_visibleInstances.GetDimensions(total, unused);
+
+			const U32 remaining = total - visible;
+
+			g_nextDispatchIndirectArgs[0].m_threadGroupCountX = (remaining + NUMTHREADS - 1) / NUMTHREADS;
+			g_nextDispatchIndirectArgs[0].m_threadGroupCountY = 1;
+			g_nextDispatchIndirectArgs[0].m_threadGroupCountZ = 1;
 		}
 	}
 }
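
The other fix in this shader: TLAS instance transforms now bake the mesh position dequantization (uniform positionScale plus positionTranslation) into the world transform, so ray tracing sees the same positions as rasterization. combineTransformations (added to Include/Common.h further down) treats each Mat3x4 as the top three rows of an affine matrix with an implicit (0, 0, 0, 1) bottom row and returns a*b, i.e. "apply b first, then a". A standalone check of that math (plain C++ mirroring the HLSL; the numeric values are made up for the test):

	#include <array>
	#include <cassert>
	#include <cmath>

	using Row = std::array<float, 4>;
	using Mat3x4 = std::array<Row, 3>; // top 3 rows of an affine transform

	// Same math as the HLSL combineTransformations: c = a * b with b's implicit
	// (0, 0, 0, 1) bottom row.
	static Mat3x4 combine(const Mat3x4& a, const Mat3x4& b)
	{
		Mat3x4 c{};
		for(int i = 0; i < 3; ++i)
		{
			for(int j = 0; j < 4; ++j)
			{
				c[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j] + a[i][2] * b[2][j];
			}
			c[i][3] += a[i][3]; // translation picked up from the implicit bottom row
		}
		return c;
	}

	int main()
	{
		// b: mesh quantization transform (uniform scale s, translation (tx, ty, tz))
		const float s = 0.5f, tx = 1.0f, ty = 2.0f, tz = 3.0f;
		const Mat3x4 quant = {Row{s, 0, 0, tx}, Row{0, s, 0, ty}, Row{0, 0, s, tz}};
		// a: an arbitrary world transform (a translation by (10, 0, 0) for the check)
		const Mat3x4 world = {Row{1, 0, 0, 10}, Row{0, 1, 0, 0}, Row{0, 0, 1, 0}};

		const Mat3x4 trf = combine(world, quant);

		// Transforming a quantized position with the combined matrix must equal
		// dequantizing first and then applying the world transform.
		const float p[3] = {4.0f, 6.0f, 8.0f};
		const float expected[3] = {p[0] * s + tx + 10.0f, p[1] * s + ty, p[2] * s + tz};
		for(int i = 0; i < 3; ++i)
		{
			const float got = trf[i][0] * p[0] + trf[i][1] * p[1] + trf[i][2] * p[2] + trf[i][3];
			assert(std::fabs(got - expected[i]) < 1e-5f);
		}
		return 0;
	}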

+ 32 - 0
AnKi/Shaders/GpuVisibilityAccelerationStructuresZeroRemainingInstances.ankiprog

@@ -0,0 +1,32 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki start comp
+
+#include <AnKi/Shaders/Common.hlsl>
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+
+[[vk::binding(0)]] StructuredBuffer<U32> g_visibleRenderableIndices; // 1st element is the count
+[[vk::binding(1)]] RWStructuredBuffer<AccelerationStructureInstance> g_instances;
+
+#define NUMTHREADS 64
+
+[numthreads(NUMTHREADS, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	const U32 visibleInstances = g_visibleRenderableIndices[0];
+
+	U32 totalInstances, unused;
+	g_instances.GetDimensions(totalInstances, unused);
+
+	ANKI_ASSERT(totalInstances >= visibleInstances);
+	const U32 remainingInstances = totalInstances - visibleInstances;
+
+	if(svDispatchThreadId < remainingInstances)
+	{
+		g_instances[visibleInstances + svDispatchThreadId] = (AccelerationStructureInstance)0;
+	}
+}
+
+#pragma anki end

+ 27 - 12
AnKi/Shaders/Include/Common.h

@@ -402,6 +402,33 @@ Mat3 transpose(Mat3 m)
 	return constructMatrixColumns(m.m_row0, m.m_row1, m.m_row2);
 }
 
+Mat3x4 combineTransformations(Mat3x4 a_, Mat3x4 b_)
+{
+	const Vec4 a[3] = {a_.m_row0, a_.m_row1, a_.m_row2};
+	const Vec4 b[3] = {b_.m_row0, b_.m_row1, b_.m_row2};
+	Vec4 c[3];
+
+	[unroll] for(U32 i = 0; i < 3; i++)
+	{
+		Vec4 t2;
+
+		t2 = b[0] * a[i][0];
+		t2 += b[1] * a[i][1];
+		t2 += b[2] * a[i][2];
+
+		const Vec4 v4 = Vec4(0.0f, 0.0f, 0.0f, a[i][3]);
+		t2 += v4;
+
+		c[i] = t2;
+	}
+
+	Mat3x4 o;
+	o.m_row0 = c[0];
+	o.m_row1 = c[1];
+	o.m_row2 = c[2];
+	return o;
+}
+
 // Common constants
 constexpr F32 kEpsilonF32 = 0.000001f;
 #	if ANKI_SUPPORTS_16BIT_TYPES
@@ -730,9 +757,6 @@ constexpr F32 kPi = 3.14159265358979323846f;
 //! == Common ==========================================================================================================
 ANKI_BEGIN_NAMESPACE
 
-/// The renderer will group drawcalls into instances up to this number.
-constexpr U32 kMaxInstanceCount = 64u;
-
 constexpr U32 kMaxLodCount = 3u;
 constexpr U32 kMaxShadowCascades = 4u;
 
@@ -829,15 +853,6 @@ struct DispatchIndirectArgs
 	U32 m_threadGroupCountZ;
 };
 
-/// Mirrors VkAccelerationStructureBuildRangeInfoKHR.
-struct AccelerationStructureBuildRangeInfo
-{
-	U32 m_primitiveCount; ///< For a TLAS it's the instance count.
-	U32 m_primitiveOffset;
-	U32 m_firstVertex;
-	U32 m_transformOffset;
-};
-
 /// Mirrors VkGeometryInstanceFlagBitsKHR
 enum AccellerationStructureFlag : U32
 {