Browse Source

Move probe reflection to GDR. Move the cluster binning to GDR

Panagiotis Christopoulos Charitos 2 years ago
parent
commit
55e6e8e5bb
33 changed files with 1339 additions and 273 deletions
  1. 2 2
      AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h
  2. 2 0
      AnKi/Gr/CommandBuffer.h
  3. 1 1
      AnKi/Gr/RenderGraph.h
  4. 6 0
      AnKi/Gr/Vulkan/CommandBuffer.cpp
  5. 4 0
      AnKi/Gr/Vulkan/CommandBufferImpl.h
  6. 51 37
      AnKi/Gr/Vulkan/CommandBufferImpl.inl.h
  7. 263 0
      AnKi/Renderer/ClusterBinning2.cpp
  8. 50 0
      AnKi/Renderer/ClusterBinning2.h
  9. 4 1
      AnKi/Renderer/Common.h
  10. 93 31
      AnKi/Renderer/PrimaryNonRenderableVisibility.cpp
  11. 29 3
      AnKi/Renderer/PrimaryNonRenderableVisibility.h
  12. 86 67
      AnKi/Renderer/ProbeReflections.cpp
  13. 6 5
      AnKi/Renderer/ProbeReflections.h
  14. 8 0
      AnKi/Renderer/Renderer.cpp
  15. 2 2
      AnKi/Renderer/Renderer.h
  16. 22 0
      AnKi/Renderer/RendererObject.cpp
  17. 1 0
      AnKi/Renderer/RendererObject.def.h
  18. 9 0
      AnKi/Renderer/RendererObject.h
  19. 1 1
      AnKi/Renderer/Utils/Readback.h
  20. 6 0
      AnKi/Scene/Components/GlobalIlluminationProbeComponent.h
  21. 126 0
      AnKi/Scene/Components/LightComponent.cpp
  22. 14 0
      AnKi/Scene/Components/LightComponent.h
  23. 36 65
      AnKi/Scene/Components/ReflectionProbeComponent.cpp
  24. 19 24
      AnKi/Scene/Components/ReflectionProbeComponent.h
  25. 6 0
      AnKi/Scene/GpuSceneArray.h
  26. 1 31
      AnKi/Scene/Visibility.cpp
  27. 254 0
      AnKi/Shaders/ClusterBinning2.ankiprog
  28. 78 0
      AnKi/Shaders/ClusterBinning2PackVisibles.ankiprog
  29. 71 0
      AnKi/Shaders/ClusterBinning2Setup.ankiprog
  30. 76 0
      AnKi/Shaders/Include/ClusteredShadingTypes.h
  31. 7 0
      AnKi/Shaders/Include/Common.h
  32. 1 0
      AnKi/Shaders/Include/GpuSceneTypes.h
  33. 4 3
      AnKi/Shaders/Intellisense.hlsl

+ 2 - 2
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -58,8 +58,8 @@ private:
 		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_storageBufferBindOffsetAlignment);
 		alignment = max(alignment, GrManager::getSingleton().getDeviceCapabilities().m_sbtRecordAlignment);
 
-		const BufferUsageBit buffUsage =
-			BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw | BufferUsageBit::kVertex;
+		const BufferUsageBit buffUsage = BufferUsageBit::kAllUniform | BufferUsageBit::kAllStorage | BufferUsageBit::kIndirectDraw
+										 | BufferUsageBit::kIndirectCompute | BufferUsageBit::kVertex | BufferUsageBit::kTransferDestination;
 		m_pool.init(10_MB, 2.0, 0, alignment, buffUsage, BufferMapAccessBit::kNone, true, "GpuVisibleTransientMemoryPool");
 	}
 

+ 2 - 0
AnKi/Gr/CommandBuffer.h

@@ -289,6 +289,8 @@ public:
 
 	void dispatchCompute(U32 groupCountX, U32 groupCountY, U32 groupCountZ);
 
+	void dispatchComputeIndirect(Buffer* argBuffer, PtrSize argBufferOffset);
+
 	/// Trace rays.
 	///
 	/// The 1st thing in the sbtBuffer is the ray gen shader group handle:

+ 1 - 1
AnKi/Gr/RenderGraph.h

@@ -29,7 +29,7 @@ class RenderGraphDescription;
 
 /// @name RenderGraph constants
 /// @{
-constexpr U32 kMaxRenderGraphPasses = 128;
+constexpr U32 kMaxRenderGraphPasses = 256;
 constexpr U32 kMaxRenderGraphRenderTargets = 64; ///< Max imported or not render targets in RenderGraph.
 constexpr U32 kMaxRenderGraphBuffers = 128;
 constexpr U32 kMaxRenderGraphAccelerationStructures = 32;

+ 6 - 0
AnKi/Gr/Vulkan/CommandBuffer.cpp

@@ -293,6 +293,12 @@ void CommandBuffer::dispatchCompute(U32 groupCountX, U32 groupCountY, U32 groupC
 	self.dispatchComputeInternal(groupCountX, groupCountY, groupCountZ);
 }
 
+void CommandBuffer::dispatchComputeIndirect(Buffer* argBuffer, PtrSize argBufferOffset)
+{
+	ANKI_VK_SELF(CommandBufferImpl); // Supplies the `self` used on the next line (the macro is defined outside this view)
+	self.dispatchComputeIndirectInternal(argBuffer, argBufferOffset); // Forward both arguments untouched to the Vulkan implementation
+}
+
 void CommandBuffer::traceRays(Buffer* sbtBuffer, PtrSize sbtBufferOffset, U32 sbtRecordSize, U32 hitGroupSbtRecordCount, U32 rayTypeCount, U32 width,
 							  U32 height, U32 depth)
 {

+ 4 - 0
AnKi/Gr/Vulkan/CommandBufferImpl.h

@@ -340,6 +340,8 @@ public:
 
 	void dispatchComputeInternal(U32 groupCountX, U32 groupCountY, U32 groupCountZ);
 
+	void dispatchComputeIndirectInternal(Buffer* argBuffer, PtrSize argBufferOffset);
+
 	void traceRaysInternal(Buffer* sbtBuffer, PtrSize sbtBufferOffset, U32 sbtRecordSize, U32 hitGroupSbtRecordCount, U32 rayTypeCount, U32 width,
 						   U32 height, U32 depth);
 
@@ -538,6 +540,8 @@ private:
 
 	void drawcallCommon();
 
+	void dispatchCommon();
+
 	Bool insideRenderPass() const
 	{
 		return m_activeFb != nullptr;

+ 51 - 37
AnKi/Gr/Vulkan/CommandBufferImpl.inl.h

@@ -109,46 +109,20 @@ ANKI_FORCE_INLINE void CommandBufferImpl::setImageBarrier(VkPipelineStageFlags s
 
 ANKI_FORCE_INLINE void CommandBufferImpl::dispatchComputeInternal(U32 groupCountX, U32 groupCountY, U32 groupCountZ)
 {
-	ANKI_ASSERT(m_computeProg);
-	ANKI_ASSERT(m_computeProg->getReflectionInfo().m_pushConstantsSize == m_setPushConstantsSize && "Forgot to set pushConstants");
-
-	commandCommon();
-
-	// Bind descriptors
-	for(U32 i = 0; i < kMaxDescriptorSets; ++i)
-	{
-		if(m_computeProg->getReflectionInfo().m_descriptorSetMask.get(i))
-		{
-			DescriptorSet dset;
-			Bool dirty;
-			Array<PtrSize, kMaxBindingsPerDescriptorSet> dynamicOffsetsPtrSize;
-			U32 dynamicOffsetCount;
-			if(getGrManagerImpl().getDescriptorSetFactory().newDescriptorSet(*m_pool, m_dsetState[i], dset, dirty, dynamicOffsetsPtrSize,
-																			 dynamicOffsetCount))
-			{
-				ANKI_VK_LOGF("Cannot recover");
-			}
-
-			if(dirty)
-			{
-				// Vulkan should have had the dynamic offsets as VkDeviceSize and not U32. Workaround that.
-				Array<U32, kMaxBindingsPerDescriptorSet> dynamicOffsets;
-				for(U32 i = 0; i < dynamicOffsetCount; ++i)
-				{
-					dynamicOffsets[i] = U32(dynamicOffsetsPtrSize[i]);
-				}
-
-				VkDescriptorSet dsHandle = dset.getHandle();
-
-				vkCmdBindDescriptorSets(m_handle, VK_PIPELINE_BIND_POINT_COMPUTE, m_computeProg->getPipelineLayout().getHandle(), i, 1, &dsHandle,
-										dynamicOffsetCount, &dynamicOffsets[0]);
-			}
-		}
-	}
-
+	ANKI_ASSERT(groupCountX > 0 && groupCountY > 0 && groupCountZ > 0);
+	dispatchCommon();
 	vkCmdDispatch(m_handle, groupCountX, groupCountY, groupCountZ);
 }
 
+/// Record a compute dispatch whose group counts are read by the GPU from a buffer.
+/// @param argBuffer The buffer that holds the indirect arguments. Can't be null.
+/// @param argBufferOffset Byte offset of the arguments inside argBuffer. Must be a multiple of 4.
+ANKI_FORCE_INLINE void CommandBufferImpl::dispatchComputeIndirectInternal(Buffer* argBuffer, PtrSize argBufferOffset)
+{
+	ANKI_ASSERT(argBuffer);
+	// vkCmdDispatchIndirect reads a whole VkDispatchIndirectCommand (x, y and z group counts). It must fit inside the buffer and it's allowed to
+	// end exactly at the end of the buffer
+	ANKI_ASSERT(argBufferOffset + sizeof(VkDispatchIndirectCommand) <= argBuffer->getSize());
+	ANKI_ASSERT(argBufferOffset % 4 == 0);
+	dispatchCommon();
+	vkCmdDispatchIndirect(m_handle, static_cast<BufferImpl*>(argBuffer)->getHandle(), argBufferOffset);
+}
+
 ANKI_FORCE_INLINE void CommandBufferImpl::traceRaysInternal(Buffer* sbtBuffer, PtrSize sbtBufferOffset, U32 sbtRecordSize32,
 															U32 hitGroupSbtRecordCount, U32 rayTypeCount, U32 width, U32 height, U32 depth)
 {
@@ -376,6 +350,46 @@ ANKI_FORCE_INLINE void CommandBufferImpl::drawcallCommon()
 	ANKI_TRACE_INC_COUNTER(VkDrawcall, 1);
 }
 
+/// Work shared by the direct and the indirect compute dispatches. It asserts that a compute program is bound and that the push constants were
+/// set, calls commandCommon() and then binds the descriptor sets that the bound compute program uses.
+ANKI_FORCE_INLINE void CommandBufferImpl::dispatchCommon()
+{
+	ANKI_ASSERT(m_computeProg);
+	ANKI_ASSERT(m_computeProg->getReflectionInfo().m_pushConstantsSize == m_setPushConstantsSize && "Forgot to set pushConstants");
+
+	commandCommon();
+
+	// Bind descriptors
+	for(U32 setIdx = 0; setIdx < kMaxDescriptorSets; ++setIdx)
+	{
+		// Skip the sets that the program doesn't use
+		if(!m_computeProg->getReflectionInfo().m_descriptorSetMask.get(setIdx))
+		{
+			continue;
+		}
+
+		DescriptorSet dset;
+		Bool dirty;
+		Array<PtrSize, kMaxBindingsPerDescriptorSet> dynamicOffsetsPtrSize;
+		U32 dynamicOffsetCount;
+		if(getGrManagerImpl().getDescriptorSetFactory().newDescriptorSet(*m_pool, m_dsetState[setIdx], dset, dirty, dynamicOffsetsPtrSize,
+																		 dynamicOffsetCount))
+		{
+			ANKI_VK_LOGF("Cannot recover");
+		}
+
+		// Re-bind only if the factory reported the set as dirty
+		if(dirty)
+		{
+			// Vulkan should have had the dynamic offsets as VkDeviceSize and not U32. Workaround that.
+			Array<U32, kMaxBindingsPerDescriptorSet> dynamicOffsets;
+			for(U32 offsetIdx = 0; offsetIdx < dynamicOffsetCount; ++offsetIdx)
+			{
+				dynamicOffsets[offsetIdx] = U32(dynamicOffsetsPtrSize[offsetIdx]);
+			}
+
+			VkDescriptorSet dsHandle = dset.getHandle();
+
+			vkCmdBindDescriptorSets(m_handle, VK_PIPELINE_BIND_POINT_COMPUTE, m_computeProg->getPipelineLayout().getHandle(), setIdx, 1, &dsHandle,
+									dynamicOffsetCount, &dynamicOffsets[0]);
+		}
+	}
+}
+
 ANKI_FORCE_INLINE void CommandBufferImpl::writeOcclusionQueriesResultToBufferInternal(ConstWeakArray<OcclusionQuery*> queries, PtrSize offset,
 																					  Buffer* buff)
 {

+ 263 - 0
AnKi/Renderer/ClusterBinning2.cpp

@@ -0,0 +1,263 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#include <AnKi/Renderer/ClusterBinning2.h>
+#include <AnKi/Renderer/PrimaryNonRenderableVisibility.h>
+#include <AnKi/Renderer/Renderer.h>
+#include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
+#include <AnKi/Scene/Components/CameraComponent.h>
+#include <AnKi/Collision/Functions.h>
+
+namespace anki {
+
+// Nothing to do at construction, the real setup happens in init()
+ClusterBinning2::ClusterBinning2() = default;
+
+// All members clean up after themselves
+ClusterBinning2::~ClusterBinning2() = default;
+
+/// Load the shader programs of the 3 cluster binning passes (setup, binning, packing).
+/// @return Error::kNone on success or whatever error the shader loading reported.
+Error ClusterBinning2::init()
+{
+	// A single program sets up the indirect dispatches of all the object types
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/ClusterBinning2Setup.ankiprogbin", m_jobSetupProg, m_jobSetupGrProg));
+
+	// The binning resource is loaded once, its variants are created in the loop bellow
+	ANKI_CHECK(ResourceManager::getSingleton().loadResource("ShaderBinaries/ClusterBinning2.ankiprogbin", m_binningProg));
+
+	for(GpuSceneNonRenderableObjectType objType : EnumIterable<GpuSceneNonRenderableObjectType>())
+	{
+		// Binning program of this object type
+		ShaderProgramResourceVariantInitInfo variantInit(m_binningProg);
+		variantInit.addMutation("OBJECT_TYPE", MutatorValue(objType));
+		variantInit.addConstant("kTileSize", getRenderer().getTileSize());
+		variantInit.addConstant("kZSplitCount", getRenderer().getZSplitCount());
+
+		const ShaderProgramResourceVariant* binningVariant;
+		m_binningProg->getOrCreateVariant(variantInit, binningVariant);
+		m_binningGrProgs[objType].reset(&binningVariant->getProgram());
+
+		// Packing program of this object type
+		ANKI_CHECK(loadShaderProgram("ShaderBinaries/ClusterBinning2PackVisibles.ankiprogbin",
+									 Array<SubMutation, 1>{{"OBJECT_TYPE", MutatorValue(objType)}}, m_packingProg, m_packingGrProgs[objType]));
+	}
+
+	return Error::kNone;
+}
+
+/// Look up where the GPU scene array of a non-renderable object type lives inside the GPU scene buffer.
+/// @param type The object type to look up.
+/// @param[out] offset Set to the array's getGpuSceneOffsetOfArrayBase(). Stays zero if the type is unknown.
+/// @param[out] range Set to the array's getBufferRange(). Stays zero if the type is unknown.
+static void getGpuSceneArrayOffsetAndRange(GpuSceneNonRenderableObjectType type, PtrSize& offset, PtrSize& range)
+{
+	offset = 0;
+	range = 0;
+
+	switch(type)
+	{
+	case GpuSceneNonRenderableObjectType::kLight:
+		offset = GpuSceneArrays::Light::getSingleton().getGpuSceneOffsetOfArrayBase();
+		range = GpuSceneArrays::Light::getSingleton().getBufferRange();
+		break;
+	case GpuSceneNonRenderableObjectType::kDecal:
+		offset = GpuSceneArrays::Decal::getSingleton().getGpuSceneOffsetOfArrayBase();
+		range = GpuSceneArrays::Decal::getSingleton().getBufferRange();
+		break;
+	case GpuSceneNonRenderableObjectType::kFogDensityVolume:
+		offset = GpuSceneArrays::FogDensityVolume::getSingleton().getGpuSceneOffsetOfArrayBase();
+		range = GpuSceneArrays::FogDensityVolume::getSingleton().getBufferRange();
+		break;
+	case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
+		offset = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getGpuSceneOffsetOfArrayBase();
+		range = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getBufferRange();
+		break;
+	case GpuSceneNonRenderableObjectType::kReflectionProbe:
+		offset = GpuSceneArrays::ReflectionProbe::getSingleton().getGpuSceneOffsetOfArrayBase();
+		range = GpuSceneArrays::ReflectionProbe::getSingleton().getBufferRange();
+		break;
+	default:
+		ANKI_ASSERT(0);
+	}
+}
+
+/// Populate the rendergraph with 3 compute passes:
+/// - "Cluster binning setup": Writes the indirect dispatch arguments and zeroes the clusters buffer.
+/// - "Cluster binning": One indirect dispatch per object type that writes to the clusters buffer.
+/// - "Cluster object packing": One indirect dispatch per object type that writes to the packed objects buffers.
+/// @param ctx The rendering context. The "Cluster binning" pass keeps a reference to it until its work callback runs.
+void ClusterBinning2::populateRenderGraph(RenderingContext& ctx)
+{
+	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
+
+	// Allocate the clusters buffer
+	{
+		const U32 clusterCount = getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y() + getRenderer().getZSplitCount();
+		const GpuVisibleTransientMemoryAllocation alloc = GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(Cluster) * clusterCount);
+		m_runCtx.m_clustersBuffer.m_buffer = alloc.m_buffer;
+		m_runCtx.m_clustersBuffer.m_offset = alloc.m_offset;
+		m_runCtx.m_clustersBuffer.m_range = alloc.m_size;
+		m_runCtx.m_clustersHandle = rgraph.importBuffer(alloc.m_buffer, BufferUsageBit::kNone, alloc.m_offset, alloc.m_size);
+	}
+
+	// Setup the indirect dispatches and zero the clusters buffer
+	BufferOffsetRange indirectArgsBuff;
+	BufferHandle indirectArgsHandle;
+	{
+		// Allocate memory for the indirect args. The 1st half is consumed by the binning pass and the 2nd half by the packing pass
+		constexpr U32 dispatchCount = U32(GpuSceneNonRenderableObjectType::kCount) * 2;
+		const GpuVisibleTransientMemoryAllocation alloc =
+			GpuVisibleTransientMemoryPool::getSingleton().allocate(sizeof(DispatchIndirectArgs) * dispatchCount);
+		indirectArgsBuff.m_buffer = alloc.m_buffer;
+		indirectArgsBuff.m_offset = alloc.m_offset;
+		indirectArgsBuff.m_range = alloc.m_size;
+		indirectArgsHandle = rgraph.importBuffer(alloc.m_buffer, BufferUsageBit::kNone, alloc.m_offset, alloc.m_size);
+
+		// Create the pass
+		ComputeRenderPassDescription& rpass = rgraph.newComputeRenderPass("Cluster binning setup");
+
+		for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
+		{
+			rpass.newBufferDependency(getRenderer().getPrimaryNonRenderableVisibility().getVisibleIndicesBufferHandle(type),
+									  BufferUsageBit::kStorageComputeRead);
+		}
+		rpass.newBufferDependency(indirectArgsHandle, BufferUsageBit::kStorageComputeWrite);
+		rpass.newBufferDependency(m_runCtx.m_clustersHandle, BufferUsageBit::kTransferDestination);
+
+		rpass.setWork([this, indirectArgsBuff](RenderPassWorkContext& rgraphCtx) {
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			cmdb.bindShaderProgram(m_jobSetupGrProg.get());
+
+			const UVec4 uniforms(getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y());
+			cmdb.setPushConstants(&uniforms, sizeof(uniforms));
+
+			// The visible indices of every object type go to the same binding, one array element per type
+			for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
+			{
+				const BufferOffsetRange& buff = getRenderer().getPrimaryNonRenderableVisibility().getVisibleIndicesBuffer(type);
+				cmdb.bindStorageBuffer(0, 0, buff.m_buffer, buff.m_offset, buff.m_range, U32(type));
+			}
+
+			cmdb.bindStorageBuffer(0, 1, indirectArgsBuff.m_buffer, indirectArgsBuff.m_offset, indirectArgsBuff.m_range);
+
+			cmdb.dispatchCompute(1, 1, 1);
+
+			// Now zero the clusters buffer
+			cmdb.fillBuffer(m_runCtx.m_clustersBuffer.m_buffer, m_runCtx.m_clustersBuffer.m_offset, m_runCtx.m_clustersBuffer.m_range, 0);
+		});
+	}
+
+	// Cluster binning
+	{
+		// Create the pass
+		ComputeRenderPassDescription& rpass = rgraph.newComputeRenderPass("Cluster binning");
+
+		rpass.newBufferDependency(indirectArgsHandle, BufferUsageBit::kIndirectCompute);
+		rpass.newBufferDependency(m_runCtx.m_clustersHandle, BufferUsageBit::kStorageComputeWrite);
+
+		rpass.setWork([this, &ctx, indirectArgsBuff](RenderPassWorkContext& rgraphCtx) {
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			// The push constants are the same for all the object types so build them once.
+			// NOTE(review): The layout presumably mirrors the push constants of ClusterBinning2.ankiprog, confirm before changing it
+			struct ClusterBinningUniforms
+			{
+				Vec3 m_cameraOrigin;
+				F32 m_zSplitCountOverFrustumLength;
+
+				Vec2 m_renderingSize;
+				U32 m_tileCountX;
+				U32 m_tileCount;
+
+				Vec4 m_nearPlaneWorld;
+
+				Mat4 m_invertedViewProjMat;
+			} uniforms;
+
+			uniforms.m_cameraOrigin = ctx.m_matrices.m_cameraTransform.getTranslationPart().xyz();
+			uniforms.m_zSplitCountOverFrustumLength = F32(getRenderer().getZSplitCount()) / (ctx.m_cameraFar - ctx.m_cameraNear);
+			uniforms.m_renderingSize = Vec2(getRenderer().getInternalResolution());
+			uniforms.m_tileCountX = getRenderer().getTileCounts().x();
+			uniforms.m_tileCount = getRenderer().getTileCounts().x() * getRenderer().getTileCounts().y();
+
+			Plane nearPlane;
+			extractClipPlane(ctx.m_matrices.m_viewProjection, FrustumPlaneType::kNear, nearPlane);
+			uniforms.m_nearPlaneWorld = Vec4(nearPlane.getNormal().xyz(), nearPlane.getOffset());
+
+			uniforms.m_invertedViewProjMat = ctx.m_matrices.m_invertedViewProjectionJitter;
+
+			// The binning dispatches consume the 1st half of the indirect args, one entry per object type
+			PtrSize indirectArgsBuffOffset = indirectArgsBuff.m_offset;
+			for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
+			{
+				cmdb.bindShaderProgram(m_binningGrProgs[type].get());
+
+				const BufferOffsetRange& idsBuff = getRenderer().getPrimaryNonRenderableVisibility().getVisibleIndicesBuffer(type);
+				cmdb.bindStorageBuffer(0, 0, idsBuff.m_buffer, idsBuff.m_offset, idsBuff.m_range);
+
+				PtrSize objBufferOffset = 0;
+				PtrSize objBufferRange = 0;
+				getGpuSceneArrayOffsetAndRange(type, objBufferOffset, objBufferRange);
+				cmdb.bindStorageBuffer(0, 1, &GpuSceneBuffer::getSingleton().getBuffer(), objBufferOffset, objBufferRange);
+
+				cmdb.bindStorageBuffer(0, 2, m_runCtx.m_clustersBuffer.m_buffer, m_runCtx.m_clustersBuffer.m_offset,
+									   m_runCtx.m_clustersBuffer.m_range);
+
+				// Set the push constants after every program bind, just like before
+				cmdb.setPushConstants(&uniforms, sizeof(uniforms));
+
+				cmdb.dispatchComputeIndirect(indirectArgsBuff.m_buffer, indirectArgsBuffOffset);
+				indirectArgsBuffOffset += sizeof(DispatchIndirectArgs);
+			}
+		});
+	}
+
+	// Object packing
+	{
+		// Allocations
+		for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
+		{
+			const GpuVisibleTransientMemoryAllocation alloc =
+				GpuVisibleTransientMemoryPool::getSingleton().allocate(kClusteredObjectSizes2[type] * kMaxVisibleClusteredObjects2[type]);
+			m_runCtx.m_packedObjectsBuffers[type].m_buffer = alloc.m_buffer;
+			m_runCtx.m_packedObjectsBuffers[type].m_offset = alloc.m_offset;
+			m_runCtx.m_packedObjectsBuffers[type].m_range = alloc.m_size;
+			m_runCtx.m_packedObjectsHandles[type] = rgraph.importBuffer(alloc.m_buffer, BufferUsageBit::kNone, alloc.m_offset, alloc.m_size);
+		}
+
+		// Create the pass
+		ComputeRenderPassDescription& rpass = rgraph.newComputeRenderPass("Cluster object packing");
+		rpass.newBufferDependency(indirectArgsHandle, BufferUsageBit::kIndirectCompute);
+		for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
+		{
+			rpass.newBufferDependency(m_runCtx.m_packedObjectsHandles[type], BufferUsageBit::kStorageComputeWrite);
+		}
+
+		// This callback doesn't need the rendering context so it doesn't capture it
+		rpass.setWork([this, indirectArgsBuff](RenderPassWorkContext& rgraphCtx) {
+			CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
+
+			// The packing dispatches consume the 2nd half of the indirect args, one entry per object type
+			PtrSize indirectArgsBuffOffset = indirectArgsBuff.m_offset + sizeof(DispatchIndirectArgs) * U32(GpuSceneNonRenderableObjectType::kCount);
+			for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
+			{
+				cmdb.bindShaderProgram(m_packingGrProgs[type].get());
+
+				PtrSize objBufferOffset = 0;
+				PtrSize objBufferRange = 0;
+				getGpuSceneArrayOffsetAndRange(type, objBufferOffset, objBufferRange);
+
+				cmdb.bindStorageBuffer(0, 0, &GpuSceneBuffer::getSingleton().getBuffer(), objBufferOffset, objBufferRange);
+				cmdb.bindStorageBuffer(0, 1, m_runCtx.m_packedObjectsBuffers[type].m_buffer, m_runCtx.m_packedObjectsBuffers[type].m_offset,
+									   m_runCtx.m_packedObjectsBuffers[type].m_range);
+
+				const BufferOffsetRange& idsBuff = getRenderer().getPrimaryNonRenderableVisibility().getVisibleIndicesBuffer(type);
+				cmdb.bindStorageBuffer(0, 2, idsBuff.m_buffer, idsBuff.m_offset, idsBuff.m_range);
+
+				cmdb.dispatchComputeIndirect(indirectArgsBuff.m_buffer, indirectArgsBuffOffset);
+				indirectArgsBuffOffset += sizeof(DispatchIndirectArgs);
+			}
+		});
+	}
+}
+
+} // end namespace anki

+ 50 - 0
AnKi/Renderer/ClusterBinning2.h

@@ -0,0 +1,50 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma once
+
+#include <AnKi/Renderer/RendererObject.h>
+
+namespace anki {
+
+/// @addtogroup renderer
+/// @{
+
+/// Bins the visible non-renderable objects (lights, decals, fog density volumes, GI and reflection probes) to clusters using compute passes.
+class ClusterBinning2 : public RendererObject
+{
+public:
+	ClusterBinning2();
+
+	~ClusterBinning2();
+
+	Error init(); ///< Loads the setup, binning and packing shader programs.
+
+	/// Populate the rendergraph with the "Cluster binning setup", "Cluster binning" and "Cluster object packing" compute passes.
+	void populateRenderGraph(RenderingContext& ctx);
+
+private:
+	ShaderProgramResourcePtr m_jobSetupProg;
+	ShaderProgramPtr m_jobSetupGrProg; ///< Bound by the "Cluster binning setup" pass.
+
+	ShaderProgramResourcePtr m_binningProg;
+	Array<ShaderProgramPtr, U32(GpuSceneNonRenderableObjectType::kCount)> m_binningGrProgs; ///< One binning program per object type.
+
+	ShaderProgramResourcePtr m_packingProg;
+	Array<ShaderProgramPtr, U32(GpuSceneNonRenderableObjectType::kCount)> m_packingGrProgs; ///< One packing program per object type.
+
+	class
+	{
+	public:
+		BufferHandle m_clustersHandle; ///< Rendergraph handle of m_clustersBuffer.
+		BufferOffsetRange m_clustersBuffer; ///< Transient allocation that holds one Cluster per tile and per Z split.
+
+		Array<BufferHandle, U32(GpuSceneNonRenderableObjectType::kCount)> m_packedObjectsHandles; ///< Rendergraph handles of m_packedObjectsBuffers.
+		Array<BufferOffsetRange, U32(GpuSceneNonRenderableObjectType::kCount)> m_packedObjectsBuffers; ///< Output of the packing pass, per type.
+	} m_runCtx; ///< Overwritten by every populateRenderGraph() call.
+};
+/// @}
+
+} // end namespace anki

+ 4 - 1
AnKi/Renderer/Common.h

@@ -16,7 +16,7 @@ namespace anki {
 
 // Forward
 #define ANKI_RENDERER_OBJECT_DEF(a, b) class a;
-#include <AnKi/Renderer/RendererObject.defs.h>
+#include <AnKi/Renderer/RendererObject.def.h>
 #undef ANKI_RENDERER_OBJECT_DEF
 
 class Renderer;
@@ -89,6 +89,9 @@ public:
 	CommonMatrices m_matrices;
 	CommonMatrices m_prevMatrices;
 
+	F32 m_cameraNear = 0.0f;
+	F32 m_cameraFar = 0.0f;
+
 	/// The render target that the Renderer will populate.
 	RenderTargetHandle m_outRenderTarget;
 

+ 93 - 31
AnKi/Renderer/PrimaryNonRenderableVisibility.cpp

@@ -5,11 +5,38 @@
 
 #include <AnKi/Renderer/PrimaryNonRenderableVisibility.h>
 #include <AnKi/Renderer/Renderer.h>
-#include <AnKi/Scene/GpuSceneArray.h>
 #include <AnKi/Shaders/Include/GpuSceneFunctions.h>
+#include <AnKi/Scene/GpuSceneArray.h>
+#include <AnKi/Scene/Components/LightComponent.h>
+#include <AnKi/Scene/Components/ReflectionProbeComponent.h>
+#include <AnKi/Scene/Components/GlobalIlluminationProbeComponent.h>
 
 namespace anki {
 
+/// Turn the GPU feedback into pointers to scene components.
+/// @param pairs The feedback. In every pair x is the UUID of a component and y is its index in the component array.
+/// @param array The component array to look into. A pair is dropped if its index doesn't exist in the array or if the component that now
+///              occupies that index has a different UUID.
+/// @param pool The pool that the returned array allocates from.
+/// @return The components that passed the checks, in the order of their pairs.
+template<typename TComponent, typename TArray, typename TPool>
+static WeakArray<TComponent*> gatherComponents(ConstWeakArray<UVec2> pairs, TArray& array, TPool& pool)
+{
+	DynamicArray<TComponent*, MemoryPoolPtrWrapper<StackMemoryPool>> gathered(&pool);
+
+	for(UVec2 uuidAndIndex : pairs)
+	{
+		if(array.indexExists(uuidAndIndex.y()))
+		{
+			TComponent* candidate = &array[uuidAndIndex.y()];
+
+			// The UUID check rejects the case where the index got occupied by another component
+			const Bool sameComponent = candidate->getUuid() == uuidAndIndex.x();
+			if(sameComponent)
+			{
+				gathered.emplaceBack(candidate);
+			}
+		}
+	}
+
+	// Hand the storage over to a plain WeakArray
+	WeakArray<TComponent*> result;
+	gathered.moveAndReset(result);
+	return result;
+}
+
 void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 {
 	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
@@ -19,22 +46,28 @@ void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 	for(GpuSceneNonRenderableObjectType type : EnumIterable<GpuSceneNonRenderableObjectType>())
 	{
 		U32 objCount = 0;
+		CString passName;
 		switch(type)
 		{
 		case GpuSceneNonRenderableObjectType::kLight:
 			objCount = GpuSceneArrays::Light::getSingleton().getElementCount();
+			passName = "Primary non-renderable visibility: Lights";
 			break;
 		case GpuSceneNonRenderableObjectType::kDecal:
 			objCount = GpuSceneArrays::Decal::getSingleton().getElementCount();
+			passName = "Primary non-renderable visibility: Decals";
 			break;
 		case GpuSceneNonRenderableObjectType::kFogDensityVolume:
 			objCount = GpuSceneArrays::FogDensityVolume::getSingleton().getElementCount();
+			passName = "Primary non-renderable visibility: Fog volumes";
 			break;
 		case GpuSceneNonRenderableObjectType::kReflectionProbe:
 			objCount = GpuSceneArrays::ReflectionProbe::getSingleton().getElementCount();
+			passName = "Primary non-renderable visibility: Refl probes";
 			break;
 		case GpuSceneNonRenderableObjectType::kGlobalIlluminationProbe:
 			objCount = GpuSceneArrays::GlobalIlluminationProbe::getSingleton().getElementCount();
+			passName = "Primary non-renderable visibility: GI probes";
 			break;
 		default:
 			ANKI_ASSERT(0);
@@ -42,48 +75,77 @@ void PrimaryNonRenderableVisibility::populateRenderGraph(RenderingContext& ctx)
 
 		if(objCount == 0)
 		{
-			continue;
-		}
+			// No objects, point to a buffer with zeros
+
+			RebarAllocation alloc;
+			void* mem = RebarTransientMemoryPool::getSingleton().allocateFrame(sizeof(U32), alloc);
+			memset(mem, 0, sizeof(U32));
 
-		GpuVisibilityNonRenderablesInput in;
-		in.m_passesName = "NonRenderableVisibility";
-		in.m_objectType = type;
-		in.m_viewProjectionMat = ctx.m_matrices.m_viewProjection;
-		in.m_hzbRt = nullptr; // TODO
-		in.m_rgraph = &rgraph;
+			m_runCtx.m_visibleIndicesBuffers[type].m_buffer = &RebarTransientMemoryPool::getSingleton().getBuffer();
+			m_runCtx.m_visibleIndicesBuffers[type].m_offset = alloc.m_offset;
+			m_runCtx.m_visibleIndicesBuffers[type].m_range = alloc.m_range;
 
-		const GpuSceneNonRenderableObjectTypeWithFeedback feedbackType = toGpuSceneNonRenderableObjectTypeWithFeedback(type);
-		if(feedbackType != GpuSceneNonRenderableObjectTypeWithFeedback::kCount)
+			m_runCtx.m_visibleIndicesHandles[type] =
+				rgraph.importBuffer(m_runCtx.m_visibleIndicesBuffers[type].m_buffer, BufferUsageBit::kNone,
+									m_runCtx.m_visibleIndicesBuffers[type].m_offset, m_runCtx.m_visibleIndicesBuffers[type].m_range);
+		}
+		else
 		{
-			// Read feedback from the GPU
-			DynamicArray<U32, MemoryPoolPtrWrapper<StackMemoryPool>> readbackData(ctx.m_tempPool);
-			getRenderer().getReadbackManager().readMostRecentData(m_readbacks[feedbackType], readbackData);
+			// Some objects, perform visibility testing
+
+			GpuVisibilityNonRenderablesInput in;
+			in.m_passesName = passName;
+			in.m_objectType = type;
+			in.m_viewProjectionMat = ctx.m_matrices.m_viewProjection;
+			in.m_hzbRt = nullptr; // TODO
+			in.m_rgraph = &rgraph;
 
-			if(readbackData.getSize())
+			const GpuSceneNonRenderableObjectTypeWithFeedback feedbackType = toGpuSceneNonRenderableObjectTypeWithFeedback(type);
+			if(feedbackType != GpuSceneNonRenderableObjectTypeWithFeedback::kCount)
 			{
-				ANKI_ASSERT(readbackData.getSize() > 1);
-				const U32 pairCount = readbackData[0];
+				// Read feedback from the GPU
+				DynamicArray<U32, MemoryPoolPtrWrapper<StackMemoryPool>> readbackData(ctx.m_tempPool);
+				getRenderer().getReadbackManager().readMostRecentData(m_readbacks[feedbackType], readbackData);
 
-				if(pairCount)
+				if(readbackData.getSize())
 				{
-					m_runCtx.m_uuidArrayIndexPairs[feedbackType] = WeakArray<UVec2>(reinterpret_cast<UVec2*>(&readbackData[1]), pairCount);
+					ANKI_ASSERT(readbackData.getSize() > 1);
+					const U32 pairCount = readbackData[0];
 
-					// Transfer ownership
-					WeakArray<U32> dummy;
-					readbackData.moveAndReset(dummy);
+					if(pairCount)
+					{
+						WeakArray<UVec2> pairs(reinterpret_cast<UVec2*>(&readbackData[1]), pairCount);
+						if(feedbackType == GpuSceneNonRenderableObjectTypeWithFeedback::kLight)
+						{
+							m_runCtx.m_interestingComponents.m_shadowLights =
+								gatherComponents<LightComponent>(pairs, SceneGraph::getSingleton().getComponentArrays().getLights(), *ctx.m_tempPool);
+						}
+						else if(feedbackType == GpuSceneNonRenderableObjectTypeWithFeedback::kReflectionProbe)
+						{
+							m_runCtx.m_interestingComponents.m_reflectionProbes = gatherComponents<ReflectionProbeComponent>(
+								pairs, SceneGraph::getSingleton().getComponentArrays().getReflectionProbes(), *ctx.m_tempPool);
+						}
+						else
+						{
+							ANKI_ASSERT(feedbackType == GpuSceneNonRenderableObjectTypeWithFeedback::kGlobalIlluminationProbe);
+							m_runCtx.m_interestingComponents.m_globalIlluminationProbes = gatherComponents<GlobalIlluminationProbeComponent>(
+								pairs, SceneGraph::getSingleton().getComponentArrays().getGlobalIlluminationProbes(), *ctx.m_tempPool);
+						}
+					}
 				}
-			}
 
-			// Allocate feedback buffer for this frame
-			in.m_cpuFeedbackBuffer.m_range = (objCount * 2 + 1) * sizeof(U32);
-			getRenderer().getReadbackManager().allocateData(m_readbacks[feedbackType], in.m_cpuFeedbackBuffer.m_range,
-															in.m_cpuFeedbackBuffer.m_buffer, in.m_cpuFeedbackBuffer.m_offset);
-		}
+				// Allocate feedback buffer for this frame
+				in.m_cpuFeedbackBuffer.m_range = (objCount * 2 + 1) * sizeof(U32);
+				getRenderer().getReadbackManager().allocateData(m_readbacks[feedbackType], in.m_cpuFeedbackBuffer.m_range,
+																in.m_cpuFeedbackBuffer.m_buffer, in.m_cpuFeedbackBuffer.m_offset);
+			}
 
-		GpuVisibilityNonRenderablesOutput out;
-		getRenderer().getGpuVisibilityNonRenderables().populateRenderGraph(in, out);
+			GpuVisibilityNonRenderablesOutput out;
+			getRenderer().getGpuVisibilityNonRenderables().populateRenderGraph(in, out);
 
-		m_runCtx.m_visOutBufferHandle[type] = out.m_visiblesBufferHandle;
+			m_runCtx.m_visibleIndicesHandles[type] = out.m_visiblesBufferHandle;
+			m_runCtx.m_visibleIndicesBuffers[type] = out.m_visiblesBuffer;
+		}
 	}
 }
 

+ 29 - 3
AnKi/Renderer/PrimaryNonRenderableVisibility.h

@@ -15,6 +15,16 @@ namespace anki {
 /// @addtogroup renderer
 /// @{
 
+/// Contains some interesting visible scene components that will be used by various renderer systems.
+/// @memberof PrimaryNonRenderableVisibility
+class InterestingVisibleComponents
+{
+public:
+	WeakArray<LightComponent*> m_shadowLights; ///< Gathered from the GPU feedback of the kLight objects.
+	WeakArray<ReflectionProbeComponent*> m_reflectionProbes; ///< Gathered from the GPU feedback of the kReflectionProbe objects.
+	WeakArray<GlobalIlluminationProbeComponent*> m_globalIlluminationProbes; ///< Gathered from the GPU feedback of the kGlobalIlluminationProbe objects.
+};
+
 /// Multiple passes for GPU visibility of non-renderable entities.
 class PrimaryNonRenderableVisibility : public RendererObject
 {
@@ -26,16 +36,32 @@ public:
 
 	void populateRenderGraph(RenderingContext& ctx);
 
+	const InterestingVisibleComponents& getInterestingVisibleComponents() const
+	{
+		return m_runCtx.m_interestingComponents; // The components that populateRenderGraph() gathered from the GPU feedback
+	}
+
+	BufferHandle getVisibleIndicesBufferHandle(GpuSceneNonRenderableObjectType type) const
+	{
+		return m_runCtx.m_visibleIndicesHandles[type]; // Rendergraph handle that populateRenderGraph() stored for this object type
+	}
+
+	const BufferOffsetRange& getVisibleIndicesBuffer(GpuSceneNonRenderableObjectType type) const
+	{
+		return m_runCtx.m_visibleIndicesBuffers[type]; // Either the visibility output or, when the type has no objects, a single zeroed U32
+	}
+
 private:
 	Array<MultiframeReadbackToken, U32(GpuSceneNonRenderableObjectTypeWithFeedback::kCount)> m_readbacks;
 
 	class
 	{
 	public:
-		Array<BufferHandle, U32(GpuSceneNonRenderableObjectType::kCount)> m_visOutBufferHandle;
+		Array<BufferHandle, U32(GpuSceneNonRenderableObjectType::kCount)> m_visibleIndicesHandles;
+		Array<BufferOffsetRange, U32(GpuSceneNonRenderableObjectType::kCount)> m_visibleIndicesBuffers;
 
-		/// Feedback from the GPU. It's an array of object UUID and array index.
-		Array<WeakArray<UVec2>, U32(GpuSceneNonRenderableObjectTypeWithFeedback::kCount)> m_uuidArrayIndexPairs;
+		/// Feedback from the GPU
+		InterestingVisibleComponents m_interestingComponents;
 	} m_runCtx;
 };
 /// @}

+ 86 - 67
AnKi/Renderer/ProbeReflections.cpp

@@ -9,11 +9,16 @@
 #include <AnKi/Renderer/FinalComposite.h>
 #include <AnKi/Renderer/GBuffer.h>
 #include <AnKi/Renderer/RenderQueue.h>
+#include <AnKi/Renderer/PrimaryNonRenderableVisibility.h>
 #include <AnKi/Core/CVarSet.h>
 #include <AnKi/Util/Tracer.h>
+#include <AnKi/Core/StatsSet.h>
 #include <AnKi/Resource/MeshResource.h>
+#include <AnKi/Resource/AsyncLoader.h>
 #include <AnKi/Shaders/Include/TraditionalDeferredShadingTypes.h>
 #include <AnKi/Scene/Components/ReflectionProbeComponent.h>
+#include <AnKi/Scene/Components/LightComponent.h>
+#include <AnKi/Scene/SceneGraph.h>
 
 namespace anki {
 
@@ -21,6 +26,7 @@ static NumericCVar<U32> g_probeReflectionIrradianceResolutionCVar(CVarSubsystem:
 																  "Reflection probe irradiance resolution");
 static NumericCVar<U32> g_probeReflectionShadowMapResolutionCVar(CVarSubsystem::kRenderer, "ProbeReflectionShadowMapResolution", 64, 4, 2048,
 																 "Reflection probe shadow resolution");
+static StatCounter g_probeReflectionCountStatVar(StatCategory::kMisc, "Reflection probes rendered");
 
 Error ProbeReflections::init()
 {
@@ -180,11 +186,10 @@ Error ProbeReflections::initShadowMapping()
 	return Error::kNone;
 }
 
-void ProbeReflections::runGBuffer(const Array<GpuVisibilityOutput, 6>& visOuts, RenderPassWorkContext& rgraphCtx)
+void ProbeReflections::runGBuffer(const Array<GpuVisibilityOutput, 6>& visOuts, const Array<Mat4, 6>& viewProjMatx, const Array<Mat3x4, 6> viewMats,
+								  RenderPassWorkContext& rgraphCtx)
 {
-	ANKI_ASSERT(m_ctx.m_probe);
 	ANKI_TRACE_SCOPED_EVENT(RCubeRefl);
-	const ReflectionProbeQueueElementForRefresh& probe = *m_ctx.m_probe;
 	CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
 
 	const U32 faceIdx = rgraphCtx.m_currentSecondLevelCommandBufferIndex;
@@ -193,12 +198,10 @@ void ProbeReflections::runGBuffer(const Array<GpuVisibilityOutput, 6>& visOuts,
 	cmdb.setViewport(viewportX, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
 	cmdb.setScissor(viewportX, 0, m_gbuffer.m_tileSize, m_gbuffer.m_tileSize);
 
-	const RenderQueue& rqueue = *probe.m_renderQueues[faceIdx];
-
 	RenderableDrawerArguments args;
-	args.m_viewMatrix = rqueue.m_viewMatrix;
-	args.m_cameraTransform = rqueue.m_cameraTransform;
-	args.m_viewProjectionMatrix = rqueue.m_viewProjectionMatrix;
+	args.m_viewMatrix = viewMats[faceIdx];
+	args.m_cameraTransform = Mat3x4(Mat4(viewMats[faceIdx], Vec4(0.0f, 0.0f, 0.0f, 1.0f)).getInverse());
+	args.m_viewProjectionMatrix = viewProjMatx[faceIdx];
 	args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care about prev mats
 	args.m_sampler = getRenderer().getSamplers().m_trilinearRepeat.get();
 	args.m_renderingTechinuqe = RenderingTechnique::kGBuffer;
@@ -207,33 +210,37 @@ void ProbeReflections::runGBuffer(const Array<GpuVisibilityOutput, 6>& visOuts,
 	getRenderer().getSceneDrawer().drawMdi(args, cmdb);
 }
 
-void ProbeReflections::runLightShading(U32 faceIdx, const BufferOffsetRange& visResult, RenderPassWorkContext& rgraphCtx)
+void ProbeReflections::runLightShading(U32 faceIdx, const BufferOffsetRange& visResult, const Mat4& viewProjMat, const Mat4& cascadeViewProjMat,
+									   const ReflectionProbeComponent& probe, RenderPassWorkContext& rgraphCtx)
 {
 	ANKI_ASSERT(faceIdx <= 6);
 	ANKI_TRACE_SCOPED_EVENT(RCubeRefl);
 
-	ANKI_ASSERT(m_ctx.m_probe);
-	const ReflectionProbeQueueElementForRefresh& probe = *m_ctx.m_probe;
-	const RenderQueue& rqueue = *probe.m_renderQueues[faceIdx];
-	const Bool hasDirLight = probe.m_renderQueues[0]->m_directionalLight.m_uuid;
-
 	TraditionalDeferredLightShadingDrawInfo dsInfo;
-	dsInfo.m_viewProjectionMatrix = rqueue.m_viewProjectionMatrix;
-	dsInfo.m_invViewProjectionMatrix = rqueue.m_viewProjectionMatrix.getInverse();
-	dsInfo.m_cameraPosWSpace = rqueue.m_cameraTransform.getTranslationPart().xyz1();
+	dsInfo.m_viewProjectionMatrix = viewProjMat;
+	dsInfo.m_invViewProjectionMatrix = viewProjMat.getInverse();
+	dsInfo.m_cameraPosWSpace = probe.getWorldPosition().xyz1();
 	dsInfo.m_viewport = UVec4(0, 0, m_lightShading.m_tileSize, m_lightShading.m_tileSize);
 	dsInfo.m_gbufferTexCoordsScale = Vec2(1.0f / F32(m_lightShading.m_tileSize * 6), 1.0f / F32(m_lightShading.m_tileSize));
 	dsInfo.m_gbufferTexCoordsBias = Vec2(F32(faceIdx) * (1.0f / 6.0f), 0.0f);
 	dsInfo.m_lightbufferTexCoordsScale = Vec2(1.0f / F32(m_lightShading.m_tileSize), 1.0f / F32(m_lightShading.m_tileSize));
 	dsInfo.m_lightbufferTexCoordsBias = Vec2(0.0f, 0.0f);
-	dsInfo.m_effectiveShadowDistance = (hasDirLight) ? probe.m_renderQueues[faceIdx]->m_directionalLight.m_shadowCascadesDistances[0] : 0.0f;
-	dsInfo.m_dirLightMatrix = (hasDirLight) ? probe.m_renderQueues[faceIdx]->m_directionalLight.m_textureMatrices[0] : Mat4::getIdentity();
+	dsInfo.m_effectiveShadowDistance = probe.getShadowsRenderRadius();
+
+	const F32 xScale = 1.0f / 6.0f;
+	const F32 yScale = 1.0f;
+	const F32 xOffset = F32(faceIdx) * (1.0f / 6.0f);
+	const F32 yOffset = 0.0f;
+	const Mat4 atlasMtx(xScale, 0.0f, 0.0f, xOffset, 0.0f, yScale, 0.0f, yOffset, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+	const Mat4 biasMat4(0.5f, 0.0f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+	dsInfo.m_dirLightMatrix = atlasMtx * biasMat4 * cascadeViewProjMat;
+
 	dsInfo.m_visibleLightsBuffer = visResult;
 	dsInfo.m_gbufferRenderTargets[0] = m_ctx.m_gbufferColorRts[0];
 	dsInfo.m_gbufferRenderTargets[1] = m_ctx.m_gbufferColorRts[1];
 	dsInfo.m_gbufferRenderTargets[2] = m_ctx.m_gbufferColorRts[2];
 	dsInfo.m_gbufferDepthRenderTarget = m_ctx.m_gbufferDepthRt;
-	if(hasDirLight && probe.m_renderQueues[faceIdx]->m_directionalLight.hasShadow())
+	if(m_ctx.m_shadowMapRt.isValid())
 	{
 		dsInfo.m_directionalLightShadowmapRenderTarget = m_ctx.m_shadowMapRt;
 	}
@@ -312,33 +319,52 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 {
 	ANKI_TRACE_SCOPED_EVENT(RCubeRefl);
 
-	if(rctx.m_renderQueue->m_reflectionProbeForRefresh == nullptr) [[likely]]
+	// Iterate the visible probes to find a candidate for update
+	WeakArray<ReflectionProbeComponent*> visibleProbes =
+		getRenderer().getPrimaryNonRenderableVisibility().getInterestingVisibleComponents().m_reflectionProbes;
+	ReflectionProbeComponent* probeToRefresh = nullptr;
+	for(ReflectionProbeComponent* probe : visibleProbes)
+	{
+		if(probe->getEnvironmentTextureNeedsRefresh())
+		{
+			probeToRefresh = probe;
+			break;
+		}
+	}
+
+	if(probeToRefresh == nullptr || ResourceManager::getSingleton().getAsyncLoader().getTasksInFlightCount() != 0) [[likely]]
 	{
-		// Early exit
+		// Nothing to update or can't update right now, early exit
 		m_ctx.m_lightShadingRt = {};
 		return;
 	}
 
+	g_probeReflectionCountStatVar.increment(1);
+	probeToRefresh->setEnvironmentTextureAsRefreshed();
+
 #if ANKI_EXTRA_CHECKS
 	m_ctx = {};
 #endif
 
-	m_ctx.m_probe = rctx.m_renderQueue->m_reflectionProbeForRefresh;
-
 	RenderGraphDescription& rgraph = rctx.m_renderGraphDescr;
 
 	// GBuffer visibility
 	Array<GpuVisibilityOutput, 6> visOuts;
+	Array<Frustum, 6> frustums;
 	for(U32 i = 0; i < 6; ++i)
 	{
-		const RenderQueue& queue = *m_ctx.m_probe->m_renderQueues[i];
+		Frustum& frustum = frustums[i];
+		frustum.setPerspective(kClusterObjectFrustumNearPlane, probeToRefresh->getRenderRadius(), kPi / 2.0f, kPi / 2.0f);
+		frustum.setWorldTransform(Transform(probeToRefresh->getWorldPosition().xyz0(), Frustum::getOmnidirectionalFrustumRotations()[i], 1.0f));
+		frustum.update();
+
 		Array<F32, kMaxLodCount - 1> lodDistances = {1000.0f, 1001.0f}; // Something far to force detailed LODs
 
 		GpuVisibilityInput visIn;
 		visIn.m_passesName = "Cube refl GBuffer visibility";
 		visIn.m_technique = RenderingTechnique::kGBuffer;
-		visIn.m_viewProjectionMatrix = queue.m_viewProjectionMatrix;
-		visIn.m_lodReferencePoint = queue.m_cameraTransform.getTranslationPart().xyz();
+		visIn.m_viewProjectionMatrix = frustum.getViewProjectionMatrix();
+		visIn.m_lodReferencePoint = probeToRefresh->getWorldPosition();
 		visIn.m_lodDistances = lodDistances;
 		visIn.m_rgraph = &rgraph;
 
@@ -356,11 +382,20 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 		}
 		m_ctx.m_gbufferDepthRt = rgraph.newRenderTarget(m_gbuffer.m_depthRtDescr);
 
+		// Prepare the matrices
+		Array<Mat4, 6> viewProjMats;
+		Array<Mat3x4, 6> viewMats;
+		for(U32 f = 0; f < 6; ++f)
+		{
+			viewProjMats[f] = frustums[f].getViewProjectionMatrix();
+			viewMats[f] = frustums[f].getViewMatrix();
+		}
+
 		// Pass
 		GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass("Cube refl GBuffer");
 		pass.setFramebufferInfo(m_gbuffer.m_fbDescr, rts, m_ctx.m_gbufferDepthRt);
-		pass.setWork(6, [this, visOuts](RenderPassWorkContext& rgraphCtx) {
-			runGBuffer(visOuts, rgraphCtx);
+		pass.setWork(6, [this, visOuts, viewProjMats, viewMats](RenderPassWorkContext& rgraphCtx) {
+			runGBuffer(visOuts, viewProjMats, viewMats, rgraphCtx);
 		});
 
 		for(U i = 0; i < kGBufferColorRenderTargetCount; ++i)
@@ -381,21 +416,27 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 	}
 
 	// Shadow visibility. Optional
-	const Bool doShadows =
-		m_ctx.m_probe->m_renderQueues[0]->m_directionalLight.m_uuid && m_ctx.m_probe->m_renderQueues[0]->m_directionalLight.m_shadowCascadeCount > 0;
+	const LightComponent* dirLightc = SceneGraph::getSingleton().getDirectionalLight();
+	const Bool doShadows = dirLightc && dirLightc->getShadowEnabled();
 	Array<GpuVisibilityOutput, 6> shadowVisOuts;
+	Array<Mat4, 6> cascadeViewProjMats;
+	Array<Mat3x4, 6> cascadeViewMats;
 	if(doShadows)
 	{
 		for(U i = 0; i < 6; ++i)
 		{
-			const RenderQueue& queue = *m_ctx.m_probe->m_renderQueues[i]->m_directionalLight.m_shadowRenderQueues[0];
+			constexpr U32 kCascadeCount = 1;
+			dirLightc->computeCascadeFrustums(frustums[i], Array<F32, kCascadeCount>{probeToRefresh->getShadowsRenderRadius()},
+											  WeakArray<Mat4>(&cascadeViewProjMats[i], kCascadeCount),
+											  WeakArray<Mat3x4>(&cascadeViewMats[i], kCascadeCount));
+
 			Array<F32, kMaxLodCount - 1> lodDistances = {1000.0f, 1001.0f}; // Something far to force detailed LODs
 
 			GpuVisibilityInput visIn;
 			visIn.m_passesName = "Cube refl shadows visibility";
 			visIn.m_technique = RenderingTechnique::kDepth;
-			visIn.m_viewProjectionMatrix = queue.m_viewProjectionMatrix;
-			visIn.m_lodReferencePoint = queue.m_cameraTransform.getTranslationPart().xyz();
+			visIn.m_viewProjectionMatrix = cascadeViewProjMats[i];
+			visIn.m_lodReferencePoint = probeToRefresh->getWorldPosition();
 			visIn.m_lodDistances = lodDistances;
 			visIn.m_rgraph = &rgraph;
 
@@ -406,30 +447,14 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 	// Shadows. Optional
 	if(doShadows)
 	{
-		// Update light matrices
-		for(U i = 0; i < 6; ++i)
-		{
-			ANKI_ASSERT(m_ctx.m_probe->m_renderQueues[i]->m_directionalLight.m_uuid
-						&& m_ctx.m_probe->m_renderQueues[i]->m_directionalLight.m_shadowCascadeCount == 1);
-
-			const F32 xScale = 1.0f / 6.0f;
-			const F32 yScale = 1.0f;
-			const F32 xOffset = F32(i) * (1.0f / 6.0f);
-			const F32 yOffset = 0.0f;
-			const Mat4 atlasMtx(xScale, 0.0f, 0.0f, xOffset, 0.0f, yScale, 0.0f, yOffset, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f);
-
-			Mat4& lightMat = m_ctx.m_probe->m_renderQueues[i]->m_directionalLight.m_textureMatrices[0];
-			lightMat = atlasMtx * lightMat;
-		}
-
 		// RT
 		m_ctx.m_shadowMapRt = rgraph.newRenderTarget(m_shadowMapping.m_rtDescr);
 
 		// Pass
 		GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass("Cube refl shadows");
 		pass.setFramebufferInfo(m_shadowMapping.m_fbDescr, {}, m_ctx.m_shadowMapRt);
-		pass.setWork(6, [this, shadowVisOuts](RenderPassWorkContext& rgraphCtx) {
-			runShadowMapping(shadowVisOuts, rgraphCtx);
+		pass.setWork(6, [this, shadowVisOuts, cascadeViewProjMats, cascadeViewMats](RenderPassWorkContext& rgraphCtx) {
+			runShadowMapping(shadowVisOuts, cascadeViewProjMats, cascadeViewMats, rgraphCtx);
 		});
 
 		TextureSubresourceInfo subresource(DepthStencilAspectBit::kDepth);
@@ -455,7 +480,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 		GpuVisibilityNonRenderablesInput in;
 		in.m_passesName = "Cube refl light visibility";
 		in.m_objectType = GpuSceneNonRenderableObjectType::kLight;
-		in.m_viewProjectionMat = m_ctx.m_probe->m_renderQueues[faceIdx]->m_viewProjectionMatrix;
+		in.m_viewProjectionMat = cascadeViewProjMats[faceIdx];
 		in.m_rgraph = &rgraph;
 		getRenderer().getGpuVisibilityNonRenderables().populateRenderGraph(in, lightVis[faceIdx]);
 	}
@@ -463,7 +488,7 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 	// Light shading passes
 	{
 		// RT
-		m_ctx.m_lightShadingRt = rgraph.importRenderTarget(m_ctx.m_probe->m_reflectionTexture, TextureUsageBit::kNone);
+		m_ctx.m_lightShadingRt = rgraph.importRenderTarget(&probeToRefresh->getReflectionTexture(), TextureUsageBit::kNone);
 
 		// Passes
 		static constexpr Array<CString, 6> passNames = {"Cube refl light shading #0", "Cube refl light shading #1", "Cube refl light shading #2",
@@ -473,8 +498,9 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 			GraphicsRenderPassDescription& pass = rgraph.newGraphicsRenderPass(passNames[faceIdx]);
 
 			pass.setFramebufferInfo(m_lightShading.m_fbDescr[faceIdx], {m_ctx.m_lightShadingRt});
-			pass.setWork([this, visResult = lightVis[faceIdx].m_visiblesBuffer, faceIdx](RenderPassWorkContext& rgraphCtx) {
-				runLightShading(faceIdx, visResult, rgraphCtx);
+			pass.setWork([this, visResult = lightVis[faceIdx].m_visiblesBuffer, faceIdx, viewProjMat = frustums[faceIdx].getViewProjectionMatrix(),
+						  cascadeViewProjMat = cascadeViewProjMats[faceIdx], probeToRefresh](RenderPassWorkContext& rgraphCtx) {
+				runLightShading(faceIdx, visResult, viewProjMat, cascadeViewProjMat, *probeToRefresh, rgraphCtx);
 			});
 
 			pass.newBufferDependency(lightVis[faceIdx].m_visiblesBufferHandle, BufferUsageBit::kStorageFragmentRead);
@@ -553,9 +579,9 @@ void ProbeReflections::populateRenderGraph(RenderingContext& rctx)
 	}
 }
 
-void ProbeReflections::runShadowMapping(const Array<GpuVisibilityOutput, 6>& visOuts, RenderPassWorkContext& rgraphCtx)
+void ProbeReflections::runShadowMapping(const Array<GpuVisibilityOutput, 6>& visOuts, const Array<Mat4, 6>& viewProjMats,
+										const Array<Mat3x4, 6>& viewMats, RenderPassWorkContext& rgraphCtx)
 {
-	ANKI_ASSERT(m_ctx.m_probe);
 	ANKI_TRACE_SCOPED_EVENT(RCubeRefl);
 
 	CommandBuffer& cmdb = *rgraphCtx.m_commandBuffer;
@@ -563,21 +589,14 @@ void ProbeReflections::runShadowMapping(const Array<GpuVisibilityOutput, 6>& vis
 
 	const U32 faceIdx = rgraphCtx.m_currentSecondLevelCommandBufferIndex;
 
-	ANKI_ASSERT(m_ctx.m_probe->m_renderQueues[faceIdx]);
-	const RenderQueue& faceRenderQueue = *m_ctx.m_probe->m_renderQueues[faceIdx];
-	ANKI_ASSERT(faceRenderQueue.m_directionalLight.m_uuid != 0);
-	ANKI_ASSERT(faceRenderQueue.m_directionalLight.m_shadowCascadeCount == 1);
-	ANKI_ASSERT(faceRenderQueue.m_directionalLight.m_shadowRenderQueues[0]);
-	const RenderQueue& cascadeRenderQueue = *faceRenderQueue.m_directionalLight.m_shadowRenderQueues[0];
-
 	const U32 rez = m_shadowMapping.m_rtDescr.m_height;
 	cmdb.setViewport(rez * faceIdx, 0, rez, rez);
 	cmdb.setScissor(rez * faceIdx, 0, rez, rez);
 
 	RenderableDrawerArguments args;
-	args.m_viewMatrix = cascadeRenderQueue.m_viewMatrix;
+	args.m_viewMatrix = viewMats[faceIdx];
 	args.m_cameraTransform = Mat3x4::getIdentity(); // Don't care
-	args.m_viewProjectionMatrix = cascadeRenderQueue.m_viewProjectionMatrix;
+	args.m_viewProjectionMatrix = viewProjMats[faceIdx];
 	args.m_previousViewProjectionMatrix = Mat4::getIdentity(); // Don't care
 	args.m_sampler = getRenderer().getSamplers().m_trilinearRepeatAniso.get();
 	args.m_renderingTechinuqe = RenderingTechnique::kDepth;

+ 6 - 5
AnKi/Renderer/ProbeReflections.h

@@ -103,8 +103,6 @@ private:
 	class
 	{
 	public:
-		const ReflectionProbeQueueElementForRefresh* m_probe = nullptr;
-
 		Array<RenderTargetHandle, kGBufferColorRenderTargetCount> m_gbufferColorRts;
 		RenderTargetHandle m_gbufferDepthRt;
 		RenderTargetHandle m_lightShadingRt;
@@ -119,9 +117,12 @@ private:
 	Error initIrradianceToRefl();
 	Error initShadowMapping();
 
-	void runGBuffer(const Array<GpuVisibilityOutput, 6>& visOuts, RenderPassWorkContext& rgraphCtx);
-	void runShadowMapping(const Array<GpuVisibilityOutput, 6>& visOuts, RenderPassWorkContext& rgraphCtx);
-	void runLightShading(U32 faceIdx, const BufferOffsetRange& visResult, RenderPassWorkContext& rgraphCtx);
+	void runGBuffer(const Array<GpuVisibilityOutput, 6>& visOuts, const Array<Mat4, 6>& viewProjMatx, const Array<Mat3x4, 6> viewMats,
+					RenderPassWorkContext& rgraphCtx);
+	void runShadowMapping(const Array<GpuVisibilityOutput, 6>& visOuts, const Array<Mat4, 6>& viewProjMats, const Array<Mat3x4, 6>& viewMats,
+						  RenderPassWorkContext& rgraphCtx);
+	void runLightShading(U32 faceIdx, const BufferOffsetRange& visResult, const Mat4& viewProjMat, const Mat4& cascadeViewProjMat,
+						 const ReflectionProbeComponent& probe, RenderPassWorkContext& rgraphCtx);
 	void runMipmappingOfLightShading(U32 faceIdx, RenderPassWorkContext& rgraphCtx);
 	void runIrradiance(RenderPassWorkContext& rgraphCtx);
 	void runIrradianceToRefl(RenderPassWorkContext& rgraphCtx);

+ 8 - 0
AnKi/Renderer/Renderer.cpp

@@ -46,6 +46,7 @@
 #include <AnKi/Renderer/VrsSriGeneration.h>
 #include <AnKi/Renderer/PackVisibleClusteredObjects.h>
 #include <AnKi/Renderer/PrimaryNonRenderableVisibility.h>
+#include <AnKi/Renderer/ClusterBinning2.h>
 
 namespace anki {
 
@@ -258,6 +259,9 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 	m_clusterBinning.reset(newInstance<ClusterBinning>(RendererMemoryPool::getSingleton()));
 	ANKI_CHECK(m_clusterBinning->init());
 
+	m_clusterBinning2.reset(newInstance<ClusterBinning2>(RendererMemoryPool::getSingleton()));
+	ANKI_CHECK(m_clusterBinning2->init());
+
 	m_packVisibleClustererObjects.reset(newInstance<PackVisibleClusteredObjects>(RendererMemoryPool::getSingleton()));
 	ANKI_CHECK(m_packVisibleClustererObjects->init());
 
@@ -341,6 +345,9 @@ Error Renderer::populateRenderGraph(RenderingContext& ctx)
 
 	ctx.m_matrices.m_unprojectionParameters = ctx.m_matrices.m_projection.extractPerspectiveUnprojectionParams();
 
+	ctx.m_cameraNear = ctx.m_renderQueue->m_cameraNear;
+	ctx.m_cameraFar = ctx.m_renderQueue->m_cameraFar;
+
 	// Import RTs first
 	m_downscaleBlur->importRenderTargets(ctx);
 	m_tonemapping->importRenderTargets(ctx);
@@ -354,6 +361,7 @@ Error Renderer::populateRenderGraph(RenderingContext& ctx)
 	m_packVisibleClustererObjects->populateRenderGraph(ctx);
 	m_genericCompute->populateRenderGraph(ctx);
 	m_clusterBinning->populateRenderGraph(ctx);
+	m_clusterBinning2->populateRenderGraph(ctx);
 	if(m_accelerationStructureBuilder)
 	{
 		m_accelerationStructureBuilder->populateRenderGraph(ctx);

+ 2 - 2
AnKi/Renderer/Renderer.h

@@ -51,7 +51,7 @@ public:
 	{ \
 		return *m_##b; \
 	}
-#include <AnKi/Renderer/RendererObject.defs.h>
+#include <AnKi/Renderer/RendererObject.def.h>
 #undef ANKI_RENDERER_OBJECT_DEF
 
 	Bool getRtShadowsEnabled() const
@@ -192,7 +192,7 @@ private:
 	/// @name Rendering stages
 	/// @{
 #define ANKI_RENDERER_OBJECT_DEF(a, b) UniquePtr<a, SingletonMemoryPoolDeleter<RendererMemoryPool>> m_##b;
-#include <AnKi/Renderer/RendererObject.defs.h>
+#include <AnKi/Renderer/RendererObject.def.h>
 #undef ANKI_RENDERER_OBJECT_DEF
 	/// @}
 

+ 22 - 0
AnKi/Renderer/RendererObject.cpp

@@ -76,4 +76,26 @@ Error RendererObject::loadShaderProgram(CString filename, ShaderProgramResourceP
 	return Error::kNone;
 }
 
+/// Load a shader program resource (if it's not loaded already) and fetch the program of the variant that the given mutators select.
+/// @param filename The filename of the shader program resource. Used only when rsrc is not created yet.
+/// @param mutators The mutator name/value pairs that select the variant.
+/// @param[in,out] rsrc The resource. It's loaded by this method if it's not created.
+/// @param[out] grProg The program of the selected variant.
+/// @return An error if the resource failed to load.
+Error RendererObject::loadShaderProgram(CString filename, ConstWeakArray<SubMutation> mutators, ShaderProgramResourcePtr& rsrc,
+										ShaderProgramPtr& grProg)
+{
+	// Load the resource only the first time, subsequent calls with the same rsrc reuse it
+	if(!rsrc.isCreated())
+	{
+		ANKI_CHECK(ResourceManager::getSingleton().loadResource(filename, rsrc));
+	}
+
+	ShaderProgramResourceVariantInitInfo initInf(rsrc);
+	for(const SubMutation& mutation : mutators)
+	{
+		initInf.addMutation(mutation.m_mutatorName, mutation.m_value);
+	}
+
+	// Start from null so that a missing variant is caught by the assertion instead of dereferencing an uninitialized pointer
+	const ShaderProgramResourceVariant* variant = nullptr;
+	rsrc->getOrCreateVariant(initInf, variant);
+	ANKI_ASSERT(variant && "The requested mutation didn't produce a variant");
+
+	grProg.reset(&variant->getProgram());
+
+	return Error::kNone;
+}
+
 } // end namespace anki

+ 1 - 0
AnKi/Renderer/RendererObject.defs.h → AnKi/Renderer/RendererObject.def.h

@@ -33,3 +33,4 @@ ANKI_RENDERER_OBJECT_DEF(IndirectDiffuse, indirectDiffuse)
 ANKI_RENDERER_OBJECT_DEF(VrsSriGeneration, vrsSriGeneration)
 ANKI_RENDERER_OBJECT_DEF(PackVisibleClusteredObjects, packVisibleClustererObjects)
 ANKI_RENDERER_OBJECT_DEF(PrimaryNonRenderableVisibility, primaryNonRenderableVisibility)
+ANKI_RENDERER_OBJECT_DEF(ClusterBinning2, clusterBinning2)

+ 9 - 0
AnKi/Renderer/RendererObject.h

@@ -102,6 +102,15 @@ protected:
 	void registerDebugRenderTarget(CString rtName);
 
 	static Error loadShaderProgram(CString filename, ShaderProgramResourcePtr& rsrc, ShaderProgramPtr& grProg);
+
+	class SubMutation // A mutator name/value pair, consumed by the loadShaderProgram() overload that accepts mutators
+	{
+	public:
+		CString m_mutatorName; // Forwarded as the 1st argument of ShaderProgramResourceVariantInitInfo::addMutation()
+		MutatorValue m_value; // Forwarded as the 2nd argument of ShaderProgramResourceVariantInitInfo::addMutation()
+	};
+
+	static Error loadShaderProgram(CString filename, ConstWeakArray<SubMutation> mutators, ShaderProgramResourcePtr& rsrc, ShaderProgramPtr& grProg);
 };
 /// @}
 

+ 1 - 1
AnKi/Renderer/Utils/Readback.h

@@ -43,7 +43,7 @@ public:
 			const GpuReadbackMemoryAllocation& allocation = token.m_allocations[slot];
 
 			data.resize(allocation.getAllocatedSize() / sizeof(T));
-			memcpy(&data[0], static_cast<const U8*>(allocation.getMappedMemory()) + allocation.getOffset(), allocation.getAllocatedSize());
+			memcpy(&data[0], static_cast<const U8*>(allocation.getMappedMemory()), allocation.getAllocatedSize());
 		}
 		else
 		{

+ 6 - 0
AnKi/Scene/Components/GlobalIlluminationProbeComponent.h

@@ -99,6 +99,12 @@ public:
 		++m_cellIdxToRefresh;
 	}
 
+	U32 getUuid() const // Returns m_uuid
+	{
+		ANKI_ASSERT(m_uuid); // A zero UUID is not expected here
+		return m_uuid;
+	}
+
 private:
 	Vec3 m_halfSize = Vec3(0.5f);
 	Vec3 m_worldPos = Vec3(0.0f);

+ 126 - 0
AnKi/Scene/Components/LightComponent.cpp

@@ -404,4 +404,130 @@ void LightComponent::setupDirectionalLightQueueElement(const Frustum& primaryFru
 	}
 }
 
+/// Compute the view-projection (and optionally the view) matrix of every shadow cascade. Valid only for directional lights with shadows.
+/// The number of cascades is the size of cascadeViewProjMats and cascadeDistances needs to have the same size.
+/// @param primaryFrustum The frustum that looks at the light. Only perspective frustums are implemented.
+/// @param cascadeDistances The far distance of each cascade. The near of a cascade is the far of the previous one (or the frustum's near).
+/// @param[out] cascadeViewProjMats One view-projection matrix per cascade.
+/// @param[out] cascadeViewMats One view matrix per cascade. It can be smaller than the cascade count, only the entries that fit are written.
+void LightComponent::computeCascadeFrustums(const Frustum& primaryFrustum, ConstWeakArray<F32> cascadeDistances, WeakArray<Mat4> cascadeViewProjMats,
+											WeakArray<Mat3x4> cascadeViewMats) const
+{
+	ANKI_ASSERT(m_type == LightComponentType::kDirectional);
+	ANKI_ASSERT(m_shadow);
+	ANKI_ASSERT(cascadeViewProjMats.getSize() <= kMaxShadowCascades && cascadeViewProjMats.getSize() > 0);
+	ANKI_ASSERT(cascadeDistances.getSize() == cascadeViewProjMats.getSize());
+
+	// The size of the output dictates the number of cascades
+	const U32 shadowCascadeCount = cascadeViewProjMats.getSize();
+
+	if(primaryFrustum.getFrustumType() == FrustumType::kPerspective)
+	{
+		// Get some stuff
+		const F32 fovX = primaryFrustum.getFovX();
+		const F32 fovY = primaryFrustum.getFovY();
+		const auto tanHalfFovY = tan(fovY / 2.0f); // Same for every cascade. Keep the type tan() returns to not alter the math
+
+		// Compute a sphere per cascade
+		Array<Sphere, kMaxShadowCascades> boundingSpheres;
+		for(U32 cascade = 0; cascade < shadowCascadeCount; ++cascade)
+		{
+			// Compute the center of the sphere
+			//           ^ z
+			//           |
+			// ----------|---------- A(a, -f)
+			//  \        |        /
+			//   \       |       /
+			//    \    C(0,z)   /
+			//     \     |     /
+			//      \    |    /
+			//       \---|---/ B(b, -n)
+			//        \  |  /
+			//         \ | /
+			//           v
+			// --------------------------> x
+			//           |
+			// The square distance of A-C is equal to B-C. Solve the equation to find the z.
+			const F32 f = cascadeDistances[cascade]; // Cascade far
+			const F32 n = (cascade == 0) ? primaryFrustum.getNear() : cascadeDistances[cascade - 1]; // Cascade near
+			const F32 a = f * tanHalfFovY * fovX / fovY;
+			const F32 b = n * tanHalfFovY * fovX / fovY;
+			const F32 z = (b * b + n * n - a * a - f * f) / (2.0f * (f - n));
+			ANKI_ASSERT(absolute((Vec2(a, -f) - Vec2(0, z)).getLength() - (Vec2(b, -n) - Vec2(0, z)).getLength()) <= kEpsilonf * 100.0f);
+
+			Vec3 C(0.0f, 0.0f, z); // Sphere center
+
+			// Compute the radius of the sphere
+			const Vec3 A(a, tanHalfFovY * f, -f);
+			const F32 r = (A - C).getLength();
+
+			// Set the sphere. The center is moved from the space of the frustum to world space
+			boundingSpheres[cascade].setRadius(r);
+			boundingSpheres[cascade].setCenter(primaryFrustum.getWorldTransform().transform(C));
+		}
+
+		// Compute the matrices
+		for(U32 cascade = 0; cascade < shadowCascadeCount; ++cascade)
+		{
+			const Sphere& sphere = boundingSpheres[cascade];
+			const Vec3 sphereCenter = sphere.getCenter().xyz();
+			const F32 sphereRadius = sphere.getRadius();
+			const Vec3& lightDir = getDirection();
+			const Vec3 sceneMin = m_dir.m_sceneMin - Vec3(sphereRadius); // Push the bounds a bit
+			const Vec3 sceneMax = m_dir.m_sceneMax + Vec3(sphereRadius);
+
+			// Compute the intersections with the scene bounds
+			Vec3 eye;
+			if(sphereCenter > sceneMin && sphereCenter < sceneMax)
+			{
+				// Inside the scene bounds
+				const Aabb sceneBox(sceneMin, sceneMax);
+				const F32 t = testCollisionInside(sceneBox, Ray(sphereCenter, -lightDir));
+				eye = sphereCenter + t * (-lightDir);
+			}
+			else
+			{
+				// Outside the scene bounds, back off from the center by one radius
+				eye = sphereCenter + sphereRadius * (-lightDir);
+			}
+
+			// View
+			Transform cascadeTransform = m_worldTransform;
+			cascadeTransform.setOrigin(eye.xyz0());
+			const Mat4 cascadeViewMat = Mat4(cascadeTransform.getInverse());
+
+			// Projection
+			const F32 far = (eye - sphereCenter).getLength() + sphereRadius;
+			Mat4 cascadeProjMat = Mat4::calculateOrthographicProjectionMatrix(sphereRadius, -sphereRadius, sphereRadius, -sphereRadius,
+																			  kClusterObjectFrustumNearPlane, far);
+
+			// Now it's time to stabilize the shadows by aligning the projection matrix
+			{
+				// Project a random fixed point to the light matrix
+				const Vec4 randomPointAlmostLightSpace = (cascadeProjMat * cascadeViewMat) * Vec3(0.0f).xyz1();
+
+				// Choose a random low shadowmap size and align the random point
+				const F32 shadowmapSize = 128.0f;
+				const F32 shadowmapSize2 = shadowmapSize / 2.0f; // Div with 2 because the projected point is in NDC
+				const F32 alignedX = std::round(randomPointAlmostLightSpace.x() * shadowmapSize2) / shadowmapSize2;
+				const F32 alignedY = std::round(randomPointAlmostLightSpace.y() * shadowmapSize2) / shadowmapSize2;
+
+				const F32 dx = alignedX - randomPointAlmostLightSpace.x();
+				const F32 dy = alignedY - randomPointAlmostLightSpace.y();
+
+				// Fix the projection matrix by applying an offset
+				Mat4 correctionTranslationMat = Mat4::getIdentity();
+				correctionTranslationMat.setTranslationPart(Vec4(dx, dy, 0, 1.0f));
+
+				cascadeProjMat = correctionTranslationMat * cascadeProjMat;
+			}
+
+			// Write the results
+			cascadeViewProjMats[cascade] = cascadeProjMat * cascadeViewMat;
+
+			// The view matrices are optional, write only those that the caller has room for
+			if(cascade < cascadeViewMats.getSize())
+			{
+				cascadeViewMats[cascade] = Mat3x4(cascadeViewMat);
+			}
+		}
+	}
+	else
+	{
+		// Other frustum types are not implemented. The output arrays are left untouched
+		ANKI_ASSERT(!"TODO");
+	}
+}
+
 } // end namespace anki

+ 14 - 0
AnKi/Scene/Components/LightComponent.h

@@ -154,6 +154,20 @@ public:
 	/// @param[out] cascadeFrustums Fill those frustums as well. The size of this array is the count of the cascades.
 	void setupDirectionalLightQueueElement(const Frustum& cameraFrustum, DirectionalLightQueueElement& el, WeakArray<Frustum> cascadeFrustums) const;
 
+	/// Calculate some matrices for each cascade. For dir lights.
+	/// @param cameraFrustum Who is looking at the light.
+	/// @param cascadeDistances The distances of the cascades.
+	/// @param cascadeViewProjMats View projection matrices for each cascade.
+	/// @param cascadeViewMats View matrices for each cascade. Optional.
+	void computeCascadeFrustums(const Frustum& cameraFrustum, ConstWeakArray<F32> cascadeDistances, WeakArray<Mat4> cascadeViewProjMats,
+								WeakArray<Mat3x4> cascadeViewMats = {}) const;
+
+	U32 getUuid() const // Returns m_uuid
+	{
+		ANKI_ASSERT(m_uuid); // A zero UUID is not expected here
+		return m_uuid;
+	}
+
 private:
 	Vec4 m_diffColor = Vec4(0.5f);
 	Transform m_worldTransform = Transform::getIdentity();

+ 36 - 65
AnKi/Scene/Components/ReflectionProbeComponent.cpp

@@ -16,107 +16,78 @@ NumericCVar<U32> g_reflectionProbeResolutionCVar(CVarSubsystem::kScene, "Reflect
 
 ReflectionProbeComponent::ReflectionProbeComponent(SceneNode* node)
 	: SceneComponent(node, kClassType)
-	, m_spatial(this)
 {
 	m_worldPos = node->getWorldTransform().getOrigin().xyz();
+	m_gpuSceneProbe.allocate();
 
-	for(U32 i = 0; i < 6; ++i)
-	{
-		m_frustums[i].init(FrustumType::kPerspective);
-		m_frustums[i].setPerspective(kClusterObjectFrustumNearPlane, 100.0f, kPi / 2.0f, kPi / 2.0f);
-		m_frustums[i].setWorldTransform(Transform(m_worldPos.xyz0(), Frustum::getOmnidirectionalFrustumRotations()[i], 1.0f));
-		m_frustums[i].setShadowCascadeCount(1);
-		m_frustums[i].update();
-	}
+	TextureInitInfo texInit("ReflectionProbe");
+	texInit.m_format =
+		(GrManager::getSingleton().getDeviceCapabilities().m_unalignedBbpTextureFormats) ? Format::kR16G16B16_Sfloat : Format::kR16G16B16A16_Sfloat;
+	texInit.m_width = g_reflectionProbeResolutionCVar.get();
+	texInit.m_height = texInit.m_width;
+	texInit.m_mipmapCount = U8(computeMaxMipmapCount2d(texInit.m_width, texInit.m_height, 8));
+	texInit.m_type = TextureType::kCube;
+	texInit.m_usage = TextureUsageBit::kAllSampled | TextureUsageBit::kImageComputeWrite | TextureUsageBit::kImageComputeRead
+					  | TextureUsageBit::kAllFramebuffer | TextureUsageBit::kGenerateMipmaps;
 
-	m_gpuSceneProbe.allocate();
+	m_reflectionTex = GrManager::getSingleton().newTexture(texInit);
+
+	TextureViewInitInfo viewInit(m_reflectionTex.get(), "ReflectionProbe");
+	m_reflectionView = GrManager::getSingleton().newTextureView(viewInit);
+
+	m_reflectionTexBindlessIndex = m_reflectionView->getOrCreateBindlessTextureIndex();
 }
 
 ReflectionProbeComponent::~ReflectionProbeComponent()
 {
-	m_spatial.removeFromOctree(SceneGraph::getSingleton().getOctree());
 }
 
 Error ReflectionProbeComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 {
 	const Bool moved = info.m_node->movedThisFrame();
-	const Bool shapeUpdated = m_dirty;
+	updated = moved || m_dirty;
 	m_dirty = false;
-	updated = moved || shapeUpdated;
 
-	if(shapeUpdated && !m_reflectionTex) [[unlikely]]
+	if(moved) [[unlikely]]
 	{
-		TextureInitInfo texInit("ReflectionProbe");
-		texInit.m_format = (GrManager::getSingleton().getDeviceCapabilities().m_unalignedBbpTextureFormats) ? Format::kR16G16B16_Sfloat
-																											: Format::kR16G16B16A16_Sfloat;
-		texInit.m_width = g_reflectionProbeResolutionCVar.get();
-		texInit.m_height = texInit.m_width;
-		texInit.m_mipmapCount = U8(computeMaxMipmapCount2d(texInit.m_width, texInit.m_height, 8));
-		texInit.m_type = TextureType::kCube;
-		texInit.m_usage = TextureUsageBit::kAllSampled | TextureUsageBit::kImageComputeWrite | TextureUsageBit::kImageComputeRead
-						  | TextureUsageBit::kAllFramebuffer | TextureUsageBit::kGenerateMipmaps;
-
-		m_reflectionTex = GrManager::getSingleton().newTexture(texInit);
-
-		TextureViewInitInfo viewInit(m_reflectionTex.get(), "ReflectionPRobe");
-		m_reflectionView = GrManager::getSingleton().newTextureView(viewInit);
-
-		m_reflectionTexBindlessIndex = m_reflectionView->getOrCreateBindlessTextureIndex();
+		m_reflectionNeedsRefresh = true;
 	}
 
 	if(updated) [[unlikely]]
 	{
-		m_reflectionNeedsRefresh = true;
-
 		m_worldPos = info.m_node->getWorldTransform().getOrigin().xyz();
 
-		F32 effectiveDistance = max(m_halfSize.x(), m_halfSize.y());
-		effectiveDistance = max(effectiveDistance, m_halfSize.z());
-		effectiveDistance = max(effectiveDistance, g_probeEffectiveDistanceCVar.get());
-
-		const F32 shadowCascadeDistance = min(effectiveDistance, g_probeShadowEffectiveDistanceCVar.get());
-
-		for(U32 i = 0; i < 6; ++i)
-		{
-			m_frustums[i].setWorldTransform(Transform(m_worldPos.xyz0(), Frustum::getOmnidirectionalFrustumRotations()[i], 1.0f));
-
-			m_frustums[i].setFar(effectiveDistance);
-			m_frustums[i].setShadowCascadeDistance(0, shadowCascadeDistance);
-
-			// Add something really far to force LOD 0 to be used. The importing tools create LODs with holes some times and that causes the sky to
-			// bleed to GI rendering
-			m_frustums[i].setLodDistances(
-				{effectiveDistance - 3.0f * kEpsilonf, effectiveDistance - 2.0f * kEpsilonf, effectiveDistance - 1.0f * kEpsilonf});
-		}
-
-		const Aabb aabbWorld(-m_halfSize + m_worldPos, m_halfSize + m_worldPos);
-		m_spatial.setBoundingShape(aabbWorld);
-
-		// New UUID
-		m_uuid = SceneGraph::getSingleton().getNewUuid();
+		// Update the UUID
+		m_uuid = (m_reflectionNeedsRefresh) ? SceneGraph::getSingleton().getNewUuid() : 0;
 
 		// Upload to the GPU scene
 		GpuSceneReflectionProbe gpuProbe;
 		gpuProbe.m_position = m_worldPos;
 		gpuProbe.m_cubeTexture = m_reflectionTexBindlessIndex;
+
+		const Aabb aabbWorld(-m_halfSize + m_worldPos, m_halfSize + m_worldPos);
 		gpuProbe.m_aabbMin = aabbWorld.getMin().xyz();
 		gpuProbe.m_aabbMax = aabbWorld.getMax().xyz();
+
 		gpuProbe.m_uuid = m_uuid;
 		gpuProbe.m_arrayIndex = getArrayIndex();
 		m_gpuSceneProbe.uploadToGpuScene(gpuProbe);
 	}
 
-	// Update spatial and frustums
-	const Bool spatialUpdated = m_spatial.update(SceneGraph::getSingleton().getOctree());
-	updated = updated || spatialUpdated;
+	return Error::kNone;
+}
 
-	for(U32 i = 0; i < 6; ++i)
-	{
-		const Bool frustumUpdated = m_frustums[i].update();
-		updated = updated || frustumUpdated;
-	}
+/// The biggest half extent of the probe's box, but never less than what the g_probeEffectiveDistanceCVar CVar dictates.
+F32 ReflectionProbeComponent::getRenderRadius() const
+{
+	const F32 largestHalfExtent = max(max(m_halfSize.x(), m_halfSize.y()), m_halfSize.z());
+	return max(largestHalfExtent, g_probeEffectiveDistanceCVar.get());
+}
 
-	return Error::kNone;
+F32 ReflectionProbeComponent::getShadowsRenderRadius() const
+{
+	return min(getRenderRadius(), g_probeShadowEffectiveDistanceCVar.get()); // The render radius capped by the shadow distance CVar
+}
 
 } // end namespace anki

+ 19 - 24
AnKi/Scene/Components/ReflectionProbeComponent.h

@@ -33,6 +33,7 @@ public:
 	{
 		m_halfSize = sizeXYZ / 2.0f;
 		m_dirty = true;
+		m_reflectionNeedsRefresh = true;
 	}
 
 	Vec3 getBoxVolumeSize() const
@@ -40,38 +41,36 @@ public:
 		return m_halfSize * 2.0f;
 	}
 
-	ANKI_INTERNAL WeakArray<Frustum> getFrustums()
+	ANKI_INTERNAL Bool getEnvironmentTextureNeedsRefresh() const
 	{
-		return WeakArray<Frustum>(m_frustums);
+		return m_reflectionNeedsRefresh;
 	}
 
-	ANKI_INTERNAL void setupReflectionProbeQueueElement(ReflectionProbeQueueElement& el) const
+	ANKI_INTERNAL void setEnvironmentTextureAsRefreshed()
 	{
-		ANKI_ASSERT(!m_reflectionNeedsRefresh);
-		ANKI_ASSERT(m_worldPos.x() != kMaxF32);
-		el.m_worldPosition = m_worldPos;
-		el.m_aabbMin = -m_halfSize + m_worldPos;
-		el.m_aabbMax = m_halfSize + m_worldPos;
-		ANKI_ASSERT(el.m_textureBindlessIndex != kMaxU32);
-		el.m_textureBindlessIndex = m_reflectionTexBindlessIndex;
-		el.m_index = m_gpuSceneProbe.getIndex();
+		m_reflectionNeedsRefresh = false;
+		m_dirty = true; // To force update of the gpu scene
 	}
 
-	ANKI_INTERNAL void setupReflectionProbeQueueElementForRefresh(ReflectionProbeQueueElementForRefresh& el) const
+	U32 getUuid() const
 	{
-		ANKI_ASSERT(m_reflectionNeedsRefresh);
-		el.m_worldPosition = m_worldPos;
-		el.m_reflectionTexture = m_reflectionTex.get();
+		return m_uuid;
 	}
 
-	ANKI_INTERNAL Bool getReflectionNeedsRefresh() const
+	Vec3 getWorldPosition() const
 	{
-		return m_reflectionNeedsRefresh;
+		ANKI_ASSERT(m_worldPos.x() != kMaxF32);
+		return m_worldPos;
 	}
 
-	ANKI_INTERNAL void setReflectionNeedsRefresh(Bool needsRefresh)
+	/// The radius around the probe's center that can influence the rendering of the env texture.
+	F32 getRenderRadius() const;
+
+	/// Same as getRenderRadius() but capped by the probe shadow effective distance CVar.
+	F32 getShadowsRenderRadius() const;
+
+	Texture& getReflectionTexture() const
 	{
-		m_reflectionNeedsRefresh = needsRefresh;
+		return *m_reflectionTex;
 	}
 
 private:
@@ -80,12 +79,8 @@ private:
 
 	GpuSceneArrays::ReflectionProbe::Allocation m_gpuSceneProbe;
 
-	Spatial m_spatial;
-
-	Array<Frustum, 6> m_frustums;
-
 	TexturePtr m_reflectionTex;
-	TextureViewPtr m_reflectionView;
+	TextureViewPtr m_reflectionView; ///< Kept alive for the bindless index.
 	U32 m_reflectionTexBindlessIndex = kMaxU32;
 	U32 m_uuid = 0;
 

+ 6 - 0
AnKi/Scene/GpuSceneArray.h

@@ -102,6 +102,12 @@ public:
 		return m_gpuSceneAllocation.getOffset();
 	}
 
+	/// Get the range of the array inside its buffer: the element count multiplied by the element size.
+	/// @note Thread-safe
+	PtrSize getBufferRange() const
+	{
+		// getElementCount() returns U32. Widen the operands first so the product is computed in PtrSize and can't wrap around in a narrower type
+		return PtrSize(getElementCount()) * PtrSize(getElementSize());
+	}
+
 	/// @note Thread-safe
 	U32 getElementCount() const
 	{

+ 1 - 31
AnKi/Scene/Visibility.cpp

@@ -557,37 +557,7 @@ void VisibilityTestTask::test(ThreadHive& hive, U32 taskId)
 		}
 		else if(compType == ReflectionProbeComponent::kClassType)
 		{
-			if(!isInside())
-			{
-				continue;
-			}
-
-			ReflectionProbeComponent& reflc = static_cast<ReflectionProbeComponent&>(comp);
-
-			if(reflc.getReflectionNeedsRefresh() && m_frcCtx->m_reflectionProbesForRefreshCount.fetchAdd(1) == 0)
-			{
-				ReflectionProbeQueueElementForRefresh* el = newInstance<ReflectionProbeQueueElementForRefresh>(framePool);
-				m_frcCtx->m_reflectionProbeForRefresh = el;
-
-				reflc.setupReflectionProbeQueueElementForRefresh(*el);
-				reflc.setReflectionNeedsRefresh(false);
-
-				nextQueues = WeakArray<RenderQueue>(newArray<RenderQueue>(framePool, 6), 6);
-				nextFrustums = WeakArray<VisibilityFrustum>(newArray<VisibilityFrustum>(framePool, 6), 6);
-
-				for(U32 i = 0; i < 6; ++i)
-				{
-					el->m_renderQueues[i] = &nextQueues[i];
-
-					nextFrustums[i].m_frustum = &reflc.getFrustums()[i];
-					static_cast<FrustumFlags&>(nextFrustums[i]) = getProbeFrustumFlags();
-				}
-			}
-			else if(!reflc.getReflectionNeedsRefresh())
-			{
-				ReflectionProbeQueueElement* el = result.m_reflectionProbes.newElement();
-				reflc.setupReflectionProbeQueueElement(*el);
-			}
+			ANKI_ASSERT(!"Refl probes use GPU visibility from now on");
 		}
 		else if(compType == DecalComponent::kClassType)
 		{

+ 254 - 0
AnKi/Shaders/ClusterBinning2.ankiprog

@@ -0,0 +1,254 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+// Performs cluster binning. The dispatch's threadgroup count in X is tileCount*sampleCount/numthreads (rounded up) and in Y it is the (clamped) number of visible objects
+
+#pragma anki mutator OBJECT_TYPE 0 1 2 3 4 // Same as GpuSceneNonRenderableObjectType
+
+#pragma anki start comp
+
+#include <AnKi/Shaders/Include/ClusteredShadingTypes.h>
+#include <AnKi/Shaders/CollisionFunctions.hlsl>
+
+ANKI_SPECIALIZATION_CONSTANT_U32(kTileSize, 0u);
+ANKI_SPECIALIZATION_CONSTANT_U32(kZSplitCount, 1u);
+
+#if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
+typedef GpuSceneLight GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
+typedef GpuSceneDecal GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
+typedef GpuSceneFogDensityVolume GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE
+typedef GpuSceneReflectionProbe GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
+typedef GpuSceneGlobalIlluminationProbe GpuSceneType;
+#else
+#	error See file
+#endif
+
+/// Push constants of the cluster binning dispatch.
+struct ClusterBinningUniforms
+{
+	Vec3 m_cameraOrigin; ///< Origin of the rays that are tested against the objects.
+	F32 m_zSplitCountOverFrustumLength; ///< Multiplied with a distance from the near plane to produce a Z split index.
+
+	Vec2 m_renderingSize; ///< Pixel coordinates are divided by this to produce UVs.
+	U32 m_tileCountX; ///< Number of tiles in a row. Converts a linear tile index to tile XY.
+	U32 m_tileCount; ///< Total number of tiles. It's also the index of the first Z split inside g_clusters.
+
+	Vec4 m_nearPlaneWorld; ///< The XYZ and the W are given to testPlanePoint() to compute the distance of a point from the near plane.
+
+	Mat4 m_invertedViewProjMat; ///< Used to unproject NDC positions to world space.
+};
+
+[[vk::push_constant]] ConstantBuffer<ClusterBinningUniforms> g_unis;
+
+[[vk::binding(0)]] StructuredBuffer<U32> g_visibleObjectIds; // 1st index is the count and then the indices to the g_objects
+[[vk::binding(1)]] StructuredBuffer<GpuSceneType> g_objects;
+
+// The output. The first m_tileCount elements are indexed by tile and the elements after those are indexed by Z split
+[[vk::binding(2)]] RWStructuredBuffer<Cluster> g_clusters;
+
+#define THREADGROUP_SIZE 64
+
+// DX Sample locations
+// LOCATION remaps an offset given on a 16x16 grid that is centered at zero to a pixel offset inside a tile of kTileSize pixels
+constexpr U32 kSampleCount = 4u;
+#define LOCATION(x, y) UVec2(Vec2(IVec2(x, y) + 8) / 16.0 * F32(kTileSize))
+constexpr UVec2 kSampleLocations[kSampleCount] = {LOCATION(-2, -6), LOCATION(6, -2), LOCATION(-6, 2), LOCATION(2, 6)};
+#undef LOCATION
+
+// Every tile consumes kSampleCount consecutive threads
+constexpr U32 kTilesPerThreadgroup = THREADGROUP_SIZE / kSampleCount;
+
+// A mask per tile of this threadgroup for the clusterer object being processed by this workgroup
+groupshared ExtendedClusterObjectMask s_tileMasks[kTilesPerThreadgroup];
+
+// A mask for each Z split for a specific clusterer object
+groupshared ExtendedClusterObjectMask s_zSplitMasks[kMaxZsplitCount];
+
+// Entry point. Every thread casts a ray through one sample of one screen tile and tests it against a single visible object. The X of
+// svDispatchThreadId selects the tile and the sample, the Y selects the visible object and svGroupIdx is the flat index of the thread in its group.
+[numthreads(THREADGROUP_SIZE, 1, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIdx : SV_GROUPINDEX)
+{
+	// The X of the dispatch is rounded up to the threadgroup size. Clamp the surplus threads to the last sample of the last tile so they never
+	// address a tile past the end of the tile range. The clamped threads have a non-zero sampleIdx so they never write a tile mask either
+	const U32 dispatchThreadIdX = min(svDispatchThreadId.x, g_unis.m_tileCount * kSampleCount - 1u);
+	const U32 tileIdx = dispatchThreadIdX / kSampleCount;
+	const U32 sampleIdx = dispatchThreadIdX % kSampleCount;
+	const U32 localTileIdx = svGroupIdx / kSampleCount;
+	const U32 visibleObjectIdx = svDispatchThreadId.y;
+
+	const UVec2 tileXY = UVec2(tileIdx % g_unis.m_tileCountX, tileIdx / g_unis.m_tileCountX);
+
+	// This is a pixel in one of the main framebuffers of the renderer, eg the gbuffer's framebuffers
+	const UVec2 pixel = tileXY * kTileSize + kSampleLocations[sampleIdx];
+	const Vec2 uv = Vec2(pixel) / g_unis.m_renderingSize;
+	const Vec2 ndc = uvToNdc(uv);
+
+	// Unproject the sample in world space
+	const Vec4 farWorldPos4 = mul(g_unis.m_invertedViewProjMat, Vec4(ndc, 1.0, 1.0));
+	const Vec3 farWorldPos = farWorldPos4.xyz / farWorldPos4.w;
+
+	// Create the ray that will test the clusterer objects
+	const Vec3 rayOrigin = g_unis.m_cameraOrigin;
+	const Vec3 rayDir = normalize(farWorldPos - rayOrigin);
+
+	// Zero shared memory. The Z splits are distributed to the threads of the group. Round the per-thread count up so that all kZSplitCount
+	// splits are covered even when kZSplitCount is not a multiple of the threadgroup size
+	s_tileMasks[localTileIdx] = 0;
+	const U32 splitsPerInvocation = max(1u, (kZSplitCount + THREADGROUP_SIZE - 1u) / THREADGROUP_SIZE);
+	for(U32 i = svGroupIdx * splitsPerInvocation; i < (svGroupIdx + 1u) * splitsPerInvocation && i < kZSplitCount; ++i)
+	{
+		s_zSplitMasks[i] = 0;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// Do collision. The +1 skips the count that is stored in the first element of g_visibleObjectIds
+	F32 t0, t1;
+	Bool collides;
+	const GpuSceneType obj = g_objects[g_visibleObjectIds[visibleObjectIdx + 1]];
+
+#if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
+	if((U32)obj.m_flags & (U32)GpuSceneLightFlag::kPointLight)
+	{
+		collides = testRaySphere(rayOrigin, rayDir, obj.m_position, obj.m_radius, t0, t1);
+	}
+	else
+	{
+		// Spot light
+
+		t0 = 10000.0;
+		t1 = -10000.0;
+
+		// Iterate all triangles. Stop after 2 hits, those give the entry and the exit points of the ray
+		const U32 indices[6u * 3u] = {0u, 1u, 2u, 0u, 2u, 3u, 0u, 3u, 4u, 0u, 1u, 4u, 1u, 2u, 3u, 3u, 4u, 1u};
+		U32 hits = 0u;
+		U32 idx = 0u;
+		do
+		{
+			const Vec3 v0 = obj.m_edgePoints[indices[idx + 0u]].xyz;
+			const Vec3 v1 = obj.m_edgePoints[indices[idx + 1u]].xyz;
+			const Vec3 v2 = obj.m_edgePoints[indices[idx + 2u]].xyz;
+
+			F32 t, u, v;
+			const Bool localCollides = testRayTriangle(rayOrigin, rayDir, v0, v1, v2, false, t, u, v);
+
+			if(localCollides)
+			{
+				t0 = min(t0, t);
+				t1 = max(t1, t);
+				++hits;
+			}
+			idx += 3u;
+		} while(hits < 2u && idx < 6u * 3u);
+
+		if(hits == 1u)
+		{
+			// A single hit: start the range from the ray origin
+			t0 = 0.0;
+		}
+
+		collides = (hits != 0u);
+	}
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
+	collides = testRayObb(rayOrigin, rayDir, obj.m_obbExtend, obj.m_invertedTransform, t0, t1);
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
+	if(obj.m_isBox != 0u)
+	{
+		collides = testRayAabb(rayOrigin, rayDir, obj.m_aabbMinOrSphereCenter, obj.m_aabbMaxOrSphereRadius, t0, t1);
+	}
+	else
+	{
+		collides = testRaySphere(rayOrigin, rayDir, obj.m_aabbMinOrSphereCenter, obj.m_aabbMaxOrSphereRadius.x, t0, t1);
+	}
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE \
+	|| OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
+	collides = testRayAabb(rayOrigin, rayDir, obj.m_aabbMin, obj.m_aabbMax, t0, t1);
+#else
+#	error See file
+#endif
+
+	// Update the masks
+	if(collides)
+	{
+		// Set the tile. The bit of the object is its index in the list of the visible objects
+		const ExtendedClusterObjectMask mask = ExtendedClusterObjectMask(1) << ExtendedClusterObjectMask(visibleObjectIdx);
+		InterlockedOr(s_tileMasks[localTileIdx], mask);
+
+		// Compute and set the Z splits
+		const Vec3 hitpointA = rayDir * t0 + rayOrigin;
+		const Vec3 hitpointB = rayDir * t1 + rayOrigin;
+		const F32 distFromNearPlaneA = testPlanePoint(g_unis.m_nearPlaneWorld.xyz, g_unis.m_nearPlaneWorld.w, hitpointA);
+		const F32 distFromNearPlaneB = testPlanePoint(g_unis.m_nearPlaneWorld.xyz, g_unis.m_nearPlaneWorld.w, hitpointB);
+
+		F32 minDistFromNearPlane;
+		F32 maxDistFromNearPlane;
+		if(distFromNearPlaneA < distFromNearPlaneB)
+		{
+			minDistFromNearPlane = distFromNearPlaneA;
+			maxDistFromNearPlane = distFromNearPlaneB;
+		}
+		else
+		{
+			minDistFromNearPlane = distFromNearPlaneB;
+			maxDistFromNearPlane = distFromNearPlaneA;
+		}
+
+		const I32 startZSplit = max(I32(minDistFromNearPlane * g_unis.m_zSplitCountOverFrustumLength), 0);
+		const I32 endZSplit = clamp(I32(maxDistFromNearPlane * g_unis.m_zSplitCountOverFrustumLength), 0, I32(kZSplitCount) - 1);
+		for(I32 i = startZSplit; i <= endZSplit; ++i)
+		{
+			InterlockedOr(s_zSplitMasks[i], mask);
+		}
+	}
+
+	// Sync
+	GroupMemoryBarrierWithGroupSync();
+
+	// First sample writes the tile mask
+	if(sampleIdx == 0u && s_tileMasks[localTileIdx] != 0)
+	{
+#if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
+		if((U32)obj.m_flags & (U32)GpuSceneLightFlag::kPointLight)
+		{
+			InterlockedOr(g_clusters[tileIdx].m_pointLightsMask, s_tileMasks[localTileIdx]);
+		}
+		else
+		{
+			InterlockedOr(g_clusters[tileIdx].m_spotLightsMask, s_tileMasks[localTileIdx]);
+		}
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
+		InterlockedOr(g_clusters[tileIdx].m_decalsMask, s_tileMasks[localTileIdx]);
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
+		InterlockedOr(g_clusters[tileIdx].m_fogDensityVolumesMask, U32(s_tileMasks[localTileIdx]));
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE
+		InterlockedOr(g_clusters[tileIdx].m_reflectionProbesMask, U32(s_tileMasks[localTileIdx]));
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
+		InterlockedOr(g_clusters[tileIdx].m_giProbesMask, U32(s_tileMasks[localTileIdx]));
+#else
+#	error See file
+#endif
+	}
+
+	// All invocations write at least one Z split. The Z split clusters are stored right after the m_tileCount tile clusters
+	for(U32 i = svGroupIdx * splitsPerInvocation; i < (svGroupIdx + 1u) * splitsPerInvocation && i < kZSplitCount; ++i)
+	{
+#if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
+		if((U32)obj.m_flags & (U32)GpuSceneLightFlag::kPointLight)
+		{
+			InterlockedOr(g_clusters[g_unis.m_tileCount + i].m_pointLightsMask, s_zSplitMasks[i]);
+		}
+		else
+		{
+			InterlockedOr(g_clusters[g_unis.m_tileCount + i].m_spotLightsMask, s_zSplitMasks[i]);
+		}
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
+		InterlockedOr(g_clusters[g_unis.m_tileCount + i].m_decalsMask, s_zSplitMasks[i]);
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
+		InterlockedOr(g_clusters[g_unis.m_tileCount + i].m_fogDensityVolumesMask, U32(s_zSplitMasks[i]));
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE
+		InterlockedOr(g_clusters[g_unis.m_tileCount + i].m_reflectionProbesMask, U32(s_zSplitMasks[i]));
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
+		InterlockedOr(g_clusters[g_unis.m_tileCount + i].m_giProbesMask, U32(s_zSplitMasks[i]));
+#else
+#	error See file
+#endif
+	}
+}
+
+#pragma anki end

+ 78 - 0
AnKi/Shaders/ClusterBinning2PackVisibles.ankiprog

@@ -0,0 +1,78 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki mutator OBJECT_TYPE 0 1 2 3 4 // Same as GpuSceneNonRenderableObjectType
+
+#pragma anki start comp
+
+#include <AnKi/Shaders/Include/ClusteredShadingTypes.h>
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+
+#if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
+typedef LightUnion ClusteredType;
+typedef GpuSceneLight GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_DECAL
+typedef Decal ClusteredType;
+typedef GpuSceneDecal GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME
+typedef FogDensityVolume ClusteredType;
+typedef GpuSceneFogDensityVolume GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE
+typedef ReflectionProbe ClusteredType;
+typedef GpuSceneReflectionProbe GpuSceneType;
+#elif OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE
+typedef GlobalIlluminationProbe ClusteredType;
+typedef GpuSceneGlobalIlluminationProbe GpuSceneType;
+#else
+#	error See file
+#endif
+
+[[vk::binding(0)]] StructuredBuffer<GpuSceneType> g_inBuffer;
+[[vk::binding(1)]] RWStructuredBuffer<ClusteredType> g_outBuffer;
+[[vk::binding(2)]] StructuredBuffer<U32> g_visibles;
+
+#define THREAD_GROUP_SIZE 64
+
+// Copies every visible object from the GPU scene buffer to a tightly packed buffer. One thread per visible object.
+[numthreads(THREAD_GROUP_SIZE, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	// Threads past the (clamped) visible count have nothing to pack
+	const U32 outSlot = svDispatchThreadId.x;
+	if(outSlot >= min(g_visibles[0], kMaxVisibleClusteredObjects2[OBJECT_TYPE]))
+	{
+		return;
+	}
+
+	// g_visibles[0] is the count, the indices of the objects follow
+	const U32 sceneIdx = g_visibles[outSlot + 1];
+
+#if OBJECT_TYPE == ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_LIGHT
+	// Lights are converted from their GPU scene representation to the LightUnion
+	const GpuSceneLight src = g_inBuffer[sceneIdx];
+	const U32 flags = (U32)src.m_flags;
+
+	LightUnion dst;
+	dst.m_position = src.m_position;
+	dst.m_radius = src.m_radius;
+
+	dst.m_diffuseColor = src.m_diffuseColor;
+	dst.m_lightType = (flags & (U32)GpuSceneLightFlag::kPointLight) ? 0 : 1; // 0 is point and 1 is spot
+
+	dst.m_shadowLayer = kMaxU32; // TODO
+	dst.m_shadow = (flags & (U32)GpuSceneLightFlag::kShadow) ? 1 : 0;
+	dst.m_innerCos = src.m_innerCos;
+	dst.m_outerCos = src.m_outerCos;
+
+	dst.m_direction = src.m_direction;
+	dst.m_shadowAtlasTileScale = 0.0f; // TODO
+
+	for(U32 tile = 0; tile < 6; ++tile)
+	{
+		dst.m_shadowAtlasTileOffsetsOrTextureMat[tile] = 0.0f; // TODO
+	}
+
+	g_outBuffer[outSlot] = dst;
+#else
+	// The rest of the types are copied as they are
+	g_outBuffer[outSlot] = g_inBuffer[sceneIdx];
+#endif
+}
+
+#pragma anki end

+ 71 - 0
AnKi/Shaders/ClusterBinning2Setup.ankiprog

@@ -0,0 +1,71 @@
+// Copyright (C) 2009-2023, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+// This shader prepares the indirect args of future dispatches
+
+#pragma anki start comp
+
+#include <AnKi/Shaders/Common.hlsl>
+#include <AnKi/Shaders/Include/ClusteredShadingTypes.h>
+
+[[vk::binding(0)]] StructuredBuffer<U32> g_visibleIndices[(U32)GpuSceneNonRenderableObjectType::kCount];
+
+// This has a size of 2*GpuSceneNonRenderableObjectType::kCount. The first GpuSceneNonRenderableObjectType::kCount elements are for the cluster
+// binning dispatches and the rest GpuSceneNonRenderableObjectType::kCount for the packing dispatches
+[[vk::binding(1)]] RWStructuredBuffer<DispatchIndirectArgs> g_indirectArgs;
+
+/// Push constants of the setup dispatch.
+struct Uniforms
+{
+	U32 m_tileCount; ///< Number of tiles. Used to compute the X threadgroup count of the cluster binning dispatches.
+	U32 m_padding1;
+	U32 m_padding2;
+	U32 m_padding3;
+};
+[[vk::push_constant]] ConstantBuffer<Uniforms> g_unis;
+
+constexpr U32 kSampleCount = 4;
+constexpr U32 kClusterBinningThreadgroupSize = 64;
+constexpr U32 kPackVisiblesThreadgroupSize = 64;
+
+#define THREADGROUP_SIZE 16
+
+// Every thread fills a single element of g_indirectArgs.
+[numthreads(THREADGROUP_SIZE, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID)
+{
+	const U32 typeCount = (U32)GpuSceneNonRenderableObjectType::kCount;
+
+	// Skip the threads that don't have an element to fill
+	if(svDispatchThreadId >= typeCount * 2)
+	{
+		return;
+	}
+
+	// The first typeCount threads set the dispatch args of cluster binning and the next typeCount threads the dispatch args of packing
+	const Bool forBinning = svDispatchThreadId < typeCount;
+	const U32 typeIdx = (forBinning) ? svDispatchThreadId : svDispatchThreadId - typeCount;
+
+	// The 1st element of the visible indices is the count. Clamp it to the max the clusterer can handle
+	const U32 objCount = min(kMaxVisibleClusteredObjects2[typeIdx], g_visibleIndices[typeIdx][0]);
+
+	DispatchIndirectArgs args;
+	if(forBinning)
+	{
+		// Enough threadgroups in X to cover all the samples of all the tiles and one row of threadgroups per object in Y
+		args.m_threadGroupCountX = (g_unis.m_tileCount * kSampleCount + kClusterBinningThreadgroupSize - 1) / kClusterBinningThreadgroupSize;
+		args.m_threadGroupCountY = objCount;
+	}
+	else
+	{
+		// Enough threadgroups to have one thread per object
+		args.m_threadGroupCountX = (objCount + kPackVisiblesThreadgroupSize - 1) / kPackVisiblesThreadgroupSize;
+		args.m_threadGroupCountY = 1;
+	}
+	args.m_threadGroupCountZ = 1;
+
+	g_indirectArgs[svDispatchThreadId] = args;
+}
+
+#pragma anki end

+ 76 - 0
AnKi/Shaders/Include/ClusteredShadingTypes.h

@@ -28,10 +28,12 @@ ANKI_ENUM_ALLOW_NUMERIC_OPERATIONS(ClusteredObjectType)
 
 // Limits
 #if ANKI_CLUSTERED_SHADING_USE_64BIT
+constexpr U32 kMaxVisibleLights = 64u;
 constexpr U32 kMaxVisiblePointLights = 64u;
 constexpr U32 kMaxVisibleSpotLights = 64u;
 constexpr U32 kMaxVisibleDecals = 64u;
 #else
+constexpr U32 kMaxVisibleLights = 32u;
 constexpr U32 kMaxVisiblePointLights = 32u;
 constexpr U32 kMaxVisibleSpotLights = 32u;
 constexpr U32 kMaxVisibleDecals = 32u;
@@ -45,6 +47,72 @@ constexpr RF32 kClusterObjectFrustumNearPlane = 0.1f / 4.0f; ///< Near plane of
 constexpr RF32 kSubsurfaceMin = 0.01f;
 constexpr U32 kMaxZsplitCount = 128u;
 
+/// A union of all fields of spot and point lights. Since we don't have unions in HLSL we had to get creative.
+/// PointLight2 and SpotLight2 are static_assert'ed to have the same size as this struct.
+struct LightUnion
+{
+	Vec3 m_position; ///< Position in world space.
+	RF32 m_radius; ///< Radius
+
+	RVec3 m_diffuseColor;
+	U32 m_lightType; ///< 0 is point and 1 is spot
+
+	U32 m_shadowLayer; ///< Shadow layer used in RT shadows
+	U32 m_shadow; ///< The ClusterBinning2PackVisibles shader sets it to 1 if GpuSceneLightFlag::kShadow is set and to 0 otherwise.
+	F32 m_innerCos; ///< SPOT LIGHTS
+	F32 m_outerCos; ///< SPOT LIGHTS
+
+	RVec3 m_direction; ///< SPOT LIGHTS: Light direction.
+	F32 m_shadowAtlasTileScale; ///< POINT LIGHTS: UV scale for all tiles.
+
+	/// When it's a point light this is an array of 6 Vec2s (but because of padding it's actually Vec4s). When it's a spot light the first 4 Vec4s are
+	/// the rows of the texture matrix
+	Vec4 m_shadowAtlasTileOffsetsOrTextureMat[6u];
+};
+
+/// Point light. The point light view of LightUnion: the fields are declared in the same order as in LightUnion and the spot light only fields
+/// are replaced by padding.
+struct PointLight2
+{
+	Vec3 m_position; ///< Position in world space.
+	RF32 m_radius; ///< Radius
+
+	RVec3 m_diffuseColor;
+	U32 m_padding1;
+
+	U32 m_shadowLayer; ///< Shadow layer used in RT shadows. Also used to show that it doesn't cast shadow.
+	U32 m_shadow; ///< Same type as LightUnion::m_shadow, which is filled with an integer 0 or 1. Reading it as a float would misinterpret the bits.
+	F32 m_padding2;
+	U32 m_padding3;
+
+	RVec3 m_padding4;
+	F32 m_shadowAtlasTileScale; ///< UV scale for all tiles.
+
+	Vec4 m_shadowAtlasTileOffsets[6u]; ///< It's a array of Vec2 but because of padding round it up.
+};
+static_assert(sizeof(PointLight2) == sizeof(LightUnion));
+
+/// Spot light. The spot light view of LightUnion: the fields are declared in the same order as in LightUnion and the point light only fields
+/// are replaced by padding.
+struct SpotLight2
+{
+	Vec3 m_position; ///< Position in world space.
+	RF32 m_radius; ///< Radius
+
+	RVec3 m_diffuseColor;
+	U32 m_padding1;
+
+	U32 m_shadowLayer; ///< Shadow layer used in RT shadows. Also used to show that it doesn't cast shadow.
+	U32 m_shadow;
+	F32 m_innerCos;
+	F32 m_outerCos;
+
+	RVec3 m_direction;
+	F32 m_padding2;
+
+	Mat4 m_textureMatrix; ///< Sits where LightUnion has the first 4 Vec4s of m_shadowAtlasTileOffsetsOrTextureMat.
+
+	Vec4 m_padding3[2]; ///< Sits where LightUnion has the last 2 Vec4s of m_shadowAtlasTileOffsetsOrTextureMat.
+};
+static_assert(sizeof(SpotLight2) == sizeof(LightUnion));
+
 /// Point light.
 struct PointLight
 {
@@ -258,9 +326,11 @@ constexpr U32 kSizeof_Cluster = 2u * sizeof(Vec4);
 static_assert(sizeof(Cluster) == kSizeof_Cluster);
 #endif
 
+// TODO rm
 constexpr ANKI_ARRAY(U32, ClusteredObjectType::kCount, kClusteredObjectSizes) = {
 	sizeof(PointLight), sizeof(SpotLight), sizeof(Decal), sizeof(FogDensityVolume), sizeof(ReflectionProbe), sizeof(GlobalIlluminationProbe)};
 
+// TODO rm
 constexpr ANKI_ARRAY(U32, ClusteredObjectType::kCount, kMaxVisibleClusteredObjects) = {
 #if ANKI_CLUSTERED_SHADING_USE_64BIT
 	64, 64, 64,
@@ -269,4 +339,10 @@ constexpr ANKI_ARRAY(U32, ClusteredObjectType::kCount, kMaxVisibleClusteredObjec
 #endif
 	16, 16, 16};
 
+constexpr ANKI_ARRAY(U32, GpuSceneNonRenderableObjectType::kCount, kClusteredObjectSizes2) = {
+	sizeof(LightUnion), sizeof(Decal), sizeof(FogDensityVolume), sizeof(ReflectionProbe), sizeof(GlobalIlluminationProbe)};
+
+constexpr ANKI_ARRAY(U32, GpuSceneNonRenderableObjectType::kCount, kMaxVisibleClusteredObjects2) = {
+	kMaxVisibleLights, kMaxVisibleDecals, kMaxVisibleFogDensityVolumes, kMaxVisibleReflectionProbes, kMaxVisibleGlobalIlluminationProbes};
+
 ANKI_END_NAMESPACE

+ 7 - 0
AnKi/Shaders/Include/Common.h

@@ -783,4 +783,11 @@ struct DrawIndexedIndirectArgs
 #endif
 };
 
+/// The arguments of an indirect compute dispatch: the number of threadgroups to launch in every dimension.
+struct DispatchIndirectArgs
+{
+	U32 m_threadGroupCountX;
+	U32 m_threadGroupCountY;
+	U32 m_threadGroupCountZ;
+};
+
 ANKI_END_NAMESPACE

+ 1 - 0
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -170,6 +170,7 @@ ANKI_ENUM_ALLOW_NUMERIC_OPERATIONS(GpuSceneNonRenderableObjectType)
 #define ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_FOG_DENSITY_VOLUME 2
 #define ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_REFLECTION_PROBE 3
 #define ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_GLOBAL_ILLUMINATION_PROBE 4
+#define ANKI_GPU_SCENE_NON_RENDERABLE_OBJECT_TYPE_COUNT 5
 
 enum class GpuSceneNonRenderableObjectTypeBit : U32
 {

+ 4 - 3
AnKi/Shaders/Intellisense.hlsl

@@ -7,9 +7,10 @@
 
 #define groupshared
 #define globallycoherent
-#define SV_DISPATCHTHREADID
-#define SV_GROUPINDEX
-#define SV_GROUPID
+#define SV_DISPATCHTHREADID // gl_GlobalInvocationID
+#define SV_GROUPINDEX // gl_LocalInvocationIndex
+#define SV_GROUPID // gl_WorkGroupID
+#define SV_GROUPTHREADID // gl_LocalInvocationID
 #define SV_VERTEXID
 #define SV_POSITION
 #define SV_INSTANCEID