浏览代码

Reflection CPU optimizations

Panagiotis Christopoulos Charitos 10 年之前
父节点
当前提交
4ff5238cea
共有 7 个文件被更改,包括 305 次插入和 150 次删除
  1. 1 0
      include/anki/core/Trace.h
  2. 5 0
      include/anki/renderer/Clusterer.h
  3. 20 9
      include/anki/renderer/Ir.h
  4. 6 4
      shaders/Pack.glsl
  5. 1 0
      src/core/Trace.cpp
  6. 259 128
      src/renderer/Ir.cpp
  7. 13 9
      src/renderer/Is.cpp

+ 1 - 0
include/anki/core/Trace.h

@@ -51,6 +51,7 @@ enum class TraceCounterType
 	RENDERER_LIGHTS,
 	RENDERER_LIGHTS,
 	RENDERER_SHADOW_PASSES,
 	RENDERER_SHADOW_PASSES,
 	RENDERER_MERGED_DRAWCALLS,
 	RENDERER_MERGED_DRAWCALLS,
+	RENDERER_REFLECTIONS,
 	SCENE_NODES_UPDATED,
 	SCENE_NODES_UPDATED,
 
 
 	COUNT
 	COUNT

+ 5 - 0
include/anki/renderer/Clusterer.h

@@ -42,6 +42,11 @@ public:
 		return m_clusterIds.getBegin() + m_count;
 		return m_clusterIds.getBegin() + m_count;
 	}
 	}
 
 
+	U getClusterCount() const
+	{
+		return m_count;
+	}
+
 private:
 private:
 	DArray<Array<U8, 3>> m_clusterIds;
 	DArray<Array<U8, 3>> m_clusterIds;
 	U32 m_count = 0;
 	U32 m_count = 0;

+ 20 - 9
include/anki/renderer/Ir.h

@@ -11,8 +11,10 @@
 namespace anki {
 namespace anki {
 
 
 // Forward
 // Forward
-struct ShaderReflectionProbe;
-class IrBuildContext;
+struct IrShaderReflectionProbe;
+class IrRunContext;
+class IrTaskContext;
+class ReflectionProbeComponent;
 
 
 /// @addtogroup renderer
 /// @addtogroup renderer
 /// @{
 /// @{
@@ -20,10 +22,10 @@ class IrBuildContext;
 /// Image based reflections.
 /// Image based reflections.
 class Ir: public RenderingPass
 class Ir: public RenderingPass
 {
 {
+	friend class IrTask;
+
 anki_internal:
 anki_internal:
-	Ir(Renderer* r)
-		: RenderingPass(r)
-	{}
+	Ir(Renderer* r);
 
 
 	~Ir();
 	~Ir();
 
 
@@ -70,20 +72,29 @@ private:
 	U16 m_cubemapArrSize = 0;
 	U16 m_cubemapArrSize = 0;
 	U16 m_fbSize = 0;
 	U16 m_fbSize = 0;
 	DArray<CacheEntry> m_cacheEntries;
 	DArray<CacheEntry> m_cacheEntries;
+	Barrier m_barrier;
 
 
 	// Tokens
 	// Tokens
 	DynamicBufferToken m_probesToken;
 	DynamicBufferToken m_probesToken;
 	DynamicBufferToken m_clustersToken;
 	DynamicBufferToken m_clustersToken;
 	DynamicBufferToken m_indicesToken;
 	DynamicBufferToken m_indicesToken;
 
 
+	/// Bin probes in clusters.
+	void binProbes(U32 threadId, PtrSize threadsCount, IrRunContext& ctx);
+
+	ANKI_USE_RESULT Error writeProbeAndRender(SceneNode& node,
+		IrShaderReflectionProbe& probe);
+
+	void binProbe(U probeIdx, IrRunContext& ctx, IrTaskContext& task) const;
+
 	ANKI_USE_RESULT Error renderReflection(SceneNode& node,
 	ANKI_USE_RESULT Error renderReflection(SceneNode& node,
-		ShaderReflectionProbe& shaderProb);
+		ReflectionProbeComponent& reflc, U cubemapIdx);
+
+	static void writeIndicesAndCluster(U clusterIdx, Bool hasPrevCluster,
+		IrRunContext& ctx);
 
 
 	/// Find a cache entry to store the reflection.
 	/// Find a cache entry to store the reflection.
 	void findCacheEntry(SceneNode& node, U& entry, Bool& render);
 	void findCacheEntry(SceneNode& node, U& entry, Bool& render);
-
-	void binProbe(const SceneNode& node, U index, IrBuildContext& ctx);
-	void populateIndexAndClusterBuffers(IrBuildContext& ctx);
 };
 };
 /// @}
 /// @}
 
 

+ 6 - 4
shaders/Pack.glsl

@@ -114,7 +114,7 @@ void readGBuffer(
 	in sampler2D rt0,
 	in sampler2D rt0,
 	in sampler2D rt1,
 	in sampler2D rt1,
 	in sampler2D rt2,
 	in sampler2D rt2,
-	in vec2 uv,
+	in vec2 uv_,
 	out vec3 diffColor,
 	out vec3 diffColor,
 	out vec3 normal,
 	out vec3 normal,
 	out vec3 specColor,
 	out vec3 specColor,
@@ -122,16 +122,18 @@ void readGBuffer(
 	out float subsurface,
 	out float subsurface,
 	out float emission)
 	out float emission)
 {
 {
-	vec4 comp = textureLod(rt0, uv, 0.0);
+	ivec2 uv = ivec2(gl_FragCoord.xy);
+
+	vec4 comp = texelFetch(rt0, uv, 0);
 	diffColor = comp.xyz;
 	diffColor = comp.xyz;
 	subsurface = comp.w;
 	subsurface = comp.w;
 
 
-	comp = textureLod(rt1, uv, 0.0);
+	comp = texelFetch(rt1, uv, 0);
 	specColor = vec3(unpackUnorm1ToUnorm2(comp.x), comp.y);
 	specColor = vec3(unpackUnorm1ToUnorm2(comp.x), comp.y);
 	roughness = comp.z;
 	roughness = comp.z;
 	emission = comp.w * MAX_EMISSION;
 	emission = comp.w * MAX_EMISSION;
 
 
-	normal = textureLod(rt2, uv, 0.0).xyz;
+	normal = texelFetch(rt2, uv, 0).xyz;
 	normal = normalize(normal * 2.0 - 1.0);
 	normal = normalize(normal * 2.0 - 1.0);
 }
 }
 
 

+ 1 - 0
src/core/Trace.cpp

@@ -39,6 +39,7 @@ static Array<const char*, U(TraceCounterType::COUNT)> counterNames = {{
 	"RENDERER_LIGHTS",
 	"RENDERER_LIGHTS",
 	"RENDERER_SHADOW_PASSES",
 	"RENDERER_SHADOW_PASSES",
 	"RENDERER_MERGED_DRAWCALLS",
 	"RENDERER_MERGED_DRAWCALLS",
+	"RENDERER_REFLECTIONS",
 	"SCENE_NODES_UPDATED"
 	"SCENE_NODES_UPDATED"
 }};
 }};
 
 

+ 259 - 128
src/renderer/Ir.cpp

@@ -20,7 +20,7 @@ namespace anki {
 // Misc                                                                        =
 // Misc                                                                        =
 //==============================================================================
 //==============================================================================
 
 
-struct ShaderReflectionProbe
+struct IrShaderReflectionProbe
 {
 {
 	Vec3 m_pos;
 	Vec3 m_pos;
 	F32 m_radiusSq;
 	F32 m_radiusSq;
@@ -28,7 +28,7 @@ struct ShaderReflectionProbe
 	U32 _m_pading[3];
 	U32 _m_pading[3];
 };
 };
 
 
-struct ShaderCluster
+struct IrShaderCluster
 {
 {
 	U32 m_indexOffset;
 	U32 m_indexOffset;
 	U32 m_probeCount;
 	U32 m_probeCount;
@@ -47,20 +47,23 @@ public:
 class IrClusterData
 class IrClusterData
 {
 {
 public:
 public:
-	U32 m_probeCount = 0;
+	Atomic<U32> m_probeCount = {0};
 	Array<ClusterDataIndex, MAX_PROBES_PER_CLUSTER> m_probeIds;
 	Array<ClusterDataIndex, MAX_PROBES_PER_CLUSTER> m_probeIds;
 
 
 	Bool operator==(const IrClusterData& b) const
 	Bool operator==(const IrClusterData& b) const
 	{
 	{
-		if(m_probeCount != b.m_probeCount)
+		const U probeCount = m_probeCount.load() % MAX_PROBES_PER_CLUSTER;
+		const U bProbeCount = b.m_probeCount.load() % MAX_PROBES_PER_CLUSTER;
+
+		if(probeCount != bProbeCount)
 		{
 		{
 			return false;
 			return false;
 		}
 		}
 
 
-		if(m_probeCount > 0)
+		if(probeCount > 0)
 		{
 		{
 			if(memcmp(&m_probeIds[0], &b.m_probeIds[0],
 			if(memcmp(&m_probeIds[0], &b.m_probeIds[0],
-				sizeof(m_probeIds[0]) * m_probeCount) != 0)
+				sizeof(m_probeIds[0]) * probeCount) != 0)
 			{
 			{
 				return false;
 				return false;
 			}
 			}
@@ -72,10 +75,11 @@ public:
 	/// Sort the indices from the smallest probe to the biggest.
 	/// Sort the indices from the smallest probe to the biggest.
 	void sort()
 	void sort()
 	{
 	{
-		if(m_probeCount > 1)
+		const U probeCount = m_probeCount.load() % MAX_PROBES_PER_CLUSTER;
+		if(probeCount > 1)
 		{
 		{
 			std::sort(m_probeIds.getBegin(),
 			std::sort(m_probeIds.getBegin(),
-				m_probeIds.getBegin() + m_probeCount,
+				m_probeIds.getBegin() + probeCount,
 				[](const ClusterDataIndex& a, const ClusterDataIndex& b)
 				[](const ClusterDataIndex& a, const ClusterDataIndex& b)
 			{
 			{
 				ANKI_ASSERT(a.m_probeRadius > 0.0 && b.m_probeRadius > 0.0);
 				ANKI_ASSERT(a.m_probeRadius > 0.0 && b.m_probeRadius > 0.0);
@@ -85,18 +89,63 @@ public:
 	}
 	}
 };
 };
 
 
-class IrBuildContext
+/// Context for the whole run.
+class IrRunContext
 {
 {
 public:
 public:
+	Ir* m_ir ANKI_DBG_NULLIFY_PTR;
+
 	DArray<IrClusterData> m_clusterData;
 	DArray<IrClusterData> m_clusterData;
-	U32 m_indexCount = 0;
+	SArray<IrShaderCluster> m_clusters;
+	SArray<U32> m_indices;
+	Atomic<U32> m_indexCount = {0};
+	VisibilityTestResults* m_visRez ANKI_DBG_NULLIFY_PTR;
+
+	/// An atomic that will help allocating the index buffer
+	Atomic<U32> m_probeIndicesAllocate = {0};
+	/// Same as m_probeIndicesAllocate
+	Atomic<U32> m_clustersAllocate = {0};
+
+	StackAllocator<U8> m_alloc;
+
+	~IrRunContext()
+	{
+		// Deallocate. Watch the order
+		m_clusterData.destroy(m_alloc);
+	}
+};
+
+/// Thread specific context.
+class IrTaskContext
+{
+public:
 	ClustererTestResult m_clustererTestResult;
 	ClustererTestResult m_clustererTestResult;
+	SceneNode* m_node ANKI_DBG_NULLIFY_PTR;
+};
+
+/// Thread pool task that runs Ir::binProbes on each worker thread.
+class IrTask: public ThreadPool::Task
+{
+public:
+	IrRunContext* m_ctx ANKI_DBG_NULLIFY_PTR;
+
+	Error operator()(U32 threadId, PtrSize threadsCount) override
+	{
+		m_ctx->m_ir->binProbes(threadId, threadsCount, *m_ctx);
+		return ErrorCode::NONE;
+	}
 };
 };
 
 
 //==============================================================================
 //==============================================================================
 // Ir                                                                          =
 // Ir                                                                          =
 //==============================================================================
 //==============================================================================
 
 
+//==============================================================================
+Ir::Ir(Renderer* r)
+	: RenderingPass(r)
+	, m_barrier(r->getThreadPool().getThreadsCount())
+{}
+
 //==============================================================================
 //==============================================================================
 Ir::~Ir()
 Ir::~Ir()
 {
 {
@@ -184,18 +233,25 @@ Error Ir::run(CommandBufferPtr cmdb)
 		ANKI_LOGW("Increase the ir.cubemapTextureArraySize");
 		ANKI_LOGW("Increase the ir.cubemapTextureArraySize");
 	}
 	}
 
 
-	IrBuildContext ctx;
-
 	//
 	//
-	// Perform some allocations
+	// Perform some initialization
 	//
 	//
+	IrRunContext ctx;
+
+	ctx.m_visRez = &visRez;
+	ctx.m_ir = this;
+	ctx.m_alloc = getFrameAllocator();
 
 
-	// Allocate temp mem for clusters
+	// Allocate temp CPU mem
 	ctx.m_clusterData.create(getFrameAllocator(), m_r->getClusterCount());
 	ctx.m_clusterData.create(getFrameAllocator(), m_r->getClusterCount());
 
 
-	// Probes
+	//
+	// Render and populate probes GPU mem
+	//
+
+	// Probes GPU mem
 	void* data = getGrManager().allocateFrameHostVisibleMemory(
 	void* data = getGrManager().allocateFrameHostVisibleMemory(
-		sizeof(ShaderReflectionProbe) * visRez.getReflectionProbeCount()
+		sizeof(IrShaderReflectionProbe) * visRez.getReflectionProbeCount()
 		+ sizeof(Mat3x4),
 		+ sizeof(Mat3x4),
 		BufferUsage::STORAGE, m_probesToken);
 		BufferUsage::STORAGE, m_probesToken);
 
 
@@ -203,27 +259,20 @@ Error Ir::run(CommandBufferPtr cmdb)
 	*invViewRotation =
 	*invViewRotation =
 		Mat3x4(frc.getViewMatrix().getInverse().getRotationPart());
 		Mat3x4(frc.getViewMatrix().getInverse().getRotationPart());
 
 
-	SArray<ShaderReflectionProbe> probes(
-		reinterpret_cast<ShaderReflectionProbe*>(invViewRotation + 1),
+	SArray<IrShaderReflectionProbe> probes(
+		reinterpret_cast<IrShaderReflectionProbe*>(invViewRotation + 1),
 		visRez.getReflectionProbeCount());
 		visRez.getReflectionProbeCount());
 
 
-	//
-	// Render and bin the probes
-	//
+	// Render some of the probes
 	const VisibleNode* it = visRez.getReflectionProbesBegin();
 	const VisibleNode* it = visRez.getReflectionProbesBegin();
 	const VisibleNode* end = visRez.getReflectionProbesEnd();
 	const VisibleNode* end = visRez.getReflectionProbesEnd();
 
 
-	m_r->getClusterer().initTestResults(getFrameAllocator(),
-		ctx.m_clustererTestResult);
-
 	U probeIdx = 0;
 	U probeIdx = 0;
 	while(it != end)
 	while(it != end)
 	{
 	{
-		// Render the probe
-		ANKI_CHECK(renderReflection(*it->m_node, probes[probeIdx]));
-
-		// Bin the probe
-		binProbe(*it->m_node, probeIdx, ctx);
+		// Write and render probe
+		ANKI_CHECK(
+			writeProbeAndRender(*it->m_node, probes[probeIdx]));
 
 
 		++it;
 		++it;
 		++probeIdx;
 		++probeIdx;
@@ -231,92 +280,157 @@ Error Ir::run(CommandBufferPtr cmdb)
 	ANKI_ASSERT(probeIdx == visRez.getReflectionProbeCount());
 	ANKI_ASSERT(probeIdx == visRez.getReflectionProbeCount());
 
 
 	//
 	//
-	// Populate the index buffer and the clusters
+	// Start the jobs that can run in parallel
 	//
 	//
-	populateIndexAndClusterBuffers(ctx);
+	ThreadPool& threadPool = m_r->getThreadPool();
+	Array<IrTask, ThreadPool::MAX_THREADS> tasks;
+	for(U i = 0; i < threadPool.getThreadsCount(); i++)
+	{
+		tasks[i].m_ctx = &ctx;
+		threadPool.assignNewTask(i, &tasks[i]);
+	}
+
+	// Sync
+	ANKI_CHECK(threadPool.waitForAllThreadsToFinish());
 
 
 	// Bye
 	// Bye
-	ctx.m_clusterData.destroy(getFrameAllocator());
 	ANKI_TRACE_STOP_EVENT(RENDER_IR);
 	ANKI_TRACE_STOP_EVENT(RENDER_IR);
 	return ErrorCode::NONE;
 	return ErrorCode::NONE;
 }
 }
 
 
 //==============================================================================
 //==============================================================================
-void Ir::populateIndexAndClusterBuffers(IrBuildContext& ctx)
+void Ir::binProbes(U32 threadId, PtrSize threadsCount, IrRunContext& ctx)
 {
 {
-	// Allocate GPU mem for indices
-	SArray<U32> indices;
-	if(ctx.m_indexCount > 0)
-	{
-		void* mem = getGrManager().allocateFrameHostVisibleMemory(
-			ctx.m_indexCount * sizeof(U32), BufferUsage::STORAGE,
-			m_indicesToken);
+	ANKI_TRACE_START_EVENT(RENDER_IR);
+	IrTaskContext task;
 
 
-		indices = SArray<U32>(static_cast<U32*>(mem), ctx.m_indexCount);
-	}
-	else
+	//
+	// Bin the probes
+	//
+
+	PtrSize start, end;
+	ThreadPool::Task::choseStartEnd(threadId, threadsCount,
+		ctx.m_visRez->getReflectionProbeCount(), start, end);
+
+	// Init clusterer test result for this thread
+	if(start < end)
 	{
 	{
-		m_indicesToken.markUnused();
+		m_r->getClusterer().initTestResults(getFrameAllocator(),
+			task.m_clustererTestResult);
 	}
 	}
 
 
-	U indexCount = 0;
+	for(auto i = start; i < end; i++)
+	{
+		VisibleNode* vnode = ctx.m_visRez->getReflectionProbesBegin() + i;
+		SceneNode& node = *vnode->m_node;
 
 
-	// Allocate GPU mem for clusters
-	void* mem = getGrManager().allocateFrameHostVisibleMemory(
-		m_r->getClusterCount() * sizeof(ShaderCluster), BufferUsage::STORAGE,
-		m_clustersToken);
-	SArray<ShaderCluster> clusters(static_cast<ShaderCluster*>(mem),
-		m_r->getClusterCount());
+		task.m_node = &node;
 
 
-	for(U i = 0; i < m_r->getClusterCount(); ++i)
+		// Bin it to temp clusters
+		binProbe(i, ctx, task);
+	}
+
+	//
+	// Write the clusters
+	//
+
+	// Allocate the cluster buffer. First come first served
+	U who = ctx.m_clustersAllocate.fetchAdd(1);
+	if(who == 0)
 	{
 	{
-		IrClusterData& cdata = ctx.m_clusterData[i];
-		ShaderCluster& cluster = clusters[i];
+		void* mem = getGrManager().allocateFrameHostVisibleMemory(
+			m_r->getClusterCount() * sizeof(IrShaderCluster),
+			BufferUsage::STORAGE, m_clustersToken);
+
+		ctx.m_clusters = SArray<IrShaderCluster>(
+			static_cast<IrShaderCluster*>(mem), m_r->getClusterCount());
+	}
 
 
-		if(cdata.m_probeCount > 0)
+	// Use the same trick to allocate the indices
+	ANKI_TRACE_STOP_EVENT(RENDER_IR);
+	m_barrier.wait();
+	ANKI_TRACE_START_EVENT(RENDER_IR);
+
+	who = ctx.m_probeIndicesAllocate.fetchAdd(1);
+	if(who == 0)
+	{
+		// Set it to zero in order to reuse it
+		U indexCount = ctx.m_indexCount.exchange(0);
+		if(indexCount > 0)
 		{
 		{
-			// Sort to satisfy the probe hierarchy
-			cdata.sort();
+			void* mem = getGrManager().allocateFrameHostVisibleMemory(
+				indexCount * sizeof(U32), BufferUsage::STORAGE,
+				m_indicesToken);
 
 
-			// Check if the cdata is the same for the previous
-			if(i > 0 && cdata == ctx.m_clusterData[i - 1])
-			{
-				// Same data
-				cluster = clusters[i - 1];
-			}
-			else
-			{
-				// Have to store the indices
-				cluster.m_indexOffset = indexCount;
-				cluster.m_probeCount = cdata.m_probeCount;
-				for(U j = 0; j < cdata.m_probeCount; ++j)
-				{
-					indices[indexCount] = cdata.m_probeIds[j].m_index;
-					++indexCount;
-				}
-			}
+			ctx.m_indices = SArray<U32>(static_cast<U32*>(mem), indexCount);
 		}
 		}
 		else
 		else
 		{
 		{
-			cluster.m_indexOffset = 0;
-			cluster.m_probeCount = 0;
+			m_indicesToken.markUnused();
 		}
 		}
 	}
 	}
+
+	// Sync
+	ANKI_TRACE_STOP_EVENT(RENDER_IR);
+	m_barrier.wait();
+	ANKI_TRACE_START_EVENT(RENDER_IR);
+
+	ThreadPool::Task::choseStartEnd(threadId, threadsCount,
+		m_r->getClusterCount(), start, end);
+
+	for(auto i = start; i < end; i++)
+	{
+		Bool hasPrevCluster = (i != start);
+		writeIndicesAndCluster(i, hasPrevCluster, ctx);
+	}
+	ANKI_TRACE_STOP_EVENT(RENDER_IR);
 }
 }
 
 
 //==============================================================================
 //==============================================================================
-void Ir::binProbe(const SceneNode& node, U index, IrBuildContext& ctx)
+Error Ir::writeProbeAndRender(SceneNode& node, IrShaderReflectionProbe& probe)
 {
 {
-	const SpatialComponent& sp = node.getComponent<SpatialComponent>();
-	const ReflectionProbeComponent& reflc =
+	const FrustumComponent& frc = m_r->getActiveFrustumComponent();
+	ReflectionProbeComponent& reflc =
 		node.getComponent<ReflectionProbeComponent>();
 		node.getComponent<ReflectionProbeComponent>();
 
 
+	Bool render = false;
+	U entry;
+	findCacheEntry(node, entry, render);
+
+	// Write shader var
+	probe.m_pos = (frc.getViewMatrix() * reflc.getPosition().xyz1()).xyz();
+	probe.m_radiusSq = reflc.getRadius() * reflc.getRadius();
+	probe.m_cubemapIndex = entry;
+
+	if(reflc.getMarkedForRendering())
+	{
+		reflc.setMarkedForRendering(false);
+		ANKI_CHECK(renderReflection(node, reflc, entry));
+	}
+
+	// If you need to render it mark it for the next frame
+	if(render)
+	{
+		reflc.setMarkedForRendering(true);
+	}
+
+	return ErrorCode::NONE;
+}
+
+//==============================================================================
+void Ir::binProbe(U probeIdx, IrRunContext& ctx, IrTaskContext& task) const
+{
+	const SpatialComponent& sp = task.m_node->getComponent<SpatialComponent>();
+	const ReflectionProbeComponent& reflc =
+		task.m_node->getComponent<ReflectionProbeComponent>();
+
+	// Perform the expensive tests
 	m_r->getClusterer().bin(sp.getSpatialCollisionShape(), sp.getAabb(),
 	m_r->getClusterer().bin(sp.getSpatialCollisionShape(), sp.getAabb(),
-		ctx.m_clustererTestResult);
+		task.m_clustererTestResult);
 
 
 	// Bin to the correct tiles
 	// Bin to the correct tiles
-	auto it = ctx.m_clustererTestResult.getClustersBegin();
-	auto end = ctx.m_clustererTestResult.getClustersEnd();
+	auto it = task.m_clustererTestResult.getClustersBegin();
+	auto end = task.m_clustererTestResult.getClustersEnd();
 	for(; it != end; ++it)
 	for(; it != end; ++it)
 	{
 	{
 		U x = (*it)[0];
 		U x = (*it)[0];
@@ -328,69 +442,86 @@ void Ir::binProbe(const SceneNode& node, U index, IrBuildContext& ctx)
 
 
 		auto& cluster = ctx.m_clusterData[i];
 		auto& cluster = ctx.m_clusterData[i];
 
 
-		i = cluster.m_probeCount % MAX_PROBES_PER_CLUSTER;
-		++cluster.m_probeCount;
-		cluster.m_probeIds[i].m_index = index;
+		i = cluster.m_probeCount.fetchAdd(1) % MAX_PROBES_PER_CLUSTER;
+		cluster.m_probeIds[i].m_index = probeIdx;
 		cluster.m_probeIds[i].m_probeRadius = reflc.getRadius();
 		cluster.m_probeIds[i].m_probeRadius = reflc.getRadius();
-
-		++ctx.m_indexCount;
 	}
 	}
+
+	ctx.m_indexCount.fetchAdd(task.m_clustererTestResult.getClusterCount());
 }
 }
 
 
 //==============================================================================
 //==============================================================================
-Error Ir::renderReflection(SceneNode& node, ShaderReflectionProbe& shaderProb)
+void Ir::writeIndicesAndCluster(U clusterIdx, Bool hasPrevCluster,
+	IrRunContext& ctx)
 {
 {
-	const FrustumComponent& frc = m_r->getActiveFrustumComponent();
-	ReflectionProbeComponent& reflc =
-		node.getComponent<ReflectionProbeComponent>();
+	IrClusterData& cdata = ctx.m_clusterData[clusterIdx];
+	IrShaderCluster& cluster = ctx.m_clusters[clusterIdx];
 
 
-	// Get cache entry
-	Bool render = false;
-	U entry;
-	findCacheEntry(node, entry, render);
-
-	// Write shader var
-	shaderProb.m_pos = (frc.getViewMatrix() * reflc.getPosition().xyz1()).xyz();
-	shaderProb.m_radiusSq = reflc.getRadius() * reflc.getRadius();
-	shaderProb.m_cubemapIndex = entry;
-
-	// Render cubemap
-	if(reflc.getMarkedForRendering())
+	const U probeCount = cdata.m_probeCount.load() % MAX_PROBES_PER_CLUSTER;
+	if(probeCount > 0)
 	{
 	{
-		for(U i = 0; i < 6; ++i)
+		// Sort to satisfy the probe hierarchy
+		cdata.sort();
+
+		// Check if the cdata is the same for the previous
+		if(hasPrevCluster && cdata == ctx.m_clusterData[clusterIdx - 1])
+		{
+			// Same data
+			cluster = ctx.m_clusters[clusterIdx - 1];
+		}
+		else
 		{
 		{
-			Array<CommandBufferPtr, RENDERER_COMMAND_BUFFERS_COUNT> cmdb;
-			for(U j = 0; j < cmdb.getSize(); ++j)
+			// Have to store the indices
+			U idx = ctx.m_indexCount.fetchAdd(probeCount);
+
+			cluster.m_indexOffset = idx;
+			cluster.m_probeCount = probeCount;
+			for(U j = 0; j < probeCount; ++j)
 			{
 			{
-				cmdb[j] = getGrManager().newInstance<CommandBuffer>();
+				ctx.m_indices[idx] = cdata.m_probeIds[j].m_index;
+				++idx;
 			}
 			}
+		}
+	}
+	else
+	{
+		cluster.m_indexOffset = 0;
+		cluster.m_probeCount = 0;
+	}
+}
 
 
-			// Render
-			ANKI_CHECK(m_nestedR.render(node, i, cmdb));
+//==============================================================================
+Error Ir::renderReflection(SceneNode& node, ReflectionProbeComponent& reflc,
+	U cubemapIdx)
+{
+	ANKI_TRACE_INC_COUNTER(RENDERER_REFLECTIONS, 1);
 
 
-			// Copy textures
-			cmdb[cmdb.getSize() - 1]->copyTextureToTexture(
-				m_nestedR.getPps().getRt(), 0, 0, m_cubemapArr, 6 * entry + i,
-				0);
+	// Render cubemap
+	for(U i = 0; i < 6; ++i)
+	{
+		Array<CommandBufferPtr, RENDERER_COMMAND_BUFFERS_COUNT> cmdb;
+		for(U j = 0; j < cmdb.getSize(); ++j)
+		{
+			cmdb[j] = getGrManager().newInstance<CommandBuffer>();
+		}
 
 
-			// Gen mips
-			cmdb[cmdb.getSize() - 1]->generateMipmaps(m_cubemapArr,
-				6 * entry + i);
+		// Render
+		ANKI_CHECK(m_nestedR.render(node, i, cmdb));
 
 
-			// Flush
-			for(U j = 0; j < cmdb.getSize(); ++j)
-			{
-				cmdb[j]->flush();
-			}
-		}
+		// Copy textures
+		cmdb[cmdb.getSize() - 1]->copyTextureToTexture(
+			m_nestedR.getPps().getRt(), 0, 0, m_cubemapArr,
+			6 * cubemapIdx + i, 0);
 
 
-		reflc.setMarkedForRendering(false);
-	}
+		// Gen mips
+		cmdb[cmdb.getSize() - 1]->generateMipmaps(m_cubemapArr,
+			6 * cubemapIdx + i);
 
 
-	// If you need to render it mark it for the next frame
-	if(render)
-	{
-		reflc.setMarkedForRendering(true);
+		// Flush
+		for(U j = 0; j < cmdb.getSize(); ++j)
+		{
+			cmdb[j]->flush();
+		}
 	}
 	}
 
 
 	return ErrorCode::NONE;
 	return ErrorCode::NONE;

+ 13 - 9
src/renderer/Is.cpp

@@ -74,8 +74,8 @@ public:
 
 
 	void sortLightIds()
 	void sortLightIds()
 	{
 	{
-		const U pointCount = m_pointCount.load();
-		const U spotCount = m_spotCount.load();
+		const U pointCount = m_pointCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
+		const U spotCount = m_spotCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
 
 
 		if(pointCount > 1)
 		if(pointCount > 1)
 		{
 		{
@@ -90,10 +90,12 @@ public:
 
 
 	Bool operator==(const ClusterData& b) const
 	Bool operator==(const ClusterData& b) const
 	{
 	{
-		const U pointCount = m_pointCount.load();
-		const U spotCount = m_spotCount.load();
-		const U pointCount2 = b.m_pointCount.load();
-		const U spotCount2 = b.m_spotCount.load();
+		const U pointCount = m_pointCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
+		const U spotCount = m_spotCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
+		const U pointCount2 =
+			b.m_pointCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
+		const U spotCount2 =
+			b.m_spotCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
 
 
 		if(pointCount != pointCount2 || spotCount != spotCount2)
 		if(pointCount != pointCount2 || spotCount != spotCount2)
 		{
 		{
@@ -505,7 +507,7 @@ void Is::binLights(U32 threadId, PtrSize threadsCount, TaskCommonData& task)
 	ClustererTestResult testResult;
 	ClustererTestResult testResult;
 	m_r->getClusterer().initTestResults(getFrameAllocator(), testResult);
 	m_r->getClusterer().initTestResults(getFrameAllocator(), testResult);
 
 
-	for(U64 i = start; i < end; i++)
+	for(auto i = start; i < end; i++)
 	{
 	{
 		SceneNode* snode = (*(task.m_lightsBegin + i)).m_node;
 		SceneNode* snode = (*(task.m_lightsBegin + i)).m_node;
 		MoveComponent& move = snode->getComponent<MoveComponent>();
 		MoveComponent& move = snode->getComponent<MoveComponent>();
@@ -555,8 +557,10 @@ void Is::binLights(U32 threadId, PtrSize threadsCount, TaskCommonData& task)
 	{
 	{
 		auto& cluster = task.m_tempClusters[i];
 		auto& cluster = task.m_tempClusters[i];
 
 
-		const U countP = cluster.m_pointCount.load();
-		const U countS = cluster.m_spotCount.load();
+		const U countP =
+			cluster.m_pointCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
+		const U countS =
+			cluster.m_spotCount.load() % MAX_TYPED_LIGHTS_PER_CLUSTER;
 		const U count = countP + countS;
 		const U count = countP + countS;
 
 
 		auto& c = task.m_clusters[i];
 		auto& c = task.m_clusters[i];