
Rely less on kMaxFramesInFlight

Panagiotis Christopoulos Charitos 1 week ago
parent
commit
b2d782582b

+ 6 - 5
AnKi/Core/App.cpp

@@ -449,7 +449,8 @@ Error App::mainLoop()
 			SceneGraph::getSingleton().update(prevUpdateTime, crntTime);
 			GpuSceneMicroPatcher::getSingleton().endPatching();
 
-			ANKI_CHECK(Renderer::getSingleton().render());
+			FencePtr renderFence;
+			ANKI_CHECK(Renderer::getSingleton().render(renderFence));
 
 			// If we get stats exclude the time of GR because it forces some GPU-CPU serialization. We don't want to count that
 			Second grTime = 0.0;
@@ -465,11 +466,11 @@ Error App::mainLoop()
 				grTime = HighRezTimer::getCurrentTime() - grTime;
 			}
 
-			RebarTransientMemoryPool::getSingleton().endFrame();
-			UnifiedGeometryBuffer::getSingleton().endFrame();
-			GpuSceneBuffer::getSingleton().endFrame();
+			RebarTransientMemoryPool::getSingleton().endFrame(renderFence.get());
+			UnifiedGeometryBuffer::getSingleton().endFrame(renderFence.get());
+			GpuSceneBuffer::getSingleton().endFrame(renderFence.get());
 			GpuVisibleTransientMemoryPool::getSingleton().endFrame();
-			GpuReadbackMemoryPool::getSingleton().endFrame();
+			GpuReadbackMemoryPool::getSingleton().endFrame(renderFence.get());
 
 			// Sleep
 			const Second endTime = HighRezTimer::getCurrentTime();
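
The loop above is the heart of the commit: render() now returns the frame's fence and each transient pool's endFrame() takes it, so reclamation is driven by actual GPU progress instead of a fixed kMaxFramesInFlight count. A minimal standalone sketch of that pattern (hypothetical Fence and pool types, not the engine's API):

```cpp
#include <memory>
#include <vector>

struct Fence
{
	bool m_done = false;
	bool signaled() const { return m_done; }
};

class TransientPool
{
public:
	void endFrame(std::shared_ptr<Fence> frameFence)
	{
		// Retire every older frame's memory whose fence has signaled...
		std::erase_if(m_inFlight, [](const Range& r) { return r.m_fence->signaled(); });

		// ...and tag this frame's allocations with the new fence.
		m_inFlight.push_back({std::move(frameFence)});
	}

private:
	struct Range
	{
		std::shared_ptr<Fence> m_fence; // plus offsets/sizes in a real pool
	};
	std::vector<Range> m_inFlight;
};
```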

+ 2 - 2
AnKi/GpuMemory/GpuReadbackMemoryPool.cpp

@@ -42,9 +42,9 @@ void GpuReadbackMemoryPool::deferredFree(GpuReadbackMemoryAllocation& allocation
 	::new(&allocation) GpuReadbackMemoryAllocation();
 }
 
-void GpuReadbackMemoryPool::endFrame()
+void GpuReadbackMemoryPool::endFrame(Fence* fence)
 {
-	m_pool.endFrame();
+	m_pool.endFrame(fence);
 }
 
 } // end namespace anki

+ 1 - 1
AnKi/GpuMemory/GpuReadbackMemoryPool.h

@@ -97,7 +97,7 @@ public:
 	// Thread-safe
 	void deferredFree(GpuReadbackMemoryAllocation& allocation);
 
-	void endFrame();
+	void endFrame(Fence* fence);
 
 private:
 	SegregatedListsGpuMemoryPool m_pool;

+ 2 - 2
AnKi/GpuMemory/GpuSceneBuffer.h

@@ -105,9 +105,9 @@ public:
 		m_pool.deferredFree(alloc.m_token);
 	}
 
-	void endFrame()
+	void endFrame(Fence* fence)
 	{
-		m_pool.endFrame();
+		m_pool.endFrame(fence);
 #if ANKI_STATS_ENABLED
 		updateStats();
 #endif

+ 2 - 6
AnKi/GpuMemory/GpuVisibleTransientMemoryPool.cpp

@@ -14,12 +14,8 @@ void GpuVisibleTransientMemoryPool::endFrame()
 {
 	g_svarGpuVisibleTransientMemory.set(m_pool.getAllocatedMemory());
 
-	if(m_frame == 0)
-	{
-		m_pool.reset();
-	}
-
-	m_frame = (m_frame + 1) % kMaxFramesInFlight;
+	// This is GPU-only memory, so the next frame can start reusing it immediately
+	m_pool.reset();
 }
 
 } // end namespace anki
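
Dropping the round-robin here rests on the comment's claim: this pool is GPU-only memory that the CPU never touches, so the whole stack can rewind every frame. A standalone sketch of such a per-frame stack (linear) allocator, under that assumption:

```cpp
#include <cstddef>

class FrameStackPool
{
public:
	// Bump-allocate; alignment must be a power of two.
	std::size_t allocate(std::size_t size, std::size_t alignment)
	{
		const std::size_t offset = (m_top + alignment - 1) & ~(alignment - 1);
		m_top = offset + size;
		return offset;
	}

	// O(1) reclamation of everything allocated this frame.
	void reset() { m_top = 0; }

private:
	std::size_t m_top = 0;
};
```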

+ 0 - 1
AnKi/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -45,7 +45,6 @@ public:
 
 private:
 	StackGpuMemoryPool m_pool;
-	U32 m_frame = 0;
 	U32 m_structuredBufferAlignment = kMaxU32;
 
 	GpuVisibleTransientMemoryPool()

+ 151 - 6
AnKi/GpuMemory/RebarTransientMemoryPool.cpp

@@ -7,6 +7,7 @@
 #include <AnKi/Util/Tracer.h>
 #include <AnKi/Gr/GrManager.h>
 #include <AnKi/Gr/Buffer.h>
+#include <AnKi/Gr/Fence.h>
 
 namespace anki {
 
@@ -38,6 +39,11 @@ void RebarTransientMemoryPool::init()
 	}
 
 	m_mappedMem = static_cast<U8*>(m_buffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite));
+
+	// Create the slice of the 1st frame
+	m_activeSliceMask.set(0);
+	m_slices[0].m_offset = 0;
+	m_crntActiveSlice = 0;
 }
 
 BufferView RebarTransientMemoryPool::allocateInternal(PtrSize origSize, U32 alignment, void*& mappedMem)
@@ -52,32 +58,171 @@ BufferView RebarTransientMemoryPool::allocateInternal(PtrSize origSize, U32 alig
 	do
 	{
 		offset = m_offset.fetchAdd(size) % m_bufferSize;
-		const PtrSize end = (offset + size) % (m_bufferSize + 1);
+		const PtrSize end = (offset + size) % m_bufferSize;
 
 		done = offset < end;
 	} while(!done);
 
+	// Wait for the range that contains the offset
+	m_activeSliceMask.iterateSetBitsFromLeastSignificant([&](U32 sliceIdx) {
+		if(sliceIdx == m_crntActiveSlice)
+		{
+			return FunctorContinue::kContinue;
+		}
+
+		const FrameSlice& slice = m_slices[sliceIdx];
+
+		Bool overlaps;
+		if(offset <= slice.m_offset)
+		{
+			overlaps = offset + size > slice.m_offset;
+		}
+		else
+		{
+			overlaps = slice.m_offset + slice.m_range > offset;
+		}
+
+		if(overlaps)
+		{
+			ANKI_CORE_LOGW("ReBAR has to wait for a fence. This means that the ReBAR buffer is not big enough. Increase the %s CVAR",
+						   g_cvarCoreRebarGpuMemorySize.getName().cstr());
+
+			if(!m_sliceFences[sliceIdx]->clientWait(kMaxSecond))
+			{
+				ANKI_CORE_LOGF("Timeout detected");
+			}
+		}
+
+		return FunctorContinue::kContinue;
+	});
+
 	const PtrSize alignedOffset = getAlignedRoundUp(alignment, offset);
 	ANKI_ASSERT(alignedOffset + origSize <= offset + size);
+	ANKI_ASSERT(offset + size <= m_bufferSize);
 
 	mappedMem = m_mappedMem + alignedOffset;
 	return BufferView(m_buffer.get(), alignedOffset, origSize);
 }
 
-void RebarTransientMemoryPool::endFrame()
+void RebarTransientMemoryPool::endFrame(Fence* fence)
 {
+	// Free up previous slices
+	m_activeSliceMask.iterateSetBitsFromLeastSignificant([&](U32 sliceIdx) {
+		if(sliceIdx != m_crntActiveSlice)
+		{
+			if(m_sliceFences[sliceIdx]->signaled())
+			{
+				m_sliceFences[sliceIdx].reset(nullptr);
+				m_slices[sliceIdx] = {};
+				m_activeSliceMask.unset(sliceIdx);
+			}
+		}
+
+		return FunctorContinue::kContinue;
+	});
+
+	// Finalize the active slice
 	const PtrSize crntOffset = m_offset.getNonAtomically();
+	const PtrSize crntNormalizedOffset = crntOffset % m_bufferSize;
+
+	FrameSlice& slice = m_slices[m_crntActiveSlice];
+	ANKI_ASSERT(slice.m_offset < kMaxPtrSize && slice.m_range == 0 && !m_sliceFences[m_crntActiveSlice]);
+
+	ANKI_ASSERT(crntOffset >= slice.m_offset);
+	const PtrSize range = crntOffset - slice.m_offset;
 
-	const PtrSize usedMemory = crntOffset - m_previousFrameEndOffset;
-	m_previousFrameEndOffset = crntOffset;
+	if(range == 0)
+	{
+		// No allocations this frame, remove the slice
 
-	if(usedMemory >= PtrSize(0.8 * F64(m_bufferSize / kMaxFramesInFlight)))
+		slice = {};
+		m_activeSliceMask.unset(m_crntActiveSlice);
+	}
+	else if((slice.m_offset % m_bufferSize) + range > m_bufferSize)
+	{
+		// The frame we are ending wrapped around the ReBAR buffer, create two slices
+
+		slice.m_offset = slice.m_offset % m_bufferSize;
+		slice.m_range = m_bufferSize - slice.m_offset;
+		m_sliceFences[m_crntActiveSlice].reset(fence);
+
+		const U32 secondSliceIdx = (~m_activeSliceMask).getLeastSignificantBit();
+		m_slices[secondSliceIdx].m_offset = 0;
+		m_slices[secondSliceIdx].m_range = range - slice.m_range;
+		ANKI_ASSERT(crntNormalizedOffset == m_slices[secondSliceIdx].m_range);
+		m_sliceFences[secondSliceIdx].reset(fence);
+		m_activeSliceMask.set(secondSliceIdx);
+	}
+	else
 	{
-		ANKI_CORE_LOGW("Frame used more that 80%% of its safe limit of ReBAR memory");
+		// No wrapping, just finalize the active slice
+
+		slice.m_offset = slice.m_offset % m_bufferSize;
+		slice.m_range = range;
+		m_sliceFences[m_crntActiveSlice].reset(fence);
 	}
 
+	// Create a new active slice
+	const U32 newSliceIdx = (~m_activeSliceMask).getLeastSignificantBit();
+	m_activeSliceMask.set(newSliceIdx);
+	m_slices[newSliceIdx].m_offset = crntOffset;
+	m_crntActiveSlice = newSliceIdx;
+
+	validateSlices();
+
+	// Stats
+	const PtrSize usedMemory = range;
 	ANKI_TRACE_INC_COUNTER(ReBarUsedMemory, usedMemory);
 	g_svarRebarUserMemory.set(usedMemory);
 }
 
+void RebarTransientMemoryPool::validateSlices() const
+{
+	for(U32 sliceIdxA = 0; sliceIdxA < kSliceCount; ++sliceIdxA)
+	{
+		if(sliceIdxA == m_crntActiveSlice)
+		{
+			ANKI_ASSERT(m_activeSliceMask.get(sliceIdxA));
+			ANKI_ASSERT(m_slices[sliceIdxA].m_offset < kMaxPtrSize && m_slices[sliceIdxA].m_range == 0 && !m_sliceFences[sliceIdxA]);
+		}
+		else if(m_activeSliceMask.get(sliceIdxA))
+		{
+			const FrameSlice& a = m_slices[sliceIdxA];
+			ANKI_ASSERT(a.m_offset < kMaxPtrSize && a.m_range > 0 && a.m_offset + a.m_range <= m_bufferSize);
+			ANKI_ASSERT(!!m_sliceFences[sliceIdxA]);
+
+			m_activeSliceMask.iterateSetBitsFromLeastSignificant([&](U32 sliceIdxB) {
+				if(sliceIdxA == sliceIdxB || sliceIdxB == m_crntActiveSlice)
+				{
+					return FunctorContinue::kContinue;
+				}
+
+				const FrameSlice& b = m_slices[sliceIdxB];
+
+				if(a.m_offset < b.m_offset)
+				{
+					ANKI_ASSERT(a.m_offset + a.m_range <= b.m_offset);
+				}
+				else if(b.m_offset < a.m_offset)
+				{
+					ANKI_ASSERT(b.m_offset + b.m_range <= a.m_offset);
+				}
+				else
+				{
+					ANKI_ASSERT(0 && "Offsets can't be equal");
+				}
+
+				ANKI_ASSERT(m_sliceFences[sliceIdxA] && m_sliceFences[sliceIdxB]);
+
+				return FunctorContinue::kContinue;
+			});
+		}
+		else
+		{
+			const FrameSlice& a = m_slices[sliceIdxA];
+			ANKI_ASSERT(a.m_offset == kMaxPtrSize && a.m_range == 0 && !m_sliceFences[sliceIdxA]);
+		}
+	}
+}
+
 } // end namespace anki
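
The slice machinery above turns the ReBAR buffer into a fence-guarded ring: each in-flight frame owns a [offset, offset + range) window (two windows when the frame wraps), and an allocation only blocks when it would overrun a window whose fence has not signaled yet. The overlap test in isolation, as a standalone sketch with plain types:

```cpp
#include <cstddef>

struct Slice
{
	std::size_t m_offset; // start of the window, normalized into [0, bufferSize)
	std::size_t m_range; // length of the window
};

// True if [allocOffset, allocOffset + allocSize) intersects the slice.
// Mirrors the two-sided check in allocateInternal(): whichever range starts
// first must end before the other one begins, or they overlap.
inline bool overlaps(std::size_t allocOffset, std::size_t allocSize, const Slice& slice)
{
	if(allocOffset <= slice.m_offset)
	{
		return allocOffset + allocSize > slice.m_offset;
	}
	return slice.m_offset + slice.m_range > allocOffset;
}
```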

+ 26 - 14
AnKi/GpuMemory/RebarTransientMemoryPool.h

@@ -12,12 +12,9 @@
 
 namespace anki {
 
-/// @addtogroup gpu_memory
-/// @{
-
 ANKI_CVAR(NumericCVar<PtrSize>, Core, RebarGpuMemorySize, 24_MB, 1_MB, 1_GB, "ReBAR: always mapped GPU memory")
 
-/// Manages staging GPU memory.
+// Manages staging GPU memory.
 class RebarTransientMemoryPool : public MakeSingleton<RebarTransientMemoryPool>
 {
 	template<typename>
@@ -30,9 +27,9 @@ public:
 
 	void init();
 
-	void endFrame();
+	void endFrame(Fence* fence);
 
-	/// Allocate staging memory for various operations. The memory will be reclaimed at the begining of the N-(kMaxFramesInFlight-1) frame.
+	// Allocate staging memory for various operations. The memory will be reused once it's safe.
 	template<typename T>
 	BufferView allocate(PtrSize size, U32 alignment, T*& mappedMem)
 	{
@@ -42,14 +39,14 @@ public:
 		return out;
 	}
 
-	/// @copydoc allocate
+	// See allocate()
 	template<typename T>
 	BufferView allocateConstantBuffer(T*& mappedMem)
 	{
 		return allocate(sizeof(T), GrManager::getSingleton().getDeviceCapabilities().m_constantBufferBindOffsetAlignment, mappedMem);
 	}
 
-	/// @copydoc allocate
+	// See allocate()
 	template<typename T>
 	BufferView allocateStructuredBuffer(U32 count, WeakArray<T>& arr)
 	{
@@ -60,7 +57,7 @@ public:
 		return out;
 	}
 
-	/// @copydoc allocate
+	// See allocate()
 	template<typename T>
 	BufferView allocateCopyBuffer(U32 count, WeakArray<T>& arr)
 	{
@@ -83,18 +80,33 @@ public:
 
 private:
 	BufferPtr m_buffer;
-	U8* m_mappedMem = nullptr; ///< Cache it.
-	PtrSize m_bufferSize = 0; ///< Cache it.
+	U8* m_mappedMem = nullptr; // Cache it
+	PtrSize m_bufferSize = 0; // Cache it
+	U32 m_structuredBufferAlignment = kMaxU32; // Cache it
+
 	Atomic<PtrSize> m_offset = {0};
-	PtrSize m_previousFrameEndOffset = 0;
-	U32 m_structuredBufferAlignment = kMaxU32;
+
+	// This is the slice of the ReBAR buffer that is protected by a fence
+	class FrameSlice
+	{
+	public:
+		PtrSize m_offset = kMaxPtrSize;
+		PtrSize m_range = 0;
+	};
+
+	static constexpr U32 kSliceCount = 8; // It's actually "max slices in-flight"
+	BitSet<kSliceCount, U32> m_activeSliceMask = {false};
+	U32 m_crntActiveSlice = kMaxU32;
+	Array<FrameSlice, kSliceCount> m_slices;
+	Array<FencePtr, kSliceCount> m_sliceFences;
 
 	RebarTransientMemoryPool() = default;
 
 	~RebarTransientMemoryPool();
 
 	BufferView allocateInternal(PtrSize size, U32 alignment, void*& mappedMem);
+
+	void validateSlices() const;
 };
-/// @}
 
 } // end namespace anki
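
For callers nothing changes except the lifetime guarantee. A typical use of the pool, based only on the signatures visible in this header (WeakArray and BufferView are the engine's existing types; the struct is hypothetical):

```cpp
// Hypothetical per-instance data; any trivially copyable struct works.
struct GpuInstance
{
	float m_transform[12];
};

WeakArray<GpuInstance> instances;
const BufferView view = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer<GpuInstance>(64, instances);

for(GpuInstance& instance : instances)
{
	instance.m_transform[0] = 1.0f; // CPU writes land directly in mapped ReBAR memory
}

// Bind `view` for this frame's GPU work; the range is recycled once the
// frame's fence signals, not after a fixed kMaxFramesInFlight delay.
```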

+ 2 - 2
AnKi/GpuMemory/UnifiedGeometryBuffer.h

@@ -125,9 +125,9 @@ public:
 		alloc.m_fakeOffset = kMaxU32;
 	}
 
-	void endFrame()
+	void endFrame(Fence* fence)
 	{
-		m_pool.endFrame();
+		m_pool.endFrame(fence);
 #if ANKI_STATS_ENABLED
 		updateStats();
 #endif

+ 7 - 9
AnKi/Gr/BackendCommon/MicroObjectRecycler.h

@@ -10,10 +10,9 @@
 
 namespace anki {
 
-/// @addtogroup graphics
-/// @{
-
-/// Helper class for MicroXXX objects. It expects a specific interface for the T.
+// Helper class for MicroXXX objects. It expects a specific interface from T:
+// I32 getRefcount() const;
+// Bool canRecycle() const;
 template<typename T>
 class MicroObjectRecycler
 {
@@ -27,16 +26,16 @@ public:
 		destroy();
 	}
 
-	/// It's thread-safe.
+	// It's thread-safe.
 	void destroy();
 
-	/// Find a new one to reuse. It's thread-safe.
+	// Find a new one to reuse. It's thread-safe.
 	T* findToReuse();
 
-	/// Release an object back to the recycler. It's thread-safe.
+	// Release an object back to the recycler. It's thread-safe.
 	void recycle(T* s);
 
-	/// Destroy those objects that their fence is done. It's thread-safe.
+	// Destroy those objects that their fence is done. It's thread-safe.
 	void trimCache()
 	{
 		LockGuard<Mutex> lock(m_mtx);
@@ -67,7 +66,6 @@ private:
 
 	void adjustAliveObjectCount();
 };
-/// @}
 
 } // end namespace anki
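
With the Doxygen markup gone, the required interface is now spelled out in the comment. A minimal standalone type satisfying it (not an engine class; a real object would also wrap a native handle):

```cpp
#include <atomic>

class MicroThing
{
public:
	void retain() const
	{
		m_refcount.fetch_add(1);
	}

	int getRefcount() const
	{
		return m_refcount.load();
	}

	// The recycler will neither reuse nor delete the object before this
	// returns true, e.g. before the GPU is done with it.
	bool canRecycle() const
	{
		return m_gpuDone;
	}

private:
	mutable std::atomic<int> m_refcount{0};
	bool m_gpuDone = false;
};
```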
 

+ 26 - 10
AnKi/Gr/BackendCommon/MicroObjectRecycler.inl.h

@@ -34,10 +34,14 @@ inline T* MicroObjectRecycler<T>::findToReuse()
 	// Trim the cache but leave at least one object to be recycled
 	trimCacheInternal(max(m_availableObjectsAfterTrim, 1u));
 
-	if(m_objectCache.getSize())
+	for(auto it = m_objectCache.getBegin(); it != m_objectCache.getEnd(); ++it)
 	{
-		out = m_objectCache[m_objectCache.getSize() - 1];
-		m_objectCache.popBack();
+		if((*it)->canRecycle())
+		{
+			out = *it;
+			m_objectCache.erase(it);
+			break;
+		}
 	}
 
 	ANKI_ASSERT(out == nullptr || out->getRefcount() == 0);
@@ -72,19 +76,28 @@ template<typename T>
 void MicroObjectRecycler<T>::trimCacheInternal(U32 aliveObjectCountAfterTrim)
 {
 	aliveObjectCountAfterTrim = min(aliveObjectCountAfterTrim, m_objectCache.getSize());
-	const U32 toBeKilledCount = m_objectCache.getSize() - aliveObjectCountAfterTrim;
+	U32 toBeKilledCount = m_objectCache.getSize() - aliveObjectCountAfterTrim;
 	if(toBeKilledCount == 0)
 	{
 		return;
 	}
 
-	for(U32 i = 0; i < toBeKilledCount; ++i)
+	GrDynamicArray<T*> newObjectCache;
+	for(U32 i = 0; i < m_objectCache.getSize(); ++i)
 	{
-		deleteInstance(GrMemoryPool::getSingleton(), m_objectCache[i]);
-		m_objectCache[i] = nullptr;
+		if(toBeKilledCount > 0 && m_objectCache[i]->canRecycle())
+		{
+			deleteInstance(GrMemoryPool::getSingleton(), m_objectCache[i]);
+			--toBeKilledCount;
+		}
+		else
+		{
+			newObjectCache.emplaceBack(m_objectCache[i]);
+		}
 	}
 
-	m_objectCache.erase(m_objectCache.getBegin(), m_objectCache.getBegin() + toBeKilledCount);
+	m_objectCache.destroy();
+	m_objectCache = std::move(newObjectCache);
 }
 
 template<typename T>
@@ -97,12 +110,15 @@ void MicroObjectRecycler<T>::adjustAliveObjectCount()
 	}
 	else
 	{
+		constexpr U32 kGrowCount = 4;
+		constexpr U32 kMinAvailableObjects = 1;
+
 		if(m_cacheMisses)
 		{
 			// Need more alive objects
-			m_availableObjectsAfterTrim += 4;
+			m_availableObjectsAfterTrim += kGrowCount;
 		}
-		else if(m_availableObjectsAfterTrim > 0)
+		else if(m_availableObjectsAfterTrim > kMinAvailableObjects)
 		{
 			// Have more than enough alive objects per request, decrease alive objects
 			--m_availableObjectsAfterTrim;
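
The sizing heuristic is additive-increase/additive-decrease: grow the kept-alive target fast when a request missed the cache, decay it slowly otherwise, never dropping below the new floor of one. The same logic in isolation, as a standalone sketch:

```cpp
#include <cstdint>

void adjustTarget(std::uint32_t& availableAfterTrim, bool hadCacheMiss)
{
	constexpr std::uint32_t kGrowCount = 4;
	constexpr std::uint32_t kMinAvailableObjects = 1;

	if(hadCacheMiss)
	{
		availableAfterTrim += kGrowCount; // demand outran the cache, grow fast
	}
	else if(availableAfterTrim > kMinAvailableObjects)
	{
		--availableAfterTrim; // more than enough cached, shrink slowly
	}
}
```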

+ 2 - 1
AnKi/Gr/D3D/D3DGrManager.cpp

@@ -138,8 +138,9 @@ void GrManager::finish()
 	self.finishInternal();
 }
 
-void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence)
+void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, [[maybe_unused]] Bool flushAndSerialize)
 {
+	// No need to do anything about flushAndSerialize; D3D does that anyway
 	ANKI_D3D_SELF(GrManagerImpl);
 	self.submitInternal(cmdbs, waitFences, signalFence);
 }

+ 9 - 11
AnKi/Gr/Fence.h

@@ -9,10 +9,7 @@
 
 namespace anki {
 
-/// @addtogroup graphics
-/// @{
-
-/// GPU fence.
+// GPU fence
 class Fence : public GrObject
 {
 	ANKI_GR_OBJECT
@@ -20,27 +17,28 @@ class Fence : public GrObject
 public:
 	static constexpr GrObjectType kClassType = GrObjectType::kFence;
 
-	/// Wait for the fence.
-	/// @param seconds The time to wait in seconds. If it's zero then just return the status.
-	/// @return True if is signaled (signaled == GPU work is done).
+	// Wait for the fence.
+	// seconds: The time to wait in seconds. If it's zero then just return the status.
+	// Returns true if signaled (signaled == GPU work is done).
 	Bool clientWait(Second seconds);
 
+	Bool signaled()
+	{
+		return clientWait(0.0);
+	}
+
 protected:
-	/// Construct.
 	Fence(CString name)
 		: GrObject(kClassType, name)
 	{
 	}
 
-	/// Destroy.
 	~Fence()
 	{
 	}
 
 private:
-	/// Allocate and initialize a new instance.
 	[[nodiscard]] static Fence* newInstance();
 };
-/// @}
 
 } // end namespace anki
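
signaled() is just a zero-timeout clientWait(), which is what the pools above use to poll for completed frames without blocking. The resulting non-blocking cleanup idiom, sketched with standalone types:

```cpp
#include <utility>
#include <vector>

// TFence only needs the signaled() poll; TResource is whatever the fence protects.
template<typename TFence, typename TResource>
void collectGarbage(std::vector<std::pair<TFence*, TResource*>>& graveyard)
{
	std::erase_if(graveyard, [](auto& entry) {
		if(entry.first->signaled()) // zero-timeout poll, never blocks
		{
			delete entry.second;
			return true;
		}
		return false; // GPU still busy, try again next frame
	});
}
```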

+ 17 - 21
AnKi/Gr/GrManager.h

@@ -16,10 +16,7 @@ namespace anki {
 // Forward
 class NativeWindow;
 
-/// @addtogroup graphics
-/// @{
-
-/// Manager initializer.
+// Manager initializer.
 class GrManagerInitInfo
 {
 public:
@@ -29,7 +26,7 @@ public:
 	CString m_cacheDirectory;
 };
 
-/// The graphics manager, owner of all graphics objects.
+// The graphics manager, owner of all graphics objects.
 class GrManager : public MakeSingletonPtr<GrManager>
 {
 	template<typename>
@@ -43,31 +40,31 @@ public:
 		return m_capabilities;
 	}
 
-	/// First call in the frame. Do that before everything else.
+	// First call in the frame. Do that before everything else.
 	void beginFrame();
 
-	/// Get next presentable image. The returned Texture is valid until the following swapBuffers. After that it might dissapear even if you hold the
-	/// reference.
+	// Get next presentable image. The returned Texture is valid until the following swapBuffers. After that it might disappear even if you hold the
+	// reference.
 	TexturePtr acquireNextPresentableTexture();
 
-	/// End this frame.
+	// End this frame.
 	void endFrame();
 
-	/// Submit command buffers. Can be called outside beginFrame() endFrame().
-	/// @param[in]  waitFences Optionally wait for some fences.
-	/// @param[out] signalFence Optionaly create fence that will be signaled when the submission is done.
-	void submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr);
+	// Submit command buffers. Can be called outside beginFrame() endFrame()
+	// waitFences: Optionally wait for some fences
+	// signalFence: Optionally create a fence that will be signaled when the submission is done
+	// flushAndSerialize: Insert a barrier at the end of the submit that flushes all caches and inserts an ALL-to-ALL barrier
+	void submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr, Bool flushAndSerialize = false);
 
-	void submit(CommandBuffer* cmdb, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr)
+	void submit(CommandBuffer* cmdb, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr, Bool flushAndSerialize = false)
 	{
-		submit(WeakArray<CommandBuffer*>(&cmdb, 1), waitFences, signalFence);
+		submit(WeakArray<CommandBuffer*>(&cmdb, 1), waitFences, signalFence, flushAndSerialize);
 	}
 
-	/// Wait for all GPU work to finish.
+	// Wait for all GPU work to finish.
 	void finish();
 
-	/// @name Object creation methods. They are thread-safe.
-	/// @{
+	// Object creation methods. They are thread-safe //
 	[[nodiscard]] BufferPtr newBuffer(const BufferInitInfo& init);
 	[[nodiscard]] TexturePtr newTexture(const TextureInitInfo& init);
 	[[nodiscard]] SamplerPtr newSampler(const SamplerInitInfo& init);
@@ -80,9 +77,9 @@ public:
 	[[nodiscard]] RenderGraphPtr newRenderGraph();
 	[[nodiscard]] GrUpscalerPtr newGrUpscaler(const GrUpscalerInitInfo& init);
 	[[nodiscard]] AccelerationStructurePtr newAccelerationStructure(const AccelerationStructureInitInfo& init);
-	/// @}
+	// End object creation methods //
 
-	/// Get the size of the acceleration structure if you are planning to supply a custom buffer.
+	// Get the size of the acceleration structure if you are planning to supply a custom buffer.
 	PtrSize getAccelerationStructureMemoryRequirement(const AccelerationStructureInitInfo& init) const;
 
 	ANKI_INTERNAL CString getCacheDirectory() const
@@ -111,6 +108,5 @@ GrManager& MakeSingletonPtr<GrManager>::allocateSingleton<>();
 
 template<>
 void MakeSingletonPtr<GrManager>::freeSingleton();
-/// @}
 
 } // end namespace anki
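
A plausible call shape for the extended submit(), based only on the signature above: the frame's final submission requests the serializing flush so that the signaled fence also guarantees all GPU caches were flushed, which is what lets the memory pools trust it.

```cpp
CommandBuffer* cmdb = /* recorded elsewhere */ nullptr;
FencePtr frameFence;
GrManager::getSingleton().submit(cmdb, {}, &frameFence, true /* flushAndSerialize */);
```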

+ 2 - 2
AnKi/Gr/RenderGraph.cpp

@@ -1270,7 +1270,7 @@ void RenderGraph::recordAndSubmitCommandBuffers(FencePtr* optionalFence)
 	const U32 firstGroupThatWroteToSwapchain2 = firstGroupThatWroteToSwapchain.getNonAtomically();
 	if(firstGroupThatWroteToSwapchain2 == 0 || firstGroupThatWroteToSwapchain2 == kMaxU32)
 	{
-		GrManager::getSingleton().submit(WeakArray(pCmdbs), {}, optionalFence);
+		GrManager::getSingleton().submit(WeakArray(pCmdbs), {}, optionalFence, true);
 	}
 	else
 	{
@@ -1279,7 +1279,7 @@ void RenderGraph::recordAndSubmitCommandBuffers(FencePtr* optionalFence)
 		GrManager::getSingleton().submit(WeakArray(pCmdbs).subrange(0, firstGroupThatWroteToSwapchain2), {}, nullptr);
 
 		GrManager::getSingleton().submit(
-			WeakArray(pCmdbs).subrange(firstGroupThatWroteToSwapchain2, batchGroupCount - firstGroupThatWroteToSwapchain2), {}, optionalFence);
+			WeakArray(pCmdbs).subrange(firstGroupThatWroteToSwapchain2, batchGroupCount - firstGroupThatWroteToSwapchain2), {}, optionalFence, true);
 	}
 }
 

+ 52 - 15
AnKi/Gr/Utils/SegregatedListsGpuMemoryPool.cpp

@@ -20,8 +20,7 @@ class SegregatedListsGpuMemoryPool::BuilderInterface
 public:
 	SegregatedListsGpuMemoryPool* m_parent = nullptr;
 
-	/// @name Interface methods
-	/// @{
+	// Interface methods
 	U32 getClassCount() const
 	{
 		return m_parent->m_classes.getSize();
@@ -46,7 +45,6 @@ public:
 	{
 		return 4;
 	}
-	/// @}
 };
 
 void SegregatedListsGpuMemoryPool::init(BufferUsageBit gpuBufferUsage, ConstWeakArray<PtrSize> classUpperSizes, PtrSize initialGpuBufferSize,
@@ -72,7 +70,6 @@ void SegregatedListsGpuMemoryPool::init(BufferUsageBit gpuBufferUsage, ConstWeak
 	m_builder = newInstance<Builder>(GrMemoryPool::getSingleton());
 	m_builder->getInterface().m_parent = this;
 
-	m_frame = 0;
 	m_allocatedSize = 0;
 	m_allowCoWs = allowCoWs;
 	m_mapAccess = map;
@@ -92,9 +89,19 @@ void SegregatedListsGpuMemoryPool::destroy()
 		m_gpuBuffer->unmap();
 	}
 
-	for(GrDynamicArray<SegregatedListsGpuMemoryPoolToken>& arr : m_garbage)
+	for(auto it = m_garbage.getBegin(); it != m_garbage.getEnd(); ++it)
 	{
-		for(const SegregatedListsGpuMemoryPoolToken& token : arr)
+		Garbage& garbage = *it;
+		if(it.getArrayIndex() != m_activeGarbage)
+		{
+			ANKI_CHECKF(garbage.m_fence->clientWait(kMaxSecond));
+		}
+		else
+		{
+			ANKI_ASSERT(!garbage.m_fence);
+		}
+
+		for(const SegregatedListsGpuMemoryPoolToken& token : garbage.m_tokens)
 		{
 			m_builder->free(static_cast<Chunk*>(token.m_chunk), token.m_chunkOffset, token.m_size);
 		}
@@ -237,30 +244,60 @@ void SegregatedListsGpuMemoryPool::deferredFree(SegregatedListsGpuMemoryPoolToke
 
 	{
 		LockGuard lock(m_lock);
-		m_garbage[m_frame].emplaceBack(token);
+
+		if(m_activeGarbage == kMaxU32)
+		{
+			m_activeGarbage = m_garbage.emplace().getArrayIndex();
+		}
+
+		m_garbage[m_activeGarbage].m_tokens.emplace(token);
 	}
 
 	token = {};
 }
 
-void SegregatedListsGpuMemoryPool::endFrame()
+void SegregatedListsGpuMemoryPool::endFrame(Fence* fence)
 {
+	ANKI_ASSERT(fence);
 	ANKI_ASSERT(isInitialized());
 
 	LockGuard lock(m_lock);
 
-	m_frame = (m_frame + 1) % kMaxFramesInFlight;
-
 	// Throw out the garbage
-	for(SegregatedListsGpuMemoryPoolToken& token : m_garbage[m_frame])
+	Array<U32, 8> garbageToDelete;
+	U32 garbageToDeleteCount = 0;
+	for(auto it = m_garbage.getBegin(); it != m_garbage.getEnd(); ++it)
 	{
-		m_builder->free(static_cast<Chunk*>(token.m_chunk), token.m_chunkOffset, token.m_size);
+		Garbage& garbage = *it;
 
-		ANKI_ASSERT(m_allocatedSize >= token.m_size);
-		m_allocatedSize -= token.m_size;
+		if(garbage.m_fence && garbage.m_fence->clientWait(0.0))
+		{
+			for(SegregatedListsGpuMemoryPoolToken token : garbage.m_tokens)
+			{
+				m_builder->free(static_cast<Chunk*>(token.m_chunk), token.m_chunkOffset, token.m_size);
+
+				ANKI_ASSERT(m_allocatedSize >= token.m_size);
+				m_allocatedSize -= token.m_size;
+			}
+
+			garbageToDelete[garbageToDeleteCount++] = it.getArrayIndex();
+		}
+	}
+
+	for(U32 i = 0; i < garbageToDeleteCount; ++i)
+	{
+		m_garbage.erase(garbageToDelete[i]);
 	}
 
-	m_garbage[m_frame].destroy();
+	// Set the new fence
+	if(m_activeGarbage != kMaxU32)
+	{
+		ANKI_ASSERT(m_garbage[m_activeGarbage].m_tokens.getSize());
+		ANKI_ASSERT(!m_garbage[m_activeGarbage].m_fence);
+		m_garbage[m_activeGarbage].m_fence.reset(fence);
+
+		m_activeGarbage = kMaxU32;
+	}
 }
 
 void SegregatedListsGpuMemoryPool::getStats(F32& externalFragmentation, PtrSize& userAllocatedSize, PtrSize& totalSize) const
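
The per-frame garbage array indexed by m_frame is gone. Instead, frees accumulate in an open bucket, endFrame() seals that bucket with the frame fence, and sealed buckets are reclaimed as their fences signal. The bucket lifecycle with standalone types (the engine uses GrBlockArray and FencePtr instead):

```cpp
#include <memory>
#include <vector>

struct Fence
{
	bool m_done = false;
	bool signaled() const { return m_done; }
};

struct Token
{
	// chunk pointer, offset and size in the real pool
};

class DeferredFreeList
{
public:
	void deferredFree(const Token& token)
	{
		m_open.push_back(token); // bucket stays open until endFrame()
	}

	void endFrame(std::shared_ptr<Fence> frameFence)
	{
		// Reclaim every sealed bucket whose GPU work has finished.
		std::erase_if(m_sealed, [](const Bucket& b) { return b.m_fence->signaled(); });

		// Seal this frame's bucket, if it has anything in it.
		if(!m_open.empty())
		{
			m_sealed.push_back({std::move(m_open), std::move(frameFence)});
			m_open.clear();
		}
	}

private:
	struct Bucket
	{
		std::vector<Token> m_tokens;
		std::shared_ptr<Fence> m_fence;
	};

	std::vector<Token> m_open;
	std::vector<Bucket> m_sealed;
};
```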

+ 24 - 20
AnKi/Gr/Utils/SegregatedListsGpuMemoryPool.h

@@ -6,21 +6,19 @@
 #pragma once
 
 #include <AnKi/Util/SegregatedListsAllocatorBuilder.h>
+#include <AnKi/Util/BlockArray.h>
 #include <AnKi/Gr/Buffer.h>
+#include <AnKi/Gr/Fence.h>
 
 namespace anki {
 
-/// @addtogroup graphics
-/// @{
-
-/// The result of an allocation of SegregatedListsGpuMemoryPool.
-/// @memberof SegregatedListsGpuMemoryPool
+// The result of an allocation of SegregatedListsGpuMemoryPool.
 class SegregatedListsGpuMemoryPoolToken
 {
 	friend class SegregatedListsGpuMemoryPool;
 
 public:
-	/// The offset in the SegregatedListsGpuMemoryPoolToken::getBuffer() buffer.
+	// The offset in the SegregatedListsGpuMemoryPoolToken::getBuffer() buffer.
 	PtrSize m_offset = kMaxPtrSize;
 
 	PtrSize m_size = kMaxPtrSize;
@@ -40,8 +38,8 @@ private:
 	PtrSize m_chunkOffset = kMaxPtrSize;
 };
 
-/// GPU memory allocator based on segregated lists. It allocates a GPU buffer with some initial size. If there is a need to grow it allocates a bigger
-/// buffer and copies contents of the old one to the new (CoW).
+// GPU memory allocator based on segregated lists. It allocates a GPU buffer with some initial size. If there is a need to grow it allocates a bigger
+// buffer and copies contents of the old one to the new (CoW).
 class SegregatedListsGpuMemoryPool
 {
 public:
@@ -61,19 +59,19 @@ public:
 
 	void destroy();
 
-	/// Allocate memory.
-	/// @note It's thread-safe.
+	// Allocate memory.
+	// It's thread-safe.
 	void allocate(PtrSize size, U32 alignment, SegregatedListsGpuMemoryPoolToken& token);
 
-	/// Free memory a few frames down the line.
-	/// @note It's thread-safe.
+	// Free memory a few frames down the line.
+	// It's thread-safe.
 	void deferredFree(SegregatedListsGpuMemoryPoolToken& token);
 
-	/// @note It's thread-safe.
-	void endFrame();
+	// It's thread-safe.
+	void endFrame(Fence* fence);
 
-	/// Need to be checking this constantly to get the updated buffer in case of CoWs.
-	/// @note It's not thread-safe.
+	// Needs to be checked constantly to get the updated buffer in case of CoWs.
+	// It's not thread-safe.
 	Buffer& getGpuBuffer() const
 	{
 		ANKI_ASSERT(m_gpuBuffer.isCreated() && "The buffer hasn't been created yet");
@@ -86,7 +84,7 @@ public:
 		return m_mappedGpuBufferMemory;
 	}
 
-	/// @note It's thread-safe.
+	// It's thread-safe.
 	void getStats(F32& externalFragmentation, PtrSize& userAllocatedSize, PtrSize& totalSize) const;
 
 private:
@@ -108,8 +106,15 @@ private:
 
 	GrDynamicArray<Chunk*> m_deletedChunks;
 
-	Array<GrDynamicArray<SegregatedListsGpuMemoryPoolToken>, kMaxFramesInFlight> m_garbage;
-	U8 m_frame = 0;
+	class Garbage
+	{
+	public:
+		GrBlockArray<SegregatedListsGpuMemoryPoolToken, BlockArrayConfig<16>> m_tokens;
+		FencePtr m_fence;
+	};
+
+	GrBlockArray<Garbage, BlockArrayConfig<8>> m_garbage;
+	U32 m_activeGarbage = kMaxU32;
 	Bool m_allowCoWs = true;
 
 	BufferMapAccessBit m_mapAccess = BufferMapAccessBit::kNone;
@@ -122,6 +127,5 @@ private:
 		return m_bufferUsage != BufferUsageBit::kNone;
 	}
 };
-/// @}
 
 } // end namespace anki

+ 17 - 0
AnKi/Gr/Vulkan/VkCommandBuffer.cpp

@@ -1344,4 +1344,21 @@ void CommandBufferImpl::traceRaysInternal(const BufferView& sbtBuffer, U32 sbtRe
 	}
 }
 
+void CommandBufferImpl::setFullPipelineBarrier()
+{
+	ANKI_TRACE_FUNCTION();
+	ANKI_VK_SELF(CommandBufferImpl);
+	self.commandCommon();
+
+	VkMemoryBarrier barr = {};
+	barr.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+	barr.srcAccessMask = barr.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
+	barr.dstAccessMask |= VK_ACCESS_HOST_READ_BIT;
+
+	vkCmdPipelineBarrier(self.m_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT | VK_PIPELINE_STAGE_HOST_BIT, 0, 1,
+						 &barr, 0, nullptr, 0, nullptr);
+
+	ANKI_TRACE_INC_COUNTER(VkBarrier, 1);
+}
+
 } // end namespace anki

+ 2 - 0
AnKi/Gr/Vulkan/VkCommandBuffer.h

@@ -88,6 +88,8 @@ public:
 	}
 #endif
 
+	void setFullPipelineBarrier();
+
 private:
 	StackMemoryPool m_pool;
 

+ 3 - 1
AnKi/Gr/Vulkan/VkCommandBufferFactory.cpp

@@ -10,7 +10,8 @@
 
 namespace anki {
 
-ANKI_SVAR(CommandBufferCount, StatCategory::kGr, "CommandBufferCount", StatFlag::kNone)
+ANKI_SVAR(CommandBufferCount, StatCategory::kGr, "Cmdb count", StatFlag::kNone)
+ANKI_SVAR(CommandBuffersCreated, StatCategory::kGr, "Cmdbs created", StatFlag::kNone)
 
 MicroCommandBuffer::~MicroCommandBuffer()
 {
@@ -111,6 +112,7 @@ Error CommandBufferThreadAllocator::newCommandBuffer(CommandBufferFlag cmdbFlags
 
 		ANKI_TRACE_INC_COUNTER(VkCommandBufferCreate, 1);
 		g_svarCommandBufferCount.increment(1_U64);
+		g_svarCommandBuffersCreated.increment(1_U64);
 		VkCommandBuffer cmdb;
 		ANKI_VK_CHECK(vkAllocateCommandBuffers(getVkDevice(), &ci, &cmdb));
 

+ 5 - 0
AnKi/Gr/Vulkan/VkCommandBufferFactory.h

@@ -50,6 +50,11 @@ public:
 		return m_refcount.load();
 	}
 
+	Bool canRecycle() const
+	{
+		return true;
+	}
+
 	VkCommandBuffer getHandle() const
 	{
 		ANKI_ASSERT(m_handle);

+ 17 - 3
AnKi/Gr/Vulkan/VkFenceFactory.cpp

@@ -8,10 +8,24 @@
 
 namespace anki {
 
-void MicroFenceImpl::setName(CString name) const
+MicroFencePtr FenceFactory::newInstance(CString name)
 {
-	ANKI_ASSERT(m_handle);
-	getGrManagerImpl().trySetVulkanHandleName(name, VK_OBJECT_TYPE_FENCE, m_handle);
+	MicroFence* fence = m_recycler.findToReuse();
+
+	if(fence == nullptr)
+	{
+		fence = newInstance<MicroFence>(GrMemoryPool::getSingleton());
+	}
+	else
+	{
+		fence->reset();
+	}
+
+	ANKI_ASSERT(fence->getRefcount() == 0);
+
+	getGrManagerImpl().trySetVulkanHandleName(name, VK_OBJECT_TYPE_FENCE, fence->getHandle());
+
+	return MicroFencePtr(fence);
 }
 
 } // end namespace anki
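
newInstance() is now a recycler cache: reuse a signaled fence (resetting it back to the unsignaled state) when one is available, and create a VkFence only on a miss. The acquire path in generic form, as a standalone sketch:

```cpp
// TRecycler::findToReuse() is assumed to return only objects whose
// canRecycle() is true, or nullptr on a cache miss (as above).
template<typename T, typename TRecycler>
T* acquire(TRecycler& recycler)
{
	T* obj = recycler.findToReuse();
	if(obj == nullptr)
	{
		obj = new T(); // miss: pay for a fresh object
	}
	else
	{
		obj->reset(); // hit: return the object to its initial state
	}
	return obj;
}
```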

+ 58 - 51
AnKi/Gr/Vulkan/VkFenceFactory.h

@@ -6,33 +6,58 @@
 #pragma once
 
 #include <AnKi/Gr/Vulkan/VkCommon.h>
-#include <AnKi/Gr/BackendCommon/MicroFenceFactory.h>
+#include <AnKi/Gr/BackendCommon/MicroObjectRecycler.h>
 #include <AnKi/Util/Tracer.h>
+#include <AnKi/Core/StatsSet.h>
 
 namespace anki {
 
-/// @addtogroup vulkan
-/// @{
+ANKI_SVAR(FenceCount2, StatCategory::kGr, "Fence count", StatFlag::kNone)
+ANKI_SVAR(FencesCreated, StatCategory::kGr, "Fences created", StatFlag::kNone)
 
-/// Fence wrapper over VkFence.
-class MicroFenceImpl
+// Fence wrapper over VkFence.
+class MicroFence
 {
 public:
-	VkFence m_handle = VK_NULL_HANDLE;
+	MicroFence()
+	{
+		VkFenceCreateInfo ci = {};
+		ci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+		ANKI_VK_CHECKF(vkCreateFence(getVkDevice(), &ci, nullptr, &m_handle));
+		g_svarFenceCount2.increment(1u);
+		g_svarFencesCreated.increment(1u);
+	}
+
+	~MicroFence()
+	{
+		if(m_handle)
+		{
+			vkDestroyFence(getVkDevice(), m_handle, nullptr);
+			g_svarFenceCount2.decrement(1u);
+		}
+	}
+
+	void retain() const
+	{
+		m_refcount.fetchAdd(1);
+	}
 
-	~MicroFenceImpl()
+	void release();
+
+	I32 getRefcount() const
 	{
-		ANKI_ASSERT(!m_handle);
+		return m_refcount.load();
 	}
 
-	operator Bool() const
+	Bool canRecycle() const
 	{
-		return m_handle != 0;
+		return signaled();
 	}
 
-	Bool clientWait(Second seconds)
+	Bool clientWait(Second seconds) const
 	{
 		ANKI_ASSERT(m_handle);
+		seconds = min<Second>(seconds, g_cvarGrGpuTimeout);
 		const F64 nsf = 1e+9 * seconds;
 		const U64 ns = U64(nsf);
 		VkResult res;
@@ -41,7 +66,7 @@ public:
 		return res != VK_TIMEOUT;
 	}
 
-	Bool signaled()
+	Bool signaled() const
 	{
 		ANKI_ASSERT(m_handle);
 		VkResult status;
@@ -49,68 +74,50 @@ public:
 		return status == VK_SUCCESS;
 	}
 
-	void create()
-	{
-		ANKI_ASSERT(!m_handle);
-		VkFenceCreateInfo ci = {};
-		ci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
-		ANKI_VK_CHECKF(vkCreateFence(getVkDevice(), &ci, nullptr, &m_handle));
-	}
-
-	void destroy()
+	void reset() const
 	{
 		ANKI_ASSERT(m_handle);
-		vkDestroyFence(getVkDevice(), m_handle, nullptr);
-		m_handle = 0;
+		ANKI_VK_CHECKF(vkResetFences(getVkDevice(), 1, &m_handle));
 	}
 
-	void reset()
+	VkFence getHandle() const
 	{
 		ANKI_ASSERT(m_handle);
-		ANKI_VK_CHECKF(vkResetFences(getVkDevice(), 1, &m_handle));
+		return m_handle;
 	}
 
-	void setName(CString name) const;
-};
-
-using VulkanMicroFence = MicroFence<MicroFenceImpl>;
-
-/// Deleter for FencePtr.
-class MicroFencePtrDeleter
-{
-public:
-	void operator()(VulkanMicroFence* fence);
+private:
+	VkFence m_handle = VK_NULL_HANDLE;
+	mutable Atomic<I32> m_refcount = {0};
 };
 
-/// Fence smart pointer.
-using MicroFencePtr = IntrusivePtr<VulkanMicroFence, MicroFencePtrDeleter>;
+// Fence smart pointer.
+using MicroFencePtr = IntrusiveNoDelPtr<MicroFence>;
 
-/// A factory of fences.
+// A factory of fences.
 class FenceFactory : public MakeSingleton<FenceFactory>
 {
-	friend class MicroFencePtrDeleter;
+	friend class MicroFence;
 
 public:
-	/// Create a new fence pointer.
-	MicroFencePtr newInstance(CString name = "unnamed")
-	{
-		return MicroFencePtr(m_factory.newFence(name));
-	}
+	// Create a new fence pointer.
+	MicroFencePtr newInstance(CString name = "unnamed");
 
 private:
-	MicroFenceFactory<MicroFenceImpl> m_factory;
+	MicroObjectRecycler<MicroFence> m_recycler;
 
-	void deleteFence(VulkanMicroFence* fence)
+	void releaseFence(MicroFence* fence)
 	{
-		m_factory.releaseFence(fence);
+		m_recycler.recycle(fence);
 	}
 };
 
-inline void MicroFencePtrDeleter::operator()(VulkanMicroFence* fence)
+inline void MicroFence::release()
 {
-	ANKI_ASSERT(fence);
-	FenceFactory::getSingleton().deleteFence(fence);
+	if(m_refcount.fetchSub(1) == 1)
+	{
+		FenceFactory::getSingleton().releaseFence(this);
+	}
 }
-/// @}
 
 } // end namespace anki
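
With the deleter class gone, the refcount lives in MicroFence itself and release() recycles instead of deleting. The last-owner test relies on fetchSub() returning the value before the decrement, so seeing 1 means this call just dropped the count to zero. In isolation:

```cpp
#include <atomic>

struct Recyclable
{
	mutable std::atomic<int> m_refcount{0};

	void retain() const
	{
		m_refcount.fetch_add(1, std::memory_order_relaxed);
	}

	void release()
	{
		if(m_refcount.fetch_sub(1, std::memory_order_acq_rel) == 1)
		{
			// Last reference gone: hand the object back to a recycler
			// (so its VkFence can be reused) instead of deleting it.
		}
	}
};
```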

+ 36 - 9
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -143,10 +143,10 @@ ANKI_NEW_GR_OBJECT(GrUpscaler)
 #undef ANKI_NEW_GR_OBJECT
 #undef ANKI_NEW_GR_OBJECT_NO_INIT_INFO
 
-void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence)
+void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, Bool flushAndSerialize)
 {
 	ANKI_VK_SELF(GrManagerImpl);
-	self.submitInternal(cmdbs, waitFences, signalFence);
+	self.submitInternal(cmdbs, waitFences, signalFence, flushAndSerialize);
 }
 
 PtrSize GrManager::getAccelerationStructureMemoryRequirement(const AccelerationStructureInitInfo& init) const
@@ -1146,7 +1146,7 @@ Error GrManagerImpl::initDevice()
 	if(pureAsyncCompute)
 	{
 		vkGetDeviceQueue(m_device, m_queueFamilyIndices[GpuQueueType::kCompute], 0, &m_queues[GpuQueueType::kCompute]);
-		trySetVulkanHandleName("AsyncCompute", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kGeneral]);
+		trySetVulkanHandleName("AsyncCompute", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kCompute]);
 	}
 	else if(lowPriorityQueueAsyncCompute)
 	{
@@ -1274,8 +1274,8 @@ TexturePtr GrManagerImpl::acquireNextPresentableTexture()
 
 	// Get new image
 	uint32_t imageIdx;
-	const VkResult res = vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(),
-											   fence->getImplementation().m_handle, &imageIdx);
+	const VkResult res =
+		vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(), fence->getHandle(), &imageIdx);
 
 	if(res == VK_ERROR_OUT_OF_DATE_KHR)
 	{
@@ -1291,8 +1291,8 @@ TexturePtr GrManagerImpl::acquireNextPresentableTexture()
 		m_crntSwapchain = SwapchainFactory::getSingleton().newInstance();
 
 		// Can't fail a second time
-		ANKI_VK_CHECKF(vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(),
-											 fence->getImplementation().m_handle, &imageIdx));
+		ANKI_VK_CHECKF(
+			vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(), fence->getHandle(), &imageIdx));
 	}
 	else
 	{
@@ -1357,7 +1357,7 @@ void GrManagerImpl::endFrameInternal()
 	GpuMemoryManager::getSingleton().updateStats();
 }
 
-void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence)
+void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, Bool flushAndSerialize)
 {
 	// First thing, create a fence
 	MicroFencePtr fence = FenceFactory::getSingleton().newInstance("Submit");
@@ -1390,6 +1390,33 @@ void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fe
 		vkCmdbs.emplaceBack(cmdb.getHandle());
 	}
 
+	// Create the flush command buffer
+	CommandBufferPtr flushCmdb;
+	if(flushAndSerialize)
+	{
+		CommandBufferInitInfo cmdbInit("Flush");
+		cmdbInit.m_flags = CommandBufferFlag::kSmallBatch;
+		if(queueType == GpuQueueType::kCompute)
+		{
+			cmdbInit.m_flags |= CommandBufferFlag::kComputeWork;
+		}
+		else
+		{
+			cmdbInit.m_flags |= CommandBufferFlag::kGeneralWork;
+		}
+
+		flushCmdb = newCommandBuffer(cmdbInit);
+		CommandBufferImpl& impl = static_cast<CommandBufferImpl&>(*flushCmdb);
+		impl.setFullPipelineBarrier();
+		flushCmdb->endRecording();
+
+#if ANKI_ASSERTIONS_ENABLED
+		impl.setSubmitted();
+#endif
+
+		vkCmdbs.emplaceBack(impl.getHandle());
+	}
+
 	// Gather wait semaphores
 	GrDynamicArray<VkSemaphore> waitSemaphores;
 	GrDynamicArray<VkPipelineStageFlags> waitStages;
@@ -1488,7 +1515,7 @@ void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fe
 		appendPNextList(submit, &timelineInfo);
 
 		ANKI_TRACE_SCOPED_EVENT(VkQueueSubmit);
-		ANKI_VK_CHECKF(vkQueueSubmit(m_queues[queueType], 1, &submit, fence->getImplementation().m_handle));
+		ANKI_VK_CHECKF(vkQueueSubmit(m_queues[queueType], 1, &submit, fence->getHandle()));
 	}
 }
 

+ 2 - 1
AnKi/Gr/Vulkan/VkGrManager.h

@@ -11,6 +11,7 @@
 #include <AnKi/Gr/Vulkan/VkFenceFactory.h>
 #include <AnKi/Gr/Vulkan/VkSwapchainFactory.h>
 #include <AnKi/Util/File.h>
+#include <AnKi/Util/BlockArray.h>
 
 namespace anki {
 
@@ -86,7 +87,7 @@ public:
 
 	void endFrameInternal();
 
-	void submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence);
+	void submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, Bool flushAndSerialize);
 
 	void finishInternal();
 

+ 2 - 0
AnKi/Gr/Vulkan/VkSemaphoreFactory.cpp

@@ -11,6 +11,7 @@
 namespace anki {
 
 ANKI_SVAR(SemaphoreCount, StatCategory::kGr, "Semaphore count", StatFlag::kNone)
+ANKI_SVAR(SemaphoresCreated, StatCategory::kGr, "Semaphores created", StatFlag::kNone)
 
 MicroSemaphore::MicroSemaphore(Bool isTimeline)
 	: m_isTimeline(isTimeline)
@@ -27,6 +28,7 @@ MicroSemaphore::MicroSemaphore(Bool isTimeline)
 	ANKI_VK_CHECKF(vkCreateSemaphore(getVkDevice(), &ci, nullptr, &m_handle));
 	ANKI_TRACE_INC_COUNTER(VkSemaphoreCreate, 1);
 	g_svarSemaphoreCount.increment(1u);
+	g_svarSemaphoresCreated.increment(1u);
 }
 
 MicroSemaphore::~MicroSemaphore()

+ 5 - 0
AnKi/Gr/Vulkan/VkSemaphoreFactory.h

@@ -46,6 +46,11 @@ public:
 		return m_refcount.load();
 	}
 
+	Bool canRecycle() const
+	{
+		return true;
+	}
+
 	Bool clientWait(Second seconds);
 
 	Bool isTimeline() const

+ 5 - 0
AnKi/Gr/Vulkan/VkSwapchainFactory.h

@@ -46,6 +46,11 @@ public:
 		return m_refcount.load();
 	}
 
+	Bool canRecycle() const
+	{
+		return true;
+	}
+
 private:
 	mutable Atomic<I32> m_refcount = {0};
 

+ 1 - 2
AnKi/Renderer/Renderer.cpp

@@ -806,7 +806,7 @@ void Renderer::updatePipelineStats()
 }
 #endif
 
-Error Renderer::render()
+Error Renderer::render(FencePtr& fence)
 {
 	ANKI_TRACE_SCOPED_EVENT(Render);
 
@@ -925,7 +925,6 @@ Error Renderer::render()
 	m_rgraph->compileNewGraph(ctx.m_renderGraphDescr, m_framePool);
 
 	// Flush
-	FencePtr fence;
 	m_rgraph->recordAndSubmitCommandBuffers(&fence);
 
 	// Misc

+ 1 - 1
AnKi/Renderer/Renderer.h

@@ -93,7 +93,7 @@ public:
 
 	Error init(const RendererInitInfo& inf);
 
-	Error render();
+	Error render(FencePtr& fence);
 
 #define ANKI_RENDERER_OBJECT_DEF(type, name, initCondition) \
 	type& get##type() \

+ 1 - 1
AnKi/Scene/SceneNode.cpp

@@ -44,7 +44,7 @@ namespace anki {
 		compInit.m_node = this; \
 		compInit.m_componentUuid = SceneGraph::getSingleton().m_scenes[m_sceneIndex].getNewNodeUuid(); \
 		compInit.m_sceneUuid = m_sceneUuid; \
-		auto it = SceneGraph::getSingleton().getComponentArrays().get##name##s().emplace(compInit); \
+		auto it = SceneGraph::getSingleton().m_componentArrays.get##name##s().emplace(compInit); \
 		it->setArrayIndex(it.getArrayIndex()); \
 		addComponent(&(*it)); \
 		return &(*it); \

+ 24 - 0
AnKi/Util/BitSet.h

@@ -304,6 +304,30 @@ public:
 		return *this;
 	}
 
+	template<typename TFunc>
+	FunctorContinue iterateSetBitsFromLeastSignificant(TFunc func) const
+	{
+		for(U32 i = 0; i < kChunkCount; ++i)
+		{
+			ChunkType bits = m_chunks[i];
+			while(bits)
+			{
+				const U32 lsb = U32(std::countr_zero(bits));
+				const U32 bitIdx = lsb + (i * kChunkBitCount);
+
+				const FunctorContinue cont = func(bitIdx);
+				if(cont == FunctorContinue::kStop)
+				{
+					return cont;
+				}
+
+				bits &= ~(ChunkType(1) << ChunkType(lsb));
+			}
+		}
+
+		return FunctorContinue::kContinue;
+	}
+
 private:
 	Array<ChunkType, kChunkCount> m_chunks;
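
A usage sketch of the new iterator, in the shape the ReBAR pool uses above: the functor receives each set bit's index from least significant upwards and can stop the walk early by returning FunctorContinue::kStop.

```cpp
BitSet<8, U32> mask = {false};
mask.set(0);
mask.set(3);

mask.iterateSetBitsFromLeastSignificant([](U32 bitIdx) {
	// Visited with bitIdx == 0, then bitIdx == 3
	return FunctorContinue::kContinue;
});
```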