
Rely less on kMaxFramesInFlight

Panagiotis Christopoulos Charitos 1 week ago
parent
commit
b2d782582b

+ 6 - 5
AnKi/Core/App.cpp

@@ -449,7 +449,8 @@ Error App::mainLoop()
 			SceneGraph::getSingleton().update(prevUpdateTime, crntTime);
 			GpuSceneMicroPatcher::getSingleton().endPatching();
 
-			ANKI_CHECK(Renderer::getSingleton().render());
+			FencePtr renderFence;
+			ANKI_CHECK(Renderer::getSingleton().render(renderFence));
 
 			// If we get stats exclude the time of GR because it forces some GPU-CPU serialization. We don't want to count that
 			Second grTime = 0.0;
@@ -465,11 +466,11 @@ Error App::mainLoop()
 				grTime = HighRezTimer::getCurrentTime() - grTime;
 			}
 
-			RebarTransientMemoryPool::getSingleton().endFrame();
-			UnifiedGeometryBuffer::getSingleton().endFrame();
-			GpuSceneBuffer::getSingleton().endFrame();
+			RebarTransientMemoryPool::getSingleton().endFrame(renderFence.get());
+			UnifiedGeometryBuffer::getSingleton().endFrame(renderFence.get());
+			GpuSceneBuffer::getSingleton().endFrame(renderFence.get());
 			GpuVisibleTransientMemoryPool::getSingleton().endFrame();
-			GpuReadbackMemoryPool::getSingleton().endFrame();
+			GpuReadbackMemoryPool::getSingleton().endFrame(renderFence.get());
 
 			// Sleep
 			const Second endTime = HighRezTimer::getCurrentTime();
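
The loop above is the heart of the commit: render() now returns the frame's fence and each transient pool's endFrame() takes it, so reclamation is driven by actual GPU progress instead of a fixed kMaxFramesInFlight count. A minimal standalone sketch of that pattern (hypothetical Fence and pool types, not the engine's API):

```cpp
#include <memory>
#include <vector>

struct Fence
{
	bool m_done = false;
	bool signaled() const { return m_done; }
};

class TransientPool
{
public:
	void endFrame(std::shared_ptr<Fence> frameFence)
	{
		// Retire every older frame's memory whose fence has signaled...
		std::erase_if(m_inFlight, [](const Range& r) { return r.m_fence->signaled(); });

		// ...and tag this frame's allocations with the new fence.
		m_inFlight.push_back({std::move(frameFence)});
	}

private:
	struct Range
	{
		std::shared_ptr<Fence> m_fence; // plus offsets/sizes in a real pool
	};
	std::vector<Range> m_inFlight;
};
```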

+ 2 - 2
AnKi/GpuMemory/GpuReadbackMemoryPool.cpp

@@ -42,9 +42,9 @@ void GpuReadbackMemoryPool::deferredFree(GpuReadbackMemoryAllocation& allocation
 	::new(&allocation) GpuReadbackMemoryAllocation();
 }
 
-void GpuReadbackMemoryPool::endFrame()
+void GpuReadbackMemoryPool::endFrame(Fence* fence)
 {
-	m_pool.endFrame();
+	m_pool.endFrame(fence);
 }
 
 } // end namespace anki

+ 1 - 1
AnKi/GpuMemory/GpuReadbackMemoryPool.h

@@ -97,7 +97,7 @@ public:
 	// Thread-safe
 	void deferredFree(GpuReadbackMemoryAllocation& allocation);
 
-	void endFrame();
+	void endFrame(Fence* fence);
 
 private:
 	SegregatedListsGpuMemoryPool m_pool;

+ 2 - 2
AnKi/GpuMemory/GpuSceneBuffer.h

@@ -105,9 +105,9 @@ public:
 		m_pool.deferredFree(alloc.m_token);
 	}
 
-	void endFrame()
+	void endFrame(Fence* fence)
 	{
-		m_pool.endFrame();
+		m_pool.endFrame(fence);
 #if ANKI_STATS_ENABLED
 		updateStats();
 #endif

+ 2 - 6
AnKi/GpuMemory/GpuVisibleTransientMemoryPool.cpp

@@ -14,12 +14,8 @@ void GpuVisibleTransientMemoryPool::endFrame()
 {
 	g_svarGpuVisibleTransientMemory.set(m_pool.getAllocatedMemory());
 
-	if(m_frame == 0)
-	{
-		m_pool.reset();
-	}
-
-	m_frame = (m_frame + 1) % kMaxFramesInFlight;
+	// This is GPU-only memory, so the next frame can start reusing it immediately
+	m_pool.reset();
 }
 
 } // end namespace anki
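
Dropping the round-robin here rests on the comment's claim: this pool is GPU-only memory that the CPU never touches, so the whole stack can rewind every frame. A standalone sketch of such a per-frame stack (linear) allocator, under that assumption:

```cpp
#include <cstddef>

class FrameStackPool
{
public:
	// Bump-allocate; alignment must be a power of two.
	std::size_t allocate(std::size_t size, std::size_t alignment)
	{
		const std::size_t offset = (m_top + alignment - 1) & ~(alignment - 1);
		m_top = offset + size;
		return offset;
	}

	// O(1) reclamation of everything allocated this frame.
	void reset() { m_top = 0; }

private:
	std::size_t m_top = 0;
};
```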

+ 0 - 1
AnKi/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -45,7 +45,6 @@ public:
 
 private:
 	StackGpuMemoryPool m_pool;
-	U32 m_frame = 0;
 	U32 m_structuredBufferAlignment = kMaxU32;
 
 	GpuVisibleTransientMemoryPool()

+ 151 - 6
AnKi/GpuMemory/RebarTransientMemoryPool.cpp

@@ -7,6 +7,7 @@
 #include <AnKi/Util/Tracer.h>
 #include <AnKi/Gr/GrManager.h>
 #include <AnKi/Gr/Buffer.h>
+#include <AnKi/Gr/Fence.h>
 
 namespace anki {
 
@@ -38,6 +39,11 @@ void RebarTransientMemoryPool::init()
 	}
 
 	m_mappedMem = static_cast<U8*>(m_buffer->map(0, kMaxPtrSize, BufferMapAccessBit::kWrite));
+
+	// Create the slice of the 1st frame
+	m_activeSliceMask.set(0);
+	m_slices[0].m_offset = 0;
+	m_crntActiveSlice = 0;
 }
 
 BufferView RebarTransientMemoryPool::allocateInternal(PtrSize origSize, U32 alignment, void*& mappedMem)
@@ -52,32 +58,171 @@ BufferView RebarTransientMemoryPool::allocateInternal(PtrSize origSize, U32 alig
 	do
 	{
 		offset = m_offset.fetchAdd(size) % m_bufferSize;
-		const PtrSize end = (offset + size) % (m_bufferSize + 1);
+		const PtrSize end = (offset + size) % m_bufferSize;
 
 		done = offset < end;
 	} while(!done);
 
+	// Wait for the range that contains the offset
+	m_activeSliceMask.iterateSetBitsFromLeastSignificant([&](U32 sliceIdx) {
+		if(sliceIdx == m_crntActiveSlice)
+		{
+			return FunctorContinue::kContinue;
+		}
+
+		const FrameSlice& slice = m_slices[sliceIdx];
+
+		Bool overlaps;
+		if(offset <= slice.m_offset)
+		{
+			overlaps = offset + size > slice.m_offset;
+		}
+		else
+		{
+			overlaps = slice.m_offset + slice.m_range > offset;
+		}
+
+		if(overlaps)
+		{
+			ANKI_CORE_LOGW("ReBAR has to wait for a fence. This means that the ReBAR buffer is not big enough. Increase the %s CVAR",
+						   g_cvarCoreRebarGpuMemorySize.getName().cstr());
+
+			if(!m_sliceFences[sliceIdx]->clientWait(kMaxSecond))
+			{
+				ANKI_CORE_LOGF("Timeout detected");
+			}
+		}
+
+		return FunctorContinue::kContinue;
+	});
+
 	const PtrSize alignedOffset = getAlignedRoundUp(alignment, offset);
 	ANKI_ASSERT(alignedOffset + origSize <= offset + size);
+	ANKI_ASSERT(offset + size <= m_bufferSize);
 
 	mappedMem = m_mappedMem + alignedOffset;
 	return BufferView(m_buffer.get(), alignedOffset, origSize);
 }
 
-void RebarTransientMemoryPool::endFrame()
+void RebarTransientMemoryPool::endFrame(Fence* fence)
 {
+	// Free up previous slices
+	m_activeSliceMask.iterateSetBitsFromLeastSignificant([&](U32 sliceIdx) {
+		if(sliceIdx != m_crntActiveSlice)
+		{
+			if(m_sliceFences[sliceIdx]->signaled())
+			{
+				m_sliceFences[sliceIdx].reset(nullptr);
+				m_slices[sliceIdx] = {};
+				m_activeSliceMask.unset(sliceIdx);
+			}
+		}
+
+		return FunctorContinue::kContinue;
+	});
+
+	// Finalize the active slice
 	const PtrSize crntOffset = m_offset.getNonAtomically();
+	const PtrSize crntNormalizedOffset = crntOffset % m_bufferSize;
+
+	FrameSlice& slice = m_slices[m_crntActiveSlice];
+	ANKI_ASSERT(slice.m_offset < kMaxPtrSize && slice.m_range == 0 && !m_sliceFences[m_crntActiveSlice]);
+
+	ANKI_ASSERT(crntOffset >= slice.m_offset);
+	const PtrSize range = crntOffset - slice.m_offset;
 
-	const PtrSize usedMemory = crntOffset - m_previousFrameEndOffset;
-	m_previousFrameEndOffset = crntOffset;
+	if(range == 0)
+	{
+		// No allocations this frame, remove the slice
 
-	if(usedMemory >= PtrSize(0.8 * F64(m_bufferSize / kMaxFramesInFlight)))
+		slice = {};
+		m_activeSliceMask.unset(m_crntActiveSlice);
+	}
+	else if((slice.m_offset % m_bufferSize) + range > m_bufferSize)
+	{
+		// The frame we are ending wrapped around the ReBAR buffer, create two slices
+
+		slice.m_offset = slice.m_offset % m_bufferSize;
+		slice.m_range = m_bufferSize - slice.m_offset;
+		m_sliceFences[m_crntActiveSlice].reset(fence);
+
+		const U32 secondSliceIdx = (~m_activeSliceMask).getLeastSignificantBit();
+		m_slices[secondSliceIdx].m_offset = 0;
+		m_slices[secondSliceIdx].m_range = range - slice.m_range;
+		ANKI_ASSERT(crntNormalizedOffset == m_slices[secondSliceIdx].m_range);
+		m_sliceFences[secondSliceIdx].reset(fence);
+		m_activeSliceMask.set(secondSliceIdx);
+	}
+	else
 	{
-		ANKI_CORE_LOGW("Frame used more that 80%% of its safe limit of ReBAR memory");
+		// No wrapping, just finalize the active slice
+
+		slice.m_offset = slice.m_offset % m_bufferSize;
+		slice.m_range = range;
+		m_sliceFences[m_crntActiveSlice].reset(fence);
 	}
 
+	// Create a new active slice
+	const U32 newSliceIdx = (~m_activeSliceMask).getLeastSignificantBit();
+	m_activeSliceMask.set(newSliceIdx);
+	m_slices[newSliceIdx].m_offset = crntOffset;
+	m_crntActiveSlice = newSliceIdx;
+
+	validateSlices();
+
+	// Stats
+	const PtrSize usedMemory = range;
 	ANKI_TRACE_INC_COUNTER(ReBarUsedMemory, usedMemory);
 	g_svarRebarUserMemory.set(usedMemory);
 }
 
+void RebarTransientMemoryPool::validateSlices() const
+{
+	for(U32 sliceIdxA = 0; sliceIdxA < kSliceCount; ++sliceIdxA)
+	{
+		if(sliceIdxA == m_crntActiveSlice)
+		{
+			ANKI_ASSERT(m_activeSliceMask.get(sliceIdxA));
+			ANKI_ASSERT(m_slices[sliceIdxA].m_offset < kMaxPtrSize && m_slices[sliceIdxA].m_range == 0 && !m_sliceFences[sliceIdxA]);
+		}
+		else if(m_activeSliceMask.get(sliceIdxA))
+		{
+			const FrameSlice& a = m_slices[sliceIdxA];
+			ANKI_ASSERT(a.m_offset < kMaxPtrSize && a.m_range > 0 && a.m_offset + a.m_range <= m_bufferSize);
+			ANKI_ASSERT(!!m_sliceFences[sliceIdxA]);
+
+			m_activeSliceMask.iterateSetBitsFromLeastSignificant([&](U32 sliceIdxB) {
+				if(sliceIdxA == sliceIdxB || sliceIdxB == m_crntActiveSlice)
+				{
+					return FunctorContinue::kContinue;
+				}
+
+				const FrameSlice& b = m_slices[sliceIdxB];
+
+				if(a.m_offset < b.m_offset)
+				{
+					ANKI_ASSERT(a.m_offset + a.m_range <= b.m_offset);
+				}
+				else if(b.m_offset < a.m_offset)
+				{
+					ANKI_ASSERT(b.m_offset + b.m_range <= a.m_offset);
+				}
+				else
+				{
+					ANKI_ASSERT(0 && "Offsets can't be equal");
+				}
+
+				ANKI_ASSERT(m_sliceFences[sliceIdxA] && m_sliceFences[sliceIdxB]);
+
+				return FunctorContinue::kContinue;
+			});
+		}
+		else
+		{
+			const FrameSlice& a = m_slices[sliceIdxA];
+			ANKI_ASSERT(a.m_offset == kMaxPtrSize && a.m_range == 0 && !m_sliceFences[sliceIdxA]);
+		}
+	}
+}
+
 } // end namespace anki
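
The slice machinery above turns the ReBAR buffer into a fence-guarded ring: each in-flight frame owns a [offset, offset + range) window (two windows when the frame wraps), and an allocation only blocks when it would overrun a window whose fence has not signaled yet. The overlap test in isolation, as a standalone sketch with plain types:

```cpp
#include <cstddef>

struct Slice
{
	std::size_t m_offset; // start of the window, normalized into [0, bufferSize)
	std::size_t m_range; // length of the window
};

// True if [allocOffset, allocOffset + allocSize) intersects the slice.
// Mirrors the two-sided check in allocateInternal(): whichever range starts
// first must end before the other one begins, or they overlap.
inline bool overlaps(std::size_t allocOffset, std::size_t allocSize, const Slice& slice)
{
	if(allocOffset <= slice.m_offset)
	{
		return allocOffset + allocSize > slice.m_offset;
	}
	return slice.m_offset + slice.m_range > allocOffset;
}
```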

+ 26 - 14
AnKi/GpuMemory/RebarTransientMemoryPool.h

@@ -12,12 +12,9 @@
 
 namespace anki {
 
-/// @addtogroup gpu_memory
-/// @{
-
 ANKI_CVAR(NumericCVar<PtrSize>, Core, RebarGpuMemorySize, 24_MB, 1_MB, 1_GB, "ReBAR: always mapped GPU memory")
 
-/// Manages staging GPU memory.
+// Manages staging GPU memory.
 class RebarTransientMemoryPool : public MakeSingleton<RebarTransientMemoryPool>
 {
 	template<typename>
@@ -30,9 +27,9 @@ public:
 
 	void init();
 
-	void endFrame();
+	void endFrame(Fence* fence);
 
-	/// Allocate staging memory for various operations. The memory will be reclaimed at the begining of the N-(kMaxFramesInFlight-1) frame.
+	// Allocate staging memory for various operations. The memory will be reused once it's safe.
 	template<typename T>
 	BufferView allocate(PtrSize size, U32 alignment, T*& mappedMem)
 	{
@@ -42,14 +39,14 @@ public:
 		return out;
 	}
 
-	/// @copydoc allocate
+	// See allocate()
 	template<typename T>
 	BufferView allocateConstantBuffer(T*& mappedMem)
 	{
 		return allocate(sizeof(T), GrManager::getSingleton().getDeviceCapabilities().m_constantBufferBindOffsetAlignment, mappedMem);
 	}
 
-	/// @copydoc allocate
+	// See allocate()
 	template<typename T>
 	BufferView allocateStructuredBuffer(U32 count, WeakArray<T>& arr)
 	{
@@ -60,7 +57,7 @@ public:
 		return out;
 	}
 
-	/// @copydoc allocate
+	// See allocate()
 	template<typename T>
 	BufferView allocateCopyBuffer(U32 count, WeakArray<T>& arr)
 	{
@@ -83,18 +80,33 @@ public:
 
 private:
 	BufferPtr m_buffer;
-	U8* m_mappedMem = nullptr; ///< Cache it.
-	PtrSize m_bufferSize = 0; ///< Cache it.
+	U8* m_mappedMem = nullptr; // Cache it
+	PtrSize m_bufferSize = 0; // Cache it
+	U32 m_structuredBufferAlignment = kMaxU32; // Cache it
+
 	Atomic<PtrSize> m_offset = {0};
-	PtrSize m_previousFrameEndOffset = 0;
-	U32 m_structuredBufferAlignment = kMaxU32;
+
+	// This is the slice of the ReBAR buffer that is protected by a fence
+	class FrameSlice
+	{
+	public:
+		PtrSize m_offset = kMaxPtrSize;
+		PtrSize m_range = 0;
+	};
+
+	static constexpr U32 kSliceCount = 8; // It's actually "max slices in-flight"
+	BitSet<kSliceCount, U32> m_activeSliceMask = {false};
+	U32 m_crntActiveSlice = kMaxU32;
+	Array<FrameSlice, kSliceCount> m_slices;
+	Array<FencePtr, kSliceCount> m_sliceFences;
 
 	RebarTransientMemoryPool() = default;
 
 	~RebarTransientMemoryPool();
 
 	BufferView allocateInternal(PtrSize size, U32 alignment, void*& mappedMem);
+
+	void validateSlices() const;
 };
-/// @}
 
 } // end namespace anki
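
For callers nothing changes except the lifetime guarantee. A typical use of the pool, based only on the signatures visible in this header (WeakArray and BufferView are the engine's existing types; the struct is hypothetical):

```cpp
// Hypothetical per-instance data; any trivially copyable struct works.
struct GpuInstance
{
	float m_transform[12];
};

WeakArray<GpuInstance> instances;
const BufferView view = RebarTransientMemoryPool::getSingleton().allocateStructuredBuffer<GpuInstance>(64, instances);

for(GpuInstance& instance : instances)
{
	instance.m_transform[0] = 1.0f; // CPU writes land directly in mapped ReBAR memory
}

// Bind `view` for this frame's GPU work; the range is recycled once the
// frame's fence signals, not after a fixed kMaxFramesInFlight delay.
```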

+ 2 - 2
AnKi/GpuMemory/UnifiedGeometryBuffer.h

@@ -125,9 +125,9 @@ public:
 		alloc.m_fakeOffset = kMaxU32;
 	}
 
-	void endFrame()
+	void endFrame(Fence* fence)
 	{
-		m_pool.endFrame();
+		m_pool.endFrame(fence);
 #if ANKI_STATS_ENABLED
 		updateStats();
 #endif

+ 7 - 9
AnKi/Gr/BackendCommon/MicroObjectRecycler.h

@@ -10,10 +10,9 @@
 
 namespace anki {
 
-/// @addtogroup graphics
-/// @{
-
-/// Helper class for MicroXXX objects. It expects a specific interface for the T.
+// Helper class for MicroXXX objects. It expects a specific interface from T:
+// I32 getRefcount() const;
+// Bool canRecycle() const;
 template<typename T>
 class MicroObjectRecycler
 {
@@ -27,16 +26,16 @@ public:
 		destroy();
 	}
 
-	/// It's thread-safe.
+	// It's thread-safe.
 	void destroy();
 
-	/// Find a new one to reuse. It's thread-safe.
+	// Find a new one to reuse. It's thread-safe.
 	T* findToReuse();
 
-	/// Release an object back to the recycler. It's thread-safe.
+	// Release an object back to the recycler. It's thread-safe.
 	void recycle(T* s);
 
-	/// Destroy those objects that their fence is done. It's thread-safe.
+	// Destroy those objects that their fence is done. It's thread-safe.
 	void trimCache()
 	{
 		LockGuard<Mutex> lock(m_mtx);
@@ -67,7 +66,6 @@ private:
 
 	void adjustAliveObjectCount();
 };
-/// @}
 
 } // end namespace anki
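
With the Doxygen markup gone, the required interface is now spelled out in the comment. A minimal standalone type satisfying it (not an engine class; a real object would also wrap a native handle):

```cpp
#include <atomic>

class MicroThing
{
public:
	void retain() const
	{
		m_refcount.fetch_add(1);
	}

	int getRefcount() const
	{
		return m_refcount.load();
	}

	// The recycler will neither reuse nor delete the object before this
	// returns true, e.g. before the GPU is done with it.
	bool canRecycle() const
	{
		return m_gpuDone;
	}

private:
	mutable std::atomic<int> m_refcount{0};
	bool m_gpuDone = false;
};
```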
 

+ 26 - 10
AnKi/Gr/BackendCommon/MicroObjectRecycler.inl.h

@@ -34,10 +34,14 @@ inline T* MicroObjectRecycler<T>::findToReuse()
 	// Trim the cache but leave at least one object to be recycled
 	trimCacheInternal(max(m_availableObjectsAfterTrim, 1u));
 
-	if(m_objectCache.getSize())
+	for(auto it = m_objectCache.getBegin(); it != m_objectCache.getEnd(); ++it)
 	{
-		out = m_objectCache[m_objectCache.getSize() - 1];
-		m_objectCache.popBack();
+		if((*it)->canRecycle())
+		{
+			out = *it;
+			m_objectCache.erase(it);
+			break;
+		}
 	}
 
 	ANKI_ASSERT(out == nullptr || out->getRefcount() == 0);
@@ -72,19 +76,28 @@ template<typename T>
 void MicroObjectRecycler<T>::trimCacheInternal(U32 aliveObjectCountAfterTrim)
 {
 	aliveObjectCountAfterTrim = min(aliveObjectCountAfterTrim, m_objectCache.getSize());
-	const U32 toBeKilledCount = m_objectCache.getSize() - aliveObjectCountAfterTrim;
+	U32 toBeKilledCount = m_objectCache.getSize() - aliveObjectCountAfterTrim;
 	if(toBeKilledCount == 0)
 	{
 		return;
 	}
 
-	for(U32 i = 0; i < toBeKilledCount; ++i)
+	GrDynamicArray<T*> newObjectCache;
+	for(U32 i = 0; i < m_objectCache.getSize(); ++i)
 	{
-		deleteInstance(GrMemoryPool::getSingleton(), m_objectCache[i]);
-		m_objectCache[i] = nullptr;
+		if(toBeKilledCount > 0 && m_objectCache[i]->canRecycle())
+		{
+			deleteInstance(GrMemoryPool::getSingleton(), m_objectCache[i]);
+			--toBeKilledCount;
+		}
+		else
+		{
+			newObjectCache.emplaceBack(m_objectCache[i]);
+		}
 	}
 
-	m_objectCache.erase(m_objectCache.getBegin(), m_objectCache.getBegin() + toBeKilledCount);
+	m_objectCache.destroy();
+	m_objectCache = std::move(newObjectCache);
 }
 
 template<typename T>
@@ -97,12 +110,15 @@ void MicroObjectRecycler<T>::adjustAliveObjectCount()
 	}
 	else
 	{
+		constexpr U32 kGrowCount = 4;
+		constexpr U32 kMinAvailableObjects = 1;
+
 		if(m_cacheMisses)
 		{
 			// Need more alive objects
-			m_availableObjectsAfterTrim += 4;
+			m_availableObjectsAfterTrim += kGrowCount;
 		}
-		else if(m_availableObjectsAfterTrim > 0)
+		else if(m_availableObjectsAfterTrim > kMinAvailableObjects)
 		{
 			// Have more than enough alive objects per request, decrease alive objects
 			--m_availableObjectsAfterTrim;
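
The sizing heuristic is additive-increase/additive-decrease: grow the kept-alive target fast when a request missed the cache, decay it slowly otherwise, never dropping below the new floor of one. The same logic in isolation, as a standalone sketch:

```cpp
#include <cstdint>

void adjustTarget(std::uint32_t& availableAfterTrim, bool hadCacheMiss)
{
	constexpr std::uint32_t kGrowCount = 4;
	constexpr std::uint32_t kMinAvailableObjects = 1;

	if(hadCacheMiss)
	{
		availableAfterTrim += kGrowCount; // demand outran the cache, grow fast
	}
	else if(availableAfterTrim > kMinAvailableObjects)
	{
		--availableAfterTrim; // more than enough cached, shrink slowly
	}
}
```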

+ 2 - 1
AnKi/Gr/D3D/D3DGrManager.cpp

@@ -138,8 +138,9 @@ void GrManager::finish()
 	self.finishInternal();
 }
 
-void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence)
+void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, [[maybe_unused]] Bool flushAndSerialize)
 {
+	// No need to do anything about flushAndSerialize; D3D does that anyway
 	ANKI_D3D_SELF(GrManagerImpl);
 	self.submitInternal(cmdbs, waitFences, signalFence);
 }

+ 9 - 11
AnKi/Gr/Fence.h

@@ -9,10 +9,7 @@
 
 namespace anki {
 
-/// @addtogroup graphics
-/// @{
-
-/// GPU fence.
+// GPU fence
 class Fence : public GrObject
 {
 	ANKI_GR_OBJECT
@@ -20,27 +17,28 @@ class Fence : public GrObject
 public:
 	static constexpr GrObjectType kClassType = GrObjectType::kFence;
 
-	/// Wait for the fence.
-	/// @param seconds The time to wait in seconds. If it's zero then just return the status.
-	/// @return True if is signaled (signaled == GPU work is done).
+	// Wait for the fence.
+	// seconds: The time to wait in seconds. If it's zero then just return the status.
+	// Returns true if signaled (signaled == GPU work is done).
 	Bool clientWait(Second seconds);
 
+	Bool signaled()
+	{
+		return clientWait(0.0);
+	}
+
 protected:
-	/// Construct.
 	Fence(CString name)
 		: GrObject(kClassType, name)
 	{
 	}
 
-	/// Destroy.
 	~Fence()
 	{
 	}
 
 private:
-	/// Allocate and initialize a new instance.
 	[[nodiscard]] static Fence* newInstance();
 };
-/// @}
 
 } // end namespace anki
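
signaled() is just a zero-timeout clientWait(), which is what the pools above use to poll for completed frames without blocking. The resulting non-blocking cleanup idiom, sketched with standalone types:

```cpp
#include <utility>
#include <vector>

// TFence only needs the signaled() poll; TResource is whatever the fence protects.
template<typename TFence, typename TResource>
void collectGarbage(std::vector<std::pair<TFence*, TResource*>>& graveyard)
{
	std::erase_if(graveyard, [](auto& entry) {
		if(entry.first->signaled()) // zero-timeout poll, never blocks
		{
			delete entry.second;
			return true;
		}
		return false; // GPU still busy, try again next frame
	});
}
```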

+ 17 - 21
AnKi/Gr/GrManager.h

@@ -16,10 +16,7 @@ namespace anki {
 // Forward
 class NativeWindow;
 
-/// @addtogroup graphics
-/// @{
-
-/// Manager initializer.
+// Manager initializer.
 class GrManagerInitInfo
 {
 public:
@@ -29,7 +26,7 @@ public:
 	CString m_cacheDirectory;
 };
 
-/// The graphics manager, owner of all graphics objects.
+// The graphics manager, owner of all graphics objects.
 class GrManager : public MakeSingletonPtr<GrManager>
 {
 	template<typename>
@@ -43,31 +40,31 @@ public:
 		return m_capabilities;
 	}
 
-	/// First call in the frame. Do that before everything else.
+	// First call in the frame. Do that before everything else.
 	void beginFrame();
 
-	/// Get next presentable image. The returned Texture is valid until the following swapBuffers. After that it might dissapear even if you hold the
-	/// reference.
+	// Get next presentable image. The returned Texture is valid until the following swapBuffers. After that it might disappear even if you hold the
+	// reference.
 	TexturePtr acquireNextPresentableTexture();
 
-	/// End this frame.
+	// End this frame.
 	void endFrame();
 
-	/// Submit command buffers. Can be called outside beginFrame() endFrame().
-	/// @param[in]  waitFences Optionally wait for some fences.
-	/// @param[out] signalFence Optionaly create fence that will be signaled when the submission is done.
-	void submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr);
+	// Submit command buffers. Can be called outside beginFrame() endFrame()
+	// waitFences: Optionally wait for some fences
+	// signalFence: Optionally create a fence that will be signaled when the submission is done
+	// flushAndSerialize: Insert a barrier at the end of the submit that flushes all caches and inserts an ALL-to-ALL barrier
+	void submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr, Bool flushAndSerialize = false);
 
-	void submit(CommandBuffer* cmdb, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr)
+	void submit(CommandBuffer* cmdb, WeakArray<Fence*> waitFences = {}, FencePtr* signalFence = nullptr, Bool flushAndSerialize = false)
 	{
-		submit(WeakArray<CommandBuffer*>(&cmdb, 1), waitFences, signalFence);
+		submit(WeakArray<CommandBuffer*>(&cmdb, 1), waitFences, signalFence, flushAndSerialize);
 	}
 
-	/// Wait for all GPU work to finish.
+	// Wait for all GPU work to finish.
 	void finish();
 
-	/// @name Object creation methods. They are thread-safe.
-	/// @{
+	// Object creation methods. They are thread-safe //
 	[[nodiscard]] BufferPtr newBuffer(const BufferInitInfo& init);
 	[[nodiscard]] TexturePtr newTexture(const TextureInitInfo& init);
 	[[nodiscard]] SamplerPtr newSampler(const SamplerInitInfo& init);
@@ -80,9 +77,9 @@ public:
 	[[nodiscard]] RenderGraphPtr newRenderGraph();
 	[[nodiscard]] GrUpscalerPtr newGrUpscaler(const GrUpscalerInitInfo& init);
 	[[nodiscard]] AccelerationStructurePtr newAccelerationStructure(const AccelerationStructureInitInfo& init);
-	/// @}
+	// End object creation methods //
 
-	/// Get the size of the acceleration structure if you are planning to supply a custom buffer.
+	// Get the size of the acceleration structure if you are planning to supply a custom buffer.
 	PtrSize getAccelerationStructureMemoryRequirement(const AccelerationStructureInitInfo& init) const;
 
 	ANKI_INTERNAL CString getCacheDirectory() const
@@ -111,6 +108,5 @@ GrManager& MakeSingletonPtr<GrManager>::allocateSingleton<>();
 
 template<>
 void MakeSingletonPtr<GrManager>::freeSingleton();
-/// @}
 
 } // end namespace anki
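
A plausible call shape for the extended submit(), based only on the signature above: the frame's final submission requests the serializing flush so that the signaled fence also guarantees all GPU caches were flushed, which is what lets the memory pools trust it.

```cpp
CommandBuffer* cmdb = /* recorded elsewhere */ nullptr;
FencePtr frameFence;
GrManager::getSingleton().submit(cmdb, {}, &frameFence, true /* flushAndSerialize */);
```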

+ 2 - 2
AnKi/Gr/RenderGraph.cpp

@@ -1270,7 +1270,7 @@ void RenderGraph::recordAndSubmitCommandBuffers(FencePtr* optionalFence)
 	const U32 firstGroupThatWroteToSwapchain2 = firstGroupThatWroteToSwapchain.getNonAtomically();
 	if(firstGroupThatWroteToSwapchain2 == 0 || firstGroupThatWroteToSwapchain2 == kMaxU32)
 	{
-		GrManager::getSingleton().submit(WeakArray(pCmdbs), {}, optionalFence);
+		GrManager::getSingleton().submit(WeakArray(pCmdbs), {}, optionalFence, true);
 	}
 	else
 	{
@@ -1279,7 +1279,7 @@ void RenderGraph::recordAndSubmitCommandBuffers(FencePtr* optionalFence)
 		GrManager::getSingleton().submit(WeakArray(pCmdbs).subrange(0, firstGroupThatWroteToSwapchain2), {}, nullptr);
 
 		GrManager::getSingleton().submit(
-			WeakArray(pCmdbs).subrange(firstGroupThatWroteToSwapchain2, batchGroupCount - firstGroupThatWroteToSwapchain2), {}, optionalFence);
+			WeakArray(pCmdbs).subrange(firstGroupThatWroteToSwapchain2, batchGroupCount - firstGroupThatWroteToSwapchain2), {}, optionalFence, true);
 	}
 }
 

+ 52 - 15
AnKi/Gr/Utils/SegregatedListsGpuMemoryPool.cpp

@@ -20,8 +20,7 @@ class SegregatedListsGpuMemoryPool::BuilderInterface
 public:
 	SegregatedListsGpuMemoryPool* m_parent = nullptr;
 
-	/// @name Interface methods
-	/// @{
+	// Interface methods
 	U32 getClassCount() const
 	{
 		return m_parent->m_classes.getSize();
@@ -46,7 +45,6 @@ public:
 	{
 		return 4;
 	}
-	/// @}
 };
 
 void SegregatedListsGpuMemoryPool::init(BufferUsageBit gpuBufferUsage, ConstWeakArray<PtrSize> classUpperSizes, PtrSize initialGpuBufferSize,
@@ -72,7 +70,6 @@ void SegregatedListsGpuMemoryPool::init(BufferUsageBit gpuBufferUsage, ConstWeak
 	m_builder = newInstance<Builder>(GrMemoryPool::getSingleton());
 	m_builder->getInterface().m_parent = this;
 
-	m_frame = 0;
 	m_allocatedSize = 0;
 	m_allowCoWs = allowCoWs;
 	m_mapAccess = map;
@@ -92,9 +89,19 @@ void SegregatedListsGpuMemoryPool::destroy()
 		m_gpuBuffer->unmap();
 	}
 
-	for(GrDynamicArray<SegregatedListsGpuMemoryPoolToken>& arr : m_garbage)
+	for(auto it = m_garbage.getBegin(); it != m_garbage.getEnd(); ++it)
 	{
-		for(const SegregatedListsGpuMemoryPoolToken& token : arr)
+		Garbage& garbage = *it;
+		if(it.getArrayIndex() != m_activeGarbage)
+		{
+			ANKI_CHECKF(garbage.m_fence->clientWait(kMaxSecond));
+		}
+		else
+		{
+			ANKI_ASSERT(!garbage.m_fence);
+		}
+
+		for(const SegregatedListsGpuMemoryPoolToken& token : garbage.m_tokens)
 		{
 			m_builder->free(static_cast<Chunk*>(token.m_chunk), token.m_chunkOffset, token.m_size);
 		}
@@ -237,30 +244,60 @@ void SegregatedListsGpuMemoryPool::deferredFree(SegregatedListsGpuMemoryPoolToke
 
 	{
 		LockGuard lock(m_lock);
-		m_garbage[m_frame].emplaceBack(token);
+
+		if(m_activeGarbage == kMaxU32)
+		{
+			m_activeGarbage = m_garbage.emplace().getArrayIndex();
+		}
+
+		m_garbage[m_activeGarbage].m_tokens.emplace(token);
 	}
 
 	token = {};
 }
 
-void SegregatedListsGpuMemoryPool::endFrame()
+void SegregatedListsGpuMemoryPool::endFrame(Fence* fence)
 {
+	ANKI_ASSERT(fence);
 	ANKI_ASSERT(isInitialized());
 
 	LockGuard lock(m_lock);
 
-	m_frame = (m_frame + 1) % kMaxFramesInFlight;
-
 	// Throw out the garbage
-	for(SegregatedListsGpuMemoryPoolToken& token : m_garbage[m_frame])
+	Array<U32, 8> garbageToDelete;
+	U32 garbageToDeleteCount = 0;
+	for(auto it = m_garbage.getBegin(); it != m_garbage.getEnd(); ++it)
 	{
-		m_builder->free(static_cast<Chunk*>(token.m_chunk), token.m_chunkOffset, token.m_size);
+		Garbage& garbage = *it;
 
-		ANKI_ASSERT(m_allocatedSize >= token.m_size);
-		m_allocatedSize -= token.m_size;
+		if(garbage.m_fence && garbage.m_fence->clientWait(0.0))
+		{
+			for(SegregatedListsGpuMemoryPoolToken token : garbage.m_tokens)
+			{
+				m_builder->free(static_cast<Chunk*>(token.m_chunk), token.m_chunkOffset, token.m_size);
+
+				ANKI_ASSERT(m_allocatedSize >= token.m_size);
+				m_allocatedSize -= token.m_size;
+			}
+
+			garbageToDelete[garbageToDeleteCount++] = it.getArrayIndex();
+		}
+	}
+
+	for(U32 i = 0; i < garbageToDeleteCount; ++i)
+	{
+		m_garbage.erase(garbageToDelete[i]);
 	}
 
-	m_garbage[m_frame].destroy();
+	// Set the new fence
+	if(m_activeGarbage != kMaxU32)
+	{
+		ANKI_ASSERT(m_garbage[m_activeGarbage].m_tokens.getSize());
+		ANKI_ASSERT(!m_garbage[m_activeGarbage].m_fence);
+		m_garbage[m_activeGarbage].m_fence.reset(fence);
+
+		m_activeGarbage = kMaxU32;
+	}
 }
 
 void SegregatedListsGpuMemoryPool::getStats(F32& externalFragmentation, PtrSize& userAllocatedSize, PtrSize& totalSize) const
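
The per-frame garbage array indexed by m_frame is gone. Instead, frees accumulate in an open bucket, endFrame() seals that bucket with the frame fence, and sealed buckets are reclaimed as their fences signal. The bucket lifecycle with standalone types (the engine uses GrBlockArray and FencePtr instead):

```cpp
#include <memory>
#include <vector>

struct Fence
{
	bool m_done = false;
	bool signaled() const { return m_done; }
};

struct Token
{
	// chunk pointer, offset and size in the real pool
};

class DeferredFreeList
{
public:
	void deferredFree(const Token& token)
	{
		m_open.push_back(token); // bucket stays open until endFrame()
	}

	void endFrame(std::shared_ptr<Fence> frameFence)
	{
		// Reclaim every sealed bucket whose GPU work has finished.
		std::erase_if(m_sealed, [](const Bucket& b) { return b.m_fence->signaled(); });

		// Seal this frame's bucket, if it has anything in it.
		if(!m_open.empty())
		{
			m_sealed.push_back({std::move(m_open), std::move(frameFence)});
			m_open.clear();
		}
	}

private:
	struct Bucket
	{
		std::vector<Token> m_tokens;
		std::shared_ptr<Fence> m_fence;
	};

	std::vector<Token> m_open;
	std::vector<Bucket> m_sealed;
};
```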

+ 24 - 20
AnKi/Gr/Utils/SegregatedListsGpuMemoryPool.h

@@ -6,21 +6,19 @@
 #pragma once
 
 #include <AnKi/Util/SegregatedListsAllocatorBuilder.h>
+#include <AnKi/Util/BlockArray.h>
 #include <AnKi/Gr/Buffer.h>
+#include <AnKi/Gr/Fence.h>
 
 namespace anki {
 
-/// @addtogroup graphics
-/// @{
-
-/// The result of an allocation of SegregatedListsGpuMemoryPool.
-/// @memberof SegregatedListsGpuMemoryPool
+// The result of an allocation of SegregatedListsGpuMemoryPool.
 class SegregatedListsGpuMemoryPoolToken
 {
 	friend class SegregatedListsGpuMemoryPool;
 
 public:
-	/// The offset in the SegregatedListsGpuMemoryPoolToken::getBuffer() buffer.
+	// The offset in the SegregatedListsGpuMemoryPoolToken::getBuffer() buffer.
 	PtrSize m_offset = kMaxPtrSize;
 
 	PtrSize m_size = kMaxPtrSize;
@@ -40,8 +38,8 @@ private:
 	PtrSize m_chunkOffset = kMaxPtrSize;
 };
 
-/// GPU memory allocator based on segregated lists. It allocates a GPU buffer with some initial size. If there is a need to grow it allocates a bigger
-/// buffer and copies contents of the old one to the new (CoW).
+// GPU memory allocator based on segregated lists. It allocates a GPU buffer with some initial size. If there is a need to grow it allocates a bigger
+// buffer and copies contents of the old one to the new (CoW).
 class SegregatedListsGpuMemoryPool
 {
 public:
@@ -61,19 +59,19 @@ public:
 
 	void destroy();
 
-	/// Allocate memory.
-	/// @note It's thread-safe.
+	// Allocate memory.
+	// It's thread-safe.
 	void allocate(PtrSize size, U32 alignment, SegregatedListsGpuMemoryPoolToken& token);
 
-	/// Free memory a few frames down the line.
-	/// @note It's thread-safe.
+	// Free memory a few frames down the line.
+	// It's thread-safe.
 	void deferredFree(SegregatedListsGpuMemoryPoolToken& token);
 
-	/// @note It's thread-safe.
-	void endFrame();
+	// It's thread-safe.
+	void endFrame(Fence* fence);
 
-	/// Need to be checking this constantly to get the updated buffer in case of CoWs.
-	/// @note It's not thread-safe.
+	// Needs to be checked constantly to get the updated buffer in case of CoWs.
+	// It's not thread-safe.
 	Buffer& getGpuBuffer() const
 	{
 		ANKI_ASSERT(m_gpuBuffer.isCreated() && "The buffer hasn't been created yet");
@@ -86,7 +84,7 @@ public:
 		return m_mappedGpuBufferMemory;
 	}
 
-	/// @note It's thread-safe.
+	// It's thread-safe.
 	void getStats(F32& externalFragmentation, PtrSize& userAllocatedSize, PtrSize& totalSize) const;
 
 private:
@@ -108,8 +106,15 @@ private:
 
 	GrDynamicArray<Chunk*> m_deletedChunks;
 
-	Array<GrDynamicArray<SegregatedListsGpuMemoryPoolToken>, kMaxFramesInFlight> m_garbage;
-	U8 m_frame = 0;
+	class Garbage
+	{
+	public:
+		GrBlockArray<SegregatedListsGpuMemoryPoolToken, BlockArrayConfig<16>> m_tokens;
+		FencePtr m_fence;
+	};
+
+	GrBlockArray<Garbage, BlockArrayConfig<8>> m_garbage;
+	U32 m_activeGarbage = kMaxU32;
 	Bool m_allowCoWs = true;
 
 	BufferMapAccessBit m_mapAccess = BufferMapAccessBit::kNone;
@@ -122,6 +127,5 @@ private:
 		return m_bufferUsage != BufferUsageBit::kNone;
 	}
 };
-/// @}
 
 } // end namespace anki

+ 17 - 0
AnKi/Gr/Vulkan/VkCommandBuffer.cpp

@@ -1344,4 +1344,21 @@ void CommandBufferImpl::traceRaysInternal(const BufferView& sbtBuffer, U32 sbtRe
 	}
 }
 
+void CommandBufferImpl::setFullPipelineBarrier()
+{
+	ANKI_TRACE_FUNCTION();
+	ANKI_VK_SELF(CommandBufferImpl);
+	self.commandCommon();
+
+	VkMemoryBarrier barr = {};
+	barr.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
+	barr.srcAccessMask = barr.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
+	barr.dstAccessMask |= VK_ACCESS_HOST_READ_BIT;
+
+	vkCmdPipelineBarrier(self.m_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT | VK_PIPELINE_STAGE_HOST_BIT, 0, 1,
+						 &barr, 0, nullptr, 0, nullptr);
+
+	ANKI_TRACE_INC_COUNTER(VkBarrier, 1);
+}
+
 } // end namespace anki

+ 2 - 0
AnKi/Gr/Vulkan/VkCommandBuffer.h

@@ -88,6 +88,8 @@ public:
 	}
 #endif
 
+	void setFullPipelineBarrier();
+
 private:
 	StackMemoryPool m_pool;
 

+ 3 - 1
AnKi/Gr/Vulkan/VkCommandBufferFactory.cpp

@@ -10,7 +10,8 @@
 
 namespace anki {
 
-ANKI_SVAR(CommandBufferCount, StatCategory::kGr, "CommandBufferCount", StatFlag::kNone)
+ANKI_SVAR(CommandBufferCount, StatCategory::kGr, "Cmdb count", StatFlag::kNone)
+ANKI_SVAR(CommandBuffersCreated, StatCategory::kGr, "Cmdbs created", StatFlag::kNone)
 
 MicroCommandBuffer::~MicroCommandBuffer()
 {
@@ -111,6 +112,7 @@ Error CommandBufferThreadAllocator::newCommandBuffer(CommandBufferFlag cmdbFlags
 
 		ANKI_TRACE_INC_COUNTER(VkCommandBufferCreate, 1);
 		g_svarCommandBufferCount.increment(1_U64);
+		g_svarCommandBuffersCreated.increment(1_U64);
 		VkCommandBuffer cmdb;
 		ANKI_VK_CHECK(vkAllocateCommandBuffers(getVkDevice(), &ci, &cmdb));
 

+ 5 - 0
AnKi/Gr/Vulkan/VkCommandBufferFactory.h

@@ -50,6 +50,11 @@ public:
 		return m_refcount.load();
 	}
 
+	Bool canRecycle() const
+	{
+		return true;
+	}
+
 	VkCommandBuffer getHandle() const
 	{
 		ANKI_ASSERT(m_handle);

+ 17 - 3
AnKi/Gr/Vulkan/VkFenceFactory.cpp

@@ -8,10 +8,24 @@
 
 namespace anki {
 
-void MicroFenceImpl::setName(CString name) const
+MicroFencePtr FenceFactory::newInstance(CString name)
 {
-	ANKI_ASSERT(m_handle);
-	getGrManagerImpl().trySetVulkanHandleName(name, VK_OBJECT_TYPE_FENCE, m_handle);
+	MicroFence* fence = m_recycler.findToReuse();
+
+	if(fence == nullptr)
+	{
+		fence = newInstance<MicroFence>(GrMemoryPool::getSingleton());
+	}
+	else
+	{
+		fence->reset();
+	}
+
+	ANKI_ASSERT(fence->getRefcount() == 0);
+
+	getGrManagerImpl().trySetVulkanHandleName(name, VK_OBJECT_TYPE_FENCE, fence->getHandle());
+
+	return MicroFencePtr(fence);
 }
 
 } // end namespace anki
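
newInstance() is now a recycler cache: reuse a signaled fence (resetting it back to the unsignaled state) when one is available, and create a VkFence only on a miss. The acquire path in generic form, as a standalone sketch:

```cpp
// TRecycler::findToReuse() is assumed to return only objects whose
// canRecycle() is true, or nullptr on a cache miss (as above).
template<typename T, typename TRecycler>
T* acquire(TRecycler& recycler)
{
	T* obj = recycler.findToReuse();
	if(obj == nullptr)
	{
		obj = new T(); // miss: pay for a fresh object
	}
	else
	{
		obj->reset(); // hit: return the object to its initial state
	}
	return obj;
}
```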

+ 58 - 51
AnKi/Gr/Vulkan/VkFenceFactory.h

@@ -6,33 +6,58 @@
 #pragma once
 
 #include <AnKi/Gr/Vulkan/VkCommon.h>
-#include <AnKi/Gr/BackendCommon/MicroFenceFactory.h>
+#include <AnKi/Gr/BackendCommon/MicroObjectRecycler.h>
 #include <AnKi/Util/Tracer.h>
+#include <AnKi/Core/StatsSet.h>
 
 namespace anki {
 
-/// @addtogroup vulkan
-/// @{
+ANKI_SVAR(FenceCount2, StatCategory::kGr, "Fence count", StatFlag::kNone)
+ANKI_SVAR(FencesCreated, StatCategory::kGr, "Fences created", StatFlag::kNone)
 
-/// Fence wrapper over VkFence.
-class MicroFenceImpl
+// Fence wrapper over VkFence.
+class MicroFence
 {
 public:
-	VkFence m_handle = VK_NULL_HANDLE;
+	MicroFence()
+	{
+		VkFenceCreateInfo ci = {};
+		ci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+		ANKI_VK_CHECKF(vkCreateFence(getVkDevice(), &ci, nullptr, &m_handle));
+		g_svarFenceCount2.increment(1u);
+		g_svarFencesCreated.increment(1u);
+	}
+
+	~MicroFence()
+	{
+		if(m_handle)
+		{
+			vkDestroyFence(getVkDevice(), m_handle, nullptr);
+			g_svarFenceCount2.decrement(1u);
+		}
+	}
+
+	void retain() const
+	{
+		m_refcount.fetchAdd(1);
+	}
 
-	~MicroFenceImpl()
+	void release();
+
+	I32 getRefcount() const
 	{
-		ANKI_ASSERT(!m_handle);
+		return m_refcount.load();
 	}
 
-	operator Bool() const
+	Bool canRecycle() const
 	{
-		return m_handle != 0;
+		return signaled();
 	}
 
-	Bool clientWait(Second seconds)
+	Bool clientWait(Second seconds) const
 	{
 		ANKI_ASSERT(m_handle);
+		seconds = min<Second>(seconds, g_cvarGrGpuTimeout);
 		const F64 nsf = 1e+9 * seconds;
 		const U64 ns = U64(nsf);
 		VkResult res;
@@ -41,7 +66,7 @@ public:
 		return res != VK_TIMEOUT;
 	}
 
-	Bool signaled()
+	Bool signaled() const
 	{
 		ANKI_ASSERT(m_handle);
 		VkResult status;
@@ -49,68 +74,50 @@ public:
 		return status == VK_SUCCESS;
 	}
 
-	void create()
-	{
-		ANKI_ASSERT(!m_handle);
-		VkFenceCreateInfo ci = {};
-		ci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
-		ANKI_VK_CHECKF(vkCreateFence(getVkDevice(), &ci, nullptr, &m_handle));
-	}
-
-	void destroy()
+	void reset() const
 	{
 		ANKI_ASSERT(m_handle);
-		vkDestroyFence(getVkDevice(), m_handle, nullptr);
-		m_handle = 0;
+		ANKI_VK_CHECKF(vkResetFences(getVkDevice(), 1, &m_handle));
 	}
 
-	void reset()
+	VkFence getHandle() const
 	{
 		ANKI_ASSERT(m_handle);
-		ANKI_VK_CHECKF(vkResetFences(getVkDevice(), 1, &m_handle));
+		return m_handle;
 	}
 
-	void setName(CString name) const;
-};
-
-using VulkanMicroFence = MicroFence<MicroFenceImpl>;
-
-/// Deleter for FencePtr.
-class MicroFencePtrDeleter
-{
-public:
-	void operator()(VulkanMicroFence* fence);
+private:
+	VkFence m_handle = VK_NULL_HANDLE;
+	mutable Atomic<I32> m_refcount = {0};
 };
 
-/// Fence smart pointer.
-using MicroFencePtr = IntrusivePtr<VulkanMicroFence, MicroFencePtrDeleter>;
+// Fence smart pointer.
+using MicroFencePtr = IntrusiveNoDelPtr<MicroFence>;
 
-/// A factory of fences.
+// A factory of fences.
 class FenceFactory : public MakeSingleton<FenceFactory>
 {
-	friend class MicroFencePtrDeleter;
+	friend class MicroFence;
 
 public:
-	/// Create a new fence pointer.
-	MicroFencePtr newInstance(CString name = "unnamed")
-	{
-		return MicroFencePtr(m_factory.newFence(name));
-	}
+	// Create a new fence pointer.
+	MicroFencePtr newInstance(CString name = "unnamed");
 
 private:
-	MicroFenceFactory<MicroFenceImpl> m_factory;
+	MicroObjectRecycler<MicroFence> m_recycler;
 
-	void deleteFence(VulkanMicroFence* fence)
+	void releaseFence(MicroFence* fence)
 	{
-		m_factory.releaseFence(fence);
+		m_recycler.recycle(fence);
 	}
 };
 
-inline void MicroFencePtrDeleter::operator()(VulkanMicroFence* fence)
+inline void MicroFence::release()
 {
-	ANKI_ASSERT(fence);
-	FenceFactory::getSingleton().deleteFence(fence);
+	if(m_refcount.fetchSub(1) == 1)
+	{
+		FenceFactory::getSingleton().releaseFence(this);
+	}
 }
-/// @}
 
 } // end namespace anki
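
With the deleter class gone, the refcount lives in MicroFence itself and release() recycles instead of deleting. The last-owner test relies on fetchSub() returning the value before the decrement, so seeing 1 means this call just dropped the count to zero. In isolation:

```cpp
#include <atomic>

struct Recyclable
{
	mutable std::atomic<int> m_refcount{0};

	void retain() const
	{
		m_refcount.fetch_add(1, std::memory_order_relaxed);
	}

	void release()
	{
		if(m_refcount.fetch_sub(1, std::memory_order_acq_rel) == 1)
		{
			// Last reference gone: hand the object back to a recycler
			// (so its VkFence can be reused) instead of deleting it.
		}
	}
};
```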

+ 36 - 9
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -143,10 +143,10 @@ ANKI_NEW_GR_OBJECT(GrUpscaler)
 #undef ANKI_NEW_GR_OBJECT
 #undef ANKI_NEW_GR_OBJECT_NO_INIT_INFO
 
-void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence)
+void GrManager::submit(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, Bool flushAndSerialize)
 {
 	ANKI_VK_SELF(GrManagerImpl);
-	self.submitInternal(cmdbs, waitFences, signalFence);
+	self.submitInternal(cmdbs, waitFences, signalFence, flushAndSerialize);
 }
 
 PtrSize GrManager::getAccelerationStructureMemoryRequirement(const AccelerationStructureInitInfo& init) const
@@ -1146,7 +1146,7 @@ Error GrManagerImpl::initDevice()
 	if(pureAsyncCompute)
 	{
 		vkGetDeviceQueue(m_device, m_queueFamilyIndices[GpuQueueType::kCompute], 0, &m_queues[GpuQueueType::kCompute]);
-		trySetVulkanHandleName("AsyncCompute", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kGeneral]);
+		trySetVulkanHandleName("AsyncCompute", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kCompute]);
 	}
 	else if(lowPriorityQueueAsyncCompute)
 	{
@@ -1274,8 +1274,8 @@ TexturePtr GrManagerImpl::acquireNextPresentableTexture()
 
 	// Get new image
 	uint32_t imageIdx;
-	const VkResult res = vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(),
-											   fence->getImplementation().m_handle, &imageIdx);
+	const VkResult res =
+		vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(), fence->getHandle(), &imageIdx);
 
 	if(res == VK_ERROR_OUT_OF_DATE_KHR)
 	{
@@ -1291,8 +1291,8 @@ TexturePtr GrManagerImpl::acquireNextPresentableTexture()
 		m_crntSwapchain = SwapchainFactory::getSingleton().newInstance();
 
 		// Can't fail a second time
-		ANKI_VK_CHECKF(vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(),
-											 fence->getImplementation().m_handle, &imageIdx));
+		ANKI_VK_CHECKF(
+			vkAcquireNextImageKHR(m_device, m_crntSwapchain->m_swapchain, UINT64_MAX, acquireSemaphore->getHandle(), fence->getHandle(), &imageIdx));
 	}
 	else
 	{
@@ -1357,7 +1357,7 @@ void GrManagerImpl::endFrameInternal()
 	GpuMemoryManager::getSingleton().updateStats();
 }
 
-void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence)
+void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, Bool flushAndSerialize)
 {
 	// First thing, create a fence
 	MicroFencePtr fence = FenceFactory::getSingleton().newInstance("Submit");
@@ -1390,6 +1390,33 @@ void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fe
 		vkCmdbs.emplaceBack(cmdb.getHandle());
 	}
 
+	// Create the flush command buffer
+	CommandBufferPtr flushCmdb;
+	if(flushAndSerialize)
+	{
+		CommandBufferInitInfo cmdbInit("Flush");
+		cmdbInit.m_flags = CommandBufferFlag::kSmallBatch;
+		if(queueType == GpuQueueType::kCompute)
+		{
+			cmdbInit.m_flags |= CommandBufferFlag::kComputeWork;
+		}
+		else
+		{
+			cmdbInit.m_flags |= CommandBufferFlag::kGeneralWork;
+		}
+
+		flushCmdb = newCommandBuffer(cmdbInit);
+		CommandBufferImpl& impl = static_cast<CommandBufferImpl&>(*flushCmdb);
+		impl.setFullPipelineBarrier();
+		flushCmdb->endRecording();
+
+#if ANKI_ASSERTIONS_ENABLED
+		impl.setSubmitted();
+#endif
+
+		vkCmdbs.emplaceBack(impl.getHandle());
+	}
+
 	// Gather wait semaphores
 	GrDynamicArray<VkSemaphore> waitSemaphores;
 	GrDynamicArray<VkPipelineStageFlags> waitStages;
@@ -1488,7 +1515,7 @@ void GrManagerImpl::submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fe
 		appendPNextList(submit, &timelineInfo);
 
 		ANKI_TRACE_SCOPED_EVENT(VkQueueSubmit);
-		ANKI_VK_CHECKF(vkQueueSubmit(m_queues[queueType], 1, &submit, fence->getImplementation().m_handle));
+		ANKI_VK_CHECKF(vkQueueSubmit(m_queues[queueType], 1, &submit, fence->getHandle()));
 	}
 }
 

+ 2 - 1
AnKi/Gr/Vulkan/VkGrManager.h

@@ -11,6 +11,7 @@
 #include <AnKi/Gr/Vulkan/VkFenceFactory.h>
 #include <AnKi/Gr/Vulkan/VkSwapchainFactory.h>
 #include <AnKi/Util/File.h>
+#include <AnKi/Util/BlockArray.h>
 
 namespace anki {
 
@@ -86,7 +87,7 @@ public:
 
 	void endFrameInternal();
 
-	void submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence);
+	void submitInternal(WeakArray<CommandBuffer*> cmdbs, WeakArray<Fence*> waitFences, FencePtr* signalFence, Bool flushAndSerialize);
 
 	void finishInternal();
 

+ 2 - 0
AnKi/Gr/Vulkan/VkSemaphoreFactory.cpp

@@ -11,6 +11,7 @@
 namespace anki {
 
 ANKI_SVAR(SemaphoreCount, StatCategory::kGr, "Semaphore count", StatFlag::kNone)
+ANKI_SVAR(SemaphoresCreated, StatCategory::kGr, "Semaphores created", StatFlag::kNone)
 
 MicroSemaphore::MicroSemaphore(Bool isTimeline)
 	: m_isTimeline(isTimeline)
@@ -27,6 +28,7 @@ MicroSemaphore::MicroSemaphore(Bool isTimeline)
 	ANKI_VK_CHECKF(vkCreateSemaphore(getVkDevice(), &ci, nullptr, &m_handle));
 	ANKI_TRACE_INC_COUNTER(VkSemaphoreCreate, 1);
 	g_svarSemaphoreCount.increment(1u);
+	g_svarSemaphoresCreated.increment(1u);
 }
 
 MicroSemaphore::~MicroSemaphore()

+ 5 - 0
AnKi/Gr/Vulkan/VkSemaphoreFactory.h

@@ -46,6 +46,11 @@ public:
 		return m_refcount.load();
 	}
 
+	Bool canRecycle() const
+	{
+		return true;
+	}
+
 	Bool clientWait(Second seconds);
 
 	Bool isTimeline() const

+ 5 - 0
AnKi/Gr/Vulkan/VkSwapchainFactory.h

@@ -46,6 +46,11 @@ public:
 		return m_refcount.load();
 	}
 
+	Bool canRecycle() const
+	{
+		return true;
+	}
+
 private:
 	mutable Atomic<I32> m_refcount = {0};
 

+ 1 - 2
AnKi/Renderer/Renderer.cpp

@@ -806,7 +806,7 @@ void Renderer::updatePipelineStats()
 }
 #endif
 
-Error Renderer::render()
+Error Renderer::render(FencePtr& fence)
 {
 	ANKI_TRACE_SCOPED_EVENT(Render);
 
@@ -925,7 +925,6 @@ Error Renderer::render()
 	m_rgraph->compileNewGraph(ctx.m_renderGraphDescr, m_framePool);
 
 	// Flush
-	FencePtr fence;
 	m_rgraph->recordAndSubmitCommandBuffers(&fence);
 
 	// Misc

+ 1 - 1
AnKi/Renderer/Renderer.h

@@ -93,7 +93,7 @@ public:
 
 	Error init(const RendererInitInfo& inf);
 
-	Error render();
+	Error render(FencePtr& fence);
 
 #define ANKI_RENDERER_OBJECT_DEF(type, name, initCondition) \
 	type& get##type() \

+ 1 - 1
AnKi/Scene/SceneNode.cpp

@@ -44,7 +44,7 @@ namespace anki {
 		compInit.m_node = this; \
 		compInit.m_componentUuid = SceneGraph::getSingleton().m_scenes[m_sceneIndex].getNewNodeUuid(); \
 		compInit.m_sceneUuid = m_sceneUuid; \
-		auto it = SceneGraph::getSingleton().getComponentArrays().get##name##s().emplace(compInit); \
+		auto it = SceneGraph::getSingleton().m_componentArrays.get##name##s().emplace(compInit); \
 		it->setArrayIndex(it.getArrayIndex()); \
 		addComponent(&(*it)); \
 		return &(*it); \

+ 24 - 0
AnKi/Util/BitSet.h

@@ -304,6 +304,30 @@ public:
 		return *this;
 	}
 
+	template<typename TFunc>
+	FunctorContinue iterateSetBitsFromLeastSignificant(TFunc func) const
+	{
+		for(U32 i = 0; i < kChunkCount; ++i)
+		{
+			ChunkType bits = m_chunks[i];
+			while(bits)
+			{
+				const U32 lsb = U32(std::countr_zero(bits));
+				const U32 bitIdx = lsb + (i * kChunkBitCount);
+
+				const FunctorContinue cont = func(bitIdx);
+				if(cont == FunctorContinue::kStop)
+				{
+					return cont;
+				}
+
+				bits &= ~(ChunkType(1) << ChunkType(lsb));
+			}
+		}
+
+		return FunctorContinue::kContinue;
+	}
+
 private:
 	Array<ChunkType, kChunkCount> m_chunks;
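
A usage sketch of the new iterator, in the shape the ReBAR pool uses above: the functor receives each set bit's index from least significant upwards and can stop the walk early by returning FunctorContinue::kStop.

```cpp
BitSet<8, U32> mask = {false};
mask.set(0);
mask.set(3);

mask.iterateSetBitsFromLeastSignificant([](U32 bitIdx) {
	// Visited with bitIdx == 0, then bitIdx == 3
	return FunctorContinue::kContinue;
});
```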