
CPU optimizations in threaded problems

Panagiotis Christopoulos Charitos authored 1 year ago · commit 65196489fa
2 changed files with 20 additions and 22 deletions
  1. AnKi/Gr/Vulkan/VkCommandBuffer.cpp (+13, -19)
  2. AnKi/Util/Functions.h (+7, -3)

AnKi/Gr/Vulkan/VkCommandBuffer.cpp (+13, -19)

@@ -971,8 +971,7 @@ void CommandBuffer::setPipelineBarrier(ConstWeakArray<TextureBarrierInfo> textur
 	self.commandCommon();

 	DynamicArray<VkImageMemoryBarrier, MemoryPoolPtrWrapper<StackMemoryPool>> imageBarriers(self.m_pool);
-	DynamicArray<VkBufferMemoryBarrier, MemoryPoolPtrWrapper<StackMemoryPool>> bufferBarriers(self.m_pool);
-	DynamicArray<VkMemoryBarrier, MemoryPoolPtrWrapper<StackMemoryPool>> genericBarriers(self.m_pool);
+	VkMemoryBarrier genericBarrier = {.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER};
 	VkPipelineStageFlags srcStageMask = 0;
 	VkPipelineStageFlags dstStageMask = 0;

@@ -990,32 +989,27 @@ void CommandBuffer::setPipelineBarrier(ConstWeakArray<TextureBarrierInfo> textur
 		const BufferImpl& impl = static_cast<const BufferImpl&>(barrier.m_bufferView.getBuffer());
 		const VkBufferMemoryBarrier akBarrier = impl.computeBarrierInfo(barrier.m_previousUsage, barrier.m_nextUsage, srcStageMask, dstStageMask);

-		if(bufferBarriers.getSize() && bufferBarriers.getBack().buffer == akBarrier.buffer)
-		{
-			// Merge barriers
-			bufferBarriers.getBack().srcAccessMask |= akBarrier.srcAccessMask;
-			bufferBarriers.getBack().dstAccessMask |= akBarrier.dstAccessMask;
-		}
-		else
-		{
-			// Create a new buffer barrier
-			bufferBarriers.emplaceBack(akBarrier);
-		}
+		genericBarrier.srcAccessMask |= akBarrier.srcAccessMask;
+		genericBarrier.dstAccessMask |= akBarrier.dstAccessMask;
 	}

 	for(const AccelerationStructureBarrierInfo& barrier : accelerationStructures)
 	{
 		ANKI_ASSERT(barrier.m_as);

-		genericBarriers.emplaceBack(
-			AccelerationStructureImpl::computeBarrierInfo(barrier.m_previousUsage, barrier.m_nextUsage, srcStageMask, dstStageMask));
+		const VkMemoryBarrier memBarrier =
+			AccelerationStructureImpl::computeBarrierInfo(barrier.m_previousUsage, barrier.m_nextUsage, srcStageMask, dstStageMask);
+
+		genericBarrier.srcAccessMask |= memBarrier.srcAccessMask;
+		genericBarrier.dstAccessMask |= memBarrier.dstAccessMask;
+
 		self.m_microCmdb->pushObjectRef(barrier.m_as);
 	}

-	vkCmdPipelineBarrier(self.m_handle, srcStageMask, dstStageMask, 0, genericBarriers.getSize(),
-						 (genericBarriers.getSize()) ? &genericBarriers[0] : nullptr, bufferBarriers.getSize(),
-						 (bufferBarriers.getSize()) ? &bufferBarriers[0] : nullptr, imageBarriers.getSize(),
-						 (imageBarriers.getSize()) ? &imageBarriers[0] : nullptr);
+	const Bool genericBarrierSet = genericBarrier.srcAccessMask != 0 && genericBarrier.dstAccessMask != 0;
+
+	vkCmdPipelineBarrier(self.m_handle, srcStageMask, dstStageMask, 0, (genericBarrierSet) ? 1 : 0, (genericBarrierSet) ? &genericBarrier : nullptr,
+						 0, nullptr, imageBarriers.getSize(), (imageBarriers.getSize()) ? &imageBarriers[0] : nullptr);

 	ANKI_TRACE_INC_COUNTER(VkBarrier, 1);
 }
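
Net effect of this hunk: instead of building per-buffer VkBufferMemoryBarrier entries and per-acceleration-structure VkMemoryBarrier entries in two heap-backed DynamicArrays, the command buffer now ORs all of their access masks into one global VkMemoryBarrier and records at most one of it. A minimal standalone sketch of that coalescing pattern follows; BarrierRequest and recordCoalescedBarrier are hypothetical names for illustration, not AnKi API.

// Sketch only: coalesce N buffer/AS barrier requests into a single global
// VkMemoryBarrier, mirroring what the commit does inside setPipelineBarrier.
#include <vulkan/vulkan.h>
#include <cstdint>

struct BarrierRequest // hypothetical input, stands in for AnKi's barrier infos
{
	VkPipelineStageFlags m_srcStage;
	VkPipelineStageFlags m_dstStage;
	VkAccessFlags m_srcAccess;
	VkAccessFlags m_dstAccess;
};

void recordCoalescedBarrier(VkCommandBuffer cmdb, const BarrierRequest* requests, uint32_t count)
{
	if(count == 0)
	{
		return; // Nothing to synchronize; zero stage masks would be invalid anyway
	}

	VkMemoryBarrier merged = {};
	merged.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
	VkPipelineStageFlags srcStageMask = 0;
	VkPipelineStageFlags dstStageMask = 0;

	for(uint32_t i = 0; i < count; ++i)
	{
		// A global memory barrier covers every resource, so the per-resource
		// arrays disappear and only the masks accumulate.
		merged.srcAccessMask |= requests[i].m_srcAccess;
		merged.dstAccessMask |= requests[i].m_dstAccess;
		srcStageMask |= requests[i].m_srcStage;
		dstStageMask |= requests[i].m_dstStage;
	}

	// Same condition as the commit: only pass the barrier if both masks are set.
	const bool mergedSet = merged.srcAccessMask != 0 && merged.dstAccessMask != 0;
	vkCmdPipelineBarrier(cmdb, srcStageMask, dstStageMask, 0, mergedSet ? 1u : 0u, mergedSet ? &merged : nullptr, 0, nullptr, 0, nullptr);
}

The trade-off is granularity: a global memory barrier makes every prior write of the ORed kinds visible to every later read in the given stages, which is coarser than per-buffer barriers but cheaper to record on the CPU, and buffer-level granularity is commonly reported to buy little on real drivers anyway.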

AnKi/Util/Functions.h (+7, -3)

@@ -298,9 +298,13 @@ inline void unflatten3dArrayIndex(const T sizeA, const T sizeB, const T sizeC, c
 inline void splitThreadedProblem(U32 threadId, U32 threadCount, U32 problemSize, U32& start, U32& end)
 {
 	ANKI_ASSERT(threadCount > 0 && threadId < threadCount);
-	const U32 div = problemSize / threadCount;
-	start = threadId * div;
-	end = (threadId == threadCount - 1) ? problemSize : (threadId + 1u) * div;
+	const U32 range = problemSize / threadCount;
+	const U32 remain = problemSize % threadCount;
+
+	start = threadId * range + min(remain, threadId);
+	end = start + range + (threadId < remain);
+
+	ANKI_ASSERT(start <= problemSize && end <= problemSize);
 	ANKI_ASSERT(!(threadId == threadCount - 1 && end != problemSize));
 }
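
Worked example of the new split: with problemSize = 10 and threadCount = 3, range = 3 and remain = 1, so thread 0 takes [0, 4) while threads 1 and 2 take [4, 7) and [7, 10). The old code gave every thread problemSize / threadCount elements and dumped the whole remainder on the last one (for 100 elements over 8 threads, the last thread got 16 while the rest got 12); the new version spreads the remainder over the first remain threads, so range sizes differ by at most one. A self-contained sketch of the same logic, using std::min in place of AnKi's min and uint32_t in place of U32:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors the new splitThreadedProblem: the first (problemSize % threadCount)
// threads take one extra element each, so range sizes differ by at most 1.
static void splitThreadedProblem(uint32_t threadId, uint32_t threadCount, uint32_t problemSize, uint32_t& start, uint32_t& end)
{
	assert(threadCount > 0 && threadId < threadCount);
	const uint32_t range = problemSize / threadCount;
	const uint32_t remain = problemSize % threadCount;

	start = threadId * range + std::min(remain, threadId);
	end = start + range + ((threadId < remain) ? 1 : 0);
}

int main()
{
	// Prints: thread 0: [0, 4), thread 1: [4, 7), thread 2: [7, 10)
	for(uint32_t tid = 0; tid < 3; ++tid)
	{
		uint32_t start, end;
		splitThreadedProblem(tid, 3, 10, start, end);
		printf("thread %u: [%u, %u)\n", tid, start, end);
	}
	return 0;
}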
 }