瀏覽代碼

Optimize the job manager

Panagiotis Christopoulos Charitos 1 年之前
父節點
當前提交
9e66e43aae
共有 2 個文件被更改,包括 78 次插入和 52 次删除
  1. + 9 - 3
      Tests/Gr/GrWorkGraphs.cpp
  2. + 69 - 49
      Tests/Gr/JobManager.hlsl

+ 9 - 3
Tests/Gr/GrWorkGraphs.cpp

@@ -775,11 +775,12 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 
 		DynamicArray<U32> initialWorkItems;
 		U32 finalValue = 0;
+		U32 workItemCount = 0;
 		{
 			initialWorkItems.resize(128 * 1024);
 			for(U32 i = 0; i < initialWorkItems.getSize(); ++i)
 			{
-				const U32 level = (bBenchmark) ? i : (rand() % 4);
+				const U32 level = ((bBenchmark) ? i : rand()) % 4;
 				const U32 payload = (bBenchmark) ? 1 : (rand() % 4);
 
 				initialWorkItems[i] = (level << 16) | payload;
@@ -793,6 +794,8 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 				const U32 level = workItem >> 16u;
 				const U32 payload = workItem & 0xFFFFu;
 
+				++workItemCount;
+
 				if(level == 0)
 				{
 					finalValue += payload;
@@ -845,6 +848,8 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 			queueBuff = createBuffer(BufferUsageBit::kAllUav, q, 1);
 		}
 
+		ANKI_TEST_LOGI("Init complete");
+
 		const U32 iterationsPerCmdb = 1;
 		const U32 iterationCount = 1;
 		runBenchmark(iterationCount, iterationsPerCmdb, bBenchmark, [&](CommandBuffer& cmdb) {
@@ -855,7 +860,7 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 				cmdb.bindUav(0, 0, BufferView(queueBuff.get()));
 				cmdb.bindUav(1, 0, BufferView(queueRingBuff.get()));
 				cmdb.bindUav(2, 0, BufferView(resultBuff.get()));
-				UVec4 consts(queueRingBufferSize);
+				UVec4 consts(queueRingBufferSize - 1);
 				cmdb.setFastConstants(&consts, sizeof(consts));
 
 				cmdb.dispatchCompute(256, 1, 1);
@@ -864,7 +869,8 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 
 		DynamicArray<U32> result;
 		readBuffer(resultBuff, result);
-		printf("vals_equal:%u failed:%u total_workitems:%u\n", result[0] == finalValue, result[1], finalValue);
+		ANKI_TEST_EXPECT_EQ(result[0], finalValue);
+		ANKI_TEST_EXPECT_EQ(result[1], 0);
 	}
 
 	commonDestroy();

+ 69 - 49
Tests/Gr/JobManager.hlsl

@@ -4,7 +4,7 @@
 // http://www.anki3d.org/LICENSE
 
 #define LOCK(spinlock) \
-	for(uint keepWaiting__ = true; keepWaiting__;) \
+	for(bool keepWaiting__ = true; keepWaiting__;) \
 	{ \
 		uint locked__; \
 		InterlockedCompareExchange(spinlock, 0, 1, locked__); \
@@ -18,7 +18,7 @@
 	} \
 	}
 
-#define NUMTHREADS 64
+#define NUMTHREADS 128
 #define MAX_CHILDREN 4
 
 struct Queue
@@ -36,7 +36,7 @@ RWStructuredBuffer<uint> g_finalResult : register(u2);
 
 struct Consts
 {
-	uint m_ringBufferSize;
+	uint m_ringBufferSizeMinusOne;
 	uint m_padding[3];
 };
 
@@ -53,33 +53,86 @@ groupshared bool g_bNoMoreWork;
 groupshared uint g_outWorkItems[NUMTHREADS * MAX_CHILDREN];
 groupshared uint g_outWorkItemCount;
 
+static const int kMashPushTries = 1000;
+
 [numthreads(NUMTHREADS, 1, 1)] void main(uint svGroupIndex : SV_GroupIndex)
 {
+	if(svGroupIndex == 0)
+	{
+		g_inWorkItemCount = 0;
+		g_outWorkItemCount = 0;
+	}
+
 	while(true)
 	{
 		GroupMemoryBarrierWithGroupSync();
 
-		// Dequeue work
 		if(svGroupIndex == 0)
 		{
-			LOCK(g_queue[0].m_spinlock);
+			bool pushSuccessful = true;
+			int iterationCount = kMashPushTries;
+			const uint oldInWorkItemCount = g_inWorkItemCount;
+			const uint outWorkItemCount = g_outWorkItemCount;
+			do
+			{
+				LOCK(g_queue[0].m_spinlock);
 
-			const uint workItemCount = min(NUMTHREADS, g_queue[0].m_head - g_queue[0].m_tail);
+				// Touch groupshared as little as possible
+				uint head = g_queue[0].m_head;
+				uint tail = g_queue[0].m_tail;
+				uint pendingWork = g_queue[0].m_pendingWork;
 
-			for(uint it = 0; it < workItemCount; ++it)
-			{
-				g_inWorkItems[it] = g_ringBuffer[(g_queue[0].m_tail + it) & (g_consts.m_ringBufferSize - 1u)];
-			}
+				// Dequeue work
+				if(iterationCount == kMashPushTries)
+				{
+					const uint workItemCount = min(NUMTHREADS, head - tail);
+
+					for(uint it = 0; it < workItemCount; ++it)
+					{
+						g_inWorkItems[it] = g_ringBuffer[(tail + it) & g_consts.m_ringBufferSizeMinusOne];
+					}
+
+					pendingWork += workItemCount;
+					g_inWorkItemCount = workItemCount;
+					tail += workItemCount;
+				}
+
+				// Push work
+				if(outWorkItemCount > 0)
+				{
+					const bool full = (head - tail) + outWorkItemCount >= (g_consts.m_ringBufferSizeMinusOne + 1);
+					pushSuccessful = !full;
+					if(pushSuccessful)
+					{
+						for(uint i = 0; i < outWorkItemCount; ++i)
+						{
+							g_ringBuffer[(head + i) & g_consts.m_ringBufferSizeMinusOne] = g_outWorkItems[i];
+						}
 
-			g_inWorkItemCount = workItemCount;
-			g_queue[0].m_tail += workItemCount;
-			g_queue[0].m_pendingWork += workItemCount;
+						head += outWorkItemCount;
+						g_outWorkItemCount = 0;
+					}
+				}
+
+				if(pushSuccessful)
+				{
+					pendingWork -= oldInWorkItemCount;
+					g_bNoMoreWork = pendingWork == 0;
+				}
 
-			g_bNoMoreWork = g_queue[0].m_pendingWork == 0;
+				// Restore mem
+				g_queue[0].m_head = head;
+				g_queue[0].m_tail = tail;
+				g_queue[0].m_pendingWork = pendingWork;
 
-			UNLOCK(g_queue[0].m_spinlock);
+				UNLOCK(g_queue[0].m_spinlock);
+			} while(!pushSuccessful && (iterationCount-- > 0));
 
-			g_outWorkItemCount = 0;
+			if(!pushSuccessful)
+			{
+				InterlockedAdd(g_finalResult[1], 1);
+				g_bNoMoreWork = true;
+			}
 		}
 
 		GroupMemoryBarrierWithGroupSync();
@@ -115,38 +168,5 @@ groupshared uint g_outWorkItemCount;
 				}
 			}
 		}
-
-		GroupMemoryBarrierWithGroupSync();
-
-		// Push new work
-		if(svGroupIndex == 0)
-		{
-			bool success = true;
-			int iterationCount = 1000;
-			do
-			{
-				LOCK(g_queue[0].m_spinlock);
-
-				const bool full = (g_queue[0].m_head - g_queue[0].m_tail) + g_outWorkItemCount >= g_consts.m_ringBufferSize;
-				success = !full;
-				if(success)
-				{
-					for(uint i = 0; i < g_outWorkItemCount; ++i)
-					{
-						g_ringBuffer[(g_queue[0].m_head + i) & (g_consts.m_ringBufferSize - 1u)] = g_outWorkItems[i];
-					}
-
-					g_queue[0].m_head += g_outWorkItemCount;
-					g_queue[0].m_pendingWork -= g_inWorkItemCount;
-				}
-
-				UNLOCK(g_queue[0].m_spinlock);
-			} while(!success && (iterationCount-- > 0));
-
-			if(iterationCount <= 0)
-			{
-				InterlockedAdd(g_finalResult[1], 1);
-			}
-		}
 	}
 }