Panagiotis Christopoulos Charitos 1 anno fa
parent
commit
248fede4fb
3 file modificati con 245 aggiunte e 5 eliminazioni
  1. 67 5
      Tests/Gr/GrWorkGraphs.cpp
  2. 0 0
      Tests/Gr/JobManagerCompute.hlsl
  3. 178 0
      Tests/Gr/JobManagerWg.hlsl

+ 67 - 5
Tests/Gr/GrWorkGraphs.cpp

@@ -758,30 +758,54 @@ ANKI_TEST(Gr, WorkGraphsOverhead)
 ANKI_TEST(Gr, WorkGraphsJobManager)
 ANKI_TEST(Gr, WorkGraphsJobManager)
 {
 {
 	Bool bBenchmark, bWorkgraphs;
 	Bool bBenchmark, bWorkgraphs;
+	// CVarSet::getSingleton().setMultiple(Array<const Char*, 2>{"Device", "1"});
 	commonInitWg(bBenchmark, bWorkgraphs);
 	commonInitWg(bBenchmark, bWorkgraphs);
 
 
 	const U32 queueRingBufferSize = nextPowerOfTwo(2 * 1024 * 1024);
 	const U32 queueRingBufferSize = nextPowerOfTwo(2 * 1024 * 1024);
+	const U32 initialWorkItemCount = 128 * 1024;
 
 
 	{
 	{
 		// Create compute progs
 		// Create compute progs
 		ShaderProgramPtr compProg;
 		ShaderProgramPtr compProg;
 		{
 		{
-			ShaderPtr shader =
-				loadShader(ANKI_SOURCE_DIRECTORY "/Tests/Gr/JobManager.hlsl", ShaderType::kCompute, Array<CString, 1>{"-DWORKGRAPHS=0"});
+			ShaderPtr shader = loadShader(ANKI_SOURCE_DIRECTORY "/Tests/Gr/JobManagerCompute.hlsl", ShaderType::kCompute);
 			ShaderProgramInitInfo progInit;
 			ShaderProgramInitInfo progInit;
 			progInit.m_computeShader = shader.get();
 			progInit.m_computeShader = shader.get();
 			compProg = GrManager::getSingleton().newShaderProgram(progInit);
 			compProg = GrManager::getSingleton().newShaderProgram(progInit);
 		}
 		}
 
 
+		ShaderProgramPtr wgProg;
+		if(bWorkgraphs)
+		{
+			ShaderPtr shader = loadShader(ANKI_SOURCE_DIRECTORY "/Tests/Gr/JobManagerWg.hlsl", ShaderType::kWorkGraph);
+
+			ShaderProgramInitInfo progInit;
+			Array<WorkGraphNodeSpecialization, 1> specializations = {{{"main", UVec3((initialWorkItemCount + 64 - 1) / 64, 1, 1)}}};
+			progInit.m_workGraph.m_nodeSpecializations = specializations;
+			progInit.m_workGraph.m_shader = shader.get();
+			wgProg = GrManager::getSingleton().newShaderProgram(progInit);
+		}
+
+		// Scratch buff
+		BufferPtr scratchBuff;
+		if(bWorkgraphs)
+		{
+			BufferInitInfo scratchInit("scratch");
+			scratchInit.m_size = wgProg->getWorkGraphMemoryRequirements();
+			scratchInit.m_usage = BufferUsageBit::kAllUav | BufferUsageBit::kAllSrv;
+			scratchBuff = GrManager::getSingleton().newBuffer(scratchInit);
+		}
+
 		DynamicArray<U32> initialWorkItems;
 		DynamicArray<U32> initialWorkItems;
 		U32 finalValue = 0;
 		U32 finalValue = 0;
 		U32 workItemCount = 0;
 		U32 workItemCount = 0;
 		{
 		{
-			initialWorkItems.resize(128 * 1024);
+			initialWorkItems.resize(initialWorkItemCount);
 			for(U32 i = 0; i < initialWorkItems.getSize(); ++i)
 			for(U32 i = 0; i < initialWorkItems.getSize(); ++i)
 			{
 			{
-				const U32 level = ((bBenchmark) ? i : rand()) % 4;
-				const U32 payload = (bBenchmark) ? 1 : (rand() % 4);
+				const Bool bDeterministic = bBenchmark;
+				const U32 level = ((bDeterministic) ? i : rand()) % 4;
+				const U32 payload = ((bDeterministic) ? 1 : rand()) % 4;
 
 
 				initialWorkItems[i] = (level << 16) | payload;
 				initialWorkItems[i] = (level << 16) | payload;
 			}
 			}
@@ -816,6 +840,7 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 		BufferPtr resultBuff = createBuffer<U32>(BufferUsageBit::kAllUav, 0u, 2);
 		BufferPtr resultBuff = createBuffer<U32>(BufferUsageBit::kAllUav, 0u, 2);
 
 
 		BufferPtr queueRingBuff;
 		BufferPtr queueRingBuff;
+		if(!bWorkgraphs)
 		{
 		{
 			queueRingBuff = createBuffer<U32>(BufferUsageBit::kAllUav, 0u, queueRingBufferSize);
 			queueRingBuff = createBuffer<U32>(BufferUsageBit::kAllUav, 0u, queueRingBufferSize);
 
 
@@ -832,7 +857,26 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 			ANKI_TEST_EXPECT_EQ(fence->clientWait(kMaxSecond), true);
 			ANKI_TEST_EXPECT_EQ(fence->clientWait(kMaxSecond), true);
 		}
 		}
 
 
+		BufferPtr initialWorkItemsBuff;
+		if(bWorkgraphs)
+		{
+			initialWorkItemsBuff = createBuffer<U32>(BufferUsageBit::kAllUav, 0u, initialWorkItemCount);
+
+			BufferPtr tempBuff = createBuffer<U32>(BufferUsageBit::kCopySource, initialWorkItems);
+
+			CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(CommandBufferInitInfo());
+			cmdb->copyBufferToBuffer(BufferView(tempBuff.get(), 0, initialWorkItems.getSizeInBytes()),
+									 BufferView(initialWorkItemsBuff.get(), 0, initialWorkItems.getSizeInBytes()));
+			cmdb->endRecording();
+
+			FencePtr fence;
+			GrManager::getSingleton().submit(cmdb.get(), {}, &fence);
+
+			ANKI_TEST_EXPECT_EQ(fence->clientWait(kMaxSecond), true);
+		}
+
 		BufferPtr queueBuff;
 		BufferPtr queueBuff;
+		if(!bWorkgraphs)
 		{
 		{
 			struct Queue
 			struct Queue
 			{
 			{
@@ -865,10 +909,28 @@ ANKI_TEST(Gr, WorkGraphsJobManager)
 
 
 				cmdb.dispatchCompute(256, 1, 1);
 				cmdb.dispatchCompute(256, 1, 1);
 			}
 			}
+			else
+			{
+				cmdb.bindShaderProgram(wgProg.get());
+
+				cmdb.bindSrv(0, 0, BufferView(initialWorkItemsBuff.get()));
+				cmdb.bindUav(0, 0, BufferView(resultBuff.get()));
+
+				struct FirstNodeRecord
+				{
+					UVec3 m_gridSize;
+				};
+
+				Array<FirstNodeRecord, 1> records;
+				records[0].m_gridSize = UVec3((initialWorkItemCount + 64 - 1) / 64, 1, 1);
+
+				cmdb.dispatchGraph(BufferView(scratchBuff.get()), records.getBegin(), records.getSize(), sizeof(records[0]));
+			}
 		});
 		});
 
 
 		DynamicArray<U32> result;
 		DynamicArray<U32> result;
 		readBuffer(resultBuff, result);
 		readBuffer(resultBuff, result);
+		printf("expecting %u, got %u. Error %u\n", finalValue, result[0], result[1]);
 		ANKI_TEST_EXPECT_EQ(result[0], finalValue);
 		ANKI_TEST_EXPECT_EQ(result[0], finalValue);
 		ANKI_TEST_EXPECT_EQ(result[1], 0);
 		ANKI_TEST_EXPECT_EQ(result[1], 0);
 	}
 	}

+ 0 - 0
Tests/Gr/JobManager.hlsl → Tests/Gr/JobManagerCompute.hlsl


+ 178 - 0
Tests/Gr/JobManagerWg.hlsl

@@ -0,0 +1,178 @@
+// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+// Number of child work items generated by every work item whose level is not zero. Also used as the MaxRecords bound of
+// the node outputs.
+#define MAX_CHILDREN 4
+// Thread-group width of both nodes and the capacity of SecondNodeInput::m_workItems.
+#define NUMTHREADS 64
+
+// Initial packed work items: the level lives in bits 16 and up, the payload in the low 16 bits.
+StructuredBuffer<uint> g_initialWork : register(t0);
+
+// Element 0 accumulates the payloads of level-0 work items, element 1 counts failed ASSERTs.
+RWStructuredBuffer<uint> g_finalResult : register(u0);
+
+// Records a failed check by atomically incrementing the error counter in g_finalResult[1] when x evaluates to false.
+// Execution continues either way.
+#define ASSERT(x) \
+	do \
+	{ \
+		if(!(x)) \
+		{ \
+			InterlockedAdd(g_finalResult[1], 1); \
+		} \
+	} while(0)
+
+// Input record of the entry node ("main"). It only carries the dispatch grid.
+struct FirstNodeInput
+{
+	uint3 m_svDispatchGrid : SV_DispatchGrid;
+};
+
+// Input record of "secondNode": a batch of up to NUMTHREADS packed work items.
+struct SecondNodeInput
+{
+	uint3 m_svDispatchGrid : SV_DispatchGrid;
+
+	// Packed work items (level << 16 | payload). Only the first m_workItemCount entries are read by the consumer.
+	uint m_workItems[NUMTHREADS];
+	// Number of valid entries in m_workItems.
+	uint m_workItemCount;
+};
+
+// Group-wide count of the child work items generated by the current thread group. Thread 0 zeroes it at the start of
+// each node and the other threads bump it with InterlockedAdd.
+groupshared uint g_newWorkItemCount;
+
+// Entry node. Every thread consumes one element of g_initialWork. Level-0 items add their payload to g_finalResult[0].
+// Any other item produces MAX_CHILDREN copies of itself with the level decremented, and the group packs all of those
+// copies into records of NUMTHREADS items that are sent to secondNode.
+[Shader("node")][NodeLaunch("broadcasting")][NodeIsProgramEntry][NodeMaxDispatchGrid(1, 1, 1)][NumThreads(NUMTHREADS, 1, 1)] void
+main(DispatchNodeInputRecord<FirstNodeInput> input, uint svDispatchThreadId
+	 : SV_DispatchThreadId, uint svGroupIndex : SV_GroupIndex, [MaxRecords(MAX_CHILDREN)] NodeOutput<SecondNodeInput> secondNode)
+{
+	// A single thread resets the group-wide child counter before anybody adds to it
+	if(svGroupIndex == 0)
+	{
+		g_newWorkItemCount = 0;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+
+	uint itemCount, itemStride;
+	g_initialWork.GetDimensions(itemCount, itemStride);
+
+	uint childCount = 0; // Children produced by this thread
+	uint children[MAX_CHILDREN];
+	uint firstSlot = 0; // Group-wide index of this thread's first child
+
+	if(svDispatchThreadId < itemCount)
+	{
+		const uint packed = g_initialWork[svDispatchThreadId];
+		const uint level = packed >> 16u;
+		const uint payload = packed & 0xFFFFu;
+
+		if(level != 0)
+		{
+			// Not a leaf: spawn the children one level down, all with the same payload
+			const uint child = ((level - 1) << 16u) | payload;
+
+			for(uint c = 0; c < MAX_CHILDREN; ++c)
+			{
+				children[c] = child;
+			}
+
+			childCount = MAX_CHILDREN;
+			InterlockedAdd(g_newWorkItemCount, MAX_CHILDREN, firstSlot);
+		}
+		else
+		{
+			// Leaf: contribute to the final sum
+			InterlockedAdd(g_finalResult[0], payload);
+		}
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// The counter is stable after the barrier, snapshot it
+	const uint totalChildren = g_newWorkItemCount;
+	const uint recordCount = (totalChildren + NUMTHREADS - 1) / NUMTHREADS;
+	GroupNodeOutputRecords<SecondNodeInput> output = secondNode.GetGroupNodeOutputRecords(recordCount);
+
+	if(recordCount != 0)
+	{
+		// Record headers. Every thread writes the same values
+		for(uint r = 0; r < recordCount; ++r)
+		{
+			const uint begin = r * NUMTHREADS;
+			const uint end = min(begin + NUMTHREADS, totalChildren);
+
+			output[r].m_svDispatchGrid = 1;
+			output[r].m_workItemCount = end - begin;
+		}
+
+		// Scatter this thread's children into the slots it reserved
+		for(uint c = 0; c < childCount; ++c)
+		{
+			const uint slot = firstSlot + c;
+			output[slot / NUMTHREADS].m_workItems[slot % NUMTHREADS] = children[c];
+		}
+	}
+
+	output.OutputComplete();
+}
+
+static const int x = 0; // For formatting
+
+// Recursive node. Every thread consumes one work item of the input record. Level-0 items add their payload to
+// g_finalResult[0]. Any other item produces MAX_CHILDREN copies of itself with the level decremented, and the group
+// packs those copies into new records for this same node. When the recursive output is no longer valid no records are
+// emitted, and any children that had to be dropped are reported through ASSERT.
+[Shader("node")][NodeLaunch("broadcasting")][NumThreads(NUMTHREADS, 1, 1)][NodeDispatchGrid(1, 1, 1)][NodeMaxRecursionDepth(16)] void
+secondNode(DispatchNodeInputRecord<SecondNodeInput> input, [MaxRecords(MAX_CHILDREN)] NodeOutput<SecondNodeInput> secondNode,
+		   uint svGroupIndex : SV_GroupIndex)
+{
+	// A single thread resets the group-wide child counter before anybody adds to it
+	if(svGroupIndex == 0)
+	{
+		g_newWorkItemCount = 0;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+
+	uint newWorkItemCount = 0; // Children produced by this thread
+	uint newWorkItems[MAX_CHILDREN];
+	uint firstOutputRecord = 0; // Group-wide index of this thread's first child
+
+	if(svGroupIndex < input.Get().m_workItemCount)
+	{
+		const uint workItem = input.Get().m_workItems[svGroupIndex];
+
+		const uint level = workItem >> 16u;
+		const uint payload = workItem & 0xFFFFu;
+
+		if(level == 0)
+		{
+			// Leaf: contribute to the final sum
+			InterlockedAdd(g_finalResult[0], payload);
+		}
+		else
+		{
+			// Not a leaf: spawn the children one level down, all with the same payload
+			uint newWorkItem = (level - 1) << 16u;
+			newWorkItem |= payload;
+
+			for(uint i = 0; i < MAX_CHILDREN; ++i)
+			{
+				newWorkItems[i] = newWorkItem;
+			}
+
+			InterlockedAdd(g_newWorkItemCount, MAX_CHILDREN, firstOutputRecord);
+			newWorkItemCount = MAX_CHILDREN;
+		}
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// Request zero records when the recursive output can't be used anymore
+	const uint recordCount = (secondNode.IsValid()) ? (g_newWorkItemCount + NUMTHREADS - 1) / NUMTHREADS : 0;
+	GroupNodeOutputRecords<SecondNodeInput> output = secondNode.GetGroupNodeOutputRecords(recordCount);
+
+	if(recordCount)
+	{
+		// Record headers. Every thread writes the same values
+		for(uint i = 0; i < recordCount; ++i)
+		{
+			output[i].m_svDispatchGrid = 1;
+			const uint begin = i * NUMTHREADS;
+			const uint end = min((i + 1) * NUMTHREADS, g_newWorkItemCount);
+			output[i].m_workItemCount = end - begin;
+		}
+
+		// Scatter this thread's children into the slots it reserved
+		for(uint i = 0; i < newWorkItemCount; ++i)
+		{
+			const uint k = (firstOutputRecord + i) / NUMTHREADS;
+			const uint l = (firstOutputRecord + i) % NUMTHREADS;
+			output[k].m_workItems[l] = newWorkItems[i];
+		}
+	}
+
+	output.OutputComplete();
+
+	// If the output is invalid the children counted in g_newWorkItemCount were never emitted and the final sum will come
+	// out short. Flag that once per group. The previous ASSERT(1) could never fire
+	if(!secondNode.IsValid() && svGroupIndex == 0)
+	{
+		ASSERT(g_newWorkItemCount == 0);
+	}
+}