Prechádzať zdrojové kódy

Add an async compute unit test

Panagiotis Christopoulos Charitos pred 4 rokmi
rodič
commit
30769fceae

+ 7 - 2
AnKi/Core/App.cpp

@@ -344,9 +344,14 @@ Error App::initInternal(const ConfigSet& config_, AllocAlignedCallback allocCb,
 		"NO dbg symbols, "
 #endif
 #if ANKI_EXTRA_CHECKS
-		"extra checks";
+		"extra checks, "
 #else
-		"NO extra checks";
+		"NO extra checks, "
+#endif
+#if ANKI_ENABLE_TRACE
+		"built with tracing";
+#else
+		"NOT built with tracing";
 #endif
 
 	ANKI_CORE_LOGI("Initializing application ("

+ 1 - 1
AnKi/Gr/Utils/ClassGpuAllocator.cpp

@@ -125,7 +125,7 @@ ClassGpuAllocator::Class* ClassGpuAllocator::findClass(PtrSize size, U alignment
 		++it;
 	}
 
-	ANKI_ASSERT(!"No class found");
+	ANKI_GR_LOGF("Memory class not found");
 	return nullptr;
 }
 

+ 2 - 0
AnKi/Gr/Vulkan/BufferImpl.cpp

@@ -88,6 +88,7 @@ Error BufferImpl::init(const BufferInitInfo& inf)
 		// Fallback: host & coherent and not cached
 		if(memIdx == MAX_U32)
 		{
+			ANKI_VK_LOGW("Using a fallback mode for write-only buffer");
 			memIdx = getGrManagerImpl().getGpuMemoryManager().findMemoryType(
 				req.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
 				VK_MEMORY_PROPERTY_HOST_CACHED_BIT | avoidDeviceLocal);
@@ -115,6 +116,7 @@ Error BufferImpl::init(const BufferInitInfo& inf)
 		// Fallback: Just cached
 		if(memIdx == MAX_U32)
 		{
+			ANKI_VK_LOGW("Using a fallback mode for read/write buffer");
 			memIdx = getGrManagerImpl().getGpuMemoryManager().findMemoryType(
 				req.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, 0);
 		}

+ 15 - 11
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -1034,17 +1034,15 @@ void GrManagerImpl::flushCommandBuffer(MicroCommandBufferPtr cmdb, Bool cmdbRend
 	submit.pCommandBuffers = &handle;
 
 	// Handle user semaphores
-	Array<U64, maxSemaphores> timelineSignalValues;
-	for(U64& i : timelineSignalValues)
-	{
-		i = 1;
-	}
+	Array<U64, maxSemaphores> waitTimelineValues;
+	Array<U64, maxSemaphores> signalTimelineValues;
+
 	VkTimelineSemaphoreSubmitInfo timelineInfo = {};
 	timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
 	timelineInfo.waitSemaphoreValueCount = userWaitSemaphores.getSize();
-	timelineInfo.pWaitSemaphoreValues = &timelineSignalValues[0];
+	timelineInfo.pWaitSemaphoreValues = &waitTimelineValues[0];
 	timelineInfo.signalSemaphoreValueCount = (userSignalSemaphore != nullptr);
-	timelineInfo.pSignalSemaphoreValues = &timelineSignalValues[0];
+	timelineInfo.pSignalSemaphoreValues = &signalTimelineValues[0];
 	submit.pNext = &timelineInfo;
 
 	for(MicroSemaphorePtr& userWaitSemaphore : userWaitSemaphores)
@@ -1053,6 +1051,8 @@ void GrManagerImpl::flushCommandBuffer(MicroCommandBufferPtr cmdb, Bool cmdbRend
 		ANKI_ASSERT(userWaitSemaphore->isTimeline());
 		waitSemaphores[submit.waitSemaphoreCount] = userWaitSemaphore->getHandle();
 
+		waitTimelineValues[submit.waitSemaphoreCount] = userWaitSemaphore->getSemaphoreValue();
+
 		// Be a bit conservative
 		waitStages[submit.waitSemaphoreCount] = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
 
@@ -1066,15 +1066,18 @@ void GrManagerImpl::flushCommandBuffer(MicroCommandBufferPtr cmdb, Bool cmdbRend
 	{
 		*userSignalSemaphore = m_semaphoreFactory.newInstance(fence, true);
 
-		signalSemaphores[submit.signalSemaphoreCount++] = (*userSignalSemaphore)->getHandle();
+		signalSemaphores[submit.signalSemaphoreCount] = (*userSignalSemaphore)->getHandle();
+
+		signalTimelineValues[submit.signalSemaphoreCount] = (*userSignalSemaphore)->getNextSemaphoreValue();
+
+		++submit.signalSemaphoreCount;
 	}
 
-	// Protect the class and the queue
+	// Protect the class, the queue and other stuff
 	LockGuard<Mutex> lock(m_globalMtx);
 
-	PerFrame& frame = m_perFrame[m_frame % MAX_FRAMES_IN_FLIGHT];
-
 	// Do some special stuff for the last command buffer
+	PerFrame& frame = m_perFrame[m_frame % MAX_FRAMES_IN_FLIGHT];
 	if(cmdbRenderedToSwapchain)
 	{
 		// Wait semaphore
@@ -1103,6 +1106,7 @@ void GrManagerImpl::flushCommandBuffer(MicroCommandBufferPtr cmdb, Bool cmdbRend
 		frame.m_queueWroteToSwapchainImage = getQueueTypeFromCommandBufferFlags(cmdb->getFlags());
 	}
 
+	// Submit
 	{
 		ANKI_TRACE_SCOPED_EVENT(VK_QUEUE_SUBMIT);
 		ANKI_VK_CHECKF(vkQueueSubmit(m_queues[getQueueTypeFromCommandBufferFlags(cmdb->getFlags())], 1, &submit,

+ 1 - 1
AnKi/Gr/Vulkan/GrManagerImpl.h

@@ -47,7 +47,7 @@ public:
 
 	ANKI_USE_RESULT Error init(const GrManagerInitInfo& cfg);
 
-	const Array<U32, U(QueueType::COUNT)> getQueueFamilies() const
+	const Array<U32, U(QueueType::COUNT)>& getQueueFamilies() const
 	{
 		return m_queueFamilyIndices;
 	}

+ 17 - 0
AnKi/Gr/Vulkan/SemaphoreFactory.h

@@ -56,6 +56,22 @@ public:
 		return m_isTimeline;
 	}
 
+	/// Advance the timeline counter by one and return the value the next signal should use.
+	/// @note It's thread safe. Not const: every call modifies the counter.
+	U64 getNextSemaphoreValue()
+	{
+		ANKI_ASSERT(m_isTimeline); // Only meaningful for timeline semaphores
+		return m_timelineValue.fetchAdd(1) + 1; // fetchAdd presumably yields the pre-add value, hence the + 1
+	}
+
+	/// Read the current timeline counter without modifying it; callers use the result as the value to wait on.
+	/// @note It's thread safe.
+	U64 getSemaphoreValue() const
+	{
+		ANKI_ASSERT(m_isTimeline); // Only meaningful for timeline semaphores
+		return m_timelineValue.load();
+	}
+
 private:
 	VkSemaphore m_handle = VK_NULL_HANDLE;
 	Atomic<U32> m_refcount = {0};
@@ -64,6 +80,7 @@ private:
 	/// Fence to find out when it's safe to reuse this semaphore.
 	MicroFencePtr m_fence;
 
+	Atomic<U64> m_timelineValue = {0};
 	Bool m_isTimeline = false;
 
 	MicroSemaphore(SemaphoreFactory* f, MicroFencePtr fence, Bool isTimeline);

+ 6 - 2
AnKi/Gr/Vulkan/SemaphoreFactory.inl.h

@@ -51,8 +51,8 @@ inline Bool MicroSemaphore::clientWait(Second seconds)
 	waitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
 	waitInfo.semaphoreCount = 1;
 	waitInfo.pSemaphores = &m_handle;
-	const U64 waitValue = 1;
-	waitInfo.pValues = &waitValue;
+	const U64 crntTimelineValue = m_timelineValue.load();
+	waitInfo.pValues = &crntTimelineValue;
 
 	const F64 nsf = 1e+9 * seconds;
 	const U64 ns = U64(nsf);
@@ -91,6 +91,10 @@ inline MicroSemaphorePtr SemaphoreFactory::newInstance(MicroFencePtr fence, Bool
 	{
 		out->m_fence = fence;
 		ANKI_ASSERT(out->m_isTimeline == isTimeline);
+		if(out->m_isTimeline)
+		{
+			ANKI_ASSERT(out->m_timelineValue.getNonAtomically() > 0 && "Recycled without being signaled?");
+		}
 	}
 
 	ANKI_ASSERT(out->m_refcount.getNonAtomically() == 0);

+ 5 - 0
AnKi/Util/Singleton.h

@@ -91,6 +91,11 @@ public:
 		}
 	}
 
+	static Bool isInitialized() // Reports whether m_instance is currently non-null
+	{
+		return m_instance != nullptr;
+	}
+
 private:
 	static Value* m_instance;
 };

+ 138 - 0
Tests/Gr/Gr.cpp

@@ -3511,4 +3511,142 @@ void main()
 	COMMON_END();
 }
 
+ANKI_TEST(Gr, AsyncCompute)
+{
+	COMMON_BEGIN()
+
+	constexpr U32 ARRAY_SIZE = 1000 * 1024 * 8; // Number of U32 counters; each dispatch below runs one invocation per counter
+
+	// Create the counting program: every invocation atomically adds 1..(localId * 20) to its own counter
+	static const char* PROG_SRC = R"(
+layout(local_size_x = 8) in;
+
+layout(binding = 0, std430) buffer b_buff
+{
+	U32 u_counters[];
+};
+
+void main()
+{
+	for(U32 i = 0u; i < gl_LocalInvocationID.x * 20u; ++i)
+	{
+		atomicAdd(u_counters[gl_GlobalInvocationID.x], i + 1u);
+	}
+})";
+
+	ShaderPtr shader = createShader(PROG_SRC, ShaderType::COMPUTE, *gr);
+	ShaderProgramInitInfo sprogInit;
+	sprogInit.m_computeShader = shader;
+	ShaderProgramPtr incrementProg = gr->newShaderProgram(sprogInit);
+
+	// Create the check program: swaps a counter to 4 only if it holds the sum the counting program should have produced
+	static const char* CHECK_SRC = R"(
+layout(local_size_x = 8) in;
+
+layout(binding = 0, std430) buffer b_buff
+{
+	U32 u_counters[];
+};
+
+void main()
+{
+	// Walk the atomics in reverse to make sure that this dispatch won't overlap with the previous one
+	const U32 newGlobalInvocationID = gl_NumWorkGroups.x * gl_WorkGroupSize.x - gl_GlobalInvocationID.x - 1u;
+
+	U32 expectedVal = 0u;
+	for(U32 i = 0u; i < (newGlobalInvocationID % gl_WorkGroupSize.x) * 20u; ++i)
+	{
+		expectedVal += i + 1u;
+	}
+
+	atomicCompSwap(u_counters[newGlobalInvocationID], expectedVal, 4u);
+})";
+
+	shader = createShader(CHECK_SRC, ShaderType::COMPUTE, *gr);
+	sprogInit.m_computeShader = shader;
+	ShaderProgramPtr checkProg = gr->newShaderProgram(sprogInit);
+
+	// Create the counters buffer, map it for CPU read/write and zero it
+	BufferInitInfo info;
+	info.m_size = sizeof(U32) * ARRAY_SIZE;
+	info.m_usage = BufferUsageBit::ALL_COMPUTE;
+	info.m_mapAccess = BufferMapAccessBit::WRITE | BufferMapAccessBit::READ;
+	BufferPtr atomicsBuffer = gr->newBuffer(info);
+	U32* values =
+		static_cast<U32*>(atomicsBuffer->map(0, MAX_PTR_SIZE, BufferMapAccessBit::READ | BufferMapAccessBit::WRITE));
+	memset(values, 0, info.m_size);
+
+	// Pre-create the CPU readback array and the expected results: 4 (written by the check pass) plus one counting pass' sum
+	DynamicArrayAuto<U32> atomicsBufferCpu(HeapAllocator<U8>(allocAligned, nullptr));
+	atomicsBufferCpu.create(ARRAY_SIZE);
+	DynamicArrayAuto<U32> expectedResultsBufferCpu(HeapAllocator<U8>(allocAligned, nullptr));
+	expectedResultsBufferCpu.create(ARRAY_SIZE);
+	for(U32 i = 0; i < ARRAY_SIZE; ++i)
+	{
+		const U32 localInvocation = i % 8; // 8 mirrors local_size_x in both shaders
+		U32 expectedVal = 4;
+		for(U32 j = 0; j < localInvocation * 20; ++j)
+		{
+			expectedVal += j + 1;
+		}
+
+		expectedResultsBufferCpu[i] = expectedVal;
+	}
+
+	// Create the 1st command buffer: first counting pass, flagged as COMPUTE_WORK
+	CommandBufferInitInfo cinit;
+	cinit.m_flags = CommandBufferFlag::COMPUTE_WORK | CommandBufferFlag::SMALL_BATCH;
+	CommandBufferPtr incrementCmdb = gr->newCommandBuffer(cinit);
+	incrementCmdb->bindShaderProgram(incrementProg);
+	incrementCmdb->bindStorageBuffer(0, 0, atomicsBuffer, 0, MAX_PTR_SIZE);
+	incrementCmdb->dispatchCompute(ARRAY_SIZE / 8, 1, 1);
+
+	// Create the 2nd command buffer: check pass, flagged as GENERAL_WORK
+	cinit.m_flags = CommandBufferFlag::GENERAL_WORK | CommandBufferFlag::SMALL_BATCH;
+	CommandBufferPtr checkCmdb = gr->newCommandBuffer(cinit);
+	checkCmdb->bindShaderProgram(checkProg);
+	checkCmdb->bindStorageBuffer(0, 0, atomicsBuffer, 0, MAX_PTR_SIZE);
+	checkCmdb->dispatchCompute(ARRAY_SIZE / 8, 1, 1);
+
+	// Create the 3rd command buffer: second counting pass, flagged as COMPUTE_WORK again
+	cinit.m_flags = CommandBufferFlag::COMPUTE_WORK | CommandBufferFlag::SMALL_BATCH;
+	CommandBufferPtr incrementCmdb2 = gr->newCommandBuffer(cinit);
+	incrementCmdb2->bindShaderProgram(incrementProg);
+	incrementCmdb2->bindStorageBuffer(0, 0, atomicsBuffer, 0, MAX_PTR_SIZE);
+	incrementCmdb2->dispatchCompute(ARRAY_SIZE / 8, 1, 1);
+
+	// Submit: each flush receives the fence produced by the previous flush (presumably to order them), then wait on the last
+#if 1
+	FencePtr fence;
+	incrementCmdb->flush({}, &fence);
+	checkCmdb->flush(Array<FencePtr, 1>{fence}, &fence);
+	incrementCmdb2->flush(Array<FencePtr, 1>{fence}, &fence);
+	fence->clientWait(MAX_SECOND);
+#else
+	incrementCmdb->flush();
+	gr->finish();
+	checkCmdb->flush();
+	gr->finish();
+	incrementCmdb2->flush();
+	gr->finish();
+#endif
+
+	// Verify: copy the mapped counters to the CPU array and compare with the expected values, printing the first mismatch
+	memcpy(atomicsBufferCpu.getBegin(), values, atomicsBufferCpu.getSizeInBytes());
+	Bool correct = true;
+	for(U32 i = 0; i < ARRAY_SIZE; ++i)
+	{
+		correct = correct && atomicsBufferCpu[i] == expectedResultsBufferCpu[i];
+		if(!correct)
+		{
+			printf("%u!=%u %u\n", atomicsBufferCpu[i], expectedResultsBufferCpu[i], i);
+			break;
+		}
+	}
+	atomicsBuffer->unmap();
+	ANKI_TEST_EXPECT_EQ(correct, true);
+
+	COMMON_END()
+}
+
 } // end namespace anki