
Add an async compute benchmark

Panagiotis Christopoulos Charitos, 10 months ago
Commit feb9722089

+ 3 - 1
AnKi/Gr/Common.h

@@ -63,7 +63,9 @@ inline BoolCVar g_dredCVar("Gr", "Dred", false, "Enable DRED");
 inline NumericCVar<PtrSize> g_diskShaderCacheMaxSizeCVar("Gr", "DiskShaderCacheMaxSize", 128_MB, 1_MB, 1_GB, "Max size of the pipeline cache file");
 inline BoolCVar g_debugPrintfCVar("Gr", "DebugPrintf", false, "Enable or not debug printf");
 inline BoolCVar g_samplerFilterMinMaxCVar("Gr", "SamplerFilterMinMax", true, "Enable or not min/max sample filtering");
-inline BoolCVar g_asyncComputeCVar("Gr", "AsyncCompute", true, "Enable or not async compute");
+inline NumericCVar<U8> g_asyncComputeCVar("Gr", "AsyncCompute", 0, 0, 2,
+										  "Control the async compute behaviour: 0: Try to use a separate queue family, 1: Use a lower priority queue in the "
+										  "general queue's family, 2: Use the general queue");
 inline StringCVar g_vkLayersCVar("Gr", "VkLayers", "", "VK layers to enable. Separated by :");
 #endif
 

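The CVar changes from an on/off Bool to a tri-state numeric. A minimal sketch of how the three values end up being interpreted, mirroring the selection logic in VkGrManager.cpp further down (the Bool names here are illustrative and not part of the commit):

	// 0 -> prefer a dedicated compute-only queue family (falls back if none exists)
	// 1 -> skip the dedicated family, use a 2nd lower-priority queue of the general family
	// 2 -> no async compute at all, everything runs on the single general queue
	const Bool preferDedicatedComputeFamily = (g_asyncComputeCVar == 0);
	const Bool allowLowPriorityQueueFallback = (g_asyncComputeCVar <= 1);
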
+ 35 - 28
AnKi/Gr/Vulkan/VkCommandBufferFactory.cpp

@@ -4,6 +4,7 @@
 // http://www.anki3d.org/LICENSE
 
 #include <AnKi/Gr/Vulkan/VkCommandBufferFactory.h>
+#include <AnKi/Gr/Vulkan/VkGrManager.h>
 #include <AnKi/Util/Tracer.h>
 #include <AnKi/Core/StatsSet.h>
 
@@ -11,20 +12,6 @@ namespace anki {
 
 static StatCounter g_commandBufferCountStatVar(StatCategory::kMisc, "CommandBufferCount", StatFlag::kNone);
 
-static GpuQueueType getQueueTypeFromCommandBufferFlags(CommandBufferFlag flags, const VulkanQueueFamilies& queueFamilies)
-{
-	ANKI_ASSERT(!!(flags & CommandBufferFlag::kGeneralWork) ^ !!(flags & CommandBufferFlag::kComputeWork));
-	if(!(flags & CommandBufferFlag::kGeneralWork) && queueFamilies[GpuQueueType::kCompute] != kMaxU32)
-	{
-		return GpuQueueType::kCompute;
-	}
-	else
-	{
-		ANKI_ASSERT(queueFamilies[GpuQueueType::kGeneral] != kMaxU32);
-		return GpuQueueType::kGeneral;
-	}
-}
-
 void MicroCommandBufferPtrDeleter::operator()(MicroCommandBuffer* ptr)
 {
 	ANKI_ASSERT(ptr);
@@ -39,7 +26,10 @@ MicroCommandBuffer::~MicroCommandBuffer()
 
 	if(m_handle)
 	{
-		vkFreeCommandBuffers(getVkDevice(), m_threadAlloc->m_pools[m_queue], 1, &m_handle);
+		const U32 queueFamilyIdx =
+			(m_queue == GpuQueueType::kCompute && getGrManagerImpl().getAsyncComputeType() == AsyncComputeType::kLowPriorityQueue) ? 0 : U32(m_queue);
+
+		vkFreeCommandBuffers(getVkDevice(), m_threadAlloc->m_pools[queueFamilyIdx], 1, &m_handle);
 		m_handle = {};
 
 		g_commandBufferCountStatVar.decrement(1_U64);
@@ -65,19 +55,15 @@ void MicroCommandBuffer::reset()
 
 Error CommandBufferThreadAllocator::init()
 {
-	for(GpuQueueType qtype : EnumIterable<GpuQueueType>())
+	ConstWeakArray<U32> families = getGrManagerImpl().getQueueFamilies();
+	for(U32 i = 0; i < families.getSize(); ++i)
 	{
-		if(CommandBufferFactory::getSingleton().m_queueFamilies[qtype] == kMaxU32)
-		{
-			continue;
-		}
-
 		VkCommandPoolCreateInfo ci = {};
 		ci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
 		ci.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
-		ci.queueFamilyIndex = CommandBufferFactory::getSingleton().m_queueFamilies[qtype];
+		ci.queueFamilyIndex = families[i];
 
-		ANKI_VK_CHECK(vkCreateCommandPool(getVkDevice(), &ci, nullptr, &m_pools[qtype]));
+		ANKI_VK_CHECK(vkCreateCommandPool(getVkDevice(), &ci, nullptr, &m_pools[i]));
 	}
 
 	return Error::kNone;
@@ -108,9 +94,26 @@ Error CommandBufferThreadAllocator::newCommandBuffer(CommandBufferFlag cmdbFlags
 	ANKI_ASSERT(!!(cmdbFlags & CommandBufferFlag::kComputeWork) ^ !!(cmdbFlags & CommandBufferFlag::kGeneralWork));
 
 	const Bool smallBatch = !!(cmdbFlags & CommandBufferFlag::kSmallBatch);
-	const GpuQueueType queue = getQueueTypeFromCommandBufferFlags(cmdbFlags, CommandBufferFactory::getSingleton().m_queueFamilies);
 
-	MicroObjectRecycler<MicroCommandBuffer>& recycler = m_recyclers[smallBatch][queue];
+	GpuQueueType queue;
+	U32 queueFamilyIdx;
+	if(!!(cmdbFlags & CommandBufferFlag::kGeneralWork) || getGrManagerImpl().getAsyncComputeType() == AsyncComputeType::kDisabled)
+	{
+		queue = GpuQueueType::kGeneral;
+		queueFamilyIdx = 0;
+	}
+	else if(getGrManagerImpl().getAsyncComputeType() == AsyncComputeType::kLowPriorityQueue)
+	{
+		queue = GpuQueueType::kCompute;
+		queueFamilyIdx = 0;
+	}
+	else
+	{
+		queue = GpuQueueType::kCompute;
+		queueFamilyIdx = 1;
+	}
+
+	MicroObjectRecycler<MicroCommandBuffer>& recycler = m_recyclers[smallBatch][queueFamilyIdx];
 
 	MicroCommandBuffer* out = recycler.findToReuse();
 
@@ -120,7 +123,7 @@ Error CommandBufferThreadAllocator::newCommandBuffer(CommandBufferFlag cmdbFlags
 
 		VkCommandBufferAllocateInfo ci = {};
 		ci.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
-		ci.commandPool = m_pools[queue];
+		ci.commandPool = m_pools[queueFamilyIdx];
 		ci.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
 		ci.commandBufferCount = 1;
 
@@ -154,7 +157,6 @@ Error CommandBufferThreadAllocator::newCommandBuffer(CommandBufferFlag cmdbFlags
 	}
 
 	ANKI_ASSERT(out && out->m_refcount.load() == 0);
-	ANKI_ASSERT(out->m_flags == cmdbFlags);
 	outPtr.reset(out);
 	return Error::kNone;
 }
@@ -165,7 +167,12 @@ void CommandBufferThreadAllocator::deleteCommandBuffer(MicroCommandBuffer* ptr)
 
 	const Bool smallBatch = !!(ptr->m_flags & CommandBufferFlag::kSmallBatch);
 
-	m_recyclers[smallBatch][ptr->m_queue].recycle(ptr);
+	const U32 queueFamilyIdx =
+		(ptr->m_queue == GpuQueueType::kCompute && getGrManagerImpl().getAsyncComputeType() == AsyncComputeType::kLowPriorityQueue)
+			? 0
+			: U32(ptr->m_queue);
+
+	m_recyclers[smallBatch][queueFamilyIdx].recycle(ptr);
 }
 
 void CommandBufferFactory::destroy()
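The queue-type to command-pool-index mapping above now appears in three places (the MicroCommandBuffer destructor, newCommandBuffer() and deleteCommandBuffer()). A minimal consolidation sketch, assuming the surrounding AnKi Vulkan headers; the helper name is hypothetical and not part of the commit:

	// Hypothetical helper: compute work only gets its own pool (index 1) when a dedicated
	// compute queue family exists. The low-priority-queue fallback shares the general
	// family's pool (index 0); when async compute is disabled m_queue is never kCompute.
	static U32 commandPoolIndex(GpuQueueType queue)
	{
		const Bool sharesGeneralFamily =
			(queue == GpuQueueType::kCompute && getGrManagerImpl().getAsyncComputeType() == AsyncComputeType::kLowPriorityQueue);
		return sharesGeneralFamily ? 0 : U32(queue);
	}
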

+ 1 - 6
AnKi/Gr/Vulkan/VkCommandBufferFactory.h

@@ -196,10 +196,7 @@ class CommandBufferFactory : public MakeSingleton<CommandBufferFactory>
 	friend class MicroCommandBuffer;
 
 public:
-	CommandBufferFactory(const VulkanQueueFamilies& queueFamilies)
-		: m_queueFamilies(queueFamilies)
-	{
-	}
+	CommandBufferFactory() = default;
 
 	CommandBufferFactory(const CommandBufferFactory&) = delete; // Non-copyable
 
@@ -214,8 +211,6 @@ public:
 	Error newCommandBuffer(ThreadId tid, CommandBufferFlag cmdbFlags, MicroCommandBufferPtr& ptr);
 
 private:
-	VulkanQueueFamilies m_queueFamilies;
-
 	GrDynamicArray<CommandBufferThreadAllocator*> m_threadAllocs;
 	RWMutex m_threadAllocMtx;
 

+ 80 - 45
AnKi/Gr/Vulkan/VkGrManager.cpp

@@ -385,18 +385,6 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 	ANKI_CHECK(initSurface());
 	ANKI_CHECK(initDevice());
 
-	for(GpuQueueType qtype : EnumIterable<GpuQueueType>())
-	{
-		if(m_queueFamilyIndices[qtype] != kMaxU32)
-		{
-			vkGetDeviceQueue(m_device, m_queueFamilyIndices[qtype], 0, &m_queues[qtype]);
-		}
-		else
-		{
-			m_queues[qtype] = VK_NULL_HANDLE;
-		}
-	}
-
 	SwapchainFactory::allocateSingleton(U32(g_vsyncCVar));
 	m_crntSwapchain = SwapchainFactory::getSingleton().newInstance();
 
@@ -405,7 +393,7 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 
 	ANKI_CHECK(initMemory());
 
-	CommandBufferFactory::allocateSingleton(m_queueFamilyIndices);
+	CommandBufferFactory::allocateSingleton();
 	FenceFactory::allocateSingleton();
 	SemaphoreFactory::allocateSingleton();
 	OcclusionQueryFactory::allocateSingleton();
@@ -808,24 +796,33 @@ Error GrManagerImpl::initDevice()
 	queueInfos.resize(count);
 	vkGetPhysicalDeviceQueueFamilyProperties(m_physicalDevice, &count, &queueInfos[0]);
 
-	const VkQueueFlags GENERAL_QUEUE_FLAGS = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+	Bool generalQueueFamilySupportsMultipleQueues = false;
+
+	const VkQueueFlags generalQueueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
 	for(U32 i = 0; i < count; ++i)
 	{
 		VkBool32 supportsPresent = false;
 		ANKI_VK_CHECK(vkGetPhysicalDeviceSurfaceSupportKHR(m_physicalDevice, i, m_surface, &supportsPresent));
 
-		if(supportsPresent)
+		if(!supportsPresent)
 		{
-			if((queueInfos[i].queueFlags & GENERAL_QUEUE_FLAGS) == GENERAL_QUEUE_FLAGS)
-			{
-				m_queueFamilyIndices[GpuQueueType::kGeneral] = i;
-			}
-			else if((queueInfos[i].queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueInfos[i].queueFlags & VK_QUEUE_GRAPHICS_BIT))
+			continue;
+		}
+
+		if((queueInfos[i].queueFlags & generalQueueFlags) == generalQueueFlags)
+		{
+			m_queueFamilyIndices[GpuQueueType::kGeneral] = i;
+
+			if(queueInfos[i].queueCount > 1)
 			{
-				// This must be the async compute
-				m_queueFamilyIndices[GpuQueueType::kCompute] = i;
+				generalQueueFamilySupportsMultipleQueues = true;
 			}
 		}
+		else if((queueInfos[i].queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueInfos[i].queueFlags & VK_QUEUE_GRAPHICS_BIT))
+		{
+			// This must be the async compute
+			m_queueFamilyIndices[GpuQueueType::kCompute] = i;
+		}
 	}
 
 	if(m_queueFamilyIndices[GpuQueueType::kGeneral] == kMaxU32)
@@ -834,39 +831,58 @@ Error GrManagerImpl::initDevice()
 		return Error::kFunctionFailed;
 	}
 
-	if(!g_asyncComputeCVar)
-	{
-		m_queueFamilyIndices[GpuQueueType::kCompute] = kMaxU32;
-	}
-
-	if(m_queueFamilyIndices[GpuQueueType::kCompute] == kMaxU32)
-	{
-		ANKI_VK_LOGW("Couldn't find an async compute queue. Will try to use the general queue instead");
-	}
-	else
-	{
-		ANKI_VK_LOGI("Async compute is enabled");
-	}
+	const Bool pureAsyncCompute = m_queueFamilyIndices[GpuQueueType::kCompute] != kMaxU32 && g_asyncComputeCVar == 0;
+	const Bool lowPriorityQueueAsyncCompute = !pureAsyncCompute && generalQueueFamilySupportsMultipleQueues && g_asyncComputeCVar <= 1;
 
-	const F32 priority = 1.0f;
+	Array<F32, U32(GpuQueueType::kCount)> priorities = {1.0f, 0.5f};
 	Array<VkDeviceQueueCreateInfo, U32(GpuQueueType::kCount)> q = {};
+	q.fill({VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO});
 
 	VkDeviceCreateInfo ci = {};
 	ci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
 	ci.pQueueCreateInfos = &q[0];
 
-	for(GpuQueueType qtype : EnumIterable<GpuQueueType>())
+	CString asyncComputeMsg;
+	if(pureAsyncCompute)
 	{
-		if(m_queueFamilyIndices[qtype] != kMaxU32)
-		{
-			q[qtype].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
-			q[qtype].queueFamilyIndex = m_queueFamilyIndices[qtype];
-			q[qtype].queueCount = 1;
-			q[qtype].pQueuePriorities = &priority;
+		asyncComputeMsg = "Using pure async compute queue";
 
-			++ci.queueCreateInfoCount;
-		}
+		q[GpuQueueType::kGeneral].queueFamilyIndex = m_queueFamilyIndices[GpuQueueType::kGeneral];
+		q[GpuQueueType::kGeneral].queueCount = 1;
+		q[GpuQueueType::kGeneral].pQueuePriorities = &priorities[0];
+
+		q[GpuQueueType::kCompute].queueFamilyIndex = m_queueFamilyIndices[GpuQueueType::kCompute];
+		q[GpuQueueType::kCompute].queueCount = 1;
+		q[GpuQueueType::kCompute].pQueuePriorities = &priorities[0];
+
+		ci.queueCreateInfoCount = 2;
 	}
+	else if(lowPriorityQueueAsyncCompute)
+	{
+		asyncComputeMsg = "Using low priority queue in same family as general queue (fallback #1)";
+
+		m_queueFamilyIndices[GpuQueueType::kCompute] = m_queueFamilyIndices[GpuQueueType::kGeneral];
+
+		q[0].queueFamilyIndex = m_queueFamilyIndices[GpuQueueType::kGeneral];
+		q[0].queueCount = 2;
+		q[0].pQueuePriorities = &priorities[0];
+
+		ci.queueCreateInfoCount = 1;
+	}
+	else
+	{
+		asyncComputeMsg = "Can't do much, using general queue (fallback #2)";
+
+		m_queueFamilyIndices[GpuQueueType::kCompute] = m_queueFamilyIndices[GpuQueueType::kGeneral];
+
+		q[0].queueFamilyIndex = m_queueFamilyIndices[GpuQueueType::kGeneral];
+		q[0].queueCount = 1;
+		q[0].pQueuePriorities = &priorities[0];
+
+		ci.queueCreateInfoCount = 1;
+	}
+
+	ANKI_VK_LOGI("Async compute: %s", asyncComputeMsg.cstr());
 
 	// Extensions
 	U32 extCount = 0;
@@ -1209,6 +1225,25 @@ Error GrManagerImpl::initDevice()
 
 	ANKI_VK_CHECK(vkCreateDevice(m_physicalDevice, &ci, nullptr, &m_device));
 
+	// Get the queues
+	vkGetDeviceQueue(m_device, m_queueFamilyIndices[GpuQueueType::kGeneral], 0, &m_queues[GpuQueueType::kGeneral]);
+	trySetVulkanHandleName("General", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kGeneral]);
+
+	if(pureAsyncCompute)
+	{
+		vkGetDeviceQueue(m_device, m_queueFamilyIndices[GpuQueueType::kCompute], 0, &m_queues[GpuQueueType::kCompute]);
+		trySetVulkanHandleName("AsyncCompute", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kCompute]);
+	}
+	else if(lowPriorityQueueAsyncCompute)
+	{
+		vkGetDeviceQueue(m_device, m_queueFamilyIndices[GpuQueueType::kGeneral], 1, &m_queues[GpuQueueType::kCompute]);
+		trySetVulkanHandleName("GeneralLowPriority", VK_OBJECT_TYPE_QUEUE, m_queues[GpuQueueType::kCompute]);
+	}
+	else
+	{
+		m_queues[GpuQueueType::kCompute] = nullptr;
+	}
+
 	return Error::kNone;
 }
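In short, initDevice() now picks one of three tiers. A condensed sketch of the decision, using the same booleans as the hunk above (a summary, not a drop-in replacement):

	// kProper:           a compute-only family exists and AsyncCompute == 0
	//                    -> one queue from each of the two families.
	// kLowPriorityQueue: otherwise, if the general family exposes more than one queue and AsyncCompute <= 1
	//                    -> two queues from the general family, the second at priority 0.5.
	// kDisabled:         otherwise -> a single general queue; m_queues[kCompute] stays null.
	const Bool pureAsyncCompute = m_queueFamilyIndices[GpuQueueType::kCompute] != kMaxU32 && g_asyncComputeCVar == 0;
	const Bool lowPriorityQueueAsyncCompute = !pureAsyncCompute && generalQueueFamilySupportsMultipleQueues && g_asyncComputeCVar <= 1;
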
 

+ 25 - 2
AnKi/Gr/Vulkan/VkGrManager.h

@@ -25,6 +25,13 @@ class MicroCommandBuffer;
 /// @addtogroup vulkan
 /// @{
 
+enum class AsyncComputeType
+{
+	kProper,
+	kLowPriorityQueue,
+	kDisabled
+};
+
 /// Vulkan implementation of GrManager.
 class GrManagerImpl : public GrManager
 {
@@ -41,10 +48,26 @@ public:
 
 	ConstWeakArray<U32> getQueueFamilies() const
 	{
-		const Bool hasAsyncCompute = m_queueFamilyIndices[GpuQueueType::kCompute] != kMaxU32;
+		const Bool hasAsyncCompute = m_queueFamilyIndices[GpuQueueType::kGeneral] != m_queueFamilyIndices[GpuQueueType::kCompute];
 		return (hasAsyncCompute) ? m_queueFamilyIndices : ConstWeakArray<U32>(&m_queueFamilyIndices[0], 1);
 	}
 
+	AsyncComputeType getAsyncComputeType() const
+	{
+		if(m_queues[GpuQueueType::kCompute] == nullptr)
+		{
+			return AsyncComputeType::kDisabled;
+		}
+		else if(m_queueFamilyIndices[GpuQueueType::kCompute] == m_queueFamilyIndices[GpuQueueType::kGeneral])
+		{
+			return AsyncComputeType::kLowPriorityQueue;
+		}
+		else
+		{
+			return AsyncComputeType::kProper;
+		}
+	}
+
 	const VkPhysicalDeviceProperties& getPhysicalDeviceProperties() const
 	{
 		return m_devProps.properties;
@@ -145,7 +168,7 @@ private:
 	VulkanExtensions m_extensions = VulkanExtensions::kNone;
 	VkDevice m_device = VK_NULL_HANDLE;
 	VulkanQueueFamilies m_queueFamilyIndices = {kMaxU32, kMaxU32};
-	Array<VkQueue, U32(GpuQueueType::kCount)> m_queues = {};
+	Array<VkQueue, U32(GpuQueueType::kCount)> m_queues = {nullptr, nullptr};
 	Mutex m_globalMtx;
 
 	VkPhysicalDeviceProperties2 m_devProps = {};
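Note that getQueueFamilies() now distinguishes the cases by comparing the two family indices instead of checking for kMaxU32. An illustrative assertion of the resulting contract (not in the commit):

	// Only the dedicated-family case (kProper) exposes two distinct queue families;
	// both fallbacks alias the compute family index to the general one.
	ConstWeakArray<U32> families = getGrManagerImpl().getQueueFamilies();
	ANKI_ASSERT(families.getSize() == ((getGrManagerImpl().getAsyncComputeType() == AsyncComputeType::kProper) ? 2u : 1u));
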

+ 3 - 3
AnKi/Physics2/Common.h

@@ -41,14 +41,14 @@ namespace v2 {
 #define ANKI_PHYSICS_COMMON_FRIENDS \
 	friend class PhysicsWorld; \
 	template<typename, typename> \
-	friend class IntrusivePtr; \
+	friend class anki::IntrusivePtr; \
 	template<typename, typename, typename> \
-	friend class BlockArray;
+	friend class anki::BlockArray;
 
 class PhysicsMemoryPool : public HeapMemoryPool, public MakeSingleton<PhysicsMemoryPool>
 {
 	template<typename>
-	friend class MakeSingleton;
+	friend class anki::MakeSingleton;
 
 private:
 	PhysicsMemoryPool(AllocAlignedCallback allocCb, void* allocCbUserData)

+ 1 - 1
AnKi/Physics2/PhysicsWorld.h

@@ -39,7 +39,7 @@ public:
 class PhysicsWorld : public MakeSingleton<PhysicsWorld>
 {
 	template<typename>
-	friend class MakeSingleton;
+	friend class anki::MakeSingleton;
 	friend class PhysicsCollisionShapePtrDeleter;
 	friend class PhysicsBodyPtrDeleter;
 	friend class PhysicsBody;

+ 6 - 1
Tests/CMakeLists.txt

@@ -4,4 +4,9 @@ include_directories("..")
 
 anki_new_executable(Tests ${sources})
 target_compile_definitions(Tests PRIVATE -DANKI_SOURCE_FILE)
-target_link_libraries(Tests AnKi AnKiShaderCompiler AnKiImporter)
+
+if(NOT ANDROID)
+	set(extra_libs "AnKiImporter")
+endif()
+
+target_link_libraries(Tests AnKi AnKiShaderCompiler ${extra_libs})

+ 4 - 0
Tests/Framework/Framework.cpp

@@ -150,6 +150,10 @@ Options:
 			}
 			testName = argv[i];
 		}
+		else
+		{
+			break;
+		}
 	}
 
 	// Sanity check

+ 404 - 0
Tests/Gr/GrAsyncCompute.cpp

@@ -0,0 +1,404 @@
+// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#include <Tests/Framework/Framework.h>
+#include <Tests/Gr/GrCommon.h>
+#include <AnKi/Gr.h>
+#include <AnKi/Util/MemoryPool.h>
+#include <AnKi/Util/HighRezTimer.h>
+
+using namespace anki;
+
+static void generateSphere(DynamicArray<Vec3>& positions, DynamicArray<UVec3>& indices, U32 sliceCount, U32 stackCount)
+{
+	positions.emplaceBack(0.0f, 1.0f, 0.0f);
+	const U32 v0 = 0;
+
+	// generate vertices per stack / slice
+	for(U32 i = 0u; i < stackCount - 1; i++)
+	{
+		const F32 phi = kPi * (i + 1) / stackCount;
+		for(F32 j = 0u; j < sliceCount; j++)
+		{
+			const F32 theta = 2.0f * kPi * F32(j) / sliceCount;
+			const F32 x = sin(phi) * cos(theta);
+			const F32 y = cos(phi);
+			const F32 z = sin(phi) * sin(theta);
+			positions.emplaceBack(x, y, z);
+		}
+	}
+
+	// add bottom vertex
+	positions.emplaceBack(0.0f, -1.0f, 0.0f);
+	const U32 v1 = U32(positions.getSize() - 1);
+
+	// add top / bottom triangles
+	for(auto i = 0u; i < sliceCount; ++i)
+	{
+		auto i0 = i + 1;
+		auto i1 = (i + 1) % sliceCount + 1;
+		indices.emplaceBack(v0, i1, i0);
+		i0 = i + sliceCount * (stackCount - 2) + 1;
+		i1 = (i + 1) % sliceCount + sliceCount * (stackCount - 2) + 1;
+		indices.emplaceBack(v1, i0, i1);
+	}
+
+	// add quads per stack / slice
+	for(U32 j = 0u; j < stackCount - 2; j++)
+	{
+		const U32 j0 = j * sliceCount + 1;
+		const U32 j1 = (j + 1) * sliceCount + 1;
+		for(U32 i = 0u; i < sliceCount; i++)
+		{
+			const U32 i0 = j0 + i;
+			const U32 i1 = j0 + (i + 1) % sliceCount;
+			const U32 i2 = j1 + (i + 1) % sliceCount;
+			const U32 i3 = j1 + i;
+
+			indices.emplaceBack(i0, i1, i2);
+			indices.emplaceBack(i0, i2, i3);
+		}
+	}
+}
+
+ANKI_TEST(Gr, AsyncComputeBench)
+{
+	const Bool useAsyncQueue = true;
+	const Bool runConcurrently = true;
+	const U32 spheresToDrawPerDimension = 100;
+	const U32 windowSize = 512;
+
+	g_validationCVar.set(false); // TODO
+	g_debugMarkersCVar.set(false);
+	g_windowWidthCVar.set(windowSize);
+	g_windowHeightCVar.set(windowSize);
+	g_asyncComputeCVar.set(0);
+
+	DefaultMemoryPool::allocateSingleton(allocAligned, nullptr);
+	ShaderCompilerMemoryPool::allocateSingleton(allocAligned, nullptr);
+	initWindow();
+	initGrManager();
+	Input::allocateSingleton();
+
+	{
+		const CString computeShaderSrc = R"(
+RWTexture2D<float4> g_inTex : register(u0);
+RWTexture2D<float4> g_outTex : register(u1);
+
+[NumThreads(8, 8, 1)] void main(uint2 svDispatchThreadId : SV_DispatchThreadID)
+{
+	uint2 texSize;
+	g_inTex.GetDimensions(texSize.x, texSize.y);
+
+	float4 val = 0.0;
+	for(int x = -9; x <= 9; ++x)
+	{
+		for(int y = -9; y <= 9; ++y)
+		{
+			int2 coord = int2(svDispatchThreadId) + int2(x, y);
+			if(coord.x < 0 || coord.y < 0 || coord.x >= texSize.x || coord.y >= texSize.y)
+			{
+				continue;
+			}
+
+			val += g_inTex[coord];
+		}
+	}
+
+	g_outTex[svDispatchThreadId] = val;
+})";
+		const CString vertShaderSrc = R"(
+struct Consts
+{
+	float3 m_worldPosition;
+	float m_scale;
+
+	float4x4 m_viewProjMat;
+};
+
+#if defined(__spirv__)
+[[vk::push_constant]] ConstantBuffer<Consts> g_consts;
+#else
+ConstantBuffer<Consts> g_consts : register(b0, space3000);
+#endif
+
+float4 main(float3 svPosition : POSITION) : SV_Position
+{
+	return mul(g_consts.m_viewProjMat, float4(svPosition * g_consts.m_scale + g_consts.m_worldPosition, 1.0));
+})";
+
+		const CString pixelShaderSrc = R"(
+float4 main() : SV_Target0
+{
+	return float4(1.0, 0.0, 0.5, 0.0);
+})";
+
+		const CString blitVertShader = R"(
+struct VertOut
+{
+	float4 m_svPosition : SV_POSITION;
+	float2 m_uv : TEXCOORD;
+};
+
+VertOut main(uint vertId : SV_VERTEXID)
+{
+	const float2 coord = float2(vertId >> 1, vertId & 1);
+
+	VertOut output;
+	output.m_svPosition = float4(coord * float2(4.0, -4.0) + float2(-1.0, 1.0), 0.0, 1.0);
+	output.m_uv = coord * 2.0f;
+
+	return output;
+})";
+
+		const CString blitPixelShader = R"(
+struct VertOut
+{
+	float4 m_svPosition : SV_POSITION;
+	float2 m_uv : TEXCOORD;
+};
+
+Texture2D g_inTex : register(t0);
+SamplerState g_sampler : register(s0);
+
+float4 main(VertOut input) : SV_Target0
+{
+	return g_inTex.Sample(g_sampler, input.m_uv);
+})";
+
+		ShaderProgramPtr compProg = createComputeProg(computeShaderSrc);
+		ShaderProgramPtr graphicsProg = createVertFragProg(vertShaderSrc, pixelShaderSrc);
+		ShaderProgramPtr blitProg = createVertFragProg(blitVertShader, blitPixelShader);
+
+		DynamicArray<Vec3> positions;
+		DynamicArray<UVec3> indices;
+		generateSphere(positions, indices, 50, 50);
+
+		BufferPtr posBuff = createBuffer(BufferUsageBit::kVertexOrIndex, ConstWeakArray(positions), "PosBuffer");
+		BufferPtr indexBuff = createBuffer(BufferUsageBit::kVertexOrIndex, ConstWeakArray(indices), "IdxBuffer");
+
+		TextureInitInfo texInit("Tex");
+		texInit.m_width = texInit.m_height = 2048;
+		texInit.m_format = Format::kR32G32B32A32_Sfloat;
+		texInit.m_usage = TextureUsageBit::kUavCompute;
+		TexturePtr inTex = createTexture2d(texInit, Vec4(0.5f));
+		TexturePtr outTex = createTexture2d(texInit, Vec4(0.1f));
+
+		{
+			CommandBufferInitInfo cinit;
+			cinit.m_flags = CommandBufferFlag::kGeneralWork | CommandBufferFlag::kSmallBatch;
+			CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(cinit);
+
+			const TextureBarrierInfo barrier2 = {TextureView(inTex.get(), TextureSubresourceDesc::all()), TextureUsageBit::kCopyDestination,
+												 TextureUsageBit::kUavCompute};
+			cmdb->setPipelineBarrier({&barrier2, 1}, {}, {});
+			cmdb->endRecording();
+
+			FencePtr fence;
+			GrManager::getSingleton().submit(cmdb.get(), {}, &fence);
+			fence->clientWait(kMaxSecond);
+		}
+
+		TextureInitInfo texInit2("RT");
+		texInit2.m_width = texInit2.m_height = windowSize;
+		texInit2.m_format = Format::kR32G32B32A32_Sfloat;
+		texInit2.m_usage = TextureUsageBit::kRtvDsvWrite | TextureUsageBit::kSrvPixel;
+		TexturePtr rtTex = createTexture2d(texInit2, Vec4(0.5f));
+
+		SamplerInitInfo samplerInit("sampler");
+		SamplerPtr sampler = GrManager::getSingleton().newSampler(samplerInit);
+
+		Array<TimestampQueryPtr, 2> startTimestamps = {GrManager::getSingleton().newTimestampQuery(), GrManager::getSingleton().newTimestampQuery()};
+		TimestampQueryPtr endTimestamp = GrManager::getSingleton().newTimestampQuery();
+
+		FencePtr finalFence;
+
+		const U32 iterationCount = 1000;
+		for(U32 i = 0; i < iterationCount; ++i)
+		{
+			ANKI_TEST_EXPECT_NO_ERR(Input::getSingleton().handleEvents());
+			TexturePtr presentTex = GrManager::getSingleton().acquireNextPresentableTexture();
+
+			// Init command buffers
+			CommandBufferInitInfo cinit;
+			cinit.m_flags = CommandBufferFlag::kGeneralWork | CommandBufferFlag::kSmallBatch;
+			CommandBufferPtr gfxCmdb = GrManager::getSingleton().newCommandBuffer(cinit);
+
+			CommandBufferPtr compCmdb;
+			if(useAsyncQueue)
+			{
+				CommandBufferInitInfo cinit;
+				cinit.m_flags = CommandBufferFlag::kComputeWork | CommandBufferFlag::kSmallBatch;
+				compCmdb = GrManager::getSingleton().newCommandBuffer(cinit);
+			}
+			else
+			{
+				compCmdb = gfxCmdb;
+			}
+
+			CommandBufferPtr blitCmdb = GrManager::getSingleton().newCommandBuffer(cinit);
+
+			// Barriers
+			{
+				const TextureBarrierInfo rtBarrier = {TextureView(rtTex.get(), TextureSubresourceDesc::all()), TextureUsageBit::kNone,
+													  TextureUsageBit::kRtvDsvWrite};
+				gfxCmdb->setPipelineBarrier({&rtBarrier, 1}, {}, {});
+
+				const TextureBarrierInfo uavBarrier = {TextureView(outTex.get(), TextureSubresourceDesc::all()), TextureUsageBit::kNone,
+													   TextureUsageBit::kUavCompute};
+				compCmdb->setPipelineBarrier({&uavBarrier, 1}, {}, {});
+
+				const TextureBarrierInfo blitBarrier = {TextureView(presentTex.get(), TextureSubresourceDesc::all()), TextureUsageBit::kNone,
+														TextureUsageBit::kRtvDsvWrite};
+				blitCmdb->setPipelineBarrier({&blitBarrier, 1}, {}, {});
+			}
+
+			// Compute dispatch
+			{
+				if(i == 0)
+				{
+					compCmdb->writeTimestamp(startTimestamps[0].get());
+				}
+
+				compCmdb->bindShaderProgram(compProg.get());
+				compCmdb->bindUav(0, 0, TextureView(inTex.get(), TextureSubresourceDesc::all()));
+				compCmdb->bindUav(1, 0, TextureView(outTex.get(), TextureSubresourceDesc::all()));
+				compCmdb->dispatchCompute(inTex->getWidth() / 8, inTex->getHeight() / 8, 1);
+			}
+
+			// Draw spheres
+			{
+				if(i == 0)
+				{
+					compCmdb->writeTimestamp(startTimestamps[1].get());
+				}
+
+				RenderTarget rt;
+				rt.m_textureView = TextureView(rtTex.get(), TextureSubresourceDesc::all());
+				rt.m_loadOperation = RenderTargetLoadOperation::kClear;
+				rt.m_clearValue.m_colorf = {getRandomRange(0.0f, 1.0f), getRandomRange(0.0f, 1.0f), getRandomRange(0.0f, 1.0f), 1.0f};
+				gfxCmdb->beginRenderPass({rt});
+
+				gfxCmdb->bindVertexBuffer(0, BufferView(posBuff.get()), sizeof(Vec3));
+				gfxCmdb->setVertexAttribute(VertexAttributeSemantic::kPosition, 0, Format::kR32G32B32_Sfloat, 0);
+				gfxCmdb->bindIndexBuffer(BufferView(indexBuff.get()), IndexType::kU32);
+				gfxCmdb->bindShaderProgram(graphicsProg.get());
+				gfxCmdb->setViewport(0, 0, windowSize, windowSize);
+
+				struct Consts
+				{
+					Vec3 m_worldPosition;
+					F32 m_scale;
+
+					Mat4 m_viewProjMat;
+				} consts;
+
+				constexpr F32 orthoHalfSize = 10.0f;
+				constexpr F32 orthoSize = orthoHalfSize * 2.0f;
+				const Mat4 viewMat = Mat4::getIdentity().getInverse();
+				const Mat4 projMat =
+					Mat4::calculateOrthographicProjectionMatrix(orthoHalfSize, -orthoHalfSize, orthoHalfSize, -orthoHalfSize, 0.1f, 200.0f);
+				consts.m_viewProjMat = projMat * viewMat;
+
+				consts.m_scale = 0.07f;
+
+				for(U32 x = 0; x < spheresToDrawPerDimension; ++x)
+				{
+					for(U32 y = 0; y < spheresToDrawPerDimension; ++y)
+					{
+						consts.m_worldPosition = Vec3(F32(x) / (spheresToDrawPerDimension - 1) * orthoSize - orthoHalfSize,
+													  F32(y) / (spheresToDrawPerDimension - 1) * orthoSize - orthoHalfSize, -1.0f);
+
+						gfxCmdb->setFastConstants(&consts, sizeof(consts));
+
+						gfxCmdb->drawIndexed(PrimitiveTopology::kTriangles, U32(indexBuff->getSize() / sizeof(U32)));
+					}
+				}
+
+				gfxCmdb->endRenderPass();
+			}
+
+			// Blit
+			{
+				const TextureBarrierInfo blitBarrier = {TextureView(rtTex.get(), TextureSubresourceDesc::all()), TextureUsageBit::kRtvDsvWrite,
+														TextureUsageBit::kSrvPixel};
+				blitCmdb->setPipelineBarrier({&blitBarrier, 1}, {}, {});
+
+				RenderTarget rt;
+				rt.m_textureView = TextureView(presentTex.get(), TextureSubresourceDesc::all());
+				rt.m_loadOperation = RenderTargetLoadOperation::kDontCare;
+				rt.m_clearValue.m_colorf = {getRandomRange(0.0f, 1.0f), getRandomRange(0.0f, 1.0f), getRandomRange(0.0f, 1.0f), 1.0f};
+				blitCmdb->beginRenderPass({rt});
+
+				blitCmdb->bindShaderProgram(blitProg.get());
+				blitCmdb->bindSrv(0, 0, TextureView(rtTex.get(), TextureSubresourceDesc::all()));
+				blitCmdb->bindSampler(0, 0, sampler.get());
+				blitCmdb->setViewport(0, 0, windowSize, windowSize);
+				blitCmdb->draw(PrimitiveTopology::kTriangles, 3);
+
+				blitCmdb->endRenderPass();
+
+				const TextureBarrierInfo presentBarrier = {TextureView(presentTex.get(), TextureSubresourceDesc::all()),
+														   TextureUsageBit::kRtvDsvWrite, TextureUsageBit::kPresent};
+				blitCmdb->setPipelineBarrier({&presentBarrier, 1}, {}, {});
+
+				if(i == iterationCount - 1)
+				{
+					compCmdb->writeTimestamp(endTimestamp.get());
+				}
+			}
+
+			gfxCmdb->endRecording();
+			blitCmdb->endRecording();
+			if(useAsyncQueue)
+			{
+				compCmdb->endRecording();
+			}
+
+			if(useAsyncQueue)
+			{
+				WeakArray<Fence*> firstWaveWaitFences;
+				Array<Fence*, 1> arr;
+				if(finalFence.isCreated())
+				{
+					arr = {finalFence.get()};
+					firstWaveWaitFences = {arr};
+				}
+
+				FencePtr fence2;
+				GrManager::getSingleton().submit(compCmdb.get(), firstWaveWaitFences, &fence2);
+
+				FencePtr fence1;
+				GrManager::getSingleton().submit(gfxCmdb.get(), firstWaveWaitFences, &fence1);
+
+				Array<Fence*, 2> waitFences = {{fence1.get(), fence2.get()}};
+				GrManager::getSingleton().submit(blitCmdb.get(), {waitFences}, &finalFence);
+			}
+			else
+			{
+				GrManager::getSingleton().submit(gfxCmdb.get());
+				GrManager::getSingleton().submit(blitCmdb.get(), {}, &finalFence);
+			}
+
+			GrManager::getSingleton().swapBuffers();
+		}
+
+		finalFence->clientWait(kMaxSecond);
+
+		Array<Second, 2> startTime;
+		ANKI_TEST_EXPECT_EQ(startTimestamps[0]->getResult(startTime[0]), TimestampQueryResult::kAvailable);
+		ANKI_TEST_EXPECT_EQ(startTimestamps[1]->getResult(startTime[1]), TimestampQueryResult::kAvailable);
+		Second endTime;
+		ANKI_TEST_EXPECT_EQ(endTimestamp->getResult(endTime), TimestampQueryResult::kAvailable);
+
+		ANKI_TEST_LOGI("GPU time %f\n", endTime - min(startTime[0], startTime[1]));
+	}
+
+	Input::freeSingleton();
+	GrManager::freeSingleton();
+	NativeWindow::freeSingleton();
+	ShaderCompilerMemoryPool::freeSingleton();
+	DefaultMemoryPool::freeSingleton();
+}
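The benchmark's per-frame submission pattern, condensed to its core (same GrManager calls as in the test above; the wait on the previous frame's finalFence is dropped for brevity and the fence names are illustrative):

	// Compute and graphics are recorded into separate command buffers and submitted
	// independently; the blit waits on both fences, so with a dedicated or low-priority
	// compute queue the two waves can overlap on the GPU.
	FencePtr computeDone, gfxDone;
	GrManager::getSingleton().submit(compCmdb.get(), {}, &computeDone);
	GrManager::getSingleton().submit(gfxCmdb.get(), {}, &gfxDone);
	Array<Fence*, 2> waitFences = {{gfxDone.get(), computeDone.get()}};
	GrManager::getSingleton().submit(blitCmdb.get(), {waitFences}, &finalFence);
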

+ 15 - 0
Tests/Gr/GrCommon.h

@@ -62,6 +62,18 @@ inline ShaderProgramPtr createVertFragProg(CString vert, CString frag, ConstWeak
 	return prog;
 }
 
+inline ShaderProgramPtr createComputeProg(CString src, ConstWeakArray<CString> extraCompilerArgs = {})
+{
+	ShaderPtr shader = createShader(src, ShaderType::kCompute, extraCompilerArgs);
+
+	ShaderProgramInitInfo init;
+	init.m_computeShader = shader.get();
+
+	ShaderProgramPtr prog = GrManager::getSingleton().newShaderProgram(init);
+
+	return prog;
+}
+
 inline ShaderPtr loadShader(CString filename, ShaderType type, ConstWeakArray<CString> extraCompilerArgs = {})
 {
 	File file;
@@ -166,6 +178,9 @@ inline TexturePtr createTexture2d(const TextureInitInfo texInit_, ConstWeakArray
 
 	CommandBufferPtr cmdb = GrManager::getSingleton().newCommandBuffer(cmdbInit);
 
+	const TextureBarrierInfo barr = {TextureView(tex.get(), TextureSubresourceDesc::all()), TextureUsageBit::kNone,
+									 TextureUsageBit::kCopyDestination};
+	cmdb->setPipelineBarrier({&barr, 1}, {}, {});
 	cmdb->copyBufferToTexture(BufferView(staging.get()), TextureView(tex.get(), TextureSubresourceDesc::all()));
 	cmdb->endRecording();
 

+ 0 - 2
Tests/Main.cpp

@@ -11,8 +11,6 @@ using namespace anki;
 ANKI_MAIN_FUNCTION(myMain)
 int myMain(int argc, char** argv)
 {
-	HeapAllocator<U8> alloc(allocAligned, nullptr);
-
 	int exitcode = getTesterSingleton().run(argc, argv);
 
 	deleteTesterSingleton();