
Vulkan buffer memory mapping with handling for memory hazards and accesses from other device queues

BearishSun 9 years ago
parent
commit cc9a402774

+ 8 - 4
Source/BansheeCore/Include/BsCommonTypes.h

@@ -178,11 +178,15 @@ namespace BansheeEngine
 	{
 		/** 
 		 * Signifies that you don't plan on modifying the buffer often (or at all) after creation. Modifying such buffer 
-		 * will involve a larger performance hit.
+		 * will involve a larger performance hit. Mutually exclusive with GBU_DYNAMIC.
 		 */
-        GBU_STATIC = 1,
-		/** Signifies that you will modify this buffer fairly often (e.g. every frame). */
-		GBU_DYNAMIC = 2
+        GBU_STATIC = 0x01,
+		/** 
+		 * Signifies that you will modify this buffer fairly often (e.g. every frame). Mutually exclusive with GBU_STATIC. 
+		 */
+		GBU_DYNAMIC = 0x02,
+		/** Signifies that the buffer's data on the GPU can be read by the CPU. */
+		GBU_READABLE = 0x04
 	};
 
 	/** Types of generic GPU buffers that may be attached to GPU programs. */
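Since the usage values are now distinct bit flags, GBU_READABLE can be combined with either of the other two, and back-ends are expected to test individual bits rather than switch on the whole value (as the D3D11 and GL changes below do). A minimal sketch of that flag handling, assuming only the enum above; the helper names and main() are illustrative, not engine API:

// Minimal sketch of the new bit-flag semantics; mirrors the enum in BsCommonTypes.h.
#include <cstdint>

enum GpuBufferUsage : std::uint32_t
{
	GBU_STATIC   = 0x01, // mutually exclusive with GBU_DYNAMIC
	GBU_DYNAMIC  = 0x02, // mutually exclusive with GBU_STATIC
	GBU_READABLE = 0x04  // may be combined with either of the above
};

inline bool isDynamic(std::uint32_t usage)     { return (usage & GBU_DYNAMIC) != 0; }
inline bool isCPUReadable(std::uint32_t usage) { return (usage & GBU_READABLE) != 0; }

int main()
{
	std::uint32_t usage = GBU_DYNAMIC | GBU_READABLE; // e.g. a per-frame buffer that is also read back on the CPU
	return (isDynamic(usage) && isCPUReadable(usage)) ? 0 : 1;
}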

+ 1 - 1
Source/BansheeCore/Source/BsVertexBuffer.cpp

@@ -10,7 +10,7 @@ namespace BansheeEngine
 	{ }
 
 	VertexBufferCore::VertexBufferCore(const VERTEX_BUFFER_DESC& desc, GpuDeviceFlags deviceMask)
-		:HardwareBuffer(mProperties.mVertexSize * mProperties.mNumVertices), mProperties(desc.numVerts, desc.vertexSize)
+		:HardwareBuffer(desc.vertexSize * desc.numVerts), mProperties(desc.numVerts, desc.vertexSize)
 	{ }
 
 	SPtr<VertexBufferCore> VertexBufferCore::create(const VERTEX_BUFFER_DESC& desc, GpuDeviceFlags deviceMask)

+ 1 - 7
Source/BansheeD3D11RenderAPI/Source/BsD3D11Mappings.cpp

@@ -812,13 +812,7 @@ namespace BansheeEngine
 
 	bool D3D11Mappings::isDynamic(GpuBufferUsage usage)
 	{
-		switch (usage)
-		{
-		case GBU_DYNAMIC:
-			return true;
-		}
-
-		return false;
+		return (usage & GBU_DYNAMIC) != 0;
 	}
 
 	bool D3D11Mappings::isMappingWrite(D3D11_MAP map)

+ 7 - 9
Source/BansheeGLRenderAPI/Source/BsGLHardwareBufferManager.cpp

@@ -54,15 +54,13 @@ namespace BansheeEngine
 
     GLenum GLHardwareBufferCoreManager::getGLUsage(GpuBufferUsage usage)
     {
-        switch(usage)
-        {
-        case GBU_STATIC:
-            return GL_STATIC_DRAW;
-        case GBU_DYNAMIC:
-            return GL_DYNAMIC_DRAW;
-        default:
-            return GL_DYNAMIC_DRAW;
-        };
+		if(usage & GBU_STATIC)
+			return GL_STATIC_DRAW;
+
+		if(usage & GBU_DYNAMIC)
+			return GL_DYNAMIC_DRAW;
+
+        return GL_DYNAMIC_DRAW;
     }
 
     GLenum GLHardwareBufferCoreManager::getGLType(VertexElementType type)

+ 35 - 6
Source/BansheeVulkanRenderAPI/Include/BsVulkanCommandBufferManager.h

@@ -13,28 +13,54 @@ namespace BansheeEngine
 	 */
 
 	/** Wrapper around a command buffer used specifically for transfer operations. */
-	class VulkanTransferBufferInfo
+	class VulkanTransferBuffer
 	{
 	public:
-		VulkanTransferBufferInfo(UINT32 queueIdx);
+		VulkanTransferBuffer();
+		VulkanTransferBuffer(VulkanDevice* device, GpuQueueType type, UINT32 queueIdx);
+		~VulkanTransferBuffer();
 
 		/** 
 		 * OR's the provided sync mask with the internal sync mask. The sync mask determines on which queues should
-		 * the buffer wait on before executing. See CommandSyncMask.
+		 * the buffer wait on before executing. Sync mask is reset after a flush. See CommandSyncMask on how to generate
+		 * a sync mask.
 		 */
 		void appendMask(UINT32 syncMask) { mSyncMask |= syncMask; }
 
 		/** Resets the sync mask. */
 		void clearMask() { mSyncMask = 0; }
 
+		/** 
+		 * Issues a pipeline barrier on the provided buffer. See vkCmdPipelineBarrier in Vulkan spec. for usage
+		 * information.
+		 */
+		void memoryBarrier(VkBuffer buffer, VkAccessFlags srcAccessFlags, VkAccessFlags dstAccessFlags,
+						   VkPipelineStageFlags srcStage, VkPipelineStageFlags dstStage);
+
+		/** 
+		 * Submits the command buffer on the queue. 
+		 * 
+		 *	@param[in]	wait	If true, the caller thread will wait until all device operations on the command buffer's
+		 *						queue complete.	
+		 */
+		void flush(bool wait);
+
 		/** Returns the internal command buffer. */
 		VulkanCmdBuffer* getCB() const { return mCB; }
 	private:
 		friend class VulkanCommandBufferManager;
 
+		/** Allocates a new internal command buffer. */
+		void allocate();
+
+		VulkanDevice* mDevice;
+		GpuQueueType mType;
+		UINT32 mQueueIdx;
+		VulkanQueue* mQueue;
+		UINT32 mQueueMask;
+
 		VulkanCmdBuffer* mCB;
 		UINT32 mSyncMask;
-		UINT32 mQueueIdx;
 	};
 
 	/** 
@@ -78,7 +104,7 @@ namespace BansheeEngine
 		 * Transfer buffers are automatically flushed (submitted) whenever a new (normal) command buffer is about to
 		 * execute.
 		 */
-		VulkanTransferBufferInfo* getTransferBuffer(UINT32 deviceIdx, GpuQueueType type, UINT32 queueIdx);
+		VulkanTransferBuffer* getTransferBuffer(UINT32 deviceIdx, GpuQueueType type, UINT32 queueIdx);
 
 		/** Submits all transfer command buffers, ensuring all queued transfer operations get executed. */
 		void flushTransferBuffers(UINT32 deviceIdx);
@@ -88,7 +114,7 @@ namespace BansheeEngine
 		struct PerDeviceData
 		{
 			VulkanCmdBuffer* activeBuffers[BS_MAX_UNIQUE_QUEUES];
-			VulkanTransferBufferInfo transferBuffers[BS_MAX_UNIQUE_QUEUES];
+			VulkanTransferBuffer transferBuffers[GQT_COUNT][BS_MAX_QUEUES_PER_TYPE];
 		};
 
 		const VulkanRenderAPI& mRapi;
@@ -97,5 +123,8 @@ namespace BansheeEngine
 		UINT32 mNumDevices;
 	};
 
+	/**	Provides easy access to the VulkanCommandBufferManager. */
+	VulkanCommandBufferManager& gVulkanCBManager();
+
 	/** @} */
 }
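To show how the pieces above are meant to fit together, here is a sketch of the transfer-buffer flow the mapping code later in this commit uses before a CPU read: wait on every queue that still has writes pending, make those writes visible to the host, then submit and block. All names come from headers in this commit; the wrapping function itself is illustrative only.

// Sketch only: the call sequence used before mapping a GPU-writable buffer for CPU access.
// Assumes BsVulkanCommandBufferManager.h, BsVulkanHardwareBuffer.h and BsVulkanResource.h.
void waitForGpuWrites(VulkanBuffer* buffer, UINT32 deviceIdx, GpuQueueType type, UINT32 localQueueIdx)
{
	VulkanTransferBuffer* transferCB = gVulkanCBManager().getTransferBuffer(deviceIdx, type, localQueueIdx);

	// Make flush() wait on every queue that currently has pending writes to the buffer.
	UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
	transferCB->appendMask(writeUseMask);

	// Make the GPU writes visible to host reads (read-after-write hazard).
	transferCB->memoryBarrier(buffer->getHandle(),
							  VK_ACCESS_SHADER_WRITE_BIT,
							  VK_ACCESS_HOST_READ_BIT,
							  VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
							  VK_PIPELINE_STAGE_HOST_BIT);

	// Submit and block until the queue is idle; command buffer states are refreshed by flush().
	transferCB->flush(true);
}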

+ 35 - 3
Source/BansheeVulkanRenderAPI/Include/BsVulkanHardwareBuffer.h

@@ -25,6 +25,22 @@ namespace BansheeEngine
 		/** Returns a buffer view that covers the entire buffer. */
 		VkBufferView getView() const { return mView; }
 
+		/** 
+		 * Returns a pointer to internal buffer memory. Must be followed by unmap(). Caller must ensure the buffer was
+		 * created in CPU-readable memory, and that the buffer isn't currently being written to by the GPU.
+		 */
+		UINT8* map(VkDeviceSize offset, VkDeviceSize length) const;
+
+		/** Unmaps a buffer previously mapped with map(). */
+		void unmap();
+
+		/** 
+		 * Queues a command on the provided command buffer. The command copies the contents of the current buffer to
+		 * the destination buffer. Caller must ensure the provided offsets and lengths are within valid bounds of
+		 * both buffers.
+		 */
+		void copy(VulkanTransferBuffer* cb, VulkanBuffer* destination, VkDeviceSize offset, VkDeviceSize length);
+
 	private:
 		VkBuffer mBuffer;
 		VkBufferView mView;
@@ -48,8 +64,6 @@ namespace BansheeEngine
 			BT_GENERIC = 0x8,
 			/** Generic read/write GPU buffer containing formatted data. */
 			BT_STORAGE = 0x10,
-			/** Helper buffer that can be written to on the CPU. Used for copy operations. */
-			BT_STAGING = 0x20,
 		};
 
 		VulkanHardwareBuffer(BufferType type, GpuBufferFormat format, GpuBufferUsage usage, UINT32 size,
@@ -80,8 +94,26 @@ namespace BansheeEngine
 		/** @copydoc HardwareBuffer::unmap */
 		void unmap() override;
 
+		/** Creates a new buffer for the specified device, matching the current buffer properties. */
+		VulkanBuffer* createBuffer(VulkanDevice& device, bool staging, bool readable);
+
 		VulkanBuffer* mBuffers[BS_MAX_DEVICES];
-		bool mStaging;
+
+		VulkanBuffer* mStagingBuffer;
+		UINT32 mMappedDeviceIdx;
+		UINT32 mMappedGlobalQueueIdx;
+		UINT32 mMappedOffset;
+		UINT32 mMappedSize;
+		GpuLockOptions mMappedLockOptions;
+
+		VkBufferCreateInfo mBufferCI;
+		VkBufferViewCreateInfo mViewCI;
+		VkBufferUsageFlags mUsageFlags;
+		bool mDirectlyMappable : 1;
+		bool mSupportsGPUWrites : 1;
+		bool mRequiresView : 1;
+		bool mReadable : 1;
+		bool mIsMapped : 1;
 	};
 
 	/** @} */
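The map()/unmap()/copy() primitives on VulkanBuffer are what the staging read-back path builds on. A hedged sketch of that path, assuming stagingBuffer was created host-visible and large enough (e.g. via the createBuffer() helper declared above) and that transferCB came from getTransferBuffer(); the wrapping function is illustrative:

// Sketch: read `length` bytes at `offset` out of a device-local buffer through a staging buffer.
// Assumes the engine headers above plus <cstring> for memcpy.
void readBack(VulkanBuffer* gpuBuffer, VulkanBuffer* stagingBuffer, VulkanTransferBuffer* transferCB,
			  VkDeviceSize offset, VkDeviceSize length, UINT8* out)
{
	// Record a GPU-side copy into the staging buffer.
	gpuBuffer->copy(transferCB, stagingBuffer, offset, length);

	// Make the transfer writes visible to the host before mapping the staging memory.
	transferCB->memoryBarrier(stagingBuffer->getHandle(),
							  VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT,
							  VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT);

	// Submit and wait so the copy has completed before the CPU touches the memory.
	transferCB->flush(true);

	UINT8* src = stagingBuffer->map(offset, length);
	memcpy(out, src, (size_t)length);
	stagingBuffer->unmap();
}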

+ 1 - 0
Source/BansheeVulkanRenderAPI/Include/BsVulkanPrerequisites.h

@@ -56,6 +56,7 @@ namespace BansheeEngine
 	class VulkanImage;
 	class VulkanDescriptorPool;
 	class VulkanGpuParams;
+	class VulkanTransferBuffer;
 
 	VkAllocationCallbacks* gVulkanAllocator = nullptr;
 

+ 3 - 0
Source/BansheeVulkanRenderAPI/Include/BsVulkanQueue.h

@@ -38,6 +38,9 @@ namespace BansheeEngine
 		/** Submits the provided command buffer on the queue. */
 		void submit(VulkanCmdBuffer* cmdBuffer, VkSemaphore* waitSemaphores, UINT32 semaphoresCount);
 
+		/** Blocks the calling thread until all operations on the queue finish. */
+		void waitIdle() const;
+
 	protected:
 		VulkanDevice& mDevice;
 		VkQueue mQueue;

+ 3 - 3
Source/BansheeVulkanRenderAPI/Include/BsVulkanResource.h

@@ -105,13 +105,13 @@ namespace BansheeEngine
 		UINT32 getQueueFamily() const { Lock(mMutex); return mQueueFamily; }
 
 		/** 
-		 * Returns a mask that has bits set for every queue that the resource is currently used by.
+		 * Returns a mask that has bits set for every queue that the resource is currently used (read or written) by.
 		 *
-		 * @param[out]	useFlags	Output parameter that notifies the caller in what way is the resource being used.
+		 * @param[in]	useFlags	Flags for which to check use information (e.g. read only, write only, or both).
 		 * @return					Bitmask of which queues is the resource used on. This has the same format as sync mask
 		 *							created by CommandSyncMask.
 		 */
-		UINT32 getUseInfo(VulkanUseFlags& useFlags) const;
+		UINT32 getUseInfo(VulkanUseFlags useFlags) const;
 
 		/** Returns true if the resource is only allowed to be used by a single queue family at once. */
 		bool isExclusive() const { Lock(mMutex); return mState != State::Shared; }

+ 6 - 6
Source/BansheeVulkanRenderAPI/Source/BsVulkanCommandBuffer.cpp

@@ -330,8 +330,8 @@ namespace BansheeEngine
 			UINT32 numBufferBarriers = (UINT32)barriers.bufferBarriers.size();
 
 			vkCmdPipelineBarrier(vkCmdBuffer,
-								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // Note: VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT might be more correct here, according to the spec
+								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, //       The main idea is that the barrier executes before the semaphore triggers, no actual stage dependencies are needed.
 								 0, 0, nullptr,
 								 numBufferBarriers, barriers.bufferBarriers.data(),
 								 numImgBarriers, barriers.imageBarriers.data());
@@ -406,8 +406,8 @@ namespace BansheeEngine
 			UINT32 numBufferBarriers = (UINT32)barriers.bufferBarriers.size();
 
 			vkCmdPipelineBarrier(vkCmdBuffer,
-								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // Note: VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT might be more correct here, according to the spec
+								 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 
 								 0, 0, nullptr,
 								 numBufferBarriers, barriers.bufferBarriers.data(),
 								 numImgBarriers, barriers.imageBarriers.data());
@@ -448,8 +448,6 @@ namespace BansheeEngine
 			entry.first->notifyUsed(mGlobalQueueIdx, mQueueFamily, useHandle.flags);
 		}
 
-		cbm.refreshStates(deviceIdx);
-
 		// Note: Uncommented for debugging only, prevents any device concurrency issues.
 		// vkQueueWaitIdle(queue->getHandle());
 
@@ -655,6 +653,8 @@ namespace BansheeEngine
 		syncMask &= ~mIdMask;
 
 		mBuffer->submit(mQueue, mQueueIdx, syncMask);
+
+		gVulkanCBManager().refreshStates(mDeviceIdx);
 		acquireNewBuffer();
 	}
 }

+ 93 - 41
Source/BansheeVulkanRenderAPI/Source/BsVulkanCommandBufferManager.cpp

@@ -4,21 +4,99 @@
 #include "BsVulkanCommandBuffer.h"
 #include "BsVulkanRenderAPI.h"
 #include "BsVulkanDevice.h"
+#include "BsVulkanQueue.h"
 
 namespace BansheeEngine
 {
-	VulkanTransferBufferInfo::VulkanTransferBufferInfo(UINT32 queueIdx)
-		:mCB(nullptr), mSyncMask(0), mQueueIdx(queueIdx)
+	VulkanTransferBuffer::VulkanTransferBuffer()
+		:mDevice(nullptr), mType(GQT_GRAPHICS), mQueueIdx(0), mQueue(nullptr), mCB(nullptr), mSyncMask(0), mQueueMask(0)
 	{ }
 
+	VulkanTransferBuffer::VulkanTransferBuffer(VulkanDevice* device, GpuQueueType type, UINT32 queueIdx)
+		:mDevice(device), mType(type), mQueueIdx(queueIdx), mQueue(nullptr), mCB(nullptr), mSyncMask(0), mQueueMask(0)
+	{
+		UINT32 numQueues = device->getNumQueues(mType);
+		if (numQueues == 0)
+		{
+			mType = GQT_GRAPHICS;
+			numQueues = device->getNumQueues(GQT_GRAPHICS);
+		}
+
+		UINT32 physicalQueueIdx = queueIdx % numQueues;
+		mQueue = device->getQueue(mType, physicalQueueIdx);
+		mQueueMask = device->getQueueMask(mType, queueIdx);
+	}
+
+	VulkanTransferBuffer::~VulkanTransferBuffer()
+	{
+		if (mCB != nullptr)
+			mCB->end();
+	}
+
+	void VulkanTransferBuffer::allocate()
+	{
+		if (mCB != nullptr)
+			return;
+
+		UINT32 queueFamily = mDevice->getQueueFamily(mType);
+		mCB = mDevice->getCmdBufferPool().getBuffer(queueFamily, false);
+	}
+
+	void VulkanTransferBuffer::memoryBarrier(VkBuffer buffer, VkAccessFlags srcAccessFlags, VkAccessFlags dstAccessFlags,
+					   VkPipelineStageFlags srcStage, VkPipelineStageFlags dstStage)
+	{
+		VkBufferMemoryBarrier barrier;
+		barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+		barrier.pNext = nullptr;
+		barrier.srcAccessMask = srcAccessFlags;
+		barrier.dstAccessMask = dstAccessFlags;
+		barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+		barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+		barrier.buffer = buffer;
+		barrier.offset = 0;
+		barrier.size = VK_WHOLE_SIZE;
+
+		vkCmdPipelineBarrier(mCB->getHandle(),
+							 srcStage,
+							 dstStage,
+							 0, 0, nullptr,
+							 1, &barrier,
+							 0, nullptr);
+	}
+
+	void VulkanTransferBuffer::flush(bool wait)
+	{
+		UINT32 syncMask = mSyncMask & ~mQueueMask; // Don't sync with itself
+
+		mCB->end();
+		mCB->submit(mQueue, mQueueIdx, syncMask);
+
+		if (wait)
+		{
+			mQueue->waitIdle();
+			gVulkanCBManager().refreshStates(mDevice->getIndex());
+		}
+
+		mCB = nullptr;
+	}
+
 	VulkanCommandBufferManager::VulkanCommandBufferManager(const VulkanRenderAPI& rapi)
 		:mRapi(rapi), mDeviceData(nullptr), mNumDevices(rapi.getNumDevices())
 	{
 		mDeviceData = bs_newN<PerDeviceData>(mNumDevices);
 		for (UINT32 i = 0; i < mNumDevices; i++)
 		{
+			SPtr<VulkanDevice> device = rapi._getDevice(i);
+
 			bs_zero_out(mDeviceData[i].activeBuffers);
-			bs_zero_out(mDeviceData[i].transferBuffers);
+
+			for (UINT32 j = 0; j < GQT_COUNT; j++)
+			{
+				GpuQueueType queueType = (GpuQueueType)j;
+
+				for (UINT32 k = 0; k < BS_MAX_QUEUES_PER_TYPE; k++)
+					mDeviceData[i].transferBuffers[j][k] = VulkanTransferBuffer(device.get(), queueType, k);
+			}
 		}
 	}
 
@@ -97,58 +175,32 @@ namespace BansheeEngine
 		}
 	}
 
-	VulkanTransferBufferInfo* VulkanCommandBufferManager::getTransferBuffer(UINT32 deviceIdx, GpuQueueType type, 
+	VulkanTransferBuffer* VulkanCommandBufferManager::getTransferBuffer(UINT32 deviceIdx, GpuQueueType type,
 		UINT32 queueIdx)
 	{
 		assert(deviceIdx < mNumDevices);
 
-		UINT32 globalQueueIdx = CommandSyncMask::getGlobalQueueIdx(type, queueIdx);
-		assert(globalQueueIdx < BS_MAX_UNIQUE_QUEUES);
-
 		PerDeviceData& deviceData = mDeviceData[deviceIdx];
-		if (deviceData.transferBuffers[globalQueueIdx].mCB == nullptr)
-		{
-			SPtr<VulkanDevice> device = mRapi._getDevice(deviceIdx);
-
-			UINT32 queueFamily = device->getQueueFamily(type);
-			deviceData.transferBuffers[globalQueueIdx].mCB = device->getCmdBufferPool().getBuffer(queueFamily, false);
-		}
 
-		return &deviceData.transferBuffers[globalQueueIdx];
+		VulkanTransferBuffer* transferBuffer = &deviceData.transferBuffers[type][queueIdx];
+		transferBuffer->allocate();
+		return transferBuffer;
 	}
 
 	void VulkanCommandBufferManager::flushTransferBuffers(UINT32 deviceIdx)
 	{
 		assert(deviceIdx < mNumDevices);
 
-		SPtr<VulkanDevice> device = mRapi._getDevice(deviceIdx);
 		PerDeviceData& deviceData = mDeviceData[deviceIdx];
-
-		UINT32 transferBufferIdx = 0;
-		for(UINT32 i = 0; i < GQT_COUNT; i++)
+		for (UINT32 i = 0; i < GQT_COUNT; i++)
 		{
-			GpuQueueType queueType = (GpuQueueType)i;
-			UINT32 numQueues = device->getNumQueues(queueType);
-			if (numQueues == 0)
-			{
-				queueType = GQT_GRAPHICS;
-				numQueues = device->getNumQueues(GQT_GRAPHICS);
-			}
-
-			for(UINT32 j = 0; j < BS_MAX_QUEUES_PER_TYPE; j++)
-			{
-				VulkanTransferBufferInfo& bufferInfo = deviceData.transferBuffers[transferBufferIdx];
-				if (bufferInfo.mCB == nullptr)
-					continue;
-
-				UINT32 physicalQueueIdx = j % numQueues;
-				VulkanQueue* queue = device->getQueue(queueType, physicalQueueIdx);
-
-				bufferInfo.mCB->submit(queue, bufferInfo.mQueueIdx, bufferInfo.mSyncMask);
-				bufferInfo.mCB = nullptr;
-
-				transferBufferIdx++;
-			}
+			for (UINT32 j = 0; j < BS_MAX_QUEUES_PER_TYPE; j++)
+				deviceData.transferBuffers[i][j].flush(false);
 		}
 	}
+
+	VulkanCommandBufferManager& gVulkanCBManager()
+	{
+		return static_cast<VulkanCommandBufferManager&>(CommandBufferManager::instance());
+	}
 }

+ 259 - 104
Source/BansheeVulkanRenderAPI/Source/BsVulkanHardwareBuffer.cpp

@@ -4,7 +4,7 @@
 #include "BsVulkanRenderAPI.h"
 #include "BsVulkanDevice.h"
 #include "BsVulkanUtility.h"
-#include "BsException.h"
+#include "BsVulkanCommandBufferManager.h"
 
 namespace BansheeEngine
 {
@@ -25,16 +25,42 @@ namespace BansheeEngine
 		device.freeMemory(mMemory);
 	}
 
-	VulkanHardwareBuffer::VulkanHardwareBuffer(BufferType type, GpuBufferFormat format, GpuBufferUsage usage, 
-		UINT32 size, GpuDeviceFlags deviceMask)
-		: HardwareBuffer(size), mBuffers(), mStaging(type == BT_STAGING)
+	UINT8* VulkanBuffer::map(VkDeviceSize offset, VkDeviceSize length) const
 	{
-		bool needsView = false;
+		VulkanDevice& device = mOwner->getDevice();
 
-		VkMemoryPropertyFlags flags = mStaging ?
-			(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) : // Note: Try using cached uncoherent memory
-			VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+		UINT8* data;
+		VkResult result = vkMapMemory(device.getLogical(), mMemory, offset, length, 0, (void**)&data);
+		assert(result == VK_SUCCESS);
+
+		return data;
+	}
+
+	void VulkanBuffer::unmap()
+	{
+		VulkanDevice& device = mOwner->getDevice();
+
+		vkUnmapMemory(device.getLogical(), mMemory);
+	}
+
+	void VulkanBuffer::copy(VulkanTransferBuffer* cb, VulkanBuffer* destination, VkDeviceSize offset, VkDeviceSize length)
+	{
+		VkBufferCopy region;
+		region.size = length;
+		region.srcOffset = offset;
+		region.dstOffset = offset;
+
+		vkCmdCopyBuffer(cb->getCB()->getHandle(), mBuffer, destination->getHandle(), 1, &region);
+	}
 
+	VulkanHardwareBuffer::VulkanHardwareBuffer(BufferType type, GpuBufferFormat format, GpuBufferUsage usage, 
+		UINT32 size, GpuDeviceFlags deviceMask)
+		: HardwareBuffer(size), mBuffers(), mStagingBuffer(nullptr), mMappedDeviceIdx(-1), mMappedGlobalQueueIdx(-1)
+		, mMappedOffset(0), mMappedSize(0), mMappedLockOptions(GBL_WRITE_ONLY)
+		, mDirectlyMappable((usage & GBU_DYNAMIC) != 0)
+		, mSupportsGPUWrites(type == BT_STORAGE), mRequiresView(false), mReadable((usage & GBU_READABLE) != 0)
+		, mIsMapped(false)
+	{
 		VkBufferUsageFlags usageFlags = 0;
 		switch(type)
 		{
@@ -49,17 +75,30 @@ namespace BansheeEngine
 			break;
 		case BT_GENERIC:
 			usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
-			needsView = true;
+			mRequiresView = true;
 			break;
 		case BT_STORAGE:
 			usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
-			needsView = true;
-			break;
-		case BT_STAGING:
-			usageFlags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+			mRequiresView = true;
 			break;
 		}
 
+		mBufferCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+		mBufferCI.pNext = nullptr;
+		mBufferCI.flags = 0;
+		mBufferCI.size = size;
+		mBufferCI.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+		mBufferCI.usage = usageFlags;
+		mBufferCI.queueFamilyIndexCount = 0;
+		mBufferCI.pQueueFamilyIndices = nullptr;
+
+		mViewCI.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO;
+		mViewCI.pNext = nullptr;
+		mViewCI.flags = 0;
+		mViewCI.format = VulkanUtility::getBufferFormat(format);
+		mViewCI.offset = 0;
+		mViewCI.range = VK_WHOLE_SIZE;
+
 		VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPICore::instance());
 		VulkanDevice* devices[BS_MAX_DEVICES];
 		VulkanUtility::getDevices(rapi, deviceMask, devices);
@@ -70,48 +109,7 @@ namespace BansheeEngine
 			if (devices[i] == nullptr)
 				continue;
 
-			VkBufferCreateInfo bufferCI;
-			bufferCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
-			bufferCI.pNext = nullptr;
-			bufferCI.flags = 0; 
-			bufferCI.size = size;
-			bufferCI.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
-			bufferCI.usage = usageFlags;
-			bufferCI.queueFamilyIndexCount = 0;
-			bufferCI.pQueueFamilyIndices = nullptr;
-
-			VkDevice device = devices[i]->getLogical();
-
-			VkBuffer buffer;
-			VkResult result = vkCreateBuffer(device, &bufferCI, gVulkanAllocator, &buffer);
-			assert(result == VK_SUCCESS);
-
-			VkMemoryRequirements memReqs;
-			vkGetBufferMemoryRequirements(device, buffer, &memReqs);
-
-			VkDeviceMemory memory = devices[i]->allocateMemory(memReqs, flags);
-			result = vkBindBufferMemory(device, buffer, memory, 0);
-			assert(result == VK_SUCCESS);
-
-			VkBufferView view;
-			if (needsView)
-			{
-				VkBufferViewCreateInfo bufferViewCI;
-				bufferViewCI.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO;
-				bufferViewCI.pNext = nullptr;
-				bufferViewCI.flags = 0;
-				bufferViewCI.buffer = buffer;
-				bufferViewCI.format = VulkanUtility::getBufferFormat(format);
-				bufferViewCI.offset = 0;
-				bufferViewCI.range = VK_WHOLE_SIZE;
-
-				result = vkCreateBufferView(device, &bufferViewCI, gVulkanAllocator, &view);
-				assert(result == VK_SUCCESS);
-			}
-			else
-				view = VK_NULL_HANDLE;
-
-			mBuffers[i] = devices[i]->getResourceManager().create<VulkanBuffer>(buffer, view, memory);
+			mBuffers[i] = createBuffer(*devices[i], false, mReadable);
 		}
 	}
 
@@ -124,83 +122,240 @@ namespace BansheeEngine
 
 			mBuffers[i]->destroy();
 		}
+
+		assert(mStagingBuffer == nullptr);
 	}
 
-	void* VulkanHardwareBuffer::map(UINT32 offset, UINT32 length, GpuLockOptions options, UINT32 deviceIdx, UINT32 queueIdx)
+	VulkanBuffer* VulkanHardwareBuffer::createBuffer(VulkanDevice& device, bool staging, bool readable)
 	{
-		if ((offset + length) > mSize)
+		VkBufferUsageFlags usage = mBufferCI.usage;
+		if (staging)
 		{
-			LOGERR("Provided offset(" + toString(offset) + ") + length(" + toString(length) + ") "
-				   "is larger than the buffer " + toString(mSize) + ".");
+			mBufferCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
 
-			return nullptr;
+			// Staging buffers are used as a destination for reads
+			if (readable)
+				mBufferCI.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
 		}
+		else if(readable) // Non-staging readable
+			mBufferCI.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
 
-		VulkanBuffer* buffer = mBuffers[deviceIdx];
+		VkMemoryPropertyFlags flags = (mDirectlyMappable || staging) ?
+			(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) : // Note: Try using cached memory
+			VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
 
-		if (buffer == nullptr)
-			return;
+		VkDevice vkDevice = device.getLogical();
 
-		bool directMap = mStaging && !buffer->isUsed();
+		VkBuffer buffer;
+		VkResult result = vkCreateBuffer(vkDevice, &mBufferCI, gVulkanAllocator, &buffer);
+		assert(result == VK_SUCCESS);
 
-		// If memory is host visible and buffer isn't used on the GPU, map directly (no need for pipeline barriers
-		// with access modifiers since we're sure the buffer isn't used on the GPU)
-		if (directMap)
-			return buffer->map();
+		VkMemoryRequirements memReqs;
+		vkGetBufferMemoryRequirements(vkDevice, buffer, &memReqs);
 
-		// TODO - Allocate staging buffer
+		VkDeviceMemory memory = device.allocateMemory(memReqs, flags);
+		result = vkBindBufferMemory(vkDevice, buffer, memory, 0);
+		assert(result == VK_SUCCESS);
 
-		bool needRead = options == GBL_READ_WRITE || options == GBL_READ_ONLY;
-		if(needRead)
+		VkBufferView view;
+		if (mRequiresView && !staging)
 		{
-			// TODO - Get command buffer on wanted queue (getTransferBuffer(deviceIdx, queueIdx))
-			//      - Generate sync mask depending on where the resource is used on (VulkanResource::getUseInfo())
-			//      - Register this buffer and staging buffer with the transfer buffer, updating the transfer buffer's sync mask
-			//      - Flush the transfer buffer, wait for it to complete, and refresh CB states
-			//      - Proceed below
+			mViewCI.buffer = buffer;
+
+			result = vkCreateBufferView(vkDevice, &mViewCI, gVulkanAllocator, &view);
+			assert(result == VK_SUCCESS);
 		}
+		else
+			view = VK_NULL_HANDLE;
 
-		// TODO - Return staging buffer->map()
-		//      - Set mRequiresUpload field to true
-		//      - Remember lock mode
-		//      - Remember staging buffer
-		//      - Remember lock queue and device
+		mBufferCI.usage = usage; // Restore original usage
+		return device.getResourceManager().create<VulkanBuffer>(buffer, view, memory);
+	}
 
-		switch (options)
+	void* VulkanHardwareBuffer::map(UINT32 offset, UINT32 length, GpuLockOptions options, UINT32 deviceIdx, UINT32 queueIdx)
+	{
+		if ((offset + length) > mSize)
 		{
-		case GBL_WRITE_ONLY_DISCARD:
+			LOGERR("Provided offset(" + toString(offset) + ") + length(" + toString(length) + ") "
+				   "is larger than the buffer " + toString(mSize) + ".");
 
-			break;
-		case GBL_WRITE_ONLY_NO_OVERWRITE:
+			return nullptr;
+		}
 
-			break;
-		case GBL_WRITE_ONLY:
+		VulkanBuffer* buffer = mBuffers[deviceIdx];
 
-			break;
-		case GBL_READ_WRITE:
+		if (buffer == nullptr)
+			return nullptr;
 
-			break;
-		case GBL_READ_ONLY:
+		mIsMapped = true;
+		mMappedDeviceIdx = deviceIdx;
+		mMappedGlobalQueueIdx = queueIdx;
+		mMappedOffset = offset;
+		mMappedSize = length;
+		mMappedLockOptions = options;
 
-			break;
+		VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPICore::instance());
+		VulkanDevice& device = *rapi._getDevice(deviceIdx);
+
+		VulkanCommandBufferManager& cbManager = gVulkanCBManager();
+		GpuQueueType queueType;
+		UINT32 localQueueIdx = CommandSyncMask::getQueueIdxAndType(queueIdx, queueType);
+
+		VkAccessFlags accessFlags;
+		if (options == GBL_READ_ONLY)
+			accessFlags = VK_ACCESS_HOST_READ_BIT;
+		else if (options == GBL_READ_WRITE)
+			accessFlags = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;
+		else
+			accessFlags = VK_ACCESS_HOST_WRITE_BIT;
+
+		// If memory is host visible try mapping it directly
+		if(mDirectlyMappable)
+		{
+			// If GPU has the ability to write to the buffer we must issue a pipeline barrier to prevent any memory hazards
+			//  - Additionally it might be possible the GPU is /currently/ writing to the buffer, in which case we need to
+			//    wait for those writes to finish before continuing
+			if(mSupportsGPUWrites) // Note: It might be tempting to do this step only if the buffer is currently being 
+								   // written to, but that doesn't guarantee memory visibility if it was written to recently
+			{
+				// First try to avoid the expensive wait operation and barrier
+				if(options == GBL_WRITE_ONLY_NO_OVERWRITE) // Caller guarantees he won't touch the same data as the GPU, so just map
+					return buffer->map(offset, length);
+
+				if(options == GBL_WRITE_ONLY_DISCARD) // Caller doesn't care about buffer contents, so just discard the 
+				{									  // existing buffer and create a new one
+					buffer->destroy();
+
+					buffer = createBuffer(device, false, mReadable);
+					mBuffers[deviceIdx] = buffer;
+
+					return buffer->map(offset, length);
+				}
+
+				// Otherwise we need to wait until (potential) GPU write completes, and issue a barrier so:
+				//  - If reading: the device makes the written memory available for read (read-after-write hazard)
+				//  - If writing: ensures our writes properly overlap with GPU writes (write-after-write hazard)
+				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
+
+				// Ensure flush() will wait for all queues currently writing to the buffer (if any) to finish
+				UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
+				transferCB->appendMask(writeUseMask); 
+
+				// Issue barrier to avoid memory hazards
+				transferCB->memoryBarrier(buffer->getHandle(),
+										  VK_ACCESS_SHADER_WRITE_BIT,
+										  accessFlags,
+										  // Last stages that could have written to the buffer:
+										  VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 
+										  VK_PIPELINE_STAGE_HOST_BIT
+				);
+
+				// Submit the command buffer and wait until it finishes
+				transferCB->flush(true);
+				assert(!buffer->isUsed());
+			}
+
+			return buffer->map(offset, length);
 		}
+		else // Otherwise we use a staging buffer
+		{
+			bool needRead = options == GBL_READ_WRITE || options == GBL_READ_ONLY;
+
+			// Allocate a staging buffer
+			mStagingBuffer = createBuffer(device, true, needRead);
 
-		return nullptr;
+			if (needRead) // If reading, we need to copy the current contents of the buffer to the staging buffer
+			{
+				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
+				
+				// Similar to above, if buffer supports GPU writes, we need to wait on any potential writes to complete
+				if(mSupportsGPUWrites)
+				{
+					// Ensure flush() will wait for all queues currently writing to the buffer (if any) to finish
+					UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
+					transferCB->appendMask(writeUseMask);
+				}
+
+				// Queue copy command
+				buffer->copy(transferCB, mStagingBuffer, offset, length);
+
+				// Ensure data written to the staging buffer is visible
+				transferCB->memoryBarrier(buffer->getHandle(),
+										  VK_ACCESS_TRANSFER_WRITE_BIT,
+										  accessFlags,
+										  VK_PIPELINE_STAGE_TRANSFER_BIT,
+										  VK_PIPELINE_STAGE_HOST_BIT
+				);
+
+				// Submit the command buffer and wait until it finishes
+				transferCB->flush(true);
+				assert(!buffer->isUsed());
+			}
+
+			return mStagingBuffer->map(offset, length);
+		}
 	}
 
 	void VulkanHardwareBuffer::unmap()
 	{
-		// TODO - If direct map (mRequiresUpload == false), simply unmap
-		// TODO - If mRequiresUpload is true
-		//      - Get command buffer on locked queue and device
-		//      - If lock was discard
-		//        - Create a brand new internal buffer
-		//        - Call destroy on the old one
-		//        - Issue copy on the CB without a sync mask (register both resources on CB)
-		//      - If lock was no overwrite
-		//        - Issue copy on the CB without a sync mask (register both resources on CB)
-		//      - Otherwise issue copy with a sync mask depending on current use flags
-		//      - Destroy staging buffer
+		// Possibly map() failed with some error
+		if (!mIsMapped)
+			return;
+
+		// Note: If we did any writes they need to be made visible to the GPU. However there is no need to execute 
+		// a pipeline barrier because (as per spec) host writes are implicitly visible to the device.
+
+		if(mDirectlyMappable)
+			mBuffers[mMappedDeviceIdx]->unmap();
+		else
+		{
+			bool isWrite = mMappedLockOptions != GBL_READ_ONLY;
+
+			// If the caller wrote anything to the staging buffer, we need to upload it back to the main buffer
+			if(isWrite)
+			{
+				VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPICore::instance());
+				VulkanDevice& device = *rapi._getDevice(mMappedDeviceIdx);
+
+				VulkanCommandBufferManager& cbManager = gVulkanCBManager();
+				GpuQueueType queueType;
+				UINT32 localQueueIdx = CommandSyncMask::getQueueIdxAndType(mMappedGlobalQueueIdx, queueType);
+
+				VulkanBuffer* buffer = mBuffers[mMappedDeviceIdx];
+				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(mMappedDeviceIdx, queueType, localQueueIdx);
+
+				// If the buffer is used in any way on the GPU, we need to wait for that use to finish before
+				// we issue our copy
+				UINT32 useMask = buffer->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);
+				if(useMask != 0) // Buffer is currently used on the GPU
+				{
+					// Try to avoid the wait
+					if (mMappedLockOptions == GBL_WRITE_ONLY_NO_OVERWRITE) // Caller guarantees he won't touch the same data as the GPU, so just copy
+					{
+						// Fall through to copy()
+					}
+					else if (mMappedLockOptions == GBL_WRITE_ONLY_DISCARD) // Caller doesn't care about buffer contents, so just discard the 
+					{													   // existing buffer and create a new one
+						buffer->destroy();
+
+						buffer = createBuffer(device, false, mReadable);
+						mBuffers[mMappedDeviceIdx] = buffer;
+					} 
+					else // Otherwise we have no choice but to issue a dependency between the queues
+						transferCB->appendMask(useMask);
+				}
+				
+				// Queue copy command
+				mStagingBuffer->copy(transferCB, buffer, mMappedOffset, mMappedSize);
+			}
+
+			mStagingBuffer->unmap();
+
+			mStagingBuffer->destroy();
+			mStagingBuffer = nullptr;
+		}
+
+		mIsMapped = false;
 	}
 
 	void VulkanHardwareBuffer::copyData(HardwareBuffer& srcBuffer, UINT32 srcOffset,
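The branching in map() and unmap() above boils down to a small decision: if the GPU can write to the buffer, avoid the expensive wait whenever the lock options allow it. Condensed into a sketch for reference; the MapAction names and the helper are illustrative, only GpuLockOptions and its values come from the engine:

// Sketch of the directly-mappable decision logic in map(); assumes BsCommonTypes.h for GpuLockOptions.
enum class MapAction { MapDirectly, DiscardAndRecreate, WaitForGpuThenMap };

MapAction chooseDirectMapAction(GpuLockOptions options, bool supportsGPUWrites)
{
	if (!supportsGPUWrites)
		return MapAction::MapDirectly; // No GPU writer, so no hazard regardless of lock options

	switch (options)
	{
	case GBL_WRITE_ONLY_NO_OVERWRITE: // Caller promises not to touch data the GPU is using
		return MapAction::MapDirectly;
	case GBL_WRITE_ONLY_DISCARD:      // Contents are irrelevant, swap in a fresh buffer instead of waiting
		return MapAction::DiscardAndRecreate;
	default:                          // Read or overlapping write: sync with writers and issue a barrier
		return MapAction::WaitForGpuThenMap;
	}
}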

+ 8 - 1
Source/BansheeVulkanRenderAPI/Source/BsVulkanQueue.cpp

@@ -36,8 +36,15 @@ namespace BansheeEngine
 		else
 			submitInfo.pWaitSemaphores = nullptr;
 
-		vkQueueSubmit(mQueue, 1, &submitInfo, cmdBuffer->getFence());
+		VkResult result = vkQueueSubmit(mQueue, 1, &submitInfo, cmdBuffer->getFence());
+		assert(result == VK_SUCCESS);
 
 		mLastCommandBuffer = cmdBuffer;
 	}
+
+	void VulkanQueue::waitIdle() const
+	{
+		VkResult result = vkQueueWaitIdle(mQueue);
+		assert(result == VK_SUCCESS);
+	}
 }

+ 12 - 10
Source/BansheeVulkanRenderAPI/Source/BsVulkanResource.cpp

@@ -97,23 +97,25 @@ namespace BansheeEngine
 			mOwner->destroy(this);
 	}
 
-	UINT32 VulkanResource::getUseInfo(VulkanUseFlags& useFlags) const
+	UINT32 VulkanResource::getUseInfo(VulkanUseFlags useFlags) const
 	{
-		useFlags = VulkanUseFlag::None;
-
 		UINT32 mask = 0;
-		for(UINT32 i = 0; i < MAX_UNIQUE_QUEUES; i++)
+
+		if(useFlags.isSet(VulkanUseFlag::Read))
 		{
-			if (mReadUses[i] > 0)
+			for (UINT32 i = 0; i < MAX_UNIQUE_QUEUES; i++)
 			{
-				mask |= 1 << i;
-				useFlags |= VulkanUseFlag::Read;
+				if (mReadUses[i] > 0)
+					mask |= 1 << i;
 			}
+		}
 
-			if (mWriteUses[i] > 0)
+		if (useFlags.isSet(VulkanUseFlag::Write))
+		{
+			for (UINT32 i = 0; i < MAX_UNIQUE_QUEUES; i++)
 			{
-				mask |= 1 << i;
-				useFlags |= VulkanUseFlag::Write;
+				if (mWriteUses[i] > 0)
+					mask |= 1 << i;
 			}
 		}