
Vulkan texture locking/unlocking WIP

BearishSun 9 years ago
parent
commit
9e1b7aea56

+ 7 - 0
Source/BansheeVulkanRenderAPI/Include/BsVulkanCommandBufferManager.h

@@ -37,6 +37,13 @@ namespace bs
 		void memoryBarrier(VkBuffer buffer, VkAccessFlags srcAccessFlags, VkAccessFlags dstAccessFlags,
 						   VkPipelineStageFlags srcStage, VkPipelineStageFlags dstStage);
 
+		/** 
+		 * Issues a pipeline barrier on the provided image, changing its layout. See vkCmdPipelineBarrier in the
+		 * Vulkan spec for usage information.
+		 */
+		void setLayout(VkImage image, VkAccessFlags srcAccessFlags, VkAccessFlags dstAccessFlags, 
+			VkImageLayout oldLayout, VkImageLayout newLayout, const VkImageSubresourceRange& range);
+
 		/** 
 		 * Submits the command buffer on the queue. 
 		 * 
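
Note (illustrative, not part of this commit): a minimal sketch of how the setLayout() helper declared above might be used to move a sampled color subresource into a transfer layout for a readback, and back again afterwards. The transfer buffer and image handle are assumed to come from the surrounding Vulkan backend.

	// Sketch: transition a single color subresource for a readback, then restore its layout.
	void readbackTransitionExample(VulkanTransferBuffer* cb, VkImage image)
	{
		VkImageSubresourceRange range;
		range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		range.baseMipLevel = 0;
		range.levelCount = 1;
		range.baseArrayLayer = 0;
		range.layerCount = 1;

		// Move the subresource from its shader-read layout into one valid for transfer reads.
		cb->setLayout(image, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_TRANSFER_READ_BIT,
			VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, range);

		// ... record vkCmdCopyImageToBuffer here ...

		// Restore the original layout once the copy has been recorded.
		cb->setLayout(image, VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT,
			VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, range);
	}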

+ 7 - 0
Source/BansheeVulkanRenderAPI/Include/BsVulkanHardwareBuffer.h

@@ -42,6 +42,13 @@ namespace bs
 		void copy(VulkanTransferBuffer* cb, VulkanBuffer* destination, VkDeviceSize srcOffset, VkDeviceSize dstOffset, 
 			VkDeviceSize length);
 
+		/** 
+		 * Queues a command on the provided command buffer. The command copies the contents of the current buffer to
+		 * the destination image subresource. 
+		 */
+		void copy(VulkanTransferBuffer* cb, VulkanImage* destination, const VkExtent3D& extent, 
+			const VkImageSubresourceLayers& range, VkImageLayout layout);
+
 	private:
 		VkBuffer mBuffer;
 		VkBufferView mView;

+ 25 - 1
Source/BansheeVulkanRenderAPI/Include/BsVulkanTexture.h

@@ -35,6 +35,13 @@ namespace bs
 		/** Returns an image view that covers the specified faces and mip maps of the texture. */
 		VkImageView getView(const TextureSurface& surface) const;
 
+		/** 
+		 * Queues a command on the provided command buffer. The command copies the contents of the current image
+		 * subresource to the destination buffer. 
+		 */
+		void copy(VulkanTransferBuffer* cb, VulkanBuffer* destination, const VkExtent3D& extent, 
+			const VkImageSubresourceLayers& range, VkImageLayout layout);
+
 	private:
 		/** Creates a new view of the provided part (or entirety) of surface. */
 		VkImageView createView(const TextureSurface& surface) const;
@@ -108,12 +115,29 @@ namespace bs
 
 	private:
 		/** Creates a new image for the specified device, matching the current properties. */
-		VulkanImage* createImage(VulkanDevice& device, bool staging, bool readable);
+		VulkanImage* createImage(VulkanDevice& device);
+
+		/** 
+		 * Creates a new buffer for the specified device, with enough space to hold the provided mip level of this
+		 * texture. 
+		 */
+		VulkanBuffer* createStaging(VulkanDevice& device, UINT32 mipLevel, bool needsRead);
 
 		VulkanImage* mImages[BS_MAX_DEVICES];
 		GpuDeviceFlags mDeviceMask;
+		VkAccessFlags mAccessFlags;
+
+		VulkanBuffer* mStagingBuffer;
+		UINT32 mMappedDeviceIdx;
+		UINT32 mMappedGlobalQueueIdx;
+		UINT32 mMappedMip;
+		UINT32 mMappedFace;
+		GpuLockOptions mMappedLockOptions;
 
 		VkImageCreateInfo mImageCI;
+		bool mDirectlyMappable : 1;
+		bool mSupportsGPUWrites : 1;
+		bool mIsMapped : 1;
 	};
 
 	/** @} */

+ 23 - 0
Source/BansheeVulkanRenderAPI/Source/BsVulkanCommandBufferManager.cpp

@@ -64,6 +64,29 @@ namespace bs
 							 0, nullptr);
 	}
 
+	void VulkanTransferBuffer::setLayout(VkImage image, VkAccessFlags srcAccessFlags, VkAccessFlags dstAccessFlags, 
+		VkImageLayout oldLayout, VkImageLayout newLayout, const VkImageSubresourceRange& range)
+	{
+		VkImageMemoryBarrier barrier;
+		barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+		barrier.pNext = nullptr;
+		barrier.srcAccessMask = srcAccessFlags;
+		barrier.dstAccessMask = dstAccessFlags;
+		barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+		barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+		barrier.oldLayout = oldLayout;
+		barrier.newLayout = newLayout;
+		barrier.image = image;
+		barrier.subresourceRange = range;
+
+		vkCmdPipelineBarrier(mCB->getHandle(),
+							 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+							 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+							 0, 0, nullptr,
+							 0, nullptr,
+							 1, &barrier);
+	}
+
 	void VulkanTransferBuffer::flush(bool wait)
 	{
 		UINT32 syncMask = mSyncMask & ~mQueueMask; // Don't sync with itself

+ 9 - 1
Source/BansheeVulkanRenderAPI/Source/BsVulkanGpuParams.cpp

@@ -406,7 +406,15 @@ namespace bs
 			range.baseMipLevel = 0;
 			range.levelCount = props.getNumMipmaps();
 
-			buffer.registerResource(resource, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+			VkImageLayout layout;
+
+			// Keep dynamic textures in the general layout, so they can be easily mapped by the CPU
+			if (props.getUsage() & TU_DYNAMIC)
+				layout = VK_IMAGE_LAYOUT_GENERAL;
+			else
+				layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+			buffer.registerResource(resource, VK_ACCESS_SHADER_READ_BIT, layout,
 				range, VulkanUseFlag::Read);
 		}
 

+ 103 - 62
Source/BansheeVulkanRenderAPI/Source/BsVulkanHardwareBuffer.cpp

@@ -6,6 +6,7 @@
 #include "BsVulkanUtility.h"
 #include "BsVulkanCommandBufferManager.h"
 #include "BsVulkanCommandBuffer.h"
+#include "BsVulkanTexture.h"
 
 namespace bs
 {
@@ -55,6 +56,22 @@ namespace bs
 		vkCmdCopyBuffer(cb->getCB()->getHandle(), mBuffer, destination->getHandle(), 1, &region);
 	}
 
+	void VulkanBuffer::copy(VulkanTransferBuffer* cb, VulkanImage* destination, const VkExtent3D& extent, 
+		const VkImageSubresourceLayers& range, VkImageLayout layout)
+	{
+		VkBufferImageCopy region;
+		region.bufferRowLength = 0;
+		region.bufferImageHeight = 0;
+		region.bufferOffset = 0;
+		region.imageOffset.x = 0;
+		region.imageOffset.y = 0;
+		region.imageOffset.z = 0;
+		region.imageExtent = extent;
+		region.imageSubresource = range;
+
+		vkCmdCopyBufferToImage(cb->getCB()->getHandle(), mBuffer, destination->getHandle(), layout, 1, &region);
+	}
+
 	VulkanHardwareBuffer::VulkanHardwareBuffer(BufferType type, GpuBufferFormat format, GpuBufferUsage usage, 
 		UINT32 size, GpuDeviceFlags deviceMask)
 		: HardwareBuffer(size), mBuffers(), mStagingBuffer(nullptr), mMappedDeviceIdx(-1), mMappedGlobalQueueIdx(-1)
@@ -214,88 +231,112 @@ namespace bs
 		// If memory is host visible try mapping it directly
 		if(mDirectlyMappable)
 		{
-			// If GPU has the ability to write to the buffer we must issue a pipeline barrier to prevent any memory hazards
-			//  - Additionally it might be possible the GPU is /currently/ writing to the buffer, in which case we need to
-			//    wait for those writes to finish before continuing
-			if(mSupportsGPUWrites) // Note: It might be tempting to only do this step only if buffer is currently being 
-								   // written to, but that doesn't guarantee memory visibility if it was written to recently
+			// Check if the GPU is currently reading from or writing to the buffer
+			UINT32 useMask = buffer->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);
+
+			// Note: Even if the GPU isn't currently using the buffer, if the buffer supports GPU writes we still treat
+			// it as in use, because a write could have completed but not yet be visible, so we need to issue a
+			// pipeline barrier below.
+			bool isUsedOnGPU = useMask != 0 || mSupportsGPUWrites;
+
+			// We're safe to map directly since GPU isn't using the buffer
+			if (!isUsedOnGPU)
+				return buffer->map(offset, length);
+
+			// Caller guarantees he won't touch the same data as the GPU, so just map even though the GPU is using the buffer
+			if (options == GBL_WRITE_ONLY_NO_OVERWRITE)
+				return buffer->map(offset, length);
+
+			// Caller doesn't care about buffer contents, so just discard the existing buffer and create a new one
+			if (options == GBL_WRITE_ONLY_DISCARD)
 			{
-				// First try to avoid the expensive wait operation and barrier
-				if(options == GBL_WRITE_ONLY_NO_OVERWRITE) // Caller guarantees he won't touch the same data as the GPU, so just map
-					return buffer->map(offset, length);
+				buffer->destroy();
 
-				if(options == GBL_WRITE_ONLY_DISCARD) // Caller doesn't care about buffer contents, so just discard the 
-				{									  // existing buffer and create a new one
-					buffer->destroy();
+				buffer = createBuffer(device, false, mReadable);
+				mBuffers[deviceIdx] = buffer;
 
-					buffer = createBuffer(device, false, mReadable);
-					mBuffers[deviceIdx] = buffer;
+				return buffer->map(offset, length);
+			}
 
-					return buffer->map(offset, length);
-				}
+			// No GPU writes are supported and we're only reading, so no need to wait on anything
+			if (options == GBL_READ_ONLY && !mSupportsGPUWrites)
+				return buffer->map(offset, length);
 
-				// Otherwise we need to wait until (potential) GPU write completes, and issue a barrier so:
-				//  - If reading: the device makes the written memory available for read (read-after-write hazard)
-				//  - If writing: ensures our writes properly overlap with GPU writes (write-after-write hazard)
+			// We need to read the buffer contents with GPU writes potentially enabled
+			if(options == GBL_READ_ONLY || options == GBL_READ_WRITE)
+			{
+				// We need to wait until (potential) GPU read/write completes
 				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
 
-				// Ensure flush() will wait for all queues currently writing to the buffer (if any) to finish
-				UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
-				transferCB->appendMask(writeUseMask); 
+				// Ensure flush() will wait for all queues currently using the buffer (if any) to finish
+				// If only reading, wait for all writes to complete, otherwise wait on both writes and reads
+				if (options == GBL_READ_ONLY)
+					useMask = buffer->getUseInfo(VulkanUseFlag::Write);
+				else
+					useMask = buffer->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);
+
+				transferCB->appendMask(useMask);
 
-				// Issue barrier to avoid memory hazards
-				transferCB->memoryBarrier(buffer->getHandle(),
-										  VK_ACCESS_SHADER_WRITE_BIT,
-										  accessFlags,
-										  // Last stages that could have written to the buffer:
-										  VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 
-										  VK_PIPELINE_STAGE_HOST_BIT
-				);
+				// Make any writes visible before mapping
+				if (mSupportsGPUWrites)
+				{
+					// Issue a barrier so:
+					//  - If reading: the device makes the written memory available for read (read-after-write hazard)
+					//  - If writing: ensures our writes properly overlap with GPU writes (write-after-write hazard)
+					transferCB->memoryBarrier(buffer->getHandle(),
+											  VK_ACCESS_SHADER_WRITE_BIT,
+											  accessFlags,
+											  // Last stages that could have written to the buffer:
+											  VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+											  VK_PIPELINE_STAGE_HOST_BIT
+					);
+				}
 
 				// Submit the command buffer and wait until it finishes
 				transferCB->flush(true);
-				assert(!buffer->isUsed());
+
+				return buffer->map(offset, length);
 			}
 
-			return buffer->map(offset, length);
+			// Otherwise, we're doing write only, in which case it's best to use the staging buffer to avoid waiting
+			// and blocking, so fall through
 		}
-		else // Otherwise we use a staging buffer
-		{
-			bool needRead = options == GBL_READ_WRITE || options == GBL_READ_ONLY;
+		
+		// Use a staging buffer
+		bool needRead = options == GBL_READ_WRITE || options == GBL_READ_ONLY;
 
-			// Allocate a staging buffer
-			mStagingBuffer = createBuffer(device, true, needRead);
+		// Allocate a staging buffer
+		mStagingBuffer = createBuffer(device, true, needRead);
 
-			if (needRead) // If reading, we need to copy the current contents of the buffer to the staging buffer
-			{
-				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
+		if (needRead) // If reading, we need to copy the current contents of the buffer to the staging buffer
+		{
+			VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
 				
-				// Similar to above, if buffer supports GPU writes, we need to wait on any potential writes to complete
-				if(mSupportsGPUWrites)
-				{
-					// Ensure flush() will wait for all queues currently writing to the buffer (if any) to finish
-					UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
-					transferCB->appendMask(writeUseMask);
-				}
-
-				// Queue copy command
-				buffer->copy(transferCB, mStagingBuffer, offset, offset, length);
+			// Similar to above, if buffer supports GPU writes, we need to wait on any potential writes to complete
+			if(mSupportsGPUWrites)
+			{
+				// Ensure flush() will wait for all queues currently writing to the buffer (if any) to finish
+				UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
+				transferCB->appendMask(writeUseMask);
+			}
 
-				// Ensure data written to the staging buffer is visible
-				transferCB->memoryBarrier(buffer->getHandle(),
-										  VK_ACCESS_TRANSFER_WRITE_BIT,
-										  accessFlags,
-										  VK_PIPELINE_STAGE_TRANSFER_BIT,
-										  VK_PIPELINE_STAGE_HOST_BIT
-				);
+			// Queue copy command
+			buffer->copy(transferCB, mStagingBuffer, offset, offset, length);
 
-				// Submit the command buffer and wait until it finishes
-				transferCB->flush(true);
-				assert(!buffer->isUsed());
-			}
+			// Ensure data written to the staging buffer is visible
+			transferCB->memoryBarrier(mStagingBuffer->getHandle(),
+										VK_ACCESS_TRANSFER_WRITE_BIT,
+										accessFlags,
+										VK_PIPELINE_STAGE_TRANSFER_BIT,
+										VK_PIPELINE_STAGE_HOST_BIT
+			);
 
-			return mStagingBuffer->map(offset, length);
+			// Submit the command buffer and wait until it finishes
+			transferCB->flush(true);
+			assert(!buffer->isUsed());
 		}
+
+		return mStagingBuffer->map(offset, length);
 	}
 
 	void VulkanHardwareBuffer::unmap()
@@ -307,7 +348,7 @@ namespace bs
 		// Note: If we did any writes they need to be made visible to the GPU. However there is no need to execute 
 		// a pipeline barrier because (as per spec) host writes are implicitly visible to the device.
 
-		if(mDirectlyMappable)
+		if(mStagingBuffer == nullptr) // Means we directly mapped the buffer
 			mBuffers[mMappedDeviceIdx]->unmap();
 		else
 		{
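
Note (illustrative, not part of this commit): the no-overwrite and discard branches above exist so that typical dynamic-buffer updates never stall on in-flight GPU work. A hedged sketch of the caller-side pattern this enables; the exact HardwareBuffer::lock() signature is assumed here rather than taken from this diff.

	// Hypothetical caller-side update of a dynamic buffer; names and lock() signature are assumed.
	void appendToDynamicBuffer(HardwareBuffer* buffer, const UINT8* src, UINT32 offset, UINT32 size)
	{
		// NO_OVERWRITE promises the GPU-visible region isn't touched, so map() above can return
		// immediately, with no barriers or waits.
		void* dst = buffer->lock(offset, size, GBL_WRITE_ONLY_NO_OVERWRITE);
		memcpy(dst, src, size);
		buffer->unlock();
	}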

+ 393 - 32
Source/BansheeVulkanRenderAPI/Source/BsVulkanTexture.cpp

@@ -4,6 +4,8 @@
 #include "BsVulkanRenderAPI.h"
 #include "BsVulkanDevice.h"
 #include "BsVulkanUtility.h"
+#include "BsVulkanCommandBufferManager.h"
+#include "BsVulkanHardwareBuffer.h"
 #include "BsCoreThread.h"
 #include "BsRenderStats.h"
 #include "BsMath.h"
@@ -123,9 +125,27 @@ namespace bs
 		return view;
 	}
 
+	void VulkanImage::copy(VulkanTransferBuffer* cb, VulkanBuffer* destination, const VkExtent3D& extent,
+							const VkImageSubresourceLayers& range, VkImageLayout layout)
+	{
+		VkBufferImageCopy region;
+		region.bufferRowLength = 0;
+		region.bufferImageHeight = 0;
+		region.bufferOffset = 0;
+		region.imageOffset.x = 0;
+		region.imageOffset.y = 0;
+		region.imageOffset.z = 0;
+		region.imageExtent = extent;
+		region.imageSubresource = range;
+
+		vkCmdCopyImageToBuffer(cb->getCB()->getHandle(), mImage, layout, destination->getHandle(), 1, &region);
+	}
+
 	VulkanTextureCore::VulkanTextureCore(const TEXTURE_DESC& desc, const SPtr<PixelData>& initialData,
 		GpuDeviceFlags deviceMask)
-		: TextureCore(desc, initialData, deviceMask), mImages(), mDeviceMask(deviceMask)
+		: TextureCore(desc, initialData, deviceMask), mImages(), mDeviceMask(deviceMask), mAccessFlags(0)
+		, mStagingBuffer(nullptr), mMappedDeviceIdx(-1), mMappedGlobalQueueIdx(-1), mMappedMip(0), mMappedFace(0)
+		, mMappedLockOptions(GBL_WRITE_ONLY), mDirectlyMappable(false), mSupportsGPUWrites(false), mIsMapped(false)
 	{
 		
 	}
@@ -140,6 +160,8 @@ namespace bs
 			mImages[i]->destroy();
 		}
 
+		assert(mStagingBuffer == nullptr);
+
 		BS_INC_RENDER_STAT_CAT(ResDestroyed, RenderStatObject_Texture);
 	}
 
@@ -153,6 +175,8 @@ namespace bs
 		mImageCI.pNext = nullptr;
 		mImageCI.flags = 0;
 
+		// Note: If usage is dynamic, consider creating the image with VK_IMAGE_TILING_LINEAR (if supported by the device)
+
 		TextureType texType = props.getTextureType();
 		switch(texType)
 		{
@@ -172,31 +196,60 @@ namespace bs
 		}
 
 		int usage = props.getUsage();
-		if((usage & TU_RENDERTARGET) != 0)
+		if ((usage & TU_RENDERTARGET) != 0)
+		{
 			mImageCI.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
-		else if((usage & TU_DEPTHSTENCIL) != 0)
+			mSupportsGPUWrites = true;
+			mAccessFlags = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+		}
+		else if ((usage & TU_DEPTHSTENCIL) != 0)
+		{
 			mImageCI.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
-		else if((usage & TU_LOADSTORE) != 0)
+			mSupportsGPUWrites = true;
+			mAccessFlags = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+		}
+		else if ((usage & TU_LOADSTORE) != 0)
+		{
 			mImageCI.usage = VK_IMAGE_USAGE_STORAGE_BIT;
+			mSupportsGPUWrites = true;
+			mAccessFlags = VK_ACCESS_SHADER_WRITE_BIT;
+		}
 		else
+		{
 			mImageCI.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
+			mAccessFlags = VK_ACCESS_SHADER_READ_BIT;
+		}
 
 		if ((usage & TU_CPUREADABLE) != 0)
 			mImageCI.usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
 
+		VkImageTiling tiling = VK_IMAGE_TILING_OPTIMAL;
+		VkImageLayout layout = VK_IMAGE_LAYOUT_UNDEFINED;
+		if ((usage & TU_DYNAMIC) != 0) // Attempt to use linear tiling for dynamic textures, so we can directly map and modify them
+		{
+			// Only support 2D textures, with one sample and one mip level, only used for shader reads
+			// (Optionally check vkGetPhysicalDeviceFormatProperties & vkGetPhysicalDeviceImageFormatProperties for
+			// additional supported configs, but right now there doesn't seem to be any additional support)
+			if(texType == TEX_TYPE_2D && props.getNumSamples() <= 1 && props.getNumMipmaps() == 0 && 
+				props.getNumFaces() == 1 && (mImageCI.usage & VK_IMAGE_USAGE_SAMPLED_BIT) != 0)
+			{
+				mDirectlyMappable = true;
+				tiling = VK_IMAGE_TILING_LINEAR;
+				layout = VK_IMAGE_LAYOUT_PREINITIALIZED;
+			}
+		}
+
 		mImageCI.format = VulkanUtility::getPixelFormat(props.getFormat());
 		mImageCI.extent = { props.getWidth(), props.getHeight(), props.getDepth() };
 		mImageCI.mipLevels = props.getNumMipmaps() + 1;
 		mImageCI.arrayLayers = props.getNumFaces();
 		mImageCI.samples = VulkanUtility::getSampleFlags(props.getNumSamples());
-		mImageCI.tiling = VK_IMAGE_TILING_OPTIMAL;
-		mImageCI.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+		mImageCI.tiling = tiling;
+		mImageCI.initialLayout = layout;
 		mImageCI.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
 		mImageCI.queueFamilyIndexCount = 0;
 		mImageCI.pQueueFamilyIndices = nullptr;
 
-		bool isReadable = (usage & GBU_READABLE) != 0 || BS_EDITOR_BUILD;
-
 		VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPICore::instance());
 		VulkanDevice* devices[BS_MAX_DEVICES];
 		VulkanUtility::getDevices(rapi, mDeviceMask, devices);
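
Note (illustrative, not part of this commit): the comment above mentions optionally consulting vkGetPhysicalDeviceFormatProperties before enabling linear tiling. A minimal sketch of that check; the physical device handle is assumed to be reachable from the VulkanDevice wrapper.

	// Returns true if the format can be sampled from a linearly tiled image, which is the
	// precondition for the directly-mappable path above.
	bool supportsLinearSampledImage(VkPhysicalDevice physicalDevice, VkFormat format)
	{
		VkFormatProperties formatProps;
		vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &formatProps);

		return (formatProps.linearTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) != 0;
	}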
@@ -207,29 +260,17 @@ namespace bs
 			if (devices[i] == nullptr)
 				continue;
 
-			mImages[i] = createImage(*devices[i], false, isReadable);
+			mImages[i] = createImage(*devices[i]);
 		}
 
 		BS_INC_RENDER_STAT_CAT(ResCreated, RenderStatObject_Texture);
 		TextureCore::initialize();
 	}
 
-	VulkanImage* VulkanTextureCore::createImage(VulkanDevice& device, bool staging, bool readable)
+	VulkanImage* VulkanTextureCore::createImage(VulkanDevice& device)
 	{
-		VkBufferUsageFlags usage = mImageCI.usage;
-		if (staging)
-		{
-			mImageCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
-
-			// Staging buffers are used as a destination for reads
-			if (readable)
-				mImageCI.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
-		}
-		else if (readable) // Non-staging readable
-			mImageCI.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
-
-		bool directlyMappable = (getProperties().getUsage() & TU_DYNAMIC) != 0;
-		VkMemoryPropertyFlags flags = (directlyMappable || staging) ?
+		bool directlyMappable = mImageCI.tiling == VK_IMAGE_TILING_LINEAR;
+		VkMemoryPropertyFlags flags = directlyMappable ?
 			(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) : // Note: Try using cached memory
 			VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
 
@@ -246,8 +287,48 @@ namespace bs
 		result = vkBindImageMemory(vkDevice, image, memory, 0);
 		assert(result == VK_SUCCESS);
 
-		mImageCI.usage = usage; // Restore original usage
-		return device.getResourceManager().create<VulkanImage>(image, memory, VK_IMAGE_LAYOUT_UNDEFINED, getProperties());
+		return device.getResourceManager().create<VulkanImage>(image, memory, mImageCI.initialLayout, getProperties());
+	}
+
+	VulkanBuffer* VulkanTextureCore::createStaging(VulkanDevice& device, UINT32 mipLevel, bool readable)
+	{
+		const TextureProperties& props = getProperties();
+
+		UINT32 mipWidth, mipHeight, mipDepth;
+		PixelUtil::getSizeForMipLevel(props.getWidth(), props.getHeight(), props.getDepth(), mipLevel, mipWidth, mipHeight, 
+			mipDepth);
+
+		UINT32 mipLevelSize = PixelUtil::getMemorySize(mipWidth, mipHeight, mipDepth, props.getFormat());
+
+		VkBufferCreateInfo bufferCI;
+		bufferCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+		bufferCI.pNext = nullptr;
+		bufferCI.flags = 0;
+		bufferCI.size = mipLevelSize;
+		bufferCI.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+		bufferCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+		bufferCI.queueFamilyIndexCount = 0;
+		bufferCI.pQueueFamilyIndices = nullptr;
+
+		if (readable)
+			bufferCI.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+
+		VkDevice vkDevice = device.getLogical();
+
+		VkBuffer buffer;
+		VkResult result = vkCreateBuffer(vkDevice, &bufferCI, gVulkanAllocator, &buffer);
+		assert(result == VK_SUCCESS);
+
+		VkMemoryRequirements memReqs;
+		vkGetBufferMemoryRequirements(vkDevice, buffer, &memReqs);
+
+		VkMemoryPropertyFlags flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+
+		VkDeviceMemory memory = device.allocateMemory(memReqs, flags);
+		result = vkBindBufferMemory(vkDevice, buffer, memory, 0);
+		assert(result == VK_SUCCESS);
+
+		return device.getResourceManager().create<VulkanBuffer>(buffer, VK_NULL_HANDLE, memory);
 	}
 
 	VkImageView VulkanTextureCore::getView(UINT32 deviceIdx) const
@@ -275,16 +356,296 @@ namespace bs
 	PixelData VulkanTextureCore::lockImpl(GpuLockOptions options, UINT32 mipLevel, UINT32 face, UINT32 deviceIdx,
 										  UINT32 queueIdx)
 	{
-		PixelData lockedArea(1, 1, 1, mProperties.getFormat());
-	
-		// TODO - Increment read/write stat (do this for other buffers as well)
+		if (mProperties.getNumSamples() > 1)
+		{
+			LOGERR("Multisampled textures cannot be accessed from the CPU directly.");
+			return PixelData();
+		}
+
+#if BS_PROFILING_ENABLED
+		if (options == GBL_READ_ONLY || options == GBL_READ_WRITE)
+		{
+			BS_INC_RENDER_STAT_CAT(ResRead, RenderStatObject_Texture);
+		}
+
+		if (options == GBL_READ_WRITE || options == GBL_WRITE_ONLY || options == GBL_WRITE_ONLY_DISCARD || options == GBL_WRITE_ONLY_NO_OVERWRITE)
+		{
+			BS_INC_RENDER_STAT_CAT(ResWrite, RenderStatObject_Texture);
+		}
+#endif
 
-		return lockedArea;
+		UINT32 mipWidth = std::max(1u, mProperties.getWidth() >> mipLevel);
+		UINT32 mipHeight = std::max(1u, mProperties.getHeight() >> mipLevel);
+		UINT32 mipDepth = std::max(1u, mProperties.getDepth() >> mipLevel);
+
+		PixelData lockedArea(mipWidth, mipHeight, mipDepth, mProperties.getFormat());
+
+		VulkanImage* image = mImages[deviceIdx];
+
+		if (image == nullptr)
+			return PixelData();
+
+		mIsMapped = true;
+		mMappedDeviceIdx = deviceIdx;
+		mMappedGlobalQueueIdx = queueIdx;
+		mMappedFace = face;
+		mMappedMip = mipLevel;
+		mMappedLockOptions = options;
+
+		VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPICore::instance());
+		VulkanDevice& device = *rapi._getDevice(deviceIdx);
+
+		VulkanCommandBufferManager& cbManager = gVulkanCBManager();
+		GpuQueueType queueType;
+		UINT32 localQueueIdx = CommandSyncMask::getQueueIdxAndType(queueIdx, queueType);
+
+		VulkanImage* subresource = image->getSubresource(face, mipLevel);
+
+		// If memory is host visible try mapping it directly
+		if (mDirectlyMappable)
+		{
+			// Initially the texture will be in preinitialized layout, and it will transition to general layout on first
+			// use in shader. No further transitions are allowed for directly mappable textures.
+			assert(image->getLayout() == VK_IMAGE_LAYOUT_PREINITIALIZED || image->getLayout() == VK_IMAGE_LAYOUT_GENERAL);
+
+			// GPU should never be allowed to write to a directly mappable texture, since only linear tiling is supported
+			// for direct mapping, and we don't support using it with either storage textures or render targets.
+			assert(!mSupportsGPUWrites);
+
+			// Check if the GPU is currently reading from the image
+			UINT32 useMask = subresource->getUseInfo(VulkanUseFlag::Read);
+			bool isUsedOnGPU = useMask != 0;
+
+			// We're safe to map directly since GPU isn't using the subresource
+			if (!isUsedOnGPU)
+				return image->map(face, mipLevel);
+
+			// Caller guarantees he won't touch the same data as the GPU, so just map even though the GPU is using the
+			// subresource
+			if (options == GBL_WRITE_ONLY_NO_OVERWRITE)
+				return image->map(face, mipLevel);
+
+			// No GPU writes are supported and we're only reading, so no need to wait on anything
+			if (options == GBL_READ_ONLY)
+				return image->map(face, mipLevel);
+
+			// Caller doesn't care about buffer contents, so just discard the existing buffer and create a new one
+			if (options == GBL_WRITE_ONLY_DISCARD)
+			{
+				// TODO - Since I'm only writing to a single subresource, how will discard work? Discard every subresource?
+
+				//buffer->destroy();
+
+				//buffer = createBuffer(device, false, mReadable);
+				//mBuffers[deviceIdx] = buffer;
+
+				//return buffer->map(offset, length);
+			}
+
+			// We need to both read and write, meaning we need to wait until existing reads complete before we return
+			if (options == GBL_READ_WRITE)
+			{
+				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
+
+				// Ensure flush() will wait for all queues currently reading from the texture to finish
+				transferCB->appendMask(useMask);
+
+				// Submit the command buffer and wait until it finishes
+				transferCB->flush(true);
+
+				return image->map(face, mipLevel);
+			}
+
+			// Otherwise, we're doing write only, in which case it's best to use the staging buffer to avoid waiting
+			// and blocking, so fall through
+		}
+
+		bool needRead = options == GBL_READ_WRITE || options == GBL_READ_ONLY;
+
+		// Allocate a staging buffer
+		mStagingBuffer = createStaging(device, mipLevel, needRead);
+
+		if (needRead) // If reading, we need to copy the current contents of the image to the staging buffer
+		{
+			VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);
+
+			// Similar to above, if image supports GPU writes, we need to wait on any potential writes to complete
+			if (mSupportsGPUWrites)
+			{
+				// Ensure flush() will wait for all queues currently writing to the image (if any) to finish
+				UINT32 writeUseMask = subresource->getUseInfo(VulkanUseFlag::Write);
+				transferCB->appendMask(writeUseMask);
+			}
+
+			const TextureProperties& props = getProperties();
+
+			VkImageSubresourceRange range;
+			if ((props.getUsage() & TU_DEPTHSTENCIL) != 0)
+				range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+			else
+				range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+
+			range.baseArrayLayer = face;
+			range.layerCount = 1;
+			range.baseMipLevel = mipLevel;
+			range.levelCount = 1;
+
+			VkImageSubresourceLayers rangeLayers;
+			if ((props.getUsage() & TU_DEPTHSTENCIL) != 0)
+				rangeLayers.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
+			else
+				rangeLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+
+			rangeLayers.baseArrayLayer = range.baseArrayLayer;
+			rangeLayers.layerCount = range.layerCount;
+			rangeLayers.mipLevel = range.baseMipLevel;
+
+			VkExtent3D extent;
+			PixelUtil::getSizeForMipLevel(props.getWidth(), props.getHeight(), props.getDepth(), mMappedMip,
+										  extent.width, extent.height, extent.depth);
+
+			// Transfer texture to a valid layout
+			transferCB->setLayout(image->getHandle(), mAccessFlags, VK_ACCESS_TRANSFER_READ_BIT, image->getLayout(),
+								  VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, range);
+
+			// Queue copy command
+			image->copy(transferCB, mStagingBuffer, extent, rangeLayers, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+
+			// Transfer back to original layout
+			transferCB->setLayout(image->getHandle(), VK_ACCESS_TRANSFER_READ_BIT, mAccessFlags,
+								  VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, image->getLayout(), range);
+
+			// Ensure data written to the staging buffer is visible
+			VkAccessFlags stagingAccessFlags;
+			if (options == GBL_READ_ONLY)
+				stagingAccessFlags = VK_ACCESS_HOST_READ_BIT;
+			else // Must be read/write
+				stagingAccessFlags = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;
+
+			transferCB->memoryBarrier(mStagingBuffer->getHandle(),
+									  VK_ACCESS_TRANSFER_WRITE_BIT,
+									  stagingAccessFlags,
+									  VK_PIPELINE_STAGE_TRANSFER_BIT,
+									  VK_PIPELINE_STAGE_HOST_BIT);
+
+			// Submit the command buffer and wait until it finishes
+			transferCB->flush(true);
+		}
+
+		UINT32 mipSize = PixelUtil::getMemorySize(mipWidth, mipHeight, mipDepth, mProperties.getFormat());
+		lockedArea.setExternalBuffer((UINT8*)mStagingBuffer->map(0, mipSize));
+
+		return lockedArea;
 	}
 
 	void VulkanTextureCore::unlockImpl()
 	{
-		
+		// Possibly map() failed with some error
+		if (!mIsMapped)
+			return;
+
+		// Note: If we did any writes they need to be made visible to the GPU. However there is no need to execute 
+		// a pipeline barrier because (as per spec) host writes are implicitly visible to the device.
+
+		if (mStagingBuffer == nullptr)
+			mImages[mMappedDeviceIdx]->unmap();
+		else
+		{
+			bool isWrite = mMappedLockOptions != GBL_READ_ONLY;
+
+			// If the caller wrote anything to the staging buffer, we need to upload it back to the main image
+			if (isWrite)
+			{
+				VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPICore::instance());
+				VulkanDevice& device = *rapi._getDevice(mMappedDeviceIdx);
+
+				VulkanCommandBufferManager& cbManager = gVulkanCBManager();
+				GpuQueueType queueType;
+				UINT32 localQueueIdx = CommandSyncMask::getQueueIdxAndType(mMappedGlobalQueueIdx, queueType);
+
+				VulkanImage* image = mImages[mMappedDeviceIdx];
+				VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(mMappedDeviceIdx, queueType, localQueueIdx);
+
+				VulkanImage* subresource = image->getSubresource(mMappedFace, mMappedMip);
+
+				// If the subresource is used in any way on the GPU, we need to wait for that use to finish before
+				// we issue our copy
+				UINT32 useMask = subresource->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);
+				if (useMask != 0) // Subresource is currently used on the GPU
+				{
+					// Try to avoid the wait
+
+					// Caller guarantees he won't touch the same data as the GPU, so just copy
+					if (mMappedLockOptions == GBL_WRITE_ONLY_NO_OVERWRITE) 
+					{
+						// Fall through to copy()
+					}
+					// Caller doesn't care about buffer contents, so just discard the existing buffer and create a new one
+					else if (mMappedLockOptions == GBL_WRITE_ONLY_DISCARD) 
+					{
+						// TODO - Handle discard
+
+						//buffer->destroy();
+
+						//buffer = createBuffer(device, false, mReadable);
+						//mBuffers[mMappedDeviceIdx] = buffer;
+					}
+					else // Otherwise we have no choice but to issue a dependency between the queues
+						transferCB->appendMask(useMask);
+				}
+
+				const TextureProperties& props = getProperties();
+
+				VkImageSubresourceRange range;
+				if ((props.getUsage() & TU_DEPTHSTENCIL) != 0)
+					range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+				else
+					range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+
+				range.baseArrayLayer = mMappedFace;
+				range.layerCount = 1;
+				range.baseMipLevel = mMappedMip;
+				range.levelCount = 1;
+
+				VkImageSubresourceLayers rangeLayers;
+				rangeLayers.aspectMask = range.aspectMask;
+				rangeLayers.baseArrayLayer = range.baseArrayLayer;
+				rangeLayers.layerCount = range.layerCount;
+				rangeLayers.mipLevel = range.baseMipLevel;
+
+				VkExtent3D extent;
+				PixelUtil::getSizeForMipLevel(props.getWidth(), props.getHeight(), props.getDepth(), mMappedMip,
+											  extent.width, extent.height, extent.depth);
+
+				VkImageLayout transferLayout;
+				if (mDirectlyMappable)
+					transferLayout = VK_IMAGE_LAYOUT_GENERAL;
+				else
+					transferLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+
+				// Transfer texture to a valid layout
+				transferCB->setLayout(image->getHandle(), mAccessFlags, VK_ACCESS_TRANSFER_WRITE_BIT, image->getLayout(),
+									  transferLayout, range);
+
+				// Queue copy command
+				mStagingBuffer->copy(transferCB, image, extent, rangeLayers, transferLayout);
+
+				// Transfer back to original layout
+				transferCB->setLayout(image->getHandle(), VK_ACCESS_TRANSFER_WRITE_BIT, mAccessFlags,
+									  transferLayout, image->getLayout(), range);
+
+				// Notify the command buffer that these resources are being used on it
+				transferCB->getCB()->registerResource(mStagingBuffer, VK_ACCESS_TRANSFER_READ_BIT, VulkanUseFlag::Read);
+				transferCB->getCB()->registerResource(image, mAccessFlags, image->getLayout(), range, VulkanUseFlag::Write);
+
+				// We don't actually flush the transfer buffer here since it's an expensive operation, but it's instead
+				// done automatically before next "normal" command buffer submission.
+			}
+
+			mStagingBuffer->unmap();
+
+			mStagingBuffer->destroy();
+			mStagingBuffer = nullptr;
+		}
+
+		mIsMapped = false;
 	}
 
 	void VulkanTextureCore::readDataImpl(PixelData& dest, UINT32 mipLevel, UINT32 face, UINT32 deviceIdx, UINT32 queueIdx)
@@ -346,4 +707,4 @@ namespace bs
 
 		BS_INC_RENDER_STAT_CAT(ResWrite, RenderStatObject_Texture);
 	}
-}
+}
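
Note (illustrative, not part of this commit): the directly-mappable path above relies on VulkanImage::map(face, mipLevel), which is not shown in this diff. A sketch of how mapping a single subresource of a linearly tiled image is typically implemented with core Vulkan calls; callers must respect the returned row pitch when reading or writing pixel data.

	// Assumed sketch; in the engine the device, image and memory handles would live on VulkanImage.
	UINT8* mapLinearSubresource(VkDevice device, VkImage image, VkDeviceMemory memory, UINT32 face, UINT32 mipLevel)
	{
		VkImageSubresource subresource;
		subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		subresource.mipLevel = mipLevel;
		subresource.arrayLayer = face;

		// Query the driver-chosen offset, size and pitches of this subresource within the linear image.
		VkSubresourceLayout layout;
		vkGetImageSubresourceLayout(device, image, &subresource, &layout);

		void* data;
		VkResult result = vkMapMemory(device, memory, layout.offset, layout.size, 0, &data);
		assert(result == VK_SUCCESS);

		return static_cast<UINT8*>(data);
	}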