//********************************** Banshee Engine (www.banshee3d.com) **************************************************//
//**************** Copyright (c) 2016 Marko Pintera (marko.pintera@gmail.com). All rights reserved. **********************//
#include "BsVulkanHardwareBuffer.h"
#include "BsVulkanRenderAPI.h"
#include "BsVulkanDevice.h"
#include "BsVulkanUtility.h"
#include "Managers/BsVulkanCommandBufferManager.h"
#include "BsVulkanCommandBuffer.h"
#include "BsVulkanTexture.h"

namespace bs { namespace ct
{
    VulkanBuffer::VulkanBuffer(VulkanResourceManager* owner, VkBuffer buffer, VkBufferView view,
        VmaAllocation allocation, UINT32 rowPitch, UINT32 slicePitch)
        : VulkanResource(owner, false), mBuffer(buffer), mView(view), mAllocation(allocation), mRowPitch(rowPitch)
    {
        if (rowPitch != 0)
            mSliceHeight = slicePitch / rowPitch;
        else
            mSliceHeight = 0;
    }

    VulkanBuffer::~VulkanBuffer()
    {
        VulkanDevice& device = mOwner->getDevice();

        if (mView != VK_NULL_HANDLE)
            vkDestroyBufferView(device.getLogical(), mView, gVulkanAllocator);

        vkDestroyBuffer(device.getLogical(), mBuffer, gVulkanAllocator);
        device.freeMemory(mAllocation);
    }

    UINT8* VulkanBuffer::map(VkDeviceSize offset, VkDeviceSize length) const
    {
        VulkanDevice& device = mOwner->getDevice();

        VkDeviceMemory memory;
        VkDeviceSize memoryOffset;
        device.getAllocationInfo(mAllocation, memory, memoryOffset);

        UINT8* data;
        VkResult result = vkMapMemory(device.getLogical(), memory, memoryOffset + offset, length, 0, (void**)&data);
        assert(result == VK_SUCCESS);

        return data;
    }

    void VulkanBuffer::unmap()
    {
        VulkanDevice& device = mOwner->getDevice();

        VkDeviceMemory memory;
        VkDeviceSize memoryOffset;
        device.getAllocationInfo(mAllocation, memory, memoryOffset);

        vkUnmapMemory(device.getLogical(), memory);
    }

    void VulkanBuffer::copy(VulkanCmdBuffer* cb, VulkanBuffer* destination, VkDeviceSize srcOffset,
        VkDeviceSize dstOffset, VkDeviceSize length)
    {
        VkBufferCopy region;
        region.size = length;
        region.srcOffset = srcOffset;
        region.dstOffset = dstOffset;

        vkCmdCopyBuffer(cb->getHandle(), mBuffer, destination->getHandle(), 1, &region);
    }

    void VulkanBuffer::copy(VulkanCmdBuffer* cb, VulkanImage* destination, const VkExtent3D& extent,
        const VkImageSubresourceLayers& range, VkImageLayout layout)
    {
        VkBufferImageCopy region;
        region.bufferRowLength = mRowPitch;
        region.bufferImageHeight = mSliceHeight;
        region.bufferOffset = 0;
        region.imageOffset.x = 0;
        region.imageOffset.y = 0;
        region.imageOffset.z = 0;
        region.imageExtent = extent;
        region.imageSubresource = range;

        vkCmdCopyBufferToImage(cb->getHandle(), mBuffer, destination->getHandle(), layout, 1, &region);
    }

    void VulkanBuffer::update(VulkanCmdBuffer* cb, UINT8* data, VkDeviceSize offset, VkDeviceSize length)
    {
        vkCmdUpdateBuffer(cb->getHandle(), mBuffer, offset, length, (uint32_t*)data);
    }
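    // Note: Per the Vulkan spec, vkCmdUpdateBuffer requires both the offset and length to be multiples of four, and
    // the length to be no greater than 65536 bytes. VulkanHardwareBuffer::map() below only picks this update path for
    // ranges that satisfy those constraints; larger or unaligned writes go through a staging buffer instead.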
    VulkanHardwareBuffer::VulkanHardwareBuffer(BufferType type, GpuBufferFormat format, GpuBufferUsage usage,
        UINT32 size, GpuDeviceFlags deviceMask)
        : HardwareBuffer(size), mBuffers(), mStagingBuffer(nullptr), mStagingMemory(nullptr), mMappedDeviceIdx(-1)
        , mMappedGlobalQueueIdx(-1), mMappedOffset(0), mMappedSize(0), mMappedLockOptions(GBL_WRITE_ONLY)
        , mDirectlyMappable((usage & GBU_DYNAMIC) != 0), mSupportsGPUWrites(type == BT_STORAGE), mRequiresView(false)
        , mIsMapped(false)
    {
        VkBufferUsageFlags usageFlags = 0;
        switch(type)
        {
        case BT_VERTEX:
            usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
            break;
        case BT_INDEX:
            usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
            break;
        case BT_UNIFORM:
            usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
            break;
        case BT_GENERIC:
            usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
            mRequiresView = true;
            break;
        case BT_STORAGE:
            usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
                VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
            mRequiresView = true;
            break;
        case BT_STRUCTURED:
            usageFlags = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            break;
        }

        mBufferCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
        mBufferCI.pNext = nullptr;
        mBufferCI.flags = 0;
        mBufferCI.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
        mBufferCI.usage = usageFlags;
        mBufferCI.queueFamilyIndexCount = 0;
        mBufferCI.pQueueFamilyIndices = nullptr;

        mViewCI.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO;
        mViewCI.pNext = nullptr;
        mViewCI.flags = 0;
        mViewCI.format = VulkanUtility::getBufferFormat(format);
        mViewCI.offset = 0;
        mViewCI.range = VK_WHOLE_SIZE;

        VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPI::instance());
        VulkanDevice* devices[BS_MAX_DEVICES];
        VulkanUtility::getDevices(rapi, deviceMask, devices);

        // Allocate buffers per-device
        for (UINT32 i = 0; i < BS_MAX_DEVICES; i++)
        {
            if (devices[i] == nullptr)
                continue;

            mBuffers[i] = createBuffer(*devices[i], size, false, true);
        }
    }

    VulkanHardwareBuffer::~VulkanHardwareBuffer()
    {
        for (UINT32 i = 0; i < BS_MAX_DEVICES; i++)
        {
            if (mBuffers[i] == nullptr)
                continue;

            mBuffers[i]->destroy();
        }

        assert(mStagingBuffer == nullptr);
    }
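    // Creates the internal Vulkan buffer on the provided device. When "staging" is set the buffer is created as a
    // host-visible transfer source ("readable" additionally makes it a transfer destination, so the GPU can copy data
    // into it for readback); otherwise the usage flags chosen in the constructor apply, with TRANSFER_SRC added for
    // readable buffers. Staging and directly mappable buffers get host-visible, host-coherent memory; everything else
    // is device-local. A buffer view is only created for non-staging buffers of types that require one.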
    VulkanBuffer* VulkanHardwareBuffer::createBuffer(VulkanDevice& device, UINT32 size, bool staging, bool readable)
    {
        VkBufferUsageFlags usage = mBufferCI.usage;
        if (staging)
        {
            mBufferCI.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

            // Staging buffers are used as a destination for reads
            if (readable)
                mBufferCI.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
        }
        else if(readable) // Non-staging readable
            mBufferCI.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

        mBufferCI.size = size;

        VkMemoryPropertyFlags flags = (mDirectlyMappable || staging) ?
            (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) : // Note: Try using cached memory
            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;

        VkDevice vkDevice = device.getLogical();

        VkBuffer buffer;
        VkResult result = vkCreateBuffer(vkDevice, &mBufferCI, gVulkanAllocator, &buffer);
        assert(result == VK_SUCCESS);

        VmaAllocation allocation = device.allocateMemory(buffer, flags);

        VkBufferView view;
        if (mRequiresView && !staging)
        {
            mViewCI.buffer = buffer;

            result = vkCreateBufferView(vkDevice, &mViewCI, gVulkanAllocator, &view);
            assert(result == VK_SUCCESS);
        }
        else
            view = VK_NULL_HANDLE;

        mBufferCI.usage = usage; // Restore original usage
        return device.getResourceManager().create<VulkanBuffer>(buffer, view, allocation);
    }
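    // Typical usage from calling code (a minimal sketch; lock()/unlock() are the public HardwareBuffer entry points
    // that route into map()/unmap() below, and "vertexData" is a hypothetical source array — see readData() and
    // writeData() at the end of this file for in-tree examples):
    //
    //   VulkanHardwareBuffer vb(BT_VERTEX, BF_UNKNOWN, GBU_DYNAMIC, 1024, GDF_DEFAULT);
    //   void* dst = vb.lock(0, 1024, GBL_WRITE_ONLY_DISCARD, 0, 0);
    //   memcpy(dst, vertexData, 1024);
    //   vb.unlock();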
    void* VulkanHardwareBuffer::map(UINT32 offset, UINT32 length, GpuLockOptions options, UINT32 deviceIdx,
        UINT32 queueIdx)
    {
        if ((offset + length) > mSize)
        {
            LOGERR("Provided offset(" + toString(offset) + ") + length(" + toString(length) + ") "
                "is larger than the buffer " + toString(mSize) + ".");

            return nullptr;
        }

        if (length == 0)
            return nullptr;

        VulkanBuffer* buffer = mBuffers[deviceIdx];

        if (buffer == nullptr)
            return nullptr;

        mIsMapped = true;
        mMappedDeviceIdx = deviceIdx;
        mMappedGlobalQueueIdx = queueIdx;
        mMappedOffset = offset;
        mMappedSize = length;
        mMappedLockOptions = options;

        VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPI::instance());
        VulkanDevice& device = *rapi._getDevice(deviceIdx);

        VulkanCommandBufferManager& cbManager = gVulkanCBManager();
        GpuQueueType queueType;
        UINT32 localQueueIdx = CommandSyncMask::getQueueIdxAndType(queueIdx, queueType);

        VkAccessFlags accessFlags;
        if (options == GBL_READ_ONLY)
            accessFlags = VK_ACCESS_HOST_READ_BIT;
        else if (options == GBL_READ_WRITE)
            accessFlags = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;
        else
            accessFlags = VK_ACCESS_HOST_WRITE_BIT;

        // If memory is host visible try mapping it directly
        if(mDirectlyMappable)
        {
            // Check if the GPU is currently reading from or writing to the buffer
            UINT32 useMask = buffer->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);

            // Note: Even if the GPU isn't currently using the buffer, as long as the buffer supports GPU writes we
            // treat it as in use, because a previous write could have completed without yet being visible to the
            // host, in which case we need to issue the pipeline barrier below.
            bool isUsedOnGPU = useMask != 0 || mSupportsGPUWrites;

            // We're safe to map directly since GPU isn't using the buffer
            if (!isUsedOnGPU)
            {
                // If some CB has an operation queued that will be using the current contents of the buffer, create a
                // new buffer so we don't modify the previous use of the buffer
                if(buffer->isBound())
                {
                    VulkanBuffer* newBuffer = createBuffer(device, mSize, false, true);

                    // Copy contents of the current buffer to the new one, unless the caller explicitly states they
                    // don't care about the current contents
                    if (options != GBL_WRITE_ONLY_DISCARD)
                    {
                        UINT8* src = buffer->map(offset, length);
                        UINT8* dst = newBuffer->map(offset, length);

                        memcpy(dst, src, length);

                        buffer->unmap();
                        newBuffer->unmap();
                    }

                    buffer->destroy();
                    buffer = newBuffer;
                    mBuffers[deviceIdx] = buffer;
                }

                return buffer->map(offset, length);
            }

            // The caller guarantees they won't touch the same data the GPU is using, so just map even though the GPU
            // is using the buffer
            if (options == GBL_WRITE_ONLY_NO_OVERWRITE)
                return buffer->map(offset, length);

            // Caller doesn't care about buffer contents, so just discard the existing buffer and create a new one
            if (options == GBL_WRITE_ONLY_DISCARD)
            {
                buffer->destroy();

                buffer = createBuffer(device, mSize, false, true);
                mBuffers[deviceIdx] = buffer;

                return buffer->map(offset, length);
            }

            // We need to read the buffer contents
            if(options == GBL_READ_ONLY || options == GBL_READ_WRITE)
            {
                // We need to wait until (potential) read/write operations complete
                VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);

                // Ensure flush() will wait for all queues currently using the buffer (if any) to finish.
                // If only reading, wait for all writes to complete, otherwise wait on both writes and reads
                if (options == GBL_READ_ONLY)
                    useMask = buffer->getUseInfo(VulkanUseFlag::Write);
                else
                    useMask = buffer->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);

                transferCB->appendMask(useMask);

                // Make any writes visible before mapping
                if (mSupportsGPUWrites)
                {
                    // Issue a barrier so:
                    //  - If reading: the device makes the written memory available for read (read-after-write hazard)
                    //  - If writing: ensures our writes properly overlap with GPU writes (write-after-write hazard)
                    transferCB->memoryBarrier(buffer->getHandle(),
                        VK_ACCESS_SHADER_WRITE_BIT,
                        accessFlags,
                        // Last stages that could have written to the buffer:
                        VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                        VK_PIPELINE_STAGE_HOST_BIT
                    );
                }
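                // For reference, the barrier above is expected to be recorded roughly as follows (a sketch, assuming
                // VulkanTransferBuffer::memoryBarrier wraps vkCmdPipelineBarrier; the structure and call themselves
                // are core Vulkan API):
                //   VkBufferMemoryBarrier barrier;
                //   barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
                //   barrier.pNext = nullptr;
                //   barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
                //   barrier.dstAccessMask = accessFlags;
                //   barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
                //   barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
                //   barrier.buffer = buffer->getHandle();
                //   barrier.offset = 0;
                //   barrier.size = VK_WHOLE_SIZE;
                //   vkCmdPipelineBarrier(cb, srcStages, VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1, &barrier,
                //       0, nullptr);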
                // Submit the command buffer and wait until it finishes
                transferCB->flush(true);

                // If writing and some CB has an operation queued that will be using the current contents of the
                // buffer, create a new buffer so we don't modify the previous use of the buffer
                if (options == GBL_READ_WRITE && buffer->isBound())
                {
                    VulkanBuffer* newBuffer = createBuffer(device, mSize, false, true);

                    // Copy contents of the current buffer to the new one
                    UINT8* src = buffer->map(offset, length);
                    UINT8* dst = newBuffer->map(offset, length);

                    memcpy(dst, src, length);

                    buffer->unmap();
                    newBuffer->unmap();

                    buffer->destroy();
                    buffer = newBuffer;
                    mBuffers[deviceIdx] = buffer;
                }

                return buffer->map(offset, length);
            }

            // Otherwise, we're doing a write-only map, in which case it's best to use the staging path to avoid
            // waiting and blocking, so fall through
        }

        // Can't use direct mapping, so use a staging buffer or staging memory

        // We might need to copy the current contents of the buffer to the staging buffer. Even if the caller doesn't
        // plan on reading, this is still required, because we will eventually copy all of the contents back to the
        // original buffer and can't write potentially uninitialized data. The only exception is when the caller
        // specifies the buffer contents should be discarded, thereby guaranteeing the entire locked area will be
        // overwritten with their own contents.
        bool needRead = options != GBL_WRITE_ONLY_DISCARD_RANGE && options != GBL_WRITE_ONLY_DISCARD;

        // See if we can use the cheaper staging memory, rather than a staging buffer
        if(!needRead && offset % 4 == 0 && length % 4 == 0 && length <= 65536)
        {
            mStagingMemory = (UINT8*)bs_alloc(length);
            return mStagingMemory;
        }

        // Create a staging buffer
        mStagingBuffer = createBuffer(device, length, true, needRead);

        if (needRead)
        {
            VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(deviceIdx, queueType, localQueueIdx);

            // Similar to above, if the buffer supports GPU writes or is currently being written to, we need to wait
            // on any potential writes to complete
            UINT32 writeUseMask = buffer->getUseInfo(VulkanUseFlag::Write);
            if(mSupportsGPUWrites || writeUseMask != 0)
            {
                // Ensure flush() will wait for all queues currently writing to the buffer (if any) to finish
                transferCB->appendMask(writeUseMask);
            }

            // Queue copy command
            buffer->copy(transferCB->getCB(), mStagingBuffer, offset, 0, length);

            // Ensure data written to the staging buffer is visible
            transferCB->memoryBarrier(mStagingBuffer->getHandle(),
                VK_ACCESS_TRANSFER_WRITE_BIT,
                accessFlags,
                VK_PIPELINE_STAGE_TRANSFER_BIT,
                VK_PIPELINE_STAGE_HOST_BIT
            );

            // Submit the command buffer and wait until it finishes
            transferCB->flush(true);

            assert(!buffer->isUsed());
        }

        return mStagingBuffer->map(0, length);
    }
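    // Unmaps a buffer previously mapped with map(). For directly mapped buffers this is a plain vkUnmapMemory. For
    // staged writes it queues a transfer (vkCmdCopyBuffer for a staging buffer, vkCmdUpdateBuffer for staging memory)
    // that uploads the staged data back into the per-device buffer; that transfer command buffer is flushed lazily,
    // before the next regular command buffer submission.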
    void VulkanHardwareBuffer::unmap()
    {
        // Possibly map() failed with some error
        if (!mIsMapped)
            return;

        // Note: If we did any writes they need to be made visible to the GPU. However there is no need to execute
        // a pipeline barrier because (as per spec) host writes are implicitly visible to the device.

        if(mStagingMemory == nullptr && mStagingBuffer == nullptr) // We directly mapped the buffer
        {
            mBuffers[mMappedDeviceIdx]->unmap();
        }
        else
        {
            if(mStagingBuffer != nullptr)
                mStagingBuffer->unmap();

            bool isWrite = mMappedLockOptions != GBL_READ_ONLY;

            // If the caller wrote anything to the staging buffer, we need to upload it back to the main buffer
            if(isWrite)
            {
                VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPI::instance());
                VulkanDevice& device = *rapi._getDevice(mMappedDeviceIdx);

                VulkanCommandBufferManager& cbManager = gVulkanCBManager();
                GpuQueueType queueType;
                UINT32 localQueueIdx = CommandSyncMask::getQueueIdxAndType(mMappedGlobalQueueIdx, queueType);

                VulkanBuffer* buffer = mBuffers[mMappedDeviceIdx];
                VulkanTransferBuffer* transferCB = cbManager.getTransferBuffer(mMappedDeviceIdx, queueType,
                    localQueueIdx);

                // If the buffer is used in any way on the GPU, we need to wait for that use to finish before
                // we issue our copy
                UINT32 useMask = buffer->getUseInfo(VulkanUseFlag::Read | VulkanUseFlag::Write);
                bool isNormalWrite = false;
                if(useMask != 0) // Buffer is currently used on the GPU
                {
                    // Try to avoid the wait by checking for special write conditions

                    // The caller guarantees they won't touch the same data the GPU is using, so just copy
                    if (mMappedLockOptions == GBL_WRITE_ONLY_NO_OVERWRITE)
                    {
                        // Fall through to copy()
                    }
                    // Caller doesn't care about buffer contents, so just discard the existing buffer and create a
                    // new one
                    else if (mMappedLockOptions == GBL_WRITE_ONLY_DISCARD)
                    {
                        buffer->destroy();

                        buffer = createBuffer(device, mSize, false, true);
                        mBuffers[mMappedDeviceIdx] = buffer;
                    }
                    else // Otherwise we have no choice but to issue a dependency between the queues
                    {
                        transferCB->appendMask(useMask);
                        isNormalWrite = true;
                    }
                }
                else
                    isNormalWrite = true;

                // Check if the buffer will still be bound somewhere after the CBs using it finish
                if (isNormalWrite)
                {
                    UINT32 useCount = buffer->getUseCount();
                    UINT32 boundCount = buffer->getBoundCount();

                    bool isBoundWithoutUse = boundCount > useCount;

                    // If the buffer is queued for some operation on a CB, then we need to make a copy of the buffer
                    // to avoid modifying its use in the previous operation
                    if (isBoundWithoutUse)
                    {
                        VulkanBuffer* newBuffer = createBuffer(device, mSize, false, true);

                        // Avoid copying the original contents if the locked region completely covers the buffer
                        if (mMappedOffset > 0 || mMappedSize != mSize)
                        {
                            buffer->copy(transferCB->getCB(), newBuffer, 0, 0, mSize);

                            transferCB->getCB()->registerResource(buffer, VK_ACCESS_TRANSFER_READ_BIT,
                                VulkanUseFlag::Read);
                        }

                        buffer->destroy();
                        buffer = newBuffer;
                        mBuffers[mMappedDeviceIdx] = buffer;
                    }
                }

                // Queue copy/update command
                if (mStagingBuffer != nullptr)
                {
                    mStagingBuffer->copy(transferCB->getCB(), buffer, 0, mMappedOffset, mMappedSize);
                    transferCB->getCB()->registerResource(mStagingBuffer, VK_ACCESS_TRANSFER_READ_BIT,
                        VulkanUseFlag::Read);
                }
                else // Staging memory
                {
                    buffer->update(transferCB->getCB(), mStagingMemory, mMappedOffset, mMappedSize);
                }

                transferCB->getCB()->registerResource(buffer, VK_ACCESS_TRANSFER_WRITE_BIT, VulkanUseFlag::Write);

                // We don't actually flush the transfer buffer here since it's an expensive operation, but it's
                // instead done automatically before the next "normal" command buffer submission.
            }

            if (mStagingBuffer != nullptr)
            {
                mStagingBuffer->destroy();
                mStagingBuffer = nullptr;
            }

            if(mStagingMemory != nullptr)
            {
                bs_free(mStagingMemory);
                mStagingMemory = nullptr;
            }
        }

        mIsMapped = false;
    }
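    // Records a GPU-side copy between two hardware buffers onto the provided command buffer (or the main command
    // buffer when none is specified). The copy is validated against both buffer sizes, and both buffers are
    // registered on the command buffer so their GPU use is tracked.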
    void VulkanHardwareBuffer::copyData(HardwareBuffer& srcBuffer, UINT32 srcOffset, UINT32 dstOffset, UINT32 length,
        bool discardWholeBuffer, const SPtr<CommandBuffer>& commandBuffer)
    {
        if ((dstOffset + length) > mSize)
        {
            LOGERR("Provided offset(" + toString(dstOffset) + ") + length(" + toString(length) + ") "
                "is larger than the destination buffer " + toString(mSize) + ". Copy operation aborted.");

            return;
        }

        if ((srcOffset + length) > srcBuffer.getSize())
        {
            LOGERR("Provided offset(" + toString(srcOffset) + ") + length(" + toString(length) + ") "
                "is larger than the source buffer " + toString(srcBuffer.getSize()) + ". Copy operation aborted.");

            return;
        }

        VulkanHardwareBuffer& vkSource = static_cast<VulkanHardwareBuffer&>(srcBuffer);
        VulkanRenderAPI& rapi = static_cast<VulkanRenderAPI&>(RenderAPI::instance());

        VulkanCmdBuffer* vkCB;
        if (commandBuffer != nullptr)
            vkCB = static_cast<VulkanCommandBuffer*>(commandBuffer.get())->getInternal();
        else
            vkCB = rapi._getMainCommandBuffer()->getInternal();

        UINT32 deviceIdx = vkCB->getDeviceIdx();

        VulkanBuffer* src = vkSource.mBuffers[deviceIdx];
        VulkanBuffer* dst = mBuffers[deviceIdx];

        if (src == nullptr || dst == nullptr)
            return;

        if (vkCB->isInRenderPass())
            vkCB->endRenderPass();

        src->copy(vkCB, dst, srcOffset, dstOffset, length);

        // Notify the command buffer that these resources are being used on it
        vkCB->registerResource(src, VK_ACCESS_TRANSFER_READ_BIT, VulkanUseFlag::Read);
        vkCB->registerResource(dst, VK_ACCESS_TRANSFER_WRITE_BIT, VulkanUseFlag::Write);
    }

    void VulkanHardwareBuffer::readData(UINT32 offset, UINT32 length, void* dest, UINT32 deviceIdx, UINT32 queueIdx)
    {
        void* lockedData = lock(offset, length, GBL_READ_ONLY, deviceIdx, queueIdx);
        memcpy(dest, lockedData, length);
        unlock();
    }

    void VulkanHardwareBuffer::writeData(UINT32 offset, UINT32 length, const void* source, BufferWriteType writeFlags,
        UINT32 queueIdx)
    {
        GpuLockOptions lockOptions = GBL_WRITE_ONLY_DISCARD_RANGE;
        if (writeFlags == BWT_NO_OVERWRITE)
            lockOptions = GBL_WRITE_ONLY_NO_OVERWRITE;
        else if (writeFlags == BWT_DISCARD)
            lockOptions = GBL_WRITE_ONLY_DISCARD;

        // Write to every device
        for (UINT32 i = 0; i < BS_MAX_DEVICES; i++)
        {
            if (mBuffers[i] == nullptr)
                continue;

            void* lockedData = lock(offset, length, lockOptions, i, queueIdx);

            memcpy(lockedData, source, length);

            unlock();
        }
    }
}}