Преглед изворни кода

Flush GPU param block buffers when GPU params are bound
Improve documentation for queue indices, so there isn't that much repetition
Submit main command buffer on swapBuffers

BearishSun пре 9 година
родитељ
комит
1a47d80872

+ 7 - 0
Documentation/Manuals/Native/apiRefPages.md

@@ -0,0 +1,7 @@
+@page queuesDoc Device queues
+
+Using a non-default queue index allows the GPU to perform operations on different queues in parallel (for example, streaming texture data from the CPU while the GPU is rendering).
+
+When a write operation is being performed, it is the responsibility of the caller not to use that resource until that write has completed. This can be ensured by providing a sync mask to commands that accept it (such as RenderAPICore::executeCommands) with the bit corresponding to the queue index set. This way queues will know to wait until the write completes. This is not required if the resource is being used on the same queue the write is being performed on, as operations within a queue are sequential.
+
+This value is a global queue index which encodes both the queue type and queue index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().

+ 9 - 7
Source/BansheeCore/Include/BsCommandBuffer.h

@@ -53,17 +53,19 @@ namespace bs
 		 * @param[in]	type		Determines what type of commands can be added to the command buffer.
 		 * @param[in]	deviceIdx	Index of the GPU the command buffer will be used to queue commands on. 0 is always
 		 *							the primary available GPU.
-		 * @param[in]	queueIdx	Index of the hardware queue the command buffer will be used on. Command buffers with
+		 * @param[in]	queueIdx	Index of the GPU queue the command buffer will be used on. Command buffers with
 		 *							the same index will execute sequentially, but command buffers with different queue
-		 *							indices may execute in parallel, for a potential performance improvement. Queue indices
-		 *							are unique per buffer type (e.g. upload index 0 and graphics index 0 may map to 
-		 *							different queues internally). Must be in range [0, 7].
+		 *							indices may execute in parallel, for a potential performance improvement. 
+		 *							
 		 *							The caller must ensure that operations executing on different queues are synchronized
 		 *							via sync masks. A command buffer dependent on another command buffer should provide a
 		 *							sync mask when being submitted (see RenderAPICore::executeCommands).
+		 *							
+		 *							Queue indices are unique per buffer type (e.g. upload index 0 and graphics index 0 may
+		 *							map to different queues internally). Must be in range [0, 7].
 		 * @param[in]	secondary	If true the command buffer will not be allowed to execute on its own, but it can
 		 *							be appended to a primary command buffer. 
 		 * @return					New CommandBuffer instance.
-		 * 
-		 * @note The parallelism provided by @p queueIdx is parallelism on the GPU itself, it has nothing to do with CPU
-		 *		 parallelism or threads.
 		 */
 		static SPtr<CommandBuffer> create(GpuQueueType type, UINT32 deviceIdx = 0, UINT32 queueIdx = 0,
 			bool secondary = false);

+ 2 - 1
Source/BansheeCore/Include/BsCommonTypes.h

@@ -167,7 +167,8 @@ namespace bs
 		GPT_GEOMETRY_PROGRAM, /**< Geometry program. */
 		GPT_DOMAIN_PROGRAM, /**< Domain (tesselation evaluation) program. */
 		GPT_HULL_PROGRAM, /**< Hull (tesselation control) program. */
-		GPT_COMPUTE_PROGRAM /**< Compute program. */
+		GPT_COMPUTE_PROGRAM, /**< Compute program. */
+		GPT_COUNT // Keep at end
 	};
 
 	/**

+ 17 - 6
Source/BansheeCore/Include/BsGpuParamBlockBuffer.h

@@ -22,16 +22,26 @@ namespace bs
 		GpuParamBlockBufferCore(UINT32 size, GpuParamBlockUsage usage, GpuDeviceFlags deviceMask);
 		virtual ~GpuParamBlockBufferCore();
 
-		/** Writes all of the specified data to the buffer. Data size must be the same size as the buffer. */
-		virtual void writeToGPU(const UINT8* data) = 0;
+		/** 
+		 * Writes all of the specified data to the buffer. Data size must be the same size as the buffer. 
+		 *
+		 * @param[in]	data		Data to write. Must match the size of the buffer.
+		 * @param[in]	queueIdx	Device queue to perform the write operation on. See @ref queuesDoc.
+		 */
+		virtual void writeToGPU(const UINT8* data, UINT32 queueIdx = 0) = 0;
 
-		/** Flushes any cached data into the actual GPU buffer. */
-		void flushToGPU();
+		/** 
+		 * Flushes any cached data into the actual GPU buffer. 
+		 *
+		 * @param[in]	queueIdx	Device queue to perform the write operation on. See @ref queuesDoc.
+		 */
+		void flushToGPU(UINT32 queueIdx = 0);
 
 		/**
 		 * Write some data to the specified offset in the buffer. 
 		 *
-		 * @note	All values are in bytes. Actual hardware buffer update is delayed until rendering.
+		 * @note	All values are in bytes. Actual hardware buffer update is delayed until rendering or until 
+		 *			flushToGPU() is called.
 		 */
 		void write(UINT32 offset, const void* data, UINT32 size);
 
@@ -45,7 +55,8 @@ namespace bs
 		/**
 		 * Clear specified section of the buffer to zero.
 		 *
-		 * @note	All values are in bytes. Actual hardware buffer update is delayed until rendering.
+		 * @note	All values are in bytes. Actual hardware buffer update is delayed until rendering or until 
+		 *			flushToGPU() is called.
 		 */
 		void zeroOut(UINT32 offset, UINT32 size);
 

+ 6 - 56
Source/BansheeCore/Include/BsHardwareBuffer.h

@@ -32,16 +32,7 @@ namespace bs
 		 *							requested it here).
 		 * @param[in]	deviceIdx	Index of the device whose memory to map. If the buffer doesn't exist on this device,
 		 *							the method returns null.							
-		 * @param[in]	queueIdx	Device queue to perform any read/write operations on. Using a non-default queue index
-		 *							allows the GPU to perform write or read operations while executing rendering or compute
-		 *							operations on the same time.
-		 * 
-		 *							Note that when writing to a buffer that is being used on a command buffer with a
-		 *							different queue you must ensure to provide the command buffer with a valid sync mask
-		 *							so it knows to wait before the write operation completes.
-		 *							
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx	Device queue to perform any read/write operations on. See @ref queuesDoc.
 		 */
 		virtual void* lock(UINT32 offset, UINT32 length, GpuLockOptions options, UINT32 deviceIdx = 0, UINT32 queueIdx = 0)
         {
@@ -62,16 +53,7 @@ namespace bs
 		 *							requested it here).
 		 * @param[in]	deviceIdx	Index of the device whose memory to map. If the buffer doesn't exist on this device,
 		 *							the method returns null.
-		 * @param[in]	queueIdx	Device queue to perform any read/write operations on. Using a non-default queue index
-		 *							allows the GPU to perform write or read operations while executing rendering or compute
-		 *							operations on the same time.
-		 *
-		 *							Note that when writing to a buffer that is being used on a command buffer with a
-		 *							different queue you must ensure to provide the command buffer with a valid sync mask
-		 *							so it knows to wait before the write operation completes.
-		 *
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx	Device queue to perform any read/write operations on. See @ref queuesDoc.
 		 */
         void* lock(GpuLockOptions options, UINT32 deviceIdx = 0, UINT32 queueIdx = 0)
         {
@@ -97,12 +79,7 @@ namespace bs
 		 *							of the buffer (@p offset is only applied to the source).
 		 * @param[in]	deviceIdx	Index of the device whose memory to read. If the buffer doesn't exist on this device,
 		 *							no data will be read.		
-		 * @param[in]	queueIdx	Device queue to perform the read operation on. Using a non-default queue index
-		 *							allows the GPU to perform read operations while executing rendering or compute
-		 *							operations on the same time.
-		 *
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx	Device queue to perform the read operation on. See @ref queuesDoc.
 		 */
         virtual void readData(UINT32 offset, UINT32 length, void* dest, UINT32 deviceIdx = 0, UINT32 queueIdx = 0) = 0;
 
@@ -114,16 +91,7 @@ namespace bs
 		 * @param[in]	source		Source buffer containing the data to write. Data is read from the start of the buffer
 		 *							(@p offset is only applied to the destination).
 		 * @param[in]	writeFlags	Optional write flags that may affect performance.
-		 * @param[in]	queueIdx	Device queue to perform any write operations on. Using a non-default queue index
-		 *							allows the GPU to perform write operations while executing rendering or compute
-		 *							operations on the same time.
-		 *
-		 *							Note that when writing to a buffer that is being used on a command buffer with a
-		 *							different queue you must ensure to provide the command buffer with a valid sync mask
-		 *							so it knows to wait before the write operation completes.
-		 *
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx	Device queue to perform the write operation on. See @ref queuesDoc.
 		 */
         virtual void writeData(UINT32 offset, UINT32 length, const void* source,
 				BufferWriteType writeFlags = BWT_NORMAL, UINT32 queueIdx = 0) = 0;
@@ -137,16 +105,7 @@ namespace bs
 		 * @param[in]	length				Size of the data to copy, in bytes.
 		 * @param[in]	discardWholeBuffer	Specify true if the data in the current buffer can be entirely discarded. This
 		 *									may improve performance.
-		 * @param[in]	queueIdx			Device queue to perform any read/write operations on. Using a non-default queue
-		 *									index allows the GPU to perform write or read operations while executing 
-		 *									rendering or compute operations on the same time.
-		 *									
-		 *									Note that when writing to a buffer that is being used on a command buffer with a
-		 *									different queue you must ensure to provide the command buffer with a valid sync
-		 *									mask so it knows to wait before the write operation completes.
-		 *
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx			Device queue to perform the copy operation on. See @ref queuesDoc.
 		 */
 		virtual void copyData(HardwareBuffer& srcBuffer, UINT32 srcOffset, 
 			UINT32 dstOffset, UINT32 length, bool discardWholeBuffer = false, UINT32 queueIdx = 0)
@@ -161,16 +120,7 @@ namespace bs
 		 * Copy data from the provided buffer into this buffer. If buffers are not the same size, smaller size will be used.
 		 * 
 		 * @param[in]	srcBuffer	Hardware buffer to copy from.
-		 * @param[in]	queueIdx	Device queue to perform any read/write operations on. Using a non-default queue index
-		 *							allows the GPU to perform write or read operations while executing rendering or compute
-		 *							operations on the same time.
-		 *
-		 *							Note that when writing to a buffer that is being used on a command buffer with a
-		 *							different queue you must ensure to provide the command buffer with a valid sync mask
-		 *							so it knows to wait before the write operation completes.
-		 *
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx	Device queue to perform the copy operation on. See @ref queuesDoc.
 		 */
 		virtual void copyData(HardwareBuffer& srcBuffer, UINT32 queueIdx = 0)
 		{

+ 2 - 12
Source/BansheeCore/Include/BsMesh.h

@@ -315,12 +315,7 @@ namespace bs
 		 *									will fail.
 		 * @param[in]	updateBounds		If true the internal bounds of the mesh will be recalculated based on the 
 		 *									provided data.
-		 * @param[in]	queueIdx			Device queue to perform the write operation on. Using a non-default queue index
-		 *									allows the GPU to perform write operations while executing rendering or compute
-		 *									operations on the same time.
-		 *									
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx			Device queue to perform the write operation on. See @ref queuesDoc.
 		 */
 		virtual void writeSubresource(UINT32 subresourceIdx, const MeshData& data, bool discardEntireBuffer, 
 			bool updateBounds = true, UINT32 queueIdx = 0);
@@ -333,12 +328,7 @@ namespace bs
 		 *									allocateSubresourceBuffer() to ensure it is of valid type and size.
 		 * @param[in]	deviceIdx			Index of the device whose memory to read. If the buffer doesn't exist on this
 		 *									device, no data will be read.
-		 * @param[in]	queueIdx			Device queue to perform the read operation on. Using a non-default queue index
-		 *									allows the GPU to perform read operations while executing rendering or compute
-		 *									operations on the same time.
-		 *
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx			Device queue to perform the read operation on. See @ref queuesDoc.
 		 */
 		virtual void readSubresource(UINT32 subresourceIdx, MeshData& data, UINT32 deviceIdx = 0, UINT32 queueIdx = 0);
 

+ 1 - 1
Source/BansheeCore/Include/BsParamBlocks.h

@@ -47,7 +47,7 @@ namespace bs
 																															\
 		const SPtr<GpuParamBlockBufferCore>& getBuffer() const { return mBuffer; }											\
 		const GpuParamBlockDesc& getDesc() const { return mBlockDesc; }														\
-		void flushToGPU() {	mBuffer->flushToGPU(); }																		\
+		void flushToGPU(UINT32 queueIdx = 0) { mBuffer->flushToGPU(queueIdx); }												\
 																															\
 	private:																												\
 		struct META_FirstEntry {};																							\

+ 6 - 49
Source/BansheeCore/Include/BsTexture.h

@@ -343,12 +343,7 @@ namespace bs
 		 *									discarded. This can make the operation faster. Resources with certain buffer 
 		 *									types might require this flag to be in a specific state otherwise the operation 
 		 *									will fail.
-		 * @param[in]	queueIdx			Device queue to perform the write operation on. Using a non-default queue index
-		 *									allows the GPU to perform write operations while executing rendering or compute
-		 *									operations on the same time.
-		 *									
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().									
+		 * @param[in]	queueIdx			Device queue to perform the write operation on. See @ref queuesDoc.								
 		 */
 		virtual void writeSubresource(UINT32 subresourceIdx, const PixelData& data, bool discardEntireBuffer,
 									  UINT32 queueIdx = 0);
@@ -361,12 +356,7 @@ namespace bs
 		 *									allocateSubresourceBuffer() to ensure it is of valid type and size.
 		 * @param[in]	deviceIdx			Index of the device whose memory to read. If the buffer doesn't exist on this
 		 *									device, no data will be read.
-		 * @param[in]	queueIdx			Device queue to perform the read operation on. Using a non-default queue index
-		 *									allows the GPU to perform read operations while executing rendering or compute
-		 *									operations on the same time.
-		 *
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx			Device queue to perform the read operation on. See @ref queuesDoc.
 		 */
 		virtual void readSubresource(UINT32 subresourceIdx, PixelData& data, UINT32 deviceIdx = 0, UINT32 queueIdx = 0);
 
@@ -378,17 +368,7 @@ namespace bs
 		 * @param[in]	face		(optional) Texture face to lock.
 		 * @param[in]	deviceIdx	Index of the device whose memory to map. If the buffer doesn't exist on this device,
 		 *							the method returns null.
-		 * @param[in]	queueIdx	Device queue to perform any read/write operations on. Using a non-default queue index
-		 *							allows the GPU to perform write or read operations while executing rendering or compute
-		 *							operations on the same time.
-		 *
-		 *							Note that when writing to a texture that is being used on a command buffer with a
-		 *							different queue you must ensure to provide the command buffer with a valid sync mask
-		 *							so it knows to wait before the write operation completes.
-		 *
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
-		 * @return					Pointer to the buffer data. Only valid until you call unlock().
+		 * @param[in]	queueIdx	Device queue to perform the read/write operations on. See @ref queuesDoc.
+		 * @return					Pointer to the buffer data. Only valid until you call unlock().
 		 * 			
 		 * @note	
 		 * If you are just reading or writing one block of data use readData()/writeData() methods as they can be much faster
@@ -414,16 +394,7 @@ namespace bs
 		 * @param[in]	srcSubresourceIdx	Index of the subresource to copy from.
 		 * @param[in]	destSubresourceIdx	Index of the subresource to copy to.
 		 * @param[in]	target				Texture that contains the destination subresource.
-		 * @param[in]	queueIdx			Device queue to perform any read/write operations on. Using a non-default queue
-		 *									index allows the GPU to perform write or read operations while executing
-		 *									rendering or compute operations on the same time.
-		 *
-		 *									Note that when writing to a texture that is being used on a command buffer with
-		 *									a different queue you must ensure to provide the command buffer with a valid
-		 *									sync mask so it knows to wait before the write operation completes.
-		 *
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx			Device queue to perform the copy operation on. See @ref queuesDoc.
 		 */
 		void copy(UINT32 srcSubresourceIdx, UINT32 destSubresourceIdx, const SPtr<TextureCore>& target, UINT32 queueIdx = 0);
 
@@ -435,12 +406,7 @@ namespace bs
 		 * @param[in]	face		(optional) Texture face to read from.
 		 * @param[in]	deviceIdx	Index of the device whose memory to read. If the buffer doesn't exist on this device,
 		 *							no data will be read.
-		 * @param[in]	queueIdx	Device queue to perform the read operation on. Using a non-default queue index
-		 *							allows the GPU to perform read operations while executing rendering or compute
-		 *							operations on the same time.
-		 *
-		 *							This value is a global queue index which encodes both the queue type and queue index.
-		 *							Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx	Device queue to perform the read operation on. See @ref queuesDoc.
 		 */
 		virtual void readData(PixelData& dest, UINT32 mipLevel = 0, UINT32 face = 0, UINT32 deviceIdx = 0,
 							  UINT32 queueIdx = 0) = 0;
@@ -453,16 +419,7 @@ namespace bs
 		 * @param[in]	face				(optional) Texture face to write into.
 		 * @param[in]	discardWholeBuffer	(optional) If true any existing texture data will be discard. This can improve 
 		 *									performance of the write operation.
-		 * @param[in]	queueIdx			Device queue to perform any write operations on. Using a non-default queue index
-		 *									allows the GPU to perform write operations while executing rendering or compute
-		 *									operations on the same time.
-		 *
-		 *									Note that when writing to a texture that is being used on a command buffer with
-		 *									a different queue you must ensure to provide the command buffer with a valid
-		 *									sync mask so it knows to wait before the write operation completes.
-		 *
-		 *									This value is a global queue index which encodes both the queue type and queue
-		 *									index. Retrieve it from CommandSyncMask::getGlobalQueueIdx().
+		 * @param[in]	queueIdx			Device queue to perform the write operation on. See @ref queuesDoc.
 		 */
 		virtual void writeData(const PixelData& src, UINT32 mipLevel = 0, UINT32 face = 0, bool discardWholeBuffer = false,
 							   UINT32 queueIdx = 0) = 0;

+ 2 - 2
Source/BansheeCore/Source/BsGpuParamBlockBuffer.cpp

@@ -65,11 +65,11 @@ namespace bs
 		mGPUBufferDirty = true;
 	}
 
-	void GpuParamBlockBufferCore::flushToGPU()
+	void GpuParamBlockBufferCore::flushToGPU(UINT32 queueIdx)
 	{
 		if (mGPUBufferDirty)
 		{
-			writeToGPU(mCachedData);
+			writeToGPU(mCachedData, queueIdx);
 			mGPUBufferDirty = false;
 		}
 	}

+ 1 - 1
Source/BansheeD3D11RenderAPI/Include/BsD3D11GpuParamBlockBuffer.h

@@ -19,7 +19,7 @@ namespace bs
 		~D3D11GpuParamBlockBufferCore();
 
 		/** @copydoc GpuParamBlockBufferCore::writeToGPU */
-		void writeToGPU(const UINT8* data) override;
+		void writeToGPU(const UINT8* data, UINT32 queueIdx = 0) override;
 
 		/**	Returns internal DX11 buffer object. */
 		ID3D11Buffer* getD3D11Buffer() const;

+ 1 - 1
Source/BansheeD3D11RenderAPI/Source/BsD3D11GpuParamBlockBuffer.cpp

@@ -46,7 +46,7 @@ namespace bs
 		return mBuffer->getD3DBuffer();
 	}
 
-	void D3D11GpuParamBlockBufferCore::writeToGPU(const UINT8* data)
+	void D3D11GpuParamBlockBufferCore::writeToGPU(const UINT8* data, UINT32 queueIdx)
 	{
 		mBuffer->writeData(0, mSize, data, BWT_DISCARD);
 

+ 1 - 1
Source/BansheeGLRenderAPI/Include/BsGLGpuParamBlockBuffer.h

@@ -19,7 +19,7 @@ namespace bs
 		~GLGpuParamBlockBufferCore();
 
 		/** @copydoc GpuParamBlockBufferCore::writeToGPU */
-		void writeToGPU(const UINT8* data) override;
+		void writeToGPU(const UINT8* data, UINT32 queueIdx = 0) override;
 
 		/**	Returns internal OpenGL uniform buffer handle. */
 		GLuint getGLHandle() const { return mGLHandle; }

+ 1 - 1
Source/BansheeGLRenderAPI/Source/BsGLGpuParamBlockBuffer.cpp

@@ -36,7 +36,7 @@ namespace bs
 		GpuParamBlockBufferCore::initialize();
 	}
 
-	void GLGpuParamBlockBufferCore::writeToGPU(const UINT8* data)
+	void GLGpuParamBlockBufferCore::writeToGPU(const UINT8* data, UINT32 queueIdx)
 	{
 		glBindBuffer(GL_UNIFORM_BUFFER, mGLHandle);
 		glBufferSubData(GL_UNIFORM_BUFFER, 0 , mSize, data);

+ 1 - 1
Source/BansheeVulkanRenderAPI/Include/BsVulkanGpuParamBlockBuffer.h

@@ -19,7 +19,7 @@ namespace bs
 		~VulkanGpuParamBlockBufferCore();
 
 		/** @copydoc GpuParamBlockBufferCore::writeToGPU */
-		void writeToGPU(const UINT8* data) override;
+		void writeToGPU(const UINT8* data, UINT32 queueIdx = 0) override;
 
 		/** 
 		 * Gets the resource wrapping the buffer object, on the specified device. If GPU param block buffer's device mask

+ 3 - 0
Source/BansheeVulkanRenderAPI/Source/BsVulkanCommandBuffer.cpp

@@ -1092,6 +1092,9 @@ namespace bs
 		if (mBuffer->isRecording())
 			mBuffer->end();
 
+		if (!mBuffer->isReadyForSubmit()) // Possibly nothing was recorded in the buffer
+			return;
+
 		mBuffer->submit(mQueue, mQueueIdx, syncMask);
 
 		gVulkanCBManager().refreshStates(mDeviceIdx);

+ 2 - 2
Source/BansheeVulkanRenderAPI/Source/BsVulkanGpuParamBlockBuffer.cpp

@@ -30,9 +30,9 @@ namespace bs
 		GpuParamBlockBufferCore::initialize();
 	}
 
-	void VulkanGpuParamBlockBufferCore::writeToGPU(const UINT8* data)
+	void VulkanGpuParamBlockBufferCore::writeToGPU(const UINT8* data, UINT32 queueIdx)
 	{
-		mBuffer->writeData(0, mSize, data, BWT_DISCARD);
+		mBuffer->writeData(0, mSize, data, BWT_DISCARD, queueIdx);
 
 		BS_INC_RENDER_STAT_CAT(ResWrite, RenderStatObject_GpuParamBuffer);
 	}

+ 20 - 0
Source/BansheeVulkanRenderAPI/Source/BsVulkanRenderAPI.cpp

@@ -16,6 +16,7 @@
 #include "BsVulkanCommandBuffer.h"
 #include "BsVulkanGpuParams.h"
 #include "BsVulkanVertexInputManager.h"
+#include "BsVulkanGpuParamBlockBuffer.h"
 
 #if BS_PLATFORM == BS_PLATFORM_WIN32
 	#include "Win32/BsWin32VideoModeInfo.h"
@@ -331,6 +332,24 @@ namespace bs
 		VulkanCommandBuffer* cb = getCB(commandBuffer);
 		VulkanCmdBuffer* vkCB = cb->getInternal();
 
+		UINT32 globalQueueIdx = CommandSyncMask::getGlobalQueueIdx(cb->getType(), cb->getQueueIdx());
+
+		for (UINT32 i = 0; i < GPT_COUNT; i++)
+		{
+			SPtr<GpuParamDesc> paramDesc = gpuParams->getParamDesc((GpuProgramType)i);
+			if (paramDesc == nullptr)
+				continue; // This program stage has no parameters; returning here would skip the remaining stages and the setGpuParams() call below
+
+			// Flush all param block buffers
+			for (auto iter = paramDesc->paramBlocks.begin(); iter != paramDesc->paramBlocks.end(); ++iter)
+			{
+				SPtr<GpuParamBlockBufferCore> buffer = gpuParams->getParamBlockBuffer(iter->second.set, iter->second.slot);
+
+				if (buffer != nullptr)
+					buffer->flushToGPU(globalQueueIdx);
+			}
+		}
+
 		vkCB->setGpuParams(gpuParams);
 
 		BS_INC_RENDER_STAT(NumGpuParamBinds);
@@ -488,6 +507,7 @@ namespace bs
 	{
 		THROW_IF_NOT_CORE_THREAD;
 
+		executeCommands(mMainCommandBuffer, syncMask);
 		target->swapBuffers(syncMask);
 
 		// See if any command buffers finished executing