
VK: Implement chunked scratch buffer for uniforms. Fixed issue #3112. (#3500)

Branimir Karadžić, 3 weeks ago
Commit efe84f3f14

3 files changed, 407 insertions and 127 deletions:
  1. src/bgfx_p.h        (+3, -0)
  2. src/renderer_vk.cpp (+346, -119)
  3. src/renderer_vk.h   (+58, -8)

src/bgfx_p.h  (+3, -0)

@@ -2192,6 +2192,7 @@ namespace bgfx
 				, m_offset
 				, kMaxOffset
 				);
+			BX_UNUSED(kMaxSize, kMaxOffset);
 
 			const KeyT view   = (KeyT(m_view)      << kViewShift)   & kViewMask;
 			const KeyT handle = (KeyT(m_handle)    << kHandleShift) & kHandleMask;
@@ -2694,6 +2695,7 @@ namespace bgfx
 					, "Setting uniform for draw call, but uniform frequency is different (frequency: %d)!"
 					, uniform.m_freq
 					);
+				BX_UNUSED(uniform);
 			}
 
 			UniformBuffer::update(&m_frame->m_uniformBuffer[m_uniformIdx]);
@@ -3305,6 +3307,7 @@ namespace bgfx
 				, "Truncated uniform update. %d (max: %d)"
 				, _num, uniform.m_num
 				);
+			BX_UNUSED(freq);
 
 			UniformCacheKey key =
 			{

src/renderer_vk.cpp  (+346, -119)

@@ -2088,14 +2088,7 @@ VK_IMPORT_DEVICE
 			}
 
 			{
-				const uint32_t size = 128;
-				const uint32_t count = BGFX_CONFIG_MAX_DRAW_CALLS;
-
-				for (uint32_t ii = 0; ii < m_maxFrameLatency; ++ii)
-				{
-					BX_TRACE("Create scratch buffer %d", ii);
-					m_scratchBuffer[ii].createUniform(size, count);
-				}
+				m_uniformScratchBuffer.createUniform(2<<20, m_maxFrameLatency*2);
 
 				for (uint32_t ii = 0; ii < m_maxFrameLatency; ++ii)
 				{
@@ -2165,9 +2158,10 @@ VK_IMPORT_DEVICE
 				[[fallthrough]];
 
 			case ErrorState::DescriptorCreated:
+				m_uniformScratchBuffer.destroy();
+
 				for (uint32_t ii = 0; ii < m_maxFrameLatency; ++ii)
 				{
-					m_scratchBuffer[ii].destroy();
 					m_scratchStagingBuffer[ii].destroy();
 					vkDestroy(m_descriptorPool[ii]);
 				}
@@ -2228,10 +2222,7 @@ VK_IMPORT_DEVICE
 			m_samplerBorderColorCache.invalidate();
 			m_imageViewCache.invalidate();
 
-			for (uint32_t ii = 0; ii < m_maxFrameLatency; ++ii)
-			{
-				m_scratchBuffer[ii].destroy();
-			}
+			m_uniformScratchBuffer.destroy();
 
 			for (uint32_t ii = 0; ii < m_maxFrameLatency; ++ii)
 			{
@@ -2748,8 +2739,10 @@ VK_IMPORT_DEVICE
 				commit(*vcb);
 			}
 
-			ScratchBufferVK& scratchBuffer = m_scratchBuffer[m_cmd.m_currentFrameInFlight];
-			const uint32_t bufferOffset = scratchBuffer.write(m_vsScratch, program.m_vsh->m_size);
+			ChunkedScratchBufferVK& uniformScratchBuffer = m_uniformScratchBuffer;
+
+			ChunkedScratchBufferOffset sbo;
+			uniformScratchBuffer.write(sbo, m_vsScratch, program.m_vsh->m_size);
 
 			const TextureVK& texture = m_textures[_blitter.m_texture.idx];
 
@@ -2759,7 +2752,7 @@ VK_IMPORT_DEVICE
 			bind.m_bind[0].m_idx = _blitter.m_texture.idx;
 			bind.m_bind[0].m_samplerFlags = (uint32_t)(texture.m_flags & BGFX_SAMPLER_BITS_MASK);
 
-			const VkDescriptorSet descriptorSet = getDescriptorSet(program, bind, scratchBuffer, NULL);
+			const VkDescriptorSet descriptorSet = getDescriptorSet(program, bind, sbo.buffer, NULL);
 
 			vkCmdBindDescriptorSets(
 				  m_commandBuffer
@@ -2769,7 +2762,7 @@ VK_IMPORT_DEVICE
 				, 1
 				, &descriptorSet
 				, 1
-				, &bufferOffset
+				, sbo.offsets
 				);
 
 			const VertexBufferVK& vb  = m_vertexBuffers[_blitter.m_vb->handle.idx];
@@ -3933,7 +3926,7 @@ VK_IMPORT_DEVICE
 			return pipeline;
 		}
 
-		VkDescriptorSet getDescriptorSet(const ProgramVK& program, const RenderBind& renderBind, const ScratchBufferVK& scratchBuffer, const float _palette[][4])
+		VkDescriptorSet getDescriptorSet(const ProgramVK& _program, const RenderBind& _renderBind, VkBuffer _uniformBuffer, const float _palette[][4])
 		{
 			VkDescriptorSet descriptorSet;
 
@@ -3942,7 +3935,7 @@ VK_IMPORT_DEVICE
 			dsai.pNext              = NULL;
 			dsai.descriptorPool     = m_descriptorPool[m_cmd.m_currentFrameInFlight];
 			dsai.descriptorSetCount = 1;
-			dsai.pSetLayouts        = &program.m_descriptorSetLayout;
+			dsai.pSetLayouts        = &_program.m_descriptorSetLayout;
 
 			VK_CHECK(vkAllocateDescriptorSets(m_device, &dsai, &descriptorSet) );
 
@@ -3958,8 +3951,8 @@ VK_IMPORT_DEVICE
 
 			for (uint32_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 			{
-				const Binding& bind = renderBind.m_bind[stage];
-				const BindInfo& bindInfo = program.m_bindInfo[stage];
+				const Binding& bind = _renderBind.m_bind[stage];
+				const BindInfo& bindInfo = _program.m_bindInfo[stage];
 
 				if (kInvalidHandle != bind.m_idx
 				&&  isValid(bindInfo.uniformHandle) )
@@ -3989,7 +3982,7 @@ VK_IMPORT_DEVICE
 							VkImageViewType type = texture.m_type;
 							if (UINT32_MAX != bindInfo.index)
 							{
-								type = program.m_textures[bindInfo.index].type;
+								type = _program.m_textures[bindInfo.index].type;
 							}
 							else if (type == VK_IMAGE_VIEW_TYPE_CUBE
 							     ||  type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
@@ -4058,7 +4051,7 @@ VK_IMPORT_DEVICE
 
 							const VkImageViewType type = UINT32_MAX == bindInfo.index
 								? texture.m_type
-								: program.m_textures[bindInfo.index].type
+								: _program.m_textures[bindInfo.index].type
 								;
 
 							BX_ASSERT(
@@ -4107,19 +4100,19 @@ VK_IMPORT_DEVICE
 				}
 			}
 
-			const uint32_t vsize = program.m_vsh->m_size;
-			const uint32_t fsize = NULL != program.m_fsh ? program.m_fsh->m_size : 0;
+			const uint32_t vsSize = _program.m_vsh->m_size;
+			const uint32_t fsSize = NULL != _program.m_fsh ? _program.m_fsh->m_size : 0;
 
-			if (vsize > 0)
+			if (0 < vsSize)
 			{
-				bufferInfo[bufferCount].buffer = scratchBuffer.m_buffer;
+				bufferInfo[bufferCount].buffer = _uniformBuffer;
 				bufferInfo[bufferCount].offset = 0;
-				bufferInfo[bufferCount].range  = vsize;
+				bufferInfo[bufferCount].range  = vsSize;
 
 				wds[wdsCount].sType            = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
 				wds[wdsCount].pNext            = NULL;
 				wds[wdsCount].dstSet           = descriptorSet;
-				wds[wdsCount].dstBinding       = program.m_vsh->m_uniformBinding;
+				wds[wdsCount].dstBinding       = _program.m_vsh->m_uniformBinding;
 				wds[wdsCount].dstArrayElement  = 0;
 				wds[wdsCount].descriptorCount  = 1;
 				wds[wdsCount].descriptorType   = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
@@ -4130,16 +4123,16 @@ VK_IMPORT_DEVICE
 				++bufferCount;
 			}
 
-			if (fsize > 0)
+			if (0 < fsSize)
 			{
-				bufferInfo[bufferCount].buffer = scratchBuffer.m_buffer;
+				bufferInfo[bufferCount].buffer = _uniformBuffer;
 				bufferInfo[bufferCount].offset = 0;
-				bufferInfo[bufferCount].range  = fsize;
+				bufferInfo[bufferCount].range  = fsSize;
 
 				wds[wdsCount].sType            = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
 				wds[wdsCount].pNext            = NULL;
 				wds[wdsCount].dstSet           = descriptorSet;
-				wds[wdsCount].dstBinding       = program.m_fsh->m_uniformBinding;
+				wds[wdsCount].dstBinding       = _program.m_fsh->m_uniformBinding;
 				wds[wdsCount].dstArrayElement  = 0;
 				wds[wdsCount].descriptorCount  = 1;
 				wds[wdsCount].descriptorType   = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
@@ -4599,7 +4592,7 @@ VK_IMPORT_DEVICE
 			BGFX_PROFILER_SCOPE("RendererContextVK::allocFromScratchStagingBuffer", kColorResource);
 
 			StagingBufferVK result;
-			ScratchBufferVK &scratch = m_scratchStagingBuffer[m_cmd.m_currentFrameInFlight];
+			StagingScratchBufferVK& scratch = m_scratchStagingBuffer[m_cmd.m_currentFrameInFlight];
 
 			if (_size <= BGFX_CONFIG_MAX_STAGING_SCRATCH_BUFFER_SIZE)
 			{
@@ -4671,8 +4664,8 @@ VK_IMPORT_DEVICE
 
 		MemoryLruVK m_memoryLru;
 
-		ScratchBufferVK m_scratchBuffer[BGFX_CONFIG_MAX_FRAME_LATENCY];
-		ScratchBufferVK m_scratchStagingBuffer[BGFX_CONFIG_MAX_FRAME_LATENCY];
+		ChunkedScratchBufferVK m_uniformScratchBuffer;
+		StagingScratchBufferVK m_scratchStagingBuffer[BGFX_CONFIG_MAX_FRAME_LATENCY];
 
 		uint32_t        m_maxFrameLatency;
 		CommandQueueVK  m_cmd;
@@ -4808,31 +4801,33 @@ VK_DESTROY
 		s_renderVK->release(_obj);
 	}
 
-	void MemoryLruVK::recycle(DeviceMemoryAllocationVK &_alloc)
+	void MemoryLruVK::recycle(DeviceMemoryAllocationVK& _alloc)
 	{
 		if (MAX_ENTRIES == lru.getNumHandles() )
 		{
 			// Evict LRU
 			uint16_t handle = lru.getBack();
-			DeviceMemoryAllocationVK &alloc = entries[handle];
+			DeviceMemoryAllocationVK& alloc = entries[handle];
 			totalSizeCached -= alloc.size;
 			release(alloc.mem);
 
 			// Touch slot and overwrite
 			lru.touch(handle);
 			alloc = _alloc;
-		} else
+		}
+		else
 		{
 			uint16_t handle = lru.alloc();
 			entries[handle] = _alloc;
 		}
+
 		totalSizeCached += _alloc.size;
 
 		while (totalSizeCached > BGFX_CONFIG_CACHED_DEVICE_MEMORY_ALLOCATIONS_SIZE)
 		{
 			BX_ASSERT(lru.getNumHandles() > 0, "Memory badly counted.");
 			uint16_t handle = lru.getBack();
-			DeviceMemoryAllocationVK &alloc = entries[handle];
+			DeviceMemoryAllocationVK& alloc = entries[handle];
 			totalSizeCached -= alloc.size;
 			release(alloc.mem);
 			lru.free(handle);
@@ -4844,25 +4839,33 @@ VK_DESTROY
 		BGFX_PROFILER_SCOPE("MemoryLruVK::find", kColorResource);
 		// Find best fit.
 		uint16_t slot;
+
 		{
-			int16_t bestIdx = MAX_ENTRIES;
+			int16_t  bestIdx   = MAX_ENTRIES;
 			uint32_t bestWaste = 0xffff'ffff;
+
 			slot = lru.getFront();
+
 			while (UINT16_MAX != slot)
 			{
-				DeviceMemoryAllocationVK &alloc = entries[slot];
+				DeviceMemoryAllocationVK& alloc = entries[slot];
+
 				if (alloc.memoryTypeIndex == _memoryTypeIndex)
 				{
 					// 50% waste allowed, otherwise we'll just allocate a new one.
 					// This is to prevent we trash this cache of useful allocations
 					// with a handful of tiny allocations.
-					if (alloc.size >= _size && _size * 2 >= alloc.size)
+
+					if (alloc.size >= _size
+					&&  alloc.size <= _size * 2)
 					{
-						uint32_t waste = bx::narrowCast<uint32_t>(alloc.size - _size);
+						const uint32_t waste = bx::narrowCast<uint32_t>(alloc.size - _size);
+
 						if (waste < bestWaste)
 						{
 							bestIdx = slot;
 							bestWaste = waste;
+
 							if (waste == 0)
 							{
 								break;
@@ -4870,8 +4873,10 @@ VK_DESTROY
 						}
 					}
 				}
+
 				slot = lru.getNext(slot);
 			}
+
 			slot = bestIdx;
 		}
 
@@ -4880,37 +4885,40 @@ VK_DESTROY
 			*_alloc = entries[slot];
 			lru.free(slot);
 			totalSizeCached -= _alloc->size;
+
 			return true;
-		} else {
-			return false;
 		}
+
+		return false;
 	}
 
 	void MemoryLruVK::evictAll()
 	{
 		uint16_t slot = lru.getFront();
+
 		while (slot != UINT16_MAX)
 		{
 			release(entries[slot].mem);
 			slot = lru.getNext(slot);
 		}
+
 		lru.reset();
 		totalSizeCached = 0;
 	}
 
-	void ScratchBufferVK::create(uint32_t _size, uint32_t _count, VkBufferUsageFlags usage, uint32_t _align)
+	void StagingScratchBufferVK::create(uint32_t _size, uint32_t _count, VkBufferUsageFlags usage, uint32_t _align)
 	{
 		const VkAllocationCallbacks* allocatorCb = s_renderVK->m_allocatorCb;
 		const VkDevice device = s_renderVK->m_device;
 
 		const uint32_t entrySize = bx::strideAlign(_size, _align);
-		const uint32_t totalSize = entrySize * _count;
+		const uint32_t chunkSize = entrySize * _count;
 
 		VkBufferCreateInfo bci;
 		bci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
 		bci.pNext = NULL;
 		bci.flags = 0;
-		bci.size  = totalSize;
+		bci.size  = chunkSize;
 		bci.usage = usage;
 		bci.sharingMode           = VK_SHARING_MODE_EXCLUSIVE;
 		bci.queueFamilyIndexCount = 0;
@@ -4940,7 +4948,7 @@ VK_DESTROY
 		}
 
 		m_size = (uint32_t)mr.size;
-		m_pos  = 0;
+		m_chunkPos  = 0;
 		m_align = _align;
 
 		VK_CHECK(vkBindBufferMemory(device, m_buffer, m_deviceMem.mem, m_deviceMem.offset) );
@@ -4948,7 +4956,7 @@ VK_DESTROY
 		VK_CHECK(vkMapMemory(device, m_deviceMem.mem, m_deviceMem.offset, m_size, 0, (void**)&m_data) );
 	}
 
-	void ScratchBufferVK::createUniform(uint32_t _size, uint32_t _count)
+	void StagingScratchBufferVK::createUniform(uint32_t _size, uint32_t _count)
 	{
 		const VkPhysicalDeviceLimits& deviceLimits = s_renderVK->m_deviceProperties.properties.limits;
 		const uint32_t align = uint32_t(deviceLimits.minUniformBufferOffsetAlignment);
@@ -4956,7 +4964,7 @@ VK_DESTROY
 		create(_size, _count, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, align);
 	}
 
-	void ScratchBufferVK::createStaging(uint32_t _size)
+	void StagingScratchBufferVK::createStaging(uint32_t _size)
 	{
 		const VkPhysicalDeviceLimits& deviceLimits = s_renderVK->m_deviceProperties.properties.limits;
 		const uint32_t align = uint32_t(deviceLimits.optimalBufferCopyOffsetAlignment);
@@ -4964,7 +4972,7 @@ VK_DESTROY
 		create(_size, 1, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, align);
 	}
 
-	void ScratchBufferVK::destroy()
+	void StagingScratchBufferVK::destroy()
 	{
 		vkUnmapMemory(s_renderVK->m_device, m_deviceMem.mem);
 
@@ -4972,42 +4980,40 @@ VK_DESTROY
 		s_renderVK->recycleMemory(m_deviceMem);
 	}
 
-
-	uint32_t ScratchBufferVK::alloc(uint32_t _size, uint32_t _minAlign)
+	uint32_t StagingScratchBufferVK::alloc(uint32_t _size, uint32_t _minAlign)
 	{
 		const uint32_t align = bx::uint32_lcm(m_align, _minAlign);
-		const uint32_t dstOffset = bx::strideAlign(m_pos, align);
+		const uint32_t offset = bx::strideAlign(m_chunkPos, align);
 
-		if (dstOffset + _size <= m_size)
+		if (offset + _size <= m_size)
 		{
-			m_pos = dstOffset + _size;
-			return dstOffset;
+			m_chunkPos = offset + _size;
+			return offset;
 		}
 
 		return UINT32_MAX;
 	}
 
-	uint32_t ScratchBufferVK::write(const void* _data, uint32_t _size, uint32_t _minAlign)
+	uint32_t StagingScratchBufferVK::write(const void* _data, uint32_t _size, uint32_t _minAlign)
 	{
-		uint32_t dstOffset = alloc(_size, _minAlign);
-		BX_ASSERT(dstOffset != UINT32_MAX, "Not enough space on ScratchBuffer left to allocate %u bytes with alignment %u.", _size, _minAlign);
+		uint32_t offset = alloc(_size, _minAlign);
+		BX_ASSERT(offset != UINT32_MAX, "Not enough space on ScratchBuffer left to allocate %u bytes with alignment %u.", _size, _minAlign);
 
 		if (_size > 0)
 		{
-			bx::memCopy(&m_data[dstOffset], _data, _size);
+			bx::memCopy(&m_data[offset], _data, _size);
 		}
 
-		return dstOffset;
+		return offset;
 	}
 
-
-	void ScratchBufferVK::flush(bool _reset)
+	void StagingScratchBufferVK::flush(bool _reset)
 	{
 		const VkPhysicalDeviceLimits& deviceLimits = s_renderVK->m_deviceProperties.properties.limits;
 		VkDevice device = s_renderVK->m_device;
 
 		const uint32_t align = uint32_t(deviceLimits.nonCoherentAtomSize);
-		const uint32_t size  = bx::min(bx::strideAlign(m_pos, align), m_size);
+		const uint32_t size  = bx::min(bx::strideAlign(m_chunkPos, align), m_size);
 
 		VkMappedMemoryRange range;
 		range.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@@ -5019,8 +5025,221 @@ VK_DESTROY
 
 		if (_reset)
 		{
-			m_pos = 0;
+			m_chunkPos = 0;
+		}
+	}
+
+	void ChunkedScratchBufferVK::create(uint32_t _chunkSize, uint32_t _numChunks, VkBufferUsageFlags usage, uint32_t _align)
+	{
+		const uint32_t chunkSize = bx::alignUp(_chunkSize, 1<<20);
+
+		m_chunkPos  = 0;
+		m_chunkSize = chunkSize;
+		m_align     = _align;
+		m_usage     = usage;
+
+		m_chunkControl.m_size = 0;
+		m_chunkControl.reset();
+
+		bx::memSet(m_consume, 0, sizeof(m_consume) );
+		m_totalUsed = 0;
+
+		for (uint32_t ii = 0; ii < _numChunks; ++ii)
+		{
+			addChunk();
+		}
+	}
+
+	void ChunkedScratchBufferVK::createUniform(uint32_t _chunkSize, uint32_t _numChunks)
+	{
+		const VkPhysicalDeviceLimits& deviceLimits = s_renderVK->m_deviceProperties.properties.limits;
+		const uint32_t align = uint32_t(deviceLimits.minUniformBufferOffsetAlignment);
+
+		create(_chunkSize, _numChunks, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, align);
+	}
+
+	void ChunkedScratchBufferVK::destroy()
+	{
+		for (Chunk& sbc : m_chunks)
+		{
+			vkUnmapMemory(s_renderVK->m_device, sbc.deviceMem.mem);
+
+			s_renderVK->release(sbc.buffer);
+			s_renderVK->recycleMemory(sbc.deviceMem);
+		}
+	}
+
+	void ChunkedScratchBufferVK::addChunk(uint32_t _at)
+	{
+		const VkAllocationCallbacks* allocatorCb = s_renderVK->m_allocatorCb;
+		const VkDevice device = s_renderVK->m_device;
+
+		Chunk sbc;
+
+		VkBufferCreateInfo bci =
+		{
+			.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+			.pNext = NULL,
+			.flags = 0,
+			.size  = m_chunkSize,
+			.usage = m_usage,
+			.sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
+			.queueFamilyIndexCount = 0,
+			.pQueueFamilyIndices   = NULL,
+		};
+
+		VK_CHECK(vkCreateBuffer(
+			  device
+			, &bci
+			, allocatorCb
+			, &sbc.buffer
+			) );
+
+		VkMemoryRequirements mr;
+		vkGetBufferMemoryRequirements(
+			  device
+			, sbc.buffer
+			, &mr
+			);
+
+		VkMemoryPropertyFlags flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+		VkResult result = s_renderVK->allocateMemory(&mr, flags, &sbc.deviceMem, true);
+
+		if (VK_SUCCESS != result)
+		{
+			flags &= ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+			VK_CHECK(s_renderVK->allocateMemory(&mr, flags, &sbc.deviceMem, true) );
+		}
+
+		m_chunkSize = bx::narrowCast<uint32_t>(mr.size);
+
+		VK_CHECK(vkBindBufferMemory(device, sbc.buffer, sbc.deviceMem.mem, sbc.deviceMem.offset) );
+
+		VK_CHECK(vkMapMemory(device, sbc.deviceMem.mem, sbc.deviceMem.offset, m_chunkSize, 0, (void**)&sbc.data) );
+
+		const uint32_t lastChunk = bx::max(uint32_t(m_chunks.size()-1), 1);
+		const uint32_t at = UINT32_MAX == _at ? lastChunk : _at;
+		const uint32_t chunkIndex = at % bx::max(m_chunks.size(), 1);
+
+		m_chunkControl.resize(m_chunkSize);
+
+		m_chunks.insert(&m_chunks[chunkIndex], sbc);
+	}
+
+	ChunkedScratchBufferAlloc ChunkedScratchBufferVK::alloc(uint32_t _size)
+	{
+		BX_ASSERT(_size < m_chunkSize, "Size can't be larger than chunk size (size: %d, chunk size: %d)!", _size, m_chunkSize);
+
+		uint32_t offset     = m_chunkPos;
+		uint32_t nextOffset = offset + _size;
+		uint32_t chunkIdx   = m_chunkControl.m_write/m_chunkSize;
+
+		if (nextOffset >= m_chunkSize)
+		{
+			const uint32_t total = m_chunkSize - m_chunkPos + _size;
+			uint32_t reserved    = m_chunkControl.reserve(total, true);
+
+			if (total != reserved)
+			{
+				addChunk(chunkIdx + 1);
+				reserved = m_chunkControl.reserve(total, true);
+				BX_ASSERT(total == reserved, "Failed to reserve chunk memory after adding chunk.");
+			}
+
+			m_chunkPos = 0;
+			offset     = 0;
+			nextOffset = _size;
+			chunkIdx   = m_chunkControl.m_write/m_chunkSize;
+		}
+		else
+		{
+			const uint32_t size = m_chunkControl.reserve(_size, true);
+			BX_ASSERT(size == _size, "Failed to reserve chunk memory.");
+			BX_UNUSED(size);
+		}
+
+		m_chunkPos = nextOffset;
+
+		return { .offset = offset, .chunkIdx = chunkIdx };
+	}
+
+	void ChunkedScratchBufferVK::write(ChunkedScratchBufferOffset& _outSbo, const void* _vsData, uint32_t _vsSize, const void* _fsData, uint32_t _fsSize)
+	{
+		const uint32_t vsSize = bx::strideAlign(_vsSize, m_align);
+		const uint32_t fsSize = bx::strideAlign(_fsSize, m_align);
+		const uint32_t size   = vsSize + fsSize;
+
+		const ChunkedScratchBufferAlloc sba = alloc(size);
+
+		const uint32_t offset0 = sba.offset;
+		const uint32_t offset1 = offset0 + vsSize;
+
+		const Chunk& sbc = m_chunks[sba.chunkIdx];
+
+		_outSbo.buffer = sbc.buffer;
+		_outSbo.offsets[0] = offset0;
+		_outSbo.offsets[1] = offset1;
+
+		bx::memCopy(&sbc.data[offset0], _vsData, _vsSize);
+		bx::memCopy(&sbc.data[offset1], _fsData, _fsSize);
+	}
+
+	void ChunkedScratchBufferVK::begin()
+	{
+		BX_ASSERT(0 == m_chunkPos, "");
+		const uint32_t numConsumed = m_consume[s_renderVK->m_cmd.m_currentFrameInFlight];
+		m_chunkControl.consume(numConsumed);
+	}
+
+	void ChunkedScratchBufferVK::end()
+	{
+		uint32_t numFlush = m_chunkControl.getNumReserved();
+
+		if (0 != m_chunkPos)
+		{
+retry:
+			const uint32_t remainder = m_chunkSize - m_chunkPos;
+			const uint32_t rem = m_chunkControl.reserve(remainder, true);
+
+			if (rem != remainder)
+			{
+				const uint32_t chunkIdx = m_chunkControl.m_write/m_chunkSize;
+				addChunk(chunkIdx + 1);
+				goto retry;
+			}
+
+			m_chunkPos = 0;
+		}
+
+		const VkPhysicalDeviceLimits& deviceLimits = s_renderVK->m_deviceProperties.properties.limits;
+		const uint32_t align = uint32_t(deviceLimits.nonCoherentAtomSize);
+
+		VkDevice device = s_renderVK->m_device;
+
+		const uint32_t numReserved = m_chunkControl.getNumReserved();
+		BX_ASSERT(0 == numReserved % m_chunkSize, "Number of reserved must always be aligned to chunk size!");
+
+		const uint32_t first = m_chunkControl.m_current / m_chunkSize;
+
+		for (uint32_t ii = first, end = numReserved / m_chunkSize + first; ii < end; ++ii)
+		{
+			const Chunk& chunk = m_chunks[ii % m_chunks.size()];
+
+			VkMappedMemoryRange range;
+			range.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+			range.pNext  = NULL;
+			range.memory = chunk.deviceMem.mem;
+			range.offset = chunk.deviceMem.offset;
+			range.size   = bx::alignUp(bx::min(numFlush, m_chunkSize), align);
+			VK_CHECK(vkFlushMappedMemoryRanges(device, 1, &range) );
+
+			m_chunkControl.commit(m_chunkSize);
+			numFlush -= m_chunkSize;
 		}
+
+		m_consume[s_renderVK->m_cmd.m_currentFrameInFlight] = numReserved;
+
+		m_totalUsed = m_chunkControl.getNumUsed();
 	}
 
 	void BufferVK::create(VkCommandBuffer _commandBuffer, uint32_t _size, void* _data, uint16_t _flags, bool _vertex, uint32_t _stride)
@@ -5774,7 +5993,7 @@ VK_DESTROY
 
 	bool TimerQueryVK::update()
 	{
-		if (0 != m_control.available() )
+		if (0 != m_control.getNumUsed() )
 		{
 			uint32_t idx = m_control.m_read;
 			Query& query = m_query[idx];
@@ -5891,7 +6110,7 @@ VK_DESTROY
 	{
 		BGFX_PROFILER_SCOPE("OcclusionQueryVK::flush", kColorFrame);
 
-		if (0 < m_control.available() )
+		if (0 < m_control.getNumUsed() )
 		{
 			VkCommandBuffer commandBuffer = s_renderVK->m_commandBuffer;
 
@@ -5899,7 +6118,7 @@ VK_DESTROY
 
 			// need to copy each result individually because VK_QUERY_RESULT_WAIT_BIT causes
 			// vkWaitForFences to hang indefinitely if we copy all results (including unavailable ones)
-			for (uint32_t ii = 0, num = m_control.available(); ii < num; ++ii)
+			for (uint32_t ii = 0, num = m_control.getNumUsed(); ii < num; ++ii)
 			{
 				const OcclusionQueryHandle& handle = m_handle[(m_control.m_read + ii) % size];
 				if (isValid(handle) )
@@ -5932,7 +6151,7 @@ VK_DESTROY
 
 	void OcclusionQueryVK::resolve(Frame* _render)
 	{
-		while (0 != m_control.available() )
+		while (0 != m_control.getNumUsed() )
 		{
 			OcclusionQueryHandle handle = m_handle[m_control.m_read];
 			if (isValid(handle) )
@@ -5947,7 +6166,7 @@ VK_DESTROY
 	{
 		const uint32_t size = m_control.m_size;
 
-		for (uint32_t ii = 0, num = m_control.available(); ii < num; ++ii)
+		for (uint32_t ii = 0, num = m_control.getNumUsed(); ii < num; ++ii)
 		{
 			OcclusionQueryHandle& handle = m_handle[(m_control.m_read + ii) % size];
 			if (handle.idx == _handle.idx)
@@ -8428,7 +8647,7 @@ VK_DESTROY
 		}
 	}
 
-	VkResult CommandQueueVK::alloc(VkCommandBuffer* _commandBuffer)
+	VkResult CommandQueueVK::alloc(VkCommandBuffer* _outCommandBuffer)
 	{
 		BGFX_PROFILER_SCOPE("CommandQueueVK::alloc", kColorResource);
 
@@ -8477,9 +8696,9 @@ VK_DESTROY
 			m_currentFence = commandList.m_fence;
 		}
 
-		if (NULL != _commandBuffer)
+		if (NULL != _outCommandBuffer)
 		{
-			*_commandBuffer = m_activeCommandBuffer;
+			*_outCommandBuffer = m_activeCommandBuffer;
 		}
 
 		return result;
@@ -8594,7 +8813,7 @@ VK_DESTROY
 
 		m_consumeIndex = (m_consumeIndex + 1) % s_renderVK->m_maxFrameLatency;
 
-		for (DeviceMemoryAllocationVK &alloc : m_recycleAllocs[m_consumeIndex])
+		for (DeviceMemoryAllocationVK& alloc : m_recycleAllocs[m_consumeIndex])
 		{
 			s_renderVK->m_memoryLru.recycle(alloc);
 		}
@@ -8625,7 +8844,6 @@ VK_DESTROY
 			}
 		}
 
-
 		m_release[m_consumeIndex].clear();
 	}
 
@@ -8853,8 +9071,10 @@ VK_DESTROY
 		VkDescriptorPool& descriptorPool = m_descriptorPool[m_cmd.m_currentFrameInFlight];
 		vkResetDescriptorPool(m_device, descriptorPool, 0);
 
-		ScratchBufferVK& scratchBuffer = m_scratchBuffer[m_cmd.m_currentFrameInFlight];
-		ScratchBufferVK& scratchStagingBuffer = m_scratchStagingBuffer[m_cmd.m_currentFrameInFlight];
+		ChunkedScratchBufferVK& uniformScratchBuffer = m_uniformScratchBuffer;
+		uniformScratchBuffer.begin();
+
+		StagingScratchBufferVK& stagingScratchBuffer = m_scratchStagingBuffer[m_cmd.m_currentFrameInFlight];
 
 		setMemoryBarrier(
 			  m_commandBuffer
@@ -9205,17 +9425,18 @@ VK_DESTROY
 
 					if (VK_NULL_HANDLE != program.m_descriptorSetLayout)
 					{
-						const uint32_t vsize = program.m_vsh->m_size;
-						uint32_t numOffset = 0;
-						uint32_t offset = 0;
+						ChunkedScratchBufferOffset sbo;
+
+						const uint32_t vsSize = program.m_vsh->m_size;
+						uint32_t numOffsets = 0;
 
 						if (constantsChanged
 						||  hasPredefined)
 						{
-							if (vsize > 0)
+							if (vsSize > 0)
 							{
-								offset = scratchBuffer.write(m_vsScratch, vsize);
-								++numOffset;
+								uniformScratchBuffer.write(sbo, m_vsScratch, vsSize);
+								numOffsets = 1;
 							}
 						}
 
@@ -9223,7 +9444,8 @@ VK_DESTROY
 						hash.begin();
 						hash.add(program.m_descriptorSetLayout);
 						hash.add(renderBind.m_bind, sizeof(renderBind.m_bind) );
-						hash.add(vsize);
+						hash.add(sbo.buffer);
+						hash.add(vsSize);
 						hash.add(0);
 						const uint32_t bindHash = hash.end();
 
@@ -9234,7 +9456,7 @@ VK_DESTROY
 							currentDescriptorSet = getDescriptorSet(
 								  program
 								, renderBind
-								, scratchBuffer
+								, sbo.buffer
 								, _render->m_colorPalette
 							);
 
@@ -9248,8 +9470,8 @@ VK_DESTROY
 							, 0
 							, 1
 							, &currentDescriptorSet
-							, numOffset
-							, &offset
+							, numOffsets
+							, sbo.offsets
 							);
 					}
 
@@ -9492,31 +9714,28 @@ VK_DESTROY
 
 					if (VK_NULL_HANDLE != program.m_descriptorSetLayout)
 					{
-						const uint32_t vsize = program.m_vsh->m_size;
-						const uint32_t fsize = NULL != program.m_fsh ? program.m_fsh->m_size : 0;
-						uint32_t numOffset = 0;
-						uint32_t offsets[2] = { 0, 0 };
+						ChunkedScratchBufferOffset sbo;
 
-						if (constantsChanged
-						||  hasPredefined)
-						{
-							if (vsize > 0)
-							{
-								offsets[numOffset++] = scratchBuffer.write(m_vsScratch, vsize);
-							}
+						const uint32_t vsSize = program.m_vsh->m_size;
+						const uint32_t fsSize = NULL != program.m_fsh ? program.m_fsh->m_size : 0;
+						uint32_t numOffsets = 0;
 
-							if (fsize > 0)
-							{
-								offsets[numOffset++] = scratchBuffer.write(m_fsScratch, fsize);
-							}
+						if (true
+						&& (constantsChanged || hasPredefined)
+						&& (0 < vsSize || 0 < fsSize)
+						   )
+						{
+							uniformScratchBuffer.write(sbo, m_vsScratch, vsSize, m_fsScratch, fsSize);
+							numOffsets = (0 < vsSize) + (0 < fsSize);
 						}
 
 						bx::HashMurmur2A hash;
 						hash.begin();
 						hash.add(program.m_descriptorSetLayout);
 						hash.add(renderBind.m_bind, sizeof(renderBind.m_bind) );
-						hash.add(vsize);
-						hash.add(fsize);
+						hash.add(sbo.buffer);
+						hash.add(vsSize);
+						hash.add(fsSize);
 						const uint32_t bindHash = hash.end();
 
 						if (currentBindHash != bindHash)
@@ -9526,9 +9745,9 @@ VK_DESTROY
 							currentDescriptorSet = getDescriptorSet(
 								  program
 								, renderBind
-								, scratchBuffer
+								, sbo.buffer
 								, _render->m_colorPalette
-							);
+								);
 
 							descriptorSetCount++;
 						}
@@ -9540,8 +9759,8 @@ VK_DESTROY
 							, 0
 							, 1
 							, &currentDescriptorSet
-							, numOffset
-							, offsets
+							, numOffsets
+							, sbo.offsets
 							);
 					}
 
@@ -9754,7 +9973,7 @@ VK_DESTROY
 			maxGpuLatency = bx::uint32_imax(maxGpuLatency, result.m_pending-1);
 		}
 
-		maxGpuLatency = bx::uint32_imax(maxGpuLatency, m_gpuTimer.m_control.available()-1);
+		maxGpuLatency = bx::uint32_imax(maxGpuLatency, m_gpuTimer.m_control.getNumUsed()-1);
 
 		const int64_t timerFreq = bx::getHPFrequency();
 
@@ -9910,7 +10129,7 @@ VK_DESTROY
 				tvm.printf(10, pos++, 0x8b, "     DIB size: %7d ", _render->m_iboffset);
 
 				pos++;
-				tvm.printf(10, pos++, 0x8b, " Occlusion queries: %3d ", m_occlusionQuery.m_control.available() );
+				tvm.printf(10, pos++, 0x8b, " Occlusion queries: %3d ", m_occlusionQuery.m_control.getNumUsed() );
 
 				pos++;
 				tvm.printf(10, pos++, 0x8b, " State cache:             ");
@@ -9922,6 +10141,17 @@ VK_DESTROY
 					);
 				pos++;
 
+				{
+					char strUsed[64];
+					bx::prettify(strUsed, sizeof(strUsed), m_uniformScratchBuffer.m_totalUsed);
+
+					char strTotal[64];
+					bx::prettify(strTotal, sizeof(strTotal), m_uniformScratchBuffer.m_chunkControl.m_size);
+
+					tvm.printf(10, pos++, 0x8b, "Uniform scratch size: %s / %s.", strUsed, strTotal);
+				}
+
+				pos++;
 				double captureMs = double(captureElapsed)*toMs;
 				tvm.printf(10, pos++, 0x8b, "     Capture: %7.4f [ms] ", captureMs);
 
@@ -9952,14 +10182,11 @@ VK_DESTROY
 
 		m_presentElapsed = 0;
 
-		{
-			BGFX_PROFILER_SCOPE("scratchBuffer::flush", kColorResource);
-			scratchBuffer.flush();
-		}
+		uniformScratchBuffer.end();
 
 		{
-			BGFX_PROFILER_SCOPE("scratchStagingBuffer::flush", kColorResource);
-			scratchStagingBuffer.flush();
+			BGFX_PROFILER_SCOPE("stagingScratchBuffer::flush", kColorResource);
+			stagingScratchBuffer.flush();
 		}
 
 		for (uint16_t ii = 0; ii < m_numWindows; ++ii)
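
Taken together, the new ChunkedScratchBufferVK replaces the per-frame ScratchBufferVK array with one ring of persistently mapped chunks: alloc() bumps m_chunkPos inside the current chunk and wraps to the next chunk when it runs out (growing the ring via addChunk() if every chunk is still in flight), write() packs the aligned vertex- and fragment-shader uniform blocks back to back and records their offsets, and begin()/end() drive the bx::RingBufferControl plus the per-frame m_consume[] counters so a chunk is only reused once its frame in flight has retired. Below is a minimal, CPU-only sketch of the allocation arithmetic; it is a simplified model (no Vulkan buffers, no fence tracking, chunks are plain byte arrays), not the code from this commit.

	// Simplified model of the chunked scratch allocator (assumption: no GPU fences,
	// a chunk is free for reuse as soon as the ring wraps back to it).
	#include <cassert>
	#include <cstdint>
	#include <cstdio>
	#include <cstring>
	#include <vector>

	struct Alloc { uint32_t chunkIdx, offset; };

	struct ChunkedScratch
	{
		ChunkedScratch(uint32_t _chunkSize, uint32_t _numChunks, uint32_t _align)
			: m_chunkSize(_chunkSize), m_align(_align), m_chunkPos(0), m_chunkIdx(0)
		{
			m_chunks.resize(_numChunks, std::vector<uint8_t>(_chunkSize) );
		}

		static uint32_t strideAlign(uint32_t _pos, uint32_t _align)
		{
			return (_pos + _align - 1) / _align * _align;
		}

		Alloc alloc(uint32_t _size)
		{
			assert(_size < m_chunkSize && "Size can't be larger than chunk size!");

			if (m_chunkPos + _size >= m_chunkSize)
			{
				// Current chunk can't hold the request: move to the next chunk.
				// The real code reserves the chunk remainder in a bx::RingBufferControl
				// and calls addChunk() when all chunks are still in flight.
				m_chunkIdx = uint32_t( (m_chunkIdx + 1) % m_chunks.size() );
				m_chunkPos = 0;
			}

			const Alloc result = { m_chunkIdx, m_chunkPos };
			m_chunkPos += _size;
			return result;
		}

		// Packs the VS block followed by the FS block, both aligned to m_align, and
		// returns the two offsets that become the dynamic uniform buffer offsets.
		Alloc write(uint32_t _outOffsets[2], const void* _vsData, uint32_t _vsSize, const void* _fsData, uint32_t _fsSize)
		{
			const uint32_t vsSize = strideAlign(_vsSize, m_align);
			const uint32_t fsSize = strideAlign(_fsSize, m_align);

			const Alloc aa = alloc(vsSize + fsSize);
			_outOffsets[0] = aa.offset;
			_outOffsets[1] = aa.offset + vsSize;

			std::memcpy(&m_chunks[aa.chunkIdx][_outOffsets[0] ], _vsData, _vsSize);
			if (NULL != _fsData)
			{
				std::memcpy(&m_chunks[aa.chunkIdx][_outOffsets[1] ], _fsData, _fsSize);
			}

			return aa;
		}

		std::vector<std::vector<uint8_t> > m_chunks;
		uint32_t m_chunkSize, m_align, m_chunkPos, m_chunkIdx;
	};

	int main()
	{
		ChunkedScratch scratch(1u<<20, 2, 256); // 1 MiB chunks, minUniformBufferOffsetAlignment = 256

		float vs[64] = {}, fs[16] = {};

		for (uint32_t draw = 0; draw < 10000; ++draw)
		{
			uint32_t offsets[2];
			scratch.write(offsets, vs, sizeof(vs), fs, sizeof(fs) );
			assert(offsets[1] - offsets[0] == 256); // 64 floats rounded up to one 256-byte slot
		}

		std::printf("final chunk %u, pos %u\n", scratch.m_chunkIdx, scratch.m_chunkPos);
		return 0;
	}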

src/renderer_vk.h  (+58, -8)

@@ -429,14 +429,13 @@ VK_DESTROY_FUNC(DescriptorSet);
 		bool     m_isFromScratch;
 	};
 
-	class ScratchBufferVK
+	struct StagingScratchBufferVK
 	{
-	public:
-		ScratchBufferVK()
+		StagingScratchBufferVK()
 		{
 		}
 
-		~ScratchBufferVK()
+		~StagingScratchBufferVK()
 		{
 		}
 
@@ -444,7 +443,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		void createUniform(uint32_t _size, uint32_t _count);
 		void createStaging(uint32_t _size);
 		void destroy();
-		uint32_t alloc(uint32_t _size, uint32_t _minAlign = 1);
+		uint32_t alloc(uint32_t _size, uint32_t _minAlign);
 		uint32_t write(const void* _data, uint32_t _size, uint32_t _minAlign = 1);
 		void flush(bool _reset = true);
 
@@ -453,8 +452,60 @@ VK_DESTROY_FUNC(DescriptorSet);
 
 		uint8_t* m_data;
 		uint32_t m_size;
-		uint32_t m_pos;
+		uint32_t m_chunkPos;
+		uint32_t m_align;
+	};
+
+	struct ChunkedScratchBufferOffset
+	{
+		VkBuffer buffer;
+		uint32_t offsets[2];
+	};
+
+	struct ChunkedScratchBufferAlloc
+	{
+		uint32_t offset;
+		uint32_t chunkIdx;
+	};
+
+	struct ChunkedScratchBufferVK
+	{
+		ChunkedScratchBufferVK()
+			: m_chunkControl(0)
+		{
+		}
+
+		void create(uint32_t _chunkSize, uint32_t _numChunks, VkBufferUsageFlags usage, uint32_t _align);
+		void createUniform(uint32_t _chunkSize, uint32_t _numChunks);
+		void destroy();
+
+		void addChunk(uint32_t _at = UINT32_MAX);
+		ChunkedScratchBufferAlloc alloc(uint32_t _size);
+
+		void write(ChunkedScratchBufferOffset& _outSbo, const void* _vsData, uint32_t _vsSize, const void* _fsData = NULL, uint32_t _fsSize = 0);
+
+		void begin();
+		void end();
+
+		struct Chunk
+		{
+			VkBuffer buffer;
+			DeviceMemoryAllocationVK deviceMem;
+			uint8_t* data;
+		};
+
+		using ScratchBufferChunksArray = stl::vector<Chunk>;
+
+		ScratchBufferChunksArray m_chunks;
+		bx::RingBufferControl m_chunkControl;
+
+		uint32_t m_chunkPos;
+		uint32_t m_chunkSize;
 		uint32_t m_align;
+		VkBufferUsageFlags m_usage;
+
+		uint32_t m_consume[BGFX_CONFIG_MAX_FRAME_LATENCY];
+		uint32_t m_totalUsed;
 	};
 
 	struct BufferVK
@@ -886,7 +937,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		VkResult reset();
 		void shutdown();
 
-		VkResult alloc(VkCommandBuffer* _commandBuffer);
+		VkResult alloc(VkCommandBuffer* _outCommandBuffer);
 		void addWaitSemaphore(VkSemaphore _semaphore, VkPipelineStageFlags _waitFlags = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
 		void addSignalSemaphore(VkSemaphore _semaphore);
 		void kick(bool _wait = false);
@@ -934,7 +985,6 @@ VK_DESTROY_FUNC(DescriptorSet);
 		ResourceArray m_release[BGFX_CONFIG_MAX_FRAME_LATENCY];
 		stl::vector<DeviceMemoryAllocationVK> m_recycleAllocs[BGFX_CONFIG_MAX_FRAME_LATENCY];
 
-
 	private:
 		template<typename Ty>
 		void destroy(uint64_t _handle)
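
The two offsets stored in ChunkedScratchBufferOffset end up as dynamic offsets for VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC bindings, as the render-loop hunks in renderer_vk.cpp above show. A hedged sketch of that bind call in isolation; everything except the Vulkan entry point and enums is a placeholder name:

	#include <vulkan/vulkan.h>

	// Sketch only: binds one descriptor set whose VS/FS uniform bindings are
	// VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, using the per-draw offsets that
	// ChunkedScratchBufferVK::write() produced. All handles are assumed to have
	// been created elsewhere.
	void bindUniformScratch(
		  VkCommandBuffer _commandBuffer
		, VkPipelineLayout _pipelineLayout
		, VkDescriptorSet _descriptorSet
		, const uint32_t _dynamicOffsets[2] // { vsOffset, fsOffset } within the chunk's VkBuffer
		, uint32_t _numOffsets              // 1 if the program has no fragment-shader uniforms
		)
	{
		vkCmdBindDescriptorSets(
			  _commandBuffer
			, VK_PIPELINE_BIND_POINT_GRAPHICS
			, _pipelineLayout
			, 0                // firstSet
			, 1                // descriptorSetCount
			, &_descriptorSet
			, _numOffsets      // dynamicOffsetCount
			, _dynamicOffsets  // pDynamicOffsets
			);
	}

Because the descriptor set itself only records the chunk's VkBuffer and the per-stage ranges, a set can be reused across consecutive draws whose bindings hash to the same value; only the two dynamic offsets change per draw.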