
Vulkan memory allocation LRU cache (#3422)

* First version of a memory allocation cache for Vulkan.

* Increase maximum number of cached memory allocations to 1024. Add a configuration parameter.
Martijn Courteaux · 9 months ago · commit 0e734522cd
3 changed files with 266 additions and 80 deletions
  1. src/config.h (+8 -0)
  2. src/renderer_vk.cpp (+201 -64)
  3. src/renderer_vk.h (+57 -16)

+ 8 - 0
src/config.h

@@ -333,6 +333,14 @@ static_assert(bx::isPowerOf2(BGFX_CONFIG_MAX_VIEWS), "BGFX_CONFIG_MAX_VIEWS must
 #   define BGFX_CONFIG_PER_FRAME_SCRATCH_STAGING_BUFFER_SIZE (32<<20)
 #endif
 
+#ifndef BGFX_CONFIG_MAX_BYTES_CACHED_DEVICE_MEMORY_ALLOCATIONS
+/// Maximum number of bytes of device memory allocations kept around for recycling
+/// by later allocations. This can be beneficial when the driver is slow at
+/// allocating memory on the device.
+/// Note: Currently only used by the Vulkan backend.
+#	define BGFX_CONFIG_MAX_BYTES_CACHED_DEVICE_MEMORY_ALLOCATIONS (128 << 20)
+#endif
+
 #ifndef BGFX_CONFIG_MAX_STAGING_SIZE_FOR_SCRATCH_BUFFER
 /// The threshold of data size above which the staging scratch buffer will
 /// not be used, but instead a separate device memory allocation will take

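Since the macro is wrapped in #ifndef, the 128 MiB default can be overridden at build time like any other BGFX_CONFIG_* value. A minimal sketch, with the 64 MiB figure purely illustrative:

    // Hypothetical override: cap the Vulkan device-memory cache at 64 MiB by
    // defining the macro before bgfx's config.h is evaluated, e.g. on the
    // compiler command line:
    //
    //   -DBGFX_CONFIG_MAX_BYTES_CACHED_DEVICE_MEMORY_ALLOCATIONS="(64 << 20)"
    //
    // Setting the budget to 0 effectively disables caching: the eviction loop in
    // MemoryLruVK::recycle() releases entries until totalSizeCached is back under
    // the budget, so every recycled allocation is released right away.
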
+ 201 - 64
src/renderer_vk.cpp

@@ -1136,13 +1136,14 @@ VK_IMPORT_DEVICE
 	{
 		RendererContextVK()
 			: m_allocatorCb(NULL)
+			, m_memoryLru()
 			, m_renderDocDll(NULL)
 			, m_vulkan1Dll(NULL)
 			, m_maxAnisotropy(1.0f)
 			, m_depthClamp(false)
 			, m_wireframe(false)
 			, m_captureBuffer(VK_NULL_HANDLE)
-			, m_captureMemory(VK_NULL_HANDLE)
+			, m_captureMemory()
 			, m_captureSize(0)
 		{
 		}
@@ -2184,6 +2185,8 @@ VK_IMPORT_DEVICE
 
 			m_backBuffer.destroy();
 
+			m_memoryLru.evictAll();
+
 			m_cmd.shutdown();
 
 			vkDestroy(m_pipelineCache);
@@ -2346,7 +2349,7 @@ VK_IMPORT_DEVICE
 			uint32_t pitch  = texture.m_readback.pitch(_mip);
 			uint32_t size = height * pitch;
 
-			VkDeviceMemory stagingMemory;
+			DeviceMemoryAllocationVK stagingMemory;
 			VkBuffer stagingBuffer;
 			VK_CHECK(createReadbackBuffer(size, &stagingBuffer, &stagingMemory) );
 
@@ -2360,10 +2363,10 @@ VK_IMPORT_DEVICE
 
 			kick(true);
 
-			texture.m_readback.readback(stagingMemory, 0, _data, _mip);
+			texture.m_readback.readback(stagingMemory.mem, stagingMemory.offset, _data, _mip);
 
 			vkDestroy(stagingBuffer);
-			vkDestroy(stagingMemory);
+			recycleMemory(stagingMemory);
 		}
 
 		void resizeTexture(TextureHandle _handle, uint16_t _width, uint16_t _height, uint8_t _numMips, uint16_t _numLayers) override
@@ -2511,14 +2514,14 @@ VK_IMPORT_DEVICE
 			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(swapChain.m_colorFormat) );
 			const uint32_t size = frameBuffer.m_width * frameBuffer.m_height * bpp / 8;
 
-			VkDeviceMemory stagingMemory;
+			DeviceMemoryAllocationVK stagingMemory;
 			VkBuffer stagingBuffer;
 			VK_CHECK(createReadbackBuffer(size, &stagingBuffer, &stagingMemory) );
 
 			readSwapChain(swapChain, stagingBuffer, stagingMemory, callback, _filePath);
 
 			vkDestroy(stagingBuffer);
-			vkDestroy(stagingMemory);
+			recycleMemory(stagingMemory);
 		}
 
 		void updateViewName(ViewId _id, const char* _name) override
@@ -2601,6 +2604,11 @@ VK_IMPORT_DEVICE
 			}
 		}
 
+		void recycleMemory(DeviceMemoryAllocationVK _alloc)
+		{
+			m_cmd.recycleMemory(_alloc);
+		}
+
 		void submitBlit(BlitState& _bs, uint16_t _view);
 
 		void submit(Frame* _render, ClearQuad& _clearQuad, TextVideoMemBlitter& _textVideoMemBlitter) override;
@@ -2737,7 +2745,7 @@ VK_IMPORT_DEVICE
 				g_callback->captureEnd();
 
 				release(m_captureBuffer);
-				release(m_captureMemory);
+				recycleMemory(m_captureMemory);
 				m_captureSize = 0;
 			}
 		}
@@ -2760,7 +2768,7 @@ VK_IMPORT_DEVICE
 				if (captureSize > m_captureSize)
 				{
 					release(m_captureBuffer);
-					release(m_captureMemory);
+					recycleMemory(m_captureMemory);
 
 					m_captureSize = captureSize;
 					VK_CHECK(createReadbackBuffer(m_captureSize, &m_captureBuffer, &m_captureMemory) );
@@ -4059,7 +4067,7 @@ VK_IMPORT_DEVICE
 
 		typedef void (*SwapChainReadFunc)(void* /*src*/, uint32_t /*width*/, uint32_t /*height*/, uint32_t /*pitch*/, const void* /*userData*/);
 
-		bool readSwapChain(const SwapChainVK& _swapChain, VkBuffer _buffer, VkDeviceMemory _memory, SwapChainReadFunc _func, const void* _userData = NULL)
+		bool readSwapChain(const SwapChainVK& _swapChain, VkBuffer _buffer, DeviceMemoryAllocationVK _memory, SwapChainReadFunc _func, const void* _userData = NULL)
 		{
 			if (isSwapChainReadable(_swapChain) )
 			{
@@ -4080,7 +4088,7 @@ VK_IMPORT_DEVICE
 				kick(true);
 
 				uint8_t* src;
-				VK_CHECK(vkMapMemory(m_device, _memory, 0, VK_WHOLE_SIZE, 0, (void**)&src) );
+				VK_CHECK(vkMapMemory(m_device, _memory.mem, _memory.offset, _memory.size, 0, (void**)&src) );
 
 				if (_swapChain.m_colorFormat == TextureFormat::RGBA8)
 				{
@@ -4106,7 +4114,7 @@ VK_IMPORT_DEVICE
 					bx::free(g_allocator, dst);
 				}
 
-				vkUnmapMemory(m_device, _memory);
+				vkUnmapMemory(m_device, _memory.mem);
 
 				readback.destroy();
 
@@ -4354,9 +4362,31 @@ VK_IMPORT_DEVICE
 			return -1;
 		}
 
-		VkResult allocateMemory(const VkMemoryRequirements* requirements, VkMemoryPropertyFlags propertyFlags, ::VkDeviceMemory* memory) const
+		VkResult allocateMemory(const VkMemoryRequirements* requirements, VkMemoryPropertyFlags propertyFlags, DeviceMemoryAllocationVK* memory, bool _forcePrivateDeviceAllocation)
 		{
 			BGFX_PROFILER_SCOPE("RendererContextVK::allocateMemory", kColorResource);
+
+			// Forcing the use of a private device allocation for a certain memory allocation
+			// can be desirable when memory-mapping the allocation. A memory allocation can
+			// only be mapped once, so handing out multiple subregions of one bigger
+			// allocation can lead to problems when they get mapped multiple times.
+			// Right now, with the LRU system, we still only hand out the full memory
+			// allocation and never subregions of it, so it is currently impossible to map
+			// a single allocation multiple times.
+			// The argument is there to indicate this, but it is ignored right now for the
+			// above reason: any cached memory is fine, as long as we don't partition it.
+			BX_UNUSED(_forcePrivateDeviceAllocation);
+			{
+				// Check LRU cache.
+				int memoryType = selectMemoryType(requirements->memoryTypeBits, propertyFlags, 0);
+				bool found = m_memoryLru.find(requirements->size, memoryType, memory);
+				if (found)
+				{
+					return VK_SUCCESS;
+				}
+			}
+
+
 			VkMemoryAllocateInfo ma;
 			ma.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
 			ma.pNext = NULL;
@@ -4371,8 +4401,12 @@ VK_IMPORT_DEVICE
 
 				if (searchIndex >= 0)
 				{
+					BGFX_PROFILER_SCOPE("vkAllocateMemory", kColorResource);
 					ma.memoryTypeIndex = searchIndex;
-					result = vkAllocateMemory(m_device, &ma, m_allocatorCb, memory);
+					memory->memoryTypeIndex = searchIndex;
+					memory->size = ma.allocationSize;
+					memory->offset = 0;
+					result = vkAllocateMemory(m_device, &ma, m_allocatorCb, &memory->mem);
 				}
 			}
 			while (result != VK_SUCCESS
@@ -4381,7 +4415,7 @@ VK_IMPORT_DEVICE
 			return result;
 		}
 
-		VkResult createHostBuffer(uint32_t _size, VkMemoryPropertyFlags _flags, ::VkBuffer* _buffer, ::VkDeviceMemory* _memory, const void* _data = NULL)
+		VkResult createHostBuffer(uint32_t _size, VkMemoryPropertyFlags _flags, ::VkBuffer* _buffer, DeviceMemoryAllocationVK* _memory, bool _forcePrivateDeviceAllocation, const void* _data = NULL)
 		{
 			BGFX_PROFILER_SCOPE("createHostBuffer", kColorResource);
 			VkResult result = VK_SUCCESS;
@@ -4406,12 +4440,12 @@ VK_IMPORT_DEVICE
 			VkMemoryRequirements mr;
 			vkGetBufferMemoryRequirements(m_device, *_buffer, &mr);
 
-			result = allocateMemory(&mr, _flags, _memory);
+			result = allocateMemory(&mr, _flags, _memory, _forcePrivateDeviceAllocation);
 
 			if (VK_SUCCESS != result
 			&&  (_flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) )
 			{
-				result = allocateMemory(&mr, _flags & ~VK_MEMORY_PROPERTY_HOST_CACHED_BIT, _memory);
+				result = allocateMemory(&mr, _flags & ~VK_MEMORY_PROPERTY_HOST_CACHED_BIT, _memory, _forcePrivateDeviceAllocation);
 			}
 
 			if (VK_SUCCESS != result)
@@ -4420,7 +4454,7 @@ VK_IMPORT_DEVICE
 				return result;
 			}
 
-			result = vkBindBufferMemory(m_device, *_buffer, *_memory, 0);
+			result = vkBindBufferMemory(m_device, *_buffer, _memory->mem, _memory->offset);
 			if (VK_SUCCESS != result)
 			{
 				BX_TRACE("Create host buffer error: vkBindBufferMemory failed %d: %s.", result, getName(result) );
@@ -4431,7 +4465,7 @@ VK_IMPORT_DEVICE
 			{
 				BGFX_PROFILER_SCOPE("map and copy data", kColorResource);
 				void* dst;
-				result = vkMapMemory(m_device, *_memory, 0, _size, 0, &dst);
+				result = vkMapMemory(m_device, _memory->mem, _memory->offset, _size, 0, &dst);
 				if (VK_SUCCESS != result)
 				{
 					BX_TRACE("Create host buffer error: vkMapMemory failed %d: %s.", result, getName(result) );
@@ -4439,19 +4473,19 @@ VK_IMPORT_DEVICE
 				}
 
 				bx::memCopy(dst, _data, _size);
-				vkUnmapMemory(m_device, *_memory);
+				vkUnmapMemory(m_device, _memory->mem);
 			}
 
 			return result;
 		}
 
-		VkResult createStagingBuffer(uint32_t _size, ::VkBuffer* _buffer, ::VkDeviceMemory* _memory, const void* _data = NULL)
+		VkResult createStagingBuffer(uint32_t _size, ::VkBuffer* _buffer, DeviceMemoryAllocationVK* _memory, const void* _data = NULL)
 		{
 			const VkMemoryPropertyFlags flags = 0
 				| VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
 				| VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
 				;
-			return createHostBuffer(_size, flags, _buffer, _memory, _data);
+			return createHostBuffer(_size, flags, _buffer, _memory, false, _data);
 		}
 
 		StagingBufferVK allocFromScratchStagingBuffer(uint32_t _size, uint32_t _align, const void* _data = NULL)
@@ -4467,12 +4501,12 @@ VK_IMPORT_DEVICE
 
 				if (UINT32_MAX != scratchOffset)
 				{
-					result.m_isFromScratch  = true;
-					result.m_size      = _size;
-					result.m_offset    = scratchOffset;
-					result.m_buffer    = scratch.m_buffer;
+					result.m_isFromScratch = true;
 					result.m_deviceMem = scratch.m_deviceMem;
-					result.m_data      = scratch.m_data + result.m_offset;
+					result.m_size   = _size;
+					result.m_offset = scratchOffset;
+					result.m_buffer = scratch.m_buffer;
+					result.m_data   = scratch.m_data + result.m_offset;
 
 					if (_data != NULL)
 					{
@@ -4485,18 +4519,17 @@ VK_IMPORT_DEVICE
 			}
 
 			// Not enough space or too big, we will create a new staging buffer on the spot.
-			result.m_isFromScratch = false;
-
 			VK_CHECK(createStagingBuffer(_size, &result.m_buffer, &result.m_deviceMem, _data));
 
-			result.m_size   = _size;
-			result.m_offset = 0;
-			result.m_data   = NULL;
+			result.m_isFromScratch = false;
+			result.m_offset        = 0;
+			result.m_size          = _size;
+			result.m_data          = NULL;
 
 			return result;
 		}
 
-		VkResult createReadbackBuffer(uint32_t _size, ::VkBuffer* _buffer, ::VkDeviceMemory* _memory)
+		VkResult createReadbackBuffer(uint32_t _size, ::VkBuffer* _buffer, DeviceMemoryAllocationVK* _memory)
 		{
 			const VkMemoryPropertyFlags flags = 0
 				| VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
@@ -4504,7 +4537,7 @@ VK_IMPORT_DEVICE
 				| VK_MEMORY_PROPERTY_HOST_CACHED_BIT
 				;
 
-			return createHostBuffer(_size, flags, _buffer, _memory, NULL);
+			return createHostBuffer(_size, flags, _buffer, _memory, true, NULL);
 		}
 
 		VkAllocationCallbacks*   m_allocatorCb;
@@ -4528,6 +4561,8 @@ VK_IMPORT_DEVICE
 		FrameBufferHandle m_windows[BGFX_CONFIG_MAX_FRAME_BUFFERS];
 		int64_t m_presentElapsed;
 
+		MemoryLruVK m_memoryLru;
+
 		ScratchBufferVK m_scratchBuffer[BGFX_CONFIG_MAX_FRAME_LATENCY];
 		ScratchBufferVK m_scratchStagingBuffer[BGFX_CONFIG_MAX_FRAME_LATENCY];
 
@@ -4572,7 +4607,7 @@ VK_IMPORT_DEVICE
 		bool m_wireframe;
 
 		VkBuffer m_captureBuffer;
-		VkDeviceMemory m_captureMemory;
+		DeviceMemoryAllocationVK m_captureMemory;
 		uint32_t m_captureSize;
 
 		TextVideoMem m_textVideoMem;
@@ -4665,6 +4700,96 @@ VK_DESTROY
 		s_renderVK->release(_obj);
 	}
 
+	void MemoryLruVK::recycle(DeviceMemoryAllocationVK &_alloc)
+	{
+		if (MAX_ENTRIES == lru.getNumHandles())
+		{
+			// Evict LRU
+			uint16_t handle = lru.getBack();
+			DeviceMemoryAllocationVK &alloc = entries[handle];
+			totalSizeCached -= alloc.size;
+			release(alloc.mem);
+
+			// Touch slot and overwrite
+			lru.touch(handle);
+			alloc = _alloc;
+		} else
+		{
+			uint16_t handle = lru.alloc();
+			entries[handle] = _alloc;
+		}
+		totalSizeCached += _alloc.size;
+
+		while (totalSizeCached > BGFX_CONFIG_MAX_BYTES_CACHED_DEVICE_MEMORY_ALLOCATIONS)
+		{
+			BX_ASSERT(lru.getNumHandles() > 0, "Memory badly counted.");
+			uint16_t handle = lru.getBack();
+			DeviceMemoryAllocationVK &alloc = entries[handle];
+			totalSizeCached -= alloc.size;
+			release(alloc.mem);
+			lru.free(handle);
+		}
+	}
+
+	bool MemoryLruVK::find(uint32_t _size, int32_t _memoryTypeIndex, DeviceMemoryAllocationVK *_alloc)
+	{
+		BGFX_PROFILER_SCOPE("MemoryLruVK::find", kColorResource);
+		// Find best fit.
+		uint16_t slot;
+		{
+			int16_t bestIdx = MAX_ENTRIES;
+			uint32_t bestWaste = 0xffff'ffff;
+			slot = lru.getFront();
+			while (UINT16_MAX != slot)
+			{
+				DeviceMemoryAllocationVK &alloc = entries[slot];
+				if (alloc.memoryTypeIndex == _memoryTypeIndex)
+				{
+					// 50% waste allowed, otherwise we'll just allocate a new one.
+					// This is to prevent a handful of tiny allocations from trashing
+					// this cache of useful allocations.
+					if (alloc.size >= _size && _size * 2 >= alloc.size)
+					{
+						uint32_t waste = alloc.size - _size;
+						if (waste < bestWaste)
+						{
+							bestIdx = slot;
+							bestWaste = waste;
+							if (waste == 0)
+							{
+								break;
+							}
+						}
+					}
+				}
+				slot = lru.getNext(slot);
+			}
+			slot = bestIdx;
+		}
+
+		if (MAX_ENTRIES != slot)
+		{
+			*_alloc = entries[slot];
+			lru.free(slot);
+			totalSizeCached -= _alloc->size;
+			return true;
+		} else {
+			return false;
+		}
+	}
+
+	void MemoryLruVK::evictAll()
+	{
+		uint16_t slot = lru.getFront();
+		while (slot != UINT16_MAX)
+		{
+			release(entries[slot].mem);
+			slot = lru.getNext(slot);
+		}
+		lru.reset();
+		totalSizeCached = 0;
+	}
+
 	void ScratchBufferVK::create(uint32_t _size, uint32_t _count, VkBufferUsageFlags usage, uint32_t _align)
 	{
 		const VkAllocationCallbacks* allocatorCb = s_renderVK->m_allocatorCb;
@@ -4698,21 +4823,21 @@ VK_DESTROY
 			);
 
 		VkMemoryPropertyFlags flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
-		VkResult result = s_renderVK->allocateMemory(&mr, flags, &m_deviceMem);
+		VkResult result = s_renderVK->allocateMemory(&mr, flags, &m_deviceMem, true);
 
 		if (VK_SUCCESS != result)
 		{
 			flags &= ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
-			VK_CHECK(s_renderVK->allocateMemory(&mr, flags, &m_deviceMem) );
+			VK_CHECK(s_renderVK->allocateMemory(&mr, flags, &m_deviceMem, true) );
 		}
 
 		m_size = (uint32_t)mr.size;
 		m_pos  = 0;
 		m_align = _align;
 
-		VK_CHECK(vkBindBufferMemory(device, m_buffer, m_deviceMem, 0) );
+		VK_CHECK(vkBindBufferMemory(device, m_buffer, m_deviceMem.mem, m_deviceMem.offset) );
 
-		VK_CHECK(vkMapMemory(device, m_deviceMem, 0, m_size, 0, (void**)&m_data) );
+		VK_CHECK(vkMapMemory(device, m_deviceMem.mem, m_deviceMem.offset, m_size, 0, (void**)&m_data) );
 	}
 
 	void ScratchBufferVK::createUniform(uint32_t _size, uint32_t _count)
@@ -4733,10 +4858,10 @@ VK_DESTROY
 
 	void ScratchBufferVK::destroy()
 	{
-		vkUnmapMemory(s_renderVK->m_device, m_deviceMem);
+		vkUnmapMemory(s_renderVK->m_device, m_deviceMem.mem);
 
 		s_renderVK->release(m_buffer);
-		s_renderVK->release(m_deviceMem);
+		s_renderVK->recycleMemory(m_deviceMem);
 	}
 
 
@@ -4779,8 +4904,8 @@ VK_DESTROY
 		VkMappedMemoryRange range;
 		range.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
 		range.pNext  = NULL;
-		range.memory = m_deviceMem;
-		range.offset = 0;
+		range.memory = m_deviceMem.mem;
+		range.offset = m_deviceMem.offset;
 		range.size   = size;
 		VK_CHECK(vkFlushMappedMemoryRanges(device, 1, &range) );
 
@@ -4823,9 +4948,9 @@ VK_DESTROY
 		VkMemoryRequirements mr;
 		vkGetBufferMemoryRequirements(device, m_buffer, &mr);
 
-		VK_CHECK(s_renderVK->allocateMemory(&mr, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, &m_deviceMem) );
+		VK_CHECK(s_renderVK->allocateMemory(&mr, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, &m_deviceMem, false) );
 
-		VK_CHECK(vkBindBufferMemory(device, m_buffer, m_deviceMem, 0) );
+		VK_CHECK(vkBindBufferMemory(device, m_buffer, m_deviceMem.mem, m_deviceMem.offset) );
 
 		if (!m_dynamic)
 		{
@@ -4855,7 +4980,7 @@ VK_DESTROY
 		if (!stagingBuffer.m_isFromScratch)
 		{
 			s_renderVK->release(stagingBuffer.m_buffer);
-			s_renderVK->release(stagingBuffer.m_deviceMem);
+			s_renderVK->recycleMemory(stagingBuffer.m_deviceMem);
 		}
 	}
 
@@ -4864,7 +4989,7 @@ VK_DESTROY
 		if (VK_NULL_HANDLE != m_buffer)
 		{
 			s_renderVK->release(m_buffer);
-			s_renderVK->release(m_deviceMem);
+			s_renderVK->recycleMemory(m_deviceMem);
 
 			m_dynamic = false;
 		}
@@ -5451,7 +5576,7 @@ VK_DESTROY
 			return result;
 		}
 
-		result = vkMapMemory(device, m_readbackMemory, 0, VK_WHOLE_SIZE, 0, (void**)&m_queryResult);
+		result = vkMapMemory(device, m_readbackMemory.mem, m_readbackMemory.offset, VK_WHOLE_SIZE, 0, (void**)&m_queryResult);
 
 		if (VK_SUCCESS != result)
 		{
@@ -5475,8 +5600,8 @@ VK_DESTROY
 	{
 		vkDestroy(m_queryPool);
 		vkDestroy(m_readback);
-		vkUnmapMemory(s_renderVK->m_device, m_readbackMemory);
-		vkDestroy(m_readbackMemory);
+		vkUnmapMemory(s_renderVK->m_device, m_readbackMemory.mem);
+		s_renderVK->recycleMemory(m_readbackMemory);
 	}
 
 	uint32_t TimerQueryVK::begin(uint32_t _resultIdx, uint32_t _frameNum)
@@ -5606,7 +5731,7 @@ VK_DESTROY
 			return result;
 		}
 
-		result = vkMapMemory(device, m_readbackMemory, 0, VK_WHOLE_SIZE, 0, (void**)&m_queryResult);
+		result = vkMapMemory(device, m_readbackMemory.mem, m_readbackMemory.offset, VK_WHOLE_SIZE, 0, (void**)&m_queryResult);
 
 		if (VK_SUCCESS != result)
 		{
@@ -5623,8 +5748,8 @@ VK_DESTROY
 	{
 		vkDestroy(m_queryPool);
 		vkDestroy(m_readback);
-		vkUnmapMemory(s_renderVK->m_device, m_readbackMemory);
-		vkDestroy(m_readbackMemory);
+		vkUnmapMemory(s_renderVK->m_device, m_readbackMemory.mem);
+		s_renderVK->recycleMemory(m_readbackMemory);
 	}
 
 	void OcclusionQueryVK::begin(OcclusionQueryHandle _handle)
@@ -5924,14 +6049,14 @@ VK_DESTROY
 		VkMemoryRequirements imageMemReq;
 		vkGetImageMemoryRequirements(device, m_textureImage, &imageMemReq);
 
-		result = s_renderVK->allocateMemory(&imageMemReq, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, &m_textureDeviceMem);
+		result = s_renderVK->allocateMemory(&imageMemReq, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, &m_textureDeviceMem, false);
 		if (VK_SUCCESS != result)
 		{
 			BX_TRACE("Create texture image error: allocateMemory failed %d: %s.", result, getName(result) );
 			return result;
 		}
 
-		result = vkBindImageMemory(device, m_textureImage, m_textureDeviceMem, 0);
+		result = vkBindImageMemory(device, m_textureImage, m_textureDeviceMem.mem, m_textureDeviceMem.offset);
 		if (VK_SUCCESS != result)
 		{
 			BX_TRACE("Create texture image error: vkBindImageMemory failed %d: %s.", result, getName(result) );
@@ -5967,14 +6092,14 @@ VK_DESTROY
 			VkMemoryRequirements imageMemReq_resolve;
 			vkGetImageMemoryRequirements(device, m_singleMsaaImage, &imageMemReq_resolve);
 
-			result = s_renderVK->allocateMemory(&imageMemReq_resolve, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, &m_singleMsaaDeviceMem);
+			result = s_renderVK->allocateMemory(&imageMemReq_resolve, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, &m_singleMsaaDeviceMem, false);
 			if (VK_SUCCESS != result)
 			{
 				BX_TRACE("Create texture image error: allocateMemory failed %d: %s.", result, getName(result) );
 				return result;
 			}
 
-			result = vkBindImageMemory(device, m_singleMsaaImage, m_singleMsaaDeviceMem, 0);
+			result = vkBindImageMemory(device, m_singleMsaaImage, m_singleMsaaDeviceMem.mem, m_singleMsaaDeviceMem.offset);
 			if (VK_SUCCESS != result)
 			{
 				BX_TRACE("Create texture image error: vkBindImageMemory failed %d: %s.", result, getName(result) );
@@ -6226,8 +6351,8 @@ VK_DESTROY
 				{
 					VK_CHECK(vkMapMemory(
 						  device
-						, stagingBuffer.m_deviceMem
-						, 0
+						, stagingBuffer.m_deviceMem.mem
+						, stagingBuffer.m_deviceMem.offset
 						, totalMemSize
 						, 0
 						, (void**)&mappedMemory
@@ -6253,7 +6378,7 @@ VK_DESTROY
 
 				if (!stagingBuffer.m_isFromScratch)
 				{
-					vkUnmapMemory(device, stagingBuffer.m_deviceMem);
+					vkUnmapMemory(device, stagingBuffer.m_deviceMem.mem);
 				}
 
 				copyBufferToTexture(_commandBuffer, stagingBuffer.m_buffer, numSrd, bufferCopyInfo);
@@ -6261,7 +6386,7 @@ VK_DESTROY
 				if (!stagingBuffer.m_isFromScratch)
 				{
 					s_renderVK->release(stagingBuffer.m_buffer);
-					s_renderVK->release(stagingBuffer.m_deviceMem);
+					s_renderVK->recycleMemory(stagingBuffer.m_deviceMem);
 				}
 			}
 			else
@@ -6292,13 +6417,13 @@ VK_DESTROY
 		if (VK_NULL_HANDLE != m_textureImage)
 		{
 			s_renderVK->release(m_textureImage);
-			s_renderVK->release(m_textureDeviceMem);
+			s_renderVK->recycleMemory(m_textureDeviceMem);
 		}
 
 		if (VK_NULL_HANDLE != m_singleMsaaImage)
 		{
 			s_renderVK->release(m_singleMsaaImage);
-			s_renderVK->release(m_singleMsaaDeviceMem);
+			s_renderVK->recycleMemory(m_singleMsaaDeviceMem);
 		}
 
 		m_currentImageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -6353,7 +6478,7 @@ VK_DESTROY
 		StagingBufferVK stagingBuffer = s_renderVK->allocFromScratchStagingBuffer(size, align, data);
 		region.bufferOffset += stagingBuffer.m_offset;
 		BX_ASSERT(region.bufferOffset % align == 0,
-				"Alignment for image (mip %u, z %s) is not aligned correctly (%u).",
+				"Alignment for image (mip %u, z %u) is not aligned correctly (offset %u, align %u).",
 				_mip, _z, region.bufferOffset, align);
 
 		if (VK_IMAGE_VIEW_TYPE_3D == m_type)
@@ -6375,7 +6500,7 @@ VK_DESTROY
 		if (!stagingBuffer.m_isFromScratch)
 		{
 			s_renderVK->release(stagingBuffer.m_buffer);
-			s_renderVK->release(stagingBuffer.m_deviceMem);
+			s_renderVK->recycleMemory(stagingBuffer.m_deviceMem);
 		}
 
 		if (NULL != temp)
@@ -8260,11 +8385,22 @@ VK_DESTROY
 		m_release[m_currentFrameInFlight].push_back(resource);
 	}
 
+	void CommandQueueVK::recycleMemory(DeviceMemoryAllocationVK _mem)
+	{
+		m_recycleAllocs[m_currentFrameInFlight].push_back(_mem);
+	}
+
 	void CommandQueueVK::consume()
 	{
 		BGFX_PROFILER_SCOPE("CommandQueueVK::consume", kColorResource);
 		m_consumeIndex = (m_consumeIndex + 1) % m_numFramesInFlight;
 
+		for (DeviceMemoryAllocationVK &alloc : m_recycleAllocs[m_consumeIndex])
+		{
+			s_renderVK->m_memoryLru.recycle(alloc);
+		}
+		m_recycleAllocs[m_consumeIndex].clear();
+
 		for (const Resource& resource : m_release[m_consumeIndex])
 		{
 			switch (resource.m_type)
@@ -8289,6 +8425,7 @@ VK_DESTROY
 			}
 		}
 
+
 		m_release[m_consumeIndex].clear();
 	}
 

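Taken together, the renderer_vk.cpp changes form a deferred recycling path: allocateMemory() consults the LRU cache before calling vkAllocateMemory(), and released allocations are only handed back to the cache once the frame that last used them has been consumed. find() reuses a cached block only when it is large enough and wastes at most 50% of it; for example, a 12 MiB request may reuse a cached 16 MiB block, but not a cached 32 MiB one. A simplified sketch of the lifecycle, assuming requirements and flags are already set up (the calls shown are the ones that appear in the diff):

	// 1. Allocation: the LRU cache is checked before vkAllocateMemory().
	DeviceMemoryAllocationVK mem;
	VK_CHECK(s_renderVK->allocateMemory(&requirements, flags, &mem, false) );

	// 2. Use: callers bind and map through mem.mem and mem.offset instead of a
	//    raw VkDeviceMemory handle.

	// 3. Release: instead of release()/vkDestroy(), the allocation is queued on
	//    the current frame-in-flight.
	s_renderVK->recycleMemory(mem);

	// 4. Retirement: CommandQueueVK::consume() moves the queued allocations into
	//    MemoryLruVK::recycle(), where they either wait to be reused by a later
	//    find(), or are released once MAX_ENTRIES or the byte budget
	//    BGFX_CONFIG_MAX_BYTES_CACHED_DEVICE_MEMORY_ALLOCATIONS is exceeded.
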
+ 57 - 16
src/renderer_vk.h

@@ -307,7 +307,7 @@ namespace bgfx { namespace vk
 		::Vk##_name* operator &() { return &vk; }                \
 		const ::Vk##_name* operator &() const { return &vk; }    \
 	};                                                           \
-	static_assert(sizeof(::Vk##_name) == sizeof(Vk##_name) ); \
+	static_assert(sizeof(::Vk##_name) == sizeof(Vk##_name) );    \
 	void vkDestroy(Vk##_name&);                                  \
 	void release(Vk##_name&)
 VK_DESTROY
@@ -372,14 +372,52 @@ VK_DESTROY_FUNC(DescriptorSet);
 		HashMap m_hashMap;
 	};
 
+	struct DeviceMemoryAllocationVK
+	{
+		DeviceMemoryAllocationVK()
+			: mem(VK_NULL_HANDLE)
+			, offset(0)
+			, size(0)
+			, memoryTypeIndex(0)
+		{
+		}
+
+		VkDeviceMemory mem;
+		uint32_t offset;
+		uint32_t size;
+		int32_t memoryTypeIndex;
+	};
+
+	struct MemoryLruVK
+	{
+		MemoryLruVK()
+			: entries()
+			, lru()
+			, totalSizeCached(0)
+		{
+		}
+
+		static constexpr uint16_t MAX_ENTRIES = 1 << 10;
+		DeviceMemoryAllocationVK entries[MAX_ENTRIES];
+		bx::HandleAllocLruT<MAX_ENTRIES> lru;
+		uint64_t totalSizeCached;
+
+		void recycle(DeviceMemoryAllocationVK &_alloc);
+		bool find(uint32_t _size, int32_t _memoryTypeIndex, DeviceMemoryAllocationVK *_alloc);
+		void evictAll();
+	};
+
+	/** A buffer used for moving data from main memory to GPU memory.
+	 * This can either be an independently allocated memory region, or a sub-region
+	 * of the scratch staging buffer for the current frame-in-flight.
+	 */
 	struct StagingBufferVK
 	{
 		VkBuffer m_buffer;
-		VkDeviceMemory m_deviceMem;
+		DeviceMemoryAllocationVK m_deviceMem;
 
 		uint8_t* m_data;
 		uint32_t m_size;
-		uint32_t m_offset;
+		uint32_t m_offset; // Offset into the bound buffer (not the device memory!)
 		bool     m_isFromScratch;
 	};
 
@@ -403,7 +441,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		void flush(bool _reset = true);
 
 		VkBuffer m_buffer;
-		VkDeviceMemory m_deviceMem;
+		DeviceMemoryAllocationVK m_deviceMem;
 
 		uint8_t* m_data;
 		uint32_t m_size;
@@ -415,7 +453,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 	{
 		BufferVK()
 			: m_buffer(VK_NULL_HANDLE)
-			, m_deviceMem(VK_NULL_HANDLE)
+			, m_deviceMem()
 			, m_size(0)
 			, m_flags(BGFX_BUFFER_NONE)
 			, m_dynamic(false)
@@ -427,7 +465,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		void destroy();
 
 		VkBuffer m_buffer;
-		VkDeviceMemory m_deviceMem;
+		DeviceMemoryAllocationVK m_deviceMem;
 		uint32_t m_size;
 		uint16_t m_flags;
 		bool m_dynamic;
@@ -589,7 +627,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		Query m_query[BGFX_CONFIG_MAX_VIEWS*4];
 
 		VkBuffer m_readback;
-		VkDeviceMemory m_readbackMemory;
+		DeviceMemoryAllocationVK m_readbackMemory;
 		VkQueryPool m_queryPool;
 		const uint64_t* m_queryResult;
 		bx::RingBufferControl m_control;
@@ -613,7 +651,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		OcclusionQueryHandle m_handle[BGFX_CONFIG_MAX_OCCLUSION_QUERIES];
 
 		VkBuffer m_readback;
-		VkDeviceMemory m_readbackMemory;
+		DeviceMemoryAllocationVK m_readbackMemory;
 		VkQueryPool m_queryPool;
 		const uint32_t* m_queryResult;
 		bx::RingBufferControl m_control;
@@ -640,10 +678,10 @@ VK_DESTROY_FUNC(DescriptorSet);
 			, m_sampler({ 1, VK_SAMPLE_COUNT_1_BIT })
 			, m_format(VK_FORMAT_UNDEFINED)
 			, m_textureImage(VK_NULL_HANDLE)
-			, m_textureDeviceMem(VK_NULL_HANDLE)
+			, m_textureDeviceMem()
 			, m_currentImageLayout(VK_IMAGE_LAYOUT_UNDEFINED)
 			, m_singleMsaaImage(VK_NULL_HANDLE)
-			, m_singleMsaaDeviceMem(VK_NULL_HANDLE)
+			, m_singleMsaaDeviceMem()
 			, m_currentSingleMsaaImageLayout(VK_IMAGE_LAYOUT_UNDEFINED)
 		{
 		}
@@ -680,13 +718,13 @@ VK_DESTROY_FUNC(DescriptorSet);
 		VkComponentMapping m_components;
 		VkImageAspectFlags m_aspectMask;
 
-		VkImage        m_textureImage;
-		VkDeviceMemory m_textureDeviceMem;
-		VkImageLayout  m_currentImageLayout;
+		VkImage					 m_textureImage;
+		DeviceMemoryAllocationVK m_textureDeviceMem;
+		VkImageLayout			 m_currentImageLayout;
 
-		VkImage        m_singleMsaaImage;
-		VkDeviceMemory m_singleMsaaDeviceMem;
-		VkImageLayout  m_currentSingleMsaaImageLayout;
+		VkImage					 m_singleMsaaImage;
+		DeviceMemoryAllocationVK m_singleMsaaDeviceMem;
+		VkImageLayout			 m_currentSingleMsaaImageLayout;
 
 		VkImageLayout m_sampledLayout;
 
@@ -841,6 +879,7 @@ VK_DESTROY_FUNC(DescriptorSet);
 		void finish(bool _finishAll = false);
 
 		void release(uint64_t _handle, VkObjectType _type);
+		void recycleMemory(DeviceMemoryAllocationVK _mem);
 		void consume();
 
 		uint32_t m_queueFamily;
@@ -881,6 +920,8 @@ VK_DESTROY_FUNC(DescriptorSet);
 
 		typedef stl::vector<Resource> ResourceArray;
 		ResourceArray m_release[BGFX_CONFIG_MAX_FRAME_LATENCY];
+		stl::vector<DeviceMemoryAllocationVK> m_recycleAllocs[BGFX_CONFIG_MAX_FRAME_LATENCY];
+
 
 	private:
 		template<typename Ty>