// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
// All rights reserved.
// Code licensed under the BSD License.
// http://www.anki3d.org/LICENSE

// Note: the include targets below are assumed; the originals pull in this manager's own header, the Vulkan GrManager and the stats set.
#include <AnKi/Gr/Vulkan/VkGpuMemoryManager.h>
#include <AnKi/Gr/Vulkan/VkGrManager.h>
#include <AnKi/Core/StatsSet.h>

namespace anki {

ANKI_SVAR(DeviceMemoryAllocated, StatCategory::kGpuMem, "Device mem", StatFlag::kBytes)
ANKI_SVAR(DeviceMemoryInUse, StatCategory::kGpuMem, "Device mem in use", StatFlag::kBytes)
ANKI_SVAR(DeviceMemoryAllocationCount, StatCategory::kGpuMem, "Device mem allocations", StatFlag::kNone)
ANKI_SVAR(HostMemoryAllocated, StatCategory::kGpuMem, "Host mem", StatFlag::kBytes)
ANKI_SVAR(HostMemoryInUse, StatCategory::kGpuMem, "Host mem in use", StatFlag::kBytes)
ANKI_SVAR(HostMemoryAllocationCount, StatCategory::kGpuMem, "Host mem allocations", StatFlag::kNone)

static constexpr Array<GpuMemoryManagerClassInfo, 8> kClasses{
	{{4_KB, 256_KB}, {128_KB, 8_MB}, {1_MB, 64_MB}, {16_MB, 128_MB}, {64_MB, 128_MB}, {128_MB, 128_MB}, {256_MB, 256_MB}, {512_MB, 512_MB}}};

/// Special classes for the ReBAR memory. Treat it as a special case because it's so limited and needs special care.
static constexpr Array<GpuMemoryManagerClassInfo, 3> kRebarClasses{{{1_MB, 1_MB}, {12_MB, 12_MB}, {24_MB, 24_MB}}};

Error GpuMemoryManagerInterface::allocateChunk(U32 classIdx, GpuMemoryManagerChunk*& chunk)
{
	VkMemoryAllocateInfo ci = {};
	ci.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
	ci.allocationSize = m_classInfos[classIdx].m_chunkSize;
	ci.memoryTypeIndex = m_memTypeIdx;

	VkMemoryAllocateFlagsInfo flags = {};
	flags.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO;
	flags.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT;
	if(m_exposesBufferGpuAddress)
	{
		ci.pNext = &flags;
	}

	VkDeviceMemory memHandle;
	if(vkAllocateMemory(getVkDevice(), &ci, nullptr, &memHandle) != VK_SUCCESS) [[unlikely]]
	{
		ANKI_VK_LOGF("Out of GPU memory. Mem type index %u, size %zu", m_memTypeIdx, m_classInfos[classIdx].m_suballocationSize);
	}

	chunk = newInstance<GpuMemoryManagerChunk>(GrMemoryPool::getSingleton());
	chunk->m_handle = memHandle;
	chunk->m_size = m_classInfos[classIdx].m_chunkSize;

	m_allocatedMemory += m_classInfos[classIdx].m_chunkSize;

	return Error::kNone;
}

void GpuMemoryManagerInterface::freeChunk(GpuMemoryManagerChunk* chunk)
{
	ANKI_ASSERT(chunk);
	ANKI_ASSERT(chunk->m_handle != VK_NULL_HANDLE);

	if(chunk->m_mappedAddress)
	{
		vkUnmapMemory(getVkDevice(), chunk->m_handle);
	}

	vkFreeMemory(getVkDevice(), chunk->m_handle, nullptr);

	ANKI_ASSERT(m_allocatedMemory >= chunk->m_size);
	m_allocatedMemory -= chunk->m_size;

	deleteInstance(GrMemoryPool::getSingleton(), chunk);
}

void GpuMemoryManager::destroy()
{
	ANKI_VK_LOGV("Destroying memory manager");
	m_callocs.destroy();
}

void GpuMemoryManager::init()
{
	// Print some info
	ANKI_VK_LOGV("Initializing memory manager");
	for(const GpuMemoryManagerClassInfo& c : kClasses)
	{
		ANKI_VK_LOGV("\tGPU mem class. Chunk size: %lu, suballocationSize: %lu, allocsPerChunk %lu", c.m_chunkSize, c.m_suballocationSize,
					 c.m_chunkSize / c.m_suballocationSize);
	}

	// Image buffer granularity
	{
		VkPhysicalDeviceProperties props;
		vkGetPhysicalDeviceProperties(getGrManagerImpl().getPhysicalDevice(), &props);

		m_bufferImageGranularity = U32(props.limits.bufferImageGranularity);
		ANKI_ASSERT(m_bufferImageGranularity > 0 && isPowerOfTwo(m_bufferImageGranularity));

		if(m_bufferImageGranularity > 4_KB)
		{
			ANKI_VK_LOGW("Buffer/image mem granularity is too high (%u). It will force high alignments and waste memory",
						 m_bufferImageGranularity);
		}
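		// A class whose suballocation size is not a multiple of bufferImageGranularity could place buffers and images
		// too close together inside the same chunk, so such classes won't be used for allocations.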
		for(const GpuMemoryManagerClassInfo& c : kClasses)
		{
			if(!isAligned(m_bufferImageGranularity, c.m_suballocationSize))
			{
				ANKI_VK_LOGW("Memory class is not aligned to buffer/image granularity (%u). It won't be used in "
							 "allocations: Chunk size: %lu, suballocationSize: %lu, allocsPerChunk %lu",
							 m_bufferImageGranularity, c.m_chunkSize, c.m_suballocationSize, c.m_chunkSize / c.m_suballocationSize);
			}
		}
	}

	vkGetPhysicalDeviceMemoryProperties(getGrManagerImpl().getPhysicalDevice(), &m_memoryProperties);

	m_callocs.resize(m_memoryProperties.memoryTypeCount);
	for(U32 memTypeIdx = 0; memTypeIdx < m_callocs.getSize(); ++memTypeIdx)
	{
		GpuMemoryManagerInterface& iface = m_callocs[memTypeIdx].getInterface();
		iface.m_parent = this;
		iface.m_memTypeIdx = U8(memTypeIdx);
		iface.m_exposesBufferGpuAddress = true;

		const U32 heapIdx = m_memoryProperties.memoryTypes[memTypeIdx].heapIndex;
		iface.m_isDeviceMemory = !!(m_memoryProperties.memoryHeaps[heapIdx].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT);

		// Find if it's ReBAR
		const VkMemoryPropertyFlags props = m_memoryProperties.memoryTypes[memTypeIdx].propertyFlags;
		const VkMemoryPropertyFlags reBarProps =
			VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
		const PtrSize heapSize = m_memoryProperties.memoryHeaps[m_memoryProperties.memoryTypes[memTypeIdx].heapIndex].size;
		const Bool isReBar = props == reBarProps && heapSize <= 256_MB;

		if(isReBar)
		{
			ANKI_VK_LOGV("Memory type %u is ReBAR", memTypeIdx);
		}

		// Choose different classes
		if(!isReBar)
		{
			iface.m_classInfos = kClasses;
		}
		else
		{
			iface.m_classInfos = kRebarClasses;
		}

		// The interface is initialized, init the builder
		m_callocs[memTypeIdx].init();
	}
}

void GpuMemoryManager::allocateMemory(U32 memTypeIdx, PtrSize size, U32 alignment, GpuMemoryHandle& handle)
{
	ClassAllocator& calloc = m_callocs[memTypeIdx];

	alignment = max(alignment, m_bufferImageGranularity);

	GpuMemoryManagerChunk* chunk;
	PtrSize offset;
	[[maybe_unused]] const Error err = calloc.allocate(size, alignment, chunk, offset);

	handle.m_memory = chunk->m_handle;
	handle.m_offset = offset;
	handle.m_chunk = chunk;
	handle.m_memTypeIdx = U8(memTypeIdx);
	handle.m_size = size;
}

void GpuMemoryManager::allocateMemoryDedicated(U32 memTypeIdx, PtrSize size, VkImage image, GpuMemoryHandle& handle)
{
	VkMemoryDedicatedAllocateInfoKHR dedicatedInfo = {};
	dedicatedInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR;
	dedicatedInfo.image = image;

	VkMemoryAllocateInfo memoryAllocateInfo = {};
	memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
	memoryAllocateInfo.pNext = &dedicatedInfo;
	memoryAllocateInfo.allocationSize = size;
	memoryAllocateInfo.memoryTypeIndex = memTypeIdx;

	VkDeviceMemory mem;
	ANKI_VK_CHECKF(vkAllocateMemory(getVkDevice(), &memoryAllocateInfo, nullptr, &mem));

	handle.m_memory = mem;
	handle.m_offset = 0;
	handle.m_chunk = nullptr;
	handle.m_memTypeIdx = U8(memTypeIdx);
	handle.m_size = size;

	m_dedicatedAllocatedMemory.fetchAdd(size);
	m_dedicatedAllocationCount.fetchAdd(1);
}

void GpuMemoryManager::freeMemory(GpuMemoryHandle& handle)
{
	ANKI_ASSERT(handle);

	if(handle.isDedicated())
	{
		vkFreeMemory(getVkDevice(), handle.m_memory, nullptr);

		[[maybe_unused]] const PtrSize prevSize = m_dedicatedAllocatedMemory.fetchSub(handle.m_size);
		ANKI_ASSERT(prevSize >= handle.m_size);

		[[maybe_unused]] const U32 count = m_dedicatedAllocationCount.fetchSub(1);
		ANKI_ASSERT(count > 0);
	}
	else
	{
		ClassAllocator& calloc = m_callocs[handle.m_memTypeIdx];
		calloc.free(handle.m_chunk, handle.m_offset);
	}

	handle = {};
}
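// Returns the CPU-visible address of a suballocation. The owning chunk is mapped lazily on first request and stays
// mapped until the chunk is freed.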
void* GpuMemoryManager::getMappedAddress(GpuMemoryHandle& handle)
{
	ANKI_ASSERT(handle);
	ANKI_ASSERT(!handle.isDedicated());

	LockGuard lock(handle.m_chunk->m_mappedAddressMtx);

	if(handle.m_chunk->m_mappedAddress == nullptr)
	{
		ANKI_VK_CHECKF(vkMapMemory(getVkDevice(), handle.m_chunk->m_handle, 0, handle.m_chunk->m_size, 0, &handle.m_chunk->m_mappedAddress));
	}

	return static_cast<void*>(static_cast<U8*>(handle.m_chunk->m_mappedAddress) + handle.m_offset);
}

U32 GpuMemoryManager::findMemoryType(U32 resourceMemTypeBits, VkMemoryPropertyFlags preferFlags, VkMemoryPropertyFlags avoidFlags) const
{
	U32 preferred = kMaxU32;

	// Iterate all mem types
	for(U32 i = 0; i < m_memoryProperties.memoryTypeCount; i++)
	{
		if(resourceMemTypeBits & (1u << i))
		{
			const VkMemoryPropertyFlags flags = m_memoryProperties.memoryTypes[i].propertyFlags;
			if((flags & preferFlags) == preferFlags && (flags & avoidFlags) == 0)
			{
				// It's the candidate we want
				if(preferred == kMaxU32)
				{
					preferred = i;
				}
				else
				{
					// On some Intel drivers there are identical memory types pointing to different heaps. Choose the biggest heap
					const PtrSize crntHeapSize = m_memoryProperties.memoryHeaps[m_memoryProperties.memoryTypes[i].heapIndex].size;
					const PtrSize prevHeapSize = m_memoryProperties.memoryHeaps[m_memoryProperties.memoryTypes[preferred].heapIndex].size;
					if(crntHeapSize > prevHeapSize)
					{
						preferred = i;
					}
				}
			}
		}
	}

	return preferred;
}

void GpuMemoryManager::updateStats() const
{
	g_svarDeviceMemoryAllocated.set(0);
	g_svarDeviceMemoryAllocationCount.set(0);
	g_svarDeviceMemoryInUse.set(0);
	g_svarHostMemoryAllocated.set(0);
	g_svarHostMemoryAllocationCount.set(0);
	g_svarHostMemoryInUse.set(0);

	for(U32 memTypeIdx = 0; memTypeIdx < m_callocs.getSize(); ++memTypeIdx)
	{
		const GpuMemoryManagerInterface& iface = m_callocs[memTypeIdx].getInterface();

		ClassAllocatorBuilderStats cstats;
		m_callocs[memTypeIdx].getStats(cstats);

		if(iface.m_isDeviceMemory)
		{
			g_svarDeviceMemoryAllocated.increment(cstats.m_allocatedSize);
			g_svarDeviceMemoryInUse.increment(cstats.m_inUseSize);
			g_svarDeviceMemoryAllocationCount.increment(cstats.m_chunkCount);
		}
		else
		{
			g_svarHostMemoryAllocated.increment(cstats.m_allocatedSize);
			g_svarHostMemoryInUse.increment(cstats.m_inUseSize);
			g_svarHostMemoryAllocationCount.increment(cstats.m_chunkCount);
		}
	}

	// Add dedicated stats
	const PtrSize dedicatedAllocatedMemory = m_dedicatedAllocatedMemory.load();
	g_svarDeviceMemoryAllocated.increment(dedicatedAllocatedMemory);
	g_svarDeviceMemoryInUse.increment(dedicatedAllocatedMemory);
	g_svarDeviceMemoryAllocationCount.increment(m_dedicatedAllocationCount.load());
}

void GpuMemoryManager::getImageMemoryRequirements(VkImage image, VkMemoryDedicatedRequirementsKHR& dedicatedRequirements,
												  VkMemoryRequirements2& requirements)
{
	dedicatedRequirements = {};
	dedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;

	requirements = {};
	requirements.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
	requirements.pNext = &dedicatedRequirements;

	VkImageMemoryRequirementsInfo2 imageRequirements = {};
	imageRequirements.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2;
	imageRequirements.image = image;

	vkGetImageMemoryRequirements2(getVkDevice(), &imageRequirements, &requirements);
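	// The class allocator can't serve anything larger than the biggest class chunk, so such images must be allocated
	// with a dedicated vkAllocateMemory.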
	if(requirements.memoryRequirements.size > kClasses.getBack().m_chunkSize)
	{
		// Allocation too big, force a dedicated allocation
		dedicatedRequirements.prefersDedicatedAllocation = true;
		dedicatedRequirements.requiresDedicatedAllocation = true;
	}
}

} // end namespace anki