瀏覽代碼

Almost final version of transient meshes and MeshHeap

Marko Pintera 12 年之前
父節點
當前提交
e7e420a0a6

+ 4 - 4
CamelotCore/Include/CmCoreThreadAccessor.h

@@ -186,15 +186,15 @@ namespace CamelotFramework
 		}
 
 		/** @copydoc RenderSystem::draw() */
-		void draw(UINT32 vertexCount)
+		void draw(UINT32 vertexOffset, UINT32 vertexCount)
 		{
-			mCommandQueue->queue(boost::bind(&RenderSystem::draw, RenderSystem::instancePtr(), vertexCount));
+			mCommandQueue->queue(boost::bind(&RenderSystem::draw, RenderSystem::instancePtr(), vertexOffset, vertexCount));
 		}
 
 		/** @copydoc RenderSystem::drawIndexed() */
-		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount)
+		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount)
 		{
-			mCommandQueue->queue(boost::bind(&RenderSystem::drawIndexed, RenderSystem::instancePtr(), startIndex, indexCount, vertexCount));
+			mCommandQueue->queue(boost::bind(&RenderSystem::drawIndexed, RenderSystem::instancePtr(), startIndex, indexCount, vertexOffset, vertexCount));
 		}
 
 		/**

+ 1 - 0
CamelotCore/Include/CmMeshData.h

@@ -173,6 +173,7 @@ namespace CamelotFramework
 
 	private:
 		friend class Mesh; // To avoid polluting the public interface with a bunch of nearly useless methods for outside world
+		friend class MeshHeap;
 
 		UINT32 mDescBuilding;
 

+ 128 - 1
CamelotCore/Include/CmMeshHeap.h

@@ -1,11 +1,138 @@
 #pragma once
 
 #include "CmPrerequisites.h"
+#include "CmCoreObject.h"
+#include "CmIndexData.h"
+#include "CmDrawOps.h"
 
 namespace CamelotFramework
 {
-	class CM_EXPORT MeshHeap
+	/**
+	 * @brief	Mesh heap allows you to quickly allocate and deallocate a 
+	 * 			large amounts of temporary meshes without the large overhead of normal Mesh creation.
+	 * 			Only requirement is that meshes share the same vertex description and index type.
+	 * 			
+	 * @note	This class should be considered as a replacement for a normal Mesh if you are constantly 
+	 * 			updating the mesh (e.g. every frame) and you are not able to discard entire mesh contents 
+	 * 			on each update. Not using discard flag on normal meshes may introduce GPU-CPU sync points
+	 * 			which may severely limit performance. Primary purpose of this class is to avoid
+	 * 			those sync points by not forcing you to discard contents.
+	 * 			
+	 *			Only downside is that this class may allocate 2-3x (or more) memory than it is actually needed
+	 *			for your data.
+	 *			
+	 *			Sim thread only.
+	 */
+	class CM_EXPORT MeshHeap : public CoreObject
 	{
+		enum class UseFlags
+		{
+			Used,
+			CPUFree,
+			GPUFree,
+			Free
+		};
 
+		struct ChunkData
+		{
+			UINT32 start, size;
+		};
+
+		struct AllocatedData
+		{
+			UINT32 vertChunkIdx;
+			UINT32 idxChunkIdx;
+
+			UseFlags useFlags;
+			UINT32 eventQueryIdx;
+		};
+
+		struct QueryData
+		{
+			EventQueryPtr query;
+			UINT32 queryId;
+		};
+
+	public:
+		~MeshHeap();
+
+		/**
+		 * @note	Offsets provided by MeshData are ignored. MeshHeap will determine
+		 * 			where the data will be written internally.
+		 */
+		TransientMeshPtr alloc(const MeshDataPtr& meshData, DrawOperationType drawOp = DOT_TRIANGLE_LIST);
+		void dealloc(const TransientMeshPtr& mesh);
+
+		static MeshHeapPtr create(UINT32 numVertices, UINT32 numIndices, 
+			const VertexDataDescPtr& vertexDesc, IndexBuffer::IndexType indexType = IndexBuffer::IT_32BIT);
+
+	private:
+		UINT32 mNumVertices; // Core thread
+		UINT32 mNumIndices; // Core thread
+
+		std::shared_ptr<VertexData> mVertexData; // Core thread
+		std::shared_ptr<IndexData> mIndexData; // Core thread
+
+		Vector<UINT8*>::type mCPUVertexData; // Core thread
+		UINT8* mCPUIndexData; // Core thread
+
+		VertexDataDescPtr mVertexDesc; // Immutable
+		IndexBuffer::IndexType mIndexType; // Immutable
+
+		Map<UINT32, TransientMeshPtr>::type mMeshes; // Sim thread
+		UINT32 mNextFreeId; // Sim thread
+
+		Map<UINT32, AllocatedData>::type mMeshAllocData; // Core thread
+
+		Vector<ChunkData>::type mVertChunks; // Core thread
+		Vector<ChunkData>::type mIdxChunks; // Core thread
+
+		Stack<UINT32>::type mEmptyVertChunks; // Core thread
+		Stack<UINT32>::type mEmptyIdxChunks; // Core thread
+
+		List<UINT32>::type mFreeVertChunks; // Core thread
+		List<UINT32>::type mFreeIdxChunks; // Core thread
+
+		Vector<QueryData>::type mEventQueries; // Core thread
+		Stack<UINT32>::type mFreeEventQueries; // Core thread
+
+		UINT32 mNextQueryId;
+
+		static const float GrowPercent;
+	private:
+		friend class TransientMesh;
+
+		MeshHeap(UINT32 numVertices, UINT32 numIndices, 
+			const VertexDataDescPtr& vertexDesc, IndexBuffer::IndexType indexType = IndexBuffer::IT_32BIT);
+
+		/**
+		 * @copydoc Resource::initialize_internal()
+		 */
+		virtual void initialize_internal();
+
+		/**
+		 * @copydoc Resource::destroy_internal()
+		 */
+		virtual void destroy_internal();
+
+		void allocInternal(UINT32 meshId, const MeshDataPtr& meshData);
+		void deallocInternal(UINT32 meshId);
+
+		void growVertexBuffer(UINT32 numVertices);
+		void growIndexBuffer(UINT32 numIndices);
+
+		UINT32 createEventQuery();
+		void freeEventQuery(UINT32 idx);
+
+		std::shared_ptr<VertexData> getVertexData() const;
+		std::shared_ptr<IndexData> getIndexData() const;
+
+		UINT32 getVertexOffset(UINT32 meshId) const;
+		UINT32 getIndexOffset(UINT32 meshId) const;
+
+		void notifyUsedOnGPU(UINT32 meshId);
+		void queryTriggered(UINT32 meshId, UINT32 queryId);
+
+		void mergeWithNearbyChunks(UINT32 chunkVertIdx, UINT32 chunkIdxIdx);
 	};
 }

+ 2 - 2
CamelotCore/Include/CmRenderSystem.h

@@ -200,14 +200,14 @@ namespace CamelotFramework
 		 *			Draws directly from the vertex buffer without using
 		 *			indices.
 		 */
-		virtual void draw(UINT32 vertexCount) = 0;
+		virtual void draw(UINT32 vertexOffset, UINT32 vertexCount) = 0;
 
 		/**
 		 * @brief	Draw an object based on currently set
 		 * 			shaders, vertex declaration and vertex 
 		 * 			and index buffers.
 		 */
-		virtual void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount) = 0;
+		virtual void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount) = 0;
 
 		/**
 		 * @brief	Swap the front and back buffer of the specified render target.

+ 1 - 0
CamelotCore/Include/CmVertexDataDesc.h

@@ -43,6 +43,7 @@ namespace CamelotFramework
 		const VertexElement& getElement(UINT32 idx) const { return mVertexElements[idx]; }
 	private:
 		friend class Mesh; // To avoid polluting the public interface with a bunch of nearly useless methods for outside world
+		friend class MeshHeap;
 
 		Vector<VertexElement>::type mVertexElements;
 

+ 1 - 1
CamelotCore/Source/CmMesh.cpp

@@ -116,7 +116,7 @@ namespace CamelotFramework
 					if(!meshData.getVertexDesc()->hasElement(VES_COLOR, semanticIdx, i))
 						continue;
 
-					UINT8* colorData = bufferCopy + meshData.getElementOffset(VES_COLOR, semanticIdx, i);
+					UINT8* colorData = bufferCopy + mVertexDesc->getElementOffsetFromStream(VES_COLOR, semanticIdx, i);
 					for(UINT32 j = 0; j < mVertexData->vertexCount; j++)
 					{
 						UINT32* curColor = (UINT32*)colorData;

+ 627 - 0
CamelotCore/Source/CmMeshHeap.cpp

@@ -1,6 +1,633 @@
 #include "CmMeshHeap.h"
+#include "CmCoreThread.h"
+#include "CmTransientMesh.h"
+#include "CmHardwareBufferManager.h"
+#include "CmVertexDataDesc.h"
+#include "CmVertexData.h"
+#include "CmIndexData.h"
+#include "CmMeshData.h"
+#include "CmMath.h"
+#include "CmEventQuery.h"
 
 namespace CamelotFramework
 {
+	const float MeshHeap::GrowPercent = 1.5f;
 
+	MeshHeap::MeshHeap(UINT32 numVertices, UINT32 numIndices, 
+		const VertexDataDescPtr& vertexDesc, IndexBuffer::IndexType indexType)
+		:mNumVertices(numVertices), mNumIndices(numIndices), mNextFreeId(0), 
+		mIndexType(indexType), mVertexDesc(vertexDesc), mCPUIndexData(nullptr),
+		mNextQueryId(0)
+	{
+		for(UINT32 i = 0; i <= mVertexDesc->getMaxStreamIdx(); i++)
+		{
+			mCPUVertexData.push_back(nullptr);
+		}
+	}
+
+	MeshHeap::~MeshHeap()
+	{
+
+	}
+
+	MeshHeapPtr MeshHeap::create(UINT32 numVertices, UINT32 numIndices, 
+		const VertexDataDescPtr& vertexDesc, IndexBuffer::IndexType indexType)
+	{
+		MeshHeap* meshHeap = new (cm_alloc<MeshHeap>()) MeshHeap(numVertices, numIndices, vertexDesc, indexType); 
+		MeshHeapPtr meshHeapPtr = cm_core_ptr<MeshHeap, GenAlloc>(meshHeap);
+
+		meshHeapPtr->setThisPtr(meshHeapPtr);
+		meshHeapPtr->initialize();
+
+		return meshHeapPtr;
+	}
+
+	void MeshHeap::initialize_internal()
+	{
+		THROW_IF_NOT_CORE_THREAD;
+
+		growVertexBuffer(mNumVertices);
+		growIndexBuffer(mNumIndices);
+	}
+
+	void MeshHeap::destroy_internal()
+	{
+		THROW_IF_NOT_CORE_THREAD;
+
+		CoreObject::destroy_internal();
+	}
+
+	TransientMeshPtr MeshHeap::alloc(const MeshDataPtr& meshData, DrawOperationType drawOp)
+	{
+		UINT32 meshIdx = mNextFreeId++;
+
+		MeshHeapPtr thisPtr = std::static_pointer_cast<MeshHeap>(getThisPtr());
+		TransientMesh* transientMesh = new (cm_alloc<TransientMesh>()) TransientMesh(thisPtr, meshIdx, meshData->getNumVertices(), meshData->getNumIndices(), drawOp); 
+		TransientMeshPtr transientMeshPtr = cm_core_ptr<TransientMesh, GenAlloc>(transientMesh);
+
+		transientMeshPtr->setThisPtr(transientMeshPtr);
+		transientMeshPtr->initialize();
+
+		mMeshes[meshIdx] = transientMeshPtr;
+
+		queueGpuCommand(getThisPtr(), boost::bind(&MeshHeap::allocInternal, this, meshIdx, meshData));
+
+		return transientMeshPtr;
+	}
+
+	void MeshHeap::dealloc(const TransientMeshPtr& mesh)
+	{
+		auto iterFind = mMeshes.find(mesh->mId);
+		if(iterFind == mMeshes.end())
+			return;
+
+		mMeshes.erase(iterFind);
+
+		queueGpuCommand(getThisPtr(), boost::bind(&MeshHeap::deallocInternal, this, mesh->mId));
+	}
+
+	void MeshHeap::allocInternal(UINT32 meshId, const MeshDataPtr& meshData)
+	{
+		// Find free vertex chunk and grow if needed
+		UINT32 smallestVertFit = 0;
+		UINT32 smallestVertFitIdx = 0;
+
+		while(smallestVertFit == 0)
+		{
+			UINT32 curIdx = 0;
+			for(auto& chunkIdx : mFreeVertChunks)
+			{
+				ChunkData& chunk = mVertChunks[chunkIdx];
+
+				if(chunk.size >= meshData->getNumVertices() && (chunk.size < smallestVertFit || smallestVertFit == 0))
+				{
+					smallestVertFit = chunk.size;
+					smallestVertFitIdx = curIdx;
+				}
+
+				curIdx++;
+			}
+
+			if(smallestVertFit > 0)
+				break;
+
+			UINT32 newNumVertices = mNumVertices;
+			while(newNumVertices < (mNumVertices + meshData->getNumVertices()))
+			{
+				newNumVertices = Math::RoundToInt(newNumVertices * GrowPercent);
+			}
+
+			growVertexBuffer(newNumVertices);
+		}
+
+		// Find free index chunk and grow if needed
+		UINT32 smallestIdxFit = 0;
+		UINT32 smallestIdxFitIdx = 0;
+
+		while(smallestIdxFit == 0)
+		{
+			UINT32 curIdx = 0;
+			for(auto& chunkIdx : mFreeIdxChunks)
+			{
+				ChunkData& chunk = mIdxChunks[chunkIdx];
+
+				if(chunk.size >= meshData->getNumIndices() && (chunk.size < smallestIdxFit || smallestIdxFit == 0))
+				{
+					smallestIdxFit = chunk.size;
+					smallestIdxFitIdx = curIdx;
+				}
+
+				curIdx++;
+			}
+
+			if(smallestIdxFit > 0)
+				break;
+
+			UINT32 newNumIndices = mNumIndices;
+			while(newNumIndices < (mNumIndices + meshData->getNumIndices()))
+			{
+				newNumIndices = Math::RoundToInt(newNumIndices * GrowPercent);
+			}
+
+			growIndexBuffer(newNumIndices);
+		}
+
+		UINT32 freeVertChunkIdx = 0;
+		UINT32 freeIdxChunkIdx = 0;
+
+		auto freeVertIter = mFreeVertChunks.begin();
+		freeVertChunkIdx = (*freeVertIter);
+		for(UINT32 i = 0; i < smallestVertFitIdx; i++)
+		{
+			freeVertIter++;
+			freeVertChunkIdx = (*freeVertIter);
+		}
+
+		mFreeVertChunks.erase(freeVertIter);
+		
+		auto freeIdxIter = mFreeIdxChunks.begin();
+		freeIdxChunkIdx = (*freeIdxIter);
+		for(UINT32 i = 0; i < smallestIdxFitIdx; i++)
+		{
+			freeIdxIter++;
+			freeIdxChunkIdx = (*freeIdxIter);
+		}
+
+		mFreeIdxChunks.erase(freeIdxIter);
+
+		ChunkData& vertChunk = mVertChunks[freeVertChunkIdx];
+		ChunkData& idxChunk = mIdxChunks[freeIdxChunkIdx];
+
+		UINT32 remainingNumVerts = vertChunk.size - meshData->getNumVertices();
+		UINT32 remainingNumIdx = idxChunk.size - meshData->getNumIndices();
+
+		if(remainingNumVerts > 0)
+		{
+			if(!mEmptyVertChunks.empty())
+			{
+				UINT32 emptyChunkIdx = mEmptyVertChunks.top();
+				ChunkData& emptyChunk = mVertChunks[emptyChunkIdx];
+				mEmptyVertChunks.pop();
+
+				emptyChunk.start = vertChunk.start + meshData->getNumVertices();
+				emptyChunk.size = remainingNumVerts;
+			}
+			else
+			{
+				ChunkData newChunk;
+				newChunk.size = remainingNumVerts;
+				newChunk.start = vertChunk.start + meshData->getNumVertices();
+
+				mVertChunks.push_back(newChunk);
+				mFreeVertChunks.push_back((UINT32)(mVertChunks.size() - 1));
+			}
+		}
+
+		if(remainingNumIdx > 0)
+		{
+			if(!mEmptyIdxChunks.empty())
+			{
+				UINT32 emptyChunkIdx = mEmptyIdxChunks.top();
+				ChunkData& emptyChunk = mIdxChunks[emptyChunkIdx];
+				mEmptyIdxChunks.pop();
+
+				emptyChunk.start = idxChunk.start + meshData->getNumIndices();
+				emptyChunk.size = remainingNumIdx;
+			}
+			else
+			{
+				ChunkData newChunk;
+				newChunk.size = remainingNumIdx;
+				newChunk.start = idxChunk.start + meshData->getNumIndices();
+
+				mIdxChunks.push_back(newChunk);
+				mFreeIdxChunks.push_back((UINT32)(mIdxChunks.size() - 1));
+			}
+		}
+
+		vertChunk.size = meshData->getNumVertices();
+		idxChunk.size = meshData->getNumIndices();
+
+		AllocatedData newAllocData;
+		newAllocData.vertChunkIdx = freeVertChunkIdx;
+		newAllocData.idxChunkIdx = freeIdxChunkIdx;
+		newAllocData.useFlags = UseFlags::GPUFree;
+		newAllocData.eventQueryIdx = createEventQuery();
+
+		mMeshAllocData[meshId] = newAllocData;
+
+		// Actually copy data
+		for(UINT32 i = 0; i <= mVertexDesc->getMaxStreamIdx(); i++)
+		{
+			if(!mVertexDesc->hasStream(i))
+				continue;
+
+			UINT32 vertSize = mVertexData->vertexDeclaration->getVertexSize(i);
+			VertexBufferPtr vertexBuffer = mVertexData->getBuffer(i);
+
+			UINT8* vertDest = mCPUVertexData[i] + vertChunk.start * vertSize;
+			memcpy(vertDest, meshData->getStreamData(i), vertChunk.start * vertSize);
+
+			if(vertexBuffer->vertexColorReqRGBFlip())
+			{
+				UINT32 vertexStride = mVertexDesc->getVertexStride(i);
+				for(INT32 semanticIdx = 0; semanticIdx < VertexBuffer::MAX_SEMANTIC_IDX; semanticIdx++)
+				{
+					if(!mVertexDesc->hasElement(VES_COLOR, semanticIdx, i))
+						continue;
+
+					UINT8* colorData = vertDest + mVertexDesc->getElementOffsetFromStream(VES_COLOR, semanticIdx, i);
+					for(UINT32 j = 0; j < mVertexData->vertexCount; j++)
+					{
+						UINT32* curColor = (UINT32*)colorData;
+
+						(*curColor) = ((*curColor) & 0xFF00FF00) | ((*curColor >> 16) & 0x000000FF) | ((*curColor << 16) & 0x00FF0000);
+
+						colorData += vertexStride;
+					}
+				}
+			}
+
+			vertexBuffer->writeData(vertChunk.start * vertSize, vertChunk.size * vertSize, vertDest, false);
+		}
+
+		IndexBufferPtr indexBuffer = mIndexData->indexBuffer;
+		UINT32 idxSize = indexBuffer->getIndexSize();
+
+		UINT8* idxDest = mCPUIndexData + idxChunk.start * idxSize;
+		memcpy(idxDest, meshData->getIndexData(), idxChunk.start * idxSize);
+		indexBuffer->writeData(idxChunk.start * idxSize, idxChunk.size * idxSize, idxDest, false);
+	}
+
+	void MeshHeap::deallocInternal(UINT32 meshId)
+	{
+		auto findIter = mMeshAllocData.find(meshId);
+		assert(findIter != mMeshAllocData.end());
+
+		AllocatedData& allocData = findIter->second;
+		if(allocData.useFlags == UseFlags::GPUFree)
+		{
+			allocData.useFlags = UseFlags::Free;
+			freeEventQuery(allocData.eventQueryIdx);
+
+			mFreeVertChunks.push_back(allocData.vertChunkIdx);
+			mFreeIdxChunks.push_back(allocData.idxChunkIdx);
+
+			mergeWithNearbyChunks(allocData.vertChunkIdx, allocData.idxChunkIdx);
+
+			mMeshAllocData.erase(findIter);
+		}
+		else if(allocData.useFlags == UseFlags::Used)
+			allocData.useFlags = UseFlags::CPUFree;
+	}
+
+	void MeshHeap::growVertexBuffer(UINT32 numVertices)
+	{
+		mNumVertices = numVertices;
+		mVertexData = std::shared_ptr<VertexData>(cm_new<VertexData, PoolAlloc>());
+
+		mVertexData->vertexCount = mNumVertices;
+		mVertexData->vertexDeclaration = mVertexDesc->createDeclaration();
+
+		// Create buffers and copy data
+		for(UINT32 i = 0; i <= mVertexDesc->getMaxStreamIdx(); i++)
+		{
+			if(!mVertexDesc->hasStream(i))
+				continue;
+
+			UINT32 vertSize = mVertexData->vertexDeclaration->getVertexSize(i);
+			VertexBufferPtr vertexBuffer = HardwareBufferManager::instance().createVertexBuffer(
+				vertSize, mVertexData->vertexCount, GBU_DYNAMIC);
+
+			mVertexData->setBuffer(i, vertexBuffer);
+
+			// Copy all data to the new buffer
+			UINT8* oldBuffer = mCPUVertexData[i];
+			UINT8* buffer = (UINT8*)cm_alloc(vertSize * numVertices);
+
+			UINT32 destOffset = 0;
+			if(oldBuffer != nullptr)
+			{
+				for(auto& allocData : mMeshAllocData)
+				{
+					ChunkData& oldChunk = mVertChunks[allocData.second.vertChunkIdx];
+
+					UINT8* oldData = oldBuffer + oldChunk.start * vertSize;
+					memcpy(buffer + destOffset * vertSize, oldData, oldChunk.size * vertSize);
+
+					destOffset += oldChunk.size;
+				}
+
+				cm_free(oldBuffer);
+			}
+
+			vertexBuffer->writeData(0, destOffset * vertSize, buffer, false);
+
+			mCPUVertexData[i] = buffer;
+		}
+
+		// Reorder chunks
+		UINT32 destOffset = 0;
+		Vector<ChunkData>::type newVertChunks;
+		List<UINT32>::type freeVertChunks;
+
+		for(auto& allocData : mMeshAllocData)
+		{
+			ChunkData& oldChunk = mVertChunks[allocData.second.vertChunkIdx];
+
+			ChunkData newChunk;
+			newChunk.start = destOffset;
+			newChunk.size = oldChunk.size;
+
+			allocData.second.vertChunkIdx = (UINT32)newVertChunks.size();
+			newVertChunks.push_back(newChunk);
+
+			destOffset += oldChunk.size;
+		}
+
+		// Add free chunk
+		if(destOffset != mNumVertices)
+		{
+			ChunkData newChunk;
+			newChunk.start = destOffset;
+			newChunk.size = mNumVertices - destOffset;
+
+			newVertChunks.push_back(newChunk);
+			freeVertChunks.push_back((UINT32)(newVertChunks.size() - 1));
+		}
+
+		mVertChunks = newVertChunks;
+		mFreeVertChunks = freeVertChunks;
+		
+		while(!mEmptyVertChunks.empty())
+			mEmptyVertChunks.pop();
+	}
+
+	void MeshHeap::growIndexBuffer(UINT32 numIndices)
+	{
+		mNumIndices = numIndices;
+
+		mIndexData = std::shared_ptr<IndexData>(cm_new<IndexData, PoolAlloc>());
+		mIndexData->indexCount = mNumIndices;
+		mIndexData->indexBuffer = HardwareBufferManager::instance().createIndexBuffer(
+			mIndexType, mIndexData->indexCount, GBU_DYNAMIC);
+
+		// Copy all data to the new buffer
+		UINT32 idxSize = mIndexData->indexBuffer->getIndexSize();
+
+		UINT8* oldBuffer = mCPUIndexData;
+		UINT8* buffer = (UINT8*)cm_alloc(idxSize * numIndices);
+
+		UINT32 destOffset = 0;
+		if(oldBuffer != nullptr)
+		{
+			for(auto& allocData : mMeshAllocData)
+			{
+				ChunkData& oldChunk = mIdxChunks[allocData.second.idxChunkIdx];
+
+				UINT8* oldData = oldBuffer + oldChunk.start * idxSize;
+				memcpy(buffer + destOffset * idxSize, oldData, oldChunk.size * idxSize);
+
+				destOffset += oldChunk.size;
+			}
+
+			cm_free(oldBuffer);
+		}
+
+		mIndexData->indexBuffer->writeData(0, destOffset * idxSize, buffer, false);
+
+		mCPUIndexData = buffer;
+
+		// Reorder chunks
+		destOffset = 0;
+		Vector<ChunkData>::type newIdxChunks;
+		List<UINT32>::type freeIdxChunks;
+
+		for(auto& allocData : mMeshAllocData)
+		{
+			ChunkData& oldChunk = mIdxChunks[allocData.second.idxChunkIdx];
+
+			ChunkData newChunk;
+			newChunk.start = destOffset;
+			newChunk.size = oldChunk.size;
+
+			allocData.second.idxChunkIdx = (UINT32)newIdxChunks.size();
+			newIdxChunks.push_back(newChunk);
+
+			destOffset += oldChunk.size;
+		}
+
+		// Add free chunk
+		if(destOffset != mNumIndices)
+		{
+			ChunkData newChunk;
+			newChunk.start = destOffset;
+			newChunk.size = mNumIndices - destOffset;
+
+			newIdxChunks.push_back(newChunk);
+			freeIdxChunks.push_back((UINT32)(newIdxChunks.size() - 1));
+		}
+
+		mIdxChunks = newIdxChunks;
+		mFreeIdxChunks = freeIdxChunks;
+
+		while(!mEmptyIdxChunks.empty())
+			mEmptyIdxChunks.pop();
+	}
+
+	UINT32 MeshHeap::createEventQuery()
+	{
+		UINT32 idx = 0;
+		if(mFreeEventQueries.size() > 0)
+		{
+			idx = mFreeEventQueries.top();
+			mFreeEventQueries.pop();
+		}
+		else
+		{
+			QueryData newQuery;
+			newQuery.query = EventQuery::create();
+			newQuery.queryId = 0;
+
+			mEventQueries.push_back(newQuery);
+			idx = (UINT32)(mEventQueries.size() - 1);
+		}
+
+		return idx;
+	}
+
+	void MeshHeap::freeEventQuery(UINT32 idx)
+	{
+		mEventQueries[idx].queryId = 0;
+		mFreeEventQueries.push(idx);
+	}
+
+	std::shared_ptr<VertexData> MeshHeap::getVertexData() const
+	{
+		return mVertexData;
+	}
+
+	std::shared_ptr<IndexData> MeshHeap::getIndexData() const
+	{
+		return mIndexData;
+	}
+
+	UINT32 MeshHeap::getVertexOffset(UINT32 meshId) const
+	{
+		auto findIter = mMeshAllocData.find(meshId);
+		assert(findIter != mMeshAllocData.end());
+
+		UINT32 chunkIdx = findIter->second.vertChunkIdx;
+		return mVertChunks[chunkIdx].start;
+	}
+
+	UINT32 MeshHeap::getIndexOffset(UINT32 meshId) const
+	{
+		auto findIter = mMeshAllocData.find(meshId);
+		assert(findIter != mMeshAllocData.end());
+
+		UINT32 chunkIdx = findIter->second.idxChunkIdx;
+		return mIdxChunks[chunkIdx].start;
+	}
+
+	void MeshHeap::notifyUsedOnGPU(UINT32 meshId)
+	{
+		auto findIter = mMeshAllocData.find(meshId);
+		assert(findIter != mMeshAllocData.end());
+
+		AllocatedData& allocData = findIter->second;
+		assert(allocData.useFlags != UseFlags::Free);
+
+		if(allocData.useFlags == UseFlags::GPUFree)
+			allocData.useFlags = UseFlags::Used;
+
+		QueryData& queryData = mEventQueries[allocData.eventQueryIdx];
+		queryData.queryId = mNextQueryId++;
+		queryData.query->onTriggered.connect(boost::bind(&MeshHeap::queryTriggered, this, meshId, queryData.queryId));
+		queryData.query->begin();
+	}
+
+	void MeshHeap::queryTriggered(UINT32 meshId, UINT32 queryId)
+	{
+		auto findIter = mMeshAllocData.find(meshId);
+		assert(findIter != mMeshAllocData.end());
+
+		AllocatedData& allocData = findIter->second;
+
+		// If query ids don't match then it means there either a more recent query or
+		// the buffer was discarded and we are not interested in query result
+		QueryData& queryData = mEventQueries[allocData.eventQueryIdx];
+		if(queryId == queryData.queryId) 
+		{
+			assert(allocData.useFlags != UseFlags::Free && allocData.useFlags != UseFlags::GPUFree);
+
+			if(allocData.useFlags == UseFlags::CPUFree)
+			{
+				allocData.useFlags = UseFlags::Free;
+				freeEventQuery(allocData.eventQueryIdx);
+
+				mFreeVertChunks.push_back(allocData.vertChunkIdx);
+				mFreeIdxChunks.push_back(allocData.idxChunkIdx);
+
+				mergeWithNearbyChunks(allocData.vertChunkIdx, allocData.idxChunkIdx);
+
+				mMeshAllocData.erase(findIter);
+			}
+			else
+				allocData.useFlags = UseFlags::GPUFree;
+		}
+	}
+
+	void MeshHeap::mergeWithNearbyChunks(UINT32 chunkVertIdx, UINT32 chunkIdxIdx)
+	{
+		// Merge vertex chunks
+		ChunkData& vertChunk = mVertChunks[chunkVertIdx];
+		for(auto& freeChunkIdx : mFreeVertChunks)
+		{
+			if(chunkVertIdx == freeChunkIdx)
+				continue;
+
+			ChunkData& curChunk = mVertChunks[freeChunkIdx];
+			bool merged = false;
+
+			if(curChunk.start == (vertChunk.start + vertChunk.size))
+			{
+				vertChunk.size += curChunk.size;
+
+				merged = true;
+			}
+			
+			if((curChunk.start + curChunk.size) == vertChunk.start)
+			{
+				vertChunk.start = curChunk.start;
+				vertChunk.size += curChunk.size;
+
+				merged = true;
+			}
+
+			if(merged)
+			{
+				// We can't remove the chunk since that would break the indexing scheme, so 
+				// mark it as empty and set size to 0. It will be reused when needed.
+				curChunk.start = 0;
+				curChunk.size = 0;
+				mEmptyVertChunks.push(freeChunkIdx);
+			}
+		}
+
+		// Merge index chunks
+		ChunkData& idxChunk = mIdxChunks[chunkIdxIdx];
+		for(auto& freeChunkIdx : mFreeIdxChunks)
+		{
+			if(chunkIdxIdx == freeChunkIdx)
+				continue;
+
+			ChunkData& curChunk = mIdxChunks[freeChunkIdx];
+			bool merged = false;
+
+			if(curChunk.start == (idxChunk.start + idxChunk.size))
+			{
+				idxChunk.size += curChunk.size;
+
+				merged = true;
+			}
+
+			if((curChunk.start + curChunk.size) == idxChunk.start)
+			{
+				idxChunk.start = curChunk.start;
+				idxChunk.size += curChunk.size;
+
+				merged = true;
+			}
+
+			if(merged)
+			{
+				// We can't remove the chunk since that would break the indexing scheme, so 
+				// mark it as empty and set size to 0. It will be reused when needed.
+				curChunk.start = 0;
+				curChunk.size = 0;
+				mEmptyIdxChunks.push(freeChunkIdx);
+			}
+		}
+	}
 }

+ 4 - 2
CamelotCore/Source/CmRenderSystem.cpp

@@ -260,10 +260,12 @@ namespace CamelotFramework {
 				indexCount = indexData->indexCount;
 
 			setIndexBuffer(indexData->indexBuffer);
-			drawIndexed(indexOffset, indexCount, vertexData->vertexCount);
+			drawIndexed(indexOffset + mesh->getIndexOffset(), indexCount, mesh->getVertexOffset(), vertexData->vertexCount);
 		}
 		else
-			draw(vertexData->vertexCount);
+			draw(mesh->getVertexOffset(), vertexData->vertexCount);
+
+		mesh->notifyUsedOnGPU();
 
 		gProfiler().endSample("render");
 	}

+ 6 - 9
CamelotCore/Source/CmTransientMesh.cpp

@@ -1,6 +1,7 @@
 #include "CmTransientMesh.h"
 #include "CmVertexData.h"
 #include "CmIndexData.h"
+#include "CmMeshHeap.h"
 
 namespace CamelotFramework
 {
@@ -27,30 +28,26 @@ namespace CamelotFramework
 
 	std::shared_ptr<VertexData> TransientMesh::getVertexData() const
 	{
-		// TODO - Get vertex data from parent MeshHeap
-		return nullptr;
+		return mParentHeap->getVertexData();
 	}
 
 	std::shared_ptr<IndexData> TransientMesh::getIndexData() const
 	{
-		// TODO - Get index data from parent MeshHeap
-		return nullptr;
+		return mParentHeap->getIndexData();
 	}
 
 	UINT32 TransientMesh::getVertexOffset() const
 	{
-		// TODO - Get vertex offset from parent MeshHeap
-		return 0;
+		return mParentHeap->getVertexOffset(mId);
 	}
 
 	UINT32 TransientMesh::getIndexOffset() const
 	{
-		// TODO - Get index offset from parent MeshHeap
-		return 0;
+		return mParentHeap->getIndexOffset(mId);
 	}
 
 	void TransientMesh::notifyUsedOnGPU()
 	{
-		// TODO - Refresh EventQuery
+		mParentHeap->notifyUsedOnGPU(mId);
 	}
 }

+ 2 - 2
CamelotD3D11RenderSystem/Include/CmD3D11RenderSystem.h

@@ -51,10 +51,10 @@ namespace CamelotFramework
 		void setDrawOperation(DrawOperationType op);
 
 		/** @copydoc RenderSystem::draw() */
-		void draw(UINT32 vertexCount);
+		void draw(UINT32 vertexOffset, UINT32 vertexCount);
 
 		/** @copydoc RenderSystem::drawIndexed() */
-		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount);
+		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount);
 
 		/** @copydoc RenderSystem::bindGpuProgram() */
 		void bindGpuProgram(HGpuProgram prg);

+ 4 - 4
CamelotD3D11RenderSystem/Source/CmD3D11RenderSystem.cpp

@@ -545,13 +545,13 @@ namespace CamelotFramework
 			CM_EXCEPT(RenderingAPIException, "Failed to bindGpuParams : " + mDevice->getErrorDescription());
 	}
 
-	void D3D11RenderSystem::draw(UINT32 vertexCount)
+	void D3D11RenderSystem::draw(UINT32 vertexOffset, UINT32 vertexCount)
 	{
 		THROW_IF_NOT_CORE_THREAD;
 
 		applyInputLayout();
 
-		mDevice->getImmediateContext()->Draw(vertexCount, 0);
+		mDevice->getImmediateContext()->Draw(vertexCount, vertexOffset);
 
 #if CM_DEBUG_MODE
 		if(mDevice->hasError())
@@ -559,13 +559,13 @@ namespace CamelotFramework
 #endif
 	}
 
-	void D3D11RenderSystem::drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount)
+	void D3D11RenderSystem::drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount)
 	{
 		THROW_IF_NOT_CORE_THREAD;
 
 		applyInputLayout();
 
-		mDevice->getImmediateContext()->DrawIndexed(indexCount, startIndex, 0);
+		mDevice->getImmediateContext()->DrawIndexed(indexCount, startIndex, vertexOffset);
 
 #if CM_DEBUG_MODE
 		if(mDevice->hasError())

+ 2 - 2
CamelotD3D9Renderer/Include/CmD3D9RenderSystem.h

@@ -141,12 +141,12 @@ namespace CamelotFramework
 		/**
 		 * @copydoc RenderSystem::draw()
 		 */
-		void draw(UINT32 vertexCount);
+		void draw(UINT32 vertexOffset, UINT32 vertexCount);
 
 		/**
 		 * @copydoc RenderSystem::drawIndexed()
 		 */
-		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount);
+		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount);
 
         void setScissorRect(UINT32 left, UINT32 top, UINT32 right, UINT32 bottom);
 

+ 4 - 4
CamelotD3D9Renderer/Source/CmD3D9RenderSystem.cpp

@@ -1270,11 +1270,11 @@ namespace CamelotFramework
 		mCurrentDrawOperation = op;
 	}
 	//---------------------------------------------------------------------
-	void D3D9RenderSystem::draw(UINT32 vertexCount)
+	void D3D9RenderSystem::draw(UINT32 vertexOffset, UINT32 vertexCount)
 	{
 		UINT32 primCount = pointCountToPrimCount(mCurrentDrawOperation, vertexCount);
 
-		HRESULT hr = getActiveD3D9Device()->DrawPrimitive(getD3D9PrimitiveType(), 0, static_cast<UINT>(primCount)); 
+		HRESULT hr = getActiveD3D9Device()->DrawPrimitive(getD3D9PrimitiveType(), static_cast<UINT>(vertexOffset), static_cast<UINT>(primCount)); 
 
 		if( FAILED( hr ) )
 		{
@@ -1283,14 +1283,14 @@ namespace CamelotFramework
 		}
 	}
 	//---------------------------------------------------------------------
-	void D3D9RenderSystem::drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount)
+	void D3D9RenderSystem::drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount)
 	{
 		UINT32 primCount = pointCountToPrimCount(mCurrentDrawOperation, indexCount);
 
 		// do indexed draw operation
 		HRESULT hr = getActiveD3D9Device()->DrawIndexedPrimitive(
 			getD3D9PrimitiveType(), 
-			0, 
+			static_cast<UINT>(vertexOffset), 
 			0, 
 			static_cast<UINT>(vertexCount), 
 			static_cast<UINT>(startIndex), 

+ 2 - 2
CamelotGLRenderer/Include/CmGLRenderSystem.h

@@ -144,12 +144,12 @@ namespace CamelotFramework {
 		/**
 		 * @copydoc RenderSystem::draw()
 		 */
-		void draw(UINT32 vertexCount);
+		void draw(UINT32 vertexOffset, UINT32 vertexCount);
 
 		/**
 		 * @copydoc RenderSystem::drawIndexed()
 		 */
-		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount);
+		void drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount);
 
 		/**
 		 * @copydoc RenderSystem::clearRenderTarget()

+ 4 - 4
CamelotGLRenderer/Source/CmGLRenderSystem.cpp

@@ -688,18 +688,18 @@ namespace CamelotFramework
 		mBoundIndexBuffer = buffer;
 	}
 	//---------------------------------------------------------------------
-	void GLRenderSystem::draw(UINT32 vertexCount)
+	void GLRenderSystem::draw(UINT32 vertexOffset, UINT32 vertexCount)
 	{
 		// Find the correct type to render
 		GLint primType = getGLDrawMode();
 		beginDraw();
 
-		glDrawArrays(primType, 0, vertexCount);
+		glDrawArrays(primType, vertexOffset, vertexCount);
 
 		endDraw();
 	}
 	//---------------------------------------------------------------------
-	void GLRenderSystem::drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexCount)
+	void GLRenderSystem::drawIndexed(UINT32 startIndex, UINT32 indexCount, UINT32 vertexOffset, UINT32 vertexCount)
 	{
 		if(mBoundIndexBuffer == nullptr)
 		{
@@ -716,7 +716,7 @@ namespace CamelotFramework
 
 		GLenum indexType = (mBoundIndexBuffer->getType() == IndexBuffer::IT_16BIT) ? GL_UNSIGNED_SHORT : GL_UNSIGNED_INT;
 
-		glDrawElements(primType, indexCount, indexType, 0);
+		glDrawElementsBaseVertex(primType, indexCount, indexType, (GLvoid*)(mBoundIndexBuffer->getIndexSize() * startIndex), vertexOffset);
 
 		endDraw();
 	}

+ 2 - 48
Opts.txt

@@ -18,16 +18,6 @@ When optimizing UpdateLayout make sure to mark elements that are fully culled as
  - But in order to determine that I first need to update the sprite to find out the elements bounds which defeats the point
  - TODO - FIgure this out
 
- -------------
-
-TransientMesh
- - Only used for writing, only dynamic, and only MAP_NO_OVERWRITE writing
- - Accepts starting buffer sizes, and will enlarge them as needed
-  - When buffer is enlarged send a warning so user knows to use a bigger buffer next time
- - Keeps track of parts of the buffer used by GPU using GPU queries
-   - Need to implement a proper GPU query interface
- - Keeps track of fragmentation and has an option to defragment, manually or auto after certain %
-
 ----------
 
 BIG TODO FINALLY: Reorganize GUI so it all uses one big vertex buffer (probably in the form of a TransientMesh). This means I need better support for drawing individual objects
@@ -36,41 +26,5 @@ from a transient mesh by drawing only parts of its buffer. But only do this afte
 
 ----------
 
-Transient mesh brainstorming:
-How to release memory from transient mesh?
- - Make it use non-CoreAccessor interface?
-
-
-TransientMesh
- Upon construction we specify vertex and index format, including initial vertex/index buffer sizes
-   MeshChunk allocate(UINT32 numVertices, UINT32 numIndices)
-    - Thread safe method (custom mutex), returns a MeshChunk which contains an unique index
-    - Each allocation represents its own SubMesh - there can't be multiple sub-meshes per allocation
-   deallocate(MeshChunk chunk)
-    - I could make MeshChunk as GpuResource, so I can write to it directly
-   How do I set materials per sub-mesh?
-    - I keep a mapping MeshChunk->Material in GUIManager 
-   I will need to update Render method so it can accept MeshChunk?
-    - Or should TransientMesh derive from Mesh?
-   When calling Render I need to provide index offset/length, however since I will be
-   supporting defragmenting that can change internally at any time. It would be ideal to 
-   retrieve that data when on core thread.
-    - Maybe even rethink Mesh submeshes so that they have a unified interface?
-	- BUT: I can handle defragment on the CPU. I just need to find offsets and sizes, and actual
-	  memory copies can be done on the core thread later.
-
-Name it MeshHeap instead of TransientMesh
- - It can derive from a common class MeshBase (which Mesh also derives from)
- - Render can then accept MeshBase instead of Mesh
-
-
------------
-
-POTENTIALLY
-
-I could have normal Meshes use one big vertex/index buffer in the background.
- - However that means Dynamic/Static tags don't mean anything
- - writeSubresource discard is ignored
- - It's not as clear to the user
- - Need to implement reading as well
- - Not sure if it would work well with a bunch of smaller allocations (Although I think we would avoid those in any case)
+When writing to buffer in MeshHeap (in two places at least) I need to add NO_OVERWITE flag to HardwareBuffer
+When doing allocInternal I don't check that index/vertex desc in MeshData actually matches the ones in MeshHeap.

+ 1 - 1
TODO.txt

@@ -129,7 +129,7 @@ Low priority TODO
  - onMovedOrResized is still used by Viewport while that same callback is offered by RenderWindowManager. There is no need to have them in both places.
  - Texture "ScaleToFit" will cause the texture to repeat instead of clipping the image. e.g. a 50x20 texture placed on an 50x100 area will repeat 5x
  - When writing to mesh vertex buffer in Mesh::writeSubresource that requires a color flip I need to create a temporary copy of the 
-    entire buffer. It would be better to handle this differently. 
+    entire buffer. It would be better to handle this differently. Same thing happens in MeshHeap
 ----------------------------------------------------------------------------------------------
 Optional:
  - Need better handling for shader techniques. Some Materials are able to run on all renderers yet I can only specify one. This is problematic