Przeglądaj źródła

Fixed up stack allocator so it doesn't require you to manually initialize specific heaps
Reduced number of allocations in the stack allocator

Marko Pintera 12 lat temu
rodzic
commit
d43309ccb3

+ 6 - 6
BansheeEngine/Source/BsGUILayoutX.cpp

@@ -95,13 +95,13 @@ namespace BansheeEngine
 
 		if(mChildren.size() > 0)
 		{
-			processedElements = stackAllocN<bool>((UINT32)mChildren.size(), HID_Main);
+			processedElements = stackAllocN<bool>((UINT32)mChildren.size());
 			memset(processedElements, 0, mChildren.size() * sizeof(bool));
 
-			elementSizes = stackAllocN<UINT32>((UINT32)mChildren.size(), HID_Main);
+			elementSizes = stackAllocN<UINT32>((UINT32)mChildren.size());
 			memset(elementSizes, 0, mChildren.size() * sizeof(UINT32));
 
-			elementScaleWeights = stackAllocN<float>((UINT32)mChildren.size(), HID_Main);
+			elementScaleWeights = stackAllocN<float>((UINT32)mChildren.size());
 			memset(elementScaleWeights, 0, mChildren.size() * sizeof(float));
 		}
 
@@ -401,13 +401,13 @@ namespace BansheeEngine
 		}
 
 		if(elementScaleWeights != nullptr)
-			stackDeallocLast(elementScaleWeights, HID_Main);
+			stackDeallocLast(elementScaleWeights);
 
 		if(elementSizes != nullptr)
-			stackDeallocLast(elementSizes, HID_Main);
+			stackDeallocLast(elementSizes);
 
 		if(processedElements != nullptr)
-			stackDeallocLast(processedElements, HID_Main);
+			stackDeallocLast(processedElements);
 
 		_markAsClean();
 	}

+ 6 - 6
BansheeEngine/Source/BsGUILayoutY.cpp

@@ -90,13 +90,13 @@ namespace BansheeEngine
 		UINT32 numNonClampedElements = 0;
 		UINT32 numFlexibleSpaces = 0;
 
-		bool* processedElements = stackAllocN<bool>((UINT32)mChildren.size(), HID_Main);
+		bool* processedElements = stackAllocN<bool>((UINT32)mChildren.size());
 		memset(processedElements, 0, mChildren.size() * sizeof(bool));
 
-		UINT32* elementSizes = stackAllocN<UINT32>((UINT32)mChildren.size(), HID_Main);
+		UINT32* elementSizes = stackAllocN<UINT32>((UINT32)mChildren.size());
 		memset(elementSizes, 0, mChildren.size() * sizeof(UINT32));
 
-		float* elementScaleWeights = stackAllocN<float>((UINT32)mChildren.size(), HID_Main);
+		float* elementScaleWeights = stackAllocN<float>((UINT32)mChildren.size());
 		memset(elementScaleWeights, 0, mChildren.size() * sizeof(float));
 
 		// Set initial sizes, count number of children per type and mark fixed elements as already processed
@@ -392,9 +392,9 @@ namespace BansheeEngine
 			childIdx++;
 		}
 
-		stackDeallocLast(elementScaleWeights, HID_Main);
-		stackDeallocLast(elementSizes, HID_Main);
-		stackDeallocLast(processedElements, HID_Main);
+		stackDeallocLast(elementScaleWeights);
+		stackDeallocLast(elementSizes);
+		stackDeallocLast(processedElements);
 
 		_markAsClean();
 	}

+ 2 - 8
CamelotCore/Include/CmPrerequisites.h

@@ -60,14 +60,8 @@
 
 #include "CmMemAllocCategories.h"
 
-namespace CamelotFramework {
-
-	enum HeapID
-	{
-		HID_Main = 0,
-		HID_Render = 1
-	};
-
+namespace CamelotFramework 
+{
 // Pre-declare classes
 // Allows use of pointers in header files without including individual .h
 // so decreases dependencies between files

+ 2 - 1
CamelotCore/Source/CmApplication.cpp

@@ -49,7 +49,7 @@ namespace CamelotFramework
 	void Application::startUp(START_UP_DESC& desc)
 	{
 		Platform::startUp();
-		MemStack::setupHeap(HID_Main);
+		MemStack::beginThread();
 
 		Profiler::startUp(cm_new<Profiler>());
 		StringTable::startUp(cm_new<StringTable>());
@@ -188,6 +188,7 @@ namespace CamelotFramework
 		StringTable::shutDown();
 
 		Profiler::shutDown();
+		MemStack::endThread();
 		Platform::shutDown();
 	}
 

+ 6 - 0
CamelotCore/Source/CmCoreThread.cpp

@@ -52,6 +52,8 @@ namespace CamelotFramework
 	void CoreThread::runCoreThread()
 	{
 #if !CM_FORCE_SINGLETHREADED_RENDERING
+		MemStack::beginThread();
+
 		mCoreThreadId = CM_THREAD_CURRENT_ID;
 		mSyncedCoreAccessor = cm_new<CoreThreadAccessor<CommandQueueSync>>(CM_THREAD_CURRENT_ID);
 
@@ -73,7 +75,10 @@ namespace CamelotFramework
 				while(mCommandQueue->isEmpty())
 				{
 					if(mCoreThreadShutdown)
+					{
+						MemStack::endThread();
 						return;
+					}
 
 					CM_THREAD_WAIT(mCommandReadyCondition, mCommandQueueMutex, lock);
 				}
@@ -86,6 +91,7 @@ namespace CamelotFramework
 		}
 
 		cm_delete(mSyncedCoreAccessor);
+		MemStack::endThread();
 #endif
 	}
 

+ 0 - 2
CamelotCore/Source/CmRenderSystem.cpp

@@ -74,8 +74,6 @@ namespace CamelotFramework {
 	{
 		mPrimaryWindowDesc = primaryWindowDesc;
 
-		MemStack::setupHeap(HID_Render);
-
 		AsyncOp op = gCoreThread().queueReturnCommand(boost::bind(&RenderSystem::initialize_internal, this, _1), true);
 		return op.getReturnValue<RenderWindowPtr>();
 	}

+ 67 - 55
CamelotUtility/Include/CmMemStack.h

@@ -6,6 +6,13 @@
 
 namespace CamelotFramework
 {
+	/**
+	 * @brief	Memory stack.
+	 *
+	 *  @tparam	BlockCapacity Minimum size of a block. Larger blocks mean fewer memory allocations, but also potentially
+	 * 			more wasted memory. If an allocation requests more bytes than BlockCapacity, a block sized to
+	 * 			fit that allocation is created instead.
+	 */
 	template <int BlockCapacity = 1024 * 1024>
 	class MemStackInternal
 	{
@@ -15,14 +22,10 @@ namespace CamelotFramework
 		public:
 			MemBlock(UINT32 size)
 				:mData(nullptr), mFreePtr(0), mSize(size)
-			{
-				mData = static_cast<UINT8*>(cm_alloc(mSize));
-			}
+			{ }
 
 			~MemBlock()
-			{
-				cm_free(mData);
-			}
+			{ }
 
 			UINT8* alloc(UINT8 amount)
 			{
@@ -56,64 +59,79 @@ namespace CamelotFramework
 				MemBlock* curPtr = mBlocks.top();
 				mBlocks.pop();
 
-				cm_delete(curPtr);
+				deallocBlock(curPtr);
 			}
 		}
 
 		UINT8* alloc(UINT32 amount)
 		{
+			amount += sizeof(UINT32);
+
 			MemBlock* topBlock;
 			if(mBlocks.size() == 0)
-				topBlock = allocNewBlock(amount);
+				topBlock = allocBlock(amount);
 			else
 				topBlock = mBlocks.top();
 
-			mAllocSizes.push(amount);
-
+			MemBlock* memBlock = nullptr;
 			UINT32 freeMem = topBlock->mSize - topBlock->mFreePtr;
 			if(amount <= freeMem)
-				return topBlock->alloc(amount);
+				memBlock = topBlock;
+			else
+				memBlock = allocBlock(amount);
+
+			UINT8* data = memBlock->alloc(amount);
 
-			MemBlock* newBlock = allocNewBlock(amount);
-			return newBlock->alloc(amount);
+			UINT32* storedSize = reinterpret_cast<UINT32*>(data);
+			*storedSize = amount;
+
+			return data + sizeof(UINT32);
 		}
 
 		void dealloc(UINT8* data)
 		{
-			assert(mAllocSizes.size() > 0 && "Out of order stack deallocation detected. Deallocations need to happen in order opposite of allocations.");
+			data -= sizeof(UINT32);
 
-			UINT32 amount = mAllocSizes.top();
-			mAllocSizes.pop();
+			UINT32* storedSize = reinterpret_cast<UINT32*>(data);
 
 			MemBlock* topBlock = mBlocks.top();
-			topBlock->dealloc(data, amount);
+			topBlock->dealloc(data, *storedSize);
 
 			if(topBlock->mFreePtr == 0)
 			{
-				cm_delete(topBlock);
+				deallocBlock(topBlock);
 				mBlocks.pop();
 			}
 		}
 
 	private:
 		std::stack<MemBlock*> mBlocks;
-		std::stack<UINT32> mAllocSizes;
 
-		MemBlock* allocNewBlock(UINT32 wantedSize)
+		MemBlock* allocBlock(UINT32 wantedSize)
 		{
 			UINT32 blockSize = BlockCapacity;
 			if(wantedSize > blockSize)
 				blockSize = wantedSize;
 
-			MemBlock* newBlock = cm_new<MemBlock>(blockSize);
+			UINT8* data = (UINT8*)reinterpret_cast<UINT8*>(cm_alloc(blockSize + sizeof(MemBlock)));
+			MemBlock* newBlock = new (data) MemBlock(blockSize);
+			data += sizeof(MemBlock);
+			newBlock->mData = data;
+
 			mBlocks.push(newBlock);
 
 			return newBlock;
 		}
+
+		void deallocBlock(MemBlock* block)
+		{
+			block->~MemBlock();
+			cm_free(block);
+		}
 	};
 
 	/**
-	 * @brief	Fastest, but also most limiting type of allocator. All deallocations
+	 * @brief	One of the fastest, but also very limiting type of allocator. All deallocations
 	 * 			must happen in opposite order from allocations. 
 	 * 			
 	 * @note	It's mostly useful when you need to allocate something temporarily on the heap,
@@ -121,55 +139,49 @@ namespace CamelotFramework
 	 * 			
 	 *			Each allocation comes with a pretty hefty 4 byte memory overhead, so don't use it for small allocations. 
 	 *			
-	 *			Operations done on a single heap are thread safe. Multiple threads are not allowed to access a heap that wasn't
-	 *			created for them.
-	 *
-	 * @tparam	BlockCapacity Minimum size of a block. Larger blocks mean less memory allocations, but also potentially
-	 * 						  more wasted memory. If an allocation requests more bytes than BlockCapacity, first largest multiple is
-	 * 						  used instead.
-	 * @tparam	VectorAligned If true, all allocations will be aligned to 16bit boundaries.
+	 *			This class is thread safe but you cannot allocate on one thread and deallocate on another. Threads will keep
+	 *			separate stacks internally. Make sure to call beginThread/endThread for any thread this stack is used on.
 	 */
-	class CM_UTILITY_EXPORT MemStack
+	class MemStack
 	{
 	public:
 		/**
-		 * @brief	Sets up the heap you can later use with alloc/dealloc calls. It is most common to have one heap
-		 * 			per thread.
-		 *
-		 * @param	heapId	Unique heap ID. Each heap can only be used from one thread, it cannot be shared.
-		 * 					You cannot have more than 256 heaps.
+		 * @brief	Sets up the stack for the current thread. You need to call this
+		 * 			on any thread before doing any allocations or deallocations on it.
 		 */
-		static void setupHeap(UINT8 heapId);
+		static CM_UTILITY_EXPORT void beginThread();
 
-		static UINT8* alloc(UINT32 numBytes, UINT32 heapId);
-		static void deallocLast(UINT8* data, UINT32 heapId);
+		/**
+		 * @brief	Cleans up the stack for the current thread. You may not perform any allocations or deallocations
+		 * 			after this is called, unless you call beginThread again.
+		 */
+		static CM_UTILITY_EXPORT void endThread();
 
-	private:
-		static MemStackInternal<1024 * 1024> mStacks[256];
+		static CM_UTILITY_EXPORT UINT8* alloc(UINT32 numBytes);
+		static CM_UTILITY_EXPORT void deallocLast(UINT8* data);
 
-#if CM_DEBUG_MODE
-		static CM_THREAD_ID_TYPE mThreadIds[256];
-#endif
+	private:
+		static CM_THREADLOCAL MemStackInternal<1024 * 1024>* ThreadMemStack;
 	};
 
-	CM_UTILITY_EXPORT inline UINT8* stackAlloc(UINT32 numBytes, UINT32 heapId);
+	CM_UTILITY_EXPORT inline UINT8* stackAlloc(UINT32 numBytes);
 
 	template<class T>
-	T* stackAlloc(UINT32 heapId)
+	T* stackAlloc()
 	{
-		return (T*)MemStack::alloc(sizeof(T), heapId);
+		return (T*)MemStack::alloc(sizeof(T));
 	}
 
 	template<class T>
-	T* stackAllocN(UINT32 count, UINT32 heapId)
+	T* stackAllocN(UINT32 count)
 	{
-		return (T*)MemStack::alloc(sizeof(T) * count, heapId);
+		return (T*)MemStack::alloc(sizeof(T) * count);
 	}
 
 	template<class T>
-	T* stackConstructN(UINT32 count, UINT32 heapId)
+	T* stackConstructN(UINT32 count)
 	{
-		T* data = stackAllocN<T>(count, heapId);
+		T* data = stackAllocN<T>(count);
 
 		for(unsigned int i = 0; i < count; i++)
 			new ((void*)&data[i]) T;
@@ -178,21 +190,21 @@ namespace CamelotFramework
 	}
 
 	template<class T>
-	void stackDestruct(T* data, UINT32 heapId)
+	void stackDestruct(T* data)
 	{
 		data->~T();
 
-		MemStack::deallocLast((UINT8*)data, heapId);
+		MemStack::deallocLast((UINT8*)data);
 	}
 
 	template<class T>
-	void stackDestructN(T* data, UINT32 count, UINT32 heapId)
+	void stackDestructN(T* data, UINT32 count)
 	{
 		for(unsigned int i = 0; i < count; i++)
 			data[i].~T();
 
-		MemStack::deallocLast((UINT8*)data, heapId);
+		MemStack::deallocLast((UINT8*)data);
 	}
 
-	CM_UTILITY_EXPORT inline void stackDeallocLast(void* data, UINT32 heapId);
+	CM_UTILITY_EXPORT inline void stackDeallocLast(void* data);
 }

+ 23 - 25
CamelotUtility/Source/CmMemStack.cpp

@@ -3,48 +3,46 @@
 
 namespace CamelotFramework
 {
-	MemStackInternal<1024 * 1024> MemStack::mStacks[256];
+	MemStackInternal<1024 * 1024>* MemStack::ThreadMemStack = nullptr;
 
-#if CM_DEBUG_MODE
-	CM_THREAD_ID_TYPE MemStack::mThreadIds[256];
-#endif
-
-	void MemStack::setupHeap(UINT8 heapId)
+	void MemStack::beginThread()
 	{
-		assert(heapId < 256);
+		if(ThreadMemStack != nullptr)
+			endThread();
 
-		mStacks[heapId] = MemStackInternal<1024 * 1024>();
+		ThreadMemStack = cm_new<MemStackInternal<1024 * 1024>>();
+	}
 
-#if CM_DEBUG_MODE
-		mThreadIds[heapId] = CM_THREAD_CURRENT_ID;
-#endif
+	void MemStack::endThread()
+	{
+		if(ThreadMemStack != nullptr)
+		{
+			cm_delete(ThreadMemStack);
+			ThreadMemStack = nullptr;
+		}
 	}
 
-	UINT8* MemStack::alloc(UINT32 numBytes, UINT32 heapId)
+	UINT8* MemStack::alloc(UINT32 numBytes)
 	{
-#if CM_DEBUG_MODE
-		assert(mThreadIds[heapId] == CM_THREAD_CURRENT_ID && "Accessing a heap from an invalid thread.");
-#endif
+		assert(ThreadMemStack != nullptr && "Stack allocation failed. Did you call beginThread?");
 
-		return mStacks[heapId].alloc(numBytes);
+		return ThreadMemStack->alloc(numBytes);
 	}
 
-	void MemStack::deallocLast(UINT8* data, UINT32 heapId)
+	void MemStack::deallocLast(UINT8* data)
 	{
-#if CM_DEBUG_MODE
-		assert(mThreadIds[heapId] == CM_THREAD_CURRENT_ID && "Accessing a heap from an invalid thread.");
-#endif
+		assert(ThreadMemStack != nullptr && "Stack deallocation failed. Did you call beginThread?");
 
-		mStacks[heapId].dealloc(data);
+		ThreadMemStack->dealloc(data);
 	}
 
-	UINT8* stackAlloc(UINT32 numBytes, UINT32 heapId)
+	UINT8* stackAlloc(UINT32 numBytes)
 	{
-		return MemStack::alloc(numBytes, heapId);
+		return MemStack::alloc(numBytes);
 	}
 
-	void stackDeallocLast(void* data, UINT32 heapId)
+	void stackDeallocLast(void* data)
 	{
-		return MemStack::deallocLast((UINT8*)data, heapId);
+		return MemStack::deallocLast((UINT8*)data);
 	}
 }

+ 11 - 12
Opts.txt

@@ -7,14 +7,6 @@ Strings when setting material params cause an allocation
  - Consider using normal char arrays
  - Or ensure materials are set using some smarter way, e.g. you get material param reference by name, you save it and then use that for setting the value
 
-HardwareBuffer::lock (more exactly ImmediateContext::Map) is very slow, with 35 calls taking up almost 3ms. 
- - First, mesh creates both vertex and index buffer whenever writeSubresource is called!!! It should instead just update them.
- - Second, mesh buffers are created with default buffer flags (static). Try changing it to dynamic
-
-When optimizing UpdateLayout make sure to mark elements that are fully culled as Culled
- - But in order to determine that I first need to update the sprite to find out the elements bounds which defeats the point
- - TODO - FIgure this out
-
 FrameAlloc
    Keeps two internal stacks
      - One core, one sim
@@ -24,13 +16,20 @@ FrameAlloc
     - frees the previously active stack
    During frame we call alloc()
   
-
 Make sure BindableGpuParams are not copyable
 
-CPUProfiler still uses normal allocator for containers
-
 Refactor stack allocator so I don't need to manually initialize heaps
  - Use atomic variable counter?
 
  There is a shit-ton of allocations in CPUProfiler (especially report generation)
-  - Maybe attempt to cut down on them?
+  - Maybe attempt to cut down on them?
+
+--------------------
+
+HardwareBuffer::lock (more exactly ImmediateContext::Map) is very slow, with 35 calls taking up almost 3ms. 
+ - First, mesh creates both vertex and index buffer whenever writeSubresource is called!!! It should instead just update them.
+ - Second, mesh buffers are created with default buffer flags (static). Try changing it to dynamic
+
+When optimizing UpdateLayout make sure to mark elements that are fully culled as Culled
+ - But in order to determine that, I first need to update the sprite to find out the element's bounds, which defeats the point
+ - TODO - Figure this out