
Untested profiler reports

Marko Pintera 12 years ago
parent
commit
4534e3a5af
3 changed files with 415 additions and 222 deletions
  1. CamelotClient/Source/BsMainEditorWindow.cpp (+3 -0)
  2. CamelotUtility/Include/CmCPUProfiler.h (+10 -23)
  3. CamelotUtility/Source/CmCPUProfiler.cpp (+402 -199)

CamelotClient/Source/BsMainEditorWindow.cpp (+3 -0)

@@ -4,6 +4,7 @@
 #include "BsCamera.h"
 #include "CmSceneObject.h"
 #include "CmRenderTexture.h"
+#include "CmCPUProfiler.h"
 
 // DEBUG ONLY
 #include "CmTestTextSprite.h"
@@ -61,6 +62,8 @@ namespace BansheeEditor
 
 		AABox dbgBox(Vector3(-300, -200, 1000), Vector3(300, 300, 1500));
 		DrawHelper3D::instance().drawAABox(sceneCamera, dbgBox, Color::Green, 250.0f);
+
+		CPUProfiler profiler;
 	}
 
 	MainEditorWindow::~MainEditorWindow()
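
Note: the CPUProfiler instance added above appears to be a throwaway local that goes out of scope at the end of the constructor, so it only exercises construction and destruction. A minimal sketch of how the sampling API introduced in the files below might be driven per frame, assuming a long-lived profiler object (hypothetical here; the commit itself only constructs a temporary):

	// Sketch only - assumes a persistent CPUProfiler accessible to the frame loop.
	void profileEditorFrame(CamelotFramework::CPUProfiler& profiler)
	{
		profiler.beginThread("Sim");

		profiler.beginSample("UpdateScene");
		// ... per-frame scene update work ...
		profiler.endSample("UpdateScene");

		profiler.beginSamplePrecise("DrawDebugGeometry");
		// ... short, tight section measured in CPU cycles ...
		profiler.endSamplePrecise("DrawDebugGeometry");

		profiler.endThread();
	}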

CamelotUtility/Include/CmCPUProfiler.h (+10 -23)

@@ -97,26 +97,15 @@ namespace CamelotFramework
 			~ProfiledBlock();
 
 			String name;
-			ProfileData data;
+			
+			ProfileData basic;
+			PreciseProfileData precise;
+
 			Vector<ProfiledBlock*>::type children;
-			Vector<PreciseProfiledBlock*>::type preciseChildren; // Needed only for estimating overhead
 
 			ProfiledBlock* findChild(const String& name) const;
 		};
 
-		struct PreciseProfiledBlock
-		{
-			PreciseProfiledBlock();
-			~PreciseProfiledBlock();
-
-			String name;
-			PreciseProfileData data;
-			Vector<PreciseProfiledBlock*>::type children;
-			Vector<ProfiledBlock*>::type basicChildren; // Needed only for estimating overhead
-
-			PreciseProfiledBlock* findChild(const String& name) const;
-		};
-
 		struct ThreadInfo
 		{
 			ThreadInfo();
@@ -128,9 +117,8 @@ namespace CamelotFramework
 			Stack<ProfiledBlock*>::type activeBlocks;
 			ProfiledBlock* activeBlock;
 
-			PreciseProfiledBlock* rootPreciseBlock;
-			Stack<PreciseProfiledBlock*>::type activePreciseBlocks;
-			PreciseProfiledBlock* activePreciseBlock;
+			Stack<ProfiledBlock*>::type activePreciseBlocks;
+			ProfiledBlock* activePreciseBlock;
 
 			void begin(const String& _name);
 			void end();
@@ -138,9 +126,6 @@ namespace CamelotFramework
 
 			ProfiledBlock* getBlock();
 			void releaseBlock(ProfiledBlock* block);
-
-			PreciseProfiledBlock* getPreciseBlock();
-			void releasePreciseBlock(PreciseProfiledBlock* block);
 		};
 
 	public:
@@ -221,8 +206,10 @@ namespace CamelotFramework
 		double mBasicTimerOverhead;
 		UINT64 mPreciseTimerOverhead;
 
-		double mBasicSamplingOverhead;
-		UINT64 mPreciseSamplingOverhead;
+		double mBasicSamplingOverheadMs;
+		double mPreciseSamplingOverheadMs;
+		UINT64 mBasicSamplingOverheadCycles;
+		UINT64 mPreciseSamplingOverheadCycles;
 
 		Vector<ThreadInfo*>::type mActiveThreads;
 		CM_MUTEX(mThreadSync);
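
Taken together, the header hunks above collapse the separate PreciseProfiledBlock type into ProfiledBlock, which now carries both sample sets. A rough reconstruction of the resulting struct (pieced together from the diff; member order and anything not shown above is an assumption):

	struct ProfiledBlock
	{
		ProfiledBlock();
		~ProfiledBlock();

		String name;

		ProfileData basic;          // millisecond samples (basic timer)
		PreciseProfileData precise; // cycle samples (precise timer)

		Vector<ProfiledBlock*>::type children; // single child list shared by both hierarchies

		ProfiledBlock* findChild(const String& name) const;
	};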

CamelotUtility/Source/CmCPUProfiler.cpp (+402 -199)

@@ -119,7 +119,7 @@ namespace CamelotFramework
 
 	CPUProfiler::ThreadInfo::ThreadInfo()
 		:isActive(false), activeBlock(nullptr), rootBlock(nullptr), 
-		activePreciseBlock(nullptr), rootPreciseBlock(nullptr)
+		activePreciseBlock(nullptr)
 	{
 
 	}
@@ -135,28 +135,17 @@ namespace CamelotFramework
 		if(rootBlock == nullptr)
 			rootBlock = getBlock();
 
-		if(rootPreciseBlock == nullptr)
-			rootPreciseBlock = getPreciseBlock();
-
 		activeBlocks.push(rootBlock);
 		activeBlock = rootBlock;
 
-		activePreciseBlocks.push(rootPreciseBlock);
-		activePreciseBlock = rootPreciseBlock;
-
 		rootBlock->name = _name; 
-		rootBlock->data.beginSample();
-
-		rootPreciseBlock->name = _name;
-		rootPreciseBlock->data.beginSample();
+		rootBlock->basic.beginSample();
+		isActive = true;
 	}
 
 	void CPUProfiler::ThreadInfo::end()
 	{
-		activePreciseBlock->data.endSample();
-		activePreciseBlocks.pop();
-
-		activeBlock->data.endSample();
+		activeBlock->basic.endSample();
 		activeBlocks.pop();
 
 		if(!isActive)
@@ -169,7 +158,7 @@ namespace CamelotFramework
 			while(activeBlocks.size() > 0)
 			{
 				ProfiledBlock* block = activeBlocks.top();
-				block->data.endSample();
+				block->basic.endSample();
 
 				activeBlocks.pop();
 			}
@@ -181,8 +170,8 @@ namespace CamelotFramework
 
 			while(activePreciseBlocks.size() > 0)
 			{
-				PreciseProfiledBlock* block = activePreciseBlocks.top();
-				block->data.endSample();
+				ProfiledBlock* block = activePreciseBlocks.top();
+				block->precise.endSample();
 
 				activePreciseBlocks.pop();
 			}
@@ -191,7 +180,7 @@ namespace CamelotFramework
 		isActive = false;
 		activeBlocks = Stack<ProfiledBlock*>::type();
 		activeBlock = nullptr;
-		activePreciseBlocks = Stack<PreciseProfiledBlock*>::type();
+		activePreciseBlocks = Stack<ProfiledBlock*>::type();
 		activePreciseBlock = nullptr;
 	}
 
@@ -203,11 +192,7 @@ namespace CamelotFramework
 		if(rootBlock != nullptr)
 			releaseBlock(rootBlock);
 
-		if(rootPreciseBlock != nullptr)
-			releasePreciseBlock(rootPreciseBlock);
-
 		rootBlock = nullptr;
-		rootPreciseBlock = nullptr;
 	}
 
 	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
@@ -222,22 +207,8 @@ namespace CamelotFramework
 		cm_delete(block);
 	}
 
-	CPUProfiler::PreciseProfiledBlock* CPUProfiler::ThreadInfo::getPreciseBlock()
-	{
-		// TODO - Pool this, if possible using the memory allocator stuff
-		// TODO - Also consider moving all samples in ThreadInfo, and also pool them (otherwise I can't pool ProfiledBlock since it will be variable size)
-		return cm_new<PreciseProfiledBlock>();
-	}
-
-	void CPUProfiler::ThreadInfo::releasePreciseBlock(CPUProfiler::PreciseProfiledBlock* block)
-	{
-		cm_delete(block);
-	}
-
 	CPUProfiler::ProfiledBlock::ProfiledBlock()
-	{
-
-	}
+	{ }
 
 	CPUProfiler::ProfiledBlock::~ProfiledBlock()
 	{
@@ -260,34 +231,9 @@ namespace CamelotFramework
 		return nullptr;
 	}
 
-	CPUProfiler::PreciseProfiledBlock::PreciseProfiledBlock()
-	{
-
-	}
-
-	CPUProfiler::PreciseProfiledBlock::~PreciseProfiledBlock()
-	{
-		ThreadInfo* thread = ThreadInfo::activeThread;
-
-		for(auto& child : children)
-			thread->releasePreciseBlock(child);
-
-		children.clear();
-	}
-
-	CPUProfiler::PreciseProfiledBlock* CPUProfiler::PreciseProfiledBlock::findChild(const String& name) const
-	{
-		for(auto& child : children)
-		{
-			if(child->name == name)
-				return child;
-		}
-
-		return nullptr;
-	}
-
 	CPUProfiler::CPUProfiler()
-		:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverhead(0.0), mPreciseSamplingOverhead(0)
+		:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
+		mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
 	{
 		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
 		// and keep separate values per thread.
@@ -301,10 +247,7 @@ namespace CamelotFramework
 		CM_LOCK_MUTEX(mThreadSync);
 
 		for(auto& threadInfo : mActiveThreads)
-		{
-			threadInfo->releaseBlock(threadInfo->rootBlock);
 			cm_delete(threadInfo);
-		}
 	}
 
 	void CPUProfiler::beginThread(const String& name)
@@ -340,40 +283,53 @@ namespace CamelotFramework
 		ProfiledBlock* parent = thread->activeBlock;
 		ProfiledBlock* block = nullptr;
 		
-		parent->findChild(name);
+		if(parent != nullptr)
+			block = parent->findChild(name);
 
 		if(block == nullptr)
 		{
 			block = thread->getBlock();
 			block->name = name;
 
-			parent->children.push_back(block);
-			thread->activePreciseBlock->basicChildren.push_back(block);
-
-			thread->activeBlocks.push(block);
-			thread->activeBlock = block;
+			if(parent != nullptr)
+				parent->children.push_back(block);
+			else
+				thread->rootBlock->children.push_back(block);
 		}
 
-		block->data.beginSample();
+		thread->activeBlocks.push(block);
+		thread->activeBlock = block;
+
+		block->basic.beginSample();
 	}
 
 	void CPUProfiler::endSample(const String& name)
 	{
 		ThreadInfo* thread = ThreadInfo::activeThread;
 		ProfiledBlock* block = thread->activeBlock;
-		block->data.endSample();
+		if(block == nullptr)
+		{
+			LOGWRN("Mismatched Profiler::endSample. No beginSample was called.");
+			return;
+		}
+
+		block->basic.endSample();
 
 		if(block->name != name)
 		{
 			LOGWRN("Mismatched Profiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
 
-			block->data.resumeLastSample();
+			block->basic.resumeLastSample();
 
 			return;
 		}
 
 		thread->activeBlocks.pop();
-		thread->activeBlock = thread->activeBlocks.top();
+
+		if(!thread->activeBlocks.empty())
+			thread->activeBlock = thread->activeBlocks.top();
+		else
+			thread->activeBlock = nullptr;
 	}
 
 	void CPUProfiler::beginSamplePrecise(const String& name)
@@ -385,43 +341,56 @@ namespace CamelotFramework
 		if(thread == nullptr || !thread->isActive)
 			beginThread("Unknown");
 
-		PreciseProfiledBlock* parent = thread->activePreciseBlock;
-		PreciseProfiledBlock* block = nullptr;
+		ProfiledBlock* parent = thread->activePreciseBlock;
+		ProfiledBlock* block = nullptr;
 		
-		parent->findChild(name);
+		if(parent != nullptr)
+			block = parent->findChild(name);
 
 		if(block == nullptr)
 		{
-			block = thread->getPreciseBlock();
+			block = thread->getBlock();
 			block->name = name;
 
-			parent->children.push_back(block);
-			thread->activeBlock->preciseChildren.push_back(block);
-
-			thread->activePreciseBlocks.push(block);
-			thread->activePreciseBlock = block;
+			if(parent != nullptr)
+				parent->children.push_back(block);
+			else
+				thread->rootBlock->children.push_back(block);
 		}
 
-		block->data.beginSample();
+		thread->activePreciseBlocks.push(block);
+		thread->activePreciseBlock = block;
+
+		block->precise.beginSample();
 	}
 
 	void CPUProfiler::endSamplePrecise(const String& name)
 	{
 		ThreadInfo* thread = ThreadInfo::activeThread;
-		PreciseProfiledBlock* block = thread->activePreciseBlock;
-		block->data.endSample();
+		ProfiledBlock* block = thread->activePreciseBlock;
+		if(block == nullptr)
+		{
+			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
+			return;
+		}
+
+		block->precise.endSample();
 
 		if(block->name != name)
 		{
 			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
 
-			block->data.resumeLastSample();
+			block->precise.resumeLastSample();
 
 			return;
 		}
 
 		thread->activePreciseBlocks.pop();
-		thread->activePreciseBlock = thread->activePreciseBlocks.top();
+
+		if(!thread->activePreciseBlocks.empty())
+			thread->activePreciseBlock = thread->activePreciseBlocks.top();
+		else
+			thread->activePreciseBlock = nullptr;
 	}
 
 	void CPUProfiler::update()
@@ -448,141 +417,257 @@ namespace CamelotFramework
 		if(thread->isActive)
 			thread->end();
 
-		if(thread->rootBlock != nullptr)
+		// We need to separate out basic and precise data and form two separate hierarchies
+		if(thread->rootBlock == nullptr)
+			return report;
+
+		struct TempEntry
 		{
-			// Fill up flatHierarchy array in a way so we always process
-			// children before parents
-			Stack<ProfiledBlock*>::type todo;
-			Vector<ProfiledBlock*>::type flatHierarchy;
-			Vector<CPUProfilerBasicSamplingEntry*>::type flatResultHierarchy;
+			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
+				:parentBlock(_parentBlock), entryIdx(_entryIdx)
+			{ }
+
+			ProfiledBlock* parentBlock;
+			UINT32 entryIdx;
+			Vector<UINT32>::type childIndexes;
+		};
+
+		Vector<CPUProfilerBasicSamplingEntry>::type basicEntries;
+		Vector<CPUProfilerPreciseSamplingEntry>::type preciseEntries;	
 
-			todo.push(thread->rootBlock);
-			flatHierarchy.push_back(thread->rootBlock);
-			flatResultHierarchy.push_back(&report.mBasicSamplingRootEntry);
+		// Fill up flatHierarchy array in a way so we always process children before parents
+		Stack<UINT32>::type todo;
+		Vector<TempEntry>::type flatHierarchy;
+
+		UINT32 entryIdx = 0;
+		todo.push(entryIdx);
+		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
+
+		entryIdx++;
+		while(!todo.empty())
+		{
+			UINT32 curDataIdx = todo.top();
+			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
 
-			while(!todo.empty())
+			todo.pop();
+
+			for(auto& child : curBlock->children)
 			{
-				ProfiledBlock* curBlock = todo.top();
-				todo.pop();
+				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);
 
-				CPUProfilerBasicSamplingEntry* parentEntry = flatResultHierarchy.back();
-				for(auto& child : curBlock->children)
-				{
-					todo.push(child);
-					flatHierarchy.push_back(child);
+				todo.push(entryIdx);
+				flatHierarchy.push_back(TempEntry(child, entryIdx));
 
-					parentEntry->childEntries.push_back(CPUProfilerBasicSamplingEntry());
-					flatResultHierarchy.push_back(&parentEntry->childEntries.back());					
-				}
+				entryIdx++;
 			}
+		}
+		
+		// Calculate sampling data for all entries
+		basicEntries.resize(flatHierarchy.size());
+		preciseEntries.resize(flatHierarchy.size());
+
+		for(auto& iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
+		{
+			TempEntry& curData = *iter;
+			ProfiledBlock* curBlock = curData.parentBlock;
 
-			auto& iter = flatHierarchy.rbegin();
-			auto& iterSample = flatResultHierarchy.rbegin();
+			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
+			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];
 
-			for(; iter != flatHierarchy.rend(); ++iter, ++iterSample)
+			// Calculate basic data
+			entryBasic->data.name = curBlock->name;
+
+			entryBasic->data.totalTimeMs = 0.0;
+			entryBasic->data.maxTimeMs = 0.0;
+			for(auto& sample : curBlock->basic.samples)
 			{
-				ProfiledBlock* curBlock = *iter;
-				CPUProfilerBasicSamplingEntry* entry = *iterSample;
+				entryBasic->data.totalTimeMs += sample.time;
+				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
+			}
 
-				entry->data.name = curBlock->name;
+			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();
 
-				entry->data.totalTimeMs = 0.0;
-				entry->data.maxTimeMs = 0.0;
-				for(auto& sample : curBlock->data.samples)
-				{
-					entry->data.totalTimeMs += sample.time;
-					entry->data.maxTimeMs = std::max(entry->data.maxTimeMs, sample.time);
-				}
+			if(entryBasic->data.numCalls > 0)
+				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;
 
-				entry->data.numCalls = (UINT32)curBlock->data.samples.size();
-				entry->data.avgTimeMs = entry->data.totalTimeMs / entry->data.numCalls;
+			double totalChildTime = 0.0;
+			for(auto& childIdx : curData.childIndexes)
+			{
+				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
+				totalChildTime += childEntry->data.totalTimeMs;
+				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);
 
-				UINT32 childIdx = 0;
-				double totalChildTime = 0.0;
-				for(auto& child : curBlock->children)
-				{
-					totalChildTime += entry->childEntries[childIdx].data.totalTimeMs;
-					entry->childEntries[childIdx].data.pctOfParent = entry->childEntries[childIdx].data.totalTimeMs / entry->data.totalTimeMs;
+				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
+			}
 
-					entry->data.estimatedOverheadMs += entry->childEntries[childIdx].data.estimatedOverheadMs + mBasicSamplingOverhead;
+			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
+			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;
 
-					childIdx++;
-				}
+			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;
+
+			if(entryBasic->data.numCalls > 0)
+				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;
+
+			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
 
-				entry->data.totalSelfTimeMs = entry->data.totalTimeMs - totalChildTime;
-				entry->data.avgSelfTimeMs = entry->data.totalSelfTimeMs / entry->data.numCalls;
+			// Calculate precise data
+			entryPrecise->data.name = curBlock->name;
 
-				entry->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
+			entryPrecise->data.totalCycles = 0;
+			entryPrecise->data.maxCycles = 0;
+			for(auto& sample : curBlock->precise.samples)
+			{
+				entryPrecise->data.totalCycles += sample.cycles;
+				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
 			}
+
+			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();
+
+			if(entryPrecise->data.numCalls > 0)
+				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;
+
+			UINT64 totalChildCycles = 0;
+			for(auto& childIdx : curData.childIndexes)
+			{
+				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
+				totalChildCycles += childEntry->data.totalCycles;
+				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;
+
+				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
+			}
+
+			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
+			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;
+
+			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;
+
+			if(entryPrecise->data.numCalls > 0)
+				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;
+
+			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
 		}
 
-		if(thread->rootPreciseBlock != nullptr)
+		// Prune empty basic entries
+		Stack<UINT32>::type finalBasicHierarchyTodo;
+		Stack<UINT32>::type parentBasicEntryIndexes;
+		Vector<TempEntry>::type newBasicEntries;
+
+		finalBasicHierarchyTodo.push(0);
+
+		entryIdx = 0;
+		parentBasicEntryIndexes.push(entryIdx);
+		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
+
+		entryIdx++;
+
+		while(!finalBasicHierarchyTodo.empty())
 		{
-			// Fill up flatHierarchy array in a way so we always process
-			// children before parents
-			Stack<PreciseProfiledBlock*>::type todo;
-			Vector<PreciseProfiledBlock*>::type flatHierarchy;
-			Vector<CPUProfilerPreciseSamplingEntry*>::type flatResultHierarchy;
+			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
+			parentBasicEntryIndexes.pop();
 
-			todo.push(thread->rootPreciseBlock);
-			flatHierarchy.push_back(thread->rootPreciseBlock);
-			flatResultHierarchy.push_back(&report.mPreciseSamplingRootEntry);
+			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
+			TempEntry& curEntry = flatHierarchy[curEntryIdx];
+			CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[curEntryIdx];
+			finalBasicHierarchyTodo.pop();
 
-			while(!todo.empty())
+			for(auto& childIdx : curEntry.childIndexes)
 			{
-				PreciseProfiledBlock* curBlock = todo.top();
-				todo.pop();
+				finalBasicHierarchyTodo.push(childIdx);
 
-				CPUProfilerPreciseSamplingEntry* parentEntry = flatResultHierarchy.back();
-				for(auto& child : curBlock->children)
+				if(basicEntry.data.numCalls > 0)
 				{
-					todo.push(child);
-					flatHierarchy.push_back(child);
+					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
+					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
+
+					parentBasicEntryIndexes.push(entryIdx);
 
-					parentEntry->childEntries.push_back(CPUProfilerPreciseSamplingEntry());
-					flatResultHierarchy.push_back(&parentEntry->childEntries.back());					
+					entryIdx++;
 				}
+				else
+					parentBasicEntryIndexes.push(parentEntryIdx);
 			}
+		}
 
-			auto& iter = flatHierarchy.rbegin();
-			auto& iterSample = flatResultHierarchy.rbegin();
+		Vector<CPUProfilerBasicSamplingEntry*>::type finalBasicEntries;
+		finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);
+		
+		UINT32 curEntryIdx = 0;
+		for(auto& curEntry : newBasicEntries)
+		{
+			CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];
 
-			for(; iter != flatHierarchy.rend(); ++iter, ++iterSample)
+			for(auto& childIdx : curEntry.childIndexes)
 			{
-				PreciseProfiledBlock* curBlock = *iter;
-				CPUProfilerPreciseSamplingEntry* entry = *iterSample;
+				TempEntry& childEntry = newBasicEntries[childIdx];
+				basicEntry->childEntries.push_back(basicEntries[childEntry.entryIdx]);
 
-				entry->data.name = curBlock->name;
+				finalBasicEntries.push_back(&basicEntry->childEntries.back());
+			}
 
-				entry->data.totalCycles = 0;
-				entry->data.maxCycles = 0;
-				for(auto& sample : curBlock->data.samples)
-				{
-					entry->data.totalCycles += sample.cycles;
-					entry->data.maxCycles = std::max(entry->data.maxCycles, sample.cycles);
-				}
+			curEntryIdx++;
+		}
+
+		// Prune empty precise entries
+		Stack<UINT32>::type finalPreciseHierarchyTodo;
+		Stack<UINT32>::type parentPreciseEntryIndexes;
+		Vector<TempEntry>::type newPreciseEntries;
+
+		finalPreciseHierarchyTodo.push(0);
+
+		entryIdx = 0;
+		parentPreciseEntryIndexes.push(entryIdx);
+		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
+
+		entryIdx++;
+
+		while(!finalPreciseHierarchyTodo.empty())
+		{
+			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
+			parentPreciseEntryIndexes.pop();
 
-				entry->data.numCalls = (UINT32)curBlock->data.samples.size();
-				entry->data.avgCycles = entry->data.avgCycles / entry->data.numCalls;
+			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
+			TempEntry& curEntry = flatHierarchy[curEntryIdx];
+			CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[curEntryIdx];
+			finalPreciseHierarchyTodo.pop();
 
-				UINT32 childIdx = 0;
-				UINT64 totalChildCycles = 0;
-				for(auto& child : curBlock->children)
+			for(auto& childIdx : curEntry.childIndexes)
+			{
+				finalPreciseHierarchyTodo.push(childIdx);
+
+				if(preciseEntry.data.numCalls > 0)
 				{
-					totalChildCycles += entry->childEntries[childIdx].data.totalCycles;
-					entry->childEntries[childIdx].data.pctOfParent = entry->childEntries[childIdx].data.totalCycles / (float)entry->data.totalCycles;
+					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
+					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
 
-					entry->data.estimatedOverhead += entry->childEntries[childIdx].data.estimatedOverhead + mPreciseSamplingOverhead;
+					parentPreciseEntryIndexes.push(entryIdx);
 
-					childIdx++;
+					entryIdx++;
 				}
+				else
+					parentPreciseEntryIndexes.push(parentEntryIdx);
+			}
+		}
 
-				entry->data.totalSelfCycles = entry->data.totalCycles - totalChildCycles;
-				entry->data.avgSelfCycles = entry->data.totalSelfCycles / entry->data.numCalls;
+		Vector<CPUProfilerPreciseSamplingEntry*>::type finalPreciseEntries;
+		finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);
 
-				entry->data.estimatedSelfOverhead = mPreciseTimerOverhead;
+		curEntryIdx = 0;
+		for(auto& curEntry : newPreciseEntries)
+		{
+			CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];
+
+			for(auto& childIdx : curEntry.childIndexes)
+			{
+				TempEntry& childEntry = newPreciseEntries[childIdx];
+				preciseEntry->childEntries.push_back(preciseEntries[childEntry.entryIdx]);
+
+				finalPreciseEntries.push_back(&preciseEntry->childEntries.back());
 			}
+
+			curEntryIdx++;
 		}
+
+		return report;
 	}
 
 	void CPUProfiler::estimateTimerOverhead()
@@ -592,7 +677,7 @@ namespace CamelotFramework
 
 		mBasicTimerOverhead = 1000000.0;
 		mPreciseTimerOverhead = 1000000;
-		for (UINT32 tries = 0; tries < 20; tries++) 
+		for (UINT32 tries = 0; tries < 5; tries++) 
 		{
 			Timer timer;
 			for (UINT32 i = 0; i < reps; i++) 
@@ -617,10 +702,19 @@ namespace CamelotFramework
 				mPreciseTimerOverhead = avgCycles;
 		}
 
+		mBasicSamplingOverheadMs = 1000000.0;
+		mPreciseSamplingOverheadMs = 1000000.0;
+		mBasicSamplingOverheadCycles = 1000000;
+		mPreciseSamplingOverheadCycles = 1000000;
 		for (UINT32 tries = 0; tries < 20; tries++) 
 		{
-			Timer timer;
-			timer.start();
+			/************************************************************************/
+			/* 				AVERAGE TIME IN MS FOR BASIC SAMPLING                   */
+			/************************************************************************/
+
+			Timer timerA;
+			timerA.start();
+
 			beginThread("Main");
 
 			// Two different cases that can effect performance, one where
@@ -651,21 +745,125 @@ namespace CamelotFramework
 
 			for (UINT32 i = 0; i < sampleReps * 5; i++) 
 			{
-				beginSample("Test#" + toString(i));
-				endSample("Test#" + toString(i));
+				beginSample("TestAvg#" + toString(i));
+				endSample("TestAvg#" + toString(i));
 			}
 
 			endThread();
-			timer.stop();
+
+			timerA.stop();
 
 			reset();
 
-			double avgTime = double(timer.time)/double(sampleReps * 10 + sampleReps * 5);
-			if (avgTime < mBasicSamplingOverhead)
-				mBasicSamplingOverhead = avgTime;
+			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
+			if (avgTimeBasic < mBasicSamplingOverheadMs)
+				mBasicSamplingOverheadMs = avgTimeBasic;
+
+			/************************************************************************/
+			/* 					AVERAGE CYCLES FOR BASIC SAMPLING                   */
+			/************************************************************************/
+
+			TimerPrecise timerPreciseA;
+			timerPreciseA.start();
 
-			TimerPrecise timerPrecise;
-			timerPrecise.start();
+			beginThread("Main");
+
+			// Two different cases that can effect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				beginSample("TestAvg1");
+				endSample("TestAvg1");
+				beginSample("TestAvg2");
+				endSample("TestAvg2");
+				beginSample("TestAvg3");
+				endSample("TestAvg3");
+				beginSample("TestAvg4");
+				endSample("TestAvg4");
+				beginSample("TestAvg5");
+				endSample("TestAvg5");
+				beginSample("TestAvg6");
+				endSample("TestAvg6");
+				beginSample("TestAvg7");
+				endSample("TestAvg7");
+				beginSample("TestAvg8");
+				endSample("TestAvg8");
+				beginSample("TestAvg9");
+				endSample("TestAvg9");
+				beginSample("TestAvg10");
+				endSample("TestAvg10");
+			}
+
+			for (UINT32 i = 0; i < sampleReps * 5; i++) 
+			{
+				beginSample("TestAvg#" + toString(i));
+				endSample("TestAvg#" + toString(i));
+			}
+
+			endThread();
+			timerPreciseA.stop();
+
+			reset();
+
+			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
+			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
+				mBasicSamplingOverheadCycles = avgCyclesBasic;
+
+			/************************************************************************/
+			/* 				AVERAGE TIME IN MS FOR PRECISE SAMPLING                 */
+			/************************************************************************/
+
+			Timer timerB;
+			timerB.start();
+			beginThread("Main");
+
+			// Two different cases that can effect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				beginSamplePrecise("TestAvg1");
+				endSamplePrecise("TestAvg1");
+				beginSamplePrecise("TestAvg2");
+				endSamplePrecise("TestAvg2");
+				beginSamplePrecise("TestAvg3");
+				endSamplePrecise("TestAvg3");
+				beginSamplePrecise("TestAvg4");
+				endSamplePrecise("TestAvg4");
+				beginSamplePrecise("TestAvg5");
+				endSamplePrecise("TestAvg5");
+				beginSamplePrecise("TestAvg6");
+				endSamplePrecise("TestAvg6");
+				beginSamplePrecise("TestAvg7");
+				endSamplePrecise("TestAvg7");
+				beginSamplePrecise("TestAvg8");
+				endSamplePrecise("TestAvg8");
+				beginSamplePrecise("TestAvg9");
+				endSamplePrecise("TestAvg9");
+				beginSamplePrecise("TestAvg10");
+				endSamplePrecise("TestAvg10");
+			}
+
+			for (UINT32 i = 0; i < sampleReps * 5; i++) 
+			{
+				beginSamplePrecise("TestAvg#" + toString(i));
+				endSamplePrecise("TestAvg#" + toString(i));
+			}
+
+			endThread();
+			timerB.stop();
+
+			reset();
+
+			double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
+			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
+				mPreciseSamplingOverheadMs = avgTimesPrecise;
+
+			/************************************************************************/
+			/* 				AVERAGE CYCLES FOR PRECISE SAMPLING                     */
+			/************************************************************************/
+
+			TimerPrecise timerPreciseB;
+			timerPreciseB.start();
 			beginThread("Main");
 
 			// Two different cases that can effect performance, one where
@@ -696,18 +894,18 @@ namespace CamelotFramework
 
 			for (UINT32 i = 0; i < sampleReps * 5; i++) 
 			{
-				beginSamplePrecise("Test#" + toString(i));
-				endSamplePrecise("Test#" + toString(i));
+				beginSamplePrecise("TestAvg#" + toString(i));
+				endSamplePrecise("TestAvg#" + toString(i));
 			}
 
 			endThread();
-			timerPrecise.stop();
+			timerPreciseB.stop();
 
 			reset();
 
-			UINT64 avgCycles = timerPrecise.cycles/(sampleReps * 10 + sampleReps * 5);
-			if (avgCycles < mPreciseSamplingOverhead)
-				mPreciseSamplingOverhead = avgCycles;
+			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
+			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
+				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
 		}
 	}
 
@@ -722,4 +920,9 @@ namespace CamelotFramework
 		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
 		estimatedOverhead(0), pctOfParent(1.0f)
 	{ }
+
+	CPUProfilerReport::CPUProfilerReport()
+	{
+
+	}
 }
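
As the commit message says, the report path is untested. For reviewers, a small sketch of how the basic-sampling hierarchy assembled above could be walked once a CPUProfilerReport has been obtained (how the report is retrieved, and public access to the entry fields, are assumptions; the field names follow the assignments in this diff):

	#include <cstdio>

	// Recursively prints one branch of the basic sampling hierarchy (illustration only).
	void printBasicEntry(const CamelotFramework::CPUProfilerBasicSamplingEntry& entry, int depth = 0)
	{
		printf("%*s%s: %u calls, %.3f ms total, %.3f ms self, ~%.3f ms overhead\n",
			depth * 2, "",
			entry.data.name.c_str(),           // assumes String exposes c_str()
			entry.data.numCalls,
			entry.data.totalTimeMs,
			entry.data.totalSelfTimeMs,
			entry.data.estimatedOverheadMs);

		for(auto& child : entry.childEntries)
			printBasicEntry(child, depth + 1);
	}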