Forráskód Böngészése

Moved CPUProfiler to Core so I can use Platform methods for querying performance instead of requiring different profiler implementations per platform

Marko Pintera 12 éve
szülő
commit
32dc981a8d

+ 2 - 0
CamelotCore/CamelotCore.vcxproj

@@ -273,6 +273,7 @@
   </ItemDefinitionGroup>
   </ItemDefinitionGroup>
   <ItemGroup>
   <ItemGroup>
     <ClInclude Include="Include\CmCoreThread.h" />
     <ClInclude Include="Include\CmCoreThread.h" />
+    <ClInclude Include="Include\CmCPUProfiler.h" />
     <ClInclude Include="Include\CmDefaultRenderQueue.h" />
     <ClInclude Include="Include\CmDefaultRenderQueue.h" />
     <ClInclude Include="Include\CmDeferredCallManager.h" />
     <ClInclude Include="Include\CmDeferredCallManager.h" />
     <ClInclude Include="Include\CmGameObjectHandle.h" />
     <ClInclude Include="Include\CmGameObjectHandle.h" />
@@ -403,6 +404,7 @@
   <ItemGroup>
   <ItemGroup>
     <ClCompile Include="Include\CmMaterialManager.cpp" />
     <ClCompile Include="Include\CmMaterialManager.cpp" />
     <ClCompile Include="Source\CmCoreThread.cpp" />
     <ClCompile Include="Source\CmCoreThread.cpp" />
+    <ClCompile Include="Source\CmCPUProfiler.cpp" />
     <ClCompile Include="Source\CmDefaultRenderQueue.cpp" />
     <ClCompile Include="Source\CmDefaultRenderQueue.cpp" />
     <ClCompile Include="Source\CmDeferredCallManager.cpp" />
     <ClCompile Include="Source\CmDeferredCallManager.cpp" />
     <ClCompile Include="Source\CmGameObjectHandle.cpp" />
     <ClCompile Include="Source\CmGameObjectHandle.cpp" />

+ 6 - 0
CamelotCore/CamelotCore.vcxproj.filters

@@ -474,6 +474,9 @@
     <ClInclude Include="Include\CmProfiler.h">
     <ClInclude Include="Include\CmProfiler.h">
       <Filter>Header Files</Filter>
       <Filter>Header Files</Filter>
     </ClInclude>
     </ClInclude>
+    <ClInclude Include="Include\CmCPUProfiler.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   </ItemGroup>
   <ItemGroup>
   <ItemGroup>
     <ClCompile Include="Source\CmApplication.cpp">
     <ClCompile Include="Source\CmApplication.cpp">
@@ -734,5 +737,8 @@
     <ClCompile Include="Source\CmProfiler.cpp">
     <ClCompile Include="Source\CmProfiler.cpp">
       <Filter>Source Files</Filter>
       <Filter>Source Files</Filter>
     </ClCompile>
     </ClCompile>
+    <ClCompile Include="Source\CmCPUProfiler.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   </ItemGroup>
 </Project>
 </Project>

+ 297 - 0
CamelotCore/Include/CmCPUProfiler.h

@@ -0,0 +1,297 @@
+#pragma once
+
+#include "CmPrerequisites.h"
+
+namespace CamelotFramework
+{
+	class CPUProfilerReport;
+
+	// TODO: Add #defines for all profiler methods so we can easily remove them from final version
+
+	/**
+	 * @brief	Provides various performance measuring methods
+	 * 			
+	 * @note	This class is thread safe. Matching begin*\end* calls
+	 * 			must belong to the same thread though.
+	 */
	class CM_EXPORT CPUProfiler
	{
		/** Simple accumulating timer; values are in milliseconds (see Timer::getCurrentTime in the .cpp). */
		class Timer
		{
		public:
			Timer();

			void start();
			void stop();
			void reset();

			double time; // total accumulated time in milliseconds
		private:
			double startTime; // timestamp captured by the last start() call

			static inline double getCurrentTime();
		};

		/** Accumulating CPU cycle counter based on the processor timestamp counter (RDTSC). */
		class TimerPrecise
		{
		public:
			TimerPrecise();

			void start();
			void stop();
			void reset();

			UINT64 cycles; // total accumulated cycle count
		private:
			UINT64 startCycles; // cycle count captured by the last start() call

			static inline UINT64 getNumCycles();
		};

		/** One recorded basic measurement, in milliseconds. */
		struct ProfileSample
		{
			ProfileSample(double _time)
				:time(_time)
			{ }

			double time;
		};

		/** One recorded precise measurement, in CPU cycles. */
		struct PreciseProfileSample
		{
			PreciseProfileSample(UINT64 _cycles)
				:cycles(_cycles)
			{ }

			UINT64 cycles;
		};

		/** Basic (timer based) samples recorded for a single profiled block. */
		struct ProfileData
		{
			Vector<ProfileSample>::type samples;
			Timer timer;

			void beginSample();
			void endSample();
			void resumeLastSample();
		};

		/** Precise (cycle based) samples recorded for a single profiled block. */
		struct PreciseProfileData
		{
			// TODO - Add cache misses, branch mispredictions, retired instructions vs. optimal number of cycles (RDPMC instruction on Intel)

			Vector<PreciseProfileSample>::type samples;
			TimerPrecise timer;

			void beginSample();
			void endSample();
			void resumeLastSample();
		};

		// NOTE(review): PreciseProfiledBlock is forward-declared but never defined or
		// used anywhere in this header/translation unit - likely a leftover; confirm.
		struct PreciseProfiledBlock;
		struct ProfiledBlock;

		/** Node in the per-thread hierarchy of sampled code blocks. Owns its children. */
		struct ProfiledBlock
		{
			ProfiledBlock();
			~ProfiledBlock();

			String name;
			
			ProfileData basic;
			PreciseProfileData precise;

			Vector<ProfiledBlock*>::type children;

			ProfiledBlock* findChild(const String& name) const;
		};

		/** Distinguishes which sampling mode a block was opened with. */
		enum class ActiveSamplingType
		{
			Basic,
			Precise
		};

		/** A block currently being sampled, together with the mode it was opened with. */
		struct ActiveBlock
		{
			ActiveBlock()
				:type(ActiveSamplingType::Basic), block(nullptr)
			{ }

			ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block)
				:type(_type), block(_block)
			{ }

			ActiveSamplingType type;
			ProfiledBlock* block;
		};

		/** Per-thread sampling state. One instance exists per profiled thread (thread-local). */
		struct ThreadInfo
		{
			ThreadInfo();

			static CM_THREADLOCAL ThreadInfo* activeThread; // this thread's info, set by beginThread
			bool isActive; // true between begin() and end()

			ProfiledBlock* rootBlock; // root of this thread's block hierarchy

			Stack<ActiveBlock>::type activeBlocks; // currently open (nested) sample blocks
			ActiveBlock activeBlock; // innermost open block (top of activeBlocks)

			void begin(const String& _name);
			void end();
			void reset();

			ProfiledBlock* getBlock();
			void releaseBlock(ProfiledBlock* block);
		};

	public:
		CPUProfiler();
		~CPUProfiler();

		/**
		 * @brief	Registers a new thread we will be doing sampling in. This needs to be called before any beginSample*\endSample* calls
		 * 			are made in that thread.
		 *
		 * @param	name	Name that will allow you to more easily identify the thread.
		 */
		void beginThread(const String& name);

		/**
		 * @brief	Ends sampling for the current thread. No beginSample*\endSample* calls after this point.
		 */
		void endThread();

		/**
		 * @brief	Begins sample measurement. Must be followed by endSample. 
		 *
		 * @param	name	Unique name for the sample you can later use to find the sampling data.
		 */
		void beginSample(const String& name);

		/**
		 * @brief	Ends sample measurement and returns measured data.
		 *
		 * @param	name	Unique name for the sample. 
		 * 					
		 * @note	Unique name is primarily needed to more easily identify mismatched
		 * 			begin/end sample pairs. Otherwise the name in beginSample would be enough.
		 */
		void endSample(const String& name);

		/**
		 * @brief	Begins sample measurement. Must be followed by endSample. 
		 *
		 * @param	name	Unique name for the sample you can later use to find the sampling data.
		 * 					
		 * @note	This method uses very precise CPU counters to determine variety of data not
		 * 			provided by standard beginSample. However due to the way these counters work you should
		 * 			not use this method for larger parts of code. It does not consider context switches so if the OS
		 * 			decides to switch context between measurements you will get invalid data.
		 */
		void beginSamplePrecise(const String& name);

		/**
		 * @brief	Ends precise sample measurement and returns measured data.
		 *
		 * @param	name	Unique name for the sample. 
		 * 					
		 * @note	Unique name is primarily needed to more easily identify mismatched
		 * 			begin/end sample pairs. Otherwise the name in beginSamplePrecise would be enough.
		 */
		void endSamplePrecise(const String& name);

		/**
		 * @brief	Clears all sampling data, and ends any unfinished sampling blocks.
		 */
		void reset();

		/**
		 * @brief	Generates a report from all previously sampled data.
		 * 			
		 * @note	Generating a report will stop all in-progress sampling. You should make sure
		 * 			you call endSample* manually beforehand so this doesn't have to happen.
		 */
		CPUProfilerReport generateReport();

	private:
		// Smallest observed cost of a single timer start/stop pair; used to compensate report data.
		double mBasicTimerOverhead;
		UINT64 mPreciseTimerOverhead;

		// Estimated cost of one beginSample/endSample pair, in milliseconds and in cycles.
		double mBasicSamplingOverheadMs;
		double mPreciseSamplingOverheadMs;
		UINT64 mBasicSamplingOverheadCycles;
		UINT64 mPreciseSamplingOverheadCycles;

		Vector<ThreadInfo*>::type mActiveThreads; // every thread that ever called beginThread; freed in the destructor
		CM_MUTEX(mThreadSync); // guards mActiveThreads

		void estimateTimerOverhead();
	};
+
	/** Single node of per-block basic (timer based) sampling results in a generated report. */
	struct CM_EXPORT CPUProfilerBasicSamplingEntry
	{
		struct CM_EXPORT Data
		{
			Data();

			String name; // name of the sampled block
			UINT32 numCalls; // number of recorded samples

			double avgTimeMs; // average time per call
			double maxTimeMs; // longest single call
			double totalTimeMs; // sum over all calls

			double avgSelfTimeMs; // average time per call, excluding time spent in child blocks
			double totalSelfTimeMs; // total time excluding time spent in child blocks

			double estimatedSelfOverheadMs; // estimated profiler cost for this block alone
			double estimatedOverheadMs; // estimated profiler cost including all children

			float pctOfParent; // fraction of the parent block's total time
		} data;

		Vector<CPUProfilerBasicSamplingEntry>::type childEntries;
	};
+
	/** Single node of per-block precise (CPU cycle based) sampling results in a generated report. */
	struct CM_EXPORT CPUProfilerPreciseSamplingEntry
	{
		struct CM_EXPORT Data
		{
			Data();

			String name; // name of the sampled block
			UINT32 numCalls; // number of recorded samples

			UINT64 avgCycles; // average cycles per call
			UINT64 maxCycles; // most cycles spent in a single call
			UINT64 totalCycles; // sum over all calls

			UINT64 avgSelfCycles; // average cycles per call, excluding child blocks
			UINT64 totalSelfCycles; // total cycles excluding child blocks

			UINT64 estimatedSelfOverhead; // estimated profiler cost (cycles) for this block alone
			UINT64 estimatedOverhead; // estimated profiler cost (cycles) including all children

			float pctOfParent; // fraction of the parent block's total cycles
		} data;

		Vector<CPUProfilerPreciseSamplingEntry>::type childEntries;
	};
+
	/** Snapshot of all data sampled by CPUProfiler at the time generateReport() was called. */
	class CM_EXPORT CPUProfilerReport
	{
	public:
		CPUProfilerReport();

		/** Root of the basic (timer based) sampling hierarchy. */
		const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }
		/** Root of the precise (cycle based) sampling hierarchy. */
		const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }

	private:
		friend class CPUProfiler; // CPUProfiler fills in the root entries when generating a report

		CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;
		CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;
	};

+ 2 - 8
CamelotCore/Include/Win32/CmPlatformImpl.h

@@ -167,15 +167,9 @@ namespace CamelotFramework
 
 
 		/**
 		/**
 		 * @brief	Queries the internal system performance counter you can use for very precise time
 		 * @brief	Queries the internal system performance counter you can use for very precise time
-		 * 			measurements. Value is in "queryPerformanceFrequency" units.
+		 * 			measurements. Value is in milliseconds.
 		 */
 		 */
-		static UINT64 queryPerformanceCounter();
-
-		/**
-		 * @brief	Queries the internal system performance counter frequency. Used for interpreting
-		 * 			data returned by "queryPerformanceCounter".
-		 */
-		static UINT64 queryPerformanceFrequency();
+		static double queryPerformanceTimerMs();
 
 
 		/**
 		/**
 		 * @brief	Message pump. Processes OS messages and returns when it's free.
 		 * @brief	Message pump. Processes OS messages and returns when it's free.

+ 927 - 0
CamelotCore/Source/CmCPUProfiler.cpp

@@ -0,0 +1,927 @@
+#include "CmCPUProfiler.h"
+#include "CmDebug.h"
+#include "CmPlatform.h"
+
+namespace CamelotFramework
+{
+	CPUProfiler::Timer::Timer()
+	{
+		time = 0.0f;
+	}
+
+	void CPUProfiler::Timer::start()
+	{
+		startTime = getCurrentTime();
+	}
+
+	void CPUProfiler::Timer::stop()
+	{
+		time += getCurrentTime() - startTime;
+	}
+
+	void CPUProfiler::Timer::reset()
+	{
+		time = 0.0f;
+	}
+
+	inline double CPUProfiler::Timer::getCurrentTime() 
+	{
+		return Platform::queryPerformanceTimerMs();
+	}
+
+	CPUProfiler::TimerPrecise::TimerPrecise()
+	{
+		cycles = 0;
+	}
+
+	void CPUProfiler::TimerPrecise::start()
+	{
+		startCycles = getNumCycles();
+	}
+
+	void CPUProfiler::TimerPrecise::stop()
+	{
+		cycles += getNumCycles() - startCycles;
+	}
+
+	void CPUProfiler::TimerPrecise::reset()
+	{
+		cycles = 0;
+	}
+
+	inline UINT64 CPUProfiler::TimerPrecise::getNumCycles() 
+	{
+#if CM_COMPILER == CM_COMPILER_GNUC
+		asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx" );
+		UINT32 __a,__d;
+		asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
+		return ( UINT64(__a) | UINT64(__d) << 32 );
+#else
+		int a[4];
+		int b = 0;
+		__cpuid(a, b);
+		return __rdtsc();
+#endif		
+	}
+
+	void CPUProfiler::ProfileData::beginSample()
+	{
+		timer.reset();
+		timer.start();
+	}
+
+	void CPUProfiler::ProfileData::endSample()
+	{
+		timer.stop();
+		samples.push_back(ProfileSample(timer.time));
+	}
+
+	void CPUProfiler::ProfileData::resumeLastSample()
+	{
+		timer.start();
+		samples.erase(samples.end() - 1);
+	}
+
+	void CPUProfiler::PreciseProfileData::beginSample()
+	{
+		timer.reset();
+		timer.start();
+	}
+
+	void CPUProfiler::PreciseProfileData::endSample()
+	{
+		timer.stop();
+		samples.push_back(PreciseProfileSample(timer.cycles));
+	}
+
+	void CPUProfiler::PreciseProfileData::resumeLastSample()
+	{
+		timer.start();
+		samples.erase(samples.end() - 1);
+	}
+
+	CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;
+
+	CPUProfiler::ThreadInfo::ThreadInfo()
+		:isActive(false), rootBlock(nullptr)
+	{
+
+	}
+
+	void CPUProfiler::ThreadInfo::begin(const String& _name)
+	{
+		if(isActive)
+		{
+			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
+			return;
+		}
+
+		if(rootBlock == nullptr)
+			rootBlock = getBlock();
+
+		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
+		activeBlocks.push(activeBlock);
+		
+		rootBlock->name = _name; 
+		rootBlock->basic.beginSample();
+		isActive = true;
+	}
+
+	void CPUProfiler::ThreadInfo::end()
+	{
+		if(activeBlock.type == ActiveSamplingType::Basic)
+			activeBlock.block->basic.endSample();
+		else
+			activeBlock.block->precise.endSample();
+
+		activeBlocks.pop();
+
+		if(!isActive)
+			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
+
+		if(activeBlocks.size() > 0)
+		{
+			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
+
+			while(activeBlocks.size() > 0)
+			{
+				ActiveBlock& curBlock = activeBlocks.top();
+				if(curBlock.type == ActiveSamplingType::Basic)
+					curBlock.block->basic.endSample();
+				else
+					curBlock.block->precise.endSample();
+
+				activeBlocks.pop();
+			}
+		}
+
+		isActive = false;
+		activeBlocks = Stack<ActiveBlock>::type();
+		activeBlock = ActiveBlock();
+	}
+
+	void CPUProfiler::ThreadInfo::reset()
+	{
+		if(isActive)
+			end();
+
+		if(rootBlock != nullptr)
+			releaseBlock(rootBlock);
+
+		rootBlock = nullptr;
+	}
+
+	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
+	{
+		// TODO - Pool this, if possible using the memory allocator stuff
+		// TODO - Also consider moving all samples in ThreadInfo, and also pool them (otherwise I can't pool ProfiledBlock since it will be variable size)
+		return cm_new<ProfiledBlock>();
+	}
+
+	void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block)
+	{
+		cm_delete(block);
+	}
+
+	CPUProfiler::ProfiledBlock::ProfiledBlock()
+	{ }
+
+	CPUProfiler::ProfiledBlock::~ProfiledBlock()
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+
+		for(auto& child : children)
+			thread->releaseBlock(child);
+
+		children.clear();
+	}
+
+	CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
+	{
+		for(auto& child : children)
+		{
+			if(child->name == name)
+				return child;
+		}
+
+		return nullptr;
+	}
+
+	CPUProfiler::CPUProfiler()
+		:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
+		mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
+	{
+		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
+		// and keep separate values per thread.
+		estimateTimerOverhead();
+	}
+
+	CPUProfiler::~CPUProfiler()
+	{
+		reset();
+
+		CM_LOCK_MUTEX(mThreadSync);
+
+		for(auto& threadInfo : mActiveThreads)
+			cm_delete(threadInfo);
+	}
+
+	void CPUProfiler::beginThread(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr)
+		{
+			ThreadInfo::activeThread = cm_new<ThreadInfo>();
+			thread = ThreadInfo::activeThread;
+
+			{
+				CM_LOCK_MUTEX(mThreadSync);
+
+				mActiveThreads.push_back(thread);
+			}
+		}
+
+		thread->begin(name);
+	}
+
+	void CPUProfiler::endThread()
+	{
+		// I don't do a nullcheck where on purpose, so endSample can be called ASAP
+		ThreadInfo::activeThread->end();
+	}
+
+	void CPUProfiler::beginSample(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr || !thread->isActive)
+			beginThread("Unknown");
+
+		ProfiledBlock* parent = thread->activeBlock.block;
+		ProfiledBlock* block = nullptr;
+		
+		if(parent != nullptr)
+			block = parent->findChild(name);
+
+		if(block == nullptr)
+		{
+			block = thread->getBlock();
+			block->name = name;
+
+			if(parent != nullptr)
+				parent->children.push_back(block);
+			else
+				thread->rootBlock->children.push_back(block);
+		}
+
+		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
+		thread->activeBlocks.push(thread->activeBlock);
+
+		block->basic.beginSample();
+	}
+
+	void CPUProfiler::endSample(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		ProfiledBlock* block = thread->activeBlock.block;
+
+#if CM_DEBUG_MODE
+		if(block == nullptr)
+		{
+			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
+			return;
+		}
+
+		if(thread->activeBlock.type == ActiveSamplingType::Precise)
+		{
+			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
+			return;
+		}
+
+		if(block->name != name)
+		{
+			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
+			return;
+		}
+#endif
+
+		block->basic.endSample();
+
+		thread->activeBlocks.pop();
+
+		if(!thread->activeBlocks.empty())
+			thread->activeBlock = thread->activeBlocks.top();
+		else
+			thread->activeBlock = ActiveBlock();
+	}
+
+	void CPUProfiler::beginSamplePrecise(const String& name)
+	{
+		// Note: There is a (small) possibility a context switch will happen during this measurement in which case result will be skewed. 
+		// Increasing thread priority might help. This is generally only a problem with code that executes a long time (10-15+ ms - depending on OS quant length)
+		
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr || !thread->isActive)
+			beginThread("Unknown");
+
+		ProfiledBlock* parent = thread->activeBlock.block;
+		ProfiledBlock* block = nullptr;
+		
+		if(parent != nullptr)
+			block = parent->findChild(name);
+
+		if(block == nullptr)
+		{
+			block = thread->getBlock();
+			block->name = name;
+
+			if(parent != nullptr)
+				parent->children.push_back(block);
+			else
+				thread->rootBlock->children.push_back(block);
+		}
+
+		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
+		thread->activeBlocks.push(thread->activeBlock);
+
+		block->precise.beginSample();
+	}
+
+	void CPUProfiler::endSamplePrecise(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		ProfiledBlock* block = thread->activeBlock.block;
+
+#if CM_DEBUG_MODE
+		if(block == nullptr)
+		{
+			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
+			return;
+		}
+
+		if(thread->activeBlock.type == ActiveSamplingType::Basic)
+		{
+			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
+			return;
+		}
+
+		if(block->name != name)
+		{
+			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
+			return;
+		}
+#endif
+
+		block->precise.endSample();
+
+		thread->activeBlocks.pop();
+
+		if(!thread->activeBlocks.empty())
+			thread->activeBlock = thread->activeBlocks.top();
+		else
+			thread->activeBlock = ActiveBlock();
+	}
+
+	void CPUProfiler::reset()
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+
+		if(thread != nullptr)
+			thread->reset();
+	}
+
+	CPUProfilerReport CPUProfiler::generateReport()
+	{
+		CPUProfilerReport report;
+
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr)
+			return report;
+
+		if(thread->isActive)
+			thread->end();
+
+		// We need to separate out basic and precise data and form two separate hierarchies
+		if(thread->rootBlock == nullptr)
+			return report;
+
+		struct TempEntry
+		{
+			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
+				:parentBlock(_parentBlock), entryIdx(_entryIdx)
+			{ }
+
+			ProfiledBlock* parentBlock;
+			UINT32 entryIdx;
+			Vector<UINT32>::type childIndexes;
+		};
+
+		Vector<CPUProfilerBasicSamplingEntry>::type basicEntries;
+		Vector<CPUProfilerPreciseSamplingEntry>::type preciseEntries;	
+
+		// Fill up flatHierarchy array in a way so we always process children before parents
+		Stack<UINT32>::type todo;
+		Vector<TempEntry>::type flatHierarchy;
+
+		UINT32 entryIdx = 0;
+		todo.push(entryIdx);
+		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
+
+		entryIdx++;
+		while(!todo.empty())
+		{
+			UINT32 curDataIdx = todo.top();
+			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
+
+			todo.pop();
+
+			for(auto& child : curBlock->children)
+			{
+				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);
+
+				todo.push(entryIdx);
+				flatHierarchy.push_back(TempEntry(child, entryIdx));
+
+				entryIdx++;
+			}
+		}
+		
+		// Calculate sampling data for all entries
+		basicEntries.resize(flatHierarchy.size());
+		preciseEntries.resize(flatHierarchy.size());
+
+		for(auto& iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
+		{
+			TempEntry& curData = *iter;
+			ProfiledBlock* curBlock = curData.parentBlock;
+
+			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
+			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];
+
+			// Calculate basic data
+			entryBasic->data.name = curBlock->name;
+
+			entryBasic->data.totalTimeMs = 0.0;
+			entryBasic->data.maxTimeMs = 0.0;
+			for(auto& sample : curBlock->basic.samples)
+			{
+				entryBasic->data.totalTimeMs += sample.time;
+				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
+			}
+
+			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();
+
+			if(entryBasic->data.numCalls > 0)
+				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;
+
+			double totalChildTime = 0.0;
+			for(auto& childIdx : curData.childIndexes)
+			{
+				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
+				totalChildTime += childEntry->data.totalTimeMs;
+				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);
+
+				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
+			}
+
+			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
+			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;
+
+			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;
+
+			if(entryBasic->data.numCalls > 0)
+				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;
+
+			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
+
+			// Calculate precise data
+			entryPrecise->data.name = curBlock->name;
+
+			entryPrecise->data.totalCycles = 0;
+			entryPrecise->data.maxCycles = 0;
+			for(auto& sample : curBlock->precise.samples)
+			{
+				entryPrecise->data.totalCycles += sample.cycles;
+				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
+			}
+
+			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();
+
+			if(entryPrecise->data.numCalls > 0)
+				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;
+
+			UINT64 totalChildCycles = 0;
+			for(auto& childIdx : curData.childIndexes)
+			{
+				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
+				totalChildCycles += childEntry->data.totalCycles;
+				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;
+
+				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
+			}
+
+			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
+			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;
+
+			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;
+
+			if(entryPrecise->data.numCalls > 0)
+				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;
+
+			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
+		}
+
+		// Prune empty basic entries
+		Stack<UINT32>::type finalBasicHierarchyTodo;
+		Stack<UINT32>::type parentBasicEntryIndexes;
+		Vector<TempEntry>::type newBasicEntries;
+
+		finalBasicHierarchyTodo.push(0);
+
+		entryIdx = 0;
+		parentBasicEntryIndexes.push(entryIdx);
+		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
+
+		entryIdx++;
+
+		while(!finalBasicHierarchyTodo.empty())
+		{
+			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
+			parentBasicEntryIndexes.pop();
+
+			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
+			TempEntry& curEntry = flatHierarchy[curEntryIdx];
+			finalBasicHierarchyTodo.pop();
+
+			for(auto& childIdx : curEntry.childIndexes)
+			{
+				finalBasicHierarchyTodo.push(childIdx);
+
+				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
+				if(basicEntry.data.numCalls > 0)
+				{
+					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
+					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
+
+					parentBasicEntryIndexes.push(entryIdx);
+
+					entryIdx++;
+				}
+				else
+					parentBasicEntryIndexes.push(parentEntryIdx);
+			}
+		}
+
+		if(newBasicEntries.size() > 0)
+		{
+			Vector<CPUProfilerBasicSamplingEntry*>::type finalBasicEntries;
+
+			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
+			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);
+
+			UINT32 curEntryIdx = 0;
+			for(auto& curEntry : newBasicEntries)
+			{
+				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];
+
+				basicEntry->childEntries.resize(curEntry.childIndexes.size());
+				UINT32 idx = 0;
+				for(auto& childIdx : curEntry.childIndexes)
+				{
+					TempEntry& childEntry = newBasicEntries[childIdx];
+					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];
+
+					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
+					idx++;
+				}
+
+				curEntryIdx++;
+			}
+		}
+
+		// Prune empty precise entries
+		Stack<UINT32>::type finalPreciseHierarchyTodo;
+		Stack<UINT32>::type parentPreciseEntryIndexes;
+		Vector<TempEntry>::type newPreciseEntries;
+
+		finalPreciseHierarchyTodo.push(0);
+
+		entryIdx = 0;
+		parentPreciseEntryIndexes.push(entryIdx);
+		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
+
+		entryIdx++;
+
+		while(!finalPreciseHierarchyTodo.empty())
+		{
+			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
+			parentPreciseEntryIndexes.pop();
+
+			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
+			TempEntry& curEntry = flatHierarchy[curEntryIdx];
+			finalPreciseHierarchyTodo.pop();
+
+			for(auto& childIdx : curEntry.childIndexes)
+			{
+				finalPreciseHierarchyTodo.push(childIdx);
+
+				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
+				if(preciseEntry.data.numCalls > 0)
+				{
+					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
+					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
+
+					parentPreciseEntryIndexes.push(entryIdx);
+
+					entryIdx++;
+				}
+				else
+					parentPreciseEntryIndexes.push(parentEntryIdx);
+			}
+		}
+
+		if(newPreciseEntries.size() > 0)
+		{
+			Vector<CPUProfilerPreciseSamplingEntry*>::type finalPreciseEntries;
+
+			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
+			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);
+
+			UINT32 curEntryIdx = 0;
+			for(auto& curEntry : newPreciseEntries)
+			{
+				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];
+
+				preciseEntry->childEntries.resize(curEntry.childIndexes.size());
+				UINT32 idx = 0;
+				for(auto& childIdx : curEntry.childIndexes)
+				{
+					TempEntry& childEntry = newPreciseEntries[childIdx];
+					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];
+
+					finalPreciseEntries.push_back(&preciseEntry->childEntries.back());
+					idx++;
+				}
+
+				curEntryIdx++;
+			}
+		}
+
+		return report;
+	}
+
+	void CPUProfiler::estimateTimerOverhead()
+	{
+		// Get an idea of how long timer calls and RDTSC takes
+		const UINT32 reps = 1000, sampleReps = 100;
+
+		mBasicTimerOverhead = 1000000.0;
+		mPreciseTimerOverhead = 1000000;
+		for (UINT32 tries = 0; tries < 20; tries++) 
+		{
+			Timer timer;
+			for (UINT32 i = 0; i < reps; i++) 
+			{
+				timer.start();
+				timer.stop();
+			}
+
+			double avgTime = double(timer.time)/double(reps);
+			if (avgTime < mBasicTimerOverhead)
+				mBasicTimerOverhead = avgTime;
+
+			TimerPrecise timerPrecise;
+			for (UINT32 i = 0; i < reps; i++) 
+			{
+				timerPrecise.start();
+				timerPrecise.stop();
+			}
+
+			UINT64 avgCycles = timerPrecise.cycles/reps;
+			if (avgCycles < mPreciseTimerOverhead)
+				mPreciseTimerOverhead = avgCycles;
+		}
+
+		mBasicSamplingOverheadMs = 1000000.0;
+		mPreciseSamplingOverheadMs = 1000000.0;
+		mBasicSamplingOverheadCycles = 1000000;
+		mPreciseSamplingOverheadCycles = 1000000;
+		for (UINT32 tries = 0; tries < 20; tries++) 
+		{
+			/************************************************************************/
+			/* 				AVERAGE TIME IN MS FOR BASIC SAMPLING                   */
+			/************************************************************************/
+
+			Timer timerA;
+			timerA.start();
+
+			beginThread("Main");
+
+			// Two different cases that can effect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				beginSample("TestAvg1");
+				endSample("TestAvg1");
+				beginSample("TestAvg2");
+				endSample("TestAvg2");
+				beginSample("TestAvg3");
+				endSample("TestAvg3");
+				beginSample("TestAvg4");
+				endSample("TestAvg4");
+				beginSample("TestAvg5");
+				endSample("TestAvg5");
+				beginSample("TestAvg6");
+				endSample("TestAvg6");
+				beginSample("TestAvg7");
+				endSample("TestAvg7");
+				beginSample("TestAvg8");
+				endSample("TestAvg8");
+				beginSample("TestAvg9");
+				endSample("TestAvg9");
+				beginSample("TestAvg10");
+				endSample("TestAvg10");
+			}
+
+			for (UINT32 i = 0; i < sampleReps * 5; i++) 
+			{
+				beginSample("TestAvg#" + toString(i));
+				endSample("TestAvg#" + toString(i));
+			}
+
+			endThread();
+
+			timerA.stop();
+
+			reset();
+
+			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
+			if (avgTimeBasic < mBasicSamplingOverheadMs)
+				mBasicSamplingOverheadMs = avgTimeBasic;
+
+			/************************************************************************/
+			/* 					AVERAGE CYCLES FOR BASIC SAMPLING                   */
+			/************************************************************************/
+
+			TimerPrecise timerPreciseA;
+			timerPreciseA.start();
+
+			beginThread("Main");
+
+			// Two different cases that can effect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				beginSample("TestAvg1");
+				endSample("TestAvg1");
+				beginSample("TestAvg2");
+				endSample("TestAvg2");
+				beginSample("TestAvg3");
+				endSample("TestAvg3");
+				beginSample("TestAvg4");
+				endSample("TestAvg4");
+				beginSample("TestAvg5");
+				endSample("TestAvg5");
+				beginSample("TestAvg6");
+				endSample("TestAvg6");
+				beginSample("TestAvg7");
+				endSample("TestAvg7");
+				beginSample("TestAvg8");
+				endSample("TestAvg8");
+				beginSample("TestAvg9");
+				endSample("TestAvg9");
+				beginSample("TestAvg10");
+				endSample("TestAvg10");
+			}
+
+			for (UINT32 i = 0; i < sampleReps * 5; i++) 
+			{
+				beginSample("TestAvg#" + toString(i));
+				endSample("TestAvg#" + toString(i));
+			}
+
+			endThread();
+			timerPreciseA.stop();
+
+			reset();
+
+			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
+			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
+				mBasicSamplingOverheadCycles = avgCyclesBasic;
+
+			/************************************************************************/
+			/* 				AVERAGE TIME IN MS FOR PRECISE SAMPLING                 */
+			/************************************************************************/
+
+			Timer timerB;
+			timerB.start();
+			beginThread("Main");
+
+			// Two different cases that can effect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				beginSamplePrecise("TestAvg1");
+				endSamplePrecise("TestAvg1");
+				beginSamplePrecise("TestAvg2");
+				endSamplePrecise("TestAvg2");
+				beginSamplePrecise("TestAvg3");
+				endSamplePrecise("TestAvg3");
+				beginSamplePrecise("TestAvg4");
+				endSamplePrecise("TestAvg4");
+				beginSamplePrecise("TestAvg5");
+				endSamplePrecise("TestAvg5");
+				beginSamplePrecise("TestAvg6");
+				endSamplePrecise("TestAvg6");
+				beginSamplePrecise("TestAvg7");
+				endSamplePrecise("TestAvg7");
+				beginSamplePrecise("TestAvg8");
+				endSamplePrecise("TestAvg8");
+				beginSamplePrecise("TestAvg9");
+				endSamplePrecise("TestAvg9");
+				beginSamplePrecise("TestAvg10");
+				endSamplePrecise("TestAvg10");
+			}
+
+			for (UINT32 i = 0; i < sampleReps * 5; i++) 
+			{
+				beginSamplePrecise("TestAvg#" + toString(i));
+				endSamplePrecise("TestAvg#" + toString(i));
+			}
+
+			endThread();
+			timerB.stop();
+
+			reset();
+
+			double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
+			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
+				mPreciseSamplingOverheadMs = avgTimesPrecise;
+
+			/************************************************************************/
+			/* 				AVERAGE CYCLES FOR PRECISE SAMPLING                     */
+			/************************************************************************/
+
+			TimerPrecise timerPreciseB;
+			timerPreciseB.start();
+			beginThread("Main");
+
+			// Two different cases that can effect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				beginSamplePrecise("TestAvg1");
+				endSamplePrecise("TestAvg1");
+				beginSamplePrecise("TestAvg2");
+				endSamplePrecise("TestAvg2");
+				beginSamplePrecise("TestAvg3");
+				endSamplePrecise("TestAvg3");
+				beginSamplePrecise("TestAvg4");
+				endSamplePrecise("TestAvg4");
+				beginSamplePrecise("TestAvg5");
+				endSamplePrecise("TestAvg5");
+				beginSamplePrecise("TestAvg6");
+				endSamplePrecise("TestAvg6");
+				beginSamplePrecise("TestAvg7");
+				endSamplePrecise("TestAvg7");
+				beginSamplePrecise("TestAvg8");
+				endSamplePrecise("TestAvg8");
+				beginSamplePrecise("TestAvg9");
+				endSamplePrecise("TestAvg9");
+				beginSamplePrecise("TestAvg10");
+				endSamplePrecise("TestAvg10");
+			}
+
+			for (UINT32 i = 0; i < sampleReps * 5; i++) 
+			{
+				beginSamplePrecise("TestAvg#" + toString(i));
+				endSamplePrecise("TestAvg#" + toString(i));
+			}
+
+			endThread();
+			timerPreciseB.stop();
+
+			reset();
+
+			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
+			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
+				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
+		}
+	}
+
+	CPUProfilerBasicSamplingEntry::Data::Data()
+		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
+		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
+		estimatedOverheadMs(0.0), pctOfParent(1.0f)
+	{ }
+
+	CPUProfilerPreciseSamplingEntry::Data::Data()
+		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
+		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
+		estimatedOverhead(0), pctOfParent(1.0f)
+	{ }
+
+	CPUProfilerReport::CPUProfilerReport()
+	{
+
+	}
+}

+ 2 - 7
CamelotCore/Source/Win32/CmPlatformImpl.cpp

@@ -341,20 +341,15 @@ namespace CamelotFramework
 		return L"";
 		return L"";
 	}
 	}
 
 
-	UINT64 Platform::queryPerformanceCounter()
+	double Platform::queryPerformanceTimerMs()
 	{
 	{
 		LARGE_INTEGER counterValue;
 		LARGE_INTEGER counterValue;
 		QueryPerformanceCounter(&counterValue);
 		QueryPerformanceCounter(&counterValue);
 		
 		
-		return (UINT64)counterValue.QuadPart;
-	}
-
-	UINT64 Platform::queryPerformanceFrequency()
-	{
 		LARGE_INTEGER counterFreq;
 		LARGE_INTEGER counterFreq;
 		QueryPerformanceFrequency(&counterFreq);
 		QueryPerformanceFrequency(&counterFreq);
 
 
-		return (UINT64)counterFreq.QuadPart;
+		return (double)counterValue.QuadPart / (counterFreq.QuadPart * 0.001);
 	}
 	}
 
 
 	void Platform::messagePump()
 	void Platform::messagePump()

+ 0 - 2
CamelotUtility/CamelotUtility.vcxproj

@@ -251,7 +251,6 @@
     <ClCompile Include="Source\CmInt2.cpp" />
     <ClCompile Include="Source\CmInt2.cpp" />
     <ClCompile Include="Source\CmManagedDataBlock.cpp" />
     <ClCompile Include="Source\CmManagedDataBlock.cpp" />
     <ClCompile Include="Source\CmMemStack.cpp" />
     <ClCompile Include="Source\CmMemStack.cpp" />
-    <ClCompile Include="Source\CmCPUProfiler.cpp" />
     <ClCompile Include="Source\CmRect.cpp" />
     <ClCompile Include="Source\CmRect.cpp" />
     <ClCompile Include="Source\CmStringTable.cpp" />
     <ClCompile Include="Source\CmStringTable.cpp" />
     <ClCompile Include="Source\CmTexAtlasGenerator.cpp" />
     <ClCompile Include="Source\CmTexAtlasGenerator.cpp" />
@@ -280,7 +279,6 @@
     <ClInclude Include="Include\CmMemoryAllocator.h" />
     <ClInclude Include="Include\CmMemoryAllocator.h" />
     <ClInclude Include="Include\CmModule.h" />
     <ClInclude Include="Include\CmModule.h" />
     <ClInclude Include="Include\CmPath.h" />
     <ClInclude Include="Include\CmPath.h" />
-    <ClInclude Include="Include\CmCPUProfiler.h" />
     <ClInclude Include="Include\CmRect.h" />
     <ClInclude Include="Include\CmRect.h" />
     <ClInclude Include="Include\CmRTTIField.h" />
     <ClInclude Include="Include\CmRTTIField.h" />
     <ClInclude Include="Include\CmRTTIManagedDataBlockField.h" />
     <ClInclude Include="Include\CmRTTIManagedDataBlockField.h" />

+ 0 - 6
CamelotUtility/CamelotUtility.vcxproj.filters

@@ -228,9 +228,6 @@
     <ClInclude Include="Include\CmStringTable.h">
     <ClInclude Include="Include\CmStringTable.h">
       <Filter>Header Files</Filter>
       <Filter>Header Files</Filter>
     </ClInclude>
     </ClInclude>
-    <ClInclude Include="Include\CmCPUProfiler.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
   </ItemGroup>
   </ItemGroup>
   <ItemGroup>
   <ItemGroup>
     <ClCompile Include="Source\CmMath.cpp">
     <ClCompile Include="Source\CmMath.cpp">
@@ -347,8 +344,5 @@
     <ClCompile Include="Source\CmStringTable.cpp">
     <ClCompile Include="Source\CmStringTable.cpp">
       <Filter>Source Files</Filter>
       <Filter>Source Files</Filter>
     </ClCompile>
     </ClCompile>
-    <ClCompile Include="Source\CmCPUProfiler.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
   </ItemGroup>
   </ItemGroup>
 </Project>
 </Project>