2
0
Эх сурвалжийг харах

A lot of Profiler WIP work

Marko Pintera 12 жил өмнө
parent
commit
64db00cec3

+ 12 - 0
CamelotCore/Include/Win32/CmPlatformImpl.h

@@ -165,6 +165,18 @@ namespace CamelotFramework
 		 */
 		static WString copyFromClipboard();
 
+		/**
+		 * @brief	Queries the internal system performance counter you can use for very precise time
+		 * 			measurements. Value is in "queryPerformanceFrequency" units.
+		 */
+		static UINT64 queryPerformanceCounter();
+
+		/**
+		 * @brief	Queries the internal system performance counter frequency. Used for interpreting
+		 * 			data returned by "queryPerformanceCounter".
+		 */
+		static UINT64 queryPerformanceFrequency();
+
 		/**
 		 * @brief	Message pump. Processes OS messages and returns when it's free.
 		 * 			

+ 16 - 0
CamelotCore/Source/Win32/CmPlatformImpl.cpp

@@ -341,6 +341,22 @@ namespace CamelotFramework
 		return L"";
 	}
 
+	UINT64 Platform::queryPerformanceCounter()
+	{
+		// Raw tick count of the OS high-resolution counter. Interpret it using queryPerformanceFrequency().
+		LARGE_INTEGER ticks;
+		QueryPerformanceCounter(&ticks);
+
+		const UINT64 result = static_cast<UINT64>(ticks.QuadPart);
+		return result;
+	}
+
+	UINT64 Platform::queryPerformanceFrequency()
+	{
+		// Frequency of the counter read by queryPerformanceCounter(), as reported by the OS.
+		LARGE_INTEGER ticksPerUnit;
+		QueryPerformanceFrequency(&ticksPerUnit);
+
+		const UINT64 result = static_cast<UINT64>(ticksPerUnit.QuadPart);
+		return result;
+	}
+
 	void Platform::messagePump()
 	{
 		MSG  msg;

+ 2 - 2
CamelotUtility/CamelotUtility.vcxproj

@@ -251,7 +251,7 @@
     <ClCompile Include="Source\CmInt2.cpp" />
     <ClCompile Include="Source\CmManagedDataBlock.cpp" />
     <ClCompile Include="Source\CmMemStack.cpp" />
-    <ClCompile Include="Source\CmProfiler.cpp" />
+    <ClCompile Include="Source\CmCPUProfiler.cpp" />
     <ClCompile Include="Source\CmRect.cpp" />
     <ClCompile Include="Source\CmStringTable.cpp" />
     <ClCompile Include="Source\CmTexAtlasGenerator.cpp" />
@@ -280,7 +280,7 @@
     <ClInclude Include="Include\CmMemoryAllocator.h" />
     <ClInclude Include="Include\CmModule.h" />
     <ClInclude Include="Include\CmPath.h" />
-    <ClInclude Include="Include\CmProfiler.h" />
+    <ClInclude Include="Include\CmCPUProfiler.h" />
     <ClInclude Include="Include\CmRect.h" />
     <ClInclude Include="Include\CmRTTIField.h" />
     <ClInclude Include="Include\CmRTTIManagedDataBlockField.h" />

+ 2 - 2
CamelotUtility/CamelotUtility.vcxproj.filters

@@ -228,7 +228,7 @@
     <ClInclude Include="Include\CmStringTable.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="Include\CmProfiler.h">
+    <ClInclude Include="Include\CmCPUProfiler.h">
       <Filter>Header Files</Filter>
     </ClInclude>
   </ItemGroup>
@@ -347,7 +347,7 @@
     <ClCompile Include="Source\CmStringTable.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="Source\CmProfiler.cpp">
+    <ClCompile Include="Source\CmCPUProfiler.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
   </ItemGroup>

+ 297 - 0
CamelotUtility/Include/CmCPUProfiler.h

@@ -0,0 +1,297 @@
+#pragma once
+
+#include "CmPrerequisitesUtil.h"
+
+namespace CamelotFramework
+{
+	class CPUProfilerReport;
+
+	// TODO: Add #defines for all profiler methods so we can easily remove them from final version
+
+	/**
+	 * @brief	Provides various performance measuring methods
+	 *
+	 * 			Two flavours of sampling are offered: basic sampling (Timer, milliseconds) and
+	 * 			precise sampling (TimerPrecise, CPU cycle counts). Samples with the same name under the
+	 * 			same parent are meant to be gathered in one block, and blocks form a tree that mirrors
+	 * 			the nesting of begin/end calls. Bookkeeping for the calling thread is reached through
+	 * 			ThreadInfo::activeThread.
+	 * 			
+	 * @note	This class is thread safe. Matching begin*\end* calls
+	 * 			must belong to the same thread though.
+	 */
+	class CM_UTILITY_EXPORT CPUProfiler
+	{
+		class Timer
+		{
+		public:
+			Timer();
+
+			void start(); // Records the current time as the beginning of an interval
+			void stop(); // Adds the time elapsed since start() to "time"
+			void reset(); // Clears the accumulated time
+
+			double time; // Accumulated time of all start/stop intervals since the last reset, in milliseconds
+		private:
+			double startTime; // Timestamp captured by the most recent start()
+
+			static inline double getCurrentTime(); // Current timestamp in milliseconds
+		};
+
+		class TimerPrecise
+		{
+		public:
+			TimerPrecise();
+
+			void start(); // Records the current cycle count as the beginning of an interval
+			void stop(); // Adds the cycles elapsed since start() to "cycles"
+			void reset(); // Clears the accumulated cycle count
+
+			UINT64 cycles; // Accumulated cycles of all start/stop intervals since the last reset
+		private:
+			UINT64 startCycles; // Cycle count captured by the most recent start()
+
+			static inline UINT64 getNumCycles(); // Current value of the CPU cycle counter
+		};
+
+		struct ProfileSample
+		{
+			ProfileSample(double _time)
+				:time(_time)
+			{ }
+
+			double time; // Duration of a single begin/end pair, in milliseconds
+		};
+
+		struct PreciseProfileSample
+		{
+			PreciseProfileSample(UINT64 _cycles)
+				:cycles(_cycles)
+			{ }
+
+			UINT64 cycles; // Duration of a single begin/end pair, in CPU cycles
+		};
+
+		struct ProfileData
+		{
+			Vector<ProfileSample>::type samples; // One entry per completed begin/end pair
+			Timer timer; // Measures the sample currently in progress
+
+			void beginSample(); // Resets and starts the timer
+			void endSample(); // Stops the timer and stores its time as a new sample
+			void resumeLastSample(); // Undoes the last endSample: restarts the timer and drops the stored sample
+		};
+
+		struct PreciseProfileData
+		{
+			// TODO - Add cache misses, branch mispredictions, retired instructions vs. optimal number of cycles
+
+			Vector<PreciseProfileSample>::type samples; // One entry per completed begin/end pair
+			TimerPrecise timer; // Measures the sample currently in progress
+
+			void beginSample(); // Resets and starts the timer
+			void endSample(); // Stops the timer and stores its cycle count as a new sample
+			void resumeLastSample(); // Undoes the last endSample: restarts the timer and drops the stored sample
+		};
+
+		struct PreciseProfiledBlock;
+		struct ProfiledBlock;
+
+		struct ProfiledBlock
+		{
+			ProfiledBlock();
+			~ProfiledBlock(); // Releases all blocks in "children"
+
+			String name; // Sample name given to beginSample (or the thread name for a root block)
+			ProfileData data;
+			Vector<ProfiledBlock*>::type children; // Blocks sampled while this block was active
+			Vector<PreciseProfiledBlock*>::type preciseChildren; // Needed only for estimating overhead
+
+			ProfiledBlock* findChild(const String& name) const; // Returns the direct child with the given name, or nullptr
+		};
+
+		struct PreciseProfiledBlock
+		{
+			PreciseProfiledBlock();
+			~PreciseProfiledBlock(); // Releases all blocks in "children"
+
+			String name; // Sample name given to beginSamplePrecise (or the thread name for a root block)
+			PreciseProfileData data;
+			Vector<PreciseProfiledBlock*>::type children; // Blocks sampled while this block was active
+			Vector<ProfiledBlock*>::type basicChildren; // Needed only for estimating overhead
+
+			PreciseProfiledBlock* findChild(const String& name) const; // Returns the direct child with the given name, or nullptr
+		};
+
+		struct ThreadInfo
+		{
+			ThreadInfo();
+
+			static CM_THREADLOCAL ThreadInfo* activeThread; // Bookkeeping of the calling thread (presumably thread-local storage via CM_THREADLOCAL - confirm), nullptr until beginThread
+			bool isActive; // Intended to tell whether sampling is currently running on this thread
+
+			ProfiledBlock* rootBlock; // Top of the basic block tree, named after the thread
+			Stack<ProfiledBlock*>::type activeBlocks; // Basic blocks currently being sampled, innermost on top
+			ProfiledBlock* activeBlock; // Innermost basic block currently being sampled
+
+			PreciseProfiledBlock* rootPreciseBlock; // Top of the precise block tree, named after the thread
+			Stack<PreciseProfiledBlock*>::type activePreciseBlocks; // Precise blocks currently being sampled, innermost on top
+			PreciseProfiledBlock* activePreciseBlock; // Innermost precise block currently being sampled
+
+			void begin(const String& _name); // Starts sampling the root blocks under the given thread name
+			void end(); // Ends the root blocks and any samples still open
+			void reset(); // Ends sampling if needed and releases both block trees
+
+			ProfiledBlock* getBlock(); // Allocates a new basic block
+			void releaseBlock(ProfiledBlock* block); // Frees a block obtained from getBlock
+
+			PreciseProfiledBlock* getPreciseBlock(); // Allocates a new precise block
+			void releasePreciseBlock(PreciseProfiledBlock* block); // Frees a block obtained from getPreciseBlock
+		};
+
+	public:
+		CPUProfiler();
+		~CPUProfiler();
+
+		/**
+		 * @brief	Registers a new thread we will be doing sampling in. This needs to be called before any beginSample*\endSample* calls
+		 * 			are made in that thread.
+		 *
+		 * @param	name	Name that will allow you to more easily identify the thread.
+		 */
+		void beginThread(const String& name);
+
+		/**
+		 * @brief	Ends sampling for the current thread. No beginSample*\endSample* calls after this point.
+		 */
+		void endThread();
+
+		/**
+		 * @brief	Begins sample measurement. Must be followed by endSample. 
+		 *
+		 * @param	name	Unique name for the sample you can later use to find the sampling data.
+		 */
+		void beginSample(const String& name);
+
+		/**
+		 * @brief	Ends sample measurement. The measured data becomes available through generateReport.
+		 *
+		 * @param	name	Unique name for the sample. 
+		 * 					
+		 * @note	Unique name is primarily needed to more easily identify mismatched
+		 * 			begin/end sample pairs. Otherwise the name in beginSample would be enough.
+		 */
+		void endSample(const String& name);
+
+		/**
+		 * @brief	Begins sample measurement. Must be followed by endSamplePrecise. 
+		 *
+		 * @param	name	Unique name for the sample you can later use to find the sampling data.
+		 * 					
+		 * @note	This method uses very precise CPU counters to determine variety of data not
+		 * 			provided by standard beginSample. However due to the way these counters work you should
+		 * 			not use this method for larger parts of code. It does not consider context switches so if the OS
+		 * 			decides to switch context between measurements you will get invalid data.
+		 */
+		void beginSamplePrecise(const String& name);
+
+		/**
+		 * @brief	Ends precise sample measurement. The measured data becomes available through generateReport.
+		 *
+		 * @param	name	Unique name for the sample. 
+		 * 					
+		 * @note	Unique name is primarily needed to more easily identify mismatched
+		 * 			begin/end sample pairs. Otherwise the name in beginSamplePrecise would be enough.
+		 */
+		void endSamplePrecise(const String& name);
+
+		/**
+		 * @brief	Called every frame. Internal method.
+		 */
+		void update();
+
+		/**
+		 * @brief	Clears all sampling data, and ends any unfinished sampling blocks.
+		 */
+		void reset();
+
+		/**
+		 * @brief	Generates a report from all previously sampled data.
+		 * 			
+		 * @note	Generating a report will stop all in-progress sampling. You should make sure
+		 * 			you call endSample* manually beforehand so this doesn't have to happen.
+		 */
+		CPUProfilerReport generateReport();
+
+	private:
+		double mBasicTimerOverhead; // Estimated cost of one Timer start/stop pair, in milliseconds
+		UINT64 mPreciseTimerOverhead; // Estimated cost of one TimerPrecise start/stop pair, in cycles
+
+		double mBasicSamplingOverhead; // Estimated cost of one beginSample/endSample pair, in milliseconds
+		UINT64 mPreciseSamplingOverhead; // Estimated cost of one beginSamplePrecise/endSamplePrecise pair, in cycles
+
+		Vector<ThreadInfo*>::type mActiveThreads; // Every ThreadInfo created by beginThread; guarded by mThreadSync
+		CM_MUTEX(mThreadSync);
+
+		void estimateTimerOverhead(); // Measures the four overhead values above
+	};
+
+	struct CM_UTILITY_EXPORT CPUProfilerBasicSamplingEntry // One node of the basic (millisecond) sampling report tree
+	{
+		struct CM_UTILITY_EXPORT Data
+		{
+			Data();
+
+			String name; // Name of the sampled block
+			UINT32 numCalls; // Number of samples recorded for the block
+
+			double avgTimeMs; // totalTimeMs divided by numCalls
+			double maxTimeMs; // Longest single sample
+			double totalTimeMs; // Sum of all samples
+
+			double avgSelfTimeMs; // totalSelfTimeMs divided by numCalls
+			double totalSelfTimeMs; // totalTimeMs minus the total time of all direct children
+
+			double estimatedSelfOverheadMs; // Estimated timer overhead attributed to this block itself
+			double estimatedOverheadMs; // Estimated sampling overhead accumulated from child blocks
+
+			float pctOfParent; // Fraction (not 0-100) of the parent's total time spent in this block; 1.0 by default
+		} data;
+
+		Vector<CPUProfilerBasicSamplingEntry>::type childEntries; // Entries of blocks sampled inside this block
+	};
+
+	struct CM_UTILITY_EXPORT CPUProfilerPreciseSamplingEntry // One node of the precise (CPU cycle) sampling report tree
+	{
+		struct CM_UTILITY_EXPORT Data
+		{
+			Data();
+
+			String name; // Name of the sampled block
+			UINT32 numCalls; // Number of samples recorded for the block
+
+			UINT64 avgCycles; // Average cycles per sample
+			UINT64 maxCycles; // Longest single sample
+			UINT64 totalCycles; // Sum of all samples
+
+			UINT64 avgSelfCycles; // totalSelfCycles divided by numCalls
+			UINT64 totalSelfCycles; // totalCycles minus the total cycles of all direct children
+
+			UINT64 estimatedSelfOverhead; // Estimated timer overhead attributed to this block itself, in cycles
+			UINT64 estimatedOverhead; // Estimated sampling overhead accumulated from child blocks, in cycles
+
+			float pctOfParent; // Fraction (not 0-100) of the parent's total cycles spent in this block; 1.0 by default
+		} data;
+
+		Vector<CPUProfilerPreciseSamplingEntry>::type childEntries; // Entries of blocks sampled inside this block
+	};
+
+	/**
+	 * @brief	Holds the sampling data produced by CPUProfiler::generateReport. Contains one entry tree
+	 * 			for basic sampling and one for precise sampling. Can only be created by CPUProfiler.
+	 */
+	class CM_UTILITY_EXPORT CPUProfilerReport
+	{
+	public:
+		/**
+		 * @brief	Returns the root entry of the basic (millisecond) sampling data.
+		 */
+		const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }
+
+		/**
+		 * @brief	Returns the root entry of the precise (CPU cycle) sampling data.
+		 */
+		const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }
+
+	private:
+		friend class CPUProfiler;
+
+		// Defined inline because the constructor has no out-of-line definition, which would
+		// prevent CPUProfiler::generateReport from linking. Both root entries default-construct.
+		CPUProfilerReport() { }
+
+		CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;
+		CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;
+	};
+}

+ 0 - 78
CamelotUtility/Include/CmProfiler.h

@@ -1,78 +0,0 @@
-#pragma once
-
-#include "CmPrerequisitesUtil.h"
-#include "CmModule.h"
-
-namespace CamelotFramework
-{
-	/**
-	 * @brief	Provides various performance measuring methods
-	 */
-	class CM_UTILITY_EXPORT Profiler : public Module<Profiler>
-	{
-	public:
-		class CM_UTILITY_EXPORT Data
-		{
-			String name;
-			float timeAvgMs;
-			float timeMaxMs;
-			float timeTotalMs;
-			UINT32 hitCount;
-		};
-
-		class CM_UTILITY_EXPORT PreciseData
-		{
-			String name;
-			UINT64 cyclesAvg;
-			UINT64 cyclesMax;
-			UINT64 cyclesTotal;
-			UINT32 hitCount;
-
-			// TODO - Add cache misses, branch mispredictions, retired instructions vs. optimal number of cycles
-		};
-
-		/**
-		 * @brief	Begins sample measurement. Must be followed by endSample. 
-		 *
-		 * @param	name	Unique name for the sample you can later use to find the sampling data.
-		 */
-		void beginSample(const String& name);
-
-		/**
-		 * @brief	Ends sample measurement and returns measured data.
-		 *
-		 * @param	name	Unique name for the sample. 
-		 * 					
-		 * @note	Unique name is primarily needed to more easily identify mismatched
-		 * 			begin/end sample pairs. Otherwise the name in beginSample would be enough.
-		 */
-		Data endSample(const String& name);
-
-		/**
-		 * @brief	Begins sample measurement. Must be followed by endSample. 
-		 *
-		 * @param	name	Unique name for the sample you can later use to find the sampling data.
-		 * 					
-		 * @note	This method uses very precise CPU counters to determine variety of data not
-		 * 			provided by standard beginSample. However due to the way these counters work you should
-		 * 			not use this method for larger parts of code. It does not consider context switches so if the OS
-		 * 			decides to switch context between measurements you will get invalid data.
-		 */
-		void beginSamplePrecise(const String& name);
-
-		/**
-		 * @brief	Ends precise sample measurement and returns measured data.
-		 *
-		 * @param	name	Unique name for the sample. 
-		 * 					
-		 * @note	Unique name is primarily needed to more easily identify mismatched
-		 * 			begin/end sample pairs. Otherwise the name in beginSamplePrecise would be enough.
-		 */
-		PreciseData endSamplePrecise(const String& name);
-
-		/**
-		 * @brief	Called every frame. Internal method.
-		 */
-		void update();
-	};
-}

+ 725 - 0
CamelotUtility/Source/CmCPUProfiler.cpp

@@ -0,0 +1,725 @@
+#include "CmCPUProfiler.h"
+#include "CmDebug.h"
+
+#include <chrono>
+
+#if CM_PLATFORM == CM_PLATFORM_WIN32
+#include "windows.h"
+#endif
+
+namespace CamelotFramework
+{
+	/**
+	 * @brief	Constructs a timer with no accumulated time. The start timestamp is zeroed as well
+	 * 			so that it never holds an indeterminate value.
+	 */
+	CPUProfiler::Timer::Timer()
+		:time(0.0), startTime(0.0)
+	{ }
+
+	void CPUProfiler::Timer::start()
+	{
+		// Remember when this interval began; stop() measures against it
+		const double now = getCurrentTime();
+		startTime = now;
+	}
+
+	void CPUProfiler::Timer::stop()
+	{
+		// Intervals accumulate, which allows a sample to be resumed after it was stopped
+		const double elapsed = getCurrentTime() - startTime;
+		time += elapsed;
+	}
+
+	void CPUProfiler::Timer::reset()
+	{
+		// Discards the accumulated time; the start timestamp is left as it was
+		time = 0.0;
+	}
+
+	/**
+	 * @brief	Returns the current timestamp in milliseconds. Only differences between two returned
+	 * 			values are meaningful.
+	 */
+	inline double CPUProfiler::Timer::getCurrentTime() 
+	{
+		// TODO: I should be calling Platform:: performance methods instead of doing it here.
+		// The only problem is that Platform belong to Core and not Utility
+#if CM_PLATFORM == CM_PLATFORM_WIN32
+		LARGE_INTEGER counterValue;
+		QueryPerformanceCounter(&counterValue);
+
+		LARGE_INTEGER counterFrequency;
+		QueryPerformanceFrequency(&counterFrequency);
+
+		// Frequency is in ticks per second, scaling it by 0.001 yields ticks per millisecond
+		return (double)counterValue.QuadPart / (counterFrequency.QuadPart * 0.001);
+#else
+		// Portable fallback: monotonic clock, converted to fractional milliseconds
+		typedef std::chrono::duration<double, std::milli> Milliseconds;
+		return Milliseconds(std::chrono::steady_clock::now().time_since_epoch()).count();
+#endif
+	}
+
+	/**
+	 * @brief	Constructs a timer with no accumulated cycles. The start cycle count is zeroed as well
+	 * 			so that it never holds an indeterminate value.
+	 */
+	CPUProfiler::TimerPrecise::TimerPrecise()
+		:cycles(0), startCycles(0)
+	{ }
+
+	void CPUProfiler::TimerPrecise::start()
+	{
+		// Remember the cycle count at which this interval began; stop() measures against it
+		const UINT64 now = getNumCycles();
+		startCycles = now;
+	}
+
+	void CPUProfiler::TimerPrecise::stop()
+	{
+		// Intervals accumulate, which allows a sample to be resumed after it was stopped
+		const UINT64 elapsed = getNumCycles() - startCycles;
+		cycles += elapsed;
+	}
+
+	void CPUProfiler::TimerPrecise::reset()
+	{
+		// Discards the accumulated cycles; the start cycle count is left as it was
+		cycles = 0U;
+	}
+
+	inline UINT64 CPUProfiler::TimerPrecise::getNumCycles() // Reads the CPU time-stamp counter (rdtsc) right after issuing cpuid
+	{
+#if CM_COMPILER == CM_COMPILER_GNUC
+		asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx" ); // Output is discarded; presumably issued only to serialize execution before rdtsc - confirm
+		UINT32 __a,__d;
+		asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); // Low 32 bits arrive in __a (eax), high 32 bits in __d (edx)
+		return ( UINT64(__a) | UINT64(__d) << 32 );
+#else
+		int a[4];
+		int b = 0;
+		__cpuid(a, b); // Same cpuid + rdtsc sequence, expressed with compiler intrinsics
+		return __rdtsc();
+#endif		
+	}
+
+	void CPUProfiler::ProfileData::beginSample() // Starts a fresh measurement for a new sample
+	{
+		timer.reset(); // Drop whatever the previous sample accumulated
+		timer.start();
+	}
+
+	void CPUProfiler::ProfileData::endSample()
+	{
+		// Stop first so the bookkeeping below is not part of the measured time
+		timer.stop();
+
+		const ProfileSample finished(timer.time);
+		samples.push_back(finished);
+	}
+
+	void CPUProfiler::ProfileData::resumeLastSample()
+	{
+		// Undo the latest endSample: keep timing on top of the accumulated time and 
+		// discard the sample that was stored prematurely
+		timer.start();
+		samples.pop_back();
+	}
+
+	void CPUProfiler::PreciseProfileData::beginSample() // Starts a fresh cycle measurement for a new sample
+	{
+		timer.reset(); // Drop whatever the previous sample accumulated
+		timer.start();
+	}
+
+	void CPUProfiler::PreciseProfileData::endSample()
+	{
+		// Stop first so the bookkeeping below is not part of the measured cycles
+		timer.stop();
+
+		const PreciseProfileSample finished(timer.cycles);
+		samples.push_back(finished);
+	}
+
+	void CPUProfiler::PreciseProfileData::resumeLastSample()
+	{
+		// Undo the latest endSample: keep counting on top of the accumulated cycles and 
+		// discard the sample that was stored prematurely
+		timer.start();
+		samples.pop_back();
+	}
+
+	CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;
+
+	/**
+	 * @brief	Constructs an inactive thread entry that owns no blocks yet.
+	 */
+	CPUProfiler::ThreadInfo::ThreadInfo()
+		:isActive(false), 
+		rootBlock(nullptr), activeBlock(nullptr), 
+		rootPreciseBlock(nullptr), activePreciseBlock(nullptr)
+	{ }
+
+	/**
+	 * @brief	Starts sampling on this thread. Creates the root blocks if needed, makes them the 
+	 * 			active blocks and begins a sample on each.
+	 *
+	 * @param	_name	Name assigned to both root blocks.
+	 */
+	void CPUProfiler::ThreadInfo::begin(const String& _name)
+	{
+		if(isActive)
+		{
+			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
+			return;
+		}
+
+		// Must be raised here: without it every later begin/beginSample call would consider the 
+		// thread idle and restart the root blocks, and end() would always report an inactive thread
+		isActive = true;
+
+		if(rootBlock == nullptr)
+			rootBlock = getBlock();
+
+		if(rootPreciseBlock == nullptr)
+			rootPreciseBlock = getPreciseBlock();
+
+		activeBlocks.push(rootBlock);
+		activeBlock = rootBlock;
+
+		activePreciseBlocks.push(rootPreciseBlock);
+		activePreciseBlock = rootPreciseBlock;
+
+		rootBlock->name = _name; 
+		rootBlock->data.beginSample();
+
+		rootPreciseBlock->name = _name;
+		rootPreciseBlock->data.beginSample();
+	}
+
+	/**
+	 * @brief	Ends sampling on this thread. Closes the innermost active blocks and then every block 
+	 * 			that was left open, warning about the latter since their data will be invalid.
+	 */
+	void CPUProfiler::ThreadInfo::end()
+	{
+		if(!isActive)
+			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
+
+		// No block is open (begin was never called, or end already ran), so there is nothing to 
+		// close. Continuing would dereference null block pointers and pop empty stacks.
+		if(activeBlock == nullptr || activePreciseBlock == nullptr)
+			return;
+
+		activePreciseBlock->data.endSample();
+		activePreciseBlocks.pop();
+
+		activeBlock->data.endSample();
+		activeBlocks.pop();
+
+		if(activeBlocks.size() > 0)
+		{
+			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
+
+			while(activeBlocks.size() > 0)
+			{
+				ProfiledBlock* block = activeBlocks.top();
+				block->data.endSample();
+
+				activeBlocks.pop();
+			}
+		}
+
+		if(activePreciseBlocks.size() > 0)
+		{
+			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
+
+			while(activePreciseBlocks.size() > 0)
+			{
+				PreciseProfiledBlock* block = activePreciseBlocks.top();
+				block->data.endSample();
+
+				activePreciseBlocks.pop();
+			}
+		}
+
+		isActive = false;
+		activeBlocks = Stack<ProfiledBlock*>::type();
+		activeBlock = nullptr;
+		activePreciseBlocks = Stack<PreciseProfiledBlock*>::type();
+		activePreciseBlock = nullptr;
+	}
+
+	/**
+	 * @brief	Ends sampling if it is in progress and releases both block trees.
+	 */
+	void CPUProfiler::ThreadInfo::reset()
+	{
+		// Close any sampling still in progress before the blocks are destroyed
+		if(isActive)
+			end();
+
+		if(rootBlock != nullptr)
+		{
+			releaseBlock(rootBlock);
+			rootBlock = nullptr;
+		}
+
+		if(rootPreciseBlock != nullptr)
+		{
+			releasePreciseBlock(rootPreciseBlock);
+			rootPreciseBlock = nullptr;
+		}
+	}
+
+	/**
+	 * @brief	Allocates a new, empty basic block. Free it with releaseBlock.
+	 */
+	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
+	{
+		// TODO - Pool this, if possible using the memory allocator stuff
+		// TODO - Also consider moving all samples in ThreadInfo, and also pool them (otherwise I can't pool ProfiledBlock since it will be variable size)
+		ProfiledBlock* newBlock = cm_new<ProfiledBlock>();
+		return newBlock;
+	}
+
+	void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block) // Frees a block obtained from getBlock
+	{
+		cm_delete(block); // The block's destructor releases its children in turn
+	}
+
+	/**
+	 * @brief	Allocates a new, empty precise block. Free it with releasePreciseBlock.
+	 */
+	CPUProfiler::PreciseProfiledBlock* CPUProfiler::ThreadInfo::getPreciseBlock()
+	{
+		// TODO - Pool this, if possible using the memory allocator stuff
+		// TODO - Also consider moving all samples in ThreadInfo, and also pool them (otherwise I can't pool ProfiledBlock since it will be variable size)
+		PreciseProfiledBlock* newBlock = cm_new<PreciseProfiledBlock>();
+		return newBlock;
+	}
+
+	void CPUProfiler::ThreadInfo::releasePreciseBlock(CPUProfiler::PreciseProfiledBlock* block) // Frees a block obtained from getPreciseBlock
+	{
+		cm_delete(block); // The block's destructor releases its children in turn
+	}
+
+	CPUProfiler::ProfiledBlock::ProfiledBlock() // Nothing to set up explicitly; all members default-construct
+	{
+
+	}
+
+	/**
+	 * @brief	Releases all child blocks. Blocks in "preciseChildren" are not touched here.
+	 */
+	CPUProfiler::ProfiledBlock::~ProfiledBlock()
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+
+		for(auto& child : children)
+		{
+			// The destroying thread may never have sampled anything (e.g. the profiler being shut 
+			// down from another thread), in which case there is no ThreadInfo to release through
+			if(thread != nullptr)
+				thread->releaseBlock(child);
+			else
+				cm_delete(child);
+		}
+
+		children.clear();
+	}
+
+	/**
+	 * @brief	Looks up a direct child block by name.
+	 *
+	 * @return	First child with a matching name, or nullptr if there is none.
+	 */
+	CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
+	{
+		for(auto iter = children.begin(); iter != children.end(); ++iter)
+		{
+			ProfiledBlock* candidate = *iter;
+
+			if(candidate->name == name)
+				return candidate;
+		}
+
+		return nullptr;
+	}
+
+	CPUProfiler::PreciseProfiledBlock::PreciseProfiledBlock() // Nothing to set up explicitly; all members default-construct
+	{
+
+	}
+
+	/**
+	 * @brief	Releases all child blocks. Blocks in "basicChildren" are not touched here.
+	 */
+	CPUProfiler::PreciseProfiledBlock::~PreciseProfiledBlock()
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+
+		for(auto& child : children)
+		{
+			// The destroying thread may never have sampled anything (e.g. the profiler being shut 
+			// down from another thread), in which case there is no ThreadInfo to release through
+			if(thread != nullptr)
+				thread->releasePreciseBlock(child);
+			else
+				cm_delete(child);
+		}
+
+		children.clear();
+	}
+
+	/**
+	 * @brief	Looks up a direct child block by name.
+	 *
+	 * @return	First child with a matching name, or nullptr if there is none.
+	 */
+	CPUProfiler::PreciseProfiledBlock* CPUProfiler::PreciseProfiledBlock::findChild(const String& name) const
+	{
+		for(auto iter = children.begin(); iter != children.end(); ++iter)
+		{
+			PreciseProfiledBlock* candidate = *iter;
+
+			if(candidate->name == name)
+				return candidate;
+		}
+
+		return nullptr;
+	}
+
+	/**
+	 * @brief	Constructs the profiler and immediately measures how expensive its own timers and 
+	 * 			sampling calls are.
+	 */
+	CPUProfiler::CPUProfiler()
+		:mBasicTimerOverhead(0.0), 
+		mPreciseTimerOverhead(0), 
+		mBasicSamplingOverhead(0.0), 
+		mPreciseSamplingOverhead(0)
+	{
+		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
+		// and keep separate values per thread.
+		estimateTimerOverhead();
+	}
+
+	/**
+	 * @brief	Releases the sampling data and bookkeeping of every thread registered with beginThread.
+	 */
+	CPUProfiler::~CPUProfiler()
+	{
+		// Ends sampling and frees the block trees of the calling thread
+		reset();
+
+		CM_LOCK_MUTEX(mThreadSync);
+
+		bool ownsCallingThreadInfo = false;
+		for(auto& threadInfo : mActiveThreads)
+		{
+			// Both trees must be freed, and only if they exist (reset() above already cleared 
+			// the calling thread's roots)
+			if(threadInfo->rootBlock != nullptr)
+				threadInfo->releaseBlock(threadInfo->rootBlock);
+
+			if(threadInfo->rootPreciseBlock != nullptr)
+				threadInfo->releasePreciseBlock(threadInfo->rootPreciseBlock);
+
+			if(threadInfo == ThreadInfo::activeThread)
+				ownsCallingThreadInfo = true;
+
+			cm_delete(threadInfo);
+		}
+
+		mActiveThreads.clear();
+
+		// Cleared only after the loop so that blocks released above can still reach the calling 
+		// thread's ThreadInfo pointer, and so it doesn't dangle afterwards
+		if(ownsCallingThreadInfo)
+			ThreadInfo::activeThread = nullptr;
+	}
+
+	void CPUProfiler::beginThread(const String& name)
+	{
+		// Lazily create and register the bookkeeping for the calling thread on first use
+		if(ThreadInfo::activeThread == nullptr)
+		{
+			ThreadInfo* newThread = cm_new<ThreadInfo>();
+			ThreadInfo::activeThread = newThread;
+
+			// The list of threads is shared between threads, so registration is guarded
+			CM_LOCK_MUTEX(mThreadSync);
+			mActiveThreads.push_back(newThread);
+		}
+
+		ThreadInfo::activeThread->begin(name);
+	}
+
+	void CPUProfiler::endThread()
+	{
+		// I don't do a nullcheck here on purpose, so endSample can be called ASAP.
+		// Consequently beginThread must have been called on this thread beforehand.
+		ThreadInfo::activeThread->end();
+	}
+
+	void CPUProfiler::beginSample(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr || !thread->isActive)
+		{
+			beginThread("Unknown");
+
+			// beginThread may have only just created the ThreadInfo, so the cached pointer 
+			// (possibly null) has to be refreshed
+			thread = ThreadInfo::activeThread;
+		}
+
+		// Reuse the block if this sample was already taken under the same parent
+		ProfiledBlock* parent = thread->activeBlock;
+		ProfiledBlock* block = parent->findChild(name);
+
+		if(block == nullptr)
+		{
+			block = thread->getBlock();
+			block->name = name;
+
+			parent->children.push_back(block);
+			thread->activePreciseBlock->basicChildren.push_back(block);
+		}
+
+		// The block becomes the active one regardless of whether it was created or reused, 
+		// otherwise the matching endSample would close the parent instead
+		thread->activeBlocks.push(block);
+		thread->activeBlock = block;
+
+		block->data.beginSample();
+	}
+
+	void CPUProfiler::endSample(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+
+		// The bottom-most active block belongs to the thread itself and is closed by endThread. 
+		// With one or zero active blocks there is no sample that endSample could close.
+		if(thread == nullptr || thread->activeBlocks.size() <= 1)
+		{
+			LOGWRN("Profiler::endSample called without a matching Profiler::beginSample. Call ignored.");
+			return;
+		}
+
+		ProfiledBlock* block = thread->activeBlock;
+		block->data.endSample();
+
+		if(block->name != name)
+		{
+			LOGWRN("Mismatched Profiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
+
+			// Keep the block open so that the correct endSample can still close it
+			block->data.resumeLastSample();
+
+			return;
+		}
+
+		thread->activeBlocks.pop();
+		thread->activeBlock = thread->activeBlocks.top();
+	}
+
+	void CPUProfiler::beginSamplePrecise(const String& name)
+	{
+		// Note: There is a (small) possibility a context switch will happen during this measurement in which case result will be skewed. 
+		// Increasing thread priority might help. This is generally only a problem with code that executes a long time (10-15+ ms - depending on OS quant length)
+		
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr || !thread->isActive)
+		{
+			beginThread("Unknown");
+
+			// beginThread may have only just created the ThreadInfo, so the cached pointer 
+			// (possibly null) has to be refreshed
+			thread = ThreadInfo::activeThread;
+		}
+
+		// Reuse the block if this sample was already taken under the same parent
+		PreciseProfiledBlock* parent = thread->activePreciseBlock;
+		PreciseProfiledBlock* block = parent->findChild(name);
+
+		if(block == nullptr)
+		{
+			block = thread->getPreciseBlock();
+			block->name = name;
+
+			parent->children.push_back(block);
+			thread->activeBlock->preciseChildren.push_back(block);
+		}
+
+		// The block becomes the active one regardless of whether it was created or reused, 
+		// otherwise the matching endSamplePrecise would close the parent instead
+		thread->activePreciseBlocks.push(block);
+		thread->activePreciseBlock = block;
+
+		block->data.beginSample();
+	}
+
+	void CPUProfiler::endSamplePrecise(const String& name)
+	{
+		ThreadInfo* thread = ThreadInfo::activeThread;
+
+		// The bottom-most active block belongs to the thread itself and is closed by endThread. 
+		// With one or zero active blocks there is no sample that endSamplePrecise could close.
+		if(thread == nullptr || thread->activePreciseBlocks.size() <= 1)
+		{
+			LOGWRN("Profiler::endSamplePrecise called without a matching Profiler::beginSamplePrecise. Call ignored.");
+			return;
+		}
+
+		PreciseProfiledBlock* block = thread->activePreciseBlock;
+		block->data.endSample();
+
+		if(block->name != name)
+		{
+			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
+
+			// Keep the block open so that the correct endSamplePrecise can still close it
+			block->data.resumeLastSample();
+
+			return;
+		}
+
+		thread->activePreciseBlocks.pop();
+		thread->activePreciseBlock = thread->activePreciseBlocks.top();
+	}
+
+	void CPUProfiler::update() // Per-frame hook; currently does nothing
+	{
+		// TODO: Keep track of FPS
+	}
+
+	void CPUProfiler::reset()
+	{
+		// Only the calling thread's data is cleared; a thread that never sampled has nothing to reset
+		if(ThreadInfo::activeThread == nullptr)
+			return;
+
+		ThreadInfo::activeThread->reset();
+	}
+
+	/**
+	 * Builds the report for the calling thread only. Sampling on that thread is ended first if it 
+	 * is still in progress. The report contains one entry per profiled block, arranged in the same 
+	 * tree structure as the blocks. If the calling thread never sampled anything an empty report 
+	 * is returned.
+	 */
+	CPUProfilerReport CPUProfiler::generateReport()
+	{
+		CPUProfilerReport report;
+
+		ThreadInfo* thread = ThreadInfo::activeThread;
+		if(thread == nullptr)
+			return report;
+
+		if(thread->isActive)
+			thread->end();
+
+		if(thread->rootBlock != nullptr)
+		{
+			// Flatten the block tree together with the matching entry tree so that every parent is 
+			// stored before its children. Walking the flat arrays backwards then processes children 
+			// before their parents.
+			Stack<ProfiledBlock*>::type todoBlocks;
+			Stack<CPUProfilerBasicSamplingEntry*>::type todoEntries;
+			Vector<ProfiledBlock*>::type flatHierarchy;
+			Vector<CPUProfilerBasicSamplingEntry*>::type flatResultHierarchy;
+
+			todoBlocks.push(thread->rootBlock);
+			todoEntries.push(&report.mBasicSamplingRootEntry);
+
+			while(!todoBlocks.empty())
+			{
+				ProfiledBlock* curBlock = todoBlocks.top();
+				CPUProfilerBasicSamplingEntry* curEntry = todoEntries.top();
+				todoBlocks.pop();
+				todoEntries.pop();
+
+				flatHierarchy.push_back(curBlock);
+				flatResultHierarchy.push_back(curEntry);
+
+				// Size the child array once, before any pointer into it is taken. Growing it 
+				// element by element would invalidate the pointers stored for earlier children.
+				const UINT32 numChildren = (UINT32)curBlock->children.size();
+				curEntry->childEntries.resize(numChildren);
+
+				for(UINT32 i = 0; i < numChildren; i++)
+				{
+					todoBlocks.push(curBlock->children[i]);
+					todoEntries.push(&curEntry->childEntries[i]);
+				}
+			}
+
+			for(UINT32 i = (UINT32)flatHierarchy.size(); i > 0; i--)
+			{
+				ProfiledBlock* curBlock = flatHierarchy[i - 1];
+				CPUProfilerBasicSamplingEntry* entry = flatResultHierarchy[i - 1];
+				CPUProfilerBasicSamplingEntry::Data& data = entry->data;
+
+				data.name = curBlock->name;
+
+				data.totalTimeMs = 0.0;
+				data.maxTimeMs = 0.0;
+				for(auto& sample : curBlock->data.samples)
+				{
+					data.totalTimeMs += sample.time;
+
+					if(sample.time > data.maxTimeMs)
+						data.maxTimeMs = sample.time;
+				}
+
+				// Blocks without samples report zero averages instead of dividing by zero
+				data.numCalls = (UINT32)curBlock->data.samples.size();
+				data.avgTimeMs = (data.numCalls > 0) ? data.totalTimeMs / data.numCalls : 0.0;
+
+				// Child entries were already filled in, since children are processed before parents
+				double totalChildTime = 0.0;
+				for(auto& childEntry : entry->childEntries)
+				{
+					totalChildTime += childEntry.data.totalTimeMs;
+
+					childEntry.data.pctOfParent = (data.totalTimeMs > 0.0) 
+						? (float)(childEntry.data.totalTimeMs / data.totalTimeMs) : 0.0f;
+
+					data.estimatedOverheadMs += childEntry.data.estimatedOverheadMs + mBasicSamplingOverhead;
+				}
+
+				data.totalSelfTimeMs = data.totalTimeMs - totalChildTime;
+				data.avgSelfTimeMs = (data.numCalls > 0) ? data.totalSelfTimeMs / data.numCalls : 0.0;
+
+				data.estimatedSelfOverheadMs = mBasicTimerOverhead;
+			}
+		}
+
+		if(thread->rootPreciseBlock != nullptr)
+		{
+			// Same two passes as above, for the precise (cycle based) block tree
+			Stack<PreciseProfiledBlock*>::type todoBlocks;
+			Stack<CPUProfilerPreciseSamplingEntry*>::type todoEntries;
+			Vector<PreciseProfiledBlock*>::type flatHierarchy;
+			Vector<CPUProfilerPreciseSamplingEntry*>::type flatResultHierarchy;
+
+			todoBlocks.push(thread->rootPreciseBlock);
+			todoEntries.push(&report.mPreciseSamplingRootEntry);
+
+			while(!todoBlocks.empty())
+			{
+				PreciseProfiledBlock* curBlock = todoBlocks.top();
+				CPUProfilerPreciseSamplingEntry* curEntry = todoEntries.top();
+				todoBlocks.pop();
+				todoEntries.pop();
+
+				flatHierarchy.push_back(curBlock);
+				flatResultHierarchy.push_back(curEntry);
+
+				// Size the child array once, before any pointer into it is taken
+				const UINT32 numChildren = (UINT32)curBlock->children.size();
+				curEntry->childEntries.resize(numChildren);
+
+				for(UINT32 i = 0; i < numChildren; i++)
+				{
+					todoBlocks.push(curBlock->children[i]);
+					todoEntries.push(&curEntry->childEntries[i]);
+				}
+			}
+
+			for(UINT32 i = (UINT32)flatHierarchy.size(); i > 0; i--)
+			{
+				PreciseProfiledBlock* curBlock = flatHierarchy[i - 1];
+				CPUProfilerPreciseSamplingEntry* entry = flatResultHierarchy[i - 1];
+				CPUProfilerPreciseSamplingEntry::Data& data = entry->data;
+
+				data.name = curBlock->name;
+
+				data.totalCycles = 0;
+				data.maxCycles = 0;
+				for(auto& sample : curBlock->data.samples)
+				{
+					data.totalCycles += sample.cycles;
+
+					if(sample.cycles > data.maxCycles)
+						data.maxCycles = sample.cycles;
+				}
+
+				// The average is derived from the total; blocks without samples report zero
+				data.numCalls = (UINT32)curBlock->data.samples.size();
+				data.avgCycles = (data.numCalls > 0) ? data.totalCycles / data.numCalls : 0;
+
+				UINT64 totalChildCycles = 0;
+				for(auto& childEntry : entry->childEntries)
+				{
+					totalChildCycles += childEntry.data.totalCycles;
+
+					childEntry.data.pctOfParent = (data.totalCycles > 0) 
+						? childEntry.data.totalCycles / (float)data.totalCycles : 0.0f;
+
+					data.estimatedOverhead += childEntry.data.estimatedOverhead + mPreciseSamplingOverhead;
+				}
+
+				// Values are unsigned, so clamp rather than wrap around if children exceed the parent
+				data.totalSelfCycles = (data.totalCycles > totalChildCycles) ? data.totalCycles - totalChildCycles : 0;
+				data.avgSelfCycles = (data.numCalls > 0) ? data.totalSelfCycles / data.numCalls : 0;
+
+				data.estimatedSelfOverhead = mPreciseTimerOverhead;
+			}
+		}
+
+		return report;
+	}
+
+	/**
+	 * Estimates the cost of the profiler's own operations by timing them repeatedly and keeping the 
+	 * lowest average of all tries. Fills in the timer overheads (a start/stop pair) and the sampling 
+	 * overheads (a begin/end sample pair), for both basic and precise sampling.
+	 */
+	void CPUProfiler::estimateTimerOverhead()
+	{
+		// Get an idea of how long timer calls and RDTSC takes
+		const UINT32 reps = 1000, sampleReps = 100;
+
+		// Samples that are taken over and over again under the same name
+		static const char* const repeatedSampleNames[] = 
+		{ 
+			"TestAvg1", "TestAvg2", "TestAvg3", "TestAvg4", "TestAvg5", 
+			"TestAvg6", "TestAvg7", "TestAvg8", "TestAvg9", "TestAvg10" 
+		};
+		const UINT32 numRepeatedNames = sizeof(repeatedSampleNames) / sizeof(repeatedSampleNames[0]);
+		const UINT32 numUniqueSamples = sampleReps * 5;
+		const UINT32 numSamplesPerTry = sampleReps * numRepeatedNames + numUniqueSamples;
+
+		// All four values track a minimum, so each of them has to start out high. Starting the 
+		// sampling overheads at zero would make the comparisons below never succeed.
+		mBasicTimerOverhead = 1000000.0;
+		mPreciseTimerOverhead = 1000000;
+		mBasicSamplingOverhead = 1000000.0;
+		mPreciseSamplingOverhead = 1000000;
+
+		for (UINT32 tries = 0; tries < 20; tries++) 
+		{
+			Timer timer;
+			for (UINT32 i = 0; i < reps; i++) 
+			{
+				timer.start();
+				timer.stop();
+			}
+
+			double avgTime = double(timer.time)/double(reps);
+			if (avgTime < mBasicTimerOverhead)
+				mBasicTimerOverhead = avgTime;
+
+			TimerPrecise timerPrecise;
+			for (UINT32 i = 0; i < reps; i++) 
+			{
+				timerPrecise.start();
+				timerPrecise.stop();
+			}
+
+			UINT64 avgCycles = timerPrecise.cycles/reps;
+			if (avgCycles < mPreciseTimerOverhead)
+				mPreciseTimerOverhead = avgCycles;
+		}
+
+		for (UINT32 tries = 0; tries < 20; tries++) 
+		{
+			Timer timer;
+			timer.start();
+			beginThread("Main");
+
+			// Two different cases that can affect performance, one where
+			// sample already exists and other where new one needs to be created
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				for (UINT32 j = 0; j < numRepeatedNames; j++)
+				{
+					beginSample(repeatedSampleNames[j]);
+					endSample(repeatedSampleNames[j]);
+				}
+			}
+
+			for (UINT32 i = 0; i < numUniqueSamples; i++) 
+			{
+				beginSample("Test#" + toString(i));
+				endSample("Test#" + toString(i));
+			}
+
+			endThread();
+			timer.stop();
+
+			reset();
+
+			double avgTime = double(timer.time)/double(numSamplesPerTry);
+			if (avgTime < mBasicSamplingOverhead)
+				mBasicSamplingOverhead = avgTime;
+
+			TimerPrecise timerPrecise;
+			timerPrecise.start();
+			beginThread("Main");
+
+			// Same two cases as above, measured for precise sampling
+			for (UINT32 i = 0; i < sampleReps; i++) 
+			{
+				for (UINT32 j = 0; j < numRepeatedNames; j++)
+				{
+					beginSamplePrecise(repeatedSampleNames[j]);
+					endSamplePrecise(repeatedSampleNames[j]);
+				}
+			}
+
+			for (UINT32 i = 0; i < numUniqueSamples; i++) 
+			{
+				beginSamplePrecise("Test#" + toString(i));
+				endSamplePrecise("Test#" + toString(i));
+			}
+
+			endThread();
+			timerPrecise.stop();
+
+			reset();
+
+			UINT64 avgCycles = timerPrecise.cycles/numSamplesPerTry;
+			if (avgCycles < mPreciseSamplingOverhead)
+				mPreciseSamplingOverhead = avgCycles;
+		}
+	}
+
+	/**
+	 * @brief	Zeroes all statistics. The entry initially accounts for all of its parent (pctOfParent of 1).
+	 */
+	CPUProfilerBasicSamplingEntry::Data::Data()
+		:numCalls(0), 
+		avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
+		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), 
+		estimatedSelfOverheadMs(0.0), estimatedOverheadMs(0.0), 
+		pctOfParent(1.0f)
+	{ }
+
+	/**
+	 * @brief	Zeroes all statistics. The entry initially accounts for all of its parent (pctOfParent of 1).
+	 */
+	CPUProfilerPreciseSamplingEntry::Data::Data()
+		:numCalls(0), 
+		avgCycles(0), maxCycles(0), totalCycles(0),
+		avgSelfCycles(0), totalSelfCycles(0), 
+		estimatedSelfOverhead(0), estimatedOverhead(0), 
+		pctOfParent(1.0f)
+	{ }
+}

+ 0 - 29
CamelotUtility/Source/CmProfiler.cpp

@@ -1,29 +0,0 @@
-#include "CmProfiler.h"
-
-namespace CamelotFramework
-{
-	void Profiler::beginSample(const String& name)
-	{
-
-	}
-
-	Profiler::Data Profiler::endSample(const String& name)
-	{
-		return Profiler::Data();
-	}
-
-	void Profiler::beginSamplePrecise(const String& name)
-	{
-
-	}
-
-	Profiler::PreciseData Profiler::endSamplePrecise(const String& name)
-	{
-		return Profiler::PreciseData();
-	}
-
-	void Profiler::update()
-	{
-
-	}
-}

+ 7 - 0
TODO.txt

@@ -8,6 +8,11 @@ LONGTERM TODO:
   - When building a profiler have main Profiler class which just does measurements, then ProfilerOverlay for data display on-screen, ProfilerEditor for Unity-like Profiler, etc.
   - For now just create a profiler with basic measuring stats (FPS, core & sim thread time, plus times for most important systems), and ProfilerOverlay to display them
 
+PROFILER:
+ TODO: Profiler is right now including windows.h. I need to work around that but don't feel like bothering with it atm
+
+
+
 I still re-create GUIWidget mesh every frame instead of just updating it.
 
 MAJOR ISSUE: writeSubresource/readSubresoure doesn't require a shared ptr to GpuResourceData which means it could get destroyed while still in command queue. Right now it only works because I block right after I call those methods, which ensures nothing is destroyed.
@@ -87,6 +92,8 @@ Medium priority:
    - Doing setPixels_async in the texture doesn't make sure that the user doesn't actually modify the provided PixelData after that call.
    - In general I need to rethink how to handle modifying resources with multithreading
  - Closing down a window (any window) will shut down main rendering loop
+ - GUIManager draw call merging:
+    I merge two GUI elements if they don't overlap each other, however I don't consider if there is some world geometry between them. Another reason to move batching out of GUIManager.
 
 ----------------------------------------------------------------------------------------------
 Low priority TODO