12 жил өмнө · 64db00cec3
--- a/CamelotCore/Include/Win32/CmPlatformImpl.h
+++ b/CamelotCore/Include/Win32/CmPlatformImpl.h
@@ -165,6 +165,18 @@ namespace CamelotFramework
 
				 		 */

			
 
				 		static WString copyFromClipboard();

			
 
				 

			
 
				+		/**

			
 
				+		 * @brief	Queries the internal system performance counter you can use for very precise time

			
 
				+		 * 			measurements. Value is in "queryPerformanceFrequency" units.

			
 
				+		 */

			
 
				+		static UINT64 queryPerformanceCounter();

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Queries the internal system performance counter frequency. Used for interpreting

			
 
				+		 * 			data returned by "queryPerformanceCounter".

			
 
				+		 */

			
 
				+		static UINT64 queryPerformanceFrequency();

			
 
				+

			
 
				 		/**

			
 
				 		 * @brief	Message pump. Processes OS messages and returns when it's free.

			
 
				 		 * 			

			
--- a/CamelotCore/Source/Win32/CmPlatformImpl.cpp
+++ b/CamelotCore/Source/Win32/CmPlatformImpl.cpp
@@ -341,6 +341,22 @@ namespace CamelotFramework
 
				 		return L"";

			
 
				 	}

			
 
				 

			
 
				+	UINT64 Platform::queryPerformanceCounter()
				+	{
				+		// Raw tick count of the system performance counter. Interpret it using
				+		// the value returned by queryPerformanceFrequency().
				+		LARGE_INTEGER ticks;
				+		QueryPerformanceCounter(&ticks);
				+
				+		return static_cast<UINT64>(ticks.QuadPart);
				+	}

			
 
				+

			
 
				+	UINT64 Platform::queryPerformanceFrequency()
				+	{
				+		// Frequency of the system performance counter, used for converting
				+		// queryPerformanceCounter() values into time.
				+		LARGE_INTEGER ticksPerSecond;
				+		QueryPerformanceFrequency(&ticksPerSecond);
				+
				+		return static_cast<UINT64>(ticksPerSecond.QuadPart);
				+	}

			
 
				+

			
 
				 	void Platform::messagePump()

			
 
				 	{

			
 
				 		MSG  msg;

			
--- a/CamelotUtility/CamelotUtility.vcxproj
+++ b/CamelotUtility/CamelotUtility.vcxproj
@@ -251,7 +251,7 @@
 
				     <ClCompile Include="Source\CmInt2.cpp" />

			
 
				     <ClCompile Include="Source\CmManagedDataBlock.cpp" />

			
 
				     <ClCompile Include="Source\CmMemStack.cpp" />

			
 
				-    <ClCompile Include="Source\CmProfiler.cpp" />

			
 
				+    <ClCompile Include="Source\CmCPUProfiler.cpp" />

			
 
				     <ClCompile Include="Source\CmRect.cpp" />

			
 
				     <ClCompile Include="Source\CmStringTable.cpp" />

			
 
				     <ClCompile Include="Source\CmTexAtlasGenerator.cpp" />

			
@@ -280,7 +280,7 @@
 
				     <ClInclude Include="Include\CmMemoryAllocator.h" />

			
 
				     <ClInclude Include="Include\CmModule.h" />

			
 
				     <ClInclude Include="Include\CmPath.h" />

			
 
				-    <ClInclude Include="Include\CmProfiler.h" />

			
 
				+    <ClInclude Include="Include\CmCPUProfiler.h" />

			
 
				     <ClInclude Include="Include\CmRect.h" />

			
 
				     <ClInclude Include="Include\CmRTTIField.h" />

			
 
				     <ClInclude Include="Include\CmRTTIManagedDataBlockField.h" />

			
--- a/CamelotUtility/CamelotUtility.vcxproj.filters
+++ b/CamelotUtility/CamelotUtility.vcxproj.filters
@@ -228,7 +228,7 @@
 
				     <ClInclude Include="Include\CmStringTable.h">

			
 
				       <Filter>Header Files</Filter>

			
 
				     </ClInclude>

			
 
				-    <ClInclude Include="Include\CmProfiler.h">

			
 
				+    <ClInclude Include="Include\CmCPUProfiler.h">

			
 
				       <Filter>Header Files</Filter>

			
 
				     </ClInclude>

			
 
				   </ItemGroup>

			
@@ -347,7 +347,7 @@
 
				     <ClCompile Include="Source\CmStringTable.cpp">

			
 
				       <Filter>Source Files</Filter>

			
 
				     </ClCompile>

			
 
				-    <ClCompile Include="Source\CmProfiler.cpp">

			
 
				+    <ClCompile Include="Source\CmCPUProfiler.cpp">

			
 
				       <Filter>Source Files</Filter>

			
 
				     </ClCompile>

			
 
				   </ItemGroup>

			
--- a/CamelotUtility/Include/CmCPUProfiler.h
+++ b/CamelotUtility/Include/CmCPUProfiler.h
@@ -0,0 +1,297 @@
 
				+#pragma once

			
 
				+

			
 
				+#include "CmPrerequisitesUtil.h"

			
 
				+

			
 
				+namespace CamelotFramework

			
 
				+{

			
 
				+	class CPUProfilerReport;

			
 
				+

			
 
				+	// TODO: Add #defines for all profiler methods so we can easily remove them from final version

			
 
				+

			
 
				+	/**

			
 
				+	 * @brief	Provides various performance measuring methods

			
 
				+	 * 			

			
 
				+	 * @note	This class is thread safe. Matching begin*\end* calls

			
 
				+	 * 			must belong to the same thread though.

			
 
				+	 */

			
 
				+	class CM_UTILITY_EXPORT CPUProfiler

			
 
				+	{

			
 
				+		class Timer

			
 
				+		{

			
 
				+		public:

			
 
				+			Timer();

			
 
				+

			
 
				+			void start();

			
 
				+			void stop();

			
 
				+			void reset();

			
 
				+

			
 
				+			double time;

			
 
				+		private:

			
 
				+			double startTime;

			
 
				+

			
 
				+			static inline double getCurrentTime();

			
 
				+		};

			
 
				+

			
 
				+		class TimerPrecise

			
 
				+		{

			
 
				+		public:

			
 
				+			TimerPrecise();

			
 
				+

			
 
				+			void start();

			
 
				+			void stop();

			
 
				+			void reset();

			
 
				+

			
 
				+			UINT64 cycles;

			
 
				+		private:

			
 
				+			UINT64 startCycles;

			
 
				+

			
 
				+			static inline UINT64 getNumCycles();

			
 
				+		};

			
 
				+

			
 
				+		struct ProfileSample

			
 
				+		{

			
 
				+			ProfileSample(double _time)

			
 
				+				:time(_time)

			
 
				+			{ }

			
 
				+

			
 
				+			double time;

			
 
				+		};

			
 
				+

			
 
				+		struct PreciseProfileSample

			
 
				+		{

			
 
				+			PreciseProfileSample(UINT64 _cycles)

			
 
				+				:cycles(_cycles)

			
 
				+			{ }

			
 
				+

			
 
				+			UINT64 cycles;

			
 
				+		};

			
 
				+

			
 
				+		struct ProfileData

			
 
				+		{

			
 
				+			Vector<ProfileSample>::type samples;

			
 
				+			Timer timer;

			
 
				+

			
 
				+			void beginSample();

			
 
				+			void endSample();

			
 
				+			void resumeLastSample();

			
 
				+		};

			
 
				+

			
 
				+		struct PreciseProfileData

			
 
				+		{

			
 
				+			// TODO - Add cache misses, branch mispredictions, retired instructions vs. optimal number of cycles

			
 
				+

			
 
				+			Vector<PreciseProfileSample>::type samples;

			
 
				+			TimerPrecise timer;

			
 
				+

			
 
				+			void beginSample();

			
 
				+			void endSample();

			
 
				+			void resumeLastSample();

			
 
				+		};

			
 
				+

			
 
				+		struct PreciseProfiledBlock;

			
 
				+		struct ProfiledBlock;

			
 
				+

			
 
				+		struct ProfiledBlock

			
 
				+		{

			
 
				+			ProfiledBlock();

			
 
				+			~ProfiledBlock();

			
 
				+

			
 
				+			String name;

			
 
				+			ProfileData data;

			
 
				+			Vector<ProfiledBlock*>::type children;

			
 
				+			Vector<PreciseProfiledBlock*>::type preciseChildren; // Needed only for estimating overhead

			
 
				+

			
 
				+			ProfiledBlock* findChild(const String& name) const;

			
 
				+		};

			
 
				+

			
 
				+		struct PreciseProfiledBlock

			
 
				+		{

			
 
				+			PreciseProfiledBlock();

			
 
				+			~PreciseProfiledBlock();

			
 
				+

			
 
				+			String name;

			
 
				+			PreciseProfileData data;

			
 
				+			Vector<PreciseProfiledBlock*>::type children;

			
 
				+			Vector<ProfiledBlock*>::type basicChildren; // Needed only for estimating overhead

			
 
				+

			
 
				+			PreciseProfiledBlock* findChild(const String& name) const;

			
 
				+		};

			
 
				+

			
 
				+		struct ThreadInfo

			
 
				+		{

			
 
				+			ThreadInfo();

			
 
				+

			
 
				+			static CM_THREADLOCAL ThreadInfo* activeThread;

			
 
				+			bool isActive;

			
 
				+

			
 
				+			ProfiledBlock* rootBlock;

			
 
				+			Stack<ProfiledBlock*>::type activeBlocks;

			
 
				+			ProfiledBlock* activeBlock;

			
 
				+

			
 
				+			PreciseProfiledBlock* rootPreciseBlock;

			
 
				+			Stack<PreciseProfiledBlock*>::type activePreciseBlocks;

			
 
				+			PreciseProfiledBlock* activePreciseBlock;

			
 
				+

			
 
				+			void begin(const String& _name);

			
 
				+			void end();

			
 
				+			void reset();

			
 
				+

			
 
				+			ProfiledBlock* getBlock();

			
 
				+			void releaseBlock(ProfiledBlock* block);

			
 
				+

			
 
				+			PreciseProfiledBlock* getPreciseBlock();

			
 
				+			void releasePreciseBlock(PreciseProfiledBlock* block);

			
 
				+		};

			
 
				+

			
 
				+	public:

			
 
				+		CPUProfiler();

			
 
				+		~CPUProfiler();

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Registers a new thread we will be doing sampling in. This needs to be called before any beginSample*\endSample* calls

			
 
				+		 * 			are made in that thread.

			
 
				+		 *

			
 
				+		 * @param	name	Name that will allow you to more easily identify the thread.

			
 
				+		 */

			
 
				+		void beginThread(const String& name);

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Ends sampling for the current thread. No beginSample*\endSample* calls after this point.

			
 
				+		 */

			
 
				+		void endThread();

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Begins sample measurement. Must be followed by endSample. 

			
 
				+		 *

			
 
				+		 * @param	name	Unique name for the sample you can later use to find the sampling data.

			
 
				+		 */

			
 
				+		void beginSample(const String& name);

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Ends sample measurement. Measured data is stored internally and can be retrieved with generateReport.

			
 
				+		 *

			
 
				+		 * @param	name	Unique name for the sample. 

			
 
				+		 * 					

			
 
				+		 * @note	Unique name is primarily needed to more easily identify mismatched

			
 
				+		 * 			begin/end sample pairs. Otherwise the name in beginSample would be enough.

			
 
				+		 */

			
 
				+		void endSample(const String& name);

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Begins sample measurement. Must be followed by endSample. 

			
 
				+		 *

			
 
				+		 * @param	name	Unique name for the sample you can later use to find the sampling data.

			
 
				+		 * 					

			
 
				+		 * @note	This method uses very precise CPU counters to determine variety of data not

			
 
				+		 * 			provided by standard beginSample. However due to the way these counters work you should

			
 
				+		 * 			not use this method for larger parts of code. It does not consider context switches so if the OS

			
 
				+		 * 			decides to switch context between measurements you will get invalid data.

			
 
				+		 */

			
 
				+		void beginSamplePrecise(const String& name);

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Ends precise sample measurement. Measured data is stored internally and can be retrieved with generateReport.

			
 
				+		 *

			
 
				+		 * @param	name	Unique name for the sample. 

			
 
				+		 * 					

			
 
				+		 * @note	Unique name is primarily needed to more easily identify mismatched

			
 
				+		 * 			begin/end sample pairs. Otherwise the name in beginSamplePrecise would be enough.

			
 
				+		 */

			
 
				+		void endSamplePrecise(const String& name);

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Called every frame. Internal method.

			
 
				+		 */

			
 
				+		void update();

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Clears all sampling data of the calling thread, and ends any of its unfinished sampling blocks.

			
 
				+		 */

			
 
				+		void reset();

			
 
				+

			
 
				+		/**

			
 
				+		 * @brief	Generates a report from all previously sampled data.

			
 
				+		 * 			

			
 
				+		 * @note	Generating a report will stop all in-progress sampling. You should make sure

			
 
				+		 * 			you call endSample* manually beforehand so this doesn't have to happen.

			
 
				+		 */

			
 
				+		CPUProfilerReport generateReport();

			
 
				+

			
 
				+	private:

			
 
				+		double mBasicTimerOverhead;

			
 
				+		UINT64 mPreciseTimerOverhead;

			
 
				+

			
 
				+		double mBasicSamplingOverhead;

			
 
				+		UINT64 mPreciseSamplingOverhead;

			
 
				+

			
 
				+		Vector<ThreadInfo*>::type mActiveThreads;

			
 
				+		CM_MUTEX(mThreadSync);

			
 
				+

			
 
				+		void estimateTimerOverhead();

			
 
				+	};

			
 
				+

			
 
				+	struct CM_UTILITY_EXPORT CPUProfilerBasicSamplingEntry

			
 
				+	{

			
 
				+		struct CM_UTILITY_EXPORT Data

			
 
				+		{

			
 
				+			Data();

			
 
				+

			
 
				+			String name;

			
 
				+			UINT32 numCalls;

			
 
				+

			
 
				+			double avgTimeMs;

			
 
				+			double maxTimeMs;

			
 
				+			double totalTimeMs;

			
 
				+

			
 
				+			double avgSelfTimeMs;

			
 
				+			double totalSelfTimeMs;

			
 
				+

			
 
				+			double estimatedSelfOverheadMs;

			
 
				+			double estimatedOverheadMs;

			
 
				+

			
 
				+			float pctOfParent;

			
 
				+		} data;

			
 
				+

			
 
				+		Vector<CPUProfilerBasicSamplingEntry>::type childEntries;

			
 
				+	};

			
 
				+

			
 
				+	struct CM_UTILITY_EXPORT CPUProfilerPreciseSamplingEntry

			
 
				+	{

			
 
				+		struct CM_UTILITY_EXPORT Data

			
 
				+		{

			
 
				+			Data();

			
 
				+

			
 
				+			String name;

			
 
				+			UINT32 numCalls;

			
 
				+

			
 
				+			UINT64 avgCycles;

			
 
				+			UINT64 maxCycles;

			
 
				+			UINT64 totalCycles;

			
 
				+

			
 
				+			UINT64 avgSelfCycles;

			
 
				+			UINT64 totalSelfCycles;

			
 
				+

			
 
				+			UINT64 estimatedSelfOverhead;

			
 
				+			UINT64 estimatedOverhead;

			
 
				+

			
 
				+			float pctOfParent;

			
 
				+		} data;

			
 
				+

			
 
				+		Vector<CPUProfilerPreciseSamplingEntry>::type childEntries;

			
 
				+	};

			
 
				+

			
 
				+	class CM_UTILITY_EXPORT CPUProfilerReport

			
 
				+	{

			
 
				+	public:

			
 
				+		const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }

			
 
				+		const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }

			
 
				+

			
 
				+	private:

			
 
				+		friend class CPUProfiler;

			
 
				+

			
 
				+		CPUProfilerReport();

			
 
				+

			
 
				+		CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;

			
 
				+		CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;

			
 
				+	};

			
 
				+}
			
--- a/CamelotUtility/Include/CmProfiler.h
+++ b/CamelotUtility/Include/CmProfiler.h
@@ -1,78 +0,0 @@
 
				-#pragma once

			
 
				-

			
 
				-#include "CmPrerequisitesUtil.h"

			
 
				-#include "CmModule.h"

			
 
				-

			
 
				-namespace CamelotFramework

			
 
				-{

			
 
				-	/**

			
 
				-	 * @brief	Provides various performance measuring methods

			
 
				-	 */

			
 
				-	class CM_UTILITY_EXPORT Profiler : public Module<Profiler>

			
 
				-	{

			
 
				-	public:

			
 
				-		class CM_UTILITY_EXPORT Data

			
 
				-		{

			
 
				-			String name;

			
 
				-			float timeAvgMs;

			
 
				-			float timeMaxMs;

			
 
				-			float timeTotalMs;

			
 
				-			UINT32 hitCount;

			
 
				-		};

			
 
				-

			
 
				-		class CM_UTILITY_EXPORT PreciseData

			
 
				-		{

			
 
				-			String name;

			
 
				-			UINT64 cyclesAvg;

			
 
				-			UINT64 cyclesMax;

			
 
				-			UINT64 cyclesTotal;

			
 
				-			UINT32 hitCount;

			
 
				-

			
 
				-			// TODO - Add cache misses, branch mispredictions, retired instructions vs. optimal number of cycles

			
 
				-		};

			
 
				-

			
 
				-		/**

			
 
				-		 * @brief	Begins sample measurement. Must be followed by endSample. 

			
 
				-		 *

			
 
				-		 * @param	name	Unique name for the sample you can later use to find the sampling data.

			
 
				-		 */

			
 
				-		void beginSample(const String& name);

			
 
				-

			
 
				-		/**

			
 
				-		 * @brief	Ends sample measurement and returns measured data.

			
 
				-		 *

			
 
				-		 * @param	name	Unique name for the sample. 

			
 
				-		 * 					

			
 
				-		 * @note	Unique name is primarily needed to more easily identify mismatched

			
 
				-		 * 			begin/end sample pairs. Otherwise the name in beginSample would be enough.

			
 
				-		 */

			
 
				-		Data endSample(const String& name);

			
 
				-

			
 
				-		/**

			
 
				-		 * @brief	Begins sample measurement. Must be followed by endSample. 

			
 
				-		 *

			
 
				-		 * @param	name	Unique name for the sample you can later use to find the sampling data.

			
 
				-		 * 					

			
 
				-		 * @note	This method uses very precise CPU counters to determine variety of data not

			
 
				-		 * 			provided by standard beginSample. However due to the way these counters work you should

			
 
				-		 * 			not use this method for larger parts of code. It does not consider context switches so if the OS

			
 
				-		 * 			decides to switch context between measurements you will get invalid data.

			
 
				-		 */

			
 
				-		void beginSamplePrecise(const String& name);

			
 
				-

			
 
				-		/**

			
 
				-		 * @brief	Ends precise sample measurement and returns measured data.

			
 
				-		 *

			
 
				-		 * @param	name	Unique name for the sample. 

			
 
				-		 * 					

			
 
				-		 * @note	Unique name is primarily needed to more easily identify mismatched

			
 
				-		 * 			begin/end sample pairs. Otherwise the name in beginSamplePrecise would be enough.

			
 
				-		 */

			
 
				-		PreciseData endSamplePrecise(const String& name);

			
 
				-

			
 
				-		/**

			
 
				-		 * @brief	Called every frame. Internal method.

			
 
				-		 */

			
 
				-		void update();

			
 
				-	};

			
 
				-}
			
--- a/CamelotUtility/Source/CmCPUProfiler.cpp
+++ b/CamelotUtility/Source/CmCPUProfiler.cpp
@@ -0,0 +1,725 @@
 
				+#include "CmCPUProfiler.h"

			
 
				+#include "CmDebug.h"

			
 
				+

			
 
				+#if CM_PLATFORM == CM_PLATFORM_WIN32

			
 
				+#include "windows.h"

			
 
				+#endif

			
 
				+

			
 
				+namespace CamelotFramework

			
 
				+{

			
 
				+	// Creates a stopped timer with no accumulated time. startTime is initialized as well so
				+	// that a stop() without a preceding start() never reads an indeterminate value.
				+	CPUProfiler::Timer::Timer()
				+		:time(0.0), startTime(0.0)
				+	{ }

			
 
				+

			
 
				+	void CPUProfiler::Timer::start()
				+	{
				+		// Remember when the current interval began; stop() adds the elapsed part to "time".
				+		const double now = getCurrentTime();
				+		startTime = now;
				+	}

			
 
				+

			
 
				+	void CPUProfiler::Timer::stop()
				+	{
				+		// Accumulate (rather than overwrite) so a sample can be stopped and resumed.
				+		const double now = getCurrentTime();
				+		time += now - startTime;
				+	}

			
 
				+

			
 
				+	void CPUProfiler::Timer::reset()
				+	{
				+		// Discard everything accumulated so far.
				+		time = 0.0;
				+	}

			
 
				+

			
 
				+	inline double CPUProfiler::Timer::getCurrentTime() 

			
 
				+	{

			
 
				+		// TODO: I should be calling Platform:: performance methods instead of doing it here.

			
 
				+		// The only problem is that Platform belong to Core and not Utility

			
 
				+#if CM_PLATFORM == CM_PLATFORM_WIN32

			
 
				+		LARGE_INTEGER counterValue;

			
 
				+		QueryPerformanceCounter(&counterValue);

			
 
				+

			
 
				+		LARGE_INTEGER counterFrequency;

			
 
				+		QueryPerformanceFrequency(&counterFrequency);

			
 
				+

			
 
				+		return (double)counterValue.QuadPart / (counterFrequency.QuadPart * 0.001);

			
 
				+#else

			
 
				+		NOT IMPLEMENTED

			
 
				+#endif

			
 
				+	}

			
 
				+

			
 
				+	// Creates a stopped timer with no accumulated cycles. startCycles is initialized as well so
				+	// that a stop() without a preceding start() never reads an indeterminate value.
				+	CPUProfiler::TimerPrecise::TimerPrecise()
				+		:cycles(0), startCycles(0)
				+	{ }

			
 
				+

			
 
				+	void CPUProfiler::TimerPrecise::start()
				+	{
				+		// Remember the cycle counter at the beginning of the interval.
				+		const UINT64 now = getNumCycles();
				+		startCycles = now;
				+	}

			
 
				+

			
 
				+	void CPUProfiler::TimerPrecise::stop()
				+	{
				+		// Accumulate (rather than overwrite) so a sample can be stopped and resumed.
				+		const UINT64 now = getNumCycles();
				+		cycles += now - startCycles;
				+	}

			
 
				+

			
 
				+	void CPUProfiler::TimerPrecise::reset()
				+	{
				+		// Discard everything accumulated so far.
				+		cycles = UINT64(0);
				+	}

			
 
				+

			
 
				+	inline UINT64 CPUProfiler::TimerPrecise::getNumCycles() // Returns the current CPU time-stamp counter value, read with rdtsc.
				+	{
				+#if CM_COMPILER == CM_COMPILER_GNUC
				+		asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx" ); // Issued right before rdtsc; presumably acts as a serializing barrier - TODO confirm
				+		UINT32 __a,__d;
				+		asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); // eax is written to __a, edx to __d
				+		return ( UINT64(__a) | UINT64(__d) << 32 ); // __a supplies the low 32 bits, __d the high 32 bits
				+#else
				+		int a[4]; // Output of __cpuid; the values themselves are never read
				+		int b = 0; // Second argument handed to __cpuid
				+		__cpuid(a, b);
				+		return __rdtsc();
				+#endif		
				+	}

			
 
				+

			
 
				+	void CPUProfiler::ProfileData::beginSample()
				+	{
				+		// Every sample starts from zero; the timer only accumulates between
				+		// endSample()/resumeLastSample() pairs of the same sample.
				+		timer.reset();
				+		timer.start();
				+	}

			
 
				+

			
 
				+	void CPUProfiler::ProfileData::endSample()
				+	{
				+		// Stop measuring first so the bookkeeping below is not part of the sample.
				+		timer.stop();
				+
				+		const ProfileSample finished(timer.time);
				+		samples.push_back(finished);
				+	}

			
 
				+

			
 
				+	// Undoes the most recent endSample(): removes the sample it recorded and continues timing,
				+	// so the next endSample() records the combined duration.
				+	void CPUProfiler::ProfileData::resumeLastSample()
				+	{
				+		// Erasing "end() - 1" of an empty vector is undefined behaviour, so only remove
				+		// a sample if one was actually recorded.
				+		if(!samples.empty())
				+			samples.erase(samples.end() - 1);
				+
				+		// Restart last so that the erase above is not included in the measurement.
				+		timer.start();
				+	}

			
 
				+

			
 
				+	void CPUProfiler::PreciseProfileData::beginSample()
				+	{
				+		// Every sample starts from zero; the timer only accumulates between
				+		// endSample()/resumeLastSample() pairs of the same sample.
				+		timer.reset();
				+		timer.start();
				+	}

			
 
				+

			
 
				+	void CPUProfiler::PreciseProfileData::endSample()
				+	{
				+		// Stop measuring first so the bookkeeping below is not part of the sample.
				+		timer.stop();
				+
				+		const PreciseProfileSample finished(timer.cycles);
				+		samples.push_back(finished);
				+	}

			
 
				+

			
 
				+	// Undoes the most recent endSample(): removes the sample it recorded and continues counting,
				+	// so the next endSample() records the combined cycle count.
				+	void CPUProfiler::PreciseProfileData::resumeLastSample()
				+	{
				+		// Erasing "end() - 1" of an empty vector is undefined behaviour, so only remove
				+		// a sample if one was actually recorded.
				+		if(!samples.empty())
				+			samples.erase(samples.end() - 1);
				+
				+		// Restart last so that the erase above is not included in the measurement.
				+		timer.start();
				+	}

			
 
				+

			
 
				+	CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;

			
 
				+

			
 
				+	// Starts out inactive with no block hierarchy allocated. Initializers are listed
				+	// in member declaration order.
				+	CPUProfiler::ThreadInfo::ThreadInfo()
				+		:isActive(false), rootBlock(nullptr), activeBlock(nullptr),
				+		rootPreciseBlock(nullptr), activePreciseBlock(nullptr)
				+	{ }

			
 
				+

			
 
				+	// Starts sampling on this thread: makes sure both root blocks exist, makes them the
				+	// active blocks, names them after the thread and starts their samples.
				+	//
				+	// _name - Name given to both root blocks.
				+	void CPUProfiler::ThreadInfo::begin(const String& _name)
				+	{
				+		if(isActive)
				+		{
				+			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
				+			return;
				+		}
				+
				+		if(rootBlock == nullptr)
				+			rootBlock = getBlock();
				+
				+		if(rootPreciseBlock == nullptr)
				+			rootPreciseBlock = getPreciseBlock();
				+
				+		activeBlocks.push(rootBlock);
				+		activeBlock = rootBlock;
				+
				+		activePreciseBlocks.push(rootPreciseBlock);
				+		activePreciseBlock = rootPreciseBlock;
				+
				+		// Mark the thread as sampled. Without this the guard above and every isActive check
				+		// made by CPUProfiler would treat the thread as inactive forever.
				+		isActive = true;
				+
				+		rootBlock->name = _name; 
				+		rootBlock->data.beginSample();
				+
				+		rootPreciseBlock->name = _name;
				+		rootPreciseBlock->data.beginSample();
				+	}

			
 
				+

			
 
				+	// Stops sampling on this thread: ends the active blocks, force-ends (with a warning) any
				+	// samples that were left open, and clears the active block state.
				+	void CPUProfiler::ThreadInfo::end()
				+	{
				+		// The active block pointers are only non-null between begin() and end(). Checking them
				+		// up front avoids dereferencing null when the thread is not being sampled.
				+		if(activeBlock == nullptr || activePreciseBlock == nullptr)
				+		{
				+			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
				+			return;
				+		}
				+
				+		// End the innermost samples before doing anything else.
				+		activePreciseBlock->data.endSample();
				+		activePreciseBlocks.pop();
				+
				+		activeBlock->data.endSample();
				+		activeBlocks.pop();
				+
				+		// Anything still on the stacks was opened with beginSample* but never closed.
				+		if(activeBlocks.size() > 0)
				+		{
				+			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
				+
				+			while(activeBlocks.size() > 0)
				+			{
				+				ProfiledBlock* block = activeBlocks.top();
				+				block->data.endSample();
				+
				+				activeBlocks.pop();
				+			}
				+		}
				+
				+		if(activePreciseBlocks.size() > 0)
				+		{
				+			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
				+
				+			while(activePreciseBlocks.size() > 0)
				+			{
				+				PreciseProfiledBlock* block = activePreciseBlocks.top();
				+				block->data.endSample();
				+
				+				activePreciseBlocks.pop();
				+			}
				+		}
				+
				+		isActive = false;
				+		activeBlocks = Stack<ProfiledBlock*>::type();
				+		activeBlock = nullptr;
				+		activePreciseBlocks = Stack<PreciseProfiledBlock*>::type();
				+		activePreciseBlock = nullptr;
				+	}

			
 
				+

			
 
				+	// Ends sampling if it is in progress and frees both block hierarchies.
				+	void CPUProfiler::ThreadInfo::reset()
				+	{
				+		if(isActive)
				+			end();
				+
				+		if(rootBlock != nullptr)
				+		{
				+			releaseBlock(rootBlock);
				+			rootBlock = nullptr;
				+		}
				+
				+		if(rootPreciseBlock != nullptr)
				+		{
				+			releasePreciseBlock(rootPreciseBlock);
				+			rootPreciseBlock = nullptr;
				+		}
				+	}

			
 
				+

			
 
				+	// Allocates a new, empty basic sampling block.
				+	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
				+	{
				+		// TODO - Pool this, if possible using the memory allocator stuff
				+		// TODO - Also consider moving all samples in ThreadInfo, and also pool them
				+		// (otherwise I can't pool ProfiledBlock since it will be variable size)
				+		ProfiledBlock* newBlock = cm_new<ProfiledBlock>();
				+		return newBlock;
				+	}

			
 
				+

			
 
				+	// Frees a block previously handed out by getBlock().
				+	void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block)
				+	{
				+		cm_delete(block);
				+	}

			
 
				+

			
 
				+	// Allocates a new, empty precise sampling block.
				+	CPUProfiler::PreciseProfiledBlock* CPUProfiler::ThreadInfo::getPreciseBlock()
				+	{
				+		// TODO - Pool this, if possible using the memory allocator stuff
				+		// TODO - Also consider moving all samples in ThreadInfo, and also pool them
				+		// (otherwise I can't pool ProfiledBlock since it will be variable size)
				+		PreciseProfiledBlock* newBlock = cm_new<PreciseProfiledBlock>();
				+		return newBlock;
				+	}

			
 
				+

			
 
				+	// Frees a block previously handed out by getPreciseBlock().
				+	void CPUProfiler::ThreadInfo::releasePreciseBlock(CPUProfiler::PreciseProfiledBlock* block)
				+	{
				+		cm_delete(block);
				+	}

			
 
				+

			
 
				+	// Nothing to set up explicitly; all members are default-constructed.
				+	CPUProfiler::ProfiledBlock::ProfiledBlock()
				+	{ }

			
 
				+

			
 
				+	// Frees all child blocks, which recursively frees the whole hierarchy below this block.
				+	// preciseChildren is not freed here.
				+	CPUProfiler::ProfiledBlock::~ProfiledBlock()
				+	{
				+		// The thread-local pointer is null on threads that never called beginThread, so it
				+		// must not be dereferenced unconditionally.
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+
				+		for(auto& child : children)
				+		{
				+			if(thread != nullptr)
				+				thread->releaseBlock(child);
				+			else
				+				cm_delete(child); // Same as what ThreadInfo::releaseBlock does
				+		}
				+
				+		children.clear();
				+	}

			
 
				+

			
 
				+	// Returns the first direct child whose name equals "name", or nullptr if there is none.
				+	CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
				+	{
				+		for(auto iter = children.begin(); iter != children.end(); ++iter)
				+		{
				+			ProfiledBlock* candidate = *iter;
				+
				+			if(candidate->name == name)
				+				return candidate;
				+		}
				+
				+		return nullptr;
				+	}

			
 
				+

			
 
				+	// Nothing to set up explicitly; all members are default-constructed.
				+	CPUProfiler::PreciseProfiledBlock::PreciseProfiledBlock()
				+	{ }

			
 
				+

			
 
				+	// Frees all child blocks, which recursively frees the whole hierarchy below this block.
				+	// basicChildren is not freed here.
				+	CPUProfiler::PreciseProfiledBlock::~PreciseProfiledBlock()
				+	{
				+		// The thread-local pointer is null on threads that never called beginThread, so it
				+		// must not be dereferenced unconditionally.
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+
				+		for(auto& child : children)
				+		{
				+			if(thread != nullptr)
				+				thread->releasePreciseBlock(child);
				+			else
				+				cm_delete(child); // Same as what ThreadInfo::releasePreciseBlock does
				+		}
				+
				+		children.clear();
				+	}

			
 
				+

			
 
				+	// Returns the first direct child whose name equals "name", or nullptr if there is none.
				+	CPUProfiler::PreciseProfiledBlock* CPUProfiler::PreciseProfiledBlock::findChild(const String& name) const
				+	{
				+		for(auto iter = children.begin(); iter != children.end(); ++iter)
				+		{
				+			PreciseProfiledBlock* candidate = *iter;
				+
				+			if(candidate->name == name)
				+				return candidate;
				+		}
				+
				+		return nullptr;
				+	}

			
 
				+

			
 
				+	// Zeroes all overhead estimates and then measures them once.
				+	CPUProfiler::CPUProfiler()
				+		:mBasicTimerOverhead(0.0),
				+		mPreciseTimerOverhead(0),
				+		mBasicSamplingOverhead(0.0),
				+		mPreciseSamplingOverhead(0)
				+	{
				+		// TODO - Overhead is only estimated here, when the profiler is created. It might be better
				+		// to estimate it each time beginThread is called, and keep separate values per thread.
				+		estimateTimerOverhead();
				+	}

			
 
				+

			
 
				+	// Frees the sampling data and bookkeeping of every thread registered with this profiler.
				+	CPUProfiler::~CPUProfiler()
				+	{
				+		reset();
				+
				+		CM_LOCK_MUTEX(mThreadSync);
				+
				+		// First pass: ThreadInfo::reset() ends unfinished sampling and frees both the basic and
				+		// the precise root hierarchy, skipping roots that are already null.
				+		for(auto& threadInfo : mActiveThreads)
				+			threadInfo->reset();
				+
				+		// Second pass: free the ThreadInfo objects. This happens only after all blocks are gone
				+		// because the block destructors read ThreadInfo::activeThread.
				+		for(auto& threadInfo : mActiveThreads)
				+		{
				+			// Do not leave the calling thread's thread-local pointer dangling.
				+			if(ThreadInfo::activeThread == threadInfo)
				+				ThreadInfo::activeThread = nullptr;
				+
				+			cm_delete(threadInfo);
				+		}
				+
				+		mActiveThreads.clear();
				+	}

			
 
				+

			
 
				+	// Registers the calling thread on first use and starts sampling on it.
				+	void CPUProfiler::beginThread(const String& name)
				+	{
				+		ThreadInfo* current = ThreadInfo::activeThread;
				+
				+		if(current == nullptr)
				+		{
				+			// First call on this thread: create its bookkeeping and remember it in the
				+			// shared list, which is guarded by mThreadSync.
				+			current = cm_new<ThreadInfo>();
				+			ThreadInfo::activeThread = current;
				+
				+			{
				+				CM_LOCK_MUTEX(mThreadSync);
				+
				+				mActiveThreads.push_back(current);
				+			}
				+		}
				+
				+		current->begin(name);
				+	}

			
 
				+

			
 
				+	void CPUProfiler::endThread()
				+	{
				+		// The null check is left out here on purpose, so that the samples are ended
				+		// as soon as possible.
				+		ThreadInfo* current = ThreadInfo::activeThread;
				+		current->end();
				+	}

			
 
				+

			
 
				+	// Starts a basic sample called "name" as a child of the calling thread's currently active
				+	// block. If the thread is not being sampled yet, sampling is started under the name "Unknown".
				+	//
				+	// name - Name of the sample. A child block with the same name is re-used so that repeated
				+	//        calls add samples to one block instead of creating a new block each time.
				+	void CPUProfiler::beginSample(const String& name)
				+	{
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+
				+		// activeBlock is only non-null while the thread is being sampled, and it is what gets
				+		// dereferenced below, so it is the condition tested here.
				+		if(thread == nullptr || thread->activeBlock == nullptr)
				+		{
				+			beginThread("Unknown");
				+
				+			// beginThread may have just created the ThreadInfo, so the pointer must be re-read.
				+			thread = ThreadInfo::activeThread;
				+		}
				+
				+		ProfiledBlock* parent = thread->activeBlock;
				+		ProfiledBlock* block = parent->findChild(name);
				+
				+		if(block == nullptr)
				+		{
				+			block = thread->getBlock();
				+			block->name = name;
				+
				+			parent->children.push_back(block);
				+			thread->activePreciseBlock->basicChildren.push_back(block);
				+		}
				+
				+		// The block becomes the active one whether it was just created or re-used.
				+		thread->activeBlocks.push(block);
				+		thread->activeBlock = block;
				+
				+		block->data.beginSample();
				+	}

			
 
				+

			
 
				+	// Ends the calling thread's currently active basic sample and makes its parent active again.
				+	//
				+	// name - Must equal the name of the active block. On a mismatch a warning is logged and the
				+	//        active sample keeps running.
				+	void CPUProfiler::endSample(const String& name)
				+	{
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+
				+		// A single pointer test; without it an unmatched endSample dereferences null.
				+		if(thread == nullptr || thread->activeBlock == nullptr)
				+		{
				+			LOGWRN("Profiler::endSample called on a thread that isn't being sampled.");
				+			return;
				+		}
				+
				+		ProfiledBlock* block = thread->activeBlock;
				+		block->data.endSample();
				+
				+		if(block->name != name)
				+		{
				+			LOGWRN("Mismatched Profiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
				+
				+			block->data.resumeLastSample();
				+
				+			return;
				+		}
				+
				+		// The bottom entry of the stack is the thread's root block, which is ended by endThread.
				+		// Popping it here would leave the stack empty and make top() below undefined behaviour.
				+		if(thread->activeBlocks.size() <= 1)
				+		{
				+			LOGWRN("Profiler::endSample called for the root block of a thread. Use Profiler::endThread instead. Sampling data will not be valid.");
				+
				+			block->data.resumeLastSample();
				+
				+			return;
				+		}
				+
				+		thread->activeBlocks.pop();
				+		thread->activeBlock = thread->activeBlocks.top();
				+	}

			
 
				+

			
 
				+	// Starts a precise (cycle counting) sample called "name" as a child of the calling thread's
				+	// currently active precise block. If the thread is not being sampled yet, sampling is started
				+	// under the name "Unknown".
				+	//
				+	// name - Name of the sample. A child block with the same name is re-used so that repeated
				+	//        calls add samples to one block instead of creating a new block each time.
				+	void CPUProfiler::beginSamplePrecise(const String& name)
				+	{
				+		// Note: There is a (small) possibility a context switch will happen during this measurement in which case result will be skewed. 
				+		// Increasing thread priority might help. This is generally only a problem with code that executes a long time (10-15+ ms - depending on OS quantum length)
				+		
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+
				+		// activePreciseBlock is only non-null while the thread is being sampled, and it is what
				+		// gets dereferenced below, so it is the condition tested here.
				+		if(thread == nullptr || thread->activePreciseBlock == nullptr)
				+		{
				+			beginThread("Unknown");
				+
				+			// beginThread may have just created the ThreadInfo, so the pointer must be re-read.
				+			thread = ThreadInfo::activeThread;
				+		}
				+
				+		PreciseProfiledBlock* parent = thread->activePreciseBlock;
				+		PreciseProfiledBlock* block = parent->findChild(name);
				+
				+		if(block == nullptr)
				+		{
				+			block = thread->getPreciseBlock();
				+			block->name = name;
				+
				+			parent->children.push_back(block);
				+			thread->activeBlock->preciseChildren.push_back(block);
				+		}
				+
				+		// The block becomes the active one whether it was just created or re-used.
				+		thread->activePreciseBlocks.push(block);
				+		thread->activePreciseBlock = block;
				+
				+		block->data.beginSample();
				+	}

			
 
				+

			
 
				+	// Ends the calling thread's currently active precise sample and makes its parent active again.
				+	//
				+	// name - Must equal the name of the active precise block. On a mismatch a warning is logged
				+	//        and the active sample keeps running.
				+	void CPUProfiler::endSamplePrecise(const String& name)
				+	{
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+
				+		// A single pointer test; without it an unmatched endSamplePrecise dereferences null.
				+		if(thread == nullptr || thread->activePreciseBlock == nullptr)
				+		{
				+			LOGWRN("Profiler::endSamplePrecise called on a thread that isn't being sampled.");
				+			return;
				+		}
				+
				+		PreciseProfiledBlock* block = thread->activePreciseBlock;
				+		block->data.endSample();
				+
				+		if(block->name != name)
				+		{
				+			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
				+
				+			block->data.resumeLastSample();
				+
				+			return;
				+		}
				+
				+		// The bottom entry of the stack is the thread's root block, which is ended by endThread.
				+		// Popping it here would leave the stack empty and make top() below undefined behaviour.
				+		if(thread->activePreciseBlocks.size() <= 1)
				+		{
				+			LOGWRN("Profiler::endSamplePrecise called for the root block of a thread. Use Profiler::endThread instead. Sampling data will not be valid.");
				+
				+			block->data.resumeLastSample();
				+
				+			return;
				+		}
				+
				+		thread->activePreciseBlocks.pop();
				+		thread->activePreciseBlock = thread->activePreciseBlocks.top();
				+	}

			
 
				+

			
 
				+	/**
				+	 * @brief	Currently does nothing. Placeholder for per-update bookkeeping,
				+	 * 			FPS tracking is planned (see TODO in the body).
				+	 */
				+	void CPUProfiler::update()

			
 
				+	{

			
 
				+		// TODO: Keep track of FPS

			
 
				+	}

			
 
				+

			
 
				+	/**
				+	 * @brief	Resets the profiler data of the calling thread.
				+	 * 			Does nothing if the calling thread has no profiler data.
				+	 */
				+	void CPUProfiler::reset()
				+	{
				+		ThreadInfo* currentThread = ThreadInfo::activeThread;
				+
				+		// Nothing was ever recorded on this thread
				+		if(currentThread == nullptr)
				+			return;
				+
				+		currentThread->reset();
				+	}

			
 
				+

			
 
				+	/**
				+	 * @brief	Generates a report from all samples recorded on the calling thread.
				+	 * 			If the thread is still marked active its sampling is ended first.
				+	 *
				+	 * @return	Report containing one entry per profiled block, for both basic (time) and
				+	 * 			precise (cycle) sampling. Report is empty if nothing was recorded on this thread.
				+	 */
				+	CPUProfilerReport CPUProfiler::generateReport()
				+	{
				+		CPUProfilerReport report;
				+
				+		ThreadInfo* thread = ThreadInfo::activeThread;
				+		if(thread == nullptr)
				+			return report;
				+
				+		if(thread->isActive)
				+			thread->end();
				+
				+		if(thread->rootBlock != nullptr)
				+		{
				+			// Flatten the hierarchy breadth-first. A parent always precedes its children,
				+			// so iterating in reverse processes children before parents.
				+			Vector<ProfiledBlock*>::type flatHierarchy;
				+			Vector<CPUProfilerBasicSamplingEntry*>::type flatResultHierarchy;
				+
				+			flatHierarchy.push_back(thread->rootBlock);
				+			flatResultHierarchy.push_back(&report.mBasicSamplingRootEntry);
				+
				+			for(UINT32 idx = 0; idx < (UINT32)flatHierarchy.size(); idx++)
				+			{
				+				// Copy the pointers out, push_back below may reallocate the flat arrays
				+				ProfiledBlock* curBlock = flatHierarchy[idx];
				+				CPUProfilerBasicSamplingEntry* curEntry = flatResultHierarchy[idx];
				+
				+				// Size the child array once, before any element address is taken, so the
				+				// stored pointers are never invalidated by a later reallocation
				+				curEntry->childEntries.resize(curBlock->children.size());
				+
				+				UINT32 childIdx = 0;
				+				for(auto& child : curBlock->children)
				+				{
				+					flatHierarchy.push_back(child);
				+					flatResultHierarchy.push_back(&curEntry->childEntries[childIdx]);
				+
				+					childIdx++;
				+				}
				+			}
				+
				+			auto iter = flatHierarchy.rbegin();
				+			auto iterSample = flatResultHierarchy.rbegin();
				+
				+			for(; iter != flatHierarchy.rend(); ++iter, ++iterSample)
				+			{
				+				ProfiledBlock* curBlock = *iter;
				+				CPUProfilerBasicSamplingEntry* entry = *iterSample;
				+
				+				entry->data.name = curBlock->name;
				+
				+				entry->data.totalTimeMs = 0.0;
				+				entry->data.maxTimeMs = 0.0;
				+				for(auto& sample : curBlock->data.samples)
				+				{
				+					entry->data.totalTimeMs += sample.time;
				+					entry->data.maxTimeMs = std::max(entry->data.maxTimeMs, sample.time);
				+				}
				+
				+				entry->data.numCalls = (UINT32)curBlock->data.samples.size();
				+
				+				// A block without samples has no meaningful average
				+				if(entry->data.numCalls > 0)
				+					entry->data.avgTimeMs = entry->data.totalTimeMs / entry->data.numCalls;
				+				else
				+					entry->data.avgTimeMs = 0.0;
				+
				+				// Children were already processed, so their totals are final
				+				double totalChildTime = 0.0;
				+				for(auto& childEntry : entry->childEntries)
				+				{
				+					totalChildTime += childEntry.data.totalTimeMs;
				+
				+					if(entry->data.totalTimeMs > 0.0)
				+						childEntry.data.pctOfParent = (float)(childEntry.data.totalTimeMs / entry->data.totalTimeMs);
				+					else
				+						childEntry.data.pctOfParent = 0.0f;
				+
				+					entry->data.estimatedOverheadMs += childEntry.data.estimatedOverheadMs + mBasicSamplingOverhead;
				+				}
				+
				+				entry->data.totalSelfTimeMs = entry->data.totalTimeMs - totalChildTime;
				+
				+				if(entry->data.numCalls > 0)
				+					entry->data.avgSelfTimeMs = entry->data.totalSelfTimeMs / entry->data.numCalls;
				+				else
				+					entry->data.avgSelfTimeMs = 0.0;
				+
				+				entry->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
				+			}
				+		}
				+
				+		if(thread->rootPreciseBlock != nullptr)
				+		{
				+			// Same breadth-first flattening as above, for the precise (cycle based) blocks
				+			Vector<PreciseProfiledBlock*>::type flatHierarchy;
				+			Vector<CPUProfilerPreciseSamplingEntry*>::type flatResultHierarchy;
				+
				+			flatHierarchy.push_back(thread->rootPreciseBlock);
				+			flatResultHierarchy.push_back(&report.mPreciseSamplingRootEntry);
				+
				+			for(UINT32 idx = 0; idx < (UINT32)flatHierarchy.size(); idx++)
				+			{
				+				// Copy the pointers out, push_back below may reallocate the flat arrays
				+				PreciseProfiledBlock* curBlock = flatHierarchy[idx];
				+				CPUProfilerPreciseSamplingEntry* curEntry = flatResultHierarchy[idx];
				+
				+				// Size the child array once, before any element address is taken
				+				curEntry->childEntries.resize(curBlock->children.size());
				+
				+				UINT32 childIdx = 0;
				+				for(auto& child : curBlock->children)
				+				{
				+					flatHierarchy.push_back(child);
				+					flatResultHierarchy.push_back(&curEntry->childEntries[childIdx]);
				+
				+					childIdx++;
				+				}
				+			}
				+
				+			auto iter = flatHierarchy.rbegin();
				+			auto iterSample = flatResultHierarchy.rbegin();
				+
				+			for(; iter != flatHierarchy.rend(); ++iter, ++iterSample)
				+			{
				+				PreciseProfiledBlock* curBlock = *iter;
				+				CPUProfilerPreciseSamplingEntry* entry = *iterSample;
				+
				+				entry->data.name = curBlock->name;
				+
				+				entry->data.totalCycles = 0;
				+				entry->data.maxCycles = 0;
				+				for(auto& sample : curBlock->data.samples)
				+				{
				+					entry->data.totalCycles += sample.cycles;
				+					entry->data.maxCycles = std::max(entry->data.maxCycles, sample.cycles);
				+				}
				+
				+				entry->data.numCalls = (UINT32)curBlock->data.samples.size();
				+
				+				// Integer division, a block without samples must not divide by zero
				+				if(entry->data.numCalls > 0)
				+					entry->data.avgCycles = entry->data.totalCycles / entry->data.numCalls;
				+				else
				+					entry->data.avgCycles = 0;
				+
				+				// Children were already processed, so their totals are final
				+				UINT64 totalChildCycles = 0;
				+				for(auto& childEntry : entry->childEntries)
				+				{
				+					totalChildCycles += childEntry.data.totalCycles;
				+
				+					if(entry->data.totalCycles > 0)
				+						childEntry.data.pctOfParent = childEntry.data.totalCycles / (float)entry->data.totalCycles;
				+					else
				+						childEntry.data.pctOfParent = 0.0f;
				+
				+					entry->data.estimatedOverhead += childEntry.data.estimatedOverhead + mPreciseSamplingOverhead;
				+				}
				+
				+				entry->data.totalSelfCycles = entry->data.totalCycles - totalChildCycles;
				+
				+				if(entry->data.numCalls > 0)
				+					entry->data.avgSelfCycles = entry->data.totalSelfCycles / entry->data.numCalls;
				+				else
				+					entry->data.avgSelfCycles = 0;
				+
				+				entry->data.estimatedSelfOverhead = mPreciseTimerOverhead;
				+			}
				+		}
				+
				+		return report;
				+	}

			
 
				+

			
 
				+	/**
				+	 * @brief	Estimates the overhead of the profiler itself. Stores the smallest average measured
				+	 * 			over 20 tries for: a start/stop pair of the basic timer, a start/stop pair of the
				+	 * 			precise timer, a beginSample/endSample pair and a beginSamplePrecise/endSamplePrecise pair.
				+	 *
				+	 * @note	Calls beginThread/endThread and reset() on the calling thread as part of the measurement.
				+	 */
				+	void CPUProfiler::estimateTimerOverhead()
				+	{
				+		// Get an idea of how long timer calls and RDTSC takes
				+		const UINT32 reps = 1000, sampleReps = 100;
				+
				+		// Names that get sampled repeatedly, i.e. the block already exists after the first pass
				+		const UINT32 numAvgSamples = 10;
				+		const char* const avgSampleNames[numAvgSamples] =
				+		{
				+			"TestAvg1", "TestAvg2", "TestAvg3", "TestAvg4", "TestAvg5",
				+			"TestAvg6", "TestAvg7", "TestAvg8", "TestAvg9", "TestAvg10"
				+		};
				+
				+		// Number of samples that each get a unique name, i.e. a new block is created every time
				+		const UINT32 numUniqueSamples = sampleReps * 5;
				+		const UINT32 totalSamples = sampleReps * numAvgSamples + numUniqueSamples;
				+
				+		// All four values are minimums over multiple tries, so seed them with a large value
				+		mBasicTimerOverhead = 1000000.0;
				+		mPreciseTimerOverhead = 1000000;
				+		mBasicSamplingOverhead = 1000000.0;
				+		mPreciseSamplingOverhead = 1000000;
				+
				+		for (UINT32 tries = 0; tries < 20; tries++)
				+		{
				+			Timer timer;
				+			for (UINT32 i = 0; i < reps; i++)
				+			{
				+				timer.start();
				+				timer.stop();
				+			}
				+
				+			double avgTime = double(timer.time)/double(reps);
				+			if (avgTime < mBasicTimerOverhead)
				+				mBasicTimerOverhead = avgTime;
				+
				+			TimerPrecise timerPrecise;
				+			for (UINT32 i = 0; i < reps; i++)
				+			{
				+				timerPrecise.start();
				+				timerPrecise.stop();
				+			}
				+
				+			UINT64 avgCycles = timerPrecise.cycles/reps;
				+			if (avgCycles < mPreciseTimerOverhead)
				+				mPreciseTimerOverhead = avgCycles;
				+		}
				+
				+		for (UINT32 tries = 0; tries < 20; tries++)
				+		{
				+			Timer timer;
				+			timer.start();
				+			beginThread("Main");
				+
				+			// Two different cases that can affect performance, one where
				+			// sample already exists and other where new one needs to be created
				+			for (UINT32 i = 0; i < sampleReps; i++)
				+			{
				+				for (const char* sampleName : avgSampleNames)
				+				{
				+					beginSample(sampleName);
				+					endSample(sampleName);
				+				}
				+			}
				+
				+			for (UINT32 i = 0; i < numUniqueSamples; i++)
				+			{
				+				beginSample("Test#" + toString(i));
				+				endSample("Test#" + toString(i));
				+			}
				+
				+			endThread();
				+			timer.stop();
				+
				+			reset();
				+
				+			double avgTime = double(timer.time)/double(totalSamples);
				+			if (avgTime < mBasicSamplingOverhead)
				+				mBasicSamplingOverhead = avgTime;
				+
				+			TimerPrecise timerPrecise;
				+			timerPrecise.start();
				+			beginThread("Main");
				+
				+			// Two different cases that can affect performance, one where
				+			// sample already exists and other where new one needs to be created
				+			for (UINT32 i = 0; i < sampleReps; i++)
				+			{
				+				for (const char* sampleName : avgSampleNames)
				+				{
				+					beginSamplePrecise(sampleName);
				+					endSamplePrecise(sampleName);
				+				}
				+			}
				+
				+			for (UINT32 i = 0; i < numUniqueSamples; i++)
				+			{
				+				beginSamplePrecise("Test#" + toString(i));
				+				endSamplePrecise("Test#" + toString(i));
				+			}
				+
				+			endThread();
				+			timerPrecise.stop();
				+
				+			reset();
				+
				+			UINT64 avgCycles = timerPrecise.cycles/totalSamples;
				+			if (avgCycles < mPreciseSamplingOverhead)
				+				mPreciseSamplingOverhead = avgCycles;
				+		}
				+	}

			
 
				+

			
 
				+	/**
				+	 * @brief	Constructs an empty entry: all counters and timings are zero,
				+	 * 			and the fraction of parent time is 1.
				+	 */
				+	CPUProfilerBasicSamplingEntry::Data::Data()
				+		: numCalls(0)
				+		, avgTimeMs(0.0)
				+		, maxTimeMs(0.0)
				+		, totalTimeMs(0.0)
				+		, avgSelfTimeMs(0.0)
				+		, totalSelfTimeMs(0.0)
				+		, estimatedSelfOverheadMs(0.0)
				+		, estimatedOverheadMs(0.0)
				+		, pctOfParent(1.0f)
				+	{
				+	}

			
 
				+

			
 
				+	/**
				+	 * @brief	Constructs an empty entry: all counters and cycle counts are zero,
				+	 * 			and the fraction of parent cycles is 1.
				+	 */
				+	CPUProfilerPreciseSamplingEntry::Data::Data()
				+		: numCalls(0)
				+		, avgCycles(0)
				+		, maxCycles(0)
				+		, totalCycles(0)
				+		, avgSelfCycles(0)
				+		, totalSelfCycles(0)
				+		, estimatedSelfOverhead(0)
				+		, estimatedOverhead(0)
				+		, pctOfParent(1.0f)
				+	{
				+	}

			
 
				+}
			
--- a/CamelotUtility/Source/CmProfiler.cpp
+++ b/CamelotUtility/Source/CmProfiler.cpp
@@ -1,29 +0,0 @@
 
				-#include "CmProfiler.h"

			
 
				-

			
 
				-namespace CamelotFramework

			
 
				-{

			
 
				-	void Profiler::beginSample(const String& name)

			
 
				-	{

			
 
				-

			
 
				-	}

			
 
				-

			
 
				-	Profiler::Data Profiler::endSample(const String& name)

			
 
				-	{

			
 
				-		return Profiler::Data();

			
 
				-	}

			
 
				-

			
 
				-	void Profiler::beginSamplePrecise(const String& name)

			
 
				-	{

			
 
				-

			
 
				-	}

			
 
				-

			
 
				-	Profiler::PreciseData Profiler::endSamplePrecise(const String& name)

			
 
				-	{

			
 
				-		return Profiler::PreciseData();

			
 
				-	}

			
 
				-

			
 
				-	void Profiler::update()

			
 
				-	{

			
 
				-

			
 
				-	}

			
 
				-}
			
--- a/TODO.txt
+++ b/TODO.txt
@@ -8,6 +8,11 @@ LONGTERM TODO:
 
				   - When building a profiler have main Profiler class which just does measurements, then ProfilerOverlay for data display on-screen, ProfilerEditor for Unity-like Profiler, etc.

			
 
				   - For now just create a profiler with basic measuring stats (FPS, core & sim thread time, plus times for most important systems), and ProfilerOverlay to display them

			
 
				 

			
 
				+PROFILER:

			
 
				+ TODO: The profiler currently includes windows.h. This needs to be worked around, but it is deferred for now.

			
 
				+

			
 
				+

			
 
				+

			
 
				 I still re-create GUIWidget mesh every frame instead of just updating it.

			
 
				 

			
 
				 MAJOR ISSUE: writeSubresource/readSubresoure doesn't require a shared ptr to GpuResourceData which means it could get destroyed while still in command queue. Right now it only works because I block right after I call those methods, which ensures nothing is destroyed.

			
@@ -87,6 +92,8 @@ Medium priority:
 
				    - Doing setPixels_async in the texture doesn't make sure that the user doesn't actually modify the provided PixelData after that call.

			
 
				    - In general I need to rethink how to handle modifying resources with multithreading

			
 
				  - Closing down a window (any window) will shut down main rendering loop

			
 
				+ - GUIManager draw call merging:

			
 
				+    I merge two GUI elements if they don't overlap each other; however, I don't consider whether there is world geometry between them. This is another reason to move batching out of GUIManager.

			
 
				 

			
 
				 ----------------------------------------------------------------------------------------------

			
 
				 Low priority TODO