// BsProfilerCPU.h
  1. //********************************** Banshee Engine (www.banshee3d.com) **************************************************//
  2. //**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
  3. #pragma once
  4. #include "BsCorePrerequisites.h"
  5. #include "BsModule.h"
  6. #include "BsFrameAlloc.h"
  7. namespace BansheeEngine
  8. {
  9. /** @addtogroup Profiling
  10. * @{
  11. */
  12. class CPUProfilerReport;
  13. /**
  14. * Provides various performance measuring methods.
  15. *
  16. * @note Thread safe. Matching begin*\end* calls must belong to the same thread though.
  17. */
  18. class BS_CORE_EXPORT ProfilerCPU : public Module<ProfilerCPU>
  19. {
  20. /** Timer class responsible for tracking elapsed time. */
  21. class Timer
  22. {
  23. public:
  24. Timer();
  25. /** Sets the start time for the timer. */
  26. void start();
  27. /** Stops the timer and calculates the elapsed time from start time to now. */
  28. void stop();
  29. /** Resets the elapsed time to zero. */
  30. void reset();
  31. double time;
  32. private:
  33. double startTime;
  34. /** Returns time elapsed since CPU was started in millseconds. */
  35. static inline double getCurrentTime();
  36. };
  37. /** Timer class responsible for tracking number of elapsed CPU cycles. */
  38. class TimerPrecise
  39. {
  40. public:
  41. TimerPrecise();
  42. /** Starts the counter marking the current number of executed CPU cycles since CPU was started. */
  43. void start();
  44. /** Ends the counter and calculates the number of CPU cycles between now and the start time. */
  45. void stop();
  46. /** Resets the cycle count to zero. */
  47. void reset();
  48. UINT64 cycles;
  49. private:
  50. UINT64 startCycles;
  51. /** Queries the CPU for the current number of CPU cycles executed since the program was started. */
  52. static inline UINT64 getNumCycles();
  53. };
  54. /**
  55. * Contains data about a single profiler sample (counting time in milliseconds).
  56. *
  57. * @note
  58. * A sample is created whenever a named profile block is entered. For example if you have a function you are
  59. * profiling, and it gets called 10 times, there will be 10 samples.
  60. */
  61. struct ProfileSample
  62. {
  63. ProfileSample(double _time, UINT64 _numAllocs, UINT64 _numFrees)
  64. :time(_time), numAllocs(_numAllocs), numFrees(_numFrees)
  65. { }
  66. double time;
  67. UINT64 numAllocs;
  68. UINT64 numFrees;
  69. };
  70. /**
  71. * Contains data about a single precise profiler sample (counting CPU cycles).
  72. *
  73. * @note
  74. * A sample is created whenever a named profile block is entered. For example if you have a function you are
  75. * profiling, and it gets called 10 times, there will be 10 samples.
  76. */
  77. struct PreciseProfileSample
  78. {
  79. PreciseProfileSample(UINT64 _cycles, UINT64 _numAllocs, UINT64 _numFrees)
  80. :cycles(_cycles), numAllocs(_numAllocs), numFrees(_numFrees)
  81. { }
  82. UINT64 cycles;
  83. UINT64 numAllocs;
  84. UINT64 numFrees;
  85. };
  86. /** Contains basic (time based) profiling data contained in a profiling block. */
  87. struct ProfileData
  88. {
  89. ProfileData(FrameAlloc* alloc);
  90. /** Begins a new sample and records current sample state. Previous sample must not be active. */
  91. void beginSample();
  92. /**
  93. * Records current sample state and creates a new sample based on start and end state. Adds the sample to the
  94. * sample list.
  95. */
  96. void endSample();
  97. /**
  98. * Removes the last added sample from the sample list and makes it active again. You must call endSample()
  99. * when done as if you called beginSample().
  100. */
  101. void resumeLastSample();
  102. Vector<ProfileSample, StdFrameAlloc<ProfileSample>> samples;
  103. Timer timer;
  104. UINT64 memAllocs;
  105. UINT64 memFrees;
  106. };
  107. /** Contains precise (CPU cycle based) profiling data contained in a profiling block. */
  108. struct PreciseProfileData
  109. {
  110. PreciseProfileData(FrameAlloc* alloc);
  111. /** Begins a new sample and records current sample state. Previous sample must not be active. */
  112. void beginSample();
  113. /**
  114. * Records current sample state and creates a new sample based on start and end state. Adds the sample to the
  115. * sample list.
  116. */
  117. void endSample();
  118. /**
  119. * Removes the last added sample from the sample list and makes it active again. You must call endSample()
  120. * when done as if you called beginSample.
  121. */
  122. void resumeLastSample();
  123. Vector<PreciseProfileSample, StdFrameAlloc<ProfileSample>> samples;
  124. TimerPrecise timer;
  125. UINT64 memAllocs;
  126. UINT64 memFrees;
  127. };
  128. /**
  129. * Contains all sampling information about a single named profiling block. Each block has its own sampling
  130. * information and optionally child blocks.
  131. */
  132. struct ProfiledBlock
  133. {
  134. ProfiledBlock(FrameAlloc* alloc);
  135. ~ProfiledBlock();
  136. /** Attempts to find a child block with the specified name. Returns null if not found. */
  137. ProfiledBlock* findChild(const char* name) const;
  138. char* name;
  139. ProfileData basic;
  140. PreciseProfileData precise;
  141. Vector<ProfiledBlock*, StdFrameAlloc<ProfiledBlock*>> children;
  142. };
  143. /** CPU sampling type. */
  144. enum class ActiveSamplingType
  145. {
  146. Basic, /**< Sample using milliseconds. */
  147. Precise /**< Sample using CPU cycles. */
  148. };
  149. /** Contains data about the currently active profiling block. */
  150. struct ActiveBlock
  151. {
  152. ActiveBlock()
  153. :type(ActiveSamplingType::Basic), block(nullptr)
  154. { }
  155. ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block)
  156. :type(_type), block(_block)
  157. { }
  158. ActiveSamplingType type;
  159. ProfiledBlock* block;
  160. };
  161. /** Contains data about an active profiling thread. */
  162. struct ThreadInfo
  163. {
  164. ThreadInfo();
  165. /**
  166. * Starts profiling on the thread. New primary profiling block is created with the given name.
  167. */
  168. void begin(const char* _name);
  169. /**
  170. * Ends profiling on the thread. You should end all samples before calling this, but if you don't they will be
  171. * terminated automatically.
  172. */
  173. void end();
  174. /**
  175. * Deletes all internal profiling data and makes the object ready for another iteration. Should be called
  176. * after end in order to delete any existing data.
  177. */
  178. void reset();
  179. /** Gets the primary profiling block used by the thread. */
  180. ProfiledBlock* getBlock(const char* name);
  181. /** Deletes the provided block. */
  182. void releaseBlock(ProfiledBlock* block);
  183. static BS_THREADLOCAL ThreadInfo* activeThread;
  184. bool isActive;
  185. ProfiledBlock* rootBlock;
  186. FrameAlloc frameAlloc;
  187. ActiveBlock activeBlock;
  188. Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>* activeBlocks;
  189. };
  190. public:
  191. ProfilerCPU();
  192. ~ProfilerCPU();
  193. /**
  194. * Registers a new thread we will be doing sampling in. This needs to be called before any beginSample*\endSample*
  195. * calls are made in that thread.
  196. *
  197. * @param[in] name Name that will allow you to more easily identify the thread.
  198. */
  199. void beginThread(const char* name);
  200. /** Ends sampling for the current thread. No beginSample*\endSample* calls after this point. */
  201. void endThread();
  202. /**
  203. * Begins sample measurement. Must be followed by endSample().
  204. *
  205. * @param[in] name Unique name for the sample you can later use to find the sampling data.
  206. */
  207. void beginSample(const char* name);
  208. /**
  209. * Ends sample measurement.
  210. *
  211. * @param[in] name Unique name for the sample.
  212. *
  213. * @note
  214. * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name in
  215. * beginSample() would be enough.
  216. */
  217. void endSample(const char* name);
  218. /**
  219. * Begins precise sample measurement. Must be followed by endSamplePrecise().
  220. *
  221. * @param[in] name Unique name for the sample you can later use to find the sampling data.
  222. *
  223. * @note
  224. * This method uses very precise CPU counters to determine variety of data not provided by standard beginSample().
  225. * However due to the way these counters work you should not use this method for larger parts of code. It does not
  226. * consider context switches so if the OS decides to switch context between measurements you will get invalid data.
  227. */
  228. void beginSamplePrecise(const char* name);
  229. /**
  230. * Ends precise sample measurement.
  231. *
  232. * @param[in] name Unique name for the sample.
  233. *
  234. * @note
  235. * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name
  236. * in beginSamplePrecise() would be enough.
  237. */
  238. void endSamplePrecise(const char* name);
  239. /** Clears all sampling data, and ends any unfinished sampling blocks. */
  240. void reset();
  241. /**
  242. * Generates a report from all previously sampled data.
  243. *
  244. * @note Generating a report will stop all in-progress sampling. You should make sure
  245. * you call endSample* manually beforehand so this doesn't have to happen.
  246. */
  247. CPUProfilerReport generateReport();
  248. private:
  249. /**
  250. * Calculates overhead that the timing and sampling methods themselves introduce so we might get more accurate
  251. * measurements when creating reports.
  252. */
  253. void estimateTimerOverhead();
  254. private:
  255. double mBasicTimerOverhead;
  256. UINT64 mPreciseTimerOverhead;
  257. double mBasicSamplingOverheadMs;
  258. double mPreciseSamplingOverheadMs;
  259. UINT64 mBasicSamplingOverheadCycles;
  260. UINT64 mPreciseSamplingOverheadCycles;
  261. ProfilerVector<ThreadInfo*> mActiveThreads;
  262. BS_MUTEX(mThreadSync);
  263. };
  264. /** Profiling entry containing information about a single CPU profiling block containing timing information. */
  265. struct BS_CORE_EXPORT CPUProfilerBasicSamplingEntry
  266. {
  267. struct BS_CORE_EXPORT Data
  268. {
  269. Data();
  270. String name; /**< Name of the profiling block. */
  271. UINT32 numCalls; /**< Number of times the block was entered. */
  272. UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
  273. UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
  274. double avgTimeMs; /**< Average time it took to execute the block, per call. In milliseconds. */
  275. double maxTimeMs; /**< Maximum time of a single call in the block. In milliseconds. */
  276. double totalTimeMs; /**< Total time the block took, across all calls. In milliseconds. */
  277. double avgSelfTimeMs; /**< Average time it took to execute the block, per call. Ignores time used by child blocks. In milliseconds. */
  278. double totalSelfTimeMs; /**< Total time the block took, across all calls. Ignores time used by child blocks. In milliseconds. */
  279. double estimatedSelfOverheadMs; /**< Estimated overhead of profiling methods, only for this exact block. In milliseconds. */
  280. double estimatedOverheadMs; /**< Estimated overhead of profiling methods for this block and all children. In milliseconds. */
  281. float pctOfParent; /**< Percent of parent block time this block took to execute. Ranging [0.0, 1.0]. */
  282. } data;
  283. ProfilerVector<CPUProfilerBasicSamplingEntry> childEntries;
  284. };
  285. /**
  286. * Profiling entry containing information about a single CPU profiling block containing CPU cycle count based
  287. * information.
  288. */
  289. struct BS_CORE_EXPORT CPUProfilerPreciseSamplingEntry
  290. {
  291. struct BS_CORE_EXPORT Data
  292. {
  293. Data();
  294. String name; /**< Name of the profiling block. */
  295. UINT32 numCalls; /**< Number of times the block was entered. */
  296. UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
  297. UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
  298. UINT64 avgCycles; /**< Average number of cycles it took to execute the block, per call. */
  299. UINT64 maxCycles; /**< Maximum number of cycles of a single call in the block. */
  300. UINT64 totalCycles; /**< Total number of cycles across all calls in the block. */
  301. UINT64 avgSelfCycles; /**< Average number of cycles it took to execute the block, per call. Ignores cycles used by child blocks. */
  302. UINT64 totalSelfCycles; /**< Total number of cycles across all calls in the block. Ignores time used by child blocks. */
  303. UINT64 estimatedSelfOverhead; /**< Estimated overhead of profiling methods, only for this exact block. In cycles. */
  304. UINT64 estimatedOverhead; /**< Estimated overhead of profiling methods for this block and all children. In cycles. */
  305. float pctOfParent; /**< Percent of parent block cycles used by this block. Ranging [0.0, 1.0]. */
  306. } data;
  307. ProfilerVector<CPUProfilerPreciseSamplingEntry> childEntries;
  308. };
  309. /** CPU profiling report containing all profiling information for a single profiling session. */
  310. class BS_CORE_EXPORT CPUProfilerReport
  311. {
  312. public:
  313. CPUProfilerReport();
  314. /**
  315. * Returns root entry for the basic (time based) sampling data. Root entry always contains the profiling block
  316. * associated with the entire thread.
  317. */
  318. const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }
  319. /**
  320. * Returns root entry for the precise (CPU cycle based) sampling data. Root entry always contains the profiling
  321. * block associated with the entire thread.
  322. */
  323. const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }
  324. private:
  325. friend class ProfilerCPU;
  326. CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;
  327. CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;
  328. };
/**
 * Easier way to access ProfilerCPU.
 *
 * @return  Reference to a ProfilerCPU. Presumably the active Module<ProfilerCPU> instance - confirm against the
 *          definition.
 */
BS_CORE_EXPORT ProfilerCPU& gProfilerCPU();
  331. /** Shortcut for profiling a single function call. */
  332. #define PROFILE_CALL(call, name) \
  333. BansheeEngine::gProfilerCPU().beginSample(##name##); \
  334. call; \
  335. BansheeEngine::gProfilerCPU().endSample(##name##);
  336. /** @} */
  337. }