// BsProfilerCPU.h (13 KB)
  1. #pragma once
  2. #include "BsCorePrerequisites.h"
  3. #include "BsModule.h"
  4. #include "BsFrameAlloc.h"
  5. namespace BansheeEngine
  6. {
  7. class CPUProfilerReport;
  8. /**
  9. * @brief Provides various performance measuring methods.
  10. *
  11. * @note Thread safe. Matching begin*\end* calls
  12. * must belong to the same thread though.
  13. */
  14. class BS_CORE_EXPORT ProfilerCPU : public Module<ProfilerCPU>
  15. {
  16. /**
  17. * @brief Timer class responsible for tracking elapsed time.
  18. */
  19. class Timer
  20. {
  21. public:
  22. Timer();
  23. /**
  24. * @brief Sets the start time for the timer.
  25. */
  26. void start();
  27. /**
  28. * @brief Stops the timer and calculates the elapsed time
  29. * from start time to now.
  30. */
  31. void stop();
  32. /**
  33. * @brief Resets the elapsed time to zero.
  34. */
  35. void reset();
  36. double time;
  37. private:
  38. double startTime;
  39. /**
  40. * @brief Returns time elapsed since CPU was started in millseconds.
  41. */
  42. static inline double getCurrentTime();
  43. };
  44. /**
  45. * @brief Timer class responsible for tracking number of elapsed CPU cycles.
  46. */
  47. class TimerPrecise
  48. {
  49. public:
  50. TimerPrecise();
  51. /**
  52. * @brief Starts the counter marking the current number of executed
  53. * CPU cycles since CPU was started.
  54. */
  55. void start();
  56. /**
  57. * @brief Ends the counter and calculates the number of CPU cycles between
  58. * now and the start time.
  59. */
  60. void stop();
  61. /**
  62. * @brief Resets the cycle count to zero.
  63. */
  64. void reset();
  65. UINT64 cycles;
  66. private:
  67. UINT64 startCycles;
  68. /**
  69. * @brief Queries the CPU for the current number of CPU cycles executed since the
  70. * program was started.
  71. */
  72. static inline UINT64 getNumCycles();
  73. };
  74. /**
  75. * @brief Contains data about a single profiler sample (counting time in milliseconds).
  76. *
  77. * @note A sample is created whenever a named profile block is entered. e.g. if you have a function
  78. * you are profiling, and it gets called 10 times, there will be 10 samples.
  79. */
  80. struct ProfileSample
  81. {
  82. ProfileSample(double _time, UINT64 _numAllocs, UINT64 _numFrees)
  83. :time(_time), numAllocs(_numAllocs), numFrees(_numFrees)
  84. { }
  85. double time;
  86. UINT64 numAllocs;
  87. UINT64 numFrees;
  88. };
  89. /**
  90. * @brief Contains data about a single precise profiler sample (counting CPU cycles).
  91. *
  92. * @note A sample is created whenever a named profile block is entered. e.g. if you have a function
  93. * you are profiling, and it gets called 10 times, there will be 10 samples.
  94. */
  95. struct PreciseProfileSample
  96. {
  97. PreciseProfileSample(UINT64 _cycles, UINT64 _numAllocs, UINT64 _numFrees)
  98. :cycles(_cycles), numAllocs(_numAllocs), numFrees(_numFrees)
  99. { }
  100. UINT64 cycles;
  101. UINT64 numAllocs;
  102. UINT64 numFrees;
  103. };
  104. /**
  105. * @brief Contains basic (time based) profiling data contained in a profiling block.
  106. */
  107. struct ProfileData
  108. {
  109. ProfileData(FrameAlloc* alloc);
  110. /**
  111. * @brief Begins a new sample and records current sample state. Previous sample must
  112. * not be active.
  113. */
  114. void beginSample();
  115. /**
  116. * @brief Records current sample state and creates a new sample based on start and end state.
  117. * Adds the sample to the sample list.
  118. */
  119. void endSample();
  120. /**
  121. * @brief Removes the last added sample from the sample list and makes it active again. You must
  122. * call endSample when done as if you called beginSample.
  123. */
  124. void resumeLastSample();
  125. Vector<ProfileSample, StdFrameAlloc<ProfileSample>> samples;
  126. Timer timer;
  127. UINT64 memAllocs;
  128. UINT64 memFrees;
  129. };
  130. /**
  131. * @brief Contains precise (CPU cycle based) profiling data contained in a profiling block.
  132. */
  133. struct PreciseProfileData
  134. {
  135. PreciseProfileData(FrameAlloc* alloc);
  136. /**
  137. * @brief Begins a new sample and records current sample state. Previous sample must
  138. * not be active.
  139. */
  140. void beginSample();
  141. /**
  142. * @brief Records current sample state and creates a new sample based on start and end state.
  143. * Adds the sample to the sample list.
  144. */
  145. void endSample();
  146. /**
  147. * @brief Removes the last added sample from the sample list and makes it active again. You must
  148. * call endSample when done as if you called beginSample.
  149. */
  150. void resumeLastSample();
  151. Vector<PreciseProfileSample, StdFrameAlloc<ProfileSample>> samples;
  152. TimerPrecise timer;
  153. UINT64 memAllocs;
  154. UINT64 memFrees;
  155. };
  156. /**
  157. * @brief Contains all sampling information about a single named profiling block.
  158. * Each block has its own sampling information and optionally child blocks.
  159. */
  160. struct ProfiledBlock
  161. {
  162. ProfiledBlock(FrameAlloc* alloc);
  163. ~ProfiledBlock();
  164. /**
  165. * @brief Attempts to find a child block with the specified name. Returns
  166. * null if not found.
  167. */
  168. ProfiledBlock* findChild(const char* name) const;
  169. char* name;
  170. ProfileData basic;
  171. PreciseProfileData precise;
  172. Vector<ProfiledBlock*, StdFrameAlloc<ProfiledBlock*>> children;
  173. };
  174. /**
  175. * @brief CPU sampling type.
  176. */
  177. enum class ActiveSamplingType
  178. {
  179. Basic, /**< Sample using milliseconds. */
  180. Precise /**< Sample using CPU cycles. */
  181. };
  182. /**
  183. * @brief Contains data about the currently active profiling block.
  184. */
  185. struct ActiveBlock
  186. {
  187. ActiveBlock()
  188. :type(ActiveSamplingType::Basic), block(nullptr)
  189. { }
  190. ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block)
  191. :type(_type), block(_block)
  192. { }
  193. ActiveSamplingType type;
  194. ProfiledBlock* block;
  195. };
  196. /**
  197. * @brief Contains data about an active profiling thread.
  198. */
  199. struct ThreadInfo
  200. {
  201. ThreadInfo();
  202. /**
  203. * @brief Starts profiling on the thread. New primary profiling block
  204. * is created with the given name.
  205. */
  206. void begin(const char* _name);
  207. /**
  208. * @brief Ends profiling on the thread. You should end all samples before calling this,
  209. * but if you don't they will be terminated automatically.
  210. */
  211. void end();
  212. /**
  213. * @brief Deletes all internal profiling data and makes the object ready for another
  214. * iteration. Should be called after end in order to delete any existing data.
  215. */
  216. void reset();
  217. /**
  218. * @brief Gets the primary profiling block used by the thread.
  219. */
  220. ProfiledBlock* getBlock(const char* name);
  221. /**
  222. * @brief Deletes the provided block.
  223. */
  224. void releaseBlock(ProfiledBlock* block);
  225. static BS_THREADLOCAL ThreadInfo* activeThread;
  226. bool isActive;
  227. ProfiledBlock* rootBlock;
  228. FrameAlloc frameAlloc;
  229. ActiveBlock activeBlock;
  230. Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>* activeBlocks;
  231. };
  232. public:
  233. ProfilerCPU();
  234. ~ProfilerCPU();
  235. /**
  236. * @brief Registers a new thread we will be doing sampling in. This needs to be called before any beginSample*\endSample* calls
  237. * are made in that thread.
  238. *
  239. * @param name Name that will allow you to more easily identify the thread.
  240. */
  241. void beginThread(const char* name);
  242. /**
  243. * @brief Ends sampling for the current thread. No beginSample*\endSample* calls after this point.
  244. */
  245. void endThread();
  246. /**
  247. * @brief Begins sample measurement. Must be followed by endSample.
  248. *
  249. * @param name Unique name for the sample you can later use to find the sampling data.
  250. */
  251. void beginSample(const char* name);
  252. /**
  253. * @brief Ends sample measurement.
  254. *
  255. * @param name Unique name for the sample.
  256. *
  257. * @note Unique name is primarily needed to more easily identify mismatched
  258. * begin/end sample pairs. Otherwise the name in beginSample would be enough.
  259. */
  260. void endSample(const char* name);
  261. /**
  262. * @brief Begins sample measurement. Must be followed by endSample.
  263. *
  264. * @param name Unique name for the sample you can later use to find the sampling data.
  265. *
  266. * @note This method uses very precise CPU counters to determine variety of data not
  267. * provided by standard beginSample. However due to the way these counters work you should
  268. * not use this method for larger parts of code. It does not consider context switches so if the OS
  269. * decides to switch context between measurements you will get invalid data.
  270. */
  271. void beginSamplePrecise(const char* name);
  272. /**
  273. * @brief Ends precise sample measurement.
  274. *
  275. * @param name Unique name for the sample.
  276. *
  277. * @note Unique name is primarily needed to more easily identify mismatched
  278. * begin/end sample pairs. Otherwise the name in beginSamplePrecise would be enough.
  279. */
  280. void endSamplePrecise(const char* name);
  281. /**
  282. * @brief Clears all sampling data, and ends any unfinished sampling blocks.
  283. */
  284. void reset();
  285. /**
  286. * @brief Generates a report from all previously sampled data.
  287. *
  288. * @note Generating a report will stop all in-progress sampling. You should make sure
  289. * you call endSample* manually beforehand so this doesn't have to happen.
  290. */
  291. CPUProfilerReport generateReport();
  292. private:
  293. /**
  294. * @brief Calculates overhead that the timing and sampling methods themselves introduce
  295. * so we might get more accurate measurements when creating reports.
  296. */
  297. void estimateTimerOverhead();
  298. private:
  299. double mBasicTimerOverhead;
  300. UINT64 mPreciseTimerOverhead;
  301. double mBasicSamplingOverheadMs;
  302. double mPreciseSamplingOverheadMs;
  303. UINT64 mBasicSamplingOverheadCycles;
  304. UINT64 mPreciseSamplingOverheadCycles;
  305. ProfilerVector<ThreadInfo*> mActiveThreads;
  306. BS_MUTEX(mThreadSync);
  307. };
  308. /**
  309. * @brief Profiling entry containing information about a single CPU profiling block
  310. * containing timing information.
  311. */
  312. struct BS_CORE_EXPORT CPUProfilerBasicSamplingEntry
  313. {
  314. struct BS_CORE_EXPORT Data
  315. {
  316. Data();
  317. String name; /**< Name of the profiling block. */
  318. UINT32 numCalls; /**< Number of times the block was entered. */
  319. UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
  320. UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
  321. double avgTimeMs; /**< Average time it took to execute the block, per call. In milliseconds. */
  322. double maxTimeMs; /**< Maximum time of a single call in the block. In milliseconds. */
  323. double totalTimeMs; /**< Total time the block took, across all calls. In milliseconds. */
  324. double avgSelfTimeMs; /**< Average time it took to execute the block, per call. Ignores time used by child blocks. In milliseconds. */
  325. double totalSelfTimeMs; /**< Total time the block took, across all calls. Ignores time used by child blocks. In milliseconds. */
  326. double estimatedSelfOverheadMs; /**< Estimated overhead of profiling methods, only for this exact block. In milliseconds. */
  327. double estimatedOverheadMs; /**< Estimated overhead of profiling methods for this block and all children. In milliseconds. */
  328. float pctOfParent; /**< Percent of parent block time this block took to execute. Ranging [0.0, 1.0]. */
  329. } data;
  330. ProfilerVector<CPUProfilerBasicSamplingEntry> childEntries;
  331. };
  332. /**
  333. * @brief Profiling entry containing information about a single CPU profiling block
  334. * containing CPU cycle count based information.
  335. */
  336. struct BS_CORE_EXPORT CPUProfilerPreciseSamplingEntry
  337. {
  338. struct BS_CORE_EXPORT Data
  339. {
  340. Data();
  341. String name; /**< Name of the profiling block. */
  342. UINT32 numCalls; /**< Number of times the block was entered. */
  343. UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
  344. UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
  345. UINT64 avgCycles; /**< Average number of cycles it took to execute the block, per call. */
  346. UINT64 maxCycles; /**< Maximum number of cycles of a single call in the block. */
  347. UINT64 totalCycles; /**< Total number of cycles across all calls in the block. */
  348. UINT64 avgSelfCycles; /**< Average number of cycles it took to execute the block, per call. Ignores cycles used by child blocks. */
  349. UINT64 totalSelfCycles; /**< Total number of cycles across all calls in the block. Ignores time used by child blocks. */
  350. UINT64 estimatedSelfOverhead; /**< Estimated overhead of profiling methods, only for this exact block. In cycles. */
  351. UINT64 estimatedOverhead; /**< Estimated overhead of profiling methods for this block and all children. In cycles. */
  352. float pctOfParent; /**< Percent of parent block cycles used by this block. Ranging [0.0, 1.0]. */
  353. } data;
  354. ProfilerVector<CPUProfilerPreciseSamplingEntry> childEntries;
  355. };
  356. /**
  357. * @brief CPU profiling report containing all profiling information for a single profiling session.
  358. */
  359. class BS_CORE_EXPORT CPUProfilerReport
  360. {
  361. public:
  362. CPUProfilerReport();
  363. /**
  364. * @brief Returns root entry for the basic (time based) sampling data. Root entry always contains the
  365. * profiling block associated with the entire thread.
  366. */
  367. const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }
  368. /**
  369. * @brief Returns root entry for the precise (CPU cycle based) sampling data. Root entry always contains the
  370. * profiling block associated with the entire thread.
  371. */
  372. const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }
  373. private:
  374. friend class ProfilerCPU;
  375. CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;
  376. CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;
  377. };
  378. /**
  379. * @brief Quick way to access the CPU profiler.
  380. */
  381. BS_CORE_EXPORT ProfilerCPU& gProfilerCPU();
  /**
   * @brief Shortcut for profiling a single function call.
   *
   * @param call Expression or statement to execute between the begin/end calls.
   * @param name Sample name passed to both beginSample and endSample.
   *
   * @note Expands to three separate statements, so wrap the invocation in
   *       braces when it is the body of an if/else or a loop.
   * @note The argument is passed through as-is. Token pasting with "##" next
   *       to a parenthesis does not form a valid preprocessing token and is
   *       rejected by conforming preprocessors.
   */
  #define PROFILE_CALL(call, name) \
    BansheeEngine::gProfilerCPU().beginSample(name); \
    call; \
    BansheeEngine::gProfilerCPU().endSample(name);
  389. }