CmCPUProfiler.cpp

#include "CmCPUProfiler.h"
#include "CmDebug.h"
#include "CmPlatform.h"

namespace CamelotFramework
{
	CPUProfiler::Timer::Timer()
	{
		time = 0.0f;
	}

	void CPUProfiler::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void CPUProfiler::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void CPUProfiler::Timer::reset()
	{
		time = 0.0f;
	}

	inline double CPUProfiler::Timer::getCurrentTime()
	{
		return Platform::queryPerformanceTimerMs();
	}

	CPUProfiler::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void CPUProfiler::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void CPUProfiler::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void CPUProfiler::TimerPrecise::reset()
	{
		cycles = 0;
	}

	inline UINT64 CPUProfiler::TimerPrecise::getNumCycles()
	{
#if CM_COMPILER == CM_COMPILER_GNUC
		asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx");
		UINT32 __a, __d;
		asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
		return (UINT64(__a) | UINT64(__d) << 32);
#else
		int a[4];
		int b = 0;
		__cpuid(a, b);
		return __rdtsc();
#endif
	}
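
	// Note: The CPUID instruction above acts as a serializing barrier, preventing the CPU
	// from reordering RDTSC ahead of the code being measured. Keep in mind the timestamp
	// counter can differ between cores and may be affected by frequency scaling on older
	// CPUs, so cycle counts are best treated as relative measurements on a single thread.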
	void CPUProfiler::ProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void CPUProfiler::ProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
	}

	void CPUProfiler::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	void CPUProfiler::PreciseProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void CPUProfiler::PreciseProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
	}

	void CPUProfiler::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}
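
	// Note: Each sample records a time (or cycle) delta along with the number of memory
	// allocations and frees that occurred while it was active, computed as deltas of the
	// global MemoryCounter values. resumeLastSample() discards the sample pushed by the
	// most recent endSample() and restarts the timer without resetting it, so the already
	// accumulated time carries over into the replacement sample.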
	CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;

	CPUProfiler::ThreadInfo::ThreadInfo()
		:isActive(false), rootBlock(nullptr)
	{
	}

	void CPUProfiler::ThreadInfo::begin(const String& _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock();

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		activeBlocks.push(activeBlock);

		rootBlock->name = _name;
		rootBlock->basic.beginSample();
		isActive = true;
	}

	void CPUProfiler::ThreadInfo::end()
	{
		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks.pop();

		if(!isActive)
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

		if(activeBlocks.size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			while(activeBlocks.size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks.top();

				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks.pop();
			}
		}

		isActive = false;
		activeBlocks = Stack<ActiveBlock>::type();
		activeBlock = ActiveBlock();
	}

	void CPUProfiler::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;
	}

	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
	{
		// TODO - Pool this, if possible using the memory allocator stuff
		// TODO - Also consider moving all samples into ThreadInfo and pooling them as well
		// (otherwise ProfiledBlock can't be pooled, since it is variable size)
		return cm_new<ProfiledBlock>();
	}

	void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block)
	{
		cm_delete(block);
	}
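
	// Note: One way to address the pooling TODO above would be a thread-local free list,
	// roughly along these lines (illustrative sketch only, not part of the build;
	// mBlockPool is a hypothetical ThreadInfo member):
	//
	//   Stack<ProfiledBlock*>::type mBlockPool;
	//
	//   ProfiledBlock* getBlock()
	//   {
	//       if(!mBlockPool.empty())
	//       {
	//           ProfiledBlock* block = mBlockPool.top(); // reuse a previously released block
	//           mBlockPool.pop();
	//           return block;
	//       }
	//       return cm_new<ProfiledBlock>(); // pool empty, fall back to a fresh allocation
	//   }
	//
	// releaseBlock() would then clear the block and push it back instead of deleting it.
	// Samples would still need separate pooling, since blocks remain variable size.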
	CPUProfiler::ProfiledBlock::ProfiledBlock()
	{ }

	CPUProfiler::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}

	CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
	{
		for(auto& child : children)
		{
			if(child->name == name)
				return child;
		}

		return nullptr;
	}

	CPUProfiler::CPUProfiler()
		:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
		mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate
		// it each time beginThread is called, and keep separate values per thread.
		estimateTimerOverhead();
	}

	CPUProfiler::~CPUProfiler()
	{
		reset();

		CM_LOCK_MUTEX(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			cm_delete(threadInfo);
	}

	void CPUProfiler::beginThread(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = cm_new<ThreadInfo>();
			thread = ThreadInfo::activeThread;

			{
				CM_LOCK_MUTEX(mThreadSync);
				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}
	void CPUProfiler::endThread()
	{
		// The null check is deliberately omitted here so the sample can be ended as soon as possible
		ThreadInfo::activeThread->end();
	}
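
	// Example usage (illustrative; assumes some accessor such as gProfiler() exists for
	// reaching the active CPUProfiler instance, which is not defined in this file):
	//
	//   gProfiler().beginThread("Sim");
	//   gProfiler().beginSample("Physics");
	//   runPhysics();                        // hypothetical workload
	//   gProfiler().endSample("Physics");
	//   gProfiler().endThread();
	//
	// Samples may nest; each endSample must match the name of the innermost open beginSample.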
	void CPUProfiler::beginSample(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // beginThread may have just created the ThreadInfo
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock();
			block->name = name;

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks.push(thread->activeBlock);

		block->basic.beginSample();
	}

	void CPUProfiler::endSample(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting CPUProfiler::endSamplePrecise.");
			return;
		}

		if(block->name != name)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();

		thread->activeBlocks.pop();

		if(!thread->activeBlocks.empty())
			thread->activeBlock = thread->activeBlocks.top();
		else
			thread->activeBlock = ActiveBlock();
	}
	void CPUProfiler::beginSamplePrecise(const String& name)
	{
		// Note: There is a (small) possibility a context switch will happen during this
		// measurement, in which case the result will be skewed. Increasing thread priority
		// might help. This is generally only a problem with code that executes for a long
		// time (10-15+ ms, depending on the OS quantum length).

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // beginThread may have just created the ThreadInfo
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock();
			block->name = name;

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks.push(thread->activeBlock);

		block->precise.beginSample();
	}

	void CPUProfiler::endSamplePrecise(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting CPUProfiler::endSample.");
			return;
		}

		if(block->name != name)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();

		thread->activeBlocks.pop();

		if(!thread->activeBlocks.empty())
			thread->activeBlock = thread->activeBlocks.top();
		else
			thread->activeBlock = ActiveBlock();
	}
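
	// Note: The mismatch checks in endSample/endSamplePrecise exist only in debug builds
	// (CM_DEBUG_MODE). In release builds an unbalanced or misnamed begin/end pair silently
	// corrupts the sampling hierarchy, so calls should always be kept strictly paired.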
	void CPUProfiler::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}

	CPUProfilerReport CPUProfiler::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies
		if(thread->rootBlock == nullptr)
			return report;

		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			Vector<UINT32>::type childIndexes;
		};

		Vector<CPUProfilerBasicSamplingEntry>::type basicEntries;
		Vector<CPUProfilerPreciseSamplingEntry>::type preciseEntries;

		// Fill up the flatHierarchy array in an order such that iterating it in reverse
		// always processes children before their parents
		Stack<UINT32>::type todo;
		Vector<TempEntry>::type flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
		entryIdx++;

		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));
				entryIdx++;
			}
		}
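
		// Note: Since every child is appended after its parent, iterating flatHierarchy in
		// reverse guarantees each entry's children have already been processed by the time
		// the entry itself is, which the loop below relies on when summing child times.
		// E.g. for Root -> { A -> { C }, B } the array order is [Root, A, B, C], and the
		// reverse pass visits C, B, A, Root.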
		// Calculate sampling data for all entries
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = curBlock->name;
			entryBasic->data.memAllocs = 0;
			entryBasic->data.memFrees = 0;
			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;

			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
				entryBasic->data.memAllocs += sample.numAllocs;
				entryBasic->data.memFrees += sample.numFrees;
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;

			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = curBlock->name;
			entryPrecise->data.memAllocs = 0;
			entryPrecise->data.memFrees = 0;
			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;

			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
				entryPrecise->data.memAllocs += sample.numAllocs;
				entryPrecise->data.memFrees += sample.numFrees;
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;

			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}
		// Prune empty basic entries
		Stack<UINT32>::type finalBasicHierarchyTodo;
		Stack<UINT32>::type parentBasicEntryIndexes;
		Vector<TempEntry>::type newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];

				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);
					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}
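
		// Note: When a child entry has no samples it is dropped from the pruned hierarchy,
		// and its own children are attached to the nearest sampled ancestor instead (this
		// is what pushing parentEntryIdx back for the skipped child achieves above).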
		if(newBasicEntries.size() > 0)
		{
			Vector<CPUProfilerBasicSamplingEntry*>::type finalBasicEntries;

			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

			finalBasicHierarchyTodo.push(0);

			while(!finalBasicHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
				finalBasicHierarchyTodo.pop();

				TempEntry& curEntry = newBasicEntries[curEntryIdx];
				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

				basicEntry->childEntries.resize(curEntry.childIndexes.size());

				UINT32 idx = 0;
				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newBasicEntries[childIdx];
					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
					finalBasicHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		// Prune empty precise entries
		Stack<UINT32>::type finalPreciseHierarchyTodo;
		Stack<UINT32>::type parentPreciseEntryIndexes;
		Vector<TempEntry>::type newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];

				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);
					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newPreciseEntries.size() > 0)
		{
			Vector<CPUProfilerPreciseSamplingEntry*>::type finalPreciseEntries;

			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

			finalPreciseHierarchyTodo.push(0);

			while(!finalPreciseHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
				finalPreciseHierarchyTodo.pop();

				TempEntry& curEntry = newPreciseEntries[curEntryIdx];
				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

				preciseEntry->childEntries.resize(curEntry.childIndexes.size());

				UINT32 idx = 0;
				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newPreciseEntries[childIdx];
					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

					// Take the address of the element just filled, mirroring the basic branch
					// above (the original pushed childEntries.back(), which always pointed at
					// the last element regardless of idx)
					finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
					finalPreciseHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		return report;
	}
	void CPUProfiler::estimateTimerOverhead()
	{
		// Get an idea of how long timer calls and RDTSC take
		const UINT32 reps = 1000, sampleReps = 100;

		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;

		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time) / double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles / reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}
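
		// Note: The overhead fields are initialized to large sentinel values and the
		// minimum over 20 tries is kept, so outliers caused by scheduling or cache
		// effects inflate a single try rather than the final estimate.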
		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;

		for (UINT32 tries = 0; tries < 20; tries++)
		{
			/************************************************************************/
			/*                AVERAGE TIME IN MS FOR BASIC SAMPLING                 */
			/************************************************************************/

			Timer timerA;
			timerA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample("TestAvg#" + toString(i));
				endSample("TestAvg#" + toString(i));
			}

			endThread();

			timerA.stop();
			reset();

			double avgTimeBasic = double(timerA.time) / double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;

			/************************************************************************/
			/*                  AVERAGE CYCLES FOR BASIC SAMPLING                   */
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample("TestAvg#" + toString(i));
				endSample("TestAvg#" + toString(i));
			}

			endThread();

			timerPreciseA.stop();
			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles / (sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;

			/************************************************************************/
			/*               AVERAGE TIME IN MS FOR PRECISE SAMPLING                */
			/************************************************************************/

			Timer timerB;
			timerB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise("TestAvg#" + toString(i));
				endSamplePrecise("TestAvg#" + toString(i));
			}

			endThread();

			timerB.stop();
			reset();

			double avgTimesPrecise = timerB.time / (sampleReps * 10 + sampleReps * 5);
			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimesPrecise;

			/************************************************************************/
			/*                 AVERAGE CYCLES FOR PRECISE SAMPLING                  */
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise("TestAvg#" + toString(i));
				endSamplePrecise("TestAvg#" + toString(i));
			}

			endThread();

			timerPreciseB.stop();
			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles / (sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}
	CPUProfilerBasicSamplingEntry::Data::Data()
		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
		estimatedOverheadMs(0.0), pctOfParent(1.0f)
	{ }

	CPUProfilerPreciseSamplingEntry::Data::Data()
		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
		estimatedOverhead(0), pctOfParent(1.0f)
	{ }

	CPUProfilerReport::CPUProfilerReport()
	{
	}
}