CmCPUProfiler.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927
  1. #include "CmCPUProfiler.h"
  2. #include "CmDebug.h"
  3. #include "CmPlatform.h"
  4. namespace CamelotFramework
  5. {
  6. CPUProfiler::Timer::Timer()
  7. {
  8. time = 0.0f;
  9. }
/// Marks the start of a measured interval; the matching stop() accumulates from here.
void CPUProfiler::Timer::start()
{
	startTime = getCurrentTime();
}
/// Adds the interval since the matching start() to the accumulated total.
/// Note the timer accumulates — call reset() to clear between samples.
void CPUProfiler::Timer::stop()
{
	time += getCurrentTime() - startTime;
}
/// Discards all accumulated time.
void CPUProfiler::Timer::reset()
{
	time = 0.0f;
}
/// Current time in milliseconds from the platform's high-resolution timer.
inline double CPUProfiler::Timer::getCurrentTime()
{
	return Platform::queryPerformanceTimerMs();
}
  26. CPUProfiler::TimerPrecise::TimerPrecise()
  27. {
  28. cycles = 0;
  29. }
/// Records the current cycle count; the matching stop() accumulates from here.
void CPUProfiler::TimerPrecise::start()
{
	startCycles = getNumCycles();
}
/// Adds the cycles elapsed since the matching start() to the accumulated total.
void CPUProfiler::TimerPrecise::stop()
{
	cycles += getNumCycles() - startCycles;
}
/// Discards all accumulated cycles.
void CPUProfiler::TimerPrecise::reset()
{
	cycles = 0;
}
/// Reads the CPU's time-stamp counter (TSC) as a 64-bit cycle count.
inline UINT64 CPUProfiler::TimerPrecise::getNumCycles()
{
#if CM_COMPILER == CM_COMPILER_GNUC
	// CPUID is executed first as a serializing instruction so out-of-order
	// execution doesn't move work across the RDTSC read. CPUID clobbers
	// eax/ebx/ecx/edx, hence the clobber list.
	asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx" );
	UINT32 __a,__d;
	// RDTSC returns the low 32 bits in eax and the high 32 bits in edx.
	asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
	return ( UINT64(__a) | UINT64(__d) << 32 );
#else
	// MSVC path: same CPUID-serialize + RDTSC pair via compiler intrinsics.
	int a[4];
	int b = 0;
	__cpuid(a, b);
	return __rdtsc();
#endif
}
/// Starts a fresh millisecond-timed sample. The timer is reset first because
/// Timer accumulates across start/stop pairs.
void CPUProfiler::ProfileData::beginSample()
{
	timer.reset();
	timer.start();
}
/// Stops the timer and records the accumulated time as a new sample.
void CPUProfiler::ProfileData::endSample()
{
	timer.stop();
	samples.push_back(ProfileSample(timer.time));
}
  66. void CPUProfiler::ProfileData::resumeLastSample()
  67. {
  68. timer.start();
  69. samples.erase(samples.end() - 1);
  70. }
/// Starts a fresh cycle-counted sample. The timer is reset first because
/// TimerPrecise accumulates across start/stop pairs.
void CPUProfiler::PreciseProfileData::beginSample()
{
	timer.reset();
	timer.start();
}
/// Stops the cycle counter and records the accumulated cycles as a new sample.
void CPUProfiler::PreciseProfileData::endSample()
{
	timer.stop();
	samples.push_back(PreciseProfileSample(timer.cycles));
}
  81. void CPUProfiler::PreciseProfileData::resumeLastSample()
  82. {
  83. timer.start();
  84. samples.erase(samples.end() - 1);
  85. }
// Thread-local pointer to this thread's sampling state. Lazily created by
// CPUProfiler::beginThread; ownership is held by CPUProfiler::mActiveThreads.
CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;
/// Starts out inactive with no root block; both are set up on the first begin().
CPUProfiler::ThreadInfo::ThreadInfo()
	:isActive(false), rootBlock(nullptr)
{
}
  91. void CPUProfiler::ThreadInfo::begin(const String& _name)
  92. {
  93. if(isActive)
  94. {
  95. LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
  96. return;
  97. }
  98. if(rootBlock == nullptr)
  99. rootBlock = getBlock();
  100. activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
  101. activeBlocks.push(activeBlock);
  102. rootBlock->name = _name;
  103. rootBlock->basic.beginSample();
  104. isActive = true;
  105. }
  106. void CPUProfiler::ThreadInfo::end()
  107. {
  108. if(activeBlock.type == ActiveSamplingType::Basic)
  109. activeBlock.block->basic.endSample();
  110. else
  111. activeBlock.block->precise.endSample();
  112. activeBlocks.pop();
  113. if(!isActive)
  114. LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
  115. if(activeBlocks.size() > 0)
  116. {
  117. LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
  118. while(activeBlocks.size() > 0)
  119. {
  120. ActiveBlock& curBlock = activeBlocks.top();
  121. if(curBlock.type == ActiveSamplingType::Basic)
  122. curBlock.block->basic.endSample();
  123. else
  124. curBlock.block->precise.endSample();
  125. activeBlocks.pop();
  126. }
  127. }
  128. isActive = false;
  129. activeBlocks = Stack<ActiveBlock>::type();
  130. activeBlock = ActiveBlock();
  131. }
  132. void CPUProfiler::ThreadInfo::reset()
  133. {
  134. if(isActive)
  135. end();
  136. if(rootBlock != nullptr)
  137. releaseBlock(rootBlock);
  138. rootBlock = nullptr;
  139. }
/// Allocates a fresh ProfiledBlock. Counterpart of releaseBlock().
CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
{
	// TODO - Pool this, if possible using the memory allocator stuff
	// TODO - Also consider moving all samples in ThreadInfo, and also pool them (otherwise I can't pool ProfiledBlock since it will be variable size)
	return cm_new<ProfiledBlock>();
}
/// Frees a block obtained from getBlock(). Kept as a member so a future
/// pooling scheme (see getBlock TODOs) can slot in here.
void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block)
{
	cm_delete(block);
}
/// Members (name, sample data, children) are default-constructed.
CPUProfiler::ProfiledBlock::ProfiledBlock()
{ }
  152. CPUProfiler::ProfiledBlock::~ProfiledBlock()
  153. {
  154. ThreadInfo* thread = ThreadInfo::activeThread;
  155. for(auto& child : children)
  156. thread->releaseBlock(child);
  157. children.clear();
  158. }
  159. CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
  160. {
  161. for(auto& child : children)
  162. {
  163. if(child->name == name)
  164. return child;
  165. }
  166. return nullptr;
  167. }
/// Zeroes all overhead estimates, then immediately measures real values for them.
CPUProfiler::CPUProfiler()
	:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
	mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
{
	// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
	// and keep separate values per thread.
	estimateTimerOverhead();
}
/// Clears the calling thread's sampling data, then frees every ThreadInfo the
/// profiler ever handed out (the profiler owns them all).
CPUProfiler::~CPUProfiler()
{
	reset();

	CM_LOCK_MUTEX(mThreadSync);

	// NOTE(review): other threads' ThreadInfo::activeThread pointers are left
	// dangling after this — assumes the profiler outlives all profiled threads.
	for(auto& threadInfo : mActiveThreads)
		cm_delete(threadInfo);
}
  183. void CPUProfiler::beginThread(const String& name)
  184. {
  185. ThreadInfo* thread = ThreadInfo::activeThread;
  186. if(thread == nullptr)
  187. {
  188. ThreadInfo::activeThread = cm_new<ThreadInfo>();
  189. thread = ThreadInfo::activeThread;
  190. {
  191. CM_LOCK_MUTEX(mThreadSync);
  192. mActiveThreads.push_back(thread);
  193. }
  194. }
  195. thread->begin(name);
  196. }
/// Stops sampling on the calling thread.
void CPUProfiler::endThread()
{
	// No null check is done here on purpose, so end() can be called ASAP.
	// (Crashes loudly if beginThread was never called on this thread.)
	ThreadInfo::activeThread->end();
}
  202. void CPUProfiler::beginSample(const String& name)
  203. {
  204. ThreadInfo* thread = ThreadInfo::activeThread;
  205. if(thread == nullptr || !thread->isActive)
  206. beginThread("Unknown");
  207. ProfiledBlock* parent = thread->activeBlock.block;
  208. ProfiledBlock* block = nullptr;
  209. if(parent != nullptr)
  210. block = parent->findChild(name);
  211. if(block == nullptr)
  212. {
  213. block = thread->getBlock();
  214. block->name = name;
  215. if(parent != nullptr)
  216. parent->children.push_back(block);
  217. else
  218. thread->rootBlock->children.push_back(block);
  219. }
  220. thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
  221. thread->activeBlocks.push(thread->activeBlock);
  222. block->basic.beginSample();
  223. }
/// Closes the sample opened by the matching beginSample(name) and makes the
/// parent block active again.
void CPUProfiler::endSample(const String& name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
	// Debug-only sanity checks: mismatched begin/end pairs are reported and
	// the end is ignored (note this leaves the active-block stack unbalanced).
	if(block == nullptr)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
		return;
	}

	if(thread->activeBlock.type == ActiveSamplingType::Precise)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
		return;
	}

	if(block->name != name)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
		return;
	}
#endif

	// Record the sample, then pop back to the parent block.
	block->basic.endSample();
	thread->activeBlocks.pop();

	if(!thread->activeBlocks.empty())
		thread->activeBlock = thread->activeBlocks.top();
	else
		thread->activeBlock = ActiveBlock();
}
  252. void CPUProfiler::beginSamplePrecise(const String& name)
  253. {
  254. // Note: There is a (small) possibility a context switch will happen during this measurement in which case result will be skewed.
  255. // Increasing thread priority might help. This is generally only a problem with code that executes a long time (10-15+ ms - depending on OS quant length)
  256. ThreadInfo* thread = ThreadInfo::activeThread;
  257. if(thread == nullptr || !thread->isActive)
  258. beginThread("Unknown");
  259. ProfiledBlock* parent = thread->activeBlock.block;
  260. ProfiledBlock* block = nullptr;
  261. if(parent != nullptr)
  262. block = parent->findChild(name);
  263. if(block == nullptr)
  264. {
  265. block = thread->getBlock();
  266. block->name = name;
  267. if(parent != nullptr)
  268. parent->children.push_back(block);
  269. else
  270. thread->rootBlock->children.push_back(block);
  271. }
  272. thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
  273. thread->activeBlocks.push(thread->activeBlock);
  274. block->precise.beginSample();
  275. }
/// Closes the sample opened by the matching beginSamplePrecise(name) and makes
/// the parent block active again.
void CPUProfiler::endSamplePrecise(const String& name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
	// Debug-only sanity checks: mismatched begin/end pairs are reported and
	// the end is ignored (note this leaves the active-block stack unbalanced).
	if(block == nullptr)
	{
		LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
		return;
	}

	if(thread->activeBlock.type == ActiveSamplingType::Basic)
	{
		LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
		return;
	}

	if(block->name != name)
	{
		LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
		return;
	}
#endif

	// Record the sample, then pop back to the parent block.
	block->precise.endSample();
	thread->activeBlocks.pop();

	if(!thread->activeBlocks.empty())
		thread->activeBlock = thread->activeBlocks.top();
	else
		thread->activeBlock = ActiveBlock();
}
  304. void CPUProfiler::reset()
  305. {
  306. ThreadInfo* thread = ThreadInfo::activeThread;
  307. if(thread != nullptr)
  308. thread->reset();
  309. }
  310. CPUProfilerReport CPUProfiler::generateReport()
  311. {
  312. CPUProfilerReport report;
  313. ThreadInfo* thread = ThreadInfo::activeThread;
  314. if(thread == nullptr)
  315. return report;
  316. if(thread->isActive)
  317. thread->end();
  318. // We need to separate out basic and precise data and form two separate hierarchies
  319. if(thread->rootBlock == nullptr)
  320. return report;
  321. struct TempEntry
  322. {
  323. TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
  324. :parentBlock(_parentBlock), entryIdx(_entryIdx)
  325. { }
  326. ProfiledBlock* parentBlock;
  327. UINT32 entryIdx;
  328. Vector<UINT32>::type childIndexes;
  329. };
  330. Vector<CPUProfilerBasicSamplingEntry>::type basicEntries;
  331. Vector<CPUProfilerPreciseSamplingEntry>::type preciseEntries;
  332. // Fill up flatHierarchy array in a way so we always process children before parents
  333. Stack<UINT32>::type todo;
  334. Vector<TempEntry>::type flatHierarchy;
  335. UINT32 entryIdx = 0;
  336. todo.push(entryIdx);
  337. flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
  338. entryIdx++;
  339. while(!todo.empty())
  340. {
  341. UINT32 curDataIdx = todo.top();
  342. ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
  343. todo.pop();
  344. for(auto& child : curBlock->children)
  345. {
  346. flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);
  347. todo.push(entryIdx);
  348. flatHierarchy.push_back(TempEntry(child, entryIdx));
  349. entryIdx++;
  350. }
  351. }
  352. // Calculate sampling data for all entries
  353. basicEntries.resize(flatHierarchy.size());
  354. preciseEntries.resize(flatHierarchy.size());
  355. for(auto& iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
  356. {
  357. TempEntry& curData = *iter;
  358. ProfiledBlock* curBlock = curData.parentBlock;
  359. CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
  360. CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];
  361. // Calculate basic data
  362. entryBasic->data.name = curBlock->name;
  363. entryBasic->data.totalTimeMs = 0.0;
  364. entryBasic->data.maxTimeMs = 0.0;
  365. for(auto& sample : curBlock->basic.samples)
  366. {
  367. entryBasic->data.totalTimeMs += sample.time;
  368. entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
  369. }
  370. entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();
  371. if(entryBasic->data.numCalls > 0)
  372. entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;
  373. double totalChildTime = 0.0;
  374. for(auto& childIdx : curData.childIndexes)
  375. {
  376. CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
  377. totalChildTime += childEntry->data.totalTimeMs;
  378. childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);
  379. entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
  380. }
  381. entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
  382. entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;
  383. entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;
  384. if(entryBasic->data.numCalls > 0)
  385. entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;
  386. entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
  387. // Calculate precise data
  388. entryPrecise->data.name = curBlock->name;
  389. entryPrecise->data.totalCycles = 0;
  390. entryPrecise->data.maxCycles = 0;
  391. for(auto& sample : curBlock->precise.samples)
  392. {
  393. entryPrecise->data.totalCycles += sample.cycles;
  394. entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
  395. }
  396. entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();
  397. if(entryPrecise->data.numCalls > 0)
  398. entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;
  399. UINT64 totalChildCycles = 0;
  400. for(auto& childIdx : curData.childIndexes)
  401. {
  402. CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
  403. totalChildCycles += childEntry->data.totalCycles;
  404. childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;
  405. entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
  406. }
  407. entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
  408. entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;
  409. entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;
  410. if(entryPrecise->data.numCalls > 0)
  411. entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;
  412. entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
  413. }
  414. // Prune empty basic entries
  415. Stack<UINT32>::type finalBasicHierarchyTodo;
  416. Stack<UINT32>::type parentBasicEntryIndexes;
  417. Vector<TempEntry>::type newBasicEntries;
  418. finalBasicHierarchyTodo.push(0);
  419. entryIdx = 0;
  420. parentBasicEntryIndexes.push(entryIdx);
  421. newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
  422. entryIdx++;
  423. while(!finalBasicHierarchyTodo.empty())
  424. {
  425. UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
  426. parentBasicEntryIndexes.pop();
  427. UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
  428. TempEntry& curEntry = flatHierarchy[curEntryIdx];
  429. finalBasicHierarchyTodo.pop();
  430. for(auto& childIdx : curEntry.childIndexes)
  431. {
  432. finalBasicHierarchyTodo.push(childIdx);
  433. CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
  434. if(basicEntry.data.numCalls > 0)
  435. {
  436. newBasicEntries.push_back(TempEntry(nullptr, childIdx));
  437. newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
  438. parentBasicEntryIndexes.push(entryIdx);
  439. entryIdx++;
  440. }
  441. else
  442. parentBasicEntryIndexes.push(parentEntryIdx);
  443. }
  444. }
  445. if(newBasicEntries.size() > 0)
  446. {
  447. Vector<CPUProfilerBasicSamplingEntry*>::type finalBasicEntries;
  448. report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
  449. finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);
  450. UINT32 curEntryIdx = 0;
  451. for(auto& curEntry : newBasicEntries)
  452. {
  453. CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];
  454. basicEntry->childEntries.resize(curEntry.childIndexes.size());
  455. UINT32 idx = 0;
  456. for(auto& childIdx : curEntry.childIndexes)
  457. {
  458. TempEntry& childEntry = newBasicEntries[childIdx];
  459. basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];
  460. finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
  461. idx++;
  462. }
  463. curEntryIdx++;
  464. }
  465. }
  466. // Prune empty precise entries
  467. Stack<UINT32>::type finalPreciseHierarchyTodo;
  468. Stack<UINT32>::type parentPreciseEntryIndexes;
  469. Vector<TempEntry>::type newPreciseEntries;
  470. finalPreciseHierarchyTodo.push(0);
  471. entryIdx = 0;
  472. parentPreciseEntryIndexes.push(entryIdx);
  473. newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
  474. entryIdx++;
  475. while(!finalPreciseHierarchyTodo.empty())
  476. {
  477. UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
  478. parentPreciseEntryIndexes.pop();
  479. UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
  480. TempEntry& curEntry = flatHierarchy[curEntryIdx];
  481. finalPreciseHierarchyTodo.pop();
  482. for(auto& childIdx : curEntry.childIndexes)
  483. {
  484. finalPreciseHierarchyTodo.push(childIdx);
  485. CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
  486. if(preciseEntry.data.numCalls > 0)
  487. {
  488. newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
  489. newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
  490. parentPreciseEntryIndexes.push(entryIdx);
  491. entryIdx++;
  492. }
  493. else
  494. parentPreciseEntryIndexes.push(parentEntryIdx);
  495. }
  496. }
  497. if(newPreciseEntries.size() > 0)
  498. {
  499. Vector<CPUProfilerPreciseSamplingEntry*>::type finalPreciseEntries;
  500. report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
  501. finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);
  502. UINT32 curEntryIdx = 0;
  503. for(auto& curEntry : newPreciseEntries)
  504. {
  505. CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];
  506. preciseEntry->childEntries.resize(curEntry.childIndexes.size());
  507. UINT32 idx = 0;
  508. for(auto& childIdx : curEntry.childIndexes)
  509. {
  510. TempEntry& childEntry = newPreciseEntries[childIdx];
  511. preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];
  512. finalPreciseEntries.push_back(&preciseEntry->childEntries.back());
  513. idx++;
  514. }
  515. curEntryIdx++;
  516. }
  517. }
  518. return report;
  519. }
  520. void CPUProfiler::estimateTimerOverhead()
  521. {
  522. // Get an idea of how long timer calls and RDTSC takes
  523. const UINT32 reps = 1000, sampleReps = 100;
  524. mBasicTimerOverhead = 1000000.0;
  525. mPreciseTimerOverhead = 1000000;
  526. for (UINT32 tries = 0; tries < 20; tries++)
  527. {
  528. Timer timer;
  529. for (UINT32 i = 0; i < reps; i++)
  530. {
  531. timer.start();
  532. timer.stop();
  533. }
  534. double avgTime = double(timer.time)/double(reps);
  535. if (avgTime < mBasicTimerOverhead)
  536. mBasicTimerOverhead = avgTime;
  537. TimerPrecise timerPrecise;
  538. for (UINT32 i = 0; i < reps; i++)
  539. {
  540. timerPrecise.start();
  541. timerPrecise.stop();
  542. }
  543. UINT64 avgCycles = timerPrecise.cycles/reps;
  544. if (avgCycles < mPreciseTimerOverhead)
  545. mPreciseTimerOverhead = avgCycles;
  546. }
  547. mBasicSamplingOverheadMs = 1000000.0;
  548. mPreciseSamplingOverheadMs = 1000000.0;
  549. mBasicSamplingOverheadCycles = 1000000;
  550. mPreciseSamplingOverheadCycles = 1000000;
  551. for (UINT32 tries = 0; tries < 20; tries++)
  552. {
  553. /************************************************************************/
  554. /* AVERAGE TIME IN MS FOR BASIC SAMPLING */
  555. /************************************************************************/
  556. Timer timerA;
  557. timerA.start();
  558. beginThread("Main");
  559. // Two different cases that can effect performance, one where
  560. // sample already exists and other where new one needs to be created
  561. for (UINT32 i = 0; i < sampleReps; i++)
  562. {
  563. beginSample("TestAvg1");
  564. endSample("TestAvg1");
  565. beginSample("TestAvg2");
  566. endSample("TestAvg2");
  567. beginSample("TestAvg3");
  568. endSample("TestAvg3");
  569. beginSample("TestAvg4");
  570. endSample("TestAvg4");
  571. beginSample("TestAvg5");
  572. endSample("TestAvg5");
  573. beginSample("TestAvg6");
  574. endSample("TestAvg6");
  575. beginSample("TestAvg7");
  576. endSample("TestAvg7");
  577. beginSample("TestAvg8");
  578. endSample("TestAvg8");
  579. beginSample("TestAvg9");
  580. endSample("TestAvg9");
  581. beginSample("TestAvg10");
  582. endSample("TestAvg10");
  583. }
  584. for (UINT32 i = 0; i < sampleReps * 5; i++)
  585. {
  586. beginSample("TestAvg#" + toString(i));
  587. endSample("TestAvg#" + toString(i));
  588. }
  589. endThread();
  590. timerA.stop();
  591. reset();
  592. double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
  593. if (avgTimeBasic < mBasicSamplingOverheadMs)
  594. mBasicSamplingOverheadMs = avgTimeBasic;
  595. /************************************************************************/
  596. /* AVERAGE CYCLES FOR BASIC SAMPLING */
  597. /************************************************************************/
  598. TimerPrecise timerPreciseA;
  599. timerPreciseA.start();
  600. beginThread("Main");
  601. // Two different cases that can effect performance, one where
  602. // sample already exists and other where new one needs to be created
  603. for (UINT32 i = 0; i < sampleReps; i++)
  604. {
  605. beginSample("TestAvg1");
  606. endSample("TestAvg1");
  607. beginSample("TestAvg2");
  608. endSample("TestAvg2");
  609. beginSample("TestAvg3");
  610. endSample("TestAvg3");
  611. beginSample("TestAvg4");
  612. endSample("TestAvg4");
  613. beginSample("TestAvg5");
  614. endSample("TestAvg5");
  615. beginSample("TestAvg6");
  616. endSample("TestAvg6");
  617. beginSample("TestAvg7");
  618. endSample("TestAvg7");
  619. beginSample("TestAvg8");
  620. endSample("TestAvg8");
  621. beginSample("TestAvg9");
  622. endSample("TestAvg9");
  623. beginSample("TestAvg10");
  624. endSample("TestAvg10");
  625. }
  626. for (UINT32 i = 0; i < sampleReps * 5; i++)
  627. {
  628. beginSample("TestAvg#" + toString(i));
  629. endSample("TestAvg#" + toString(i));
  630. }
  631. endThread();
  632. timerPreciseA.stop();
  633. reset();
  634. UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
  635. if (avgCyclesBasic < mBasicSamplingOverheadCycles)
  636. mBasicSamplingOverheadCycles = avgCyclesBasic;
  637. /************************************************************************/
  638. /* AVERAGE TIME IN MS FOR PRECISE SAMPLING */
  639. /************************************************************************/
  640. Timer timerB;
  641. timerB.start();
  642. beginThread("Main");
  643. // Two different cases that can effect performance, one where
  644. // sample already exists and other where new one needs to be created
  645. for (UINT32 i = 0; i < sampleReps; i++)
  646. {
  647. beginSamplePrecise("TestAvg1");
  648. endSamplePrecise("TestAvg1");
  649. beginSamplePrecise("TestAvg2");
  650. endSamplePrecise("TestAvg2");
  651. beginSamplePrecise("TestAvg3");
  652. endSamplePrecise("TestAvg3");
  653. beginSamplePrecise("TestAvg4");
  654. endSamplePrecise("TestAvg4");
  655. beginSamplePrecise("TestAvg5");
  656. endSamplePrecise("TestAvg5");
  657. beginSamplePrecise("TestAvg6");
  658. endSamplePrecise("TestAvg6");
  659. beginSamplePrecise("TestAvg7");
  660. endSamplePrecise("TestAvg7");
  661. beginSamplePrecise("TestAvg8");
  662. endSamplePrecise("TestAvg8");
  663. beginSamplePrecise("TestAvg9");
  664. endSamplePrecise("TestAvg9");
  665. beginSamplePrecise("TestAvg10");
  666. endSamplePrecise("TestAvg10");
  667. }
  668. for (UINT32 i = 0; i < sampleReps * 5; i++)
  669. {
  670. beginSamplePrecise("TestAvg#" + toString(i));
  671. endSamplePrecise("TestAvg#" + toString(i));
  672. }
  673. endThread();
  674. timerB.stop();
  675. reset();
  676. double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
  677. if (avgTimesPrecise < mPreciseSamplingOverheadMs)
  678. mPreciseSamplingOverheadMs = avgTimesPrecise;
  679. /************************************************************************/
  680. /* AVERAGE CYCLES FOR PRECISE SAMPLING */
  681. /************************************************************************/
  682. TimerPrecise timerPreciseB;
  683. timerPreciseB.start();
  684. beginThread("Main");
  685. // Two different cases that can effect performance, one where
  686. // sample already exists and other where new one needs to be created
  687. for (UINT32 i = 0; i < sampleReps; i++)
  688. {
  689. beginSamplePrecise("TestAvg1");
  690. endSamplePrecise("TestAvg1");
  691. beginSamplePrecise("TestAvg2");
  692. endSamplePrecise("TestAvg2");
  693. beginSamplePrecise("TestAvg3");
  694. endSamplePrecise("TestAvg3");
  695. beginSamplePrecise("TestAvg4");
  696. endSamplePrecise("TestAvg4");
  697. beginSamplePrecise("TestAvg5");
  698. endSamplePrecise("TestAvg5");
  699. beginSamplePrecise("TestAvg6");
  700. endSamplePrecise("TestAvg6");
  701. beginSamplePrecise("TestAvg7");
  702. endSamplePrecise("TestAvg7");
  703. beginSamplePrecise("TestAvg8");
  704. endSamplePrecise("TestAvg8");
  705. beginSamplePrecise("TestAvg9");
  706. endSamplePrecise("TestAvg9");
  707. beginSamplePrecise("TestAvg10");
  708. endSamplePrecise("TestAvg10");
  709. }
  710. for (UINT32 i = 0; i < sampleReps * 5; i++)
  711. {
  712. beginSamplePrecise("TestAvg#" + toString(i));
  713. endSamplePrecise("TestAvg#" + toString(i));
  714. }
  715. endThread();
  716. timerPreciseB.stop();
  717. reset();
  718. UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
  719. if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
  720. mPreciseSamplingOverheadCycles = avgCyclesPrecise;
  721. }
  722. }
/// All statistics start at zero; pctOfParent defaults to 1 (a block with no
/// parent accounts for 100% of itself).
CPUProfilerBasicSamplingEntry::Data::Data()
	:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
	avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
	estimatedOverheadMs(0.0), pctOfParent(1.0f)
{ }
/// All statistics start at zero; pctOfParent defaults to 1 (a block with no
/// parent accounts for 100% of itself).
CPUProfilerPreciseSamplingEntry::Data::Data()
	:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
	avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
	estimatedOverhead(0), pctOfParent(1.0f)
{ }
/// Both sampling root entries are default-initialized, i.e. an empty report.
CPUProfilerReport::CPUProfilerReport()
{
}
  736. }