CmCPUProfiler.cpp

#include "CmCPUProfiler.h"
#include "CmDebug.h"

#if CM_PLATFORM == CM_PLATFORM_WIN32
#include "windows.h"
#endif

namespace CamelotFramework
{
	CPUProfiler::Timer::Timer()
	{
		time = 0.0f;
	}

	void CPUProfiler::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void CPUProfiler::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void CPUProfiler::Timer::reset()
	{
		time = 0.0f;
	}

	inline double CPUProfiler::Timer::getCurrentTime()
	{
		// TODO: I should be calling Platform:: performance methods instead of doing it here.
		// The only problem is that Platform belongs to Core and not Utility.
#if CM_PLATFORM == CM_PLATFORM_WIN32
		LARGE_INTEGER counterValue;
		QueryPerformanceCounter(&counterValue);

		LARGE_INTEGER counterFrequency;
		QueryPerformanceFrequency(&counterFrequency);

		return (double)counterValue.QuadPart / (counterFrequency.QuadPart * 0.001);
#else
#error "CPUProfiler::Timer::getCurrentTime() is not implemented for this platform."
#endif
	}
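
	// Note: getCurrentTime() returns milliseconds. Dividing the raw counter by
	// (frequency * 0.001) turns counts-per-second into counts-per-millisecond; e.g.
	// with a 10 MHz counter, a raw value of 50000 maps to 50000 / (10000000 * 0.001) = 5 ms.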

	CPUProfiler::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void CPUProfiler::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void CPUProfiler::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void CPUProfiler::TimerPrecise::reset()
	{
		cycles = 0;
	}

	inline UINT64 CPUProfiler::TimerPrecise::getNumCycles()
	{
#if CM_COMPILER == CM_COMPILER_GNUC
		asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx");
		UINT32 __a, __d;
		asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
		return (UINT64(__a) | UINT64(__d) << 32);
#else
		int a[4];
		int b = 0;
		__cpuid(a, b);
		return __rdtsc();
#endif
	}
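
	// Note: the cpuid instruction above is issued purely as a serializing barrier, so
	// that rdtsc cannot be reordered ahead of preceding instructions. The value returned
	// is a raw timestamp-counter tick count, not wall-clock time; converting ticks to
	// time would require knowing the TSC frequency, which this profiler does not attempt.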

	void CPUProfiler::ProfileData::beginSample()
	{
		timer.reset();
		timer.start();
	}

	void CPUProfiler::ProfileData::endSample()
	{
		timer.stop();
		samples.push_back(ProfileSample(timer.time));
	}

	void CPUProfiler::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	void CPUProfiler::PreciseProfileData::beginSample()
	{
		timer.reset();
		timer.start();
	}

	void CPUProfiler::PreciseProfileData::endSample()
	{
		timer.stop();
		samples.push_back(PreciseProfileSample(timer.cycles));
	}

	void CPUProfiler::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}
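
	// Note: resumeLastSample() works because start() does not clear the accumulated
	// value; the erased sample's time is carried forward in the timer and re-recorded,
	// together with the resumed interval, by the next endSample().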

	CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;

	CPUProfiler::ThreadInfo::ThreadInfo()
		:isActive(false), rootBlock(nullptr)
	{
	}

	void CPUProfiler::ThreadInfo::begin(const String& _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled.");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock();

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		activeBlocks.push(activeBlock);

		rootBlock->name = _name;
		rootBlock->basic.beginSample();
		isActive = true;
	}

	void CPUProfiler::ThreadInfo::end()
	{
		// Check this first: if sampling never started, activeBlock.block is null and
		// activeBlocks is empty, so ending a sample here would be invalid.
		if(!isActive)
		{
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
			return;
		}

		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks.pop();

		if(activeBlocks.size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			while(activeBlocks.size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks.top();
				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks.pop();
			}
		}

		isActive = false;
		activeBlocks = Stack<ActiveBlock>::type();
		activeBlock = ActiveBlock();
	}

	void CPUProfiler::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;
	}

	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
	{
		// TODO - Pool this, if possible using the memory allocator stuff.
		// TODO - Also consider moving all samples into ThreadInfo and pooling them as well
		// (otherwise ProfiledBlock can't be pooled, since it is variable size).
		return cm_new<ProfiledBlock>();
	}

	void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block)
	{
		cm_delete(block);
	}

	CPUProfiler::ProfiledBlock::ProfiledBlock()
	{ }

	CPUProfiler::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}

	CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
	{
		for(auto& child : children)
		{
			if(child->name == name)
				return child;
		}

		return nullptr;
	}
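
	// Note: findChild is a linear scan, so every beginSample/beginSamplePrecise pays
	// O(number of siblings) in string comparisons. That cost is part of what
	// estimateTimerOverhead() captures below.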

	CPUProfiler::CPUProfiler()
		:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
		mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
		// and keep separate values per thread.
		estimateTimerOverhead();
	}

	CPUProfiler::~CPUProfiler()
	{
		reset();

		CM_LOCK_MUTEX(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			cm_delete(threadInfo);
	}

	void CPUProfiler::beginThread(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = cm_new<ThreadInfo>();
			thread = ThreadInfo::activeThread;

			{
				CM_LOCK_MUTEX(mThreadSync);

				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}

	void CPUProfiler::endThread()
	{
		// No null check here on purpose, so the sample can be ended as soon as possible.
		ThreadInfo::activeThread->end();
	}

	void CPUProfiler::beginSample(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // beginThread may have just created the ThreadInfo
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock();
			block->name = name;

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks.push(thread->activeBlock);

		block->basic.beginSample();
	}

	void CPUProfiler::endSample(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
			return;
		}

		if(block->name != name)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();

		thread->activeBlocks.pop();

		if(!thread->activeBlocks.empty())
			thread->activeBlock = thread->activeBlocks.top();
		else
			thread->activeBlock = ActiveBlock();
	}
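
	// Illustrative usage sketch (hypothetical calling code, not part of this file;
	// updateScene() is a stand-in for real work):
	//
	//   CPUProfiler profiler;
	//   profiler.beginThread("Main");
	//   profiler.beginSample("Update");
	//   updateScene();
	//   profiler.endSample("Update");
	//   profiler.endThread();
	//   CPUProfilerReport report = profiler.generateReport();
	//
	// Samples may nest, and each beginSample must be matched by an endSample with the
	// same name on the same thread; the precise variants below pair up the same way.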

	void CPUProfiler::beginSamplePrecise(const String& name)
	{
		// Note: There is a (small) possibility a context switch will happen during this measurement, in which case the result will be skewed.
		// Increasing thread priority might help. This is generally only a problem with code that executes for a long time (10-15+ ms, depending on the OS quantum length).
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // beginThread may have just created the ThreadInfo
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock();
			block->name = name;

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks.push(thread->activeBlock);

		block->precise.beginSample();
	}

	void CPUProfiler::endSamplePrecise(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
			return;
		}

		if(block->name != name)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();

		thread->activeBlocks.pop();

		if(!thread->activeBlocks.empty())
			thread->activeBlock = thread->activeBlocks.top();
		else
			thread->activeBlock = ActiveBlock();
	}

	void CPUProfiler::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}

	CPUProfilerReport CPUProfiler::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies.
		if(thread->rootBlock == nullptr)
			return report;

		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			Vector<UINT32>::type childIndexes;
		};

		Vector<CPUProfilerBasicSamplingEntry>::type basicEntries;
		Vector<CPUProfilerPreciseSamplingEntry>::type preciseEntries;

		// Fill the flatHierarchy array so that every child gets a higher index than its
		// parent; iterating it in reverse then processes children before parents.
		Stack<UINT32>::type todo;
		Vector<TempEntry>::type flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
		entryIdx++;

		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;

			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));

				entryIdx++;
			}
		}

		// Calculate sampling data for all entries
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = curBlock->name;

			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;
			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];

				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = curBlock->name;

			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;
			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];

				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}

		// Prune empty basic entries
		Stack<UINT32>::type finalBasicHierarchyTodo;
		Stack<UINT32>::type parentBasicEntryIndexes;
		Vector<TempEntry>::type newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));

		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];

				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newBasicEntries.size() > 0)
		{
			Vector<CPUProfilerBasicSamplingEntry*>::type finalBasicEntries;

			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

			UINT32 curEntryIdx = 0;
			for(auto& curEntry : newBasicEntries)
			{
				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

				basicEntry->childEntries.resize(curEntry.childIndexes.size());

				UINT32 idx = 0;
				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newBasicEntries[childIdx];
					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));

					idx++;
				}

				curEntryIdx++;
			}
		}

		// Prune empty precise entries
		Stack<UINT32>::type finalPreciseHierarchyTodo;
		Stack<UINT32>::type parentPreciseEntryIndexes;
		Vector<TempEntry>::type newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));

		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];

				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newPreciseEntries.size() > 0)
		{
			Vector<CPUProfilerPreciseSamplingEntry*>::type finalPreciseEntries;

			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

			UINT32 curEntryIdx = 0;
			for(auto& curEntry : newPreciseEntries)
			{
				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

				preciseEntry->childEntries.resize(curEntry.childIndexes.size());

				UINT32 idx = 0;
				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newPreciseEntries[childIdx];
					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

					// Point at the element just written; using back() here would always
					// grab the last element of the already-resized vector.
					finalPreciseEntries.push_back(&preciseEntry->childEntries[idx]);

					idx++;
				}

				curEntryIdx++;
			}
		}

		return report;
	}
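
	// Illustrative sketch of consuming the report's basic hierarchy (hypothetical
	// code; printLine() is a stand-in for whatever logging facility is available):
	//
	//   std::function<void(const CPUProfilerBasicSamplingEntry&, UINT32)> visit =
	//       [&](const CPUProfilerBasicSamplingEntry& entry, UINT32 depth)
	//   {
	//       printLine(String(depth, ' ') + entry.data.name + " - " +
	//           toString(entry.data.totalTimeMs) + " ms over " +
	//           toString(entry.data.numCalls) + " call(s)");
	//       for(auto& child : entry.childEntries)
	//           visit(child, depth + 1);
	//   };
	//   visit(report.mBasicSamplingRootEntry, 0);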

	void CPUProfiler::estimateTimerOverhead()
	{
		// Get an idea of how long timer calls and RDTSC take.
		const UINT32 reps = 1000, sampleReps = 100;

		// Start from large sentinel values so the first measurement always replaces them;
		// taking the minimum over several tries approximates the uncontended best case.
		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time)/double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles/reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}

		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			/************************************************************************/
			/*                 AVERAGE TIME IN MS FOR BASIC SAMPLING                */
			/************************************************************************/

			Timer timerA;
			timerA.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created.
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample("TestAvg#" + toString(i));
				endSample("TestAvg#" + toString(i));
			}

			endThread();
			timerA.stop();

			reset();

			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;

			/************************************************************************/
			/*                  AVERAGE CYCLES FOR BASIC SAMPLING                   */
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created.
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample("TestAvg#" + toString(i));
				endSample("TestAvg#" + toString(i));
			}

			endThread();
			timerPreciseA.stop();

			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;

			/************************************************************************/
			/*                AVERAGE TIME IN MS FOR PRECISE SAMPLING               */
			/************************************************************************/

			Timer timerB;
			timerB.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created.
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise("TestAvg#" + toString(i));
				endSamplePrecise("TestAvg#" + toString(i));
			}

			endThread();
			timerB.stop();

			reset();

			double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimesPrecise;

			/************************************************************************/
			/*                 AVERAGE CYCLES FOR PRECISE SAMPLING                  */
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created.
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise("TestAvg#" + toString(i));
				endSamplePrecise("TestAvg#" + toString(i));
			}

			endThread();
			timerPreciseB.stop();

			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}
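
	// Note: taking the minimum over 20 tries estimates best-case overhead. An average
	// would fold scheduler noise and cache misses into the correction applied to every
	// reported sample, over-correcting the common case.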

	CPUProfilerBasicSamplingEntry::Data::Data()
		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
		estimatedOverheadMs(0.0), pctOfParent(1.0f)
	{ }

	CPUProfilerPreciseSamplingEntry::Data::Data()
		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
		estimatedOverhead(0), pctOfParent(1.0f)
	{ }

	CPUProfilerReport::CPUProfilerReport()
	{
	}
}