CmCPUProfiler.cpp

#include "CmCPUProfiler.h"
#include "CmDebug.h"

#if CM_PLATFORM == CM_PLATFORM_WIN32
#include "windows.h"
#endif

namespace CamelotFramework
{
	CPUProfiler::Timer::Timer()
	{
		time = 0.0f;
	}

	void CPUProfiler::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void CPUProfiler::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void CPUProfiler::Timer::reset()
	{
		time = 0.0f;
	}
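	// Note: getCurrentTime() below returns a timestamp in milliseconds. Dividing the raw
	// counter by (frequency * 0.001) is equivalent to (counter / frequency) * 1000. For
	// example, assuming a hypothetical 10 MHz counter, a raw value of 50000 corresponds to
	// 50000 / (10000000 * 0.001) = 5 ms.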
	inline double CPUProfiler::Timer::getCurrentTime()
	{
		// TODO: I should be calling the Platform:: performance methods instead of doing this here.
		// The only problem is that Platform belongs to Core and not Utility.
#if CM_PLATFORM == CM_PLATFORM_WIN32
		LARGE_INTEGER counterValue;
		QueryPerformanceCounter(&counterValue);

		LARGE_INTEGER counterFrequency;
		QueryPerformanceFrequency(&counterFrequency);

		return (double)counterValue.QuadPart / (counterFrequency.QuadPart * 0.001);
#else
#error "CPUProfiler::Timer::getCurrentTime() is not implemented for this platform."
#endif
	}
	CPUProfiler::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void CPUProfiler::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void CPUProfiler::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void CPUProfiler::TimerPrecise::reset()
	{
		cycles = 0;
	}
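	// Note: getNumCycles() below issues "cpuid" before "rdtsc". "cpuid" is a serializing
	// instruction, so it prevents out-of-order execution from moving earlier instructions
	// past the timestamp read and skewing the measurement. Keep in mind that raw TSC values
	// are only safely comparable when both reads happen on the same core, or on CPUs with
	// an invariant TSC.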
	inline UINT64 CPUProfiler::TimerPrecise::getNumCycles()
	{
#if CM_COMPILER == CM_COMPILER_GNUC
		// Serialize the instruction stream, then read the timestamp counter
		asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx");

		UINT32 lo, hi; // renamed from __a/__d - double-underscore names are reserved
		asm volatile("rdtsc" : "=a" (lo), "=d" (hi));

		return (UINT64(lo) | (UINT64(hi) << 32));
#else
		int cpuInfo[4];
		__cpuid(cpuInfo, 0); // Serialize the instruction stream

		return __rdtsc();
#endif
	}
	void CPUProfiler::ProfileData::beginSample()
	{
		timer.reset();
		timer.start();
	}

	void CPUProfiler::ProfileData::endSample()
	{
		timer.stop();
		samples.push_back(ProfileSample(timer.time));
	}
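	// Note on resumeLastSample() (both the basic and precise variants below): endSample()
	// stops the timer but does not reset it, so the accumulated value survives. Restarting
	// the timer and discarding the most recently recorded sample therefore merges that
	// sample with whatever the next endSample() records, as if it had never been closed.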
	void CPUProfiler::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.pop_back();
	}

	void CPUProfiler::PreciseProfileData::beginSample()
	{
		timer.reset();
		timer.start();
	}

	void CPUProfiler::PreciseProfileData::endSample()
	{
		timer.stop();
		samples.push_back(PreciseProfileSample(timer.cycles));
	}

	void CPUProfiler::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.pop_back();
	}
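	// Each thread keeps its own sampling state: CM_THREADLOCAL gives every thread a separate
	// activeThread pointer, which is why beginSample/endSample need no locking. The mutex is
	// only taken when a new thread registers itself in mActiveThreads (see beginThread).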
	CM_THREADLOCAL CPUProfiler::ThreadInfo* CPUProfiler::ThreadInfo::activeThread = nullptr;

	CPUProfiler::ThreadInfo::ThreadInfo()
		:isActive(false), rootBlock(nullptr)
	{
	}

	void CPUProfiler::ThreadInfo::begin(const String& _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock();

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		activeBlocks.push(activeBlock);

		rootBlock->name = _name;
		rootBlock->basic.beginSample();
		isActive = true;
	}

	void CPUProfiler::ThreadInfo::end()
	{
		// Check this first: if sampling never began, activeBlock.block is null and
		// activeBlocks is empty, so ending the block or popping the stack would be invalid
		if(!isActive)
		{
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
			return;
		}

		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks.pop();

		if(activeBlocks.size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			while(activeBlocks.size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks.top();
				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks.pop();
			}
		}

		isActive = false;
		activeBlocks = Stack<ActiveBlock>::type();
		activeBlock = ActiveBlock();
	}
	void CPUProfiler::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;
	}

	CPUProfiler::ProfiledBlock* CPUProfiler::ThreadInfo::getBlock()
	{
		// TODO - Pool this, if possible using the memory allocator stuff.
		// TODO - Also consider moving all samples into ThreadInfo and pooling them as well
		// (otherwise ProfiledBlock can't be pooled, since it will be variable size).
		return cm_new<ProfiledBlock>();
	}

	void CPUProfiler::ThreadInfo::releaseBlock(CPUProfiler::ProfiledBlock* block)
	{
		cm_delete(block);
	}
	CPUProfiler::ProfiledBlock::ProfiledBlock()
	{ }

	CPUProfiler::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}

	CPUProfiler::ProfiledBlock* CPUProfiler::ProfiledBlock::findChild(const String& name) const
	{
		for(auto& child : children)
		{
			if(child->name == name)
				return child;
		}

		return nullptr;
	}
	CPUProfiler::CPUProfiler()
		:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
		mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate it
		// each time beginThread is called, and keep separate values per thread.
		estimateTimerOverhead();
	}

	CPUProfiler::~CPUProfiler()
	{
		reset();

		CM_LOCK_MUTEX(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			cm_delete(threadInfo);
	}
	void CPUProfiler::beginThread(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = cm_new<ThreadInfo>();
			thread = ThreadInfo::activeThread;

			{
				CM_LOCK_MUTEX(mThreadSync);
				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}

	void CPUProfiler::endThread()
	{
		// No null check here on purpose, so the sample is ended as soon as possible
		ThreadInfo::activeThread->end();
	}
	void CPUProfiler::beginSample(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // beginThread may have just created the ThreadInfo
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock();
			block->name = name;

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks.push(thread->activeBlock);

		block->basic.beginSample();
	}
	void CPUProfiler::endSample(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
			return;
		}

		if(block->name != name)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();
		thread->activeBlocks.pop();

		if(!thread->activeBlocks.empty())
			thread->activeBlock = thread->activeBlocks.top();
		else
			thread->activeBlock = ActiveBlock();
	}
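	// Illustrative usage of the sampling API above. This is a sketch, not part of the
	// original file; it assumes some accessor (here called gProfiler()) returns the active
	// CPUProfiler instance:
	//
	//   gProfiler().beginThread("Sim");
	//   gProfiler().beginSample("Update");
	//       gProfiler().beginSample("Physics");   // nested begin/end pairs form the hierarchy
	//       gProfiler().endSample("Physics");
	//   gProfiler().endSample("Update");
	//   gProfiler().endThread();
	//   CPUProfilerReport report = gProfiler().generateReport();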
	void CPUProfiler::beginSamplePrecise(const String& name)
	{
		// Note: There is a (small) possibility that a context switch happens during this measurement, in which case the result will be skewed.
		// Increasing thread priority might help. This is generally only a problem with code that executes for a long time (10-15+ ms, depending on the OS quantum length).
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // beginThread may have just created the ThreadInfo
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock();
			block->name = name;

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks.push(thread->activeBlock);

		block->precise.beginSample();
	}
	void CPUProfiler::endSamplePrecise(const String& name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if CM_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
			return;
		}

		if(block->name != name)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + block->name + "\" but got \"" + name + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();
		thread->activeBlocks.pop();

		if(!thread->activeBlocks.empty())
			thread->activeBlock = thread->activeBlocks.top();
		else
			thread->activeBlock = ActiveBlock();
	}
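	// The basic and precise paths above are intentionally symmetric: basic sampling records
	// wall-clock milliseconds via QueryPerformanceCounter, while precise sampling records raw
	// CPU cycles via RDTSC. Precise sampling offers finer resolution and typically lower
	// per-call cost (a single rdtsc versus two QueryPerformance* calls), but it is more
	// sensitive to context switches, and to frequency scaling on CPUs without an invariant
	// TSC, so it is best suited to short blocks.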
	void CPUProfiler::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}
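	// generateReport() below works in three passes: (1) flatten the block hierarchy into an
	// array ordered so that every child appears after its parent, (2) walk that array in
	// reverse to aggregate timing data bottom-up (children are always finished before their
	// parents are processed), and (3) prune entries with zero recorded calls while keeping
	// the remaining parent/child links intact. Basic (timer-based) and precise (cycle-based)
	// samples are split into two separate output hierarchies.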
	CPUProfilerReport CPUProfiler::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies
		if(thread->rootBlock == nullptr)
			return report;

		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			Vector<UINT32>::type childIndexes;
		};

		Vector<CPUProfilerBasicSamplingEntry>::type basicEntries;
		Vector<CPUProfilerPreciseSamplingEntry>::type preciseEntries;

		// Fill the flatHierarchy array so that every child comes after its parent. Iterating
		// it in reverse then guarantees children are processed before their parents.
		Stack<UINT32>::type todo;
		Vector<TempEntry>::type flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
		entryIdx++;

		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));
				entryIdx++;
			}
		}

		// Calculate sampling data for all entries
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

		// Note: "auto", not "auto&" - rbegin() returns a temporary iterator
		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = curBlock->name;
			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;

			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = curBlock->name;
			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;

			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}

		// Prune empty basic entries
		Stack<UINT32>::type finalBasicHierarchyTodo;
		Stack<UINT32>::type parentBasicEntryIndexes;
		Vector<TempEntry>::type newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);
					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}

		Vector<CPUProfilerBasicSamplingEntry*>::type finalBasicEntries;
		finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

		UINT32 curEntryIdx = 0;
		for(auto& curEntry : newBasicEntries)
		{
			CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];
			basicEntry->childEntries.resize(curEntry.childIndexes.size());

			UINT32 idx = 0;
			for(auto& childIdx : curEntry.childIndexes)
			{
				TempEntry& childEntry = newBasicEntries[childIdx];
				basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

				finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
				idx++;
			}

			curEntryIdx++;
		}

		// Prune empty precise entries
		Stack<UINT32>::type finalPreciseHierarchyTodo;
		Stack<UINT32>::type parentPreciseEntryIndexes;
		Vector<TempEntry>::type newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);
					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		Vector<CPUProfilerPreciseSamplingEntry*>::type finalPreciseEntries;
		finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

		curEntryIdx = 0;
		for(auto& curEntry : newPreciseEntries)
		{
			CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];
			preciseEntry->childEntries.resize(curEntry.childIndexes.size());

			UINT32 idx = 0;
			for(auto& childIdx : curEntry.childIndexes)
			{
				TempEntry& childEntry = newPreciseEntries[childIdx];
				preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

				// Index with [idx], not back(), so each copied child is tracked (back() would
				// always point at the last element of the resized array)
				finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
				idx++;
			}

			curEntryIdx++;
		}

		return report;
	}
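	// estimateTimerOverhead() below measures how much the profiler itself costs. Each
	// measurement is repeated over a number of tries and the minimum is kept: the minimum is
	// a more robust estimate of the true fixed overhead than the mean, since scheduler
	// preemption and cache misses can only inflate a run, never deflate it.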
	void CPUProfiler::estimateTimerOverhead()
	{
		// Get an idea of how long timer calls and RDTSC take
		const UINT32 reps = 1000, sampleReps = 100;

		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time)/double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles/reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}

		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			/************************************************************************/
			/* AVERAGE TIME IN MS FOR BASIC SAMPLING                                */
			/************************************************************************/

			Timer timerA;
			timerA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample("TestAvg#" + toString(i));
				endSample("TestAvg#" + toString(i));
			}

			endThread();

			timerA.stop();
			reset();

			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;

			/************************************************************************/
			/* AVERAGE CYCLES FOR BASIC SAMPLING                                    */
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample("TestAvg#" + toString(i));
				endSample("TestAvg#" + toString(i));
			}

			endThread();

			timerPreciseA.stop();
			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;

			/************************************************************************/
			/* AVERAGE TIME IN MS FOR PRECISE SAMPLING                              */
			/************************************************************************/

			Timer timerB;
			timerB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise("TestAvg#" + toString(i));
				endSamplePrecise("TestAvg#" + toString(i));
			}

			endThread();

			timerB.stop();
			reset();

			double avgTimePrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimePrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimePrecise;

			/************************************************************************/
			/* AVERAGE CYCLES FOR PRECISE SAMPLING                                  */
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise("TestAvg#" + toString(i));
				endSamplePrecise("TestAvg#" + toString(i));
			}

			endThread();

			timerPreciseB.stop();
			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}
	CPUProfilerBasicSamplingEntry::Data::Data()
		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
		estimatedOverheadMs(0.0), pctOfParent(1.0f)
	{ }

	CPUProfilerPreciseSamplingEntry::Data::Data()
		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
		estimatedOverhead(0), pctOfParent(1.0f)
	{ }

	CPUProfilerReport::CPUProfilerReport()
	{
	}
}