// BsProfilerCPU.cpp
#include "BsProfilerCPU.h"
#include "BsDebug.h"
#include "BsPlatform.h"

#if BS_COMPILER != BS_COMPILER_GNUC
#include <intrin.h> // __cpuid/__rdtsc used below (may already be pulled in via BsPlatform.h)
#endif

namespace BansheeEngine
{
    ProfilerCPU::Timer::Timer()
    {
        time = 0.0f;
    }

    void ProfilerCPU::Timer::start()
    {
        startTime = getCurrentTime();
    }

    void ProfilerCPU::Timer::stop()
    {
        time += getCurrentTime() - startTime;
    }

    void ProfilerCPU::Timer::reset()
    {
        time = 0.0f;
    }

    inline double ProfilerCPU::Timer::getCurrentTime()
    {
        return Platform::queryPerformanceTimerMs();
    }
    ProfilerCPU::TimerPrecise::TimerPrecise()
    {
        cycles = 0;
    }

    void ProfilerCPU::TimerPrecise::start()
    {
        startCycles = getNumCycles();
    }

    void ProfilerCPU::TimerPrecise::stop()
    {
        cycles += getNumCycles() - startCycles;
    }

    void ProfilerCPU::TimerPrecise::reset()
    {
        cycles = 0;
    }

    inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
    {
#if BS_COMPILER == BS_COMPILER_GNUC
        asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx");
        UINT32 __a, __d;
        asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
        return (UINT64(__a) | UINT64(__d) << 32);
#else
        int a[4];
        int b = 0;
        __cpuid(a, b);
        return __rdtsc();
#endif
    }
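
    // Note: RDTSC itself is not a serializing instruction, which is why CPUID is issued
    // first in getNumCycles() above; without it, out-of-order execution can move
    // surrounding work across the timestamp read and skew very short measurements.
    // A minimal sketch of the same idea, assuming an x86 target and the MSVC
    // intrinsics used above:
    //
    //     UINT64 readSerializedTsc()
    //     {
    //         int registers[4];
    //         __cpuid(registers, 0); // serializes the instruction stream
    //         return __rdtsc();      // then read the time-stamp counter
    //     }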
    void ProfilerCPU::ProfileData::beginSample()
    {
        memAllocs = MemoryCounter::getNumAllocs();
        memFrees = MemoryCounter::getNumFrees();

        timer.reset();
        timer.start();
    }

    void ProfilerCPU::ProfileData::endSample()
    {
        timer.stop();

        UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
        UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

        samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
    }
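
    // resumeLastSample (below, and its precise counterpart) discards the most recently
    // recorded sample and restarts the timer without resetting its accumulated value,
    // so the next endSample() records the combined duration of both intervals.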
    void ProfilerCPU::ProfileData::resumeLastSample()
    {
        timer.start();
        samples.erase(samples.end() - 1);
    }

    void ProfilerCPU::PreciseProfileData::beginSample()
    {
        memAllocs = MemoryCounter::getNumAllocs();
        memFrees = MemoryCounter::getNumFrees();

        timer.reset();
        timer.start();
    }

    void ProfilerCPU::PreciseProfileData::endSample()
    {
        timer.stop();

        UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
        UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

        samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
    }

    void ProfilerCPU::PreciseProfileData::resumeLastSample()
    {
        timer.start();
        samples.erase(samples.end() - 1);
    }
    BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;

    ProfilerCPU::ThreadInfo::ThreadInfo()
        :isActive(false), rootBlock(nullptr)
    {
    }

    void ProfilerCPU::ThreadInfo::begin(const ProfilerString& _name)
    {
        if(isActive)
        {
            LOGWRN("Profiler::beginThread called on a thread that was already being sampled.");
            return;
        }

        if(rootBlock == nullptr)
            rootBlock = getBlock();

        activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
        activeBlocks.push(activeBlock);

        rootBlock->name = _name;
        rootBlock->basic.beginSample();
        isActive = true;
    }

    void ProfilerCPU::ThreadInfo::end()
    {
        if(activeBlock.type == ActiveSamplingType::Basic)
            activeBlock.block->basic.endSample();
        else
            activeBlock.block->precise.endSample();

        activeBlocks.pop();

        if(!isActive)
            LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

        if(activeBlocks.size() > 0)
        {
            LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

            while(activeBlocks.size() > 0)
            {
                ActiveBlock& curBlock = activeBlocks.top();
                if(curBlock.type == ActiveSamplingType::Basic)
                    curBlock.block->basic.endSample();
                else
                    curBlock.block->precise.endSample();

                activeBlocks.pop();
            }
        }

        isActive = false;
        activeBlocks = ProfilerStack<ActiveBlock>();
        activeBlock = ActiveBlock();
    }

    void ProfilerCPU::ThreadInfo::reset()
    {
        if(isActive)
            end();

        if(rootBlock != nullptr)
            releaseBlock(rootBlock);

        rootBlock = nullptr;
    }

    ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock()
    {
        // TODO - Pool this, if possible using the memory allocator stuff
        // TODO - Also consider moving all samples into ThreadInfo and pooling them as well
        //        (otherwise ProfiledBlock can't be pooled, since it is variable size)
        return bs_new<ProfiledBlock, ProfilerAlloc>();
    }

    void ProfilerCPU::ThreadInfo::releaseBlock(ProfilerCPU::ProfiledBlock* block)
    {
        bs_delete<ProfilerAlloc>(block);
    }
    ProfilerCPU::ProfiledBlock::ProfiledBlock()
    { }
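
    // Note: the destructor releases child blocks through the thread-local activeThread,
    // so a ProfiledBlock must be destroyed on the thread that owns it.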
    ProfilerCPU::ProfiledBlock::~ProfiledBlock()
    {
        ThreadInfo* thread = ThreadInfo::activeThread;

        for(auto& child : children)
            thread->releaseBlock(child);

        children.clear();
    }

    ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const ProfilerString& name) const
    {
        for(auto& child : children)
        {
            if(child->name == name)
                return child;
        }

        return nullptr;
    }
    ProfilerCPU::ProfilerCPU()
        :mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0),
        mPreciseSamplingOverheadCycles(0), mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
    {
        // TODO - We only estimate overhead on program start. It might be better to estimate it
        // each time beginThread is called, and keep separate values per thread.
        estimateTimerOverhead();
    }

    ProfilerCPU::~ProfilerCPU()
    {
        reset();

        BS_LOCK_MUTEX(mThreadSync);

        for(auto& threadInfo : mActiveThreads)
            bs_delete<ProfilerAlloc>(threadInfo);
    }
    void ProfilerCPU::beginThread(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr)
        {
            ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
            thread = ThreadInfo::activeThread;

            {
                BS_LOCK_MUTEX(mThreadSync);

                mActiveThreads.push_back(thread);
            }
        }

        thread->begin(name);
    }

    void ProfilerCPU::endThread()
    {
        // Intentionally no null check here, so sampling can end as quickly as possible
        ThreadInfo::activeThread->end();
    }
    void ProfilerCPU::beginSample(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr || !thread->isActive)
        {
            beginThread("Unknown");
            thread = ThreadInfo::activeThread;
        }

        ProfiledBlock* parent = thread->activeBlock.block;
        ProfiledBlock* block = nullptr;

        if(parent != nullptr)
            block = parent->findChild(name);

        if(block == nullptr)
        {
            block = thread->getBlock();
            block->name = name;

            if(parent != nullptr)
                parent->children.push_back(block);
            else
                thread->rootBlock->children.push_back(block);
        }

        thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
        thread->activeBlocks.push(thread->activeBlock);

        block->basic.beginSample();
    }

    void ProfilerCPU::endSample(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
        if(block == nullptr)
        {
            LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
            return;
        }

        if(thread->activeBlock.type == ActiveSamplingType::Precise)
        {
            LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
            return;
        }

        if(block->name != name)
        {
            LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name.c_str()) +
                "\" but got \"" + String(name.c_str()) + "\". Sampling data will not be valid.");
            return;
        }
#endif

        block->basic.endSample();

        thread->activeBlocks.pop();

        if(!thread->activeBlocks.empty())
            thread->activeBlock = thread->activeBlocks.top();
        else
            thread->activeBlock = ActiveBlock();
    }
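
    // A minimal usage sketch of the basic sampling API above: samples must be properly
    // nested, each endSample must name the matching beginSample, and all sampling
    // happens between beginThread/endThread on the calling thread.
    //
    //     gProfilerCPU().beginThread("Worker");
    //     gProfilerCPU().beginSample("Update");
    //     gProfilerCPU().beginSample("Physics");
    //     // ... work being measured ...
    //     gProfilerCPU().endSample("Physics");
    //     gProfilerCPU().endSample("Update");
    //     gProfilerCPU().endThread();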
    void ProfilerCPU::beginSamplePrecise(const ProfilerString& name)
    {
        // Note: There is a (small) possibility that a context switch will happen during this
        // measurement, in which case the result will be skewed. Increasing thread priority
        // might help. This is generally only a problem with code that executes for a long
        // time (10-15+ ms, depending on the OS quantum length).
        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr || !thread->isActive)
        {
            beginThread("Unknown");
            thread = ThreadInfo::activeThread;
        }

        ProfiledBlock* parent = thread->activeBlock.block;
        ProfiledBlock* block = nullptr;

        if(parent != nullptr)
            block = parent->findChild(name);

        if(block == nullptr)
        {
            block = thread->getBlock();
            block->name = name;

            if(parent != nullptr)
                parent->children.push_back(block);
            else
                thread->rootBlock->children.push_back(block);
        }

        thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
        thread->activeBlocks.push(thread->activeBlock);

        block->precise.beginSample();
    }
    void ProfilerCPU::endSamplePrecise(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
        if(block == nullptr)
        {
            LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
            return;
        }

        if(thread->activeBlock.type == ActiveSamplingType::Basic)
        {
            LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
            return;
        }

        if(block->name != name)
        {
            LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name.c_str()) +
                "\" but got \"" + String(name.c_str()) + "\". Sampling data will not be valid.");
            return;
        }
#endif

        block->precise.endSample();

        thread->activeBlocks.pop();

        if(!thread->activeBlocks.empty())
            thread->activeBlock = thread->activeBlocks.top();
        else
            thread->activeBlock = ActiveBlock();
    }
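
    // Precise sampling mirrors the basic API but records RDTSC cycle counts instead of
    // milliseconds, which makes it better suited to very short code paths. A minimal sketch:
    //
    //     gProfilerCPU().beginSamplePrecise("Lookup");
    //     // ... short operation being measured ...
    //     gProfilerCPU().endSamplePrecise("Lookup");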
    void ProfilerCPU::reset()
    {
        ThreadInfo* thread = ThreadInfo::activeThread;

        if(thread != nullptr)
            thread->reset();
    }

    CPUProfilerReport ProfilerCPU::generateReport()
    {
        CPUProfilerReport report;

        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr)
            return report;

        if(thread->isActive)
            thread->end();

        // We need to separate out basic and precise data and form two separate hierarchies
        if(thread->rootBlock == nullptr)
            return report;

        struct TempEntry
        {
            TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
                :parentBlock(_parentBlock), entryIdx(_entryIdx)
            { }

            ProfiledBlock* parentBlock;
            UINT32 entryIdx;
            ProfilerVector<UINT32> childIndexes;
        };
        ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
        ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;

        // Fill the flatHierarchy array so that every child appears after its parent
        ProfilerStack<UINT32> todo;
        ProfilerVector<TempEntry> flatHierarchy;

        UINT32 entryIdx = 0;
        todo.push(entryIdx);
        flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
        entryIdx++;

        while(!todo.empty())
        {
            UINT32 curDataIdx = todo.top();
            ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
            todo.pop();

            for(auto& child : curBlock->children)
            {
                flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

                todo.push(entryIdx);
                flatHierarchy.push_back(TempEntry(child, entryIdx));
                entryIdx++;
            }
        }
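
        // Iterating flatHierarchy in reverse therefore visits children before their
        // parents, so child totals are already final when a parent aggregates them.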
        // Calculate sampling data for all entries
        basicEntries.resize(flatHierarchy.size());
        preciseEntries.resize(flatHierarchy.size());

        for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
        {
            TempEntry& curData = *iter;
            ProfiledBlock* curBlock = curData.parentBlock;

            CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
            CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

            // Calculate basic data
            entryBasic->data.name = String(curBlock->name.c_str());

            entryBasic->data.memAllocs = 0;
            entryBasic->data.memFrees = 0;
            entryBasic->data.totalTimeMs = 0.0;
            entryBasic->data.maxTimeMs = 0.0;

            for(auto& sample : curBlock->basic.samples)
            {
                entryBasic->data.totalTimeMs += sample.time;
                entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
                entryBasic->data.memAllocs += sample.numAllocs;
                entryBasic->data.memFrees += sample.numFrees;
            }

            entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

            if(entryBasic->data.numCalls > 0)
                entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

            double totalChildTime = 0.0;

            for(auto& childIdx : curData.childIndexes)
            {
                CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];

                totalChildTime += childEntry->data.totalTimeMs;
                childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

                entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
            }

            entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
            entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

            entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

            if(entryBasic->data.numCalls > 0)
                entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

            entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

            // Calculate precise data
            entryPrecise->data.name = String(curBlock->name.c_str());

            entryPrecise->data.memAllocs = 0;
            entryPrecise->data.memFrees = 0;
            entryPrecise->data.totalCycles = 0;
            entryPrecise->data.maxCycles = 0;

            for(auto& sample : curBlock->precise.samples)
            {
                entryPrecise->data.totalCycles += sample.cycles;
                entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
                entryPrecise->data.memAllocs += sample.numAllocs;
                entryPrecise->data.memFrees += sample.numFrees;
            }

            entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

            if(entryPrecise->data.numCalls > 0)
                entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

            UINT64 totalChildCycles = 0;

            for(auto& childIdx : curData.childIndexes)
            {
                CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];

                totalChildCycles += childEntry->data.totalCycles;
                childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

                entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
            }

            entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
            entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

            entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

            if(entryPrecise->data.numCalls > 0)
                entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

            entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
        }
        // Prune empty basic entries
        ProfilerStack<UINT32> finalBasicHierarchyTodo;
        ProfilerStack<UINT32> parentBasicEntryIndexes;
        ProfilerVector<TempEntry> newBasicEntries;

        finalBasicHierarchyTodo.push(0);

        entryIdx = 0;
        parentBasicEntryIndexes.push(entryIdx);
        newBasicEntries.push_back(TempEntry(nullptr, entryIdx));

        entryIdx++;

        while(!finalBasicHierarchyTodo.empty())
        {
            UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
            parentBasicEntryIndexes.pop();

            UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
            TempEntry& curEntry = flatHierarchy[curEntryIdx];
            finalBasicHierarchyTodo.pop();

            for(auto& childIdx : curEntry.childIndexes)
            {
                finalBasicHierarchyTodo.push(childIdx);

                CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];

                if(basicEntry.data.numCalls > 0)
                {
                    newBasicEntries.push_back(TempEntry(nullptr, childIdx));
                    newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

                    parentBasicEntryIndexes.push(entryIdx);

                    entryIdx++;
                }
                else
                    parentBasicEntryIndexes.push(parentEntryIdx);
            }
        }

        if(newBasicEntries.size() > 0)
        {
            ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;

            report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
            finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

            finalBasicHierarchyTodo.push(0);

            while(!finalBasicHierarchyTodo.empty())
            {
                UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
                finalBasicHierarchyTodo.pop();

                TempEntry& curEntry = newBasicEntries[curEntryIdx];

                CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

                basicEntry->childEntries.resize(curEntry.childIndexes.size());
                UINT32 idx = 0;

                for(auto& childIdx : curEntry.childIndexes)
                {
                    TempEntry& childEntry = newBasicEntries[childIdx];
                    basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

                    finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
                    finalBasicHierarchyTodo.push(childIdx);
                    idx++;
                }
            }
        }
        // Prune empty precise entries
        ProfilerStack<UINT32> finalPreciseHierarchyTodo;
        ProfilerStack<UINT32> parentPreciseEntryIndexes;
        ProfilerVector<TempEntry> newPreciseEntries;

        finalPreciseHierarchyTodo.push(0);

        entryIdx = 0;
        parentPreciseEntryIndexes.push(entryIdx);
        newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));

        entryIdx++;

        while(!finalPreciseHierarchyTodo.empty())
        {
            UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
            parentPreciseEntryIndexes.pop();

            UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
            TempEntry& curEntry = flatHierarchy[curEntryIdx];
            finalPreciseHierarchyTodo.pop();

            for(auto& childIdx : curEntry.childIndexes)
            {
                finalPreciseHierarchyTodo.push(childIdx);

                CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];

                if(preciseEntry.data.numCalls > 0)
                {
                    newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
                    newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

                    parentPreciseEntryIndexes.push(entryIdx);

                    entryIdx++;
                }
                else
                    parentPreciseEntryIndexes.push(parentEntryIdx);
            }
        }

        if(newPreciseEntries.size() > 0)
        {
            ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;

            report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
            finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

            finalPreciseHierarchyTodo.push(0);

            while(!finalPreciseHierarchyTodo.empty())
            {
                UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
                finalPreciseHierarchyTodo.pop();

                TempEntry& curEntry = newPreciseEntries[curEntryIdx];

                CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

                preciseEntry->childEntries.resize(curEntry.childIndexes.size());
                UINT32 idx = 0;

                for(auto& childIdx : curEntry.childIndexes)
                {
                    TempEntry& childEntry = newPreciseEntries[childIdx];
                    preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

                    finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
                    finalPreciseHierarchyTodo.push(childIdx);
                    idx++;
                }
            }
        }

        return report;
    }
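
    // A sketch of consuming a report. The accessor name is assumed here; the roots are
    // populated from mBasicSamplingRootEntry/mPreciseSamplingRootEntry above.
    //
    //     CPUProfilerReport report = gProfilerCPU().generateReport();
    //     const CPUProfilerBasicSamplingEntry& root = report.getBasicSamplingData(); // assumed accessor
    //     for(auto& child : root.childEntries)
    //     {
    //         // child.data.name, child.data.numCalls, child.data.avgTimeMs, etc. are
    //         // the fields filled in by generateReport() above
    //     }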
    void ProfilerCPU::estimateTimerOverhead()
    {
        // Get an idea of how long timer calls and RDTSC take
        const UINT32 reps = 1000, sampleReps = 100;

        mBasicTimerOverhead = 1000000.0;
        mPreciseTimerOverhead = 1000000;
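
        // The overhead fields start at a deliberately huge sentinel; each run below can
        // only lower them, so scheduler noise inflates individual runs but never the
        // final (minimum) estimate.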
        for (UINT32 tries = 0; tries < 20; tries++)
        {
            Timer timer;
            for (UINT32 i = 0; i < reps; i++)
            {
                timer.start();
                timer.stop();
            }

            double avgTime = double(timer.time) / double(reps);
            if (avgTime < mBasicTimerOverhead)
                mBasicTimerOverhead = avgTime;

            TimerPrecise timerPrecise;
            for (UINT32 i = 0; i < reps; i++)
            {
                timerPrecise.start();
                timerPrecise.stop();
            }

            UINT64 avgCycles = timerPrecise.cycles / reps;
            if (avgCycles < mPreciseTimerOverhead)
                mPreciseTimerOverhead = avgCycles;
        }

        mBasicSamplingOverheadMs = 1000000.0;
        mPreciseSamplingOverheadMs = 1000000.0;
        mBasicSamplingOverheadCycles = 1000000;
        mPreciseSamplingOverheadCycles = 1000000;
        for (UINT32 tries = 0; tries < 20; tries++)
        {
            /************************************************************************/
            /*                AVERAGE TIME IN MS FOR BASIC SAMPLING                 */
            /************************************************************************/

            Timer timerA;
            timerA.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSample("TestAvg1");
                endSample("TestAvg1");
                beginSample("TestAvg2");
                endSample("TestAvg2");
                beginSample("TestAvg3");
                endSample("TestAvg3");
                beginSample("TestAvg4");
                endSample("TestAvg4");
                beginSample("TestAvg5");
                endSample("TestAvg5");
                beginSample("TestAvg6");
                endSample("TestAvg6");
                beginSample("TestAvg7");
                endSample("TestAvg7");
                beginSample("TestAvg8");
                endSample("TestAvg8");
                beginSample("TestAvg9");
                endSample("TestAvg9");
                beginSample("TestAvg10");
                endSample("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSample("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSample("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerA.stop();

            reset();

            double avgTimeBasic = double(timerA.time) / double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
            if (avgTimeBasic < mBasicSamplingOverheadMs)
                mBasicSamplingOverheadMs = avgTimeBasic;

            /************************************************************************/
            /*                  AVERAGE CYCLES FOR BASIC SAMPLING                   */
            /************************************************************************/

            TimerPrecise timerPreciseA;
            timerPreciseA.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSample("TestAvg1");
                endSample("TestAvg1");
                beginSample("TestAvg2");
                endSample("TestAvg2");
                beginSample("TestAvg3");
                endSample("TestAvg3");
                beginSample("TestAvg4");
                endSample("TestAvg4");
                beginSample("TestAvg5");
                endSample("TestAvg5");
                beginSample("TestAvg6");
                endSample("TestAvg6");
                beginSample("TestAvg7");
                endSample("TestAvg7");
                beginSample("TestAvg8");
                endSample("TestAvg8");
                beginSample("TestAvg9");
                endSample("TestAvg9");
                beginSample("TestAvg10");
                endSample("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSample("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSample("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerPreciseA.stop();

            reset();

            UINT64 avgCyclesBasic = timerPreciseA.cycles / (sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
            if (avgCyclesBasic < mBasicSamplingOverheadCycles)
                mBasicSamplingOverheadCycles = avgCyclesBasic;

            /************************************************************************/
            /*               AVERAGE TIME IN MS FOR PRECISE SAMPLING                */
            /************************************************************************/

            Timer timerB;
            timerB.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSamplePrecise("TestAvg1");
                endSamplePrecise("TestAvg1");
                beginSamplePrecise("TestAvg2");
                endSamplePrecise("TestAvg2");
                beginSamplePrecise("TestAvg3");
                endSamplePrecise("TestAvg3");
                beginSamplePrecise("TestAvg4");
                endSamplePrecise("TestAvg4");
                beginSamplePrecise("TestAvg5");
                endSamplePrecise("TestAvg5");
                beginSamplePrecise("TestAvg6");
                endSamplePrecise("TestAvg6");
                beginSamplePrecise("TestAvg7");
                endSamplePrecise("TestAvg7");
                beginSamplePrecise("TestAvg8");
                endSamplePrecise("TestAvg8");
                beginSamplePrecise("TestAvg9");
                endSamplePrecise("TestAvg9");
                beginSamplePrecise("TestAvg10");
                endSamplePrecise("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerB.stop();

            reset();

            double avgTimePrecise = double(timerB.time) / double(sampleReps * 10 + sampleReps * 5);
            if (avgTimePrecise < mPreciseSamplingOverheadMs)
                mPreciseSamplingOverheadMs = avgTimePrecise;

            /************************************************************************/
            /*                 AVERAGE CYCLES FOR PRECISE SAMPLING                  */
            /************************************************************************/

            TimerPrecise timerPreciseB;
            timerPreciseB.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSamplePrecise("TestAvg1");
                endSamplePrecise("TestAvg1");
                beginSamplePrecise("TestAvg2");
                endSamplePrecise("TestAvg2");
                beginSamplePrecise("TestAvg3");
                endSamplePrecise("TestAvg3");
                beginSamplePrecise("TestAvg4");
                endSamplePrecise("TestAvg4");
                beginSamplePrecise("TestAvg5");
                endSamplePrecise("TestAvg5");
                beginSamplePrecise("TestAvg6");
                endSamplePrecise("TestAvg6");
                beginSamplePrecise("TestAvg7");
                endSamplePrecise("TestAvg7");
                beginSamplePrecise("TestAvg8");
                endSamplePrecise("TestAvg8");
                beginSamplePrecise("TestAvg9");
                endSamplePrecise("TestAvg9");
                beginSamplePrecise("TestAvg10");
                endSamplePrecise("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerPreciseB.stop();

            reset();

            UINT64 avgCyclesPrecise = timerPreciseB.cycles / (sampleReps * 10 + sampleReps * 5);
            if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
                mPreciseSamplingOverheadCycles = avgCyclesPrecise;
        }
    }
    CPUProfilerBasicSamplingEntry::Data::Data()
        :numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
        avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
        estimatedOverheadMs(0.0), pctOfParent(1.0f)
    { }

    CPUProfilerPreciseSamplingEntry::Data::Data()
        :numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
        avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
        estimatedOverhead(0), pctOfParent(1.0f)
    { }

    CPUProfilerReport::CPUProfilerReport()
    { }

    ProfilerCPU& gProfilerCPU()
    {
        return ProfilerCPU::instance();
    }
}