BsProfilerCPU.cpp

//********************************** Banshee Engine (www.banshee3d.com) **************************************************//
//**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
#include "Profiling/BsProfilerCPU.h"
#include "Debug/BsDebug.h"
#include "Platform/BsPlatform.h"
#include <chrono>

#if BS_COMPILER == BS_COMPILER_MSVC
	#include <intrin.h>
#endif

#if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
	#include "cpuid.h"
#endif

#if BS_COMPILER == BS_COMPILER_CLANG
	#if BS_PLATFORM == BS_PLATFORM_WIN32
		#include "intrin.h"
	#else
		#include <x86intrin.h>
	#endif
#endif

using namespace std::chrono;
namespace bs
{
	ProfilerCPU::Timer::Timer()
		:startTime(0.0f)
	{
		time = 0.0f;
	}

	void ProfilerCPU::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void ProfilerCPU::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void ProfilerCPU::Timer::reset()
	{
		time = 0.0f;
	}
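
	// Returns the current time in milliseconds: time_since_epoch() is taken in
	// nanoseconds, and multiplying the tick count by 1e-6 converts ns to ms.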
	inline double ProfilerCPU::Timer::getCurrentTime() const
	{
		high_resolution_clock::time_point timeNow = mHRClock.now();
		nanoseconds timeNowNs = timeNow.time_since_epoch();

		return timeNowNs.count() * 0.000001;
	}

	ProfilerCPU::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void ProfilerCPU::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void ProfilerCPU::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void ProfilerCPU::TimerPrecise::reset()
	{
		cycles = 0;
	}
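
	// Reads the CPU time-stamp counter. CPUID is issued first because it is a
	// serializing instruction: it keeps out-of-order execution from moving work
	// across the RDTSC read and skewing the cycle count.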
	inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
	{
#if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
		unsigned int a = 0;
		unsigned int b[4];
		__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);

#if BS_ARCH_TYPE == BS_ARCHITECTURE_x86_64
		UINT32 __a, __d;
		__asm__ __volatile__ ("rdtsc" : "=a" (__a), "=d" (__d));
		return (UINT64(__a) | UINT64(__d) << 32);
#else
		UINT64 x;
		__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
		return x;
#endif
#elif BS_COMPILER == BS_COMPILER_MSVC
		int a[4];
		int b = 0;
		__cpuid(a, b);
		return __rdtsc();
#else
		static_assert(false, "Unsupported compiler");
#endif
	}

	ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::ProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::ProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
	}
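
	// Continues timing where the last sample left off: the timer's accumulated time
	// is kept, and the sample the previous endSample() pushed is removed so the next
	// endSample() re-adds it with the extended duration.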
	void ProfilerCPU::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::PreciseProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::PreciseProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
	}

	void ProfilerCPU::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}
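
	// All sampling state is tracked per thread through this thread-local pointer.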
	BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;

	ProfilerCPU::ThreadInfo::ThreadInfo()
		:isActive(false), rootBlock(nullptr), frameAlloc(1024 * 512), activeBlocks(nullptr)
	{
	}

	void ProfilerCPU::ThreadInfo::begin(const char* _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock(_name);

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		if (activeBlocks == nullptr)
			activeBlocks = frameAlloc.alloc<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>>
				(StdFrameAlloc<ActiveBlock>(&frameAlloc));

		activeBlocks->push(activeBlock);

		rootBlock->basic.beginSample();
		isActive = true;
	}

	void ProfilerCPU::ThreadInfo::end()
	{
		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks->pop();

		if(!isActive)
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

		if (activeBlocks->size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			while (activeBlocks->size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks->top();
				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks->pop();
			}
		}

		isActive = false;
		activeBlock = ActiveBlock();

		frameAlloc.dealloc(activeBlocks);
		activeBlocks = nullptr;
	}

	void ProfilerCPU::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;
		frameAlloc.clear(); // Note: This never actually frees memory
	}
	ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name)
	{
		ProfiledBlock* block = frameAlloc.alloc<ProfiledBlock>(&frameAlloc);
		block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char));
		strcpy(block->name, name);

		return block;
	}

	void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block)
	{
		frameAlloc.dealloc((UINT8*)block->name);
		frameAlloc.dealloc(block);
	}

	ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc)
		:basic(alloc), precise(alloc), children(alloc)
	{ }

	ProfilerCPU::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}
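
	// Linear search through this block's direct children. This assumes each block
	// has a modest number of children; a lookup table would scale better for very
	// wide hierarchies.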
	ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const
	{
		for(auto& child : children)
		{
			if(strcmp(child->name, name) == 0)
				return child;
		}

		return nullptr;
	}

	ProfilerCPU::ProfilerCPU()
		: mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadMs(0.0)
		, mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadCycles(0)
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is
		// called, and keep separate values per thread.
		estimateTimerOverhead();
	}

	ProfilerCPU::~ProfilerCPU()
	{
		reset();

		Lock lock(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo);
	}

	void ProfilerCPU::beginThread(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
			thread = ThreadInfo::activeThread;

			{
				Lock lock(mThreadSync);
				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}

	void ProfilerCPU::endThread()
	{
		// Intentionally no null check here, so endSample can be called as soon as possible
		ThreadInfo::activeThread->end();
	}

	void ProfilerCPU::beginSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread;
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->basic.beginSample();
	}

	void ProfilerCPU::endSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
			return;
		}

		if(strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}
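
	// Example usage (illustrative sketch): samples must be opened and closed in
	// matching pairs on the same thread, and nest to form the reported hierarchy.
	//
	//   gProfilerCPU().beginThread("Worker");
	//   gProfilerCPU().beginSample("Physics");
	//   // ... code being measured ...
	//   gProfilerCPU().endSample("Physics");
	//   gProfilerCPU().endThread();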

	void ProfilerCPU::beginSamplePrecise(const char* name)
	{
		// Note: There is a (small) possibility a context switch will happen during this measurement, in which case the
		// result will be skewed. Increasing thread priority might help. This is generally only a problem with code that
		// executes for a long time (10-15+ ms, depending on the OS quantum length).
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // Re-fetch, since beginThread may have just created the thread info
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->precise.beginSample();
	}

	void ProfilerCPU::endSamplePrecise(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
			return;
		}

		if (strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}

	void ProfilerCPU::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}

	CPUProfilerReport ProfilerCPU::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies
		if(thread->rootBlock == nullptr)
			return report;

		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			ProfilerVector<UINT32> childIndexes;
		};

		ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
		ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;

		// Fill the flatHierarchy array so that every child appears after its parent
		ProfilerStack<UINT32> todo;
		ProfilerVector<TempEntry> flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
		entryIdx++;

		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;

			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));
				entryIdx++;
			}
		}

		// Calculate sampling data for all entries
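		// (Children were appended after their parents, so iterating flatHierarchy in
		// reverse visits children first, letting child totals fold into parent entries
		// in a single pass.)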
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = String(curBlock->name);
			entryBasic->data.memAllocs = 0;
			entryBasic->data.memFrees = 0;
			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;

			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
				entryBasic->data.memAllocs += sample.numAllocs;
				entryBasic->data.memFrees += sample.numFrees;
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;

			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];

				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = String(curBlock->name);
			entryPrecise->data.memAllocs = 0;
			entryPrecise->data.memFrees = 0;
			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;

			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
				entryPrecise->data.memAllocs += sample.numAllocs;
				entryPrecise->data.memFrees += sample.numFrees;
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;

			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];

				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}

		// Prune empty basic entries
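		// (Entries with zero calls of this sampling type are dropped; their children
		// are re-parented to the nearest retained ancestor so the hierarchy stays
		// connected.)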
		ProfilerStack<UINT32> finalBasicHierarchyTodo;
		ProfilerStack<UINT32> parentBasicEntryIndexes;
		ProfilerVector<TempEntry> newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];

				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newBasicEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;

			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

			finalBasicHierarchyTodo.push(0);

			while(!finalBasicHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
				finalBasicHierarchyTodo.pop();

				TempEntry& curEntry = newBasicEntries[curEntryIdx];

				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

				basicEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newBasicEntries[childIdx];
					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
					finalBasicHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		// Prune empty precise entries
		ProfilerStack<UINT32> finalPreciseHierarchyTodo;
		ProfilerStack<UINT32> parentPreciseEntryIndexes;
		ProfilerVector<TempEntry> newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];

				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newPreciseEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;

			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

			finalPreciseHierarchyTodo.push(0);

			while(!finalPreciseHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
				finalPreciseHierarchyTodo.pop();

				TempEntry& curEntry = newPreciseEntries[curEntryIdx];

				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

				preciseEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newPreciseEntries[childIdx];
					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

					// Index by idx (as the basic pass does): back() would alias every child to the last element
					finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
					finalPreciseHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		return report;
	}

	void ProfilerCPU::estimateTimerOverhead()
	{
		// Get an idea of how long timer calls and RDTSC take
		const UINT32 reps = 1000, sampleReps = 20;

		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;
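
		// The minimum average across all tries is kept: scheduler interruptions and
		// cache misses can only inflate a measurement, so the smallest observed value
		// is the closest to the true overhead.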
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time)/double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles/reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}

		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;

		for (UINT32 tries = 0; tries < 3; tries++)
		{
			/************************************************************************/
			/* 				AVERAGE TIME IN MS FOR BASIC SAMPLING					*/
			/************************************************************************/

			Timer timerA;
			timerA.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists, and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerA.stop();

			reset();

			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;

			/************************************************************************/
			/* 				AVERAGE CYCLES FOR BASIC SAMPLING						*/
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists, and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerPreciseA.stop();

			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;

			/************************************************************************/
			/* 				AVERAGE TIME IN MS FOR PRECISE SAMPLING					*/
			/************************************************************************/

			Timer timerB;
			timerB.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists, and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerB.stop();

			reset();

			double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimesPrecise;

			/************************************************************************/
			/* 				AVERAGE CYCLES FOR PRECISE SAMPLING						*/
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists, and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerPreciseB.stop();

			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}

	CPUProfilerBasicSamplingEntry::Data::Data()
		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
		estimatedOverheadMs(0.0), pctOfParent(1.0f)
	{ }

	CPUProfilerPreciseSamplingEntry::Data::Data()
		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
		estimatedOverhead(0), pctOfParent(1.0f)
	{ }

	CPUProfilerReport::CPUProfilerReport()
	{ }

	ProfilerCPU& gProfilerCPU()
	{
		return ProfilerCPU::instance();
	}
}