BlockFrequencyInfoImpl.h 45 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207
  1. //==- BlockFrequencyInfoImpl.h - Block Frequency Implementation -*- C++ -*-===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // Shared implementation of BlockFrequency for IR and Machine Instructions.
  11. // See the documentation below for BlockFrequencyInfoImpl for details.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  15. #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  16. #include "llvm/ADT/DenseMap.h"
  17. #include "llvm/ADT/PostOrderIterator.h"
  18. #include "llvm/ADT/iterator_range.h"
  19. #include "llvm/IR/BasicBlock.h"
  20. #include "llvm/Support/BlockFrequency.h"
  21. #include "llvm/Support/BranchProbability.h"
  22. #include "llvm/Support/Debug.h"
  23. #include "llvm/Support/ScaledNumber.h"
  24. #include "llvm/Support/raw_ostream.h"
  25. #include <deque>
  26. #include <list>
  27. #include <string>
  28. #include <vector>
  29. #define DEBUG_TYPE "block-freq"
  30. namespace llvm {
  31. class BasicBlock;
  32. class BranchProbabilityInfo;
  33. class Function;
  34. class Loop;
  35. class LoopInfo;
  36. class MachineBasicBlock;
  37. class MachineBranchProbabilityInfo;
  38. class MachineFunction;
  39. class MachineLoop;
  40. class MachineLoopInfo;
  41. namespace bfi_detail {
  42. struct IrreducibleGraph;
  43. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  44. template <class BT> struct BlockEdgesAdder;
  45. /// \brief Mass of a block.
  46. ///
  47. /// This class implements a sort of fixed-point fraction always between 0.0 and
  48. /// 1.0. getMass() == UINT64_MAX indicates a value of 1.0.
  49. ///
  50. /// Masses can be added and subtracted. Simple saturation arithmetic is used,
  51. /// so arithmetic operations never overflow or underflow.
  52. ///
  53. /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
  54. /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
  55. /// quite, maximum precision).
  56. ///
  57. /// Masses can be scaled by \a BranchProbability at maximum precision.
  58. class BlockMass {
  59. uint64_t Mass;
  60. public:
  61. BlockMass() : Mass(0) {}
  62. explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
  63. static BlockMass getEmpty() { return BlockMass(); }
  64. static BlockMass getFull() { return BlockMass(UINT64_MAX); }
  65. uint64_t getMass() const { return Mass; }
  66. bool isFull() const { return Mass == UINT64_MAX; }
  67. bool isEmpty() const { return !Mass; }
  68. bool operator!() const { return isEmpty(); }
  69. /// \brief Add another mass.
  70. ///
  71. /// Adds another mass, saturating at \a isFull() rather than overflowing.
  72. BlockMass &operator+=(const BlockMass &X) {
  73. uint64_t Sum = Mass + X.Mass;
  74. Mass = Sum < Mass ? UINT64_MAX : Sum;
  75. return *this;
  76. }
  77. /// \brief Subtract another mass.
  78. ///
  79. /// Subtracts another mass, saturating at \a isEmpty() rather than
  80. /// undeflowing.
  81. BlockMass &operator-=(const BlockMass &X) {
  82. uint64_t Diff = Mass - X.Mass;
  83. Mass = Diff > Mass ? 0 : Diff;
  84. return *this;
  85. }
  86. BlockMass &operator*=(const BranchProbability &P) {
  87. Mass = P.scale(Mass);
  88. return *this;
  89. }
  90. bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
  91. bool operator!=(const BlockMass &X) const { return Mass != X.Mass; }
  92. bool operator<=(const BlockMass &X) const { return Mass <= X.Mass; }
  93. bool operator>=(const BlockMass &X) const { return Mass >= X.Mass; }
  94. bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
  95. bool operator>(const BlockMass &X) const { return Mass > X.Mass; }
  96. /// \brief Convert to scaled number.
  97. ///
  98. /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
  99. /// gives slightly above 0.0.
  100. ScaledNumber<uint64_t> toScaled() const;
  101. void dump() const;
  102. raw_ostream &print(raw_ostream &OS) const;
  103. };
  104. inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
  105. return BlockMass(L) += R;
  106. }
  107. inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
  108. return BlockMass(L) -= R;
  109. }
  110. inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
  111. return BlockMass(L) *= R;
  112. }
  113. inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
  114. return BlockMass(R) *= L;
  115. }
  116. inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
  117. return X.print(OS);
  118. }
  119. } // end namespace bfi_detail
  120. template <> struct isPodLike<bfi_detail::BlockMass> {
  121. static const bool value = true;
  122. };
  123. /// \brief Base class for BlockFrequencyInfoImpl
  124. ///
  125. /// BlockFrequencyInfoImplBase has supporting data structures and some
  126. /// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on
  127. /// the block type (or that call such algorithms) are skipped here.
  128. ///
  129. /// Nevertheless, the majority of the overall algorithm documention lives with
  130. /// BlockFrequencyInfoImpl. See there for details.
  131. class BlockFrequencyInfoImplBase {
  132. public:
  133. typedef ScaledNumber<uint64_t> Scaled64;
  134. typedef bfi_detail::BlockMass BlockMass;
  135. /// \brief Representative of a block.
  136. ///
  137. /// This is a simple wrapper around an index into the reverse-post-order
  138. /// traversal of the blocks.
  139. ///
  140. /// Unlike a block pointer, its order has meaning (location in the
  141. /// topological sort) and it's class is the same regardless of block type.
  142. struct BlockNode {
  143. typedef uint32_t IndexType;
  144. IndexType Index;
  145. bool operator==(const BlockNode &X) const { return Index == X.Index; }
  146. bool operator!=(const BlockNode &X) const { return Index != X.Index; }
  147. bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
  148. bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
  149. bool operator<(const BlockNode &X) const { return Index < X.Index; }
  150. bool operator>(const BlockNode &X) const { return Index > X.Index; }
  151. BlockNode() : Index(UINT32_MAX) {}
  152. BlockNode(IndexType Index) : Index(Index) {}
  153. bool isValid() const { return Index <= getMaxIndex(); }
  154. static size_t getMaxIndex() { return UINT32_MAX - 1; }
  155. };
  156. /// \brief Stats about a block itself.
  157. struct FrequencyData {
  158. Scaled64 Scaled;
  159. uint64_t Integer;
  160. };
  161. /// \brief Data about a loop.
  162. ///
  163. /// Contains the data necessary to represent a loop as a pseudo-node once it's
  164. /// packaged.
  165. struct LoopData {
  166. typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
  167. typedef SmallVector<BlockNode, 4> NodeList;
  168. typedef SmallVector<BlockMass, 1> HeaderMassList;
  169. LoopData *Parent; ///< The parent loop.
  170. bool IsPackaged; ///< Whether this has been packaged.
  171. uint32_t NumHeaders; ///< Number of headers.
  172. ExitMap Exits; ///< Successor edges (and weights).
  173. NodeList Nodes; ///< Header and the members of the loop.
  174. HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
  175. BlockMass Mass;
  176. Scaled64 Scale;
  177. LoopData(LoopData *Parent, const BlockNode &Header)
  178. : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header),
  179. BackedgeMass(1) {}
  180. template <class It1, class It2>
  181. LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
  182. It2 LastOther)
  183. : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) {
  184. NumHeaders = Nodes.size();
  185. Nodes.insert(Nodes.end(), FirstOther, LastOther);
  186. BackedgeMass.resize(NumHeaders);
  187. }
  188. bool isHeader(const BlockNode &Node) const {
  189. if (isIrreducible())
  190. return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
  191. Node);
  192. return Node == Nodes[0];
  193. }
  194. BlockNode getHeader() const { return Nodes[0]; }
  195. bool isIrreducible() const { return NumHeaders > 1; }
  196. HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) {
  197. assert(isHeader(B) && "this is only valid on loop header blocks");
  198. if (isIrreducible())
  199. return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
  200. Nodes.begin();
  201. return 0;
  202. }
  203. NodeList::const_iterator members_begin() const {
  204. return Nodes.begin() + NumHeaders;
  205. }
  206. NodeList::const_iterator members_end() const { return Nodes.end(); }
  207. iterator_range<NodeList::const_iterator> members() const {
  208. return make_range(members_begin(), members_end());
  209. }
  210. };
  211. /// \brief Index of loop information.
  212. struct WorkingData {
  213. BlockNode Node; ///< This node.
  214. LoopData *Loop; ///< The loop this block is inside.
  215. BlockMass Mass; ///< Mass distribution from the entry block.
  216. WorkingData(const BlockNode &Node) : Node(Node), Loop(nullptr) {}
  217. bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
  218. bool isDoubleLoopHeader() const {
  219. return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
  220. Loop->Parent->isHeader(Node);
  221. }
  222. LoopData *getContainingLoop() const {
  223. if (!isLoopHeader())
  224. return Loop;
  225. if (!isDoubleLoopHeader())
  226. return Loop->Parent;
  227. return Loop->Parent->Parent;
  228. }
  229. /// \brief Resolve a node to its representative.
  230. ///
  231. /// Get the node currently representing Node, which could be a containing
  232. /// loop.
  233. ///
  234. /// This function should only be called when distributing mass. As long as
  235. /// there are no irreducible edges to Node, then it will have complexity
  236. /// O(1) in this context.
  237. ///
  238. /// In general, the complexity is O(L), where L is the number of loop
  239. /// headers Node has been packaged into. Since this method is called in
  240. /// the context of distributing mass, L will be the number of loop headers
  241. /// an early exit edge jumps out of.
  242. BlockNode getResolvedNode() const {
  243. auto L = getPackagedLoop();
  244. return L ? L->getHeader() : Node;
  245. }
  246. LoopData *getPackagedLoop() const {
  247. if (!Loop || !Loop->IsPackaged)
  248. return nullptr;
  249. auto L = Loop;
  250. while (L->Parent && L->Parent->IsPackaged)
  251. L = L->Parent;
  252. return L;
  253. }
  254. /// \brief Get the appropriate mass for a node.
  255. ///
  256. /// Get appropriate mass for Node. If Node is a loop-header (whose loop
  257. /// has been packaged), returns the mass of its pseudo-node. If it's a
  258. /// node inside a packaged loop, it returns the loop's mass.
  259. BlockMass &getMass() {
  260. if (!isAPackage())
  261. return Mass;
  262. if (!isADoublePackage())
  263. return Loop->Mass;
  264. return Loop->Parent->Mass;
  265. }
  266. /// \brief Has ContainingLoop been packaged up?
  267. bool isPackaged() const { return getResolvedNode() != Node; }
  268. /// \brief Has Loop been packaged up?
  269. bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
  270. /// \brief Has Loop been packaged up twice?
  271. bool isADoublePackage() const {
  272. return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
  273. }
  274. };
  275. /// \brief Unscaled probability weight.
  276. ///
  277. /// Probability weight for an edge in the graph (including the
  278. /// successor/target node).
  279. ///
  280. /// All edges in the original function are 32-bit. However, exit edges from
  281. /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
  282. /// space in general.
  283. ///
  284. /// In addition to the raw weight amount, Weight stores the type of the edge
  285. /// in the current context (i.e., the context of the loop being processed).
  286. /// Is this a local edge within the loop, an exit from the loop, or a
  287. /// backedge to the loop header?
  288. struct Weight {
  289. enum DistType { Local, Exit, Backedge };
  290. DistType Type;
  291. BlockNode TargetNode;
  292. uint64_t Amount;
  293. Weight() : Type(Local), Amount(0) {}
  294. Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
  295. : Type(Type), TargetNode(TargetNode), Amount(Amount) {}
  296. };
  297. /// \brief Distribution of unscaled probability weight.
  298. ///
  299. /// Distribution of unscaled probability weight to a set of successors.
  300. ///
  301. /// This class collates the successor edge weights for later processing.
  302. ///
  303. /// \a DidOverflow indicates whether \a Total did overflow while adding to
  304. /// the distribution. It should never overflow twice.
  305. struct Distribution {
  306. typedef SmallVector<Weight, 4> WeightList;
  307. WeightList Weights; ///< Individual successor weights.
  308. uint64_t Total; ///< Sum of all weights.
  309. bool DidOverflow; ///< Whether \a Total did overflow.
  310. Distribution() : Total(0), DidOverflow(false) {}
  311. void addLocal(const BlockNode &Node, uint64_t Amount) {
  312. add(Node, Amount, Weight::Local);
  313. }
  314. void addExit(const BlockNode &Node, uint64_t Amount) {
  315. add(Node, Amount, Weight::Exit);
  316. }
  317. void addBackedge(const BlockNode &Node, uint64_t Amount) {
  318. add(Node, Amount, Weight::Backedge);
  319. }
  320. /// \brief Normalize the distribution.
  321. ///
  322. /// Combines multiple edges to the same \a Weight::TargetNode and scales
  323. /// down so that \a Total fits into 32-bits.
  324. ///
  325. /// This is linear in the size of \a Weights. For the vast majority of
  326. /// cases, adjacent edge weights are combined by sorting WeightList and
  327. /// combining adjacent weights. However, for very large edge lists an
  328. /// auxiliary hash table is used.
  329. void normalize();
  330. private:
  331. void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
  332. };
  333. /// \brief Data about each block. This is used downstream.
  334. std::vector<FrequencyData> Freqs;
  335. /// \brief Loop data: see initializeLoops().
  336. std::vector<WorkingData> Working;
  337. /// \brief Indexed information about loops.
  338. std::list<LoopData> Loops;
  339. /// \brief Add all edges out of a packaged loop to the distribution.
  340. ///
  341. /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
  342. /// successor edge.
  343. ///
  344. /// \return \c true unless there's an irreducible backedge.
  345. bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
  346. Distribution &Dist);
  347. /// \brief Add an edge to the distribution.
  348. ///
  349. /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
  350. /// edge is local/exit/backedge is in the context of LoopHead. Otherwise,
  351. /// every edge should be a local edge (since all the loops are packaged up).
  352. ///
  353. /// \return \c true unless aborted due to an irreducible backedge.
  354. bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
  355. const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
  356. LoopData &getLoopPackage(const BlockNode &Head) {
  357. assert(Head.Index < Working.size());
  358. assert(Working[Head.Index].isLoopHeader());
  359. return *Working[Head.Index].Loop;
  360. }
  361. /// \brief Analyze irreducible SCCs.
  362. ///
  363. /// Separate irreducible SCCs from \c G, which is an explict graph of \c
  364. /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
  365. /// Insert them into \a Loops before \c Insert.
  366. ///
  367. /// \return the \c LoopData nodes representing the irreducible SCCs.
  368. iterator_range<std::list<LoopData>::iterator>
  369. analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
  370. std::list<LoopData>::iterator Insert);
  371. /// \brief Update a loop after packaging irreducible SCCs inside of it.
  372. ///
  373. /// Update \c OuterLoop. Before finding irreducible control flow, it was
  374. /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
  375. /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged
  376. /// up need to be removed from \a OuterLoop::Nodes.
  377. void updateLoopWithIrreducible(LoopData &OuterLoop);
  378. /// \brief Distribute mass according to a distribution.
  379. ///
  380. /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
  381. /// backedges and exits are stored in its entry in Loops.
  382. ///
  383. /// Mass is distributed in parallel from two copies of the source mass.
  384. void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
  385. Distribution &Dist);
  386. /// \brief Compute the loop scale for a loop.
  387. void computeLoopScale(LoopData &Loop);
  388. /// Adjust the mass of all headers in an irreducible loop.
  389. ///
  390. /// Initially, irreducible loops are assumed to distribute their mass
  391. /// equally among its headers. This can lead to wrong frequency estimates
  392. /// since some headers may be executed more frequently than others.
  393. ///
  394. /// This adjusts header mass distribution so it matches the weights of
  395. /// the backedges going into each of the loop headers.
  396. void adjustLoopHeaderMass(LoopData &Loop);
  397. /// \brief Package up a loop.
  398. void packageLoop(LoopData &Loop);
  399. /// \brief Unwrap loops.
  400. void unwrapLoops();
  401. /// \brief Finalize frequency metrics.
  402. ///
  403. /// Calculates final frequencies and cleans up no-longer-needed data
  404. /// structures.
  405. void finalizeMetrics();
  406. /// \brief Clear all memory.
  407. void clear();
  408. virtual std::string getBlockName(const BlockNode &Node) const;
  409. std::string getLoopName(const LoopData &Loop) const;
  410. virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
  411. void dump() const { print(dbgs()); }
  412. Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
  413. BlockFrequency getBlockFreq(const BlockNode &Node) const;
  414. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
  415. raw_ostream &printBlockFreq(raw_ostream &OS,
  416. const BlockFrequency &Freq) const;
  417. uint64_t getEntryFreq() const {
  418. assert(!Freqs.empty());
  419. return Freqs[0].Integer;
  420. }
  421. /// \brief Virtual destructor.
  422. ///
  423. /// Need a virtual destructor to mask the compiler warning about
  424. /// getBlockName().
  425. virtual ~BlockFrequencyInfoImplBase() {}
  426. };
  427. namespace bfi_detail {
  428. template <class BlockT> struct TypeMap {};
  429. template <> struct TypeMap<BasicBlock> {
  430. typedef BasicBlock BlockT;
  431. typedef Function FunctionT;
  432. typedef BranchProbabilityInfo BranchProbabilityInfoT;
  433. typedef Loop LoopT;
  434. typedef LoopInfo LoopInfoT;
  435. };
  436. template <> struct TypeMap<MachineBasicBlock> {
  437. typedef MachineBasicBlock BlockT;
  438. typedef MachineFunction FunctionT;
  439. typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
  440. typedef MachineLoop LoopT;
  441. typedef MachineLoopInfo LoopInfoT;
  442. };
  443. /// \brief Get the name of a MachineBasicBlock.
  444. ///
  445. /// Get the name of a MachineBasicBlock. It's templated so that including from
  446. /// CodeGen is unnecessary (that would be a layering issue).
  447. ///
  448. /// This is used mainly for debug output. The name is similar to
  449. /// MachineBasicBlock::getFullName(), but skips the name of the function.
  450. template <class BlockT> std::string getBlockName(const BlockT *BB) {
  451. assert(BB && "Unexpected nullptr");
  452. auto MachineName = "BB" + Twine(BB->getNumber());
  453. if (BB->getBasicBlock())
  454. return (MachineName + "[" + BB->getName() + "]").str();
  455. return MachineName.str();
  456. }
  457. /// \brief Get the name of a BasicBlock.
  458. template <> inline std::string getBlockName(const BasicBlock *BB) {
  459. assert(BB && "Unexpected nullptr");
  460. return BB->getName().str();
  461. }
  462. /// \brief Graph of irreducible control flow.
  463. ///
  464. /// This graph is used for determining the SCCs in a loop (or top-level
  465. /// function) that has irreducible control flow.
  466. ///
  467. /// During the block frequency algorithm, the local graphs are defined in a
  468. /// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
  469. /// graphs for most edges, but getting others from \a LoopData::ExitMap. The
  470. /// latter only has successor information.
  471. ///
  472. /// \a IrreducibleGraph makes this graph explicit. It's in a form that can use
  473. /// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
  474. /// and it explicitly lists predecessors and successors. The initialization
  475. /// that relies on \c MachineBasicBlock is defined in the header.
  476. struct IrreducibleGraph {
  477. typedef BlockFrequencyInfoImplBase BFIBase;
  478. BFIBase &BFI;
  479. typedef BFIBase::BlockNode BlockNode;
  480. struct IrrNode {
  481. BlockNode Node;
  482. unsigned NumIn;
  483. std::deque<const IrrNode *> Edges;
  484. IrrNode(const BlockNode &Node) : Node(Node), NumIn(0) {}
  485. typedef std::deque<const IrrNode *>::const_iterator iterator;
  486. iterator pred_begin() const { return Edges.begin(); }
  487. iterator succ_begin() const { return Edges.begin() + NumIn; }
  488. iterator pred_end() const { return succ_begin(); }
  489. iterator succ_end() const { return Edges.end(); }
  490. };
  491. BlockNode Start;
  492. const IrrNode *StartIrr;
  493. std::vector<IrrNode> Nodes;
  494. SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
  495. /// \brief Construct an explicit graph containing irreducible control flow.
  496. ///
  497. /// Construct an explicit graph of the control flow in \c OuterLoop (or the
  498. /// top-level function, if \c OuterLoop is \c nullptr). Uses \c
  499. /// addBlockEdges to add block successors that have not been packaged into
  500. /// loops.
  501. ///
  502. /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
  503. /// user of this.
  504. template <class BlockEdgesAdder>
  505. IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
  506. BlockEdgesAdder addBlockEdges)
  507. : BFI(BFI), StartIrr(nullptr) {
  508. initialize(OuterLoop, addBlockEdges);
  509. }
  510. template <class BlockEdgesAdder>
  511. void initialize(const BFIBase::LoopData *OuterLoop,
  512. BlockEdgesAdder addBlockEdges);
  513. void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
  514. void addNodesInFunction();
  515. void addNode(const BlockNode &Node) {
  516. Nodes.emplace_back(Node);
  517. BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
  518. }
  519. void indexNodes();
  520. template <class BlockEdgesAdder>
  521. void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
  522. BlockEdgesAdder addBlockEdges);
  523. void addEdge(IrrNode &Irr, const BlockNode &Succ,
  524. const BFIBase::LoopData *OuterLoop);
  525. };
  526. template <class BlockEdgesAdder>
  527. void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
  528. BlockEdgesAdder addBlockEdges) {
  529. if (OuterLoop) {
  530. addNodesInLoop(*OuterLoop);
  531. for (auto N : OuterLoop->Nodes)
  532. addEdges(N, OuterLoop, addBlockEdges);
  533. } else {
  534. addNodesInFunction();
  535. for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
  536. addEdges(Index, OuterLoop, addBlockEdges);
  537. }
  538. StartIrr = Lookup[Start.Index];
  539. }
  540. template <class BlockEdgesAdder>
  541. void IrreducibleGraph::addEdges(const BlockNode &Node,
  542. const BFIBase::LoopData *OuterLoop,
  543. BlockEdgesAdder addBlockEdges) {
  544. auto L = Lookup.find(Node.Index);
  545. if (L == Lookup.end())
  546. return;
  547. IrrNode &Irr = *L->second;
  548. const auto &Working = BFI.Working[Node.Index];
  549. if (Working.isAPackage())
  550. for (const auto &I : Working.Loop->Exits)
  551. addEdge(Irr, I.first, OuterLoop);
  552. else
  553. addBlockEdges(*this, Irr, OuterLoop);
  554. }
  555. }
  556. /// \brief Shared implementation for block frequency analysis.
  557. ///
  558. /// This is a shared implementation of BlockFrequencyInfo and
  559. /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
  560. /// blocks.
  561. ///
  562. /// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
  563. /// which is called the header. A given loop, L, can have sub-loops, which are
  564. /// loops within the subgraph of L that exclude its header. (A "trivial" SCC
  565. /// consists of a single block that does not have a self-edge.)
  566. ///
  567. /// In addition to loops, this algorithm has limited support for irreducible
  568. /// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are
  569. /// discovered on they fly, and modelled as loops with multiple headers.
  570. ///
  571. /// The headers of irreducible sub-SCCs consist of its entry blocks and all
  572. /// nodes that are targets of a backedge within it (excluding backedges within
  573. /// true sub-loops). Block frequency calculations act as if a block is
  574. /// inserted that intercepts all the edges to the headers. All backedges and
  575. /// entries point to this block. Its successors are the headers, which split
  576. /// the frequency evenly.
  577. ///
  578. /// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
  579. /// separates mass distribution from loop scaling, and dithers to eliminate
  580. /// probability mass loss.
  581. ///
  582. /// The implementation is split between BlockFrequencyInfoImpl, which knows the
  583. /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
  584. /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
  585. /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
  586. /// reverse-post order. This gives two advantages: it's easy to compare the
  587. /// relative ordering of two nodes, and maps keyed on BlockT can be represented
  588. /// by vectors.
  589. ///
  590. /// This algorithm is O(V+E), unless there is irreducible control flow, in
  591. /// which case it's O(V*E) in the worst case.
  592. ///
  593. /// These are the main stages:
  594. ///
  595. /// 0. Reverse post-order traversal (\a initializeRPOT()).
  596. ///
  597. /// Run a single post-order traversal and save it (in reverse) in RPOT.
  598. /// All other stages make use of this ordering. Save a lookup from BlockT
  599. /// to BlockNode (the index into RPOT) in Nodes.
  600. ///
  601. /// 1. Loop initialization (\a initializeLoops()).
  602. ///
  603. /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
  604. /// the algorithm. In particular, store the immediate members of each loop
  605. /// in reverse post-order.
  606. ///
  607. /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
  608. ///
  609. /// For each loop (bottom-up), distribute mass through the DAG resulting
  610. /// from ignoring backedges and treating sub-loops as a single pseudo-node.
  611. /// Track the backedge mass distributed to the loop header, and use it to
  612. /// calculate the loop scale (number of loop iterations). Immediate
  613. /// members that represent sub-loops will already have been visited and
  614. /// packaged into a pseudo-node.
  615. ///
  616. /// Distributing mass in a loop is a reverse-post-order traversal through
  617. /// the loop. Start by assigning full mass to the Loop header. For each
  618. /// node in the loop:
  619. ///
  620. /// - Fetch and categorize the weight distribution for its successors.
  621. /// If this is a packaged-subloop, the weight distribution is stored
  622. /// in \a LoopData::Exits. Otherwise, fetch it from
  623. /// BranchProbabilityInfo.
  624. ///
  625. /// - Each successor is categorized as \a Weight::Local, a local edge
  626. /// within the current loop, \a Weight::Backedge, a backedge to the
  627. /// loop header, or \a Weight::Exit, any successor outside the loop.
  628. /// The weight, the successor, and its category are stored in \a
  629. /// Distribution. There can be multiple edges to each successor.
  630. ///
  631. /// - If there's a backedge to a non-header, there's an irreducible SCC.
  632. /// The usual flow is temporarily aborted. \a
  633. /// computeIrreducibleMass() finds the irreducible SCCs within the
  634. /// loop, packages them up, and restarts the flow.
  635. ///
  636. /// - Normalize the distribution: scale weights down so that their sum
  637. /// is 32-bits, and coalesce multiple edges to the same node.
  638. ///
  639. /// - Distribute the mass accordingly, dithering to minimize mass loss,
  640. /// as described in \a distributeMass().
  641. ///
  642. /// In the case of irreducible loops, instead of a single loop header,
  643. /// there will be several. The computation of backedge masses is similar
  644. /// but instead of having a single backedge mass, there will be one
  645. /// backedge per loop header. In these cases, each backedge will carry
  646. /// a mass proportional to the edge weights along the corresponding
  647. /// path.
  648. ///
  649. /// At the end of propagation, the full mass assigned to the loop will be
  650. /// distributed among the loop headers proportionally according to the
  651. /// mass flowing through their backedges.
  652. ///
  653. /// Finally, calculate the loop scale from the accumulated backedge mass.
  654. ///
  655. /// 3. Distribute mass in the function (\a computeMassInFunction()).
  656. ///
  657. /// Finally, distribute mass through the DAG resulting from packaging all
  658. /// loops in the function. This uses the same algorithm as distributing
  659. /// mass in a loop, except that there are no exit or backedge edges.
  660. ///
  661. /// 4. Unpackage loops (\a unwrapLoops()).
  662. ///
  663. /// Initialize each block's frequency to a floating point representation of
  664. /// its mass.
  665. ///
  666. /// Visit loops top-down, scaling the frequencies of its immediate members
  667. /// by the loop's pseudo-node's frequency.
  668. ///
  669. /// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
  670. ///
  671. /// Using the min and max frequencies as a guide, translate floating point
  672. /// frequencies to an appropriate range in uint64_t.
  673. ///
  674. /// It has some known flaws.
  675. ///
  676. /// - The model of irreducible control flow is a rough approximation.
  677. ///
  678. /// Modelling irreducible control flow exactly involves setting up and
  679. /// solving a group of infinite geometric series. Such precision is
  680. /// unlikely to be worthwhile, since most of our algorithms give up on
  681. /// irreducible control flow anyway.
  682. ///
  683. /// Nevertheless, we might find that we need to get closer. Here's a sort
  684. /// of TODO list for the model with diminishing returns, to be completed as
  685. /// necessary.
  686. ///
  687. /// - The headers for the \a LoopData representing an irreducible SCC
  688. /// include non-entry blocks. When these extra blocks exist, they
  689. /// indicate a self-contained irreducible sub-SCC. We could treat them
  690. /// as sub-loops, rather than arbitrarily shoving the problematic
  691. /// blocks into the headers of the main irreducible SCC.
  692. ///
  693. /// - Entry frequencies are assumed to be evenly split between the
  694. /// headers of a given irreducible SCC, which is the only option if we
  695. /// need to compute mass in the SCC before its parent loop. Instead,
  696. /// we could partially compute mass in the parent loop, and stop when
  697. /// we get to the SCC. Here, we have the correct ratio of entry
  698. /// masses, which we can use to adjust their relative frequencies.
  699. /// Compute mass in the SCC, and then continue propagation in the
  700. /// parent.
  701. ///
  702. /// - We can propagate mass iteratively through the SCC, for some fixed
  703. /// number of iterations. Each iteration starts by assigning the entry
  704. /// blocks their backedge mass from the prior iteration. The final
  705. /// mass for each block (and each exit, and the total backedge mass
  706. /// used for computing loop scale) is the sum of all iterations.
  707. /// (Running this until fixed point would "solve" the geometric
  708. /// series by simulation.)
  709. template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
  710. typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
  711. typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
  712. typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
  713. BranchProbabilityInfoT;
  714. typedef typename bfi_detail::TypeMap<BT>::LoopT LoopT;
  715. typedef typename bfi_detail::TypeMap<BT>::LoopInfoT LoopInfoT;
  716. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  717. friend struct bfi_detail::BlockEdgesAdder<BT>;
  718. typedef GraphTraits<const BlockT *> Successor;
  719. typedef GraphTraits<Inverse<const BlockT *>> Predecessor;
  720. const BranchProbabilityInfoT *BPI;
  721. const LoopInfoT *LI;
  722. const FunctionT *F;
  723. // All blocks in reverse postorder.
  724. std::vector<const BlockT *> RPOT;
  725. DenseMap<const BlockT *, BlockNode> Nodes;
  726. typedef typename std::vector<const BlockT *>::const_iterator rpot_iterator;
  727. rpot_iterator rpot_begin() const { return RPOT.begin(); }
  728. rpot_iterator rpot_end() const { return RPOT.end(); }
  729. size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
  730. BlockNode getNode(const rpot_iterator &I) const {
  731. return BlockNode(getIndex(I));
  732. }
  733. BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
  734. const BlockT *getBlock(const BlockNode &Node) const {
  735. assert(Node.Index < RPOT.size());
  736. return RPOT[Node.Index];
  737. }
  738. /// \brief Run (and save) a post-order traversal.
  739. ///
  740. /// Saves a reverse post-order traversal of all the nodes in \a F.
  741. void initializeRPOT();
  742. /// \brief Initialize loop data.
  743. ///
  744. /// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
  745. /// each block to the deepest loop it's in, but we need the inverse. For each
  746. /// loop, we store in reverse post-order its "immediate" members, defined as
  747. /// the header, the headers of immediate sub-loops, and all other blocks in
  748. /// the loop that are not in sub-loops.
  749. void initializeLoops();
  750. /// \brief Propagate to a block's successors.
  751. ///
  752. /// In the context of distributing mass through \c OuterLoop, divide the mass
  753. /// currently assigned to \c Node between its successors.
  754. ///
  755. /// \return \c true unless there's an irreducible backedge.
  756. bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
  757. /// \brief Compute mass in a particular loop.
  758. ///
  759. /// Assign mass to \c Loop's header, and then for each block in \c Loop in
  760. /// reverse post-order, distribute mass to its successors. Only visits nodes
  761. /// that have not been packaged into sub-loops.
  762. ///
  763. /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
  764. /// \return \c true unless there's an irreducible backedge.
  765. bool computeMassInLoop(LoopData &Loop);
  766. /// \brief Try to compute mass in the top-level function.
  767. ///
  768. /// Assign mass to the entry block, and then for each block in reverse
  769. /// post-order, distribute mass to its successors. Skips nodes that have
  770. /// been packaged into loops.
  771. ///
  772. /// \pre \a computeMassInLoops() has been called.
  773. /// \return \c true unless there's an irreducible backedge.
  774. bool tryToComputeMassInFunction();
  775. /// \brief Compute mass in (and package up) irreducible SCCs.
  776. ///
  777. /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
  778. /// of \c Insert), and call \a computeMassInLoop() on each of them.
  779. ///
  780. /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
  781. ///
  782. /// \pre \a computeMassInLoop() has been called for each subloop of \c
  783. /// OuterLoop.
  784. /// \pre \c Insert points at the last loop successfully processed by \a
  785. /// computeMassInLoop().
  786. /// \pre \c OuterLoop has irreducible SCCs.
  787. void computeIrreducibleMass(LoopData *OuterLoop,
  788. std::list<LoopData>::iterator Insert);
  789. /// \brief Compute mass in all loops.
  790. ///
  791. /// For each loop bottom-up, call \a computeMassInLoop().
  792. ///
  793. /// \a computeMassInLoop() aborts (and returns \c false) on loops that
  794. /// contain a irreducible sub-SCCs. Use \a computeIrreducibleMass() and then
  795. /// re-enter \a computeMassInLoop().
  796. ///
  797. /// \post \a computeMassInLoop() has returned \c true for every loop.
  798. void computeMassInLoops();
  799. /// \brief Compute mass in the top-level function.
  800. ///
  801. /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
  802. /// compute mass in the top-level function.
  803. ///
  804. /// \post \a tryToComputeMassInFunction() has returned \c true.
  805. void computeMassInFunction();
  806. std::string getBlockName(const BlockNode &Node) const override {
  807. return bfi_detail::getBlockName(getBlock(Node));
  808. }
  809. public:
  810. const FunctionT *getFunction() const { return F; }
  811. void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
  812. const LoopInfoT *LI);
  813. BlockFrequencyInfoImpl() : BPI(nullptr), LI(nullptr), F(nullptr) {}
  814. using BlockFrequencyInfoImplBase::getEntryFreq;
  815. BlockFrequency getBlockFreq(const BlockT *BB) const {
  816. return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
  817. }
  818. Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
  819. return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
  820. }
  821. /// \brief Print the frequencies for the current function.
  822. ///
  823. /// Prints the frequencies for the blocks in the current function.
  824. ///
  825. /// Blocks are printed in the natural iteration order of the function, rather
  826. /// than reverse post-order. This provides two advantages: writing -analyze
  827. /// tests is easier (since blocks come out in source order), and even
  828. /// unreachable blocks are printed.
  829. ///
  830. /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
  831. /// we need to override it here.
  832. raw_ostream &print(raw_ostream &OS) const override;
  833. using BlockFrequencyInfoImplBase::dump;
  834. using BlockFrequencyInfoImplBase::printBlockFreq;
  835. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
  836. return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  837. }
  838. };
  839. template <class BT>
  840. void BlockFrequencyInfoImpl<BT>::doFunction(const FunctionT *F,
  841. const BranchProbabilityInfoT *BPI,
  842. const LoopInfoT *LI) {
  843. // Save the parameters.
  844. this->BPI = BPI;
  845. this->LI = LI;
  846. this->F = F;
  847. // Clean up left-over data structures.
  848. BlockFrequencyInfoImplBase::clear();
  849. RPOT.clear();
  850. Nodes.clear();
  851. // Initialize.
  852. DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
  853. << std::string(F->getName().size(), '=') << "\n");
  854. initializeRPOT();
  855. initializeLoops();
  856. // Visit loops in post-order to find the local mass distribution, and then do
  857. // the full function.
  858. computeMassInLoops();
  859. computeMassInFunction();
  860. unwrapLoops();
  861. finalizeMetrics();
  862. }
  863. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
  864. const BlockT *Entry = F->begin();
  865. RPOT.reserve(F->size());
  866. std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
  867. std::reverse(RPOT.begin(), RPOT.end());
  868. assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
  869. "More nodes in function than Block Frequency Info supports");
  870. DEBUG(dbgs() << "reverse-post-order-traversal\n");
  871. for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
  872. BlockNode Node = getNode(I);
  873. DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
  874. Nodes[*I] = Node;
  875. }
  876. Working.reserve(RPOT.size());
  877. for (size_t Index = 0; Index < RPOT.size(); ++Index)
  878. Working.emplace_back(Index);
  879. Freqs.resize(RPOT.size());
  880. }
  881. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
  882. DEBUG(dbgs() << "loop-detection\n");
  883. if (LI->empty())
  884. return;
  885. // Visit loops top down and assign them an index.
  886. std::deque<std::pair<const LoopT *, LoopData *>> Q;
  887. for (const LoopT *L : *LI)
  888. Q.emplace_back(L, nullptr);
  889. while (!Q.empty()) {
  890. const LoopT *Loop = Q.front().first;
  891. LoopData *Parent = Q.front().second;
  892. Q.pop_front();
  893. BlockNode Header = getNode(Loop->getHeader());
  894. assert(Header.isValid());
  895. Loops.emplace_back(Parent, Header);
  896. Working[Header.Index].Loop = &Loops.back();
  897. DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
  898. for (const LoopT *L : *Loop)
  899. Q.emplace_back(L, &Loops.back());
  900. }
  901. // Visit nodes in reverse post-order and add them to their deepest containing
  902. // loop.
  903. for (size_t Index = 0; Index < RPOT.size(); ++Index) {
  904. // Loop headers have already been mostly mapped.
  905. if (Working[Index].isLoopHeader()) {
  906. LoopData *ContainingLoop = Working[Index].getContainingLoop();
  907. if (ContainingLoop)
  908. ContainingLoop->Nodes.push_back(Index);
  909. continue;
  910. }
  911. const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
  912. if (!Loop)
  913. continue;
  914. // Add this node to its containing loop's member list.
  915. BlockNode Header = getNode(Loop->getHeader());
  916. assert(Header.isValid());
  917. const auto &HeaderData = Working[Header.Index];
  918. assert(HeaderData.isLoopHeader());
  919. Working[Index].Loop = HeaderData.Loop;
  920. HeaderData.Loop->Nodes.push_back(Index);
  921. DEBUG(dbgs() << " - loop = " << getBlockName(Header)
  922. << ": member = " << getBlockName(Index) << "\n");
  923. }
  924. }
  925. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
  926. // Visit loops with the deepest first, and the top-level loops last.
  927. for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
  928. if (computeMassInLoop(*L))
  929. continue;
  930. auto Next = std::next(L);
  931. computeIrreducibleMass(&*L, L.base());
  932. L = std::prev(Next);
  933. if (computeMassInLoop(*L))
  934. continue;
  935. llvm_unreachable("unhandled irreducible control flow");
  936. }
  937. }
  938. template <class BT>
  939. bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
  940. // Compute mass in loop.
  941. DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
  942. if (Loop.isIrreducible()) {
  943. BlockMass Remaining = BlockMass::getFull();
  944. for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
  945. auto &Mass = Working[Loop.Nodes[H].Index].getMass();
  946. Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H);
  947. Remaining -= Mass;
  948. }
  949. for (const BlockNode &M : Loop.Nodes)
  950. if (!propagateMassToSuccessors(&Loop, M))
  951. llvm_unreachable("unhandled irreducible control flow");
  952. adjustLoopHeaderMass(Loop);
  953. } else {
  954. Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
  955. if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
  956. llvm_unreachable("irreducible control flow to loop header!?");
  957. for (const BlockNode &M : Loop.members())
  958. if (!propagateMassToSuccessors(&Loop, M))
  959. // Irreducible backedge.
  960. return false;
  961. }
  962. computeLoopScale(Loop);
  963. packageLoop(Loop);
  964. return true;
  965. }
  966. template <class BT>
  967. bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
  968. // Compute mass in function.
  969. DEBUG(dbgs() << "compute-mass-in-function\n");
  970. assert(!Working.empty() && "no blocks in function");
  971. assert(!Working[0].isLoopHeader() && "entry block is a loop header");
  972. Working[0].getMass() = BlockMass::getFull();
  973. for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
  974. // Check for nodes that have been packaged.
  975. BlockNode Node = getNode(I);
  976. if (Working[Node.Index].isPackaged())
  977. continue;
  978. if (!propagateMassToSuccessors(nullptr, Node))
  979. return false;
  980. }
  981. return true;
  982. }
  983. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
  984. if (tryToComputeMassInFunction())
  985. return;
  986. computeIrreducibleMass(nullptr, Loops.begin());
  987. if (tryToComputeMassInFunction())
  988. return;
  989. llvm_unreachable("unhandled irreducible control flow");
  990. }
  991. /// \note This should be a lambda, but that crashes GCC 4.7.
  992. namespace bfi_detail {
  993. template <class BT> struct BlockEdgesAdder {
  994. typedef BT BlockT;
  995. typedef BlockFrequencyInfoImplBase::LoopData LoopData;
  996. typedef GraphTraits<const BlockT *> Successor;
  997. const BlockFrequencyInfoImpl<BT> &BFI;
  998. explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
  999. : BFI(BFI) {}
  1000. void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
  1001. const LoopData *OuterLoop) {
  1002. const BlockT *BB = BFI.RPOT[Irr.Node.Index];
  1003. for (auto I = Successor::child_begin(BB), E = Successor::child_end(BB);
  1004. I != E; ++I)
  1005. G.addEdge(Irr, BFI.getNode(*I), OuterLoop);
  1006. }
  1007. };
  1008. }
  1009. template <class BT>
  1010. void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
  1011. LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
  1012. DEBUG(dbgs() << "analyze-irreducible-in-";
  1013. if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n";
  1014. else dbgs() << "function\n");
  1015. using namespace bfi_detail;
  1016. // Ideally, addBlockEdges() would be declared here as a lambda, but that
  1017. // crashes GCC 4.7.
  1018. BlockEdgesAdder<BT> addBlockEdges(*this);
  1019. IrreducibleGraph G(*this, OuterLoop, addBlockEdges);
  1020. for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
  1021. computeMassInLoop(L);
  1022. if (!OuterLoop)
  1023. return;
  1024. updateLoopWithIrreducible(*OuterLoop);
  1025. }
  1026. template <class BT>
  1027. bool
  1028. BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
  1029. const BlockNode &Node) {
  1030. DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
  1031. // Calculate probability for successors.
  1032. Distribution Dist;
  1033. if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
  1034. assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
  1035. if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
  1036. // Irreducible backedge.
  1037. return false;
  1038. } else {
  1039. const BlockT *BB = getBlock(Node);
  1040. for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
  1041. SI != SE; ++SI)
  1042. // Do not dereference SI, or getEdgeWeight() is linear in the number of
  1043. // successors.
  1044. if (!addToDist(Dist, OuterLoop, Node, getNode(*SI),
  1045. BPI->getEdgeWeight(BB, SI)))
  1046. // Irreducible backedge.
  1047. return false;
  1048. }
  1049. // Distribute mass to successors, saving exit and backedge data in the
  1050. // loop header.
  1051. distributeMass(Node, OuterLoop, Dist);
  1052. return true;
  1053. }
  1054. template <class BT>
  1055. raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
  1056. if (!F)
  1057. return OS;
  1058. OS << "block-frequency-info: " << F->getName() << "\n";
  1059. for (const BlockT &BB : *F)
  1060. OS << " - " << bfi_detail::getBlockName(&BB)
  1061. << ": float = " << getFloatingBlockFreq(&BB)
  1062. << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
  1063. // Add an extra newline for readability.
  1064. OS << "\n";
  1065. return OS;
  1066. }
  1067. } // end namespace llvm
  1068. #undef DEBUG_TYPE
  1069. #endif