LoopUnrollRuntime.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements some loop unrolling utilities for loops with run-time
  11. // trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
  12. // trip counts.
  13. //
  14. // The functions in this file are used to generate extra code when the
  15. // run-time trip count modulo the unroll factor is not 0. When this is the
  16. // case, we need to generate code to execute these 'left over' iterations.
  17. //
  18. // The current strategy generates an if-then-else sequence prior to the
  19. // unrolled loop to execute the 'left over' iterations. Other strategies
  20. // include generate a loop before or after the unrolled loop.
  21. //
  22. //===----------------------------------------------------------------------===//
  23. #include "llvm/Transforms/Utils/UnrollLoop.h"
  24. #include "llvm/ADT/Statistic.h"
  25. #include "llvm/Analysis/AliasAnalysis.h"
  26. #include "llvm/Analysis/LoopIterator.h"
  27. #include "llvm/Analysis/LoopPass.h"
  28. #include "llvm/Analysis/ScalarEvolution.h"
  29. #include "llvm/Analysis/ScalarEvolutionExpander.h"
  30. #include "llvm/IR/BasicBlock.h"
  31. #include "llvm/IR/Dominators.h"
  32. #include "llvm/IR/Metadata.h"
  33. #include "llvm/IR/Module.h"
  34. #include "llvm/Support/Debug.h"
  35. #include "llvm/Support/raw_ostream.h"
  36. #include "llvm/Transforms/Scalar.h"
  37. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  38. #include "llvm/Transforms/Utils/Cloning.h"
  39. #include <algorithm>
  40. using namespace llvm;
  41. #define DEBUG_TYPE "loop-unroll"
  42. STATISTIC(NumRuntimeUnrolled,
  43. "Number of loops unrolled with run-time trip counts");
  44. /// Connect the unrolling prolog code to the original loop.
  45. /// The unrolling prolog code contains code to execute the
  46. /// 'extra' iterations if the run-time trip count modulo the
  47. /// unroll count is non-zero.
  48. ///
  49. /// This function performs the following:
  50. /// - Create PHI nodes at prolog end block to combine values
  51. /// that exit the prolog code and jump around the prolog.
  52. /// - Add a PHI operand to a PHI node at the loop exit block
  53. /// for values that exit the prolog and go around the loop.
  54. /// - Branch around the original loop if the trip count is less
  55. /// than the unroll factor.
  56. ///
  57. static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
  58. BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
  59. BasicBlock *OrigPH, BasicBlock *NewPH,
  60. ValueToValueMapTy &VMap, AliasAnalysis *AA,
  61. DominatorTree *DT, LoopInfo *LI, Pass *P) {
  62. BasicBlock *Latch = L->getLoopLatch();
  63. assert(Latch && "Loop must have a latch");
  64. // Create a PHI node for each outgoing value from the original loop
  65. // (which means it is an outgoing value from the prolog code too).
  66. // The new PHI node is inserted in the prolog end basic block.
  67. // The new PHI name is added as an operand of a PHI node in either
  68. // the loop header or the loop exit block.
  69. for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
  70. SBI != SBE; ++SBI) {
  71. for (BasicBlock::iterator BBI = (*SBI)->begin();
  72. PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
  73. // Add a new PHI node to the prolog end block and add the
  74. // appropriate incoming values.
  75. PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
  76. PrologEnd->getTerminator());
  77. // Adding a value to the new PHI node from the original loop preheader.
  78. // This is the value that skips all the prolog code.
  79. if (L->contains(PN)) {
  80. NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
  81. } else {
  82. NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH);
  83. }
  84. Value *V = PN->getIncomingValueForBlock(Latch);
  85. if (Instruction *I = dyn_cast<Instruction>(V)) {
  86. if (L->contains(I)) {
  87. V = VMap[I];
  88. }
  89. }
  90. // Adding a value to the new PHI node from the last prolog block
  91. // that was created.
  92. NewPN->addIncoming(V, LastPrologBB);
  93. // Update the existing PHI node operand with the value from the
  94. // new PHI node. How this is done depends on if the existing
  95. // PHI node is in the original loop block, or the exit block.
  96. if (L->contains(PN)) {
  97. PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
  98. } else {
  99. PN->addIncoming(NewPN, PrologEnd);
  100. }
  101. }
  102. }
  103. // Create a branch around the orignal loop, which is taken if there are no
  104. // iterations remaining to be executed after running the prologue.
  105. Instruction *InsertPt = PrologEnd->getTerminator();
  106. IRBuilder<> B(InsertPt);
  107. assert(Count != 0 && "nonsensical Count!");
  108. // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1)
  109. // (since Count is a power of 2). This means %xtraiter is (BECount + 1) and
  110. // and all of the iterations of this loop were executed by the prologue. Note
  111. // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow.
  112. Value *BrLoopExit =
  113. B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
  114. BasicBlock *Exit = L->getUniqueExitBlock();
  115. assert(Exit && "Loop must have a single exit block only");
  116. // Split the exit to maintain loop canonicalization guarantees
  117. SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
  118. SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI,
  119. P->mustPreserveAnalysisID(LCSSAID));
  120. // Add the branch to the exit block (around the unrolled loop)
  121. B.CreateCondBr(BrLoopExit, Exit, NewPH);
  122. InsertPt->eraseFromParent();
  123. }
  124. /// Create a clone of the blocks in a loop and connect them together.
  125. /// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
  126. /// loop will be created including all cloned blocks, and the iterator of it
  127. /// switches to count NewIter down to 0.
  128. ///
  129. static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
  130. BasicBlock *InsertTop, BasicBlock *InsertBot,
  131. std::vector<BasicBlock *> &NewBlocks,
  132. LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
  133. LoopInfo *LI) {
  134. BasicBlock *Preheader = L->getLoopPreheader();
  135. BasicBlock *Header = L->getHeader();
  136. BasicBlock *Latch = L->getLoopLatch();
  137. Function *F = Header->getParent();
  138. LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  139. LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  140. Loop *NewLoop = 0;
  141. Loop *ParentLoop = L->getParentLoop();
  142. if (!UnrollProlog) {
  143. NewLoop = new Loop();
  144. if (ParentLoop)
  145. ParentLoop->addChildLoop(NewLoop);
  146. else
  147. LI->addTopLevelLoop(NewLoop);
  148. }
  149. // For each block in the original loop, create a new copy,
  150. // and update the value map with the newly created values.
  151. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  152. BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
  153. NewBlocks.push_back(NewBB);
  154. if (NewLoop)
  155. NewLoop->addBasicBlockToLoop(NewBB, *LI);
  156. else if (ParentLoop)
  157. ParentLoop->addBasicBlockToLoop(NewBB, *LI);
  158. VMap[*BB] = NewBB;
  159. if (Header == *BB) {
  160. // For the first block, add a CFG connection to this newly
  161. // created block.
  162. InsertTop->getTerminator()->setSuccessor(0, NewBB);
  163. }
  164. if (Latch == *BB) {
  165. // For the last block, if UnrollProlog is true, create a direct jump to
  166. // InsertBot. If not, create a loop back to cloned head.
  167. VMap.erase((*BB)->getTerminator());
  168. BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
  169. BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
  170. IRBuilder<> Builder(LatchBR);
  171. if (UnrollProlog) {
  172. Builder.CreateBr(InsertBot);
  173. } else {
  174. PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
  175. FirstLoopBB->getFirstNonPHI());
  176. Value *IdxSub =
  177. Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  178. NewIdx->getName() + ".sub");
  179. Value *IdxCmp =
  180. Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
  181. Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
  182. NewIdx->addIncoming(NewIter, InsertTop);
  183. NewIdx->addIncoming(IdxSub, NewBB);
  184. }
  185. LatchBR->eraseFromParent();
  186. }
  187. }
  188. // Change the incoming values to the ones defined in the preheader or
  189. // cloned loop.
  190. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  191. PHINode *NewPHI = cast<PHINode>(VMap[I]);
  192. if (UnrollProlog) {
  193. VMap[I] = NewPHI->getIncomingValueForBlock(Preheader);
  194. cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
  195. } else {
  196. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  197. NewPHI->setIncomingBlock(idx, InsertTop);
  198. BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  199. idx = NewPHI->getBasicBlockIndex(Latch);
  200. Value *InVal = NewPHI->getIncomingValue(idx);
  201. NewPHI->setIncomingBlock(idx, NewLatch);
  202. if (VMap[InVal])
  203. NewPHI->setIncomingValue(idx, VMap[InVal]);
  204. }
  205. }
  206. if (NewLoop) {
  207. // Add unroll disable metadata to disable future unrolling for this loop.
  208. SmallVector<Metadata *, 4> MDs;
  209. // Reserve first location for self reference to the LoopID metadata node.
  210. MDs.push_back(nullptr);
  211. MDNode *LoopID = NewLoop->getLoopID();
  212. if (LoopID) {
  213. // First remove any existing loop unrolling metadata.
  214. for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
  215. bool IsUnrollMetadata = false;
  216. MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
  217. if (MD) {
  218. const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
  219. IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
  220. }
  221. if (!IsUnrollMetadata)
  222. MDs.push_back(LoopID->getOperand(i));
  223. }
  224. }
  225. LLVMContext &Context = NewLoop->getHeader()->getContext();
  226. SmallVector<Metadata *, 1> DisableOperands;
  227. DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
  228. MDNode *DisableNode = MDNode::get(Context, DisableOperands);
  229. MDs.push_back(DisableNode);
  230. MDNode *NewLoopID = MDNode::get(Context, MDs);
  231. // Set operand 0 to refer to the loop id itself.
  232. NewLoopID->replaceOperandWith(0, NewLoopID);
  233. NewLoop->setLoopID(NewLoopID);
  234. }
  235. }
  236. /// Insert code in the prolog code when unrolling a loop with a
  237. /// run-time trip-count.
  238. ///
  239. /// This method assumes that the loop unroll factor is total number
  240. /// of loop bodes in the loop after unrolling. (Some folks refer
  241. /// to the unroll factor as the number of *extra* copies added).
  242. /// We assume also that the loop unroll factor is a power-of-two. So, after
  243. /// unrolling the loop, the number of loop bodies executed is 2,
  244. /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
  245. /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
  246. /// the switch instruction is generated.
  247. ///
  248. /// extraiters = tripcount % loopfactor
  249. /// if (extraiters == 0) jump Loop:
  250. /// else jump Prol
  251. /// Prol: LoopBody;
  252. /// extraiters -= 1 // Omitted if unroll factor is 2.
  253. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
  254. /// if (tripcount < loopfactor) jump End
  255. /// Loop:
  256. /// ...
  257. /// End:
  258. ///
  259. bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
  260. bool AllowExpensiveTripCount, LoopInfo *LI,
  261. LPPassManager *LPM) {
  262. // for now, only unroll loops that contain a single exit
  263. if (!L->getExitingBlock())
  264. return false;
  265. // Make sure the loop is in canonical form, and there is a single
  266. // exit block only.
  267. if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
  268. return false;
  269. // Use Scalar Evolution to compute the trip count. This allows more
  270. // loops to be unrolled than relying on induction var simplification
  271. if (!LPM)
  272. return false;
  273. ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
  274. if (!SE)
  275. return false;
  276. // Only unroll loops with a computable trip count and the trip count needs
  277. // to be an int value (allowing a pointer type is a TODO item)
  278. const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
  279. if (isa<SCEVCouldNotCompute>(BECountSC) ||
  280. !BECountSC->getType()->isIntegerTy())
  281. return false;
  282. unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
  283. // Add 1 since the backedge count doesn't include the first loop iteration
  284. const SCEV *TripCountSC =
  285. SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
  286. if (isa<SCEVCouldNotCompute>(TripCountSC))
  287. return false;
  288. BasicBlock *Header = L->getHeader();
  289. const DataLayout &DL = Header->getModule()->getDataLayout();
  290. SCEVExpander Expander(*SE, DL, "loop-unroll");
  291. if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L))
  292. return false;
  293. // We only handle cases when the unroll factor is a power of 2.
  294. // Count is the loop unroll factor, the number of extra copies added + 1.
  295. if (!isPowerOf2_32(Count))
  296. return false;
  297. // This constraint lets us deal with an overflowing trip count easily; see the
  298. // comment on ModVal below.
  299. if (Log2_32(Count) > BEWidth)
  300. return false;
  301. // If this loop is nested, then the loop unroller changes the code in
  302. // parent loop, so the Scalar Evolution pass needs to be run again
  303. if (Loop *ParentLoop = L->getParentLoop())
  304. SE->forgetLoop(ParentLoop);
  305. // Grab analyses that we preserve.
  306. auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  307. auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
  308. BasicBlock *PH = L->getLoopPreheader();
  309. BasicBlock *Latch = L->getLoopLatch();
  310. // It helps to splits the original preheader twice, one for the end of the
  311. // prolog code and one for a new loop preheader
  312. BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
  313. BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
  314. BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
  315. // Compute the number of extra iterations required, which is:
  316. // extra iterations = run-time trip count % (loop unroll factor + 1)
  317. Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
  318. PreHeaderBR);
  319. Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
  320. PreHeaderBR);
  321. IRBuilder<> B(PreHeaderBR);
  322. Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
  323. // If ModVal is zero, we know that either
  324. // 1. there are no iteration to be run in the prologue loop
  325. // OR
  326. // 2. the addition computing TripCount overflowed
  327. //
  328. // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
  329. // number of iterations that remain to be run in the original loop is a
  330. // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
  331. // explicitly check this above).
  332. Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
  333. // Branch to either the extra iterations or the cloned/unrolled loop
  334. // We will fix up the true branch label when adding loop body copies
  335. B.CreateCondBr(BranchVal, PEnd, PEnd);
  336. assert(PreHeaderBR->isUnconditional() &&
  337. PreHeaderBR->getSuccessor(0) == PEnd &&
  338. "CFG edges in Preheader are not correct");
  339. PreHeaderBR->eraseFromParent();
  340. Function *F = Header->getParent();
  341. // Get an ordered list of blocks in the loop to help with the ordering of the
  342. // cloned blocks in the prolog code
  343. LoopBlocksDFS LoopBlocks(L);
  344. LoopBlocks.perform(LI);
  345. //
  346. // For each extra loop iteration, create a copy of the loop's basic blocks
  347. // and generate a condition that branches to the copy depending on the
  348. // number of 'left over' iterations.
  349. //
  350. std::vector<BasicBlock *> NewBlocks;
  351. ValueToValueMapTy VMap;
  352. bool UnrollPrologue = Count == 2;
  353. // Clone all the basic blocks in the loop. If Count is 2, we don't clone
  354. // the loop, otherwise we create a cloned loop to execute the extra
  355. // iterations. This function adds the appropriate CFG connections.
  356. CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
  357. VMap, LI);
  358. // Insert the cloned blocks into function just before the original loop
  359. F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0],
  360. F->end());
  361. // Rewrite the cloned instruction operands to use the values
  362. // created when the clone is created.
  363. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
  364. for (BasicBlock::iterator I = NewBlocks[i]->begin(),
  365. E = NewBlocks[i]->end();
  366. I != E; ++I) {
  367. RemapInstruction(I, VMap,
  368. RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
  369. }
  370. }
  371. // Connect the prolog code to the original loop and update the
  372. // PHI functions.
  373. BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
  374. ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
  375. /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass());
  376. NumRuntimeUnrolled++;
  377. return true;
  378. }