DxilLoopUnroll.cpp 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064
  1. //===- DxilLoopUnroll.cpp - Special Unroll for Constant Values ------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //===----------------------------------------------------------------------===//
  10. //
  11. // Special loop unroll routine for creating mandatory constant values and
  12. // loops that have exits.
  13. //
  14. // Overview of algorithm:
  15. //
  16. // 1. Identify a set of blocks to unroll.
  17. //
  18. // LLVM's concept of loop excludes exit blocks, which are blocks that no
  19. // longer have a path to the loop latch. However, some exit blocks in HLSL
  20. // also need to be unrolled. For example:
  21. //
  22. // [unroll]
  23. // for (uint i = 0; i < 4; i++)
  24. // {
  25. // if (...)
  26. // {
  27. // // This block here is an exit block, since it's.
  28. // // guaranteed to exit the loop.
  29. // ...
  30. // a[i] = ...; // Indexing requires unroll.
  31. // return;
  32. // }
  33. // }
  34. //
  35. //
  36. // 2. Create LCSSA based on the new loop boundary.
  37. //
  38. // See LCSSA.cpp for more details. It creates trivial PHI nodes for any
  39. // outgoing values of the loop at the exit blocks, so when the loop body
  40. // gets cloned, the outgoing values can be added to those PHI nodes easily.
  41. //
  42. // We are using a modified LCSSA routine here because we are including some
  43. // of the original exit blocks in the unroll.
  44. //
  45. //
  46. // 3. Unroll the loop until we succeed.
  47. //
  48. // Unlike LLVM, we do not try to find a loop count before unrolling.
  49. // Instead, we unroll to find a constant terminal condition. Give up when we
  50. // fail to do so.
  51. //
  52. //
  53. //===----------------------------------------------------------------------===//
  54. #include "llvm/Pass.h"
  55. #include "llvm/Analysis/LoopPass.h"
  56. #include "llvm/Analysis/InstructionSimplify.h"
  57. #include "llvm/Analysis/AssumptionCache.h"
  58. #include "llvm/Analysis/LoopPass.h"
  59. #include "llvm/Analysis/InstructionSimplify.h"
  60. #include "llvm/Analysis/AssumptionCache.h"
  61. #include "llvm/Analysis/ScalarEvolution.h"
  62. #include "llvm/Transforms/Scalar.h"
  63. #include "llvm/Transforms/Utils/Cloning.h"
  64. #include "llvm/Transforms/Utils/Local.h"
  65. #include "llvm/Transforms/Utils/UnrollLoop.h"
  66. #include "llvm/Transforms/Utils/SSAUpdater.h"
  67. #include "llvm/Transforms/Utils/LoopUtils.h"
  68. #include "llvm/Transforms/Utils/PromoteMemToReg.h"
  69. #include "llvm/IR/DiagnosticInfo.h"
  70. #include "llvm/IR/Instructions.h"
  71. #include "llvm/IR/Module.h"
  72. #include "llvm/IR/Verifier.h"
  73. #include "llvm/IR/PredIteratorCache.h"
  74. #include "llvm/IR/IntrinsicInst.h"
  75. #include "llvm/Support/raw_ostream.h"
  76. #include "llvm/Support/Debug.h"
  77. #include "llvm/ADT/SetVector.h"
  78. #include "llvm/IR/LegacyPassManager.h"
  79. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  80. #include "dxc/DXIL/DxilUtil.h"
  81. #include "dxc/HLSL/HLModule.h"
  82. #include "llvm/Analysis/DxilValueCache.h"
  83. using namespace llvm;
  84. using namespace hlsl;
  85. // Copied over from LoopUnroll.cpp - RemapInstruction()
  86. static inline void RemapInstruction(Instruction *I,
  87. ValueToValueMapTy &VMap) {
  88. for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
  89. Value *Op = I->getOperand(op);
  90. ValueToValueMapTy::iterator It = VMap.find(Op);
  91. if (It != VMap.end())
  92. I->setOperand(op, It->second);
  93. }
  94. if (PHINode *PN = dyn_cast<PHINode>(I)) {
  95. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
  96. ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
  97. if (It != VMap.end())
  98. PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
  99. }
  100. }
  101. }
  102. namespace {
  103. class DxilLoopUnroll : public LoopPass {
  104. public:
  105. static char ID;
  106. std::unordered_set<Function *> CleanedUpAlloca;
  107. const unsigned MaxIterationAttempt;
  108. DxilLoopUnroll(unsigned MaxIterationAttempt = 1024) :
  109. LoopPass(ID),
  110. MaxIterationAttempt(MaxIterationAttempt)
  111. {
  112. initializeDxilLoopUnrollPass(*PassRegistry::getPassRegistry());
  113. }
  114. const char *getPassName() const override { return "Dxil Loop Unroll"; }
  115. bool runOnLoop(Loop *L, LPPassManager &LPM) override;
  116. void getAnalysisUsage(AnalysisUsage &AU) const override {
  117. AU.addRequired<LoopInfoWrapperPass>();
  118. AU.addRequired<AssumptionCacheTracker>();
  119. AU.addRequired<DominatorTreeWrapperPass>();
  120. AU.addPreserved<DominatorTreeWrapperPass>();
  121. AU.addRequired<ScalarEvolution>();
  122. AU.addRequired<DxilValueCache>();
  123. AU.addRequiredID(LoopSimplifyID);
  124. }
  125. };
  126. char DxilLoopUnroll::ID;
  127. static void FailLoopUnroll(bool WarnOnly, Function *F, DebugLoc DL, const Twine &Message) {
  128. LLVMContext &Ctx = F->getContext();
  129. DiagnosticSeverity severity = DiagnosticSeverity::DS_Error;
  130. if (WarnOnly)
  131. severity = DiagnosticSeverity::DS_Warning;
  132. Ctx.diagnose(DiagnosticInfoDxil(F, DL.get(), Message, severity));
  133. }
  134. struct LoopIteration {
  135. SmallVector<BasicBlock *, 16> Body;
  136. BasicBlock *Latch = nullptr;
  137. BasicBlock *Header = nullptr;
  138. ValueToValueMapTy VarMap;
  139. SetVector<BasicBlock *> Extended; // Blocks that are included in the clone that are not in the core loop body.
  140. LoopIteration() {}
  141. };
  142. static bool GetConstantI1(Value *V, bool *Val=nullptr) {
  143. if (ConstantInt *C = dyn_cast<ConstantInt>(V)) {
  144. if (V->getType()->isIntegerTy(1)) {
  145. if (Val)
  146. *Val = (bool)C->getLimitedValue();
  147. return true;
  148. }
  149. }
  150. return false;
  151. }
  152. static bool IsMarkedFullUnroll(Loop *L) {
  153. if (MDNode *LoopID = L->getLoopID())
  154. return GetUnrollMetadata(LoopID, "llvm.loop.unroll.full");
  155. return false;
  156. }
  157. static bool IsMarkedUnrollCount(Loop *L, int *OutCount) {
  158. if (MDNode *LoopID = L->getLoopID()) {
  159. if (MDNode *MD = GetUnrollMetadata(LoopID, "llvm.loop.unroll.count")) {
  160. assert(MD->getNumOperands() == 2 &&
  161. "Unroll count hint metadata should have two operands.");
  162. ConstantInt *Val = mdconst::extract<ConstantInt>(MD->getOperand(1));
  163. int Count = Val->getZExtValue();
  164. *OutCount = Count;
  165. return true;
  166. }
  167. }
  168. return false;
  169. }
  170. static bool HasSuccessorsInLoop(BasicBlock *BB, Loop *L) {
  171. for (BasicBlock *Succ : successors(BB)) {
  172. if (L->contains(Succ)) {
  173. return true;
  174. }
  175. }
  176. return false;
  177. }
  178. static void DetachFromSuccessors(BasicBlock *BB) {
  179. SmallVector<BasicBlock *, 16> Successors(succ_begin(BB), succ_end(BB));
  180. for (BasicBlock *Succ : Successors) {
  181. Succ->removePredecessor(BB);
  182. }
  183. }
  184. /// Return true if the specified block is in the list.
  185. static bool isExitBlock(BasicBlock *BB,
  186. const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  187. for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
  188. if (ExitBlocks[i] == BB)
  189. return true;
  190. return false;
  191. }
  192. // Copied and modified from LCSSA.cpp
  193. static bool processInstruction(SetVector<BasicBlock *> &Body, Loop &L, Instruction &Inst, DominatorTree &DT, // HLSL Change
  194. const SmallVectorImpl<BasicBlock *> &ExitBlocks,
  195. PredIteratorCache &PredCache, LoopInfo *LI) {
  196. SmallVector<Use *, 16> UsesToRewrite;
  197. BasicBlock *InstBB = Inst.getParent();
  198. for (Use &U : Inst.uses()) {
  199. Instruction *User = cast<Instruction>(U.getUser());
  200. BasicBlock *UserBB = User->getParent();
  201. if (PHINode *PN = dyn_cast<PHINode>(User))
  202. UserBB = PN->getIncomingBlock(U);
  203. if (InstBB != UserBB && /*!L.contains(UserBB)*/!Body.count(UserBB)) // HLSL Change
  204. UsesToRewrite.push_back(&U);
  205. }
  206. // If there are no uses outside the loop, exit with no change.
  207. if (UsesToRewrite.empty())
  208. return false;
  209. #if 0 // HLSL Change
  210. ++NumLCSSA; // We are applying the transformation
  211. #endif // HLSL Change
  212. // Invoke instructions are special in that their result value is not available
  213. // along their unwind edge. The code below tests to see whether DomBB
  214. // dominates
  215. // the value, so adjust DomBB to the normal destination block, which is
  216. // effectively where the value is first usable.
  217. BasicBlock *DomBB = Inst.getParent();
  218. if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst))
  219. DomBB = Inv->getNormalDest();
  220. DomTreeNode *DomNode = DT.getNode(DomBB);
  221. SmallVector<PHINode *, 16> AddedPHIs;
  222. SmallVector<PHINode *, 8> PostProcessPHIs;
  223. SSAUpdater SSAUpdate;
  224. SSAUpdate.Initialize(Inst.getType(), Inst.getName());
  225. // Insert the LCSSA phi's into all of the exit blocks dominated by the
  226. // value, and add them to the Phi's map.
  227. for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(),
  228. BBE = ExitBlocks.end();
  229. BBI != BBE; ++BBI) {
  230. BasicBlock *ExitBB = *BBI;
  231. if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
  232. continue;
  233. // If we already inserted something for this BB, don't reprocess it.
  234. if (SSAUpdate.HasValueForBlock(ExitBB))
  235. continue;
  236. PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
  237. Inst.getName() + ".lcssa", ExitBB->begin());
  238. // Add inputs from inside the loop for this PHI.
  239. for (BasicBlock *Pred : PredCache.get(ExitBB)) {
  240. PN->addIncoming(&Inst, Pred);
  241. // If the exit block has a predecessor not within the loop, arrange for
  242. // the incoming value use corresponding to that predecessor to be
  243. // rewritten in terms of a different LCSSA PHI.
  244. if (/*!L.contains(Pred)*/ !Body.count(Pred)) // HLSL Change
  245. UsesToRewrite.push_back(
  246. &PN->getOperandUse(PN->getOperandNumForIncomingValue(
  247. PN->getNumIncomingValues() - 1)));
  248. }
  249. AddedPHIs.push_back(PN);
  250. // Remember that this phi makes the value alive in this block.
  251. SSAUpdate.AddAvailableValue(ExitBB, PN);
  252. // LoopSimplify might fail to simplify some loops (e.g. when indirect
  253. // branches are involved). In such situations, it might happen that an exit
  254. // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
  255. // PHIs in such an exit block, we are also inserting PHIs into L2's header.
  256. // This could break LCSSA form for L2 because these inserted PHIs can also
  257. // have uses outside of L2. Remember all PHIs in such situation as to
  258. // revisit than later on. FIXME: Remove this if indirectbr support into
  259. // LoopSimplify gets improved.
  260. if (auto *OtherLoop = LI->getLoopFor(ExitBB))
  261. if (!L.contains(OtherLoop))
  262. PostProcessPHIs.push_back(PN);
  263. }
  264. // Rewrite all uses outside the loop in terms of the new PHIs we just
  265. // inserted.
  266. for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
  267. // If this use is in an exit block, rewrite to use the newly inserted PHI.
  268. // This is required for correctness because SSAUpdate doesn't handle uses in
  269. // the same block. It assumes the PHI we inserted is at the end of the
  270. // block.
  271. Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser());
  272. BasicBlock *UserBB = User->getParent();
  273. if (PHINode *PN = dyn_cast<PHINode>(User))
  274. UserBB = PN->getIncomingBlock(*UsesToRewrite[i]);
  275. if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
  276. // Tell the VHs that the uses changed. This updates SCEV's caches.
  277. if (UsesToRewrite[i]->get()->hasValueHandle())
  278. ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
  279. UsesToRewrite[i]->set(UserBB->begin());
  280. continue;
  281. }
  282. // Otherwise, do full PHI insertion.
  283. SSAUpdate.RewriteUse(*UsesToRewrite[i]);
  284. }
  285. // Post process PHI instructions that were inserted into another disjoint loop
  286. // and update their exits properly.
  287. for (auto *I : PostProcessPHIs) {
  288. if (I->use_empty())
  289. continue;
  290. BasicBlock *PHIBB = I->getParent();
  291. Loop *OtherLoop = LI->getLoopFor(PHIBB);
  292. SmallVector<BasicBlock *, 8> EBs;
  293. OtherLoop->getExitBlocks(EBs);
  294. if (EBs.empty())
  295. continue;
  296. // Recurse and re-process each PHI instruction. FIXME: we should really
  297. // convert this entire thing to a worklist approach where we process a
  298. // vector of instructions...
  299. SetVector<BasicBlock *> OtherLoopBody(OtherLoop->block_begin(), OtherLoop->block_end()); // HLSL Change
  300. processInstruction(OtherLoopBody, *OtherLoop, *I, DT, EBs, PredCache, LI);
  301. }
  302. // Remove PHI nodes that did not have any uses rewritten.
  303. for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) {
  304. if (AddedPHIs[i]->use_empty())
  305. AddedPHIs[i]->eraseFromParent();
  306. }
  307. return true;
  308. }
  309. // Copied from LCSSA.cpp
  310. static bool blockDominatesAnExit(BasicBlock *BB,
  311. DominatorTree &DT,
  312. const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  313. DomTreeNode *DomNode = DT.getNode(BB);
  314. for (BasicBlock *Exit : ExitBlocks)
  315. if (DT.dominates(DomNode, DT.getNode(Exit)))
  316. return true;
  317. return false;
  318. }
  319. // Copied from LCSSA.cpp
  320. //
  321. // We need to recreate the LCSSA form since our loop boundary is potentially different from
  322. // the canonical one.
  323. static bool CreateLCSSA(SetVector<BasicBlock *> &Body, const SmallVectorImpl<BasicBlock *> &ExitBlocks, Loop *L, DominatorTree &DT, LoopInfo *LI) {
  324. PredIteratorCache PredCache;
  325. bool Changed = false;
  326. // Look at all the instructions in the loop, checking to see if they have uses
  327. // outside the loop. If so, rewrite those uses.
  328. for (SetVector<BasicBlock *>::iterator BBI = Body.begin(), BBE = Body.end();
  329. BBI != BBE; ++BBI) {
  330. BasicBlock *BB = *BBI;
  331. // For large loops, avoid use-scanning by using dominance information: In
  332. // particular, if a block does not dominate any of the loop exits, then none
  333. // of the values defined in the block could be used outside the loop.
  334. if (!blockDominatesAnExit(BB, DT, ExitBlocks))
  335. continue;
  336. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
  337. // Reject two common cases fast: instructions with no uses (like stores)
  338. // and instructions with one use that is in the same block as this.
  339. if (I->use_empty() ||
  340. (I->hasOneUse() && I->user_back()->getParent() == BB &&
  341. !isa<PHINode>(I->user_back())))
  342. continue;
  343. Changed |= processInstruction(Body, *L, *I, DT, ExitBlocks, PredCache, LI);
  344. }
  345. }
  346. return Changed;
  347. }
  348. static Value *GetGEPPtrOrigin(GEPOperator *GEP) {
  349. Value *Ptr = GEP->getPointerOperand();
  350. while (Ptr) {
  351. if (AllocaInst *AI = dyn_cast<AllocaInst>(Ptr)) {
  352. return AI;
  353. }
  354. else if (GEPOperator *NewGEP = dyn_cast<GEPOperator>(Ptr)) {
  355. Ptr = NewGEP->getPointerOperand();
  356. }
  357. else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
  358. return GV;
  359. }
  360. else {
  361. break;
  362. }
  363. }
  364. return nullptr;
  365. }
  366. // Find all blocks in the loop with instructions that
  367. // would require an unroll to be correct.
  368. //
  369. // For example:
  370. // for (int i = 0; i < 10; i++) {
  371. // gep i
  372. // }
  373. //
  374. static void FindProblemBlocks(BasicBlock *Header, const SmallVectorImpl<BasicBlock *> &BlocksInLoop, std::unordered_set<BasicBlock *> &ProblemBlocks, SetVector<AllocaInst *> &ProblemAllocas) {
  375. SmallVector<Instruction *, 16> WorkList;
  376. std::unordered_set<BasicBlock *> BlocksInLoopSet(BlocksInLoop.begin(), BlocksInLoop.end());
  377. std::unordered_set<Instruction *> InstructionsSeen;
  378. for (Instruction &I : *Header) {
  379. PHINode *PN = dyn_cast<PHINode>(&I);
  380. if (!PN)
  381. break;
  382. WorkList.push_back(PN);
  383. InstructionsSeen.insert(PN);
  384. }
  385. while (WorkList.size()) {
  386. Instruction *I = WorkList.pop_back_val();
  387. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
  388. Type *EltType = GEP->getType()->getPointerElementType();
  389. // NOTE: This is a very convservative in the following conditions:
  390. // - constant global resource arrays with external linkage (these can be
  391. // dynamically accessed)
  392. // - global resource arrays or alloca resource arrays, as long as all
  393. // writes come from the same original resource definition (which can
  394. // also be an array).
  395. //
  396. // We may want to make this more precise in the future if it becomes a
  397. // problem.
  398. //
  399. if (hlsl::dxilutil::IsHLSLObjectType(EltType)) {
  400. if (Value *Ptr = GetGEPPtrOrigin(cast<GEPOperator>(GEP))) {
  401. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
  402. if (!GV->isExternalLinkage(llvm::GlobalValue::ExternalLinkage))
  403. ProblemBlocks.insert(GEP->getParent());
  404. }
  405. else if (AllocaInst *AI = dyn_cast<AllocaInst>(Ptr)) {
  406. ProblemAllocas.insert(AI);
  407. ProblemBlocks.insert(GEP->getParent());
  408. }
  409. }
  410. continue; // Stop Propagating
  411. }
  412. }
  413. for (User *U : I->users()) {
  414. if (Instruction *UserI = dyn_cast<Instruction>(U)) {
  415. if (!InstructionsSeen.count(UserI) &&
  416. BlocksInLoopSet.count(UserI->getParent()))
  417. {
  418. InstructionsSeen.insert(UserI);
  419. WorkList.push_back(UserI);
  420. }
  421. }
  422. }
  423. }
  424. }
  425. // Helper function for getting GEP's const index value
  426. inline static int64_t GetGEPIndex(GEPOperator *GEP, unsigned idx) {
  427. return cast<ConstantInt>(GEP->getOperand(idx + 1))->getSExtValue();
  428. }
  429. // Replace allocas with all constant indices with scalar allocas, then promote
  430. // them to values where possible (mem2reg).
  431. //
  432. // Before loop unroll, we did not have constant indices for arrays and SROA was
  433. // unable to break them into scalars. Now that unroll has potentially given
  434. // them constant values, we need to turn them into scalars.
  435. //
  436. // if "AllowOOBIndex" is true, it turns any out of bound index into 0.
  437. // Otherwise it emits an error and fails compilation.
  438. //
  439. template<typename IteratorT>
  440. static bool BreakUpArrayAllocas(bool AllowOOBIndex, IteratorT ItBegin, IteratorT ItEnd, DominatorTree *DT, AssumptionCache *AC, DxilValueCache *DVC) {
  441. bool Success = true;
  442. SmallVector<AllocaInst *, 8> WorkList(ItBegin, ItEnd);
  443. SmallVector<GEPOperator *, 16> GEPs;
  444. while (WorkList.size()) {
  445. AllocaInst *AI = WorkList.pop_back_val();
  446. Type *AllocaType = AI->getAllocatedType();
  447. // Only deal with array allocas.
  448. if (!AllocaType->isArrayTy())
  449. continue;
  450. unsigned ArraySize = AI->getAllocatedType()->getArrayNumElements();
  451. Type *ElementType = AllocaType->getArrayElementType();
  452. if (!ArraySize)
  453. continue;
  454. GEPs.clear(); // Re-use array
  455. for (User *U : AI->users()) {
  456. // Try to set all GEP operands to constant
  457. if (GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
  458. if (!GEP->hasAllConstantIndices() && isa<GetElementPtrInst>(GEP)) {
  459. for (unsigned i = 0; i < GEP->getNumIndices(); i++) {
  460. Value *IndexOp = GEP->getOperand(i + 1);
  461. if (isa<Constant>(IndexOp))
  462. continue;
  463. if (Constant *C = DVC->GetConstValue(IndexOp))
  464. GEP->setOperand(i + 1, C);
  465. }
  466. }
  467. if (!GEP->hasAllConstantIndices() || GEP->getNumIndices() < 2 ||
  468. GetGEPIndex(GEP, 0) != 0)
  469. {
  470. GEPs.clear();
  471. break;
  472. }
  473. else {
  474. GEPs.push_back(GEP);
  475. }
  476. }
  477. else {
  478. GEPs.clear();
  479. break;
  480. }
  481. }
  482. if (!GEPs.size())
  483. continue;
  484. SmallVector<AllocaInst *, 8> ScalarAllocas;
  485. ScalarAllocas.resize(ArraySize);
  486. IRBuilder<> B(AI);
  487. for (GEPOperator *GEP : GEPs) {
  488. int64_t idx = GetGEPIndex(GEP, 1);
  489. GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP);
  490. if (idx < 0 || idx >= ArraySize) {
  491. if (AllowOOBIndex)
  492. idx = 0;
  493. else {
  494. Success = false;
  495. if (GEPInst)
  496. hlsl::dxilutil::EmitErrorOnInstruction(GEPInst, "Array access out of bound.");
  497. continue;
  498. }
  499. }
  500. AllocaInst *ScalarAlloca = ScalarAllocas[idx];
  501. if (!ScalarAlloca) {
  502. ScalarAlloca = B.CreateAlloca(ElementType);
  503. ScalarAllocas[idx] = ScalarAlloca;
  504. if (ElementType->isArrayTy()) {
  505. WorkList.push_back(ScalarAlloca);
  506. }
  507. }
  508. Value *NewPointer = nullptr;
  509. if (ElementType->isArrayTy()) {
  510. SmallVector<Value *, 2> Indices;
  511. Indices.push_back(B.getInt32(0));
  512. for (unsigned i = 2; i < GEP->getNumIndices(); i++) {
  513. Indices.push_back(GEP->getOperand(i + 1));
  514. }
  515. NewPointer = B.CreateGEP(ScalarAlloca, Indices);
  516. } else {
  517. NewPointer = ScalarAlloca;
  518. }
  519. GEP->replaceAllUsesWith(NewPointer);
  520. }
  521. if (!ElementType->isArrayTy()) {
  522. ScalarAllocas.erase(std::remove(ScalarAllocas.begin(), ScalarAllocas.end(), nullptr), ScalarAllocas.end());
  523. PromoteMemToReg(ScalarAllocas, *DT, nullptr, AC);
  524. }
  525. }
  526. return Success;
  527. }
  528. static void RecursivelyRemoveLoopFromQueue(LPPassManager &LPM, Loop *L) {
  529. // Copy the sub loops into a separate list because
  530. // the original list may change.
  531. SmallVector<Loop *, 4> SubLoops(L->getSubLoops().begin(), L->getSubLoops().end());
  532. // Must remove all child loops first.
  533. for (Loop *SubL : SubLoops) {
  534. RecursivelyRemoveLoopFromQueue(LPM, SubL);
  535. }
  536. LPM.deleteLoopFromQueue(L);
  537. }
  538. bool DxilLoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
  539. DebugLoc LoopLoc = L->getStartLoc(); // Debug location for the start of the loop.
  540. Function *F = L->getHeader()->getParent();
  541. ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
  542. DxilValueCache *DVC = &getAnalysis<DxilValueCache>();
  543. bool HasExplicitLoopCount = false;
  544. int ExplicitUnrollCountSigned = 0;
  545. // If the loop is not marked as [unroll], don't do anything.
  546. if (IsMarkedUnrollCount(L, &ExplicitUnrollCountSigned)) {
  547. HasExplicitLoopCount = true;
  548. }
  549. else if (!IsMarkedFullUnroll(L)) {
  550. return false;
  551. }
  552. unsigned ExplicitUnrollCount = 0;
  553. if (HasExplicitLoopCount) {
  554. if (ExplicitUnrollCountSigned < 1) {
  555. FailLoopUnroll(false, F, LoopLoc, "Could not unroll loop. Invalid unroll count.");
  556. return false;
  557. }
  558. ExplicitUnrollCount = (unsigned)ExplicitUnrollCountSigned;
  559. }
  560. if (!L->isSafeToClone())
  561. return false;
  562. bool FxcCompatMode = false;
  563. if (F->getParent()->HasHLModule()) {
  564. HLModule &HM = F->getParent()->GetHLModule();
  565. FxcCompatMode = HM.GetHLOptions().bFXCCompatMode;
  566. }
  567. unsigned TripCount = 0;
  568. BasicBlock *ExitingBlock = L->getLoopLatch();
  569. if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
  570. ExitingBlock = L->getExitingBlock();
  571. if (ExitingBlock) {
  572. TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
  573. }
  574. // Analysis passes
  575. DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  576. AssumptionCache *AC =
  577. &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
  578. LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  579. Loop *OuterL = L->getParentLoop();
  580. BasicBlock *Latch = L->getLoopLatch();
  581. BasicBlock *Header = L->getHeader();
  582. BasicBlock *Predecessor = L->getLoopPredecessor();
  583. const DataLayout &DL = F->getParent()->getDataLayout();
  584. // Quit if we don't have a single latch block or predecessor
  585. if (!Latch || !Predecessor) {
  586. return false;
  587. }
  588. // If the loop exit condition is not in the latch, then the loop is not rotated. Give up.
  589. if (!cast<BranchInst>(Latch->getTerminator())->isConditional()) {
  590. return false;
  591. }
  592. SmallVector<BasicBlock *, 16> ExitBlocks;
  593. L->getExitBlocks(ExitBlocks);
  594. std::unordered_set<BasicBlock *> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
  595. SmallVector<BasicBlock *, 16> BlocksInLoop; // Set of blocks including both body and exits
  596. BlocksInLoop.append(L->getBlocks().begin(), L->getBlocks().end());
  597. BlocksInLoop.append(ExitBlocks.begin(), ExitBlocks.end());
  598. // Heuristically find blocks that likely need to be unrolled
  599. SetVector<AllocaInst *> ProblemAllocas;
  600. std::unordered_set<BasicBlock *> ProblemBlocks;
  601. FindProblemBlocks(L->getHeader(), BlocksInLoop, ProblemBlocks, ProblemAllocas);
  602. // Keep track of the PHI nodes at the header.
  603. SmallVector<PHINode *, 16> PHIs;
  604. for (auto it = Header->begin(); it != Header->end(); it++) {
  605. if (PHINode *PN = dyn_cast<PHINode>(it)) {
  606. PHIs.push_back(PN);
  607. }
  608. else {
  609. break;
  610. }
  611. }
  612. // Quick simplification of PHINode incoming values
  613. for (PHINode *PN : PHIs) {
  614. for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
  615. Value *OldIncomingV = PN->getIncomingValue(i);
  616. if (Instruction *IncomingI = dyn_cast<Instruction>(OldIncomingV)) {
  617. if (Value *NewIncomingV = llvm::SimplifyInstruction(IncomingI, DL)) {
  618. PN->setIncomingValue(i, NewIncomingV);
  619. }
  620. }
  621. }
  622. }
  623. SetVector<BasicBlock *> ToBeCloned; // List of blocks that will be cloned.
  624. for (BasicBlock *BB : L->getBlocks()) // Include the body right away
  625. ToBeCloned.insert(BB);
  626. // Find the exit blocks that also need to be included
  627. // in the unroll.
  628. SmallVector<BasicBlock *, 8> NewExits; // New set of exit blocks as boundaries for LCSSA
  629. SmallVector<BasicBlock *, 8> FakeExits; // Set of blocks created to allow cloning original exit blocks.
  630. for (BasicBlock *BB : ExitBlocks) {
  631. bool CloneThisExitBlock = ProblemBlocks.count(BB);
  632. if (CloneThisExitBlock) {
  633. ToBeCloned.insert(BB);
  634. // If we are cloning this basic block, we must create a new exit
  635. // block for inserting LCSSA PHI nodes.
  636. BasicBlock *FakeExit = BasicBlock::Create(BB->getContext(), "loop.exit.new");
  637. F->getBasicBlockList().insert(BB, FakeExit);
  638. TerminatorInst *OldTerm = BB->getTerminator();
  639. OldTerm->removeFromParent();
  640. FakeExit->getInstList().push_back(OldTerm);
  641. BranchInst::Create(FakeExit, BB);
  642. for (BasicBlock *Succ : successors(FakeExit)) {
  643. for (Instruction &I : *Succ) {
  644. if (PHINode *PN = dyn_cast<PHINode>(&I)) {
  645. for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
  646. if (PN->getIncomingBlock(i) == BB)
  647. PN->setIncomingBlock(i, FakeExit);
  648. }
  649. }
  650. }
  651. }
  652. NewExits.push_back(FakeExit);
  653. FakeExits.push_back(FakeExit);
  654. // Update Dom tree with new exit
  655. if (!DT->getNode(FakeExit))
  656. DT->addNewBlock(FakeExit, BB);
  657. }
  658. else {
  659. // If we are not including this exit block in the unroll,
  660. // use it for LCSSA as normal.
  661. NewExits.push_back(BB);
  662. }
  663. }
  664. // Simplify the PHI nodes that have single incoming value. The original LCSSA form
  665. // (if exists) does not necessarily work for our unroll because we may be unrolling
  666. // from a different boundary.
  667. for (BasicBlock *BB : BlocksInLoop)
  668. hlsl::dxilutil::SimplifyTrivialPHIs(BB);
  669. // Re-establish LCSSA form to get ready for unrolling.
  670. CreateLCSSA(ToBeCloned, NewExits, L, *DT, LI);
  671. SmallVector<std::unique_ptr<LoopIteration>, 16> Iterations; // List of cloned iterations
  672. bool Succeeded = false;
  673. unsigned MaxAttempt = this->MaxIterationAttempt;
  674. // If we were able to figure out the definitive trip count,
  675. // just unroll that many times.
  676. if (TripCount != 0) {
  677. MaxAttempt = TripCount;
  678. }
  679. else if (HasExplicitLoopCount) {
  680. MaxAttempt = ExplicitUnrollCount;
  681. }
  682. for (unsigned IterationI = 0; IterationI < MaxAttempt; IterationI++) {
  683. LoopIteration *PrevIteration = nullptr;
  684. if (Iterations.size())
  685. PrevIteration = Iterations.back().get();
  686. Iterations.push_back(llvm::make_unique<LoopIteration>());
  687. LoopIteration &CurIteration = *Iterations.back().get();
  688. // Clone the blocks.
  689. for (BasicBlock *BB : ToBeCloned) {
  690. BasicBlock *ClonedBB = CloneBasicBlock(BB, CurIteration.VarMap);
  691. CurIteration.VarMap[BB] = ClonedBB;
  692. ClonedBB->insertInto(F, Header);
  693. if (ExitBlockSet.count(BB))
  694. CurIteration.Extended.insert(ClonedBB);
  695. CurIteration.Body.push_back(ClonedBB);
  696. // Identify the special blocks.
  697. if (BB == Latch) {
  698. CurIteration.Latch = ClonedBB;
  699. }
  700. if (BB == Header) {
  701. CurIteration.Header = ClonedBB;
  702. }
  703. }
  704. for (BasicBlock *BB : ToBeCloned) {
  705. BasicBlock *ClonedBB = cast<BasicBlock>(CurIteration.VarMap[BB]);
  706. // If branching to outside of the loop, need to update the
  707. // phi nodes there to include new values.
  708. for (BasicBlock *Succ : successors(ClonedBB)) {
  709. if (ToBeCloned.count(Succ))
  710. continue;
  711. for (Instruction &I : *Succ) {
  712. PHINode *PN = dyn_cast<PHINode>(&I);
  713. if (!PN)
  714. break;
  715. // Find the incoming value for this new block. If there is an entry
  716. // for this block in the map, then it was defined in the loop, use it.
  717. // Otherwise it came from outside the loop.
  718. Value *OldIncoming = PN->getIncomingValueForBlock(BB);
  719. Value *NewIncoming = OldIncoming;
  720. ValueToValueMapTy::iterator Itor = CurIteration.VarMap.find(OldIncoming);
  721. if (Itor != CurIteration.VarMap.end())
  722. NewIncoming = Itor->second;
  723. PN->addIncoming(NewIncoming, ClonedBB);
  724. }
  725. }
  726. }
  727. // Remap the instructions inside of cloned blocks.
  728. for (BasicBlock *BB : CurIteration.Body) {
  729. for (Instruction &I : *BB) {
  730. ::RemapInstruction(&I, CurIteration.VarMap);
  731. }
  732. }
  733. // If this is the first block
  734. if (!PrevIteration) {
  735. // Replace the phi nodes in the clone block with the values coming
  736. // from outside of the loop
  737. for (PHINode *PN : PHIs) {
  738. PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
  739. Value *ReplacementVal = ClonedPN->getIncomingValueForBlock(Predecessor);
  740. ClonedPN->replaceAllUsesWith(ReplacementVal);
  741. ClonedPN->eraseFromParent();
  742. CurIteration.VarMap[PN] = ReplacementVal;
  743. }
  744. }
  745. else {
  746. // Replace the phi nodes with the value defined INSIDE the previous iteration.
  747. for (PHINode *PN : PHIs) {
  748. PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
  749. Value *ReplacementVal = PN->getIncomingValueForBlock(Latch);
  750. auto itRep = PrevIteration->VarMap.find(ReplacementVal);
  751. if (itRep != PrevIteration->VarMap.end())
  752. ReplacementVal = itRep->second;
  753. ClonedPN->replaceAllUsesWith(ReplacementVal);
  754. ClonedPN->eraseFromParent();
  755. CurIteration.VarMap[PN] = ReplacementVal;
  756. }
  757. // Make the latch of the previous iteration branch to the header
  758. // of this new iteration.
  759. if (BranchInst *BI = dyn_cast<BranchInst>(PrevIteration->Latch->getTerminator())) {
  760. for (unsigned i = 0; i < BI->getNumSuccessors(); i++) {
  761. if (BI->getSuccessor(i) == PrevIteration->Header) {
  762. BI->setSuccessor(i, CurIteration.Header);
  763. break;
  764. }
  765. }
  766. }
  767. }
  768. // Check exit condition to see if we fully unrolled the loop
  769. if (BranchInst *BI = dyn_cast<BranchInst>(CurIteration.Latch->getTerminator())) {
  770. bool Cond = false;
  771. Value *ConstantCond = BI->getCondition();
  772. if (Value *C = DVC->GetValue(ConstantCond))
  773. ConstantCond = C;
  774. if (GetConstantI1(ConstantCond, &Cond)) {
  775. if (BI->getSuccessor(Cond ? 1 : 0) == CurIteration.Header) {
  776. Succeeded = true;
  777. break;
  778. }
  779. }
  780. }
  781. // We've reached the N defined in [unroll(N)]
  782. if ((HasExplicitLoopCount && IterationI+1 >= ExplicitUnrollCount) ||
  783. (TripCount != 0 && IterationI+1 >= TripCount))
  784. {
  785. Succeeded = true;
  786. BranchInst *BI = cast<BranchInst>(CurIteration.Latch->getTerminator());
  787. BasicBlock *ExitBlock = nullptr;
  788. for (unsigned i = 0; i < BI->getNumSuccessors(); i++) {
  789. BasicBlock *Succ = BI->getSuccessor(i);
  790. if (Succ != CurIteration.Header) {
  791. ExitBlock = Succ;
  792. break;
  793. }
  794. }
  795. BranchInst *NewBI = BranchInst::Create(ExitBlock, BI);
  796. BI->replaceAllUsesWith(NewBI);
  797. BI->eraseFromParent();
  798. break;
  799. }
  800. }
  801. if (Succeeded) {
  802. // We are going to be cleaning them up later. Maker sure
  803. // they're in entry block so deleting loop blocks don't
  804. // kill them too.
  805. for (AllocaInst *AI : ProblemAllocas)
  806. DXASSERT_LOCALVAR(AI, AI->getParent() == &F->getEntryBlock(), "Alloca is not in entry block.");
  807. LoopIteration &FirstIteration = *Iterations.front().get();
  808. // Make the predecessor branch to the first new header.
  809. {
  810. BranchInst *BI = cast<BranchInst>(Predecessor->getTerminator());
  811. for (unsigned i = 0, NumSucc = BI->getNumSuccessors(); i < NumSucc; i++) {
  812. if (BI->getSuccessor(i) == Header) {
  813. BI->setSuccessor(i, FirstIteration.Header);
  814. }
  815. }
  816. }
  817. if (OuterL) {
  818. // Core body blocks need to be added to outer loop
  819. for (size_t i = 0; i < Iterations.size(); i++) {
  820. LoopIteration &Iteration = *Iterations[i].get();
  821. for (BasicBlock *BB : Iteration.Body) {
  822. if (!Iteration.Extended.count(BB)) {
  823. OuterL->addBasicBlockToLoop(BB, *LI);
  824. }
  825. }
  826. }
  827. // Our newly created exit blocks may need to be added to outer loop
  828. for (BasicBlock *BB : FakeExits) {
  829. if (HasSuccessorsInLoop(BB, OuterL))
  830. OuterL->addBasicBlockToLoop(BB, *LI);
  831. }
  832. // Cloned exit blocks may need to be added to outer loop
  833. for (size_t i = 0; i < Iterations.size(); i++) {
  834. LoopIteration &Iteration = *Iterations[i].get();
  835. for (BasicBlock *BB : Iteration.Extended) {
  836. if (HasSuccessorsInLoop(BB, OuterL))
  837. OuterL->addBasicBlockToLoop(BB, *LI);
  838. }
  839. }
  840. }
  841. SE->forgetLoop(L);
  842. // Remove the original blocks that we've cloned from all loops.
  843. for (BasicBlock *BB : ToBeCloned)
  844. LI->removeBlock(BB);
  845. // Remove loop and all child loops from queue.
  846. RecursivelyRemoveLoopFromQueue(LPM, L);
  847. // Remove dead blocks.
  848. for (BasicBlock *BB : ToBeCloned)
  849. DetachFromSuccessors(BB);
  850. for (BasicBlock *BB : ToBeCloned)
  851. BB->dropAllReferences();
  852. for (BasicBlock *BB : ToBeCloned)
  853. BB->eraseFromParent();
  854. // Blocks need to be removed from DomTree. There's no easy way
  855. // to remove them in the right order, so just make DomTree
  856. // recalculate.
  857. DT->recalculate(*F);
  858. if (OuterL) {
  859. // This process may have created multiple back edges for the
  860. // parent loop. Simplify to keep it well-formed.
  861. simplifyLoop(OuterL, DT, LI, this, nullptr, nullptr, AC);
  862. }
  863. // Now that we potentially turned some GEP indices into constants,
  864. // try to clean up their allocas.
  865. if (!BreakUpArrayAllocas(FxcCompatMode /* allow oob index */, ProblemAllocas.begin(), ProblemAllocas.end(), DT, AC, DVC)) {
  866. FailLoopUnroll(false, F, LoopLoc, "Could not unroll loop due to out of bound array access.");
  867. }
  868. return true;
  869. }
  870. // If we were unsuccessful in unrolling the loop
  871. else {
  872. const char *Msg =
  873. "Could not unroll loop. Loop bound could not be deduced at compile time. "
  874. "Use [unroll(n)] to give an explicit count.";
  875. if (FxcCompatMode) {
  876. FailLoopUnroll(true /*warn only*/, F, LoopLoc, Msg);
  877. }
  878. else {
  879. FailLoopUnroll(false /*warn only*/, F, LoopLoc,
  880. Twine(Msg) + Twine(" Use '-HV 2016' to treat this as warning."));
  881. }
  882. // Remove all the cloned blocks
  883. for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
  884. LoopIteration &Iteration = *Ptr.get();
  885. for (BasicBlock *BB : Iteration.Body)
  886. DetachFromSuccessors(BB);
  887. }
  888. for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
  889. LoopIteration &Iteration = *Ptr.get();
  890. for (BasicBlock *BB : Iteration.Body)
  891. BB->dropAllReferences();
  892. }
  893. for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
  894. LoopIteration &Iteration = *Ptr.get();
  895. for (BasicBlock *BB : Iteration.Body)
  896. BB->eraseFromParent();
  897. }
  898. return false;
  899. }
  900. }
  901. }
  902. Pass *llvm::createDxilLoopUnrollPass(unsigned MaxIterationAttempt) {
  903. return new DxilLoopUnroll(MaxIterationAttempt);
  904. }
  905. INITIALIZE_PASS_BEGIN(DxilLoopUnroll, "dxil-loop-unroll", "Dxil Unroll loops", false, false)
  906. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  907. INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  908. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  909. INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
  910. INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  911. INITIALIZE_PASS_DEPENDENCY(DxilValueCache)
  912. INITIALIZE_PASS_END(DxilLoopUnroll, "dxil-loop-unroll", "Dxil Unroll loops", false, false)