6 years ago · 51c6cfb01c
--- a/include/dxc/DXIL/DxilUtil.h
+++ b/include/dxc/DXIL/DxilUtil.h
@@ -14,6 +14,7 @@
 
				 #include <string>
			
 
				 #include <memory>
			
 
				 #include "llvm/ADT/StringRef.h"
			
 
				+#include "llvm/ADT/Twine.h"
			
 
				 #include "llvm/IR/Constants.h"
			
 
				 
			
 
				 namespace llvm {
			
@@ -30,6 +31,7 @@ class BasicBlock;
 
				 class raw_ostream;
			
 
				 class ModulePass;
			
 
				 class PassRegistry;
			
 
				+class DebugLoc;
			
 
				 
			
 
				 ModulePass *createDxilLoadMetadataPass();
			
 
				 void initializeDxilLoadMetadataPass(llvm::PassRegistry&);
			
@@ -67,6 +69,8 @@ namespace dxilutil {
 
				                              llvm::Function *PatchConstantFunc, bool IsLib);
			
 
				   void EmitErrorOnInstruction(llvm::Instruction *I, llvm::StringRef Msg);
			
 
				   void EmitResMappingError(llvm::Instruction *Res);
			
 
				+  std::string FormatMessageAtLocation(const llvm::DebugLoc &DL, llvm::Twine Msg);
			
 
				+  llvm::Twine FormatMessageWithoutLocation(llvm::Twine Msg);
			
 
				   // Simple demangle just support case "\01?name@" pattern.
			
 
				   llvm::StringRef DemangleFunctionName(llvm::StringRef name);
			
 
				   // ReplaceFunctionName replaces the undecorated portion of originalName with undecorated newName
			
@@ -92,6 +96,7 @@ namespace dxilutil {
 
				   llvm::Value *MergeSelectOnSameValue(llvm::Instruction *SelInst,
			
 
				                                       unsigned startOpIdx,
			
 
				                                       unsigned numOperands);
			
 
				+  bool SimplifyTrivialPHIs(llvm::BasicBlock *BB);
			
 
				   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::StringRef BC,
			
 
				     llvm::LLVMContext &Ctx, std::string &DiagStr);
			
 
				   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::MemoryBuffer *MB,
			
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -259,6 +259,7 @@ void initializeMultiDimArrayToOneDimArrayPass(PassRegistry&);
 
				 void initializeResourceToHandlePass(PassRegistry&);
			
 
				 void initializeSROA_SSAUp_HLSLPass(PassRegistry&);
			
 
				 void initializeHoistConstantArrayPass(PassRegistry&);
			
 
				+void initializeDxilLoopUnrollPass(PassRegistry&);
			
 
				 // HLSL Change Ends
			
 
				 void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
			
 
				 void initializeScalarEvolutionPass(PassRegistry&);
			
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -122,6 +122,9 @@ void initializeSROA_DT_HLSLPass(PassRegistry&);
 
				 //
			
 
				 ModulePass *createSROA_Parameter_HLSL();
			
 
				 void initializeSROA_Parameter_HLSLPass(PassRegistry&);
			
 
				+
			
 
				+Pass *createDxilLoopUnrollPass(unsigned MaxIterationAttempt);
			
 
				+void initializeDxilLoopUnrollPass(PassRegistry&);
			
 
				 //===----------------------------------------------------------------------===//
			
 
				 //
			
 
				 // LowerStaticGlobalIntoAlloca. Replace static globals with alloca if only used
			
--- a/lib/DXIL/DxilUtil.cpp
+++ b/lib/DXIL/DxilUtil.cpp
@@ -229,20 +229,29 @@ static bool EmitErrorOnInstructionFollowPhiSelect(
 
				   return false;
			
 
				 }
			
 
				 
			
 
				+// Builds a diagnostic string of the form "<printed DL>: <Msg>" by printing
				+// the debug location into a string buffer and appending the message text.
				+std::string FormatMessageAtLocation(const DebugLoc &DL, Twine Msg) {
				+  std::string Buffer;
				+  raw_string_ostream Stream(Buffer);
				+  DL.print(Stream);
				+  Stream << ": ";
				+  Stream << Msg;
				+  return Stream.str();
				+}
			
 
				+
			
 
				+// Appends the " Use /Zi for source location." hint to Msg, for diagnostics
				+// that have no debug location to report.
				+// NOTE(review): the returned Twine is assembled from objects local to this
				+// function (the by-value Msg parameter and a temporary Twine). LLVM's Twine
				+// documentation advises against returning or storing Twines for exactly that
				+// reason. Consider returning std::string here and in the DxilUtil.h
				+// declaration - TODO confirm.
				+Twine FormatMessageWithoutLocation(Twine Msg) {
				+  return Twine(Msg) + " Use /Zi for source location.";
				+}
			
 
				+
			
 
				 void EmitErrorOnInstruction(Instruction *I, StringRef Msg) {
			
 
				   const DebugLoc &DL = I->getDebugLoc();
			
 
				   if (DL.get()) {
			
 
				-    std::string locString;
			
 
				-    raw_string_ostream os(locString);
			
 
				-    DL.print(os);
			
 
				-    I->getContext().emitError(os.str() + ": " + Twine(Msg));
			
 
				+    I->getContext().emitError(FormatMessageAtLocation(DL, Msg));
			
 
				     return;
			
 
				   } else if (isa<PHINode>(I) || isa<SelectInst>(I)) {
			
 
				     if (EmitErrorOnInstructionFollowPhiSelect(I, Msg))
			
 
				       return;
			
 
				   }
			
 
				 
			
 
				-  I->getContext().emitError(Twine(Msg) + " Use /Zi for source location.");
			
 
				+  I->getContext().emitError(FormatMessageWithoutLocation(Msg));
			
 
				 }
			
 
				 
			
 
				 const StringRef kResourceMapErrorMsg =
			
@@ -296,6 +305,28 @@ Value *MergeSelectOnSameValue(Instruction *SelInst, unsigned startOpIdx,
 
				   return op0;
			
 
				 }
			
 
				 
			
 
				+// Replaces every PHI node in BB that has exactly one incoming value with that
				+// value and erases the PHI. Returns true if at least one PHI was removed.
				+bool SimplifyTrivialPHIs(BasicBlock *BB) {
				+  SmallVector<Instruction *, 16> DeadPHIs;
				+
				+  for (Instruction &Inst : *BB) {
				+    auto *Phi = dyn_cast<PHINode>(&Inst);
				+    if (!Phi || Phi->getNumIncomingValues() != 1)
				+      continue;
				+
				+    // Forward the sole incoming value to all users. Erasure is deferred so
				+    // the block is not modified while it is being iterated.
				+    Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
				+    DeadPHIs.push_back(Phi);
				+  }
				+
				+  for (Instruction *Dead : DeadPHIs)
				+    Dead->eraseFromParent();
				+
				+  return !DeadPHIs.empty();
				+}
			
 
				+
			
 
				 Value *SelectOnOperation(llvm::Instruction *Inst, unsigned operandIdx) {
			
 
				   Instruction *prototype = Inst;
			
 
				   for (unsigned i = 0; i < prototype->getNumOperands(); i++) {
			
--- a/lib/HLSL/DxcOptimizer.cpp
+++ b/lib/HLSL/DxcOptimizer.cpp
@@ -157,6 +157,7 @@ HRESULT SetupRegistryPassForHLSL() {
 
				     initializeSROA_Parameter_HLSLPass(Registry);
			
 
				     initializeSROA_SSAUpPass(Registry);
			
 
				     initializeSROA_SSAUp_HLSLPass(Registry);
			
 
				+    initializeDxilLoopUnrollPass(Registry);
			
 
				     initializeSampleProfileLoaderPass(Registry);
			
 
				     initializeScalarizerPass(Registry);
			
 
				     initializeScopedNoAliasAAPass(Registry);
			
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -219,6 +219,10 @@ static void addHLSLPasses(bool HLSLHighLevel, unsigned OptLevel, hlsl::HLSLExten
 
				     MPM.add(createHLDeadFunctionEliminationPass());
			
 
				   }
			
 
				 
			
 
				+  // Passes to handle [unroll]
			
 
				+  MPM.add(createLoopRotatePass());
			
 
				+  MPM.add(createDxilLoopUnrollPass(/*MaxIterationAttempt*/ 128));
			
 
				+
			
 
				   // Split struct and array of parameter.
			
 
				   MPM.add(createSROA_Parameter_HLSL());
			
 
				 
			
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -44,6 +44,7 @@ add_llvm_library(LLVMScalarOpts
 
				   Scalar.cpp
			
 
				   ScalarReplAggregates.cpp
			
 
				   ScalarReplAggregatesHLSL.cpp  # HLSL Change
			
 
				+  DxilLoopUnroll.cpp # HLSL Change
			
 
				   Scalarizer.cpp
			
 
				   SeparateConstOffsetFromGEP.cpp
			
 
				   SimplifyCFGPass.cpp
			
--- a/lib/Transforms/Scalar/DxilLoopUnroll.cpp
+++ b/lib/Transforms/Scalar/DxilLoopUnroll.cpp
@@ -0,0 +1,848 @@
 
				+//===- DxilLoopUnroll.cpp - Special Unroll for Constant Values ------------===//
			
 
				+//
			
 
				+//                     The LLVM Compiler Infrastructure
			
 
				+//
			
 
				+// This file is distributed under the University of Illinois Open Source
			
 
				+// License. See LICENSE.TXT for details.
			
 
				+//
			
 
				+//===----------------------------------------------------------------------===//
			
 
				+
			
 
				+//===----------------------------------------------------------------------===//
			
 
				+//
			
 
				+// Special loop unroll routine for creating mandatory constant values and
			
 
				+// loops that have exits.
			
 
				+//
			
 
				+// Overview of algorithm:
			
 
				+// 
			
 
				+// 1. Identify a set of blocks to unroll.
			
 
				+//
			
 
				+//    LLVM's concept of loop excludes exit blocks, which are blocks that no
			
 
				+//    longer have a path to the loop latch. However, some exit blocks in HLSL
			
 
				+//    also need to be unrolled. For example:
			
 
				+//
			
 
				+//        [unroll]
			
 
				+//        for (uint i = 0; i < 4; i++)
			
 
				+//        {
			
 
				+//          if (...)
			
 
				+//          {
			
 
				+//            // This block here is an exit block, since it's
			
 
				+//            // guaranteed to exit the loop.
			
 
				+//            ...
			
 
				+//            a[i] = ...; // Indexing requires unroll.
			
 
				+//            return;
			
 
				+//          }
			
 
				+//        }
			
 
				+//
			
 
				+//
			
 
				+// 2. Create LCSSA based on the new loop boundary.
			
 
				+//
			
 
				+//    See LCSSA.cpp for more details. It creates trivial PHI nodes for any
			
 
				+//    outgoing values of the loop at the exit blocks, so when the loop body
			
 
				+//    gets cloned, the outgoing values can be added to those PHI nodes easily.
			
 
				+//
			
 
				+//    We are using a modified LCSSA routine here because we are including some
			
 
				+//    of the original exit blocks in the unroll.
			
 
				+//
			
 
				+//
			
 
				+// 3. Unroll the loop until we succeed.
			
 
				+//
			
 
				+//    Unlike LLVM, we do not try to find a loop count before unrolling.
			
 
				+//    Instead, we unroll to find a constant terminal condition. Give up when we
			
 
				+//    fail to do so.
			
 
				+//
			
 
				+//
			
 
				+//===----------------------------------------------------------------------===//
			
 
				+
			
 
				+#include "llvm/Pass.h"
			
 
				+#include "llvm/Analysis/LoopPass.h"
			
 
				+#include "llvm/Analysis/InstructionSimplify.h"
			
 
				+#include "llvm/Analysis/AssumptionCache.h"
			
 
				+#include "llvm/Analysis/LoopPass.h"
			
 
				+#include "llvm/Analysis/InstructionSimplify.h"
			
 
				+#include "llvm/Analysis/AssumptionCache.h"
			
 
				+#include "llvm/Transforms/Scalar.h"
			
 
				+#include "llvm/Transforms/Utils/Cloning.h"
			
 
				+#include "llvm/Transforms/Utils/Local.h"
			
 
				+#include "llvm/Transforms/Utils/UnrollLoop.h"
			
 
				+#include "llvm/Transforms/Utils/SSAUpdater.h"
			
 
				+#include "llvm/Transforms/Utils/LoopUtils.h"
			
 
				+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
			
 
				+#include "llvm/IR/Instructions.h"
			
 
				+#include "llvm/IR/Module.h"
			
 
				+#include "llvm/IR/Verifier.h"
			
 
				+#include "llvm/IR/PredIteratorCache.h"
			
 
				+#include "llvm/Support/raw_ostream.h"
			
 
				+#include "llvm/Support/Debug.h"
			
 
				+#include "llvm/ADT/SetVector.h"
			
 
				+
			
 
				+#include "dxc/DXIL/DxilUtil.h"
			
 
				+#include "dxc/HLSL/HLModule.h"
			
 
				+
			
 
				+using namespace llvm;
			
 
				+using namespace hlsl;
			
 
				+
			
 
				+// Copied over from LoopUnroll.cpp - RemapInstruction()
			
 
				+// Rewrites I in place using VMap: each operand with an entry in VMap is
				+// replaced by its mapped value and, when I is a PHI node, each incoming block
				+// with an entry is replaced by its mapped block. Anything without an entry
				+// is left untouched.
				+static inline void RemapInstruction(Instruction *I,
				+                                    ValueToValueMapTy &VMap) {
				+  const unsigned NumOps = I->getNumOperands();
				+  for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
				+    ValueToValueMapTy::iterator Found = VMap.find(I->getOperand(Idx));
				+    if (Found == VMap.end())
				+      continue;
				+    I->setOperand(Idx, Found->second);
				+  }
				+
				+  PHINode *Phi = dyn_cast<PHINode>(I);
				+  if (!Phi)
				+    return;
				+
				+  const unsigned NumIncoming = Phi->getNumIncomingValues();
				+  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
				+    ValueToValueMapTy::iterator Found = VMap.find(Phi->getIncomingBlock(Idx));
				+    if (Found == VMap.end())
				+      continue;
				+    Phi->setIncomingBlock(Idx, cast<BasicBlock>(Found->second));
				+  }
				+}
			
 
				+
			
 
				+
			
 
				+namespace {
			
 
				+
			
 
				+// Loop pass implementing the special unroll described in the overview comment
				+// at the top of this file. runOnLoop() only acts on loops carrying the
				+// "llvm.loop.unroll.full" metadata.
				+class DxilLoopUnroll : public LoopPass {
				+public:
				+  // Pass identification.
				+  static char ID;
				+
				+  // Functions for which runOnLoop() has already run the alloca promotion
				+  // (Mem2Reg) step, so that it happens at most once per function.
				+  std::unordered_set<Function *> CleanedUpAlloca;
				+  // Upper bound on unroll attempts; presumably the number of iterations tried
				+  // before giving up - TODO confirm against the rest of runOnLoop().
				+  unsigned MaxIterationAttempt = 0;
				+
				+  // Registers the pass with the global PassRegistry on construction.
				+  DxilLoopUnroll(unsigned MaxIterationAttempt = 128) :
				+    LoopPass(ID),
				+    MaxIterationAttempt(MaxIterationAttempt)
				+  {
				+    initializeDxilLoopUnrollPass(*PassRegistry::getPassRegistry());
				+  }
				+  const char *getPassName() const override { return "Dxil Loop Unroll"; }
				+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
				+  // Requires LoopInfo, LoopSimplify form, the assumption cache and the
				+  // dominator tree; declares LoopInfo and the dominator tree as preserved.
				+  void getAnalysisUsage(AnalysisUsage &AU) const override {
				+    AU.addRequired<LoopInfoWrapperPass>();
				+    AU.addPreserved<LoopInfoWrapperPass>();
				+    AU.addRequiredID(LoopSimplifyID);
				+    AU.addRequired<AssumptionCacheTracker>();
				+    AU.addRequired<DominatorTreeWrapperPass>();
				+    AU.addPreserved<DominatorTreeWrapperPass>();
				+  }
				+};
			
 
				+
			
 
				+char DxilLoopUnroll::ID;
			
 
				+
			
 
				+// Reports an unroll failure for loop L through the LLVMContext of its header.
				+// The text is prefixed with the loop's start location when one is available,
				+// otherwise it gets the "no location" hint. WarnOnly selects a warning
				+// instead of an error.
				+static void FailLoopUnroll(bool WarnOnly, Loop *L, const char *Message) {
				+  DebugLoc StartLoc = L->getStartLoc();
				+  LLVMContext &Context = L->getHeader()->getContext();
				+
				+  // Materialize the final text once, then pick the severity.
				+  std::string Text;
				+  if (StartLoc.get())
				+    Text = hlsl::dxilutil::FormatMessageAtLocation(StartLoc, Message);
				+  else
				+    Text = hlsl::dxilutil::FormatMessageWithoutLocation(Message).str();
				+
				+  if (WarnOnly)
				+    Context.emitWarning(Text);
				+  else
				+    Context.emitError(Text);
				+}
			
 
				+
			
 
				+// Bookkeeping for a single cloned iteration of the loop being unrolled.
				+struct LoopIteration {
				+  // Blocks making up this iteration's body.
				+  SmallVector<BasicBlock *, 16> Body;
				+  // Latch and header blocks of this iteration; null until assigned.
				+  BasicBlock *Latch = nullptr;
				+  BasicBlock *Header = nullptr;
				+  // Value map associated with this iteration's clone.
				+  ValueToValueMapTy VarMap;
				+  // Blocks that are included in the clone that are not in the core loop body.
				+  SetVector<BasicBlock *> Extended;
				+
				+  LoopIteration() = default;
				+};
			
 
				+
			
 
				+// Returns true if V is a ConstantInt of type i1. When it is and Val is
				+// non-null, the constant's boolean value is stored to *Val.
				+static bool GetConstantI1(Value *V, bool *Val=nullptr) {
				+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
				+  if (!CI)
				+    return false;
				+  if (!V->getType()->isIntegerTy(1))
				+    return false;
				+
				+  if (Val)
				+    *Val = CI->getLimitedValue() != 0;
				+  return true;
				+}
			
 
				+
			
 
				+// Copied from llvm::SimplifyInstructionsInBlock
			
 
				+// Runs recursivelySimplifyInstruction() on every non-terminator instruction
				+// of BB. Unlike the routine it was copied from, the step that deletes
				+// trivially dead instructions is compiled out (see the #if 0 below).
				+// Returns true if any instruction was simplified.
				+static bool SimplifyInstructionsInBlock_NoDelete(BasicBlock *BB,
				+                                       const TargetLibraryInfo *TLI) {
				+  bool MadeChange = false;
				+
				+#ifndef NDEBUG
				+  // In debug builds, ensure that the terminator of the block is never replaced
				+  // or deleted by these simplifications. The idea of simplification is that it
				+  // cannot introduce new instructions, and there is no way to replace the
				+  // terminator of a block without introducing a new instruction.
				+  AssertingVH<Instruction> TerminatorVH(--BB->end());
				+#endif
				+
				+  // E is the terminator; the loop visits everything before it.
				+  for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) {
				+    assert(!BI->isTerminator());
				+    Instruction *Inst = BI++;
				+
				+    // Track the next instruction through a value handle. If the handle no
				+    // longer matches BI after simplification, restart from the block's
				+    // beginning.
				+    WeakVH BIHandle(BI);
				+    if (recursivelySimplifyInstruction(Inst, TLI)) {
				+      MadeChange = true;
				+      if (BIHandle != BI)
				+        BI = BB->begin();
				+      continue;
				+    }
				+#if 0 // HLSL Change
				+    MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
				+#endif // HLSL Change
				+    if (BIHandle != BI)
				+      BI = BB->begin();
				+  }
				+  return MadeChange;
				+}
			
 
				+
			
 
				+// Returns true if loop L has a loop ID carrying the "llvm.loop.unroll.full"
				+// unroll metadata.
				+static bool IsMarkedFullUnroll(Loop *L) {
				+  MDNode *LoopID = L->getLoopID();
				+  if (!LoopID)
				+    return false;
				+  return GetUnrollMetadata(LoopID, "llvm.loop.unroll.full");
				+}
			
 
				+
			
 
				+// Returns true if at least one successor of BB is contained in loop L.
				+static bool HasSuccessorsInLoop(BasicBlock *BB, Loop *L) {
				+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
				+    if (L->contains(*SI))
				+      return true;
				+  return false;
				+}
			
 
				+
			
 
				+// Calls removePredecessor(BB) on every successor of BB. The successor list is
				+// snapshotted first and the snapshot is what gets iterated.
				+static void DetachFromSuccessors(BasicBlock *BB) {
				+  SmallVector<BasicBlock *, 16> Snapshot(succ_begin(BB), succ_end(BB));
				+  for (unsigned Idx = 0, Count = Snapshot.size(); Idx != Count; ++Idx)
				+    Snapshot[Idx]->removePredecessor(BB);
				+}
			
 
				+
			
 
				+/// Return true if the specified block is in the list.
			
 
				+static bool isExitBlock(BasicBlock *BB,
				+                        const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
				+  // Plain linear scan over the exit list.
				+  for (BasicBlock *Candidate : ExitBlocks) {
				+    if (Candidate == BB)
				+      return true;
				+  }
				+  return false;
				+}
			
 
				+
			
 
				+// Copied and modified from LCSSA.cpp
			
 
				+// Rewrites every use of Inst that lies outside Body (the set of blocks being
				+// unrolled) so that it goes through a PHI node placed in one of ExitBlocks.
				+// Body replaces the L.contains() checks of the LCSSA.cpp original, as marked
				+// by the "HLSL Change" comments. Returns false if Inst has no such use and
				+// nothing was changed, true otherwise.
				+static bool processInstruction(SetVector<BasicBlock *> &Body, Loop &L, Instruction &Inst, DominatorTree &DT, // HLSL Change
				+                               const SmallVectorImpl<BasicBlock *> &ExitBlocks,
				+                               PredIteratorCache &PredCache, LoopInfo *LI) {
				+
				+  SmallVector<Use *, 16> UsesToRewrite;
				+
				+  BasicBlock *InstBB = Inst.getParent();
				+
				+  // Collect the uses located outside Body. A use by a PHI node is attributed
				+  // to the incoming block for that use rather than to the PHI's own block.
				+  for (Use &U : Inst.uses()) {
				+    Instruction *User = cast<Instruction>(U.getUser());
				+    BasicBlock *UserBB = User->getParent();
				+    if (PHINode *PN = dyn_cast<PHINode>(User))
				+      UserBB = PN->getIncomingBlock(U);
				+
				+    if (InstBB != UserBB && /*!L.contains(UserBB)*/!Body.count(UserBB)) // HLSL Change
				+      UsesToRewrite.push_back(&U);
				+  }
				+
				+  // If there are no uses outside the loop, exit with no change.
				+  if (UsesToRewrite.empty())
				+    return false;
				+#if 0 // HLSL Change
				+  ++NumLCSSA; // We are applying the transformation
				+#endif // HLSL Change
				+  // Invoke instructions are special in that their result value is not available
				+  // along their unwind edge. The code below tests to see whether DomBB
				+  // dominates
				+  // the value, so adjust DomBB to the normal destination block, which is
				+  // effectively where the value is first usable.
				+  BasicBlock *DomBB = Inst.getParent();
				+  if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst))
				+    DomBB = Inv->getNormalDest();
				+
				+  DomTreeNode *DomNode = DT.getNode(DomBB);
				+
				+  SmallVector<PHINode *, 16> AddedPHIs;
				+  SmallVector<PHINode *, 8> PostProcessPHIs;
				+
				+  SSAUpdater SSAUpdate;
				+  SSAUpdate.Initialize(Inst.getType(), Inst.getName());
				+
				+  // Insert the LCSSA phi's into all of the exit blocks dominated by the
				+  // value, and add them to the Phi's map.
				+  for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(),
				+                                                     BBE = ExitBlocks.end();
				+       BBI != BBE; ++BBI) {
				+    BasicBlock *ExitBB = *BBI;
				+    if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
				+      continue;
				+
				+    // If we already inserted something for this BB, don't reprocess it.
				+    if (SSAUpdate.HasValueForBlock(ExitBB))
				+      continue;
				+
				+    PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
				+                                  Inst.getName() + ".lcssa", ExitBB->begin());
				+
				+    // Add inputs from inside the loop for this PHI.
				+    for (BasicBlock *Pred : PredCache.get(ExitBB)) {
				+      PN->addIncoming(&Inst, Pred);
				+
				+      // If the exit block has a predecessor not within the loop, arrange for
				+      // the incoming value use corresponding to that predecessor to be
				+      // rewritten in terms of a different LCSSA PHI.
				+      if (/*!L.contains(Pred)*/ !Body.count(Pred)) // HLSL Change
				+        UsesToRewrite.push_back(
				+            &PN->getOperandUse(PN->getOperandNumForIncomingValue(
				+                 PN->getNumIncomingValues() - 1)));
				+    }
				+
				+    AddedPHIs.push_back(PN);
				+
				+    // Remember that this phi makes the value alive in this block.
				+    SSAUpdate.AddAvailableValue(ExitBB, PN);
				+
				+    // LoopSimplify might fail to simplify some loops (e.g. when indirect
				+    // branches are involved). In such situations, it might happen that an exit
				+    // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
				+    // PHIs in such an exit block, we are also inserting PHIs into L2's header.
				+    // This could break LCSSA form for L2 because these inserted PHIs can also
				+    // have uses outside of L2. Remember all PHIs in such situation as to
				+    // revisit them later on. FIXME: Remove this if indirectbr support into
				+    // LoopSimplify gets improved.
				+    if (auto *OtherLoop = LI->getLoopFor(ExitBB))
				+      if (!L.contains(OtherLoop))
				+        PostProcessPHIs.push_back(PN);
				+  }
				+
				+  // Rewrite all uses outside the loop in terms of the new PHIs we just
				+  // inserted.
				+  for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
				+    // If this use is in an exit block, rewrite to use the newly inserted PHI.
				+    // This is required for correctness because SSAUpdate doesn't handle uses in
				+    // the same block.  It assumes the PHI we inserted is at the end of the
				+    // block.
				+    Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser());
				+    BasicBlock *UserBB = User->getParent();
				+    if (PHINode *PN = dyn_cast<PHINode>(User))
				+      UserBB = PN->getIncomingBlock(*UsesToRewrite[i]);
				+
				+    if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
				+      // Tell the VHs that the uses changed. This updates SCEV's caches.
				+      if (UsesToRewrite[i]->get()->hasValueHandle())
				+        ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
				+      UsesToRewrite[i]->set(UserBB->begin());
				+      continue;
				+    }
				+
				+    // Otherwise, do full PHI insertion.
				+    SSAUpdate.RewriteUse(*UsesToRewrite[i]);
				+  }
				+
				+  // Post process PHI instructions that were inserted into another disjoint loop
				+  // and update their exits properly.
				+  for (auto *I : PostProcessPHIs) {
				+    if (I->use_empty())
				+      continue;
				+
				+    BasicBlock *PHIBB = I->getParent();
				+    Loop *OtherLoop = LI->getLoopFor(PHIBB);
				+    SmallVector<BasicBlock *, 8> EBs;
				+    OtherLoop->getExitBlocks(EBs);
				+    if (EBs.empty())
				+      continue;
				+
				+    // Recurse and re-process each PHI instruction. FIXME: we should really
				+    // convert this entire thing to a worklist approach where we process a
				+    // vector of instructions...
				+    processInstruction(Body, *OtherLoop, *I, DT, EBs, PredCache, LI);
				+  }
				+
				+  // Remove PHI nodes that did not have any uses rewritten.
				+  for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) {
				+    if (AddedPHIs[i]->use_empty())
				+      AddedPHIs[i]->eraseFromParent();
				+  }
				+
				+  return true;
				+
				+}
			
 
				+
			
 
				+// Copied from LCSSA.cpp
			
 
				+// Returns true if BB dominates at least one block in ExitBlocks according to
				+// DT. CreateLCSSA() uses this to skip blocks that dominate no exit.
				+static bool blockDominatesAnExit(BasicBlock *BB,
				+                     DominatorTree &DT,
				+                     const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
				+  DomTreeNode *DomNode = DT.getNode(BB);
				+  for (BasicBlock *Exit : ExitBlocks)
				+    if (DT.dominates(DomNode, DT.getNode(Exit)))
				+      return true;
				+  return false;
				+}
			
 
				+
			
 
				+// Copied from LCSSA.cpp
			
 
				+//
			
 
				+// We need to recreate the LCSSA form since our loop boundary is potentially different from
			
 
				+// the canonical one.
			
 
				+// Walks every instruction of every block in Body and hands those that may
				+// have uses outside Body to processInstruction(), which rewrites such uses
				+// through PHI nodes in ExitBlocks. Returns true if anything was rewritten.
				+static bool CreateLCSSA(SetVector<BasicBlock *> &Body, const SmallVectorImpl<BasicBlock *> &ExitBlocks, Loop *L, DominatorTree &DT, LoopInfo *LI) {
				+  PredIteratorCache PredCache;
				+  bool AnyRewritten = false;
				+
				+  for (BasicBlock *BB : Body) {
				+    // A block that dominates none of the exits cannot define a value that is
				+    // used past them, so its instructions need no scanning.
				+    if (!blockDominatesAnExit(BB, DT, ExitBlocks))
				+      continue;
				+
				+    for (Instruction &Inst : *BB) {
				+      // Fast reject: no uses at all (like stores).
				+      if (Inst.use_empty())
				+        continue;
				+
				+      // Fast reject: a single use by a non-PHI instruction in this block.
				+      if (Inst.hasOneUse()) {
				+        Instruction *OnlyUser = Inst.user_back();
				+        if (OnlyUser->getParent() == BB && !isa<PHINode>(OnlyUser))
				+          continue;
				+      }
				+
				+      if (processInstruction(Body, *L, Inst, DT, ExitBlocks, PredCache, LI))
				+        AnyRewritten = true;
				+    }
				+  }
				+
				+  return AnyRewritten;
				+}
			
 
				+
			
 
				+// Starting from the leading PHI nodes of Header, follows def-use chains
				+// through instructions located in BlocksInLoop. Whenever a GEP whose result
				+// points to an HLSL object type is reached, its parent block is added to
				+// ProblemBlocks and the chain is not followed past that GEP.
				+static void FindProblemBlocks(BasicBlock *Header, const SmallVectorImpl<BasicBlock *> &BlocksInLoop, std::unordered_set<BasicBlock *> &ProblemBlocks) {
				+  std::unordered_set<BasicBlock *> LoopBlockSet(BlocksInLoop.begin(), BlocksInLoop.end());
				+  std::unordered_set<Instruction *> Visited;
				+  SmallVector<Instruction *, 16> Pending;
				+
				+  // Seed the traversal with the PHI nodes at the top of the header.
				+  for (BasicBlock::iterator It = Header->begin(), End = Header->end();
				+       It != End && isa<PHINode>(It); ++It) {
				+    Instruction *Phi = &*It;
				+    Pending.push_back(Phi);
				+    Visited.insert(Phi);
				+  }
				+
				+  while (!Pending.empty()) {
				+    Instruction *Cur = Pending.pop_back_val();
				+
				+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Cur)) {
				+      // NOTE: This is very conservative in the following conditions:
				+      // - constant global resource arrays with external linkage (these can be
				+      //   dynamically accessed)
				+      // - global resource arrays or alloca resource arrays, as long as all
				+      //   writes come from the same original resource definition (which can
				+      //   also be an array).
				+      //
				+      // We may want to make this more precise in the future if it becomes a
				+      // problem.
				+      Type *PointeeTy = GEP->getType()->getPointerElementType();
				+      if (hlsl::dxilutil::IsHLSLObjectType(PointeeTy)) {
				+        ProblemBlocks.insert(GEP->getParent());
				+        continue; // Do not propagate through this GEP.
				+      }
				+    }
				+
				+    // Queue every not-yet-visited instruction user that lives in the loop.
				+    for (User *U : Cur->users()) {
				+      Instruction *UserInst = dyn_cast<Instruction>(U);
				+      if (!UserInst)
				+        continue;
				+      if (Visited.count(UserInst))
				+        continue;
				+      if (!LoopBlockSet.count(UserInst->getParent()))
				+        continue;
				+
				+      Visited.insert(UserInst);
				+      Pending.push_back(UserInst);
				+    }
				+  }
				+}
			
 
				+
			
 
				+// Returns true if Ty is a floating point type, or an array, vector or struct
				+// that (recursively) contains one.
				+static bool ContainsFloatingPointType(Type *Ty) {
				+  if (Ty->isFloatingPointTy())
				+    return true;
				+  if (Ty->isArrayTy())
				+    return ContainsFloatingPointType(Ty->getArrayElementType());
				+  if (Ty->isVectorTy())
				+    return ContainsFloatingPointType(Ty->getVectorElementType());
				+  if (!Ty->isStructTy())
				+    return false;
				+
				+  // Struct: any floating point field makes the whole type qualify.
				+  const unsigned FieldCount = Ty->getStructNumElements();
				+  for (unsigned Field = 0; Field < FieldCount; ++Field)
				+    if (ContainsFloatingPointType(Ty->getStructElementType(Field)))
				+      return true;
				+  return false;
				+}
			
 
				+
			
 
				+// Repeatedly promotes allocas found in the entry block of F to registers
				+// until no promotable candidate remains. Allocas that carry the precise
				+// attribute and contain a floating point type are left alone. Returns true
				+// if at least one promotion round ran.
				+static bool Mem2Reg(Function &F, DominatorTree &DT, AssumptionCache &AC) {
				+  BasicBlock &Entry = F.getEntryBlock();
				+  std::vector<AllocaInst*> Promotable;
				+  bool Changed = false;
				+
				+  for (;;) {
				+    Promotable.clear();
				+
				+    // Scan every instruction of the entry block except its terminator.
				+    for (BasicBlock::iterator It = Entry.begin(), Term = --Entry.end(); It != Term; ++It) {
				+      AllocaInst *AI = dyn_cast<AllocaInst>(It);
				+      if (!AI || !isAllocaPromotable(AI))
				+        continue;
				+      if (HLModule::HasPreciseAttributeWithMetadata(AI) &&
				+          ContainsFloatingPointType(AI->getAllocatedType()))
				+        continue;
				+      Promotable.push_back(AI);
				+    }
				+
				+    if (Promotable.empty())
				+      break;
				+
				+    PromoteMemToReg(Promotable, DT, nullptr, &AC);
				+    Changed = true;
				+  }
				+
				+  return Changed;
				+}
			
 
				+
			
 
				+
			
 
				+// Attempts to fully unroll a loop marked [unroll].
				+//
				+// The loop body (plus any exit blocks flagged by FindProblemBlocks) is cloned
				+// once per iteration. After each clone the instructions are simplified; the
				+// unroll is complete when the cloned latch's branch condition folds to a
				+// constant that no longer takes the back edge. At most MaxIterationAttempt
				+// clones are attempted.
				+//
				+// L   - candidate loop. Nothing is done unless it is marked for full unroll,
				+//       is safe to clone, has a single latch and a single predecessor, and
				+//       its latch ends in a conditional branch (i.e. the loop is rotated).
				+// LPM - loop pass manager; L is deleted from its queue after a successful
				+//       unroll.
				+//
				+// On failure all cloned blocks are erased again and FailLoopUnroll is called
				+// with "Could not unroll loop." (OnlyWarnOnFail comes from bFXCCompatMode).
				+//
				+// Returns true if the function's IR was, or may have been, modified.
				+bool DxilLoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
				+
				+  // If the loop is not marked as [unroll], don't do anything.
				+  if (!IsMarkedFullUnroll(L))
				+    return false;
				+
				+  if (!L->isSafeToClone())
				+    return false;
				+
				+  Function *F = L->getHeader()->getParent();
				+  bool OnlyWarnOnFail = false;
				+  if (F->getParent()->HasHLModule()) {
				+    HLModule &HM = F->getParent()->GetHLModule();
				+    OnlyWarnOnFail = HM.GetHLOptions().bFXCCompatMode;
				+  }
				+
				+  // Analysis passes
				+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				+  AssumptionCache *AC =
				+    &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
				+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				+
				+  Loop *OuterL = L->getParentLoop();
				+  BasicBlock *Latch = L->getLoopLatch();
				+  BasicBlock *Header = L->getHeader();
				+  BasicBlock *Predecessor = L->getLoopPredecessor();
				+
				+  // Quit if we don't have a single latch block or predecessor
				+  if (!Latch || !Predecessor) {
				+    return false;
				+  }
				+
				+  // If the loop exit condition is not in the latch, then the loop is not rotated. Give up.
				+  // dyn_cast (not cast) so a latch ending in a non-branch terminator is
				+  // rejected here instead of tripping the cast<> assertion.
				+  BranchInst *LatchBranch = dyn_cast<BranchInst>(Latch->getTerminator());
				+  if (!LatchBranch || !LatchBranch->isConditional()) {
				+    return false;
				+  }
				+
				+  // From here on the IR may be mutated even if the unroll ultimately fails,
				+  // so keep track of it for the return value.
				+  bool Changed = false;
				+
				+  // Promote alloca's
				+  if (!CleanedUpAlloca.count(F)) {
				+    CleanedUpAlloca.insert(F);
				+    if (Mem2Reg(*F, *DT, *AC))
				+      Changed = true;
				+  }
				+
				+  SmallVector<BasicBlock *, 16> ExitBlocks;
				+  L->getExitBlocks(ExitBlocks);
				+  std::unordered_set<BasicBlock *> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
				+
				+  SmallVector<BasicBlock *, 16> BlocksInLoop; // Set of blocks including both body and exits
				+  BlocksInLoop.append(L->getBlocks().begin(), L->getBlocks().end());
				+  BlocksInLoop.append(ExitBlocks.begin(), ExitBlocks.end());
				+
				+  // Heuristically find blocks that likely need to be unrolled
				+  std::unordered_set<BasicBlock *> ProblemBlocks;
				+  FindProblemBlocks(L->getHeader(), BlocksInLoop, ProblemBlocks);
				+
				+  // Keep track of the PHI nodes at the header.
				+  SmallVector<PHINode *, 16> PHIs;
				+  for (auto it = Header->begin(); it != Header->end(); ++it) {
				+    PHINode *PN = dyn_cast<PHINode>(it);
				+    if (!PN)
				+      break; // PHIs are grouped at the top of the block
				+    PHIs.push_back(PN);
				+  }
				+
				+  SetVector<BasicBlock *> ToBeCloned; // List of blocks that will be cloned.
				+  for (BasicBlock *BB : L->getBlocks()) // Include the body right away
				+    ToBeCloned.insert(BB);
				+
				+  // Find the exit blocks that also need to be included
				+  // in the unroll.
				+  SmallVector<BasicBlock *, 8> NewExits; // New set of exit blocks as boundaries for LCSSA
				+  SmallVector<BasicBlock *, 8> FakeExits; // Set of blocks created to allow cloning original exit blocks.
				+  std::unordered_set<BasicBlock *> VisitedExits;
				+  for (BasicBlock *BB : ExitBlocks) {
				+    // getExitBlocks() may list the same block once per exiting edge. Handle
				+    // each exit block only once; otherwise a second visit would split the
				+    // block again and chain a redundant fake exit behind the first one.
				+    if (!VisitedExits.insert(BB).second)
				+      continue;
				+
				+    bool CloneThisExitBlock = ProblemBlocks.count(BB);
				+
				+    if (CloneThisExitBlock) {
				+      ToBeCloned.insert(BB);
				+
				+      // If we are cloning this basic block, we must create a new exit
				+      // block for inserting LCSSA PHI nodes.
				+      BasicBlock *FakeExit = BasicBlock::Create(BB->getContext(), "loop.exit.new");
				+      F->getBasicBlockList().insert(BB, FakeExit);
				+
				+      // Move BB's terminator into the new block and make BB fall through
				+      // to it, so FakeExit sits between BB and BB's old successors.
				+      TerminatorInst *OldTerm = BB->getTerminator();
				+      OldTerm->removeFromParent();
				+      FakeExit->getInstList().push_back(OldTerm);
				+
				+      BranchInst::Create(FakeExit, BB);
				+      // PHIs in the old successors now receive their value from FakeExit.
				+      for (BasicBlock *Succ : successors(FakeExit)) {
				+        for (Instruction &I : *Succ) {
				+          if (PHINode *PN = dyn_cast<PHINode>(&I)) {
				+            for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
				+              if (PN->getIncomingBlock(i) == BB)
				+                PN->setIncomingBlock(i, FakeExit);
				+            }
				+          }
				+        }
				+      }
				+
				+      NewExits.push_back(FakeExit);
				+      FakeExits.push_back(FakeExit);
				+
				+      // Update Dom tree with new exit
				+      if (!DT->getNode(FakeExit))
				+        DT->addNewBlock(FakeExit, BB);
				+
				+      Changed = true;
				+    }
				+    else {
				+      // If we are not including this exit block in the unroll,
				+      // use it for LCSSA as normal.
				+      NewExits.push_back(BB);
				+    }
				+  }
				+
				+  // Registers the newly created exit blocks with the parent loop when they
				+  // branch back into it. Needed on success and on failure, because the
				+  // fake exits stay in the function either way.
				+  auto AddFakeExitsToOuterLoop = [&]() {
				+    if (!OuterL)
				+      return;
				+    for (BasicBlock *BB : FakeExits) {
				+      if (HasSuccessorsInLoop(BB, OuterL))
				+        OuterL->addBasicBlockToLoop(BB, *LI);
				+    }
				+  };
				+
				+  // Simplify the PHI nodes that have single incoming value. The original LCSSA form
				+  // (if exists) does not necessarily work for our unroll because we may be unrolling
				+  // from a different boundary.
				+  for (BasicBlock *BB : BlocksInLoop) {
				+    if (hlsl::dxilutil::SimplifyTrivialPHIs(BB))
				+      Changed = true;
				+  }
				+
				+  // Re-establish LCSSA form to get ready for unrolling.
				+  CreateLCSSA(ToBeCloned, NewExits, L, *DT, LI);
				+  // Whether CreateLCSSA inserted anything is not visible here; be
				+  // conservative and report the function as modified.
				+  Changed = true;
				+
				+  SmallVector<std::unique_ptr<LoopIteration>, 16> Iterations; // List of cloned iterations
				+  bool Succeeded = false;
				+
				+  for (unsigned IterationI = 0; IterationI < this->MaxIterationAttempt; IterationI++) {
				+
				+    LoopIteration *PrevIteration = nullptr;
				+    if (Iterations.size())
				+      PrevIteration = Iterations.back().get();
				+    Iterations.push_back(std::make_unique<LoopIteration>());
				+    LoopIteration &CurIteration = *Iterations.back().get();
				+
				+    // Clone the blocks.
				+    for (BasicBlock *BB : ToBeCloned) {
				+
				+      BasicBlock *ClonedBB = CloneBasicBlock(BB, CurIteration.VarMap);
				+      CurIteration.VarMap[BB] = ClonedBB;
				+      ClonedBB->insertInto(F, Header);
				+
				+      if (ExitBlockSet.count(BB))
				+        CurIteration.Extended.insert(ClonedBB);
				+
				+      CurIteration.Body.push_back(ClonedBB);
				+
				+      // Identify the special blocks.
				+      if (BB == Latch) {
				+        CurIteration.Latch = ClonedBB;
				+      }
				+      if (BB == Header) {
				+        CurIteration.Header = ClonedBB;
				+      }
				+    }
				+
				+    for (BasicBlock *BB : ToBeCloned) {
				+      BasicBlock *ClonedBB = cast<BasicBlock>(CurIteration.VarMap[BB]);
				+      // If branching to outside of the loop, need to update the
				+      // phi nodes there to include new values.
				+      for (BasicBlock *Succ : successors(ClonedBB)) {
				+        if (ToBeCloned.count(Succ))
				+          continue;
				+        for (Instruction &I : *Succ) {
				+          PHINode *PN = dyn_cast<PHINode>(&I);
				+          if (!PN)
				+            break;
				+
				+          // Find the incoming value for this new block. If there is an entry
				+          // for this block in the map, then it was defined in the loop, use it.
				+          // Otherwise it came from outside the loop.
				+          Value *OldIncoming = PN->getIncomingValueForBlock(BB);
				+          Value *NewIncoming = OldIncoming;
				+          ValueToValueMapTy::iterator Itor = CurIteration.VarMap.find(OldIncoming);
				+          if (Itor != CurIteration.VarMap.end())
				+            NewIncoming = Itor->second;
				+          PN->addIncoming(NewIncoming, ClonedBB);
				+        }
				+      }
				+    }
				+
				+    // Remap the instructions inside of cloned blocks.
				+    for (BasicBlock *BB : CurIteration.Body) {
				+      for (Instruction &I : *BB) {
				+        ::RemapInstruction(&I, CurIteration.VarMap);
				+      }
				+    }
				+
				+    // If this is the first block
				+    if (!PrevIteration) {
				+      // Replace the phi nodes in the clone block with the values coming
				+      // from outside of the loop
				+      for (PHINode *PN : PHIs) {
				+        PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
				+        Value *ReplacementVal = ClonedPN->getIncomingValueForBlock(Predecessor);
				+        ClonedPN->replaceAllUsesWith(ReplacementVal);
				+        ClonedPN->eraseFromParent();
				+        CurIteration.VarMap[PN] = ReplacementVal;
				+      }
				+    }
				+    else {
				+      // Replace the phi nodes with the value defined INSIDE the previous iteration.
				+      for (PHINode *PN : PHIs) {
				+        PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
				+        // Value carried around the back edge. If it was defined in the loop,
				+        // use the previous iteration's clone of it. A constant or a value
				+        // defined outside the loop has no clone and is used unchanged;
				+        // VarMap[...] would default-insert a null mapping for it and hand
				+        // that null to replaceAllUsesWith.
				+        Value *ReplacementVal = PN->getIncomingValueForBlock(Latch);
				+        ValueToValueMapTy::iterator PrevItor = PrevIteration->VarMap.find(ReplacementVal);
				+        if (PrevItor != PrevIteration->VarMap.end())
				+          ReplacementVal = PrevItor->second;
				+        ClonedPN->replaceAllUsesWith(ReplacementVal);
				+        ClonedPN->eraseFromParent();
				+        CurIteration.VarMap[PN] = ReplacementVal;
				+      }
				+
				+      // Make the latch of the previous iteration branch to the header
				+      // of this new iteration.
				+      if (BranchInst *BI = dyn_cast<BranchInst>(PrevIteration->Latch->getTerminator())) {
				+        for (unsigned i = 0; i < BI->getNumSuccessors(); i++) {
				+          if (BI->getSuccessor(i) == PrevIteration->Header) {
				+            BI->setSuccessor(i, CurIteration.Header);
				+            break;
				+          }
				+        }
				+      }
				+    }
				+
				+    // Simplify instructions in the cloned blocks to create
				+    // constant exit conditions.
				+    for (BasicBlock *ClonedBB : CurIteration.Body)
				+      SimplifyInstructionsInBlock_NoDelete(ClonedBB, nullptr);
				+
				+    // Check exit condition to see if we fully unrolled the loop
				+    if (BranchInst *BI = dyn_cast<BranchInst>(CurIteration.Latch->getTerminator())) {
				+      bool Cond = false;
				+      if (GetConstantI1(BI->getCondition(), &Cond)) {
				+        // Successor 0 is taken when Cond is true, so (Cond ? 1 : 0) is the
				+        // edge that is NOT taken. If that edge is the back edge, the loop
				+        // terminates in this iteration.
				+        if (BI->getSuccessor(Cond ? 1 : 0) == CurIteration.Header) {
				+          Succeeded = true;
				+          break;
				+        }
				+      }
				+    }
				+  }
				+
				+  if (Succeeded) {
				+    LoopIteration &FirstIteration = *Iterations.front().get();
				+    // Make the predecessor branch to the first new header. Use the generic
				+    // terminator interface so a predecessor that does not end in a
				+    // BranchInst is redirected as well instead of asserting.
				+    {
				+      TerminatorInst *PredTerm = Predecessor->getTerminator();
				+      for (unsigned i = 0, NumSucc = PredTerm->getNumSuccessors(); i < NumSucc; i++) {
				+        if (PredTerm->getSuccessor(i) == Header) {
				+          PredTerm->setSuccessor(i, FirstIteration.Header);
				+        }
				+      }
				+    }
				+
				+    if (OuterL) {
				+      // Core body blocks need to be added to outer loop
				+      for (size_t i = 0; i < Iterations.size(); i++) {
				+        LoopIteration &Iteration = *Iterations[i].get();
				+        for (BasicBlock *BB : Iteration.Body) {
				+          if (!Iteration.Extended.count(BB)) {
				+            OuterL->addBasicBlockToLoop(BB, *LI);
				+          }
				+        }
				+      }
				+
				+      // Our newly created exit blocks may need to be added to outer loop
				+      AddFakeExitsToOuterLoop();
				+
				+      // Cloned exit blocks may need to be added to outer loop
				+      for (size_t i = 0; i < Iterations.size(); i++) {
				+        LoopIteration &Iteration = *Iterations[i].get();
				+        for (BasicBlock *BB : Iteration.Extended) {
				+          if (HasSuccessorsInLoop(BB, OuterL))
				+            OuterL->addBasicBlockToLoop(BB, *LI);
				+        }
				+      }
				+    }
				+
				+    // Remove the original blocks that we've cloned from all loops.
				+    for (BasicBlock *BB : ToBeCloned)
				+      LI->removeBlock(BB);
				+
				+    LPM.deleteLoopFromQueue(L);
				+
				+    // Remove dead blocks. Done in three passes so that every block is
				+    // unlinked and has dropped its operands before any block is destroyed.
				+    for (BasicBlock *BB : ToBeCloned)
				+      DetachFromSuccessors(BB);
				+    for (BasicBlock *BB : ToBeCloned)
				+      BB->dropAllReferences();
				+    for (BasicBlock *BB : ToBeCloned)
				+      BB->eraseFromParent();
				+
				+    if (OuterL) {
				+      // This process may have created multiple back edges for the
				+      // parent loop. Simplify to keep it well-formed.
				+      simplifyLoop(OuterL, DT, LI, this, nullptr, nullptr, AC);
				+    }
				+
				+    return true;
				+  }
				+
				+  // If we were unsuccessful in unrolling the loop
				+  else {
				+    FailLoopUnroll(OnlyWarnOnFail, L, "Could not unroll loop.");
				+
				+    // Remove all the cloned blocks
				+    for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
				+      LoopIteration &Iteration = *Ptr.get();
				+      for (BasicBlock *BB : Iteration.Body)
				+        DetachFromSuccessors(BB);
				+    }
				+    for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
				+      LoopIteration &Iteration = *Ptr.get();
				+      for (BasicBlock *BB : Iteration.Body)
				+        BB->dropAllReferences();
				+    }
				+    for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
				+      LoopIteration &Iteration = *Ptr.get();
				+      for (BasicBlock *BB : Iteration.Body)
				+        BB->eraseFromParent();
				+    }
				+
				+    // The fake exit blocks created above are still part of the function;
				+    // keep LoopInfo consistent for the parent loop.
				+    AddFakeExitsToOuterLoop();
				+
				+    // The clones are gone, but alloca promotion, the exit-block split and
				+    // the LCSSA rewrite are not undone, so report the modification.
				+    return Changed;
				+  }
				+}
			
 
				+
			
 
				+}
			
 
				+
			
 
				+// Factory for the DXIL loop-unroll pass. MaxIterationAttempt is forwarded
				+// unchanged to the DxilLoopUnroll constructor; the caller receives a newly
				+// allocated pass object.
				+Pass *llvm::createDxilLoopUnrollPass(unsigned MaxIterationAttempt) {
				+  Pass *UnrollPass = new DxilLoopUnroll(MaxIterationAttempt);
				+  return UnrollPass;
				+}
			
 
				+
			
 
				+INITIALIZE_PASS(DxilLoopUnroll, "dxil-loop-unroll", "Dxil Unroll loops", false, false) // Pass registration: argument string "dxil-loop-unroll", description "Dxil Unroll loops"; per LLVM's INITIALIZE_PASS macro the two trailing flags are CFG-only and is-analysis (both false).
			
--- a/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl
+++ b/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl
@@ -146,7 +146,7 @@ void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_Grou
 
				 	for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
			
 
				 	{
			
 
				 		// Reset temporary particle intersection masks.  There are two words (64-bits) per thread.
			
 
				-		[unroll]
			
 
				+    // [unroll] // Change to allow new unroll behavior.
			
 
				 		for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
			
 
				 			gs_IntersectionMasks[C] = 0;
			
 
				 
			
@@ -239,4 +239,4 @@ void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_Grou
 
				 			g_FastDrawPackets[NewPacketIndex] = Packet;
			
 
				 		}
			
 
				 	}
			
 
				-}
			
 
				+}
			
--- a/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS_fail_unroll.hlsl
+++ b/tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS_fail_unroll.hlsl
@@ -0,0 +1,233 @@
 
				+// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
			
 
				+
			
 
				+// CHECK: Could not unroll loop.
			
 
				+
			
 
				+// Copied from the original ParticleTileCullingCS.hlsl
			
 
				+// The loop on line 141 cannot be unrolled because
			
 
				+// the starting index is not known at compile time.
			
 
				+
			
 
				+//
			
 
				+// Copyright (c) Microsoft. All rights reserved.
			
 
				+// This code is licensed under the MIT License (MIT).
			
 
				+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
			
 
				+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
			
 
				+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
			
 
				+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
			
 
				+//
			
 
				+// Developed by Minigraph
			
 
				+//
			
 
				+// Author(s):   James Stanard 
			
 
				+//              Julia Careaga
			
 
				+//
			
 
				+
			
 
				+#include "ParticleUtility.hlsli"
			
 
				+
			
 
				+StructuredBuffer<uint> g_BinParticles : register(t0);
			
 
				+StructuredBuffer<uint> g_BinCounters : register(t1);
			
 
				+Texture2D<uint> g_DepthBounds : register(t2);
			
 
				+StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
			
 
				+
			
 
				+RWStructuredBuffer<uint> g_SortedParticles : register(u0);
			
 
				+RWByteAddressBuffer g_TileHitMasks : register(u1);
			
 
				+RWStructuredBuffer<uint> g_DrawPackets : register(u2);
			
 
				+RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
			
 
				+RWByteAddressBuffer g_DrawPacketCount : register(u4);
			
 
				+
			
 
				+#if TILES_PER_BIN < 64
			
 
				+#define GROUP_THREAD_COUNT 64
			
 
				+#else
			
 
				+#define GROUP_THREAD_COUNT TILES_PER_BIN
			
 
				+#endif
			
 
				+#define GROUP_SIZE_X TILES_PER_BIN_X
			
 
				+#define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
			
 
				+#define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
			
 
				+
			
 
				+groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
			
 
				+groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
			
 
				+groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
			
 
				+groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
			
 
				+groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
			
 
				+
			
 
				+void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads) // Sorts groupshared gs_SortKeys in place; this thread handles pair indices GI, GI + NumThreads, ...
			
 
				+{
			
 
				+	for (uint k = 2; k <= NextPow2; k *= 2) // k doubles every pass: 2, 4, ..., NextPow2
			
 
				+	{
			
 
				+		// Align NumElements to the next multiple of k
			
 
				+		NumElements = (NumElements + k - 1) & ~(k - 1);
			
 
				+
			
 
				+		for (uint j = k / 2; j > 0; j /= 2) // j is the bit that separates the two compared indices (Index2 = Index1 | j)
			
 
				+		{
			
 
				+			// Loop over all N/2 unique element pairs
			
 
				+			for (uint i = GI; i < NumElements / 2; i += NumThreads)
			
 
				+			{
			
 
				+				uint Index1 = InsertZeroBit(i, j);
			
 
				+				uint Index2 = Index1 | j;
			
 
				+
			
 
				+				uint A = gs_SortKeys[Index1];
			
 
				+				uint B = gs_SortKeys[Index2];
			
 
				+
			
 
				+				if ((A < B) != ((Index1 & k) == 0)) // swap when the pair's order disagrees with the direction chosen by bit k of Index1
			
 
				+				{
			
 
				+					gs_SortKeys[Index1] = B;
			
 
				+					gs_SortKeys[Index2] = A;
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			GroupMemoryBarrierWithGroupSync(); // group-wide barrier between compare-exchange steps
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+uint ComputeMaskOffset( uint2 Gid, uint2 GTid ) // Returns the offset main() uses as OutOffsetInBytes for its g_TileHitMasks stores
			
 
				+{
			
 
				+	// Sometimes we have more threads than tiles per bin.
			
 
				+	uint2 OutTileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + uint2(GTid.x, GTid.y % TILES_PER_BIN_Y); // GTid.y is wrapped so the coordinate stays inside the bin
			
 
				+	uint OutTileIdx = OutTileCoord.x + OutTileCoord.y * gTileRowPitch;
			
 
				+	return OutTileIdx * MAX_PARTICLES_PER_BIN / 8 + GTid.y / TILES_PER_BIN_Y * 4; // rows past TILES_PER_BIN_Y are shifted by 4 per wrap
			
 
				+}
			
 
				+
			
 
				+[RootSignature(Particle_RootSig)]
			
 
				+[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
			
 
				+void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID ) // Test entry point; compilation is expected to report "Could not unroll loop."
			
 
				+{
			
 
				+	// Each group is assigned a bin
			
 
				+	uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
			
 
				+
			
 
				+	uint ParticleCountInBin = g_BinCounters[BinIndex];
			
 
				+	if (ParticleCountInBin == 0)	
			
 
				+		return;
			
 
				+
			
 
				+	// Get the start location for particles in this bin
			
 
				+	uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
			
 
				+
			
 
				+	// Each thread is assigned a tile
			
 
				+	uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
			
 
				+
			
 
				+	if (GI < TILES_PER_BIN)
			
 
				+	{
			
 
				+		gs_TileParticleCounts[GI] = 0;
			
 
				+		gs_SlowTileParticleCounts[GI] = 0;
			
 
				+		gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
			
 
				+	}
			
 
				+
			
 
				+	// Sometimes the counter value exceeds the actual storage size
			
 
				+	ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
			
 
				+
			
 
				+	// Compute the next power of two for the bitonic sort
			
 
				+	uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2 << firstbithigh(ParticleCountInBin));
			
 
				+
			
 
				+	// Fill in the sort key array.  Each sort key has passenger data (in the least significant
			
 
				+	// bits), so that as the sort keys are moved around, they retain a pointer to the particle
			
 
				+	// they refer to.
			
 
				+	for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
			
 
				+		gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff; // slots past the particle count are padded with 0xffffffff
			
 
				+
			
 
				+	GroupMemoryBarrierWithGroupSync();
			
 
				+
			
 
				+	// Sort the particles from front to back.
			
 
				+	BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
			
 
				+
			
 
				+	// Upper-left tile coord and lower-right coord, clamped to the screen
			
 
				+	const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
			
 
				+
			
 
				+	// Each thread writes the hit mask for one tile
			
 
				+	uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
			
 
				+
			
 
				+	// Loop over all sorted particles, group-size count at a time
			
 
				+	for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
			
 
				+	{
			
 
				+		// Reset temporary particle intersection masks.  There are two words (64-bits) per thread.
			
 
				+		[unroll] // the loop under test: it starts at GI, which is not known at compile time, so the unroll is expected to fail
			
 
				+		for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
			
 
				+			gs_IntersectionMasks[C] = 0;
			
 
				+
			
 
				+		GroupMemoryBarrierWithGroupSync();
			
 
				+
			
 
				+		// The array index of the particle this thread will test
			
 
				+		uint SortIdx = Iter + GI;
			
 
				+
			
 
				+		// Compute word and bit to set (from thread index)
			
 
				+		uint WordOffset = GI >> 5;
			
 
				+		uint BitOffset = GI & 31;
			
 
				+
			
 
				+		// Only do the loads and stores if this is a valid index (see constant number of iterations comment above)
			
 
				+		if (SortIdx < ParticleCountInBin)
			
 
				+		{
			
 
				+			uint SortKey = gs_SortKeys[SortIdx];
			
 
				+			uint GlobalIdx = SortKey & 0x3FFFF; // low 18 bits of the key index g_VisibleParticles
			
 
				+
			
 
				+			// After this phase, all we care about is its global index
			
 
				+			g_SortedParticles[BinStart + SortIdx] = SortKey;
			
 
				+
			
 
				+			uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
			
 
				+			int2 MinTile = uint2(Bounds >>  0, Bounds >>  8) & 0xFF;
			
 
				+			int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
			
 
				+			MinTile = max(MinTile - StartTile, 0);
			
 
				+			MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
			
 
				+
			
 
				+			for (int y = MinTile.y; y <= MaxTile.y; y++)
			
 
				+			{
			
 
				+				for (int x = MinTile.x; x <= MaxTile.x; x++)
			
 
				+				{
			
 
				+					uint TileIndex = y * TILES_PER_BIN_X + x;
			
 
				+					uint TileMaxZ = gs_MinMaxDepth[TileIndex];
			
 
				+					uint Inside = SortKey < TileMaxZ ? 1 : 0;
			
 
				+					uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
			
 
				+					InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
			
 
				+					InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		GroupMemoryBarrierWithGroupSync();
			
 
				+
			
 
				+#if TILES_PER_BIN < GROUP_THREAD_COUNT
			
 
				+		// Copy the hit masks from LDS to the output buffer.  Here, each thread copies a single word
			
 
				+		if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
			
 
				+		{
			
 
				+			uint TileIndex = GI % TILES_PER_BIN;
			
 
				+			uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
			
 
				+			uint Mask = gs_IntersectionMasks[Offset];
			
 
				+			InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
			
 
				+			g_TileHitMasks.Store(OutOffsetInBytes, Mask);
			
 
				+			OutOffsetInBytes += 8;
			
 
				+		}
			
 
				+#else
			
 
				+		// Copy the hit masks from LDS to the output buffer.  Here, each thread is assigned a tile.
			
 
				+		uint Offset = GI * MASK_WORDS_PER_ITER;
			
 
				+		[unroll]
			
 
				+		for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
			
 
				+		{
			
 
				+			uint Mask0 = gs_IntersectionMasks[Offset+O];
			
 
				+			uint Mask1 = gs_IntersectionMasks[Offset+O+1];
			
 
				+			InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
			
 
				+			g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
			
 
				+			OutOffsetInBytes += 8;
			
 
				+		}
			
 
				+#endif
			
 
				+
			
 
				+		GroupMemoryBarrierWithGroupSync();
			
 
				+	}
			
 
				+
			
 
				+	if (GI >= TILES_PER_BIN)
			
 
				+		return;
			
 
				+
			
 
				+	uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
			
 
				+	if (ParticleCountInThisThreadsTile > 0)
			
 
				+	{
			
 
				+		uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
			
 
				+		uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
			
 
				+
			
 
				+		uint NewPacketIndex;
			
 
				+		if (SlowParticlesInThisThreadsTile > 0)
			
 
				+		{
			
 
				+			g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex); // counter at byte offset 0 indexes g_DrawPackets
			
 
				+			g_DrawPackets[NewPacketIndex] = Packet;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex); // counter at byte offset 12 indexes g_FastDrawPackets
			
 
				+			g_FastDrawPackets[NewPacketIndex] = Packet;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
--- a/tools/clang/test/CodeGenHLSL/loop3.hlsl
+++ b/tools/clang/test/CodeGenHLSL/loop3.hlsl
@@ -1,6 +1,5 @@
 
				 // RUN: %dxc -E main -O2 -T ps_6_0 %s | FileCheck %s
			
 
				 
			
 
				-// CHECK: !"llvm.loop.unroll.full"
			
 
				 // CHECK: !"llvm.loop.unroll.disable"
			
 
				 
			
 
				 float main(float2 a : A, int3 b : B) : SV_Target
			
@@ -12,8 +11,7 @@ float main(float2 a : A, int3 b : B) : SV_Target
 
				     if (b.z == 9)
			
 
				       break;
			
 
				     [allow_uav_condition]
			
 
				-    [unroll]
			
 
				-    for(int j = 0; j < b.y; j++)
			
 
				+    for(int j = 0; j <= 16; j++)
			
 
				     {
			
 
				       [branch]
			
 
				       if (b.z == 16)
			
--- a/tools/clang/test/CodeGenHLSL/unroll/complex.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/complex.hlsl
@@ -0,0 +1,33 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: @main
			
 
				+
			
 
				+uint g_cond[3];
			
 
				+uint g_bound;
			
 
				+
			
 
				+float main() : SV_Target { // Test shader: [unroll] loop with a constant trip count and several runtime-dependent exits
			
 
				+
			
 
				+  float foo = 10;
			
 
				+
			
 
				+  [unroll]
			
 
				+  for (uint i = 0; i < 4; i++) { // bounds are compile-time constants; only the exits below depend on g_cond
			
 
				+    
			
 
				+    if (i == g_cond[0]) {
			
 
				+      foo += 100;
			
 
				+      break; // first early exit from the loop
			
 
				+    }
			
 
				+    else if (i == g_cond[1]) {
			
 
				+      foo += 200;
			
 
				+      break; // second early exit from the loop
			
 
				+    }
			
 
				+    else if (i == g_cond[2]) { // leaves the whole function from inside the loop
			
 
				+      return 10;
			
 
				+    }
			
 
				+    foo++;
			
 
				+  }
			
 
				+
			
 
				+  if (foo > 300) {
			
 
				+    foo /= 2;
			
 
				+  }
			
 
				+
			
 
				+  return foo;
			
 
				+}
			
--- a/tools/clang/test/CodeGenHLSL/unroll/complex2.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/complex2.hlsl
@@ -0,0 +1,51 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+
			
 
				+// CHECK-NOT: call float @dx.op.dot3
			
 
				+
			
 
				+uint gc[4];
			
 
				+uint g_bound;
			
 
				+
			
 
				+float main(float3 a : A, float3 b : B) : SV_Target { // Test shader: [unroll] loop whose branches use continue and one conditional return
			
 
				+
			
 
				+  float foo = 10;
			
 
				+
			
 
				+  [unroll]
			
 
				+  for (uint i = 1; i < 3; i++) { // runs for i = 1 and i = 2; the body holds four dot() calls, matching the eight dot3 CHECK lines
			
 
				+    
			
 
				+    if (i == gc[0]) {
			
 
				+      foo += dot(a*gc[0], b/gc[0]);
			
 
				+      continue;
			
 
				+    }
			
 
				+    else if (i == gc[1]) {
			
 
				+      foo += dot(a*gc[1], b/gc[1]);
			
 
				+      continue;
			
 
				+    }
			
 
				+    else if (i == gc[2]) { 
			
 
				+      foo += dot(a*gc[2], b/gc[2]);
			
 
				+      if (foo > g_bound)
			
 
				+        return foo; // function exit from inside the unrolled loop
			
 
				+      continue;
			
 
				+    }
			
 
				+    else if (i == gc[3]) { 
			
 
				+      foo += dot(a*gc[3], b/gc[3]);
			
 
				+      continue;
			
 
				+    }
			
 
				+    foo++;
			
 
				+  }
			
 
				+
			
 
				+  if (foo > 300) {
			
 
				+    foo /= 2;
			
 
				+  }
			
 
				+
			
 
				+  return foo;
			
 
				+}
			
--- a/tools/clang/test/CodeGenHLSL/unroll/fail.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/fail.hlsl
@@ -0,0 +1,18 @@
 
				+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK-DAG: Could not unroll loop.
			
 
				+// CHECK-NOT: @main
			
 
				+
			
 
				+// Check that compilation fails because the loop bound
			
 
				+// cannot be determined at compile time.
			
 
				+
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main() : SV_Target { // Negative test: the RUN line expects "Could not unroll loop." and no @main in the output
			
 
				+  float result = 0;
			
 
				+  [unroll]
			
 
				+  for (uint j = 0; j < g_cond; j++) { // bound is the global g_cond, a runtime value
			
 
				+    result += 1;
			
 
				+  }
			
 
				+  return result;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/gis.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/gis.hlsl
@@ -0,0 +1,17 @@
 
				+// RUN: %dxc -Gis -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK-NOT: call float @dx.op.dot3
			
 
				+
			
 
				+float4 main(float3 a : A, float3 b : B) : SV_Target { // Test shader compiled with -Gis; expects exactly four dot3 calls
			
 
				+  uint result = 1;
			
 
				+  [unroll]
			
 
				+  for (uint i = 0; i < 4; i++) { // four iterations, one dot() each
			
 
				+    result += dot(a*i, b); // float dot product accumulated into a uint
			
 
				+  }
			
 
				+  return float4(result, 0,0, 1);
			
 
				+}
			
 
				+
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/nested.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/nested.hlsl
@@ -0,0 +1,30 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: @main
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main() : SV_Target { // Test shader: two nested [unroll] loops with a return inside the inner loop
			
 
				+
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, }; // local array of resources, indexed by the inner loop counter below
			
 
				+
			
 
				+  float ret = 0;
			
 
				+  [unroll]
			
 
				+  for (uint i = 0; i < 4; i++) {
			
 
				+    [unroll]
			
 
				+    for (uint j = 0; j < 4; j++) {
			
 
				+      ret++;
			
 
				+      if (g_cond == j) {
			
 
				+        buffers[j].Append(i);
			
 
				+        return ret; // exits both loops and the function
			
 
				+      }
			
 
				+    }
			
 
				+    ret--;
			
 
				+  }
			
 
				+
			
 
				+  return ret;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/nested2.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/nested2.hlsl
@@ -0,0 +1,34 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: @main
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main() : SV_Target { // Test shader: three nested [unroll] loops with a return inside the innermost loop
			
 
				+
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, }; // local array of resources, indexed by the innermost loop counter below
			
 
				+
			
 
				+  float ret = 0;
			
 
				+  [unroll]
			
 
				+  for (uint i = 0; i < 4; i++) {
			
 
				+    [unroll]
			
 
				+    for (uint j = 0; j < 4; j++) {
			
 
				+      ret++;
			
 
				+      [unroll]
			
 
				+      for (uint k = 0; k < 4; k++) {
			
 
				+        ret++;
			
 
				+        if (g_cond == j) { // compares against the middle counter j, while the buffer is selected by k
			
 
				+          buffers[k].Append(i+j);
			
 
				+          return ret; // exits all three loops and the function
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+    ret--;
			
 
				+  }
			
 
				+
			
 
				+  return ret;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/nested3.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/nested3.hlsl
@@ -0,0 +1,62 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+// CHECK: call i32 @dx.op.bufferUpdateCounter
			
 
				+
			
 
				+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+uint g_cond;
			
 
				+uint g_cond2;
			
 
				+
			
 
				+float routine(float value) {
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
			
 
				+  float ret = 0;
			
 
				+  [unroll]
			
 
				+  for (uint k = 0; k < 4; k++) {
			
 
				+    ret += 15;
			
 
				+    if (g_cond == k) {
			
 
				+      buffers[k].Append(value);
			
 
				+      return ret;
			
 
				+    }
			
 
				+  }
			
 
				+  return ret+1;
			
 
				+}
			
 
				+
			
 
				+float main(float3 a : A, float3 b : B) : SV_Target {
			
 
				+
			
 
				+  float ret = 0;
			
 
				+  [unroll]
			
 
				+  for (uint i = 0; i < 4; i++) {
			
 
				+
			
 
				+    [loop]
			
 
				+    for (uint j = 0; j < 4; j++) {
			
 
				+      ret += routine(j);
			
 
				+      ret++;
			
 
				+    }
			
 
				+
			
 
				+    ret--;
			
 
				+  }
			
 
				+
			
 
				+  return ret;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/no_attribute.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/no_attribute.hlsl
@@ -0,0 +1,27 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK-NOT: @main
			
 
				+
			
 
				+// Without [unroll] attribute, the special unroll
			
 
				+// routine is not applied to the loop, and the resource
			
 
				+// fails to get mapped.
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main() : SV_Target {
			
 
				+
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
			
 
				+
			
 
				+  for (uint j = 0; j < 4; j++) {
			
 
				+    if (g_cond == j) {
			
 
				+      buffers[j].Append(1);
			
 
				+      return 10;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/no_opt.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/no_opt.hlsl
@@ -0,0 +1,24 @@
 
				+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: @main
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main() : SV_Target {
			
 
				+
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
			
 
				+
			
 
				+  [unroll]
			
 
				+  for (uint j = 0; j < 4; j++) {
			
 
				+    if (g_cond == j) {
			
 
				+      buffers[j].Append(1);
			
 
				+      return 10;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/partial_cond.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/partial_cond.hlsl
@@ -0,0 +1,20 @@
 
				+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
			
 
				+
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+
			
 
				+// CHECK-NOT: call float @dx.op.dot3
			
 
				+
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main(float3 a : A, float3 b : B) : SV_Target {
			
 
				+  float result = 0;
			
 
				+  [unroll]
			
 
				+  for (uint j = 0; j < g_cond && j < 4; j++) {
			
 
				+    result += dot(a*j, b);
			
 
				+  }
			
 
				+  return result;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/precise_int.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/precise_int.hlsl
@@ -0,0 +1,19 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK: call float @dx.op.dot3
			
 
				+// CHECK-NOT: call float @dx.op.dot3
			
 
				+
			
 
				+
			
 
				+float4 main(float3 a : A, float3 b : B) : SV_Target {
			
 
				+  precise uint result = 1;
			
 
				+  [unroll]
			
 
				+  for (precise uint i = 0; i < 4; i++) {
			
 
				+    result += dot(a*i, b);
			
 
				+  }
			
 
				+  return float4(result, 0,0, 1);
			
 
				+}
			
 
				+
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/simple.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/simple.hlsl
@@ -0,0 +1,24 @@
 
				+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK: @main
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+uint g_cond;
			
 
				+
			
 
				+float main() : SV_Target {
			
 
				+
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
			
 
				+
			
 
				+  [unroll]
			
 
				+  for (uint j = 0; j < 4; j++) {
			
 
				+    if (g_cond == j) {
			
 
				+      buffers[j].Append(1);
			
 
				+      return 10;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
--- a/tools/clang/test/CodeGenHLSL/unroll/warning.hlsl
+++ b/tools/clang/test/CodeGenHLSL/unroll/warning.hlsl
@@ -0,0 +1,27 @@
 
				+// RUN: %dxc -HV 2016 -Od -E main -T ps_6_0 %s | FileCheck %s
			
 
				+// CHECK-DAG: warning: Could not unroll loop.
			
 
				+// CHECK-NOT: @main
			
 
				+
			
 
				+// Check that the compilation fails due to being unable to
			
 
				+// find the loop bound.
			
 
				+
			
 
				+uint g_cond;
			
 
				+
			
 
				+AppendStructuredBuffer<float4> buf0;
			
 
				+AppendStructuredBuffer<float4> buf1;
			
 
				+AppendStructuredBuffer<float4> buf2;
			
 
				+AppendStructuredBuffer<float4> buf3;
			
 
				+
			
 
				+float main() : SV_Target {
			
 
				+
			
 
				+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
			
 
				+
			
 
				+  float result = 0;
			
 
				+  [unroll]
			
 
				+  for (uint j = 0; j < g_cond; j++) {
			
 
				+    buffers[j].Append(result);
			
 
				+    result += 1;
			
 
				+  }
			
 
				+  return result;
			
 
				+}
			
 
				+
			
--- a/tools/clang/unittests/HLSL/CompilerTest.cpp
+++ b/tools/clang/unittests/HLSL/CompilerTest.cpp
@@ -923,6 +923,7 @@ public:
 
				   TEST_METHOD(CodeGenDx12MiniEngineParticlesortindirectargscs)
			
 
				   TEST_METHOD(CodeGenDx12MiniEngineParticlespawncs)
			
 
				   TEST_METHOD(CodeGenDx12MiniEngineParticletilecullingcs)
			
 
				+  TEST_METHOD(CodeGenDx12MiniEngineParticletilecullingcs_fail_unroll)
			
 
				   TEST_METHOD(CodeGenDx12MiniEngineParticletilerendercs)
			
 
				   TEST_METHOD(CodeGenDx12MiniEngineParticletilerenderfastcs)
			
 
				   TEST_METHOD(CodeGenDx12MiniEngineParticletilerenderfastdynamiccs)
			
@@ -951,6 +952,7 @@ public:
 
				   TEST_METHOD(ViewID)
			
 
				   TEST_METHOD(SubobjectCodeGenErrors)
			
 
				   TEST_METHOD(ShaderCompatSuite)
			
 
				+  TEST_METHOD(Unroll)
			
 
				   TEST_METHOD(QuickTest)
			
 
				   TEST_METHOD(QuickLlTest)
			
 
				   BEGIN_TEST_METHOD(SingleFileCheckTest)
			
@@ -5652,6 +5654,10 @@ TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilecullingcs){
 
				   CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileCullingCS.hlsl");
			
 
				 }
			
 
				 
			
 
				+TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilecullingcs_fail_unroll){
			
 
				+  CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileCullingCS_fail_unroll.hlsl");
			
 
				+}
			
 
				+
			
 
				 TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilerendercs){
			
 
				   CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileRenderCS.hlsl");
			
 
				 }
			
@@ -5999,6 +6005,19 @@ TEST_F(CompilerTest, SubobjectCodeGenErrors) {
 
				   }
			
 
				 }
			
 
				 
			
 
				+TEST_F(CompilerTest, Unroll) {
			
 
				+  using namespace WEX::TestExecution;
			
 
				+  std::wstring suitePath = L"..\\CodeGenHLSL\\unroll";
			
 
				+
			
 
				+  WEX::Common::String value;
			
 
				+  if (!DXC_FAILED(RuntimeParameters::TryGetValue(L"SuitePath", value)))
			
 
				+  {
			
 
				+    suitePath = value;
			
 
				+  }
			
 
				+
			
 
				+  CodeGenTestCheckBatchDir(suitePath);
			
 
				+}
			
 
				+
			
 
				 TEST_F(CompilerTest, ShaderCompatSuite) {
			
 
				   using namespace WEX::TestExecution;
			
 
				   std::wstring suitePath = L"..\\CodeGenHLSL\\shader-compat-suite";