Browse Source

Created custom unroll pass to optionally include certain exit blocks. (#1752)

Adam Yang 6 years ago
parent
commit
51c6cfb01c

+ 5 - 0
include/dxc/DXIL/DxilUtil.h

@@ -14,6 +14,7 @@
 #include <string>
 #include <memory>
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/IR/Constants.h"
 
 namespace llvm {
@@ -30,6 +31,7 @@ class BasicBlock;
 class raw_ostream;
 class ModulePass;
 class PassRegistry;
+class DebugLoc;
 
 ModulePass *createDxilLoadMetadataPass();
 void initializeDxilLoadMetadataPass(llvm::PassRegistry&);
@@ -67,6 +69,8 @@ namespace dxilutil {
                              llvm::Function *PatchConstantFunc, bool IsLib);
   void EmitErrorOnInstruction(llvm::Instruction *I, llvm::StringRef Msg);
   void EmitResMappingError(llvm::Instruction *Res);
+  std::string FormatMessageAtLocation(const llvm::DebugLoc &DL, llvm::Twine Msg);
+  llvm::Twine FormatMessageWithoutLocation(llvm::Twine Msg);
   // Simple demangle just support case "\01?name@" pattern.
   llvm::StringRef DemangleFunctionName(llvm::StringRef name);
   // ReplaceFunctionName replaces the undecorated portion of originalName with undecorated newName
@@ -92,6 +96,7 @@ namespace dxilutil {
   llvm::Value *MergeSelectOnSameValue(llvm::Instruction *SelInst,
                                       unsigned startOpIdx,
                                       unsigned numOperands);
+  bool SimplifyTrivialPHIs(llvm::BasicBlock *BB);
   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::StringRef BC,
     llvm::LLVMContext &Ctx, std::string &DiagStr);
   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::MemoryBuffer *MB,

+ 1 - 0
include/llvm/InitializePasses.h

@@ -259,6 +259,7 @@ void initializeMultiDimArrayToOneDimArrayPass(PassRegistry&);
 void initializeResourceToHandlePass(PassRegistry&);
 void initializeSROA_SSAUp_HLSLPass(PassRegistry&);
 void initializeHoistConstantArrayPass(PassRegistry&);
+void initializeDxilLoopUnrollPass(PassRegistry&);
 // HLSL Change Ends
 void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
 void initializeScalarEvolutionPass(PassRegistry&);

+ 3 - 0
include/llvm/Transforms/Scalar.h

@@ -122,6 +122,9 @@ void initializeSROA_DT_HLSLPass(PassRegistry&);
 //
 ModulePass *createSROA_Parameter_HLSL();
 void initializeSROA_Parameter_HLSLPass(PassRegistry&);
+
+Pass *createDxilLoopUnrollPass(unsigned MaxIterationAttempt);
+void initializeDxilLoopUnrollPass(PassRegistry&);
 //===----------------------------------------------------------------------===//
 //
 // LowerStaticGlobalIntoAlloca. Replace static globals with alloca if only used

+ 36 - 5
lib/DXIL/DxilUtil.cpp

@@ -229,20 +229,29 @@ static bool EmitErrorOnInstructionFollowPhiSelect(
   return false;
 }
 
+// Renders "<location>: <Msg>" into a freshly built string, where <location>
+// is whatever DebugLoc::print emits for DL.
+std::string FormatMessageAtLocation(const DebugLoc &DL, Twine Msg) {
+  std::string Formatted;
+  {
+    raw_string_ostream Stream(Formatted);
+    DL.print(Stream);
+    Stream << ": ";
+    Stream << Msg;
+    // Make sure everything buffered by the stream lands in Formatted.
+    Stream.flush();
+  }
+  return Formatted;
+}
+
+// Appends the " Use /Zi for source location." hint to Msg and returns the
+// concatenation as a Twine.
+// NOTE(review): the returned Twine is assembled from the by-value parameter
+// and a temporary copy of it. A Twine normally only refers to its operands
+// rather than owning them, so the result may dangle once this function
+// returns unless Msg is a simple single-leaf Twine -- confirm against the
+// Twine documentation; returning std::string would presumably be safer but
+// requires changing the declaration in DxilUtil.h as well.
+Twine FormatMessageWithoutLocation(Twine Msg) {
+  return Twine(Msg) + " Use /Zi for source location.";
+}
+
 void EmitErrorOnInstruction(Instruction *I, StringRef Msg) {
   const DebugLoc &DL = I->getDebugLoc();
   if (DL.get()) {
-    std::string locString;
-    raw_string_ostream os(locString);
-    DL.print(os);
-    I->getContext().emitError(os.str() + ": " + Twine(Msg));
+    I->getContext().emitError(FormatMessageAtLocation(DL, Msg));
     return;
   } else if (isa<PHINode>(I) || isa<SelectInst>(I)) {
     if (EmitErrorOnInstructionFollowPhiSelect(I, Msg))
       return;
   }
 
-  I->getContext().emitError(Twine(Msg) + " Use /Zi for source location.");
+  I->getContext().emitError(FormatMessageWithoutLocation(Msg));
 }
 
 const StringRef kResourceMapErrorMsg =
@@ -296,6 +305,28 @@ Value *MergeSelectOnSameValue(Instruction *SelInst, unsigned startOpIdx,
   return op0;
 }
 
+// Replaces every PHI node in BB that has exactly one incoming value with that
+// value and erases the PHI. Returns true if at least one PHI was removed.
+bool SimplifyTrivialPHIs(BasicBlock *BB) {
+  SmallVector<PHINode *, 16> Trivial;
+  for (Instruction &I : *BB) {
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    // PHI nodes are grouped at the top of a basic block, so the first
+    // non-PHI instruction ends the scan; no need to walk the whole block.
+    if (!PN)
+      break;
+
+    if (PN->getNumIncomingValues() == 1)
+      Trivial.push_back(PN);
+  }
+
+  // Rewrite and erase only after the walk so the block iterator stays valid.
+  // The incoming value is read at rewrite time, so chains of trivial PHIs
+  // feeding each other collapse correctly.
+  for (PHINode *PN : Trivial) {
+    PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    PN->eraseFromParent();
+  }
+
+  return !Trivial.empty();
+}
+
 Value *SelectOnOperation(llvm::Instruction *Inst, unsigned operandIdx) {
   Instruction *prototype = Inst;
   for (unsigned i = 0; i < prototype->getNumOperands(); i++) {

+ 1 - 0
lib/HLSL/DxcOptimizer.cpp

@@ -157,6 +157,7 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeSROA_Parameter_HLSLPass(Registry);
     initializeSROA_SSAUpPass(Registry);
     initializeSROA_SSAUp_HLSLPass(Registry);
+    initializeDxilLoopUnrollPass(Registry);
     initializeSampleProfileLoaderPass(Registry);
     initializeScalarizerPass(Registry);
     initializeScopedNoAliasAAPass(Registry);

+ 4 - 0
lib/Transforms/IPO/PassManagerBuilder.cpp

@@ -219,6 +219,10 @@ static void addHLSLPasses(bool HLSLHighLevel, unsigned OptLevel, hlsl::HLSLExten
     MPM.add(createHLDeadFunctionEliminationPass());
   }
 
+  // Passes to handle [unroll]
+  MPM.add(createLoopRotatePass());
+  MPM.add(createDxilLoopUnrollPass(/*MaxIterationAttempt*/ 128));
+
   // Split struct and array of parameter.
   MPM.add(createSROA_Parameter_HLSL());
 

+ 1 - 0
lib/Transforms/Scalar/CMakeLists.txt

@@ -44,6 +44,7 @@ add_llvm_library(LLVMScalarOpts
   Scalar.cpp
   ScalarReplAggregates.cpp
   ScalarReplAggregatesHLSL.cpp  # HLSL Change
+  DxilLoopUnroll.cpp # HLSL Change
   Scalarizer.cpp
   SeparateConstOffsetFromGEP.cpp
   SimplifyCFGPass.cpp

+ 848 - 0
lib/Transforms/Scalar/DxilLoopUnroll.cpp

@@ -0,0 +1,848 @@
+//===- DxilLoopUnroll.cpp - Special Unroll for Constant Values ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Special loop unroll routine for creating mandatory constant values and
+// loops that have exits.
+//
+// Overview of algorithm:
+// 
+// 1. Identify a set of blocks to unroll.
+//
+//    LLVM's concept of loop excludes exit blocks, which are blocks that no
+//    longer have a path to the loop latch. However, some exit blocks in HLSL
+//    also need to be unrolled. For example:
+//
+//        [unroll]
+//        for (uint i = 0; i < 4; i++)
+//        {
+//          if (...)
+//          {
+//            // This block here is an exit block, since it's
+//            // guaranteed to exit the loop.
+//            ...
+//            a[i] = ...; // Indexing requires unroll.
+//            return;
+//          }
+//        }
+//
+//
+// 2. Create LCSSA based on the new loop boundary.
+//
+//    See LCSSA.cpp for more details. It creates trivial PHI nodes for any
+//    outgoing values of the loop at the exit blocks, so when the loop body
+//    gets cloned, the outgoing values can be added to those PHI nodes easily.
+//
+//    We are using a modified LCSSA routine here because we are including some
+//    of the original exit blocks in the unroll.
+//
+//
+// 3. Unroll the loop until we succeed.
+//
+//    Unlike LLVM, we do not try to find a loop count before unrolling.
+//    Instead, we unroll to find a constant terminal condition. Give up when we
+//    fail to do so.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SetVector.h"
+
+#include "dxc/DXIL/DxilUtil.h"
+#include "dxc/HLSL/HLModule.h"
+
+using namespace llvm;
+using namespace hlsl;
+
+// Copied over from LoopUnroll.cpp - RemapInstruction()
+// Rewrites I in place through VMap: every operand with an entry in the map is
+// swapped for its mapped value, and for PHI nodes every incoming block with
+// an entry is swapped for its mapped block. Unmapped operands/blocks are left
+// untouched.
+static inline void RemapInstruction(Instruction *I,
+                                    ValueToValueMapTy &VMap) {
+  const unsigned OperandCount = I->getNumOperands();
+  for (unsigned Idx = 0; Idx != OperandCount; ++Idx) {
+    ValueToValueMapTy::iterator Found = VMap.find(I->getOperand(Idx));
+    if (Found == VMap.end())
+      continue;
+    I->setOperand(Idx, Found->second);
+  }
+
+  PHINode *Phi = dyn_cast<PHINode>(I);
+  if (!Phi)
+    return;
+
+  // Incoming blocks are not ordinary operands, so they are remapped apart.
+  const unsigned IncomingCount = Phi->getNumIncomingValues();
+  for (unsigned Idx = 0; Idx != IncomingCount; ++Idx) {
+    ValueToValueMapTy::iterator Found = VMap.find(Phi->getIncomingBlock(Idx));
+    if (Found == VMap.end())
+      continue;
+    Phi->setIncomingBlock(Idx, cast<BasicBlock>(Found->second));
+  }
+}
+
+
+namespace {
+
+// Loop pass that attempts to fully unroll loops carrying the
+// "llvm.loop.unroll.full" metadata by cloning the loop body (plus selected
+// exit blocks) until the latch condition folds to a constant exit.
+class DxilLoopUnroll : public LoopPass {
+public:
+  static char ID;
+
+  // Functions for which runOnLoop has already run its alloca promotion, so
+  // the promotion is done at most once per function by this pass instance.
+  std::unordered_set<Function *> CleanedUpAlloca;
+  // Upper bound on the number of iterations cloned before giving up.
+  unsigned MaxIterationAttempt = 0;
+
+  // Registers the pass with the global registry on construction.
+  DxilLoopUnroll(unsigned MaxIterationAttempt = 128) :
+    LoopPass(ID),
+    MaxIterationAttempt(MaxIterationAttempt)
+  {
+    initializeDxilLoopUnrollPass(*PassRegistry::getPassRegistry());
+  }
+  const char *getPassName() const override { return "Dxil Loop Unroll"; }
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+  // Declares the analyses runOnLoop reads. LoopInfo and the dominator tree
+  // are additionally declared preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+};
+
+char DxilLoopUnroll::ID;
+
+// Reports an unroll failure for L through the LLVMContext diagnostics: as a
+// warning when WarnOnly is set, as an error otherwise. The loop's start
+// location is prepended when available; otherwise the "no location" variant
+// of the message is emitted.
+static void FailLoopUnroll(bool WarnOnly, Loop *L, const char *Message) {
+  DebugLoc StartLoc = L->getStartLoc();
+  LLVMContext &Context = L->getHeader()->getContext();
+
+  if (StartLoc.get()) {
+    if (WarnOnly)
+      Context.emitWarning(hlsl::dxilutil::FormatMessageAtLocation(StartLoc, Message));
+    else
+      Context.emitError(hlsl::dxilutil::FormatMessageAtLocation(StartLoc, Message));
+    return;
+  }
+
+  // No debug location: the formatted Twine is consumed directly by the call.
+  if (WarnOnly)
+    Context.emitWarning(hlsl::dxilutil::FormatMessageWithoutLocation(Message));
+  else
+    Context.emitError(hlsl::dxilutil::FormatMessageWithoutLocation(Message));
+}
+
+// Bookkeeping for one cloned copy of the loop produced while unrolling.
+struct LoopIteration {
+  // Every block cloned for this iteration, in cloning order.
+  SmallVector<BasicBlock *, 16> Body;
+  // Clone of the original loop latch.
+  BasicBlock *Latch = nullptr;
+  // Clone of the original loop header.
+  BasicBlock *Header = nullptr;
+  // Maps original values/blocks to their counterparts in this iteration.
+  ValueToValueMapTy VarMap;
+  SetVector<BasicBlock *> Extended; // Blocks that are included in the clone that are not in the core loop body.
+  LoopIteration() {}
+};
+
+// Returns true when V is a ConstantInt of type i1. If Val is non-null it
+// receives the constant's boolean value; it is left untouched otherwise.
+static bool GetConstantI1(Value *V, bool *Val=nullptr) {
+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
+  if (!CI || !V->getType()->isIntegerTy(1))
+    return false;
+
+  if (Val)
+    *Val = CI->getLimitedValue() != 0;
+  return true;
+}
+
+// Copied from llvm::SimplifyInstructionsInBlock
+// Copied from llvm::SimplifyInstructionsInBlock
+//
+// Runs recursivelySimplifyInstruction over every non-terminator instruction
+// of BB and returns true if any simplification was made. Unlike the routine
+// it was copied from, the trivially-dead-instruction deletion step is
+// compiled out (see the "#if 0" below).
+static bool SimplifyInstructionsInBlock_NoDelete(BasicBlock *BB,
+                                       const TargetLibraryInfo *TLI) {
+  bool MadeChange = false;
+
+#ifndef NDEBUG
+  // In debug builds, ensure that the terminator of the block is never replaced
+  // or deleted by these simplifications. The idea of simplification is that it
+  // cannot introduce new instructions, and there is no way to replace the
+  // terminator of a block without introducing a new instruction.
+  AssertingVH<Instruction> TerminatorVH(--BB->end());
+#endif
+
+  for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) {
+    assert(!BI->isTerminator());
+    Instruction *Inst = BI++;
+
+    // Track the next instruction through a weak handle; if simplification
+    // changed or removed it, the handle no longer matches BI and the scan
+    // restarts from the top of the block.
+    WeakVH BIHandle(BI);
+    if (recursivelySimplifyInstruction(Inst, TLI)) {
+      MadeChange = true;
+      if (BIHandle != BI)
+        BI = BB->begin();
+      continue;
+    }
+#if 0 // HLSL Change
+    MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+#endif // HLSL Change
+    if (BIHandle != BI)
+      BI = BB->begin();
+  }
+  return MadeChange;
+}
+
+// True when the loop's ID metadata contains "llvm.loop.unroll.full".
+static bool IsMarkedFullUnroll(Loop *L) {
+  MDNode *LoopID = L->getLoopID();
+  if (!LoopID)
+    return false;
+  return GetUnrollMetadata(LoopID, "llvm.loop.unroll.full");
+}
+
+// True when at least one successor of BB is contained in loop L.
+static bool HasSuccessorsInLoop(BasicBlock *BB, Loop *L) {
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    if (L->contains(*SI))
+      return true;
+  return false;
+}
+
+// Tells every successor of BB that BB is no longer one of its predecessors.
+// The successor list is snapshotted first and then walked.
+static void DetachFromSuccessors(BasicBlock *BB) {
+  SmallVector<BasicBlock *, 16> Targets;
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    Targets.push_back(*SI);
+
+  for (unsigned Idx = 0, Count = Targets.size(); Idx != Count; ++Idx)
+    Targets[Idx]->removePredecessor(BB);
+}
+
+/// Return true if the specified block is in the list.
+/// Linear membership test: true if BB appears in ExitBlocks.
+static bool isExitBlock(BasicBlock *BB,
+                        const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+  for (BasicBlock *Candidate : ExitBlocks) {
+    if (Candidate == BB)
+      return true;
+  }
+  return false;
+}
+
+// Copied and modified from LCSSA.cpp
+// Copied and modified from LCSSA.cpp
+//
+// Puts a single instruction into LCSSA form relative to the block set Body:
+// every use of Inst that lies outside Body (and outside Inst's own block) is
+// rewritten to go through PHI nodes inserted at the top of the dominated
+// blocks in ExitBlocks. Returns false when Inst has no such outside use and
+// true otherwise. The "HLSL Change" lines replace the original
+// L.contains(...) membership tests with Body.count(...).
+static bool processInstruction(SetVector<BasicBlock *> &Body, Loop &L, Instruction &Inst, DominatorTree &DT, // HLSL Change
+                               const SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                               PredIteratorCache &PredCache, LoopInfo *LI) {
+
+  SmallVector<Use *, 16> UsesToRewrite;
+
+  BasicBlock *InstBB = Inst.getParent();
+
+  // Collect the uses that live outside Body. A use by a PHI node is
+  // attributed to the PHI's incoming block for that use.
+  for (Use &U : Inst.uses()) {
+    Instruction *User = cast<Instruction>(U.getUser());
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User))
+      UserBB = PN->getIncomingBlock(U);
+
+    if (InstBB != UserBB && /*!L.contains(UserBB)*/!Body.count(UserBB)) // HLSL Change
+      UsesToRewrite.push_back(&U);
+  }
+
+  // If there are no uses outside the loop, exit with no change.
+  if (UsesToRewrite.empty())
+    return false;
+#if 0 // HLSL Change
+  ++NumLCSSA; // We are applying the transformation
+#endif // HLSL Change
+  // Invoke instructions are special in that their result value is not available
+  // along their unwind edge. The code below tests to see whether DomBB
+  // dominates
+  // the value, so adjust DomBB to the normal destination block, which is
+  // effectively where the value is first usable.
+  BasicBlock *DomBB = Inst.getParent();
+  if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst))
+    DomBB = Inv->getNormalDest();
+
+  DomTreeNode *DomNode = DT.getNode(DomBB);
+
+  SmallVector<PHINode *, 16> AddedPHIs;
+  SmallVector<PHINode *, 8> PostProcessPHIs;
+
+  SSAUpdater SSAUpdate;
+  SSAUpdate.Initialize(Inst.getType(), Inst.getName());
+
+  // Insert the LCSSA phi's into all of the exit blocks dominated by the
+  // value, and add them to the Phi's map.
+  for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(),
+                                                     BBE = ExitBlocks.end();
+       BBI != BBE; ++BBI) {
+    BasicBlock *ExitBB = *BBI;
+    if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+      continue;
+
+    // If we already inserted something for this BB, don't reprocess it.
+    if (SSAUpdate.HasValueForBlock(ExitBB))
+      continue;
+
+    PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
+                                  Inst.getName() + ".lcssa", ExitBB->begin());
+
+    // Add inputs from inside the loop for this PHI.
+    for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+      PN->addIncoming(&Inst, Pred);
+
+      // If the exit block has a predecessor not within the loop, arrange for
+      // the incoming value use corresponding to that predecessor to be
+      // rewritten in terms of a different LCSSA PHI.
+      if (/*!L.contains(Pred)*/ !Body.count(Pred)) // HLSL Change
+        UsesToRewrite.push_back(
+            &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+                 PN->getNumIncomingValues() - 1)));
+    }
+
+    AddedPHIs.push_back(PN);
+
+    // Remember that this phi makes the value alive in this block.
+    SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+    // LoopSimplify might fail to simplify some loops (e.g. when indirect
+    // branches are involved). In such situations, it might happen that an exit
+    // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
+    // PHIs in such an exit block, we are also inserting PHIs into L2's header.
+    // This could break LCSSA form for L2 because these inserted PHIs can also
+    // have uses outside of L2. Remember all PHIs in such situations so as to
+    // revisit them later on. FIXME: Remove this if indirectbr support into
+    // LoopSimplify gets improved.
+    if (auto *OtherLoop = LI->getLoopFor(ExitBB))
+      if (!L.contains(OtherLoop))
+        PostProcessPHIs.push_back(PN);
+  }
+
+  // Rewrite all uses outside the loop in terms of the new PHIs we just
+  // inserted.
+  for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
+    // If this use is in an exit block, rewrite to use the newly inserted PHI.
+    // This is required for correctness because SSAUpdate doesn't handle uses in
+    // the same block.  It assumes the PHI we inserted is at the end of the
+    // block.
+    Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser());
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User))
+      UserBB = PN->getIncomingBlock(*UsesToRewrite[i]);
+
+    if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
+      // Tell the VHs that the uses changed. This updates SCEV's caches.
+      if (UsesToRewrite[i]->get()->hasValueHandle())
+        ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
+      UsesToRewrite[i]->set(UserBB->begin());
+      continue;
+    }
+
+    // Otherwise, do full PHI insertion.
+    SSAUpdate.RewriteUse(*UsesToRewrite[i]);
+  }
+
+  // Post process PHI instructions that were inserted into another disjoint loop
+  // and update their exits properly.
+  for (auto *I : PostProcessPHIs) {
+    if (I->use_empty())
+      continue;
+
+    BasicBlock *PHIBB = I->getParent();
+    Loop *OtherLoop = LI->getLoopFor(PHIBB);
+    SmallVector<BasicBlock *, 8> EBs;
+    OtherLoop->getExitBlocks(EBs);
+    if (EBs.empty())
+      continue;
+
+    // Recurse and re-process each PHI instruction. FIXME: we should really
+    // convert this entire thing to a worklist approach where we process a
+    // vector of instructions...
+    // NOTE(review): the recursion still passes this loop's Body as the
+    // membership set even though it now targets OtherLoop -- confirm that
+    // this is intended.
+    processInstruction(Body, *OtherLoop, *I, DT, EBs, PredCache, LI);
+  }
+
+  // Remove PHI nodes that did not have any uses rewritten.
+  for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) {
+    if (AddedPHIs[i]->use_empty())
+      AddedPHIs[i]->eraseFromParent();
+  }
+
+  return true;
+
+}
+
+// Copied from LCSSA.cpp
+// Copied from LCSSA.cpp
+//
+// Returns true if BB dominates at least one block of ExitBlocks according to
+// DT, false otherwise (including when ExitBlocks is empty).
+static bool blockDominatesAnExit(BasicBlock *BB,
+                     DominatorTree &DT,
+                     const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+  DomTreeNode *DomNode = DT.getNode(BB);
+  for (BasicBlock *Exit : ExitBlocks)
+    if (DT.dominates(DomNode, DT.getNode(Exit)))
+      return true;
+  return false;
+}
+
+// Copied from LCSSA.cpp
+//
+// We need to recreate the LCSSA form since our loop boundary is potentially different from
+// the canonical one.
+// Establishes LCSSA form for the block set Body against the boundary given by
+// ExitBlocks: each instruction defined in Body that may be used outside is
+// handed to processInstruction. Returns true if anything was rewritten.
+static bool CreateLCSSA(SetVector<BasicBlock *> &Body, const SmallVectorImpl<BasicBlock *> &ExitBlocks, Loop *L, DominatorTree &DT, LoopInfo *LI) {
+
+  bool Rewrote = false;
+  PredIteratorCache PredCache;
+
+  for (BasicBlock *BB : Body) {
+    // A block that dominates none of the exits cannot define a value that is
+    // used past the boundary, so its instructions need no scanning.
+    if (!blockDominatesAnExit(BB, DT, ExitBlocks))
+      continue;
+
+    for (Instruction &I : *BB) {
+      // Cheap rejections: no uses at all (e.g. stores) ...
+      if (I.use_empty())
+        continue;
+
+      // ... or exactly one use by a non-PHI instruction in this same block.
+      if (I.hasOneUse()) {
+        Instruction *OnlyUser = I.user_back();
+        if (OnlyUser->getParent() == BB && !isa<PHINode>(OnlyUser))
+          continue;
+      }
+
+      if (processInstruction(Body, *L, I, DT, ExitBlocks, PredCache, LI))
+        Rewrote = true;
+    }
+  }
+
+  return Rewrote;
+}
+
+// Follows def-use chains starting at the leading PHI nodes of Header and
+// records in ProblemBlocks the parent block of every reached GEP whose result
+// points to an HLSL object type. Only users located in BlocksInLoop are
+// followed, and propagation stops at each such GEP.
+static void FindProblemBlocks(BasicBlock *Header, const SmallVectorImpl<BasicBlock *> &BlocksInLoop, std::unordered_set<BasicBlock *> &ProblemBlocks) {
+  std::unordered_set<BasicBlock *> InLoop(BlocksInLoop.begin(), BlocksInLoop.end());
+  std::unordered_set<Instruction *> Visited;
+  SmallVector<Instruction *, 16> Pending;
+
+  // Seed the traversal with the header's PHI nodes.
+  for (BasicBlock::iterator It = Header->begin(), End = Header->end(); It != End; ++It) {
+    PHINode *Phi = dyn_cast<PHINode>(&*It);
+    if (!Phi)
+      break;
+    Pending.push_back(Phi);
+    Visited.insert(Phi);
+  }
+
+  while (!Pending.empty()) {
+    Instruction *Cur = Pending.pop_back_val();
+
+    // NOTE: This is very conservative in the following conditions:
+    // - constant global resource arrays with external linkage (these can be
+    //   dynamically accessed)
+    // - global resource arrays or alloca resource arrays, as long as all
+    //   writes come from the same original resource definition (which can
+    //   also be an array).
+    //
+    // We may want to make this more precise in the future if it becomes a
+    // problem.
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Cur);
+    if (GEP && hlsl::dxilutil::IsHLSLObjectType(
+                   GEP->getType()->getPointerElementType())) {
+      ProblemBlocks.insert(GEP->getParent());
+      continue; // Do not propagate past the resource GEP.
+    }
+
+    for (User *U : Cur->users()) {
+      Instruction *UserInst = dyn_cast<Instruction>(U);
+      if (!UserInst)
+        continue;
+      if (Visited.count(UserInst))
+        continue;
+      if (!InLoop.count(UserInst->getParent()))
+        continue;
+
+      Visited.insert(UserInst);
+      Pending.push_back(UserInst);
+    }
+  }
+}
+
+// Recursively checks whether Ty is, or is an array/vector/struct containing,
+// a floating point type. Any other kind of type yields false.
+static bool ContainsFloatingPointType(Type *Ty) {
+  if (Ty->isFloatingPointTy())
+    return true;
+
+  if (Ty->isArrayTy())
+    return ContainsFloatingPointType(Ty->getArrayElementType());
+
+  if (Ty->isVectorTy())
+    return ContainsFloatingPointType(Ty->getVectorElementType());
+
+  if (!Ty->isStructTy())
+    return false;
+
+  // Struct: true as soon as any field contains a floating point type.
+  const unsigned FieldCount = Ty->getStructNumElements();
+  for (unsigned Idx = 0; Idx < FieldCount; ++Idx) {
+    if (ContainsFloatingPointType(Ty->getStructElementType(Idx)))
+      return true;
+  }
+  return false;
+}
+
+// Repeatedly promotes the promotable allocas found in F's entry block to
+// registers until none remain. Allocas that carry the precise attribute and
+// contain a floating point type are left alone. Returns true if at least one
+// round of promotion happened.
+static bool Mem2Reg(Function &F, DominatorTree &DT, AssumptionCache &AC) {
+  BasicBlock &Entry = F.getEntryBlock();
+  std::vector<AllocaInst*> Promotable;
+  bool PromotedAny = false;
+
+  for (;;) {
+    Promotable.clear();
+
+    // Scan every entry-block instruction except the terminator.
+    BasicBlock::iterator Last = --Entry.end();
+    for (BasicBlock::iterator It = Entry.begin(); It != Last; ++It) {
+      AllocaInst *AI = dyn_cast<AllocaInst>(&*It);
+      if (!AI || !isAllocaPromotable(AI))
+        continue;
+
+      // Precise floating point storage is deliberately kept in memory.
+      if (HLModule::HasPreciseAttributeWithMetadata(AI) &&
+          ContainsFloatingPointType(AI->getAllocatedType()))
+        continue;
+
+      Promotable.push_back(AI);
+    }
+
+    if (Promotable.empty())
+      return PromotedAny;
+
+    PromoteMemToReg(Promotable, DT, nullptr, &AC);
+    PromotedAny = true;
+  }
+}
+
+
+// Attempts to fully unroll L when it is marked "llvm.loop.unroll.full".
+//
+// The loop body, plus any exit block flagged by FindProblemBlocks, is cloned
+// once per iteration (at most MaxIterationAttempt times). After each clone the
+// instructions are simplified; unrolling succeeds once the cloned latch
+// branches on a constant that leaves the loop. On success the original blocks
+// are erased and the clones are wired in; on failure the clones are discarded
+// and a diagnostic is emitted through FailLoopUnroll.
+//
+// Returns true if the IR was modified in any way (including the preparatory
+// alloca promotion and LCSSA rewriting performed before a failed attempt).
+bool DxilLoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+
+  // If the loop is not marked as [unroll], don't do anything.
+  if (!IsMarkedFullUnroll(L))
+    return false;
+
+  if (!L->isSafeToClone())
+    return false;
+
+  Function *F = L->getHeader()->getParent();
+  bool OnlyWarnOnFail = false;
+  if (F->getParent()->HasHLModule()) {
+    HLModule &HM = F->getParent()->GetHLModule();
+    OnlyWarnOnFail = HM.GetHLOptions().bFXCCompatMode;
+  }
+
+  // Analysis passes
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AssumptionCache *AC =
+    &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  Loop *OuterL = L->getParentLoop();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Predecessor = L->getLoopPredecessor();
+
+  // Quit if we don't have a single latch block or predecessor
+  if (!Latch || !Predecessor) {
+    return false;
+  }
+
+  // If the loop exit condition is not in the latch, then the loop is not
+  // rotated. Give up. A latch that does not end in a branch at all (e.g. a
+  // switch) is rejected the same way instead of tripping a cast assertion.
+  BranchInst *LatchBranch = dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!LatchBranch || !LatchBranch->isConditional()) {
+    return false;
+  }
+
+  // From here on the function may be modified even if unrolling fails.
+  bool Changed = false;
+
+  // Promote alloca's (once per function for this pass instance).
+  if (!CleanedUpAlloca.count(F)) {
+    CleanedUpAlloca.insert(F);
+    Changed |= Mem2Reg(*F, *DT, *AC);
+  }
+
+  SmallVector<BasicBlock *, 16> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  std::unordered_set<BasicBlock *> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+
+  SmallVector<BasicBlock *, 16> BlocksInLoop; // Set of blocks including both body and exits
+  BlocksInLoop.append(L->getBlocks().begin(), L->getBlocks().end());
+  BlocksInLoop.append(ExitBlocks.begin(), ExitBlocks.end());
+
+  // Heuristically find blocks that likely need to be unrolled
+  std::unordered_set<BasicBlock *> ProblemBlocks;
+  FindProblemBlocks(L->getHeader(), BlocksInLoop, ProblemBlocks);
+
+  // Keep track of the PHI nodes at the header.
+  SmallVector<PHINode *, 16> PHIs;
+  for (auto it = Header->begin(); it != Header->end(); it++) {
+    if (PHINode *PN = dyn_cast<PHINode>(it)) {
+      PHIs.push_back(PN);
+    }
+    else {
+      break;
+    }
+  }
+
+  SetVector<BasicBlock *> ToBeCloned; // List of blocks that will be cloned.
+  for (BasicBlock *BB : L->getBlocks()) // Include the body right away
+    ToBeCloned.insert(BB);
+
+  // Find the exit blocks that also need to be included
+  // in the unroll.
+  SmallVector<BasicBlock *, 8> NewExits; // New set of exit blocks as boundaries for LCSSA
+  SmallVector<BasicBlock *, 8> FakeExits; // Set of blocks created to allow cloning original exit blocks.
+  for (BasicBlock *BB : ExitBlocks) {
+    bool CloneThisExitBlock = ProblemBlocks.count(BB);
+
+    if (CloneThisExitBlock) {
+      ToBeCloned.insert(BB);
+
+      // If we are cloning this basic block, we must create a new exit
+      // block for inserting LCSSA PHI nodes.
+      BasicBlock *FakeExit = BasicBlock::Create(BB->getContext(), "loop.exit.new");
+      F->getBasicBlockList().insert(BB, FakeExit);
+
+      // Move BB's terminator into the new block and make BB fall through to
+      // it, so the new block sits between BB and BB's old successors.
+      TerminatorInst *OldTerm = BB->getTerminator();
+      OldTerm->removeFromParent();
+      FakeExit->getInstList().push_back(OldTerm);
+
+      BranchInst::Create(FakeExit, BB);
+      // PHIs in the old successors must now name the new block as their
+      // incoming block.
+      for (BasicBlock *Succ : successors(FakeExit)) {
+        for (Instruction &I : *Succ) {
+          if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+            for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+              if (PN->getIncomingBlock(i) == BB)
+                PN->setIncomingBlock(i, FakeExit);
+            }
+          }
+        }
+      }
+
+      NewExits.push_back(FakeExit);
+      FakeExits.push_back(FakeExit);
+      Changed = true;
+
+      // Update Dom tree with new exit
+      if (!DT->getNode(FakeExit))
+        DT->addNewBlock(FakeExit, BB);
+    }
+    else {
+      // If we are not including this exit block in the unroll,
+      // use it for LCSSA as normal.
+      NewExits.push_back(BB);
+    }
+  }
+
+  // Simplify the PHI nodes that have single incoming value. The original LCSSA form
+  // (if exists) does not necessarily work for our unroll because we may be unrolling
+  // from a different boundary.
+  for (BasicBlock *BB : BlocksInLoop)
+    Changed |= hlsl::dxilutil::SimplifyTrivialPHIs(BB);
+
+  // Re-establish LCSSA form to get ready for unrolling.
+  Changed |= CreateLCSSA(ToBeCloned, NewExits, L, *DT, LI);
+
+  SmallVector<std::unique_ptr<LoopIteration>, 16> Iterations; // List of cloned iterations
+  bool Succeeded = false;
+
+  for (unsigned IterationI = 0; IterationI < this->MaxIterationAttempt; IterationI++) {
+
+    LoopIteration *PrevIteration = nullptr;
+    if (Iterations.size())
+      PrevIteration = Iterations.back().get();
+    Iterations.push_back(std::make_unique<LoopIteration>());
+    LoopIteration &CurIteration = *Iterations.back().get();
+
+    // Clone the blocks.
+    for (BasicBlock *BB : ToBeCloned) {
+
+      BasicBlock *ClonedBB = CloneBasicBlock(BB, CurIteration.VarMap);
+      CurIteration.VarMap[BB] = ClonedBB;
+      ClonedBB->insertInto(F, Header);
+
+      if (ExitBlockSet.count(BB))
+        CurIteration.Extended.insert(ClonedBB);
+
+      CurIteration.Body.push_back(ClonedBB);
+
+      // Identify the special blocks.
+      if (BB == Latch) {
+        CurIteration.Latch = ClonedBB;
+      }
+      if (BB == Header) {
+        CurIteration.Header = ClonedBB;
+      }
+    }
+
+    for (BasicBlock *BB : ToBeCloned) {
+      BasicBlock *ClonedBB = cast<BasicBlock>(CurIteration.VarMap[BB]);
+      // If branching to outside of the loop, need to update the
+      // phi nodes there to include new values.
+      for (BasicBlock *Succ : successors(ClonedBB)) {
+        if (ToBeCloned.count(Succ))
+          continue;
+        for (Instruction &I : *Succ) {
+          PHINode *PN = dyn_cast<PHINode>(&I);
+          if (!PN)
+            break;
+
+          // Find the incoming value for this new block. If there is an entry
+          // for this block in the map, then it was defined in the loop, use it.
+          // Otherwise it came from outside the loop.
+          Value *OldIncoming = PN->getIncomingValueForBlock(BB);
+          Value *NewIncoming = OldIncoming;
+          ValueToValueMapTy::iterator Itor = CurIteration.VarMap.find(OldIncoming);
+          if (Itor != CurIteration.VarMap.end())
+            NewIncoming = Itor->second;
+          PN->addIncoming(NewIncoming, ClonedBB);
+        }
+      }
+    }
+
+    // Remap the instructions inside of cloned blocks.
+    for (BasicBlock *BB : CurIteration.Body) {
+      for (Instruction &I : *BB) {
+        ::RemapInstruction(&I, CurIteration.VarMap);
+      }
+    }
+
+    // If this is the first block
+    if (!PrevIteration) {
+      // Replace the phi nodes in the clone block with the values coming
+      // from outside of the loop
+      for (PHINode *PN : PHIs) {
+        PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
+        Value *ReplacementVal = ClonedPN->getIncomingValueForBlock(Predecessor);
+        ClonedPN->replaceAllUsesWith(ReplacementVal);
+        ClonedPN->eraseFromParent();
+        CurIteration.VarMap[PN] = ReplacementVal;
+      }
+    }
+    else {
+      // Replace the phi nodes with the value defined INSIDE the previous iteration.
+      for (PHINode *PN : PHIs) {
+        PHINode *ClonedPN = cast<PHINode>(CurIteration.VarMap[PN]);
+        // The value flowing around the back edge. When it was defined inside
+        // the loop, take the previous iteration's copy of it. When it is a
+        // constant or was defined outside the loop it has no entry in the
+        // map and is used as is; looking it up with operator[] would insert
+        // a null entry and hand a null value to replaceAllUsesWith.
+        Value *ReplacementVal = PN->getIncomingValueForBlock(Latch);
+        ValueToValueMapTy::iterator PrevIt = PrevIteration->VarMap.find(ReplacementVal);
+        if (PrevIt != PrevIteration->VarMap.end())
+          ReplacementVal = PrevIt->second;
+        ClonedPN->replaceAllUsesWith(ReplacementVal);
+        ClonedPN->eraseFromParent();
+        CurIteration.VarMap[PN] = ReplacementVal;
+      }
+
+      // Make the latch of the previous iteration branch to the header
+      // of this new iteration.
+      if (BranchInst *BI = dyn_cast<BranchInst>(PrevIteration->Latch->getTerminator())) {
+        for (unsigned i = 0; i < BI->getNumSuccessors(); i++) {
+          if (BI->getSuccessor(i) == PrevIteration->Header) {
+            BI->setSuccessor(i, CurIteration.Header);
+            break;
+          }
+        }
+      }
+    }
+
+    // Simplify instructions in the cloned blocks to create
+    // constant exit conditions.
+    for (BasicBlock *ClonedBB : CurIteration.Body)
+      SimplifyInstructionsInBlock_NoDelete(ClonedBB, NULL);
+
+    // Check exit condition to see if we fully unrolled the loop: the latch
+    // must branch on a constant whose NOT-taken successor is the back edge.
+    if (BranchInst *BI = dyn_cast<BranchInst>(CurIteration.Latch->getTerminator())) {
+      bool Cond = false;
+      if (GetConstantI1(BI->getCondition(), &Cond)) {
+        if (BI->getSuccessor(Cond ? 1 : 0) == CurIteration.Header) {
+          Succeeded = true;
+          break;
+        }
+      }
+    }
+  }
+
+  if (Succeeded) {
+    LoopIteration &FirstIteration = *Iterations.front().get();
+    // Make the predecessor branch to the first new header. The predecessor
+    // may end in any kind of terminator, so go through TerminatorInst rather
+    // than assuming a branch.
+    {
+      TerminatorInst *PredTerm = Predecessor->getTerminator();
+      for (unsigned i = 0, NumSucc = PredTerm->getNumSuccessors(); i < NumSucc; i++) {
+        if (PredTerm->getSuccessor(i) == Header) {
+          PredTerm->setSuccessor(i, FirstIteration.Header);
+        }
+      }
+    }
+
+    if (OuterL) {
+      // Core body blocks need to be added to outer loop
+      for (size_t i = 0; i < Iterations.size(); i++) {
+        LoopIteration &Iteration = *Iterations[i].get();
+        for (BasicBlock *BB : Iteration.Body) {
+          if (!Iteration.Extended.count(BB)) {
+            OuterL->addBasicBlockToLoop(BB, *LI);
+          }
+        }
+      }
+
+      // Our newly created exit blocks may need to be added to outer loop
+      for (BasicBlock *BB : FakeExits) {
+        if (HasSuccessorsInLoop(BB, OuterL))
+          OuterL->addBasicBlockToLoop(BB, *LI);
+      }
+
+      // Cloned exit blocks may need to be added to outer loop
+      for (size_t i = 0; i < Iterations.size(); i++) {
+        LoopIteration &Iteration = *Iterations[i].get();
+        for (BasicBlock *BB : Iteration.Extended) {
+          if (HasSuccessorsInLoop(BB, OuterL))
+            OuterL->addBasicBlockToLoop(BB, *LI);
+        }
+      }
+    }
+
+    // Remove the original blocks that we've cloned from all loops.
+    for (BasicBlock *BB : ToBeCloned)
+      LI->removeBlock(BB);
+
+    LPM.deleteLoopFromQueue(L);
+
+    // Remove dead blocks.
+    for (BasicBlock *BB : ToBeCloned)
+      DetachFromSuccessors(BB);
+    for (BasicBlock *BB : ToBeCloned)
+      BB->dropAllReferences();
+    for (BasicBlock *BB : ToBeCloned)
+      BB->eraseFromParent();
+
+    // The cloned blocks were never registered with the dominator tree and the
+    // erased originals were never removed from it. Rebuild it so that the
+    // tree this pass declares as preserved (and hands to simplifyLoop below)
+    // matches the new CFG.
+    DT->recalculate(*F);
+
+    if (OuterL) {
+      // This process may have created multiple back edges for the
+      // parent loop. Simplify to keep it well-formed.
+      simplifyLoop(OuterL, DT, LI, this, nullptr, nullptr, AC);
+    }
+
+    return true;
+  }
+
+  // If we were unsuccessful in unrolling the loop
+  FailLoopUnroll(OnlyWarnOnFail, L, "Could not unroll loop.");
+
+  // Remove all the cloned blocks. This is done in three sweeps so that no
+  // block is erased while another clone still references it.
+  for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
+    LoopIteration &Iteration = *Ptr.get();
+    for (BasicBlock *BB : Iteration.Body)
+      DetachFromSuccessors(BB);
+  }
+  for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
+    LoopIteration &Iteration = *Ptr.get();
+    for (BasicBlock *BB : Iteration.Body)
+      BB->dropAllReferences();
+  }
+  for (std::unique_ptr<LoopIteration> &Ptr : Iterations) {
+    LoopIteration &Iteration = *Ptr.get();
+    for (BasicBlock *BB : Iteration.Body)
+      BB->eraseFromParent();
+  }
+
+  // The loop itself is intact, but the preparation steps above may already
+  // have rewritten the function; report that honestly.
+  return Changed;
+}
+
+}
+
+// Factory for the pass; MaxIterationAttempt is forwarded to the constructor.
+// The caller receives a heap-allocated pass object.
+Pass *llvm::createDxilLoopUnrollPass(unsigned MaxIterationAttempt) {
+  DxilLoopUnroll *Unroller = new DxilLoopUnroll(MaxIterationAttempt);
+  return Unroller;
+}
+
+INITIALIZE_PASS(DxilLoopUnroll, "dxil-loop-unroll", "Dxil Unroll loops", false, false)

+ 2 - 2
tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS.hlsl

@@ -146,7 +146,7 @@ void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_Grou
 	for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
 	{
 		// Reset temporary particle intersection masks.  There are two words (64-bits) per thread.
-		[unroll]
+    // [unroll] removed: the new unroll pass fails here because the start index (GI) is not known at compile time.
 		for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
 			gs_IntersectionMasks[C] = 0;
 
@@ -239,4 +239,4 @@ void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_Grou
 			g_FastDrawPackets[NewPacketIndex] = Packet;
 		}
 	}
-}
+}

+ 233 - 0
tools/clang/test/CodeGenHLSL/Samples/MiniEngine/ParticleTileCullingCS_fail_unroll.hlsl

@@ -0,0 +1,233 @@
+// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
+
+// CHECK: Could not unroll loop.
+
+// Copied from the original ParticleTileCullingCS.hlsl
+// The loop on line 141 cannot be unrolled because
+// the starting index is not known at compile time.
+
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author(s):   James Stanard 
+//              Julia Careaga
+//
+
+#include "ParticleUtility.hlsli"
+
+StructuredBuffer<uint> g_BinParticles : register(t0);
+StructuredBuffer<uint> g_BinCounters : register(t1);
+Texture2D<uint> g_DepthBounds : register(t2);
+StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
+
+RWStructuredBuffer<uint> g_SortedParticles : register(u0);
+RWByteAddressBuffer g_TileHitMasks : register(u1);
+RWStructuredBuffer<uint> g_DrawPackets : register(u2);
+RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
+RWByteAddressBuffer g_DrawPacketCount : register(u4);
+
+#if TILES_PER_BIN < 64
+#define GROUP_THREAD_COUNT 64
+#else
+#define GROUP_THREAD_COUNT TILES_PER_BIN
+#endif
+#define GROUP_SIZE_X TILES_PER_BIN_X
+#define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
+#define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
+
+groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
+groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
+groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
+groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
+groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
+
+void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads)
+{
+	for (uint k = 2; k <= NextPow2; k *= 2)
+	{
+		// Align NumElements to the next multiple of k
+		NumElements = (NumElements + k - 1) & ~(k - 1);
+
+		for (uint j = k / 2; j > 0; j /= 2)
+		{
+			// Loop over all N/2 unique element pairs
+			for (uint i = GI; i < NumElements / 2; i += NumThreads)
+			{
+				uint Index1 = InsertZeroBit(i, j);
+				uint Index2 = Index1 | j;
+
+				uint A = gs_SortKeys[Index1];
+				uint B = gs_SortKeys[Index2];
+
+				if ((A < B) != ((Index1 & k) == 0))
+				{
+					gs_SortKeys[Index1] = B;
+					gs_SortKeys[Index2] = A;
+				}
+			}
+
+			GroupMemoryBarrierWithGroupSync();
+		}
+	}
+}
+
+uint ComputeMaskOffset( uint2 Gid, uint2 GTid )
+{
+	// Sometimes we have more threads than tiles per bin.
+	uint2 OutTileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + uint2(GTid.x, GTid.y % TILES_PER_BIN_Y);
+	uint OutTileIdx = OutTileCoord.x + OutTileCoord.y * gTileRowPitch;
+	return OutTileIdx * MAX_PARTICLES_PER_BIN / 8 + GTid.y / TILES_PER_BIN_Y * 4;
+}
+
+[RootSignature(Particle_RootSig)]
+[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
+void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID )
+{
+	// Each group is assigned a bin
+	uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
+
+	uint ParticleCountInBin = g_BinCounters[BinIndex];
+	if (ParticleCountInBin == 0)	
+		return;
+
+	// Get the start location for particles in this bin
+	uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
+
+	// Each thread is assigned a tile
+	uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
+
+	if (GI < TILES_PER_BIN)
+	{
+		gs_TileParticleCounts[GI] = 0;
+		gs_SlowTileParticleCounts[GI] = 0;
+		gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
+	}
+
+	// Sometimes the counter value exceeds the actual storage size
+	ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
+
+	// Compute the next power of two for the bitonic sort
+	uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2 << firstbithigh(ParticleCountInBin));
+
+	// Fill in the sort key array.  Each sort key has passenger data (in the least significant
+	// bits), so that as the sort keys are moved around, they retain a pointer to the particle
+	// they refer to.
+	for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
+		gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff;
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// Sort the particles from front to back.
+	BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
+
+	// Upper-left tile coord and lower-right coord, clamped to the screen
+	const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
+
+	// Each thread writes the hit mask for one tile
+	uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
+
+	// Loop over all sorted particles, group-size count at a time
+	for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
+	{
+		// Reset temporary particle intersection masks.  There are two words (64-bits) per thread.
+		[unroll]
+		for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
+			gs_IntersectionMasks[C] = 0;
+
+		GroupMemoryBarrierWithGroupSync();
+
+		// The array index of the particle this thread will test
+		uint SortIdx = Iter + GI;
+
+		// Compute word and bit to set (from thread index)
+		uint WordOffset = GI >> 5;
+		uint BitOffset = GI & 31;
+
+		// Only do the loads and stores if this is a valid index (see constant number of iterations comment above)
+		if (SortIdx < ParticleCountInBin)
+		{
+			uint SortKey = gs_SortKeys[SortIdx];
+			uint GlobalIdx = SortKey & 0x3FFFF;
+
+			// After this phase, all we care about is its global index
+			g_SortedParticles[BinStart + SortIdx] = SortKey;
+
+			uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
+			int2 MinTile = uint2(Bounds >>  0, Bounds >>  8) & 0xFF;
+			int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
+			MinTile = max(MinTile - StartTile, 0);
+			MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
+
+			for (int y = MinTile.y; y <= MaxTile.y; y++)
+			{
+				for (int x = MinTile.x; x <= MaxTile.x; x++)
+				{
+					uint TileIndex = y * TILES_PER_BIN_X + x;
+					uint TileMaxZ = gs_MinMaxDepth[TileIndex];
+					uint Inside = SortKey < TileMaxZ ? 1 : 0;
+					uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
+					InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
+					InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
+				}
+			}
+		}
+
+		GroupMemoryBarrierWithGroupSync();
+
+#if TILES_PER_BIN < GROUP_THREAD_COUNT
+		// Copy the hit masks from LDS to the output buffer.  Here, each thread copies a single word
+		if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
+		{
+			uint TileIndex = GI % TILES_PER_BIN;
+			uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
+			uint Mask = gs_IntersectionMasks[Offset];
+			InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
+			g_TileHitMasks.Store(OutOffsetInBytes, Mask);
+			OutOffsetInBytes += 8;
+		}
+#else
+		// Copy the hit masks from LDS to the output buffer.  Here, each thread is assigned a tile.
+		uint Offset = GI * MASK_WORDS_PER_ITER;
+		[unroll]
+		for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
+		{
+			uint Mask0 = gs_IntersectionMasks[Offset+O];
+			uint Mask1 = gs_IntersectionMasks[Offset+O+1];
+			InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
+			g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
+			OutOffsetInBytes += 8;
+		}
+#endif
+
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	if (GI >= TILES_PER_BIN)
+		return;
+
+	uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
+	if (ParticleCountInThisThreadsTile > 0)
+	{
+		uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
+		uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
+
+		uint NewPacketIndex;
+		if (SlowParticlesInThisThreadsTile > 0)
+		{
+			g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex);
+			g_DrawPackets[NewPacketIndex] = Packet;
+		}
+		else
+		{
+			g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex);
+			g_FastDrawPackets[NewPacketIndex] = Packet;
+		}
+	}
+}

+ 1 - 3
tools/clang/test/CodeGenHLSL/loop3.hlsl

@@ -1,6 +1,5 @@
 // RUN: %dxc -E main -O2 -T ps_6_0 %s | FileCheck %s
 
-// CHECK: !"llvm.loop.unroll.full"
 // CHECK: !"llvm.loop.unroll.disable"
 
 float main(float2 a : A, int3 b : B) : SV_Target
@@ -12,8 +11,7 @@ float main(float2 a : A, int3 b : B) : SV_Target
     if (b.z == 9)
       break;
     [allow_uav_condition]
-    [unroll]
-    for(int j = 0; j < b.y; j++)
+    for(int j = 0; j <= 16; j++)
     {
       [branch]
       if (b.z == 16)

+ 33 - 0
tools/clang/test/CodeGenHLSL/unroll/complex.hlsl

@@ -0,0 +1,33 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+uint g_cond[3];
+uint g_bound;
+
+float main() : SV_Target {
+
+  float foo = 10;
+
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    
+    if (i == g_cond[0]) {
+      foo += 100;
+      break;
+    }
+    else if (i == g_cond[1]) {
+      foo += 200;
+      break;
+    }
+    else if (i == g_cond[2]) { 
+      return 10;
+    }
+    foo++;
+  }
+
+  if (foo > 300) {
+    foo /= 2;
+  }
+
+  return foo;
+}

+ 51 - 0
tools/clang/test/CodeGenHLSL/unroll/complex2.hlsl

@@ -0,0 +1,51 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+
+// CHECK-NOT: call float @dx.op.dot3
+
+uint gc[4];
+uint g_bound;
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+
+  float foo = 10;
+
+  [unroll]
+  for (uint i = 1; i < 3; i++) {
+    
+    if (i == gc[0]) {
+      foo += dot(a*gc[0], b/gc[0]);
+      continue;
+    }
+    else if (i == gc[1]) {
+      foo += dot(a*gc[1], b/gc[1]);
+      continue;
+    }
+    else if (i == gc[2]) { 
+      foo += dot(a*gc[2], b/gc[2]);
+      if (foo > g_bound)
+        return foo;
+      continue;
+    }
+    else if (i == gc[3]) { 
+      foo += dot(a*gc[3], b/gc[3]);
+      continue;
+    }
+    foo++;
+  }
+
+  if (foo > 300) {
+    foo /= 2;
+  }
+
+  return foo;
+}

+ 18 - 0
tools/clang/test/CodeGenHLSL/unroll/fail.hlsl

@@ -0,0 +1,18 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-DAG: Could not unroll loop.
+// CHECK-NOT: @main
+
+// Check that the compilation fails because the loop
+// bound cannot be determined at compile time.
+
+uint g_cond;
+
+float main() : SV_Target {
+  float result = 0;
+  [unroll]
+  for (uint j = 0; j < g_cond; j++) {
+    result += 1;
+  }
+  return result;
+}
+

+ 17 - 0
tools/clang/test/CodeGenHLSL/unroll/gis.hlsl

@@ -0,0 +1,17 @@
+// RUN: %dxc -Gis -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+float4 main(float3 a : A, float3 b : B) : SV_Target {
+  uint result = 1;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    result += dot(a*i, b);
+  }
+  return float4(result, 0,0, 1);
+}
+
+

+ 30 - 0
tools/clang/test/CodeGenHLSL/unroll/nested.hlsl

@@ -0,0 +1,30 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  float ret = 0;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    [unroll]
+    for (uint j = 0; j < 4; j++) {
+      ret++;
+      if (g_cond == j) {
+        buffers[j].Append(i);
+        return ret;
+      }
+    }
+    ret--;
+  }
+
+  return ret;
+}
+

+ 34 - 0
tools/clang/test/CodeGenHLSL/unroll/nested2.hlsl

@@ -0,0 +1,34 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  float ret = 0;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+    [unroll]
+    for (uint j = 0; j < 4; j++) {
+      ret++;
+      [unroll]
+      for (uint k = 0; k < 4; k++) {
+        ret++;
+        if (g_cond == j) {
+          buffers[k].Append(i+j);
+          return ret;
+        }
+      }
+    }
+    ret--;
+  }
+
+  return ret;
+}
+

+ 62 - 0
tools/clang/test/CodeGenHLSL/unroll/nested3.hlsl

@@ -0,0 +1,62 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+// CHECK: call i32 @dx.op.bufferUpdateCounter
+
+// CHECK-NOT: call i32 @dx.op.bufferUpdateCounter
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+uint g_cond2;
+
+float routine(float value) {
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+  float ret = 0;
+  [unroll]
+  for (uint k = 0; k < 4; k++) {
+    ret += 15;
+    if (g_cond == k) {
+      buffers[k].Append(value);
+      return ret;
+    }
+  }
+  return ret+1;
+}
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+
+  float ret = 0;
+  [unroll]
+  for (uint i = 0; i < 4; i++) {
+
+    [loop]
+    for (uint j = 0; j < 4; j++) {
+      ret += routine(j);
+      ret++;
+    }
+
+    ret--;
+  }
+
+  return ret;
+}
+

+ 27 - 0
tools/clang/test/CodeGenHLSL/unroll/no_attribute.hlsl

@@ -0,0 +1,27 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-NOT: @main
+
+// Without the [unroll] attribute, the special unroll
+// routine is not run on the loop, and the resources
+// fail to get mapped.
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+

+ 24 - 0
tools/clang/test/CodeGenHLSL/unroll/no_opt.hlsl

@@ -0,0 +1,24 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  [unroll]
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+

+ 20 - 0
tools/clang/test/CodeGenHLSL/unroll/partial_cond.hlsl

@@ -0,0 +1,20 @@
+// RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+
+// CHECK-NOT: call float @dx.op.dot3
+
+uint g_cond;
+
+float main(float3 a : A, float3 b : B) : SV_Target {
+  float result = 0;
+  [unroll]
+  for (uint j = 0; j < g_cond && j < 4; j++) {
+    result += dot(a*j, b);
+  }
+  return result;
+}
+

+ 19 - 0
tools/clang/test/CodeGenHLSL/unroll/precise_int.hlsl

@@ -0,0 +1,19 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK: call float @dx.op.dot3
+// CHECK-NOT: call float @dx.op.dot3
+
+
+float4 main(float3 a : A, float3 b : B) : SV_Target {
+  precise uint result = 1;
+  [unroll]
+  for (precise uint i = 0; i < 4; i++) {
+    result += dot(a*i, b);
+  }
+  return float4(result, 0,0, 1);
+}
+
+

+ 24 - 0
tools/clang/test/CodeGenHLSL/unroll/simple.hlsl

@@ -0,0 +1,24 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// CHECK: @main
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+uint g_cond;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  [unroll]
+  for (uint j = 0; j < 4; j++) {
+    if (g_cond == j) {
+      buffers[j].Append(1);
+      return 10;
+    }
+  }
+
+  return 0;
+}
+

+ 27 - 0
tools/clang/test/CodeGenHLSL/unroll/warning.hlsl

@@ -0,0 +1,27 @@
+// RUN: %dxc -HV 2016 -Od -E main -T ps_6_0 %s | FileCheck %s
+// CHECK-DAG: warning: Could not unroll loop.
+// CHECK-NOT: @main
+
+// Check that failing to find the loop bound is reported as a
+// warning with these options, and that no @main is emitted.
+
+uint g_cond;
+
+AppendStructuredBuffer<float4> buf0;
+AppendStructuredBuffer<float4> buf1;
+AppendStructuredBuffer<float4> buf2;
+AppendStructuredBuffer<float4> buf3;
+
+float main() : SV_Target {
+
+  AppendStructuredBuffer<float4> buffers[] = { buf0, buf1, buf2, buf3, };
+
+  float result = 0;
+  [unroll]
+  for (uint j = 0; j < g_cond; j++) {
+    buffers[j].Append(result);
+    result += 1;
+  }
+  return result;
+}
+

+ 19 - 0
tools/clang/unittests/HLSL/CompilerTest.cpp

@@ -923,6 +923,7 @@ public:
   TEST_METHOD(CodeGenDx12MiniEngineParticlesortindirectargscs)
   TEST_METHOD(CodeGenDx12MiniEngineParticlespawncs)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilecullingcs)
+  TEST_METHOD(CodeGenDx12MiniEngineParticletilecullingcs_fail_unroll)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilerendercs)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilerenderfastcs)
   TEST_METHOD(CodeGenDx12MiniEngineParticletilerenderfastdynamiccs)
@@ -951,6 +952,7 @@ public:
   TEST_METHOD(ViewID)
   TEST_METHOD(SubobjectCodeGenErrors)
   TEST_METHOD(ShaderCompatSuite)
+  TEST_METHOD(Unroll)
   TEST_METHOD(QuickTest)
   TEST_METHOD(QuickLlTest)
   BEGIN_TEST_METHOD(SingleFileCheckTest)
@@ -5652,6 +5654,10 @@ TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilecullingcs){
   CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileCullingCS.hlsl");
 }
 
+TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilecullingcs_fail_unroll){
+  CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileCullingCS_fail_unroll.hlsl");
+}
+
 TEST_F(CompilerTest, CodeGenDx12MiniEngineParticletilerendercs){
   CodeGenTestCheck(L"..\\CodeGenHLSL\\Samples\\MiniEngine\\ParticleTileRenderCS.hlsl");
 }
@@ -5999,6 +6005,19 @@ TEST_F(CompilerTest, SubobjectCodeGenErrors) {
   }
 }
 
+TEST_F(CompilerTest, Unroll) {
+  // Hands the unroll suite directory to CodeGenTestCheckBatchDir; a
+  // "SuitePath" runtime parameter, when it can be read, replaces the default.
+  using namespace WEX::TestExecution;
+
+  std::wstring unrollSuiteDir = L"..\\CodeGenHLSL\\unroll";
+  WEX::Common::String overrideDir;
+  if (!DXC_FAILED(RuntimeParameters::TryGetValue(L"SuitePath", overrideDir)))
+    unrollSuiteDir = overrideDir;
+
+  CodeGenTestCheckBatchDir(unrollSuiteDir);
+}
+
 TEST_F(CompilerTest, ShaderCompatSuite) {
   using namespace WEX::TestExecution;
   std::wstring suitePath = L"..\\CodeGenHLSL\\shader-compat-suite";