瀏覽代碼

Increase scan limit for DSE, add option (#2725)

* Increase scan limit for DSE, add option

When a struct with a large number of fields is passed to an exported
function, the number of useless loads and stores can exceed the built-in
per-block scan limit of the memory dependency analysis, so many of them
were left in. This change raises the default limit from 100 to 500 and
adds a hidden option, -memdep-block-scan-limit, to set it to whatever is
needed for future workarounds.

The Dead Store Elimination pass examines each store to see whether it is
unneeded. If it finds no uses between the store and the original load,
it eliminates both. If finding that out would require scanning more
instructions than the limit allows, it gives up and leaves them in, just
in case.
Greg Roth 5 年之前
父節點
當前提交
08f3100f25

+ 1 - 0
include/dxc/Support/HLSLOptions.h

@@ -172,6 +172,7 @@ public:
   bool ExportShadersOnly = false; // OPT_export_shaders_only
   bool ResMayAlias = false; // OPT_res_may_alias
   unsigned long ValVerMajor = UINT_MAX, ValVerMinor = UINT_MAX; // OPT_validator_version
+  unsigned ScanLimit = 0; // OPT_memdep_block_scan_limit
 
   std::vector<std::string> Warnings;
 

+ 2 - 1
include/dxc/Support/HLSLOptions.td

@@ -150,7 +150,8 @@ def fno_honor_infinities : Flag<["-"], "fno-honor-infinities">, Group<hlsloptz_G
 //  Flags<[CoreOption]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
 //  " | on (according to FP_CONTRACT pragma, default) | off (never fuse)">;
 def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<hlsloptz_Group>;
-
+def memdep_block_scan_limit : Separate<["-", "/"], "memdep-block-scan-limit">, Group<hlsloptz_Group>, Flags<[CoreOption, DriverOption, HelpHidden]>,
+  HelpText<"The number of instructions to scan in a block in memory dependency analysis.">;
 
 /*
 def fno_caret_diagnostics : Flag<["-"], "fno-caret-diagnostics">, Group<hlslcomp_Group>,

+ 3 - 2
include/llvm/Analysis/MemoryDependenceAnalysis.h

@@ -347,7 +347,7 @@ namespace llvm {
     /// getDependency - Return the instruction on which a memory operation
     /// depends.  See the class comment for more details.  It is illegal to call
     /// this on non-memory instructions.
-    MemDepResult getDependency(Instruction *QueryInst);
+    MemDepResult getDependency(Instruction *QueryInst, unsigned ScanLimit = 0);
 
     /// getNonLocalCallDependency - Perform a full dependency query for the
     /// specified call, returning the set of blocks that the value is
@@ -407,7 +407,8 @@ namespace llvm {
                                           bool isLoad,
                                           BasicBlock::iterator ScanIt,
                                           BasicBlock *BB,
-                                          Instruction *QueryInst = nullptr);
+                                          Instruction *QueryInst = nullptr,
+                                          unsigned Limit = 0);
 
     /// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that
     /// looks at a memory location for a load (specified by MemLocBase, Offs,

+ 1 - 0
include/llvm/Transforms/IPO/PassManagerBuilder.h

@@ -129,6 +129,7 @@ public:
   bool HLSLHighLevel = false; // HLSL Change
   hlsl::HLSLExtensionsCodegenHelper *HLSLExtensionsCodeGen = nullptr; // HLSL Change
   bool HLSLResMayAlias = false; // HLSL Change
+  unsigned ScanLimit = 0; // HLSL Change
 
 private:
   /// ExtensionList - This is list of all of the extensions that are registered.

+ 1 - 1
include/llvm/Transforms/Scalar.h

@@ -73,7 +73,7 @@ FunctionPass *createDeadCodeEliminationPass();
 // DeadStoreElimination - This pass deletes stores that are post-dominated by
 // must-aliased stores and are not loaded or used between the stores.
 //
-FunctionPass *createDeadStoreEliminationPass();
+FunctionPass *createDeadStoreEliminationPass(unsigned ScanLimit = 0); // HLSL Change - Add ScanLimit
 
 //===----------------------------------------------------------------------===//
 //

+ 8 - 6
lib/Analysis/MemoryDependenceAnalysis.cpp

@@ -49,7 +49,7 @@ STATISTIC(NumCacheCompleteNonLocalPtr,
           "Number of block queries that were completely cached");
 
 // Limit for the number of instructions to scan in a block.
-static const unsigned int BlockScanLimit = 100;
+static const unsigned int BlockScanLimit = 500;
 
 // Limit on the number of memdep results to process.
 static const unsigned int NumResultsLimit = 100;
@@ -376,13 +376,16 @@ static bool isVolatile(Instruction *Inst) {
 /// annotated to the query instruction to refine the result.
 MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom(
     const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
-    BasicBlock *BB, Instruction *QueryInst) {
+    BasicBlock *BB, Instruction *QueryInst, unsigned Limit) {
 
   const Value *MemLocBase = nullptr;
   int64_t MemLocOffset = 0;
-  unsigned Limit = BlockScanLimit;
   bool isInvariantLoad = false;
 
+  unsigned DefaultLimit = BlockScanLimit;
+  if (Limit == 0)
+    Limit = DefaultLimit;
+
   // We must be careful with atomic accesses, as they may allow another thread
   //   to touch this location, clobbering it. We are conservative: if the
   //   QueryInst is not a simple (non-atomic) memory access, we automatically
@@ -653,7 +656,7 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom(
 
 /// getDependency - Return the instruction on which a memory operation
 /// depends.
-MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
+MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst, unsigned ScanLimit) {
   Instruction *ScanPos = QueryInst;
 
   // Check for a cached result
@@ -690,9 +693,8 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
       bool isLoad = !(MR & AliasAnalysis::Mod);
       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst))
         isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start;
-
       LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos,
-                                            QueryParent, QueryInst);
+                                            QueryParent, QueryInst, ScanLimit);
     } else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) {
       CallSite QueryCS(QueryInst);
       bool isReadOnly = AA->onlyReadsMemory(QueryCS);

+ 4 - 0
lib/DxcSupport/HLSLOptions.cpp

@@ -484,6 +484,10 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
   opts.RootSignatureSource = Args.getLastArgValue(OPT_setrootsignature);
   opts.VerifyRootSignatureSource = Args.getLastArgValue(OPT_verifyrootsignature);
   opts.RootSignatureDefine = Args.getLastArgValue(OPT_rootsig_define);
+  opts.ScanLimit = 0;
+  llvm::StringRef limit = Args.getLastArgValue(OPT_memdep_block_scan_limit);
+  if (!limit.empty())
+    opts.ScanLimit = std::stoul(std::string(limit));
 
   if (!opts.ForceRootSigVer.empty() && opts.ForceRootSigVer != "rootsig_1_0" &&
       opts.ForceRootSigVer != "rootsig_1_1") {

+ 2 - 2
lib/Transforms/IPO/PassManagerBuilder.cpp

@@ -473,7 +473,7 @@ void PassManagerBuilder::populateModulePassManager(
   addExtensionsToPM(EP_Peephole, MPM);
   // HLSL Change. MPM.add(createJumpThreadingPass());         // Thread jumps
   MPM.add(createCorrelatedValuePropagationPass());
-  MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+  MPM.add(createDeadStoreEliminationPass(ScanLimit));  // Delete dead stores
   // HLSL Change - disable LICM in frontend for not consider register pressure.
   // MPM.add(createLICMPass());
 
@@ -724,7 +724,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createMemCpyOptPass());            // Remove dead memcpys.
 
   // Nuke dead stores.
-  PM.add(createDeadStoreEliminationPass());
+  PM.add(createDeadStoreEliminationPass(ScanLimit)); // HLSL Change - add ScanLimit
 
   // More loops are countable; try to optimize them.
   PM.add(createIndVarSimplifyPass());

+ 5 - 3
lib/Transforms/Scalar/DeadStoreElimination.cpp

@@ -49,9 +49,11 @@ namespace {
     MemoryDependenceAnalysis *MD;
     DominatorTree *DT;
     const TargetLibraryInfo *TLI;
+    unsigned ScanLimit; // HLSL Change - Add ScanLimit
 
     static char ID; // Pass identification, replacement for typeid
-    DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) {
+    DSE(unsigned ScanLimit = 0) :
+      FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr), ScanLimit(ScanLimit) {// HLSL Change - Add ScanLimit
       initializeDSEPass(*PassRegistry::getPassRegistry());
     }
 
@@ -101,7 +103,7 @@ INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)
 
-FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+FunctionPass *llvm::createDeadStoreEliminationPass(unsigned ScanLimit) { return new DSE(ScanLimit); } // HLSL Change - add ScanLimit
 
 //===----------------------------------------------------------------------===//
 // Helper functions
@@ -491,7 +493,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
     if (!hasMemoryWrite(Inst, TLI))
       continue;
 
-    MemDepResult InstDep = MD->getDependency(Inst);
+    MemDepResult InstDep = MD->getDependency(Inst, ScanLimit);
 
     // Ignore any store where we can't find a local dependence.
     // FIXME: cross-block DSE would be fun. :)

+ 2 - 0
tools/clang/include/clang/Frontend/CodeGenOptions.h

@@ -214,6 +214,8 @@ public:
   hlsl::DXIL::DefaultLinkage DefaultLinkage = hlsl::DXIL::DefaultLinkage::Default;
   /// Assume UAVs/SRVs may alias.
   bool HLSLResMayAlias = false;
+  /// Lookback scan limit for memory dependencies
+  unsigned ScanLimit = 0;
   // HLSL Change Ends
 
   // SPIRV Change Starts

+ 1 - 0
tools/clang/lib/CodeGen/BackendUtil.cpp

@@ -325,6 +325,7 @@ void EmitAssemblyHelper::CreatePasses() {
   PMBuilder.HLSLHighLevel = CodeGenOpts.HLSLHighLevel; // HLSL Change
   PMBuilder.HLSLExtensionsCodeGen = CodeGenOpts.HLSLExtensionsCodegen.get(); // HLSL Change
   PMBuilder.HLSLResMayAlias = CodeGenOpts.HLSLResMayAlias; // HLSL Change
+  PMBuilder.ScanLimit = CodeGenOpts.ScanLimit; // HLSL Change
 
   PMBuilder.DisableUnitAtATime = !CodeGenOpts.UnitAtATime;
   PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops;

+ 36 - 0
tools/clang/test/HLSLFileCheck/hlsl/functions/arguments/inout_large.hlsl

@@ -0,0 +1,36 @@
+// RUN: %dxc -T lib_6_3 %s | FileCheck %s
+
+// Large struct with enough members to overwhelm the old default (100-instruction) memory dependency analysis lookback limit
+struct BigusStructus
+{
+    float  BigusArrayus[100];
+};
+
+// CHECK: getelementptr inbounds %struct.BigusStructus, %struct.BigusStructus* %bs, i32 0, i32 0, i32 1
+// CHECK: load float, float*
+// CHECK: fmul fast float
+// CHECK: insertelement <4 x float> undef, float
+// CHECK: insertelement <4 x float>
+// CHECK: insertelement <4 x float>
+// CHECK: insertelement <4 x float>
+// CHECK: ret <4 x float>
+
+export
+float4 AccessJustOneStruct(inout BigusStructus bs)
+{
+   return float4(bs.BigusArrayus[1]*255, 1, 0, 0);
+}
+
+// CHECK: getelementptr inbounds [100 x float], [100 x float]* %ba, i32 0, i32 1
+// CHECK: load float, float*
+// CHECK: fmul fast float
+// CHECK: insertelement <4 x float> undef, float
+// CHECK: insertelement <4 x float>
+// CHECK: insertelement <4 x float>
+// CHECK: insertelement <4 x float>
+// CHECK: ret <4 x float>
+export
+float4 AccessJustOneArray(inout float ba[100])
+{
+   return float4(ba[1]*255, 1, 0, 0);
+}

+ 1 - 0
tools/clang/tools/dxcompiler/dxcompilerobj.cpp

@@ -1100,6 +1100,7 @@ public:
 
     compiler.getCodeGenOpts().HLSLHighLevel = Opts.CodeGenHighLevel;
     compiler.getCodeGenOpts().HLSLResMayAlias = Opts.ResMayAlias;
+    compiler.getCodeGenOpts().ScanLimit = Opts.ScanLimit;
     compiler.getCodeGenOpts().HLSLAllResourcesBound = Opts.AllResourcesBound;
     compiler.getCodeGenOpts().HLSLDefaultRowMajor = Opts.DefaultRowMajor;
     compiler.getCodeGenOpts().HLSLPreferControlFlow = Opts.PreferFlowControl;