Browse Source

Fix memcpy replace with undominated users (#2766)

When the only use of an alloca is as the destination of a memcpy
operation, the compiler will try to replace all uses of the alloca with
the source of the memcpy. That's fine when all the uses are in locations
where the source of the memcpy is sure to be defined, otherwise, a lot
of complicated PHI insertion is needed to get things right. Lacking it
made LCSSA checks fail. When LCSSA did succeed in inserting PHIs, it did
it wrong which caused other problems.

Rather than do the complicated PHI insertion at memcpy replace, this
change borrows from the previous work for global variables in similar
circumstances and recharacterizes the use of the pointer to be stored
only, but not uniquely. The result of this is the memcpy is replaced
with a load/store pair. There is a good chance that pair will go on to
be replaced by the mem2reg pass, which eliminates the alloca in the end.
That pass has excellent PHI insertion logic that gets everything right.

The added test includes a number of different conditional locations for
memcpy that each produced unique issues in investigating this issue.
Greg Roth 5 years ago
parent
commit
606d13b606

+ 47 - 3
lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp

@@ -3850,6 +3850,43 @@ static bool ReplaceUseOfZeroInitBeforeDef(Instruction *I, GlobalVariable *GV) {
   }
 }
 
+
+static bool DominateAllUsersPostDom(Instruction *I, Value *V,
+                                    PostDominatorTree &PDT) {
+  BasicBlock *BB = I->getParent();
+  Function *F = I->getParent()->getParent();
+  for (auto U = V->user_begin(); U != V->user_end(); ) {
+    Instruction *UI = dyn_cast<Instruction>(*(U++));
+    if (!UI)
+      continue;
+    assert (UI->getParent()->getParent() == F);
+
+    if (!PDT.dominates(BB, UI->getParent()))
+      return false;
+
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      if (!DominateAllUsersPostDom(I, UI, PDT))
+        return false;
+    }
+  }
+  return true;
+}
+
+// Determine if `I` dominates all the users of `V`
+static bool DominateAllUsers(Instruction *I, Value *V) {
+  Function *F = I->getParent()->getParent();
+
+  // The Entry Block dominates everything, trivially true
+  if (&F->getEntryBlock() == I->getParent())
+    return true;
+
+  // Post dominator tree.
+  PostDominatorTree PDT;
+  PDT.runOnFunction(*F);
+  return DominateAllUsersPostDom(I, V, PDT);
+}
+
+
 bool SROA_Helper::LowerMemcpy(Value *V, DxilFieldAnnotation *annotation,
                               DxilTypeSystem &typeSys, const DataLayout &DL,
                               bool bAllowReplace) {
@@ -3870,9 +3907,9 @@ bool SROA_Helper::LowerMemcpy(Value *V, DxilFieldAnnotation *annotation,
       if (PS.storedType == PointerStatus::StoredType::NotStored) {
         PS.storedType = PointerStatus::StoredType::InitializerStored;
       } else if (PS.storedType == PointerStatus::StoredType::MemcopyDestOnce) {
-        // For single mem store, if the store not dominator all users.
-        // Makr it as Stored.
-        // Case like:
+        // For single mem store, if the store does not dominate all users.
+        // Mark it as Stored.
+        // In cases like:
         // struct A { float4 x[25]; };
         // A a;
         // static A a2;
@@ -3888,6 +3925,13 @@ bool SROA_Helper::LowerMemcpy(Value *V, DxilFieldAnnotation *annotation,
         PS.storedType = PointerStatus::StoredType::Stored;
       }
     }
+  } else if (PS.storedType == PointerStatus::StoredType::MemcopyDestOnce) {
+    // As above, it the memcpy doesn't dominate all its users,
+    // full replacement isn't possible without complicated PHI insertion
+    // This will likely replace with ld/st which will be replaced in mem2reg
+    Instruction *Memcpy = PS.StoringMemcpy;
+    if (!DominateAllUsers(Memcpy, V))
+        PS.storedType = PointerStatus::StoredType::Stored;
   }
 
   if (bAllowReplace && !PS.HasMultipleAccessingFunctions) {

+ 191 - 0
tools/clang/test/HLSLFileCheck/hlsl/control_flow/basic_blocks/cbuf_memcpy_replace.hlsl

@@ -0,0 +1,191 @@
+// RUN: %dxc -T lib_6_3 %s  | FileCheck %s
+
+
+// Make sure we're still eliminating all the allocas one way or the other
+//CHECK-NOT: alloca
+
+struct Istruct {
+  uint ival;
+};
+
+cbuffer cbuf : register(b1)
+{
+  Istruct istructs[1];
+}
+//CHECK: define i32 @"\01?loop_with_break1@@YAHH@Z"
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: icmp eq i32
+//CHECK: phi i32
+//CHECK: ret i32
+
+export
+int loop_with_break1(int i)
+{
+  Istruct istruct;
+  uint cascadeIndex = 10;
+
+  for (; i>=0; --i) {
+    istruct = istructs[i];
+    // break conditional introduces additional complications
+    if (istruct.ival) {
+      cascadeIndex = i;
+      break;
+    }
+  }
+
+  return istruct.ival;
+}
+
+//CHECK: define i32 @"\01?loop_with_break2@@YAHH@Z"
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: icmp eq i32
+//CHECK: phi i32
+//CHECK: ret i32
+
+export
+int loop_with_break2(int i)
+{
+  Istruct istruct;
+
+  for (; i>=0; --i) {
+    istruct = istructs[i];
+    // break conditional introduces additional complications
+    if (istruct.ival) {
+      break;
+    }
+  }
+
+  return istruct.ival;
+}
+
+//CHECK: define i32 @"\01?uncond_loop_with_break@@YAHI@Z"
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: icmp eq i32
+//CHECK: ret i32
+export
+int uncond_loop_with_break(uint i)
+{
+  Istruct istruct;
+
+  for (; i>=0; --i) {
+    istruct = istructs[i];
+    // break conditional introduces additional complications
+    if (istruct.ival)
+      break;
+  }
+
+  return istruct.ival;
+}
+
+//CHECK define i32 @"\01?uncond_init_loop@@YAHH@Z"
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: phi i32
+//CHECK: ret i32
+export
+int init_loop(int ct)
+{
+  Istruct istruct;
+
+  for (int i = 0; i < ct; ++i)
+    istruct = istructs[i];
+
+  return istruct.ival;
+}
+
+//CHECK: define i32 @"\01?cond_if@@YAHH@Z"(i32 %i) #0 {
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: phi i32
+//CHECK: ret i32
+export
+int cond_if(int i)
+{
+  Istruct istruct;
+
+  if(i>=0)
+    istruct = istructs[i];
+
+  return istruct.ival;
+}
+
+//CHECK: define i32 @"\01?uncond_if@@YAHI@Z"(i32 %i)
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: ret i32
+
+export
+int uncond_if(uint i)
+{
+  Istruct istruct;
+
+  if(i>=0)
+    istruct = istructs[i];
+
+  return istruct.ival;
+}
+
+
+//CHECK: define i32 @"\01?cond_if_else@@YAHHH@Z"(i32 %i, i32 %j)
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: phi i32
+//CHECK: ret i32
+
+export
+int cond_if_else(int i, int j)
+{
+  Istruct istruct;
+
+  if(i>=0)
+    istruct = istructs[i];
+  else
+    istruct = istructs[j];
+
+  return istruct.ival;
+}
+
+//CHECK: define i32 @"\01?uncond_if_else@@YAHIH@Z"(i32 %i, i32 %j)
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: ret i32
+export
+int uncond_if_else(uint i, int j)
+{
+  Istruct istruct;
+
+  if(i>=0)
+    istruct = istructs[i];
+  else
+    istruct = istructs[j];
+
+  return istruct.ival;
+}
+
+//CHECK: define i32 @"\01?entry_memcpy@@YAHHH@Z"(i32 %i, i32 %ct)
+//CHECK: call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32
+//CHECK: extractvalue %dx.types.CBufRet.i32
+//CHECK: icmp eq i32
+//CHECK: phi i32
+//CHECK: ret i32
+// This should allow the complete RAUW replacement
+export
+int entry_memcpy(int i, int ct)
+{
+  Istruct istruct;
+
+  istruct = istructs[i];
+
+  int ival;
+
+  for (; i < ct; ++i)
+    ival += istruct.ival;
+
+  return ival;
+}
+