Browse Source

Shader debug instrumentation (#680)

See the comment in DxilDebugInstrumentation.cpp for a summary of how this all works. Basically: instrument all instructions to output "trace" info to a UAV so that a debugger can reconstruct the execution of the shader and offer a debugger-like experience.
Jeff Noyle 8 years ago
parent
commit
b06a8153ff

+ 2 - 0
include/dxc/HLSL/DxilGenerationPass.h

@@ -60,6 +60,7 @@ ModulePass *createDxilOutputColorBecomesConstantPass();
 ModulePass *createDxilRemoveDiscardsPass();
 ModulePass *createDxilReduceMSAAToSingleSamplePass();
 ModulePass *createDxilForceEarlyZPass();
+ModulePass *createDxilDebugInstrumentationPass();
 
 void initializeDxilCondenseResourcesPass(llvm::PassRegistry&);
 void initializeDxilEliminateOutputDynamicIndexingPass(llvm::PassRegistry&);
@@ -84,6 +85,7 @@ void initializeDxilOutputColorBecomesConstantPass(llvm::PassRegistry&);
 void initializeDxilRemoveDiscardsPass(llvm::PassRegistry&);
 void initializeDxilReduceMSAAToSingleSamplePass(llvm::PassRegistry&);
 void initializeDxilForceEarlyZPass(llvm::PassRegistry&);
+void initializeDxilDebugInstrumentationPass(llvm::PassRegistry&);
 
 bool AreDxilResourcesDense(llvm::Module *M, hlsl::DxilResourceBase **ppNonDense);
 

+ 1 - 0
lib/HLSL/CMakeLists.txt

@@ -10,6 +10,7 @@ add_llvm_library(LLVMHLSL
   DxilContainer.cpp
   DxilContainerAssembler.cpp
   DxilContainerReflection.cpp
+  DxilDebugInstrumentation.cpp
   DxilEliminateOutputDynamicIndexing.cpp
   DxilExpandTrigIntrinsics.cpp
   DxilForceEarlyZ.cpp

+ 9 - 0
lib/HLSL/DxcOptimizer.cpp

@@ -85,6 +85,7 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeDxilAddPixelHitInstrumentationPass(Registry);
     initializeDxilCondenseResourcesPass(Registry);
     initializeDxilDeadFunctionEliminationPass(Registry);
+    initializeDxilDebugInstrumentationPass(Registry);
     initializeDxilEliminateOutputDynamicIndexingPass(Registry);
     initializeDxilEmitMetadataPass(Registry);
     initializeDxilExpandTrigIntrinsicsPass(Registry);
@@ -176,6 +177,7 @@ static ArrayRef<LPCSTR> GetPassArgNames(LPCSTR passName) {
   static const LPCSTR ArgPromotionArgs[] = { "maxElements" };
   static const LPCSTR CFGSimplifyPassArgs[] = { "Threshold", "Ftor", "bonus-inst-threshold" };
   static const LPCSTR DxilAddPixelHitInstrumentationArgs[] = { "force-early-z", "add-pixel-cost", "rt-width", "sv-position-index", "num-pixels" };
+  static const LPCSTR DxilDebugInstrumentationArgs[] = { "UAVSize", "parameter0", "parameter1", "parameter2" };
   static const LPCSTR DxilGenerationPassArgs[] = { "NotOptimized" };
   static const LPCSTR DxilOutputColorBecomesConstantArgs[] = { "mod-mode", "constant-red", "constant-green", "constant-blue", "constant-alpha" };
   static const LPCSTR DynamicIndexingVectorToArrayArgs[] = { "ReplaceAllVectors" };
@@ -207,6 +209,7 @@ static ArrayRef<LPCSTR> GetPassArgNames(LPCSTR passName) {
   if (strcmp(passName, "argpromotion") == 0) return ArrayRef<LPCSTR>(ArgPromotionArgs, _countof(ArgPromotionArgs));
   if (strcmp(passName, "simplifycfg") == 0) return ArrayRef<LPCSTR>(CFGSimplifyPassArgs, _countof(CFGSimplifyPassArgs));
   if (strcmp(passName, "hlsl-dxil-add-pixel-hit-instrmentation") == 0) return ArrayRef<LPCSTR>(DxilAddPixelHitInstrumentationArgs, _countof(DxilAddPixelHitInstrumentationArgs));
+  if (strcmp(passName, "hlsl-dxil-debug-instrumentation") == 0) return ArrayRef<LPCSTR>(DxilDebugInstrumentationArgs, _countof(DxilDebugInstrumentationArgs));
   if (strcmp(passName, "dxilgen") == 0) return ArrayRef<LPCSTR>(DxilGenerationPassArgs, _countof(DxilGenerationPassArgs));
   if (strcmp(passName, "hlsl-dxil-constantColor") == 0) return ArrayRef<LPCSTR>(DxilOutputColorBecomesConstantArgs, _countof(DxilOutputColorBecomesConstantArgs));
   if (strcmp(passName, "dynamic-vector-to-array") == 0) return ArrayRef<LPCSTR>(DynamicIndexingVectorToArrayArgs, _countof(DynamicIndexingVectorToArrayArgs));
@@ -245,6 +248,7 @@ static ArrayRef<LPCSTR> GetPassArgDescriptions(LPCSTR passName) {
   static const LPCSTR ArgPromotionArgs[] = { "None" };
   static const LPCSTR CFGSimplifyPassArgs[] = { "None", "None", "Control the number of bonus instructions (default = 1)" };
   static const LPCSTR DxilAddPixelHitInstrumentationArgs[] = { "None", "None", "None", "None", "None" };
+  static const LPCSTR DxilDebugInstrumentationArgs[] = { "None", "None", "None", "None" };
   static const LPCSTR DxilGenerationPassArgs[] = { "None" };
   static const LPCSTR DxilOutputColorBecomesConstantArgs[] = { "None", "None", "None", "None", "None" };
   static const LPCSTR DynamicIndexingVectorToArrayArgs[] = { "None" };
@@ -276,6 +280,7 @@ static ArrayRef<LPCSTR> GetPassArgDescriptions(LPCSTR passName) {
   if (strcmp(passName, "argpromotion") == 0) return ArrayRef<LPCSTR>(ArgPromotionArgs, _countof(ArgPromotionArgs));
   if (strcmp(passName, "simplifycfg") == 0) return ArrayRef<LPCSTR>(CFGSimplifyPassArgs, _countof(CFGSimplifyPassArgs));
   if (strcmp(passName, "hlsl-dxil-add-pixel-hit-instrmentation") == 0) return ArrayRef<LPCSTR>(DxilAddPixelHitInstrumentationArgs, _countof(DxilAddPixelHitInstrumentationArgs));
+  if (strcmp(passName, "hlsl-dxil-debug-instrumentation") == 0) return ArrayRef<LPCSTR>(DxilDebugInstrumentationArgs, _countof(DxilDebugInstrumentationArgs));
   if (strcmp(passName, "dxilgen") == 0) return ArrayRef<LPCSTR>(DxilGenerationPassArgs, _countof(DxilGenerationPassArgs));
   if (strcmp(passName, "hlsl-dxil-constantColor") == 0) return ArrayRef<LPCSTR>(DxilOutputColorBecomesConstantArgs, _countof(DxilOutputColorBecomesConstantArgs));
   if (strcmp(passName, "dynamic-vector-to-array") == 0) return ArrayRef<LPCSTR>(DynamicIndexingVectorToArrayArgs, _countof(DynamicIndexingVectorToArrayArgs));
@@ -328,6 +333,7 @@ static bool IsPassOptionName(StringRef S) {
     ||  S.equals("TIRA")
     ||  S.equals("TLIImpl")
     ||  S.equals("Threshold")
+    ||  S.equals("UAVSize")
     ||  S.equals("add-pixel-cost")
     ||  S.equals("bonus-inst-threshold")
     ||  S.equals("constant-alpha")
@@ -356,6 +362,9 @@ static bool IsPassOptionName(StringRef S) {
     ||  S.equals("no-discriminators")
     ||  S.equals("noloads")
     ||  S.equals("num-pixels")
+    ||  S.equals("parameter0")
+    ||  S.equals("parameter1")
+    ||  S.equals("parameter2")
     ||  S.equals("pragma-unroll-threshold")
     ||  S.equals("reroll-num-tolerated-failed-matches")
     ||  S.equals("rewrite-map-file")

+ 732 - 0
lib/HLSL/DxilDebugInstrumentation.cpp

@@ -0,0 +1,732 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilDebugInstrumentation.cpp                                              //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Adds instrumentation that enables shader debugging in PIX                 //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilGenerationPass.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilModule.h"
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IRBuilder.h"
+
+
+using namespace llvm;
+using namespace hlsl;
+
+// Overview of instrumentation:
+// 
+// In summary, instructions are added that cause a "trace" of the execution of the shader to be written
+// out to a UAV. This trace is then used by a debugger application to provide a post-mortem debugging
+// experience that reconstructs the execution history of the shader.
+// 
+// The trace is only required for a particular shader instance of interest, and a branchless mechanism
+// is used to write the trace either to an incrementing location within the UAV, or to a "dumping ground"
+// area at the top of the UAV if the instance is not of interest.
+// 
+// The following modifications are made:
+// 
+// First, instructions are added to the top of the entry point function that implement the following:
+// -  Examine the input variables that define the instance of the shader that is running. This will
+//    be SV_Position for pixel shaders, SV_Vertex+SV_Instance for vertex shaders, thread id for compute
+//    shaders etc. If these system values need to be added to the shader, then they are also added to the
+//    input signature, if appropriate.
+// -  Compare the above variables with the instance of interest defined by the invoker of this pass.
+//    Deduce two values: a multiplicand and an addend that together allow a branchless calculation of
+//    the offset into the UAV at which to write via "offset = offset * multiplicand + addend."
+//    If the instance is NOT of interest, the multiplicand is zero and the addend is 
+//    sizeof(UAV)-(a little bit), causing writes for uninteresting invocations to end up at the top of 
+//    the UAV. Otherwise the multiplicand is 1 and the addend is 0.
+// -  Calculate an "instance identifier". Even with the above instance identification, several invocations may
+//    end up matching the selection criteria. Specifically, this happens during a draw call in which many
+//    triangles overlap the pixel of interest. More on this below.
+//    
+// During execution, the instrumentation for most instructions causes data to be emitted to the UAV.
+// The index at which data is written is identified by treating the first uint32 of the UAV as an index 
+// which is atomically incremented by the instrumentation. The very first value of this counter that is
+// encountered by each invocation is used as the "instance identifier" mentioned above. That instance
+// identifier is written out with each packet, since many pixel shaders executing in parallel will emit
+// interleaved packets, and the debugger application uses the identifiers to group packets from each separate
+// invocation together.
+// 
+// If an instruction has a non-void and primitive return type, i.e. isn't a struct, then the instrumentation
+// will write that value out to the UAV as well as part of the "step" data packet.
+//    
+// The limiting size of the UAV is enforced in a branchless way by ANDing the offset with a precomputed
+// mask of (sizeof(UAV) - DebugBufferDumpingGroundSize) - 1. The actual size of the UAV allocated by the
+// caller is required to be a power of two plus DebugBufferDumpingGroundSize for this reason. The caller
+// detects UAV overrun by examining a canary value close to the end of the power-of-two size of the UAV.
+// If this value has been overwritten, the debug session is deemed to have overflowed the UAV. The caller
+// will then allocate a UAV that is twice the size and try again, up to a predefined maximum.
+
+// Keep this in sync with the same-named value in the debugger application's WinPixShaderUtils.h
+// UAVDumpingGroundOffset() subtracts this value from the configured UAV size to find the offset
+// that receives the writes of invocations that are not of interest.
+constexpr uint64_t DebugBufferDumpingGroundSize = 64 * 1024;
+
+
+// These definitions echo those in the debugger application's debugshaderrecord.h file
+// Identifies the kind of record written to the debug UAV. The numeric values are shared with the
+// debugger application, so they must not be renumbered here independently.
+enum DebugShaderModifierRecordType {
+  // The first nine entries take the implicit values 0..8.
+  DebugShaderModifierRecordTypeInvocationStartMarker,
+  DebugShaderModifierRecordTypeStep,
+  DebugShaderModifierRecordTypeEvent,
+  DebugShaderModifierRecordTypeInputRegister,
+  DebugShaderModifierRecordTypeReadRegister,
+  DebugShaderModifierRecordTypeWrittenRegister,
+  DebugShaderModifierRecordTypeRegisterRelativeIndex0,
+  DebugShaderModifierRecordTypeRegisterRelativeIndex1,
+  DebugShaderModifierRecordTypeRegisterRelativeIndex2,
+  // DXIL "step" records, one per kind of instruction result (none, float, 32-bit, 64-bit, double).
+  // The explicit values 251..255 are the largest that fit the 8-bit Type field of the record header.
+  DebugShaderModifierRecordTypeDXILStepVoid = 251,
+  DebugShaderModifierRecordTypeDXILStepFloat = 252,
+  DebugShaderModifierRecordTypeDXILStepUint32 = 253,
+  DebugShaderModifierRecordTypeDXILStepUint64 = 254,
+  DebugShaderModifierRecordTypeDXILStepDouble = 255,
+};
+
+// These structs echo those in the debugger application's debugshaderrecord.h file, but are recapitulated here
+// because the originals use unnamed unions which are disallowed by DXCompiler's build.
+// 
+// Pack to 4 bytes so the record structs below contain no padding beyond dword alignment.
+#pragma pack(push,4)
+// Common two-dword prefix of a debug record: a packed 32-bit header word followed by a UID.
+struct DebugShaderModifierRecordHeader {
+  // The header word can be accessed either field-by-field (Details) or as a whole (u32Header).
+  // The union is named because unnamed unions are disallowed by this build.
+  union  {
+    struct {
+      uint32_t SizeDwords : 4;      // presumably the record payload size in dwords - confirm against debugshaderrecord.h
+      uint32_t Flags : 4;
+      uint32_t Type : 8;            // presumably a DebugShaderModifierRecordType value - confirm
+      uint32_t HeaderPayload : 16;
+    } Details;
+    uint32_t u32Header;
+  } Header;
+  uint32_t UID;
+};
+
+// Fixed-size prefix of a DXIL "step" record. The first two dwords have the same layout as
+// DebugShaderModifierRecordHeader, except that the upper 16 bits of the header word are named Opcode.
+struct DebugShaderModifierRecordDXILStepBase {
+  union {
+    struct {
+      uint32_t SizeDwords : 4;   // presumably the record payload size in dwords - confirm against debugshaderrecord.h
+      uint32_t Flags : 4;
+      uint32_t Type : 8;         // presumably one of the DebugShaderModifierRecordTypeDXILStep* values - confirm
+      uint32_t Opcode : 16;
+    } Details;
+    uint32_t u32Header;
+  } Header;
+  uint32_t UID;
+  uint32_t InstructionOffset;
+};
+
+// A step record that carries a value of type ReturnType after the common step prefix.
+template< typename ReturnType >
+struct DebugShaderModifierRecordDXILStep : public DebugShaderModifierRecordDXILStepBase {
+  ReturnType ReturnValue;
+};
+
+// Specialization for steps with no value: the record consists of the common prefix only.
+template< >
+struct DebugShaderModifierRecordDXILStep<void> : public DebugShaderModifierRecordDXILStepBase {
+};
+// Restore the packing that was in effect before the record structs.
+#pragma pack(pop)
+
+
+// Returns the number of 32-bit words in a record that follow the common
+// DebugShaderModifierRecordHeader, given the record's total size in bytes.
+// The caller must pass a size that is at least sizeof(DebugShaderModifierRecordHeader).
+uint32_t DebugShaderModifierRecordPayloadSizeDwords(size_t recordTotalSizeBytes) {
+  const size_t payloadSizeBytes = recordTotalSizeBytes - sizeof(DebugShaderModifierRecordHeader);
+  return static_cast<uint32_t>(payloadSizeBytes / sizeof(uint32_t));
+}
+
+// Module pass that instruments a shader so that a trace of its execution is written to a UAV.
+// See the overview comment near the top of this file for the scheme as a whole.
+class DxilDebugInstrumentation : public ModulePass {
+
+private:
+  // Values that identify the shader invocation of interest. They are filled in from the
+  // "parameter0".."parameter2" pass options and read through the view that matches the shader kind.
+  union ParametersAllTogether {
+    unsigned Parameters[3];
+    struct PixelShaderParameters {
+      unsigned X;
+      unsigned Y;
+    } PixelShader;
+    struct VertexShaderParameters {
+      unsigned VertexId;
+      unsigned InstanceId;
+    } VertexShader;
+    struct ComputeShaderParameters {
+      unsigned ThreadIdX;
+      unsigned ThreadIdY;
+      unsigned ThreadIdZ;
+    } ComputeShader;
+  } m_Parameters = { 0,0,0 };
+
+  // Input-signature element IDs of the system values used to identify an invocation.
+  // Compute shaders have no entry: thread ID is not part of the input signature.
+  union SystemValueIndices {
+    struct PixelShaderParameters {
+      unsigned Position;
+    } PixelShader;
+    struct VertexShaderParameters {
+      unsigned VertexId;
+      unsigned InstanceId;
+    } VertexShader;
+  };
+
+  // Size of the debug UAV; overridden by the "UAVSize" pass option.
+  uint64_t m_UAVSize = 1024*1024;
+  // Result of the "is this the invocation of interest" test emitted by the selection prolog.
+  Value * m_SelectionCriterion = nullptr;
+  // The CreateHandle call emitted by addUAV.
+  CallInst * m_HandleForUAV = nullptr;
+  // Result of the first atomic increment of the UAV counter emitted by reserveDebugEntrySpace.
+  Value * m_InvocationId = nullptr;
+
+  // Together these two values allow branchless writing to the UAV. An invocation of the shader
+  // is either of interest or not (e.g. it writes to the pixel the user selected for debugging
+  // or it doesn't). If not of interest, debugging output will still occur, but it will be
+  // relegated to the very top few bytes of the UAV. Invocations of interest, by contrast, will
+  // be written to the UAV at sequentially increasing offsets.
+
+  // This value will either be one or zero (one if the invocation is of interest, zero otherwise)
+  Value * m_OffsetMultiplicand = nullptr;
+  // This will either be zero (if the invocation is of interest) or (UAVSize)-(SmallValue) if not.
+  Value * m_OffsetAddend = nullptr;
+
+  // Mask ANDed with the atomically incremented counter to keep offsets below the dumping ground.
+  Constant * m_OffsetMask = nullptr;
+
+  // Cache of "reservation size * m_OffsetMultiplicand" values, keyed by reservation size in bytes.
+  std::map<uint32_t, Value *> m_IncrementInstructionBySize;
+
+  unsigned int m_InstructionIndex = 0;
+
+  // Bundle of the objects needed while emitting instrumentation.
+  struct BuilderContext {
+    Module &M;
+    DxilModule &DM;
+    LLVMContext & Ctx;
+    OP * HlslOP;
+    IRBuilder<> & Builder;
+  };
+
+  // State of the reservation made by the most recent reserveDebugEntrySpace call.
+  uint32_t m_RemainingReservedSpaceInBytes = 0;
+  Value * m_CurrentIndex = nullptr;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilDebugInstrumentation() : ModulePass(ID) {}
+  const char *getPassName() const override { return "Add PIX debug instrumentation"; }
+  void applyOptions(PassOptions O) override;
+  bool runOnModule(Module &M) override;
+
+private:
+  SystemValueIndices addRequiredSystemValues(BuilderContext &BC);
+  void addUAV(BuilderContext &BC);
+  void addInvocationSelectionProlog(BuilderContext &BC, SystemValueIndices SVIndices);
+  Value * addPixelShaderProlog(BuilderContext &BC, SystemValueIndices SVIndices);
+  Value * addComputeShaderProlog(BuilderContext &BC);
+  Value * addVertexShaderProlog(BuilderContext &BC, SystemValueIndices SVIndices);
+  void addDebugEntryValue(BuilderContext &BC, Value * TheValue);
+  void addInvocationStartMarker(BuilderContext &BC);
+  // The size is a byte count: the definition stores it in m_RemainingReservedSpaceInBytes and
+  // uses it unscaled as the atomic increment of the UAV counter.
+  void reserveDebugEntrySpace(BuilderContext &BC, uint32_t SpaceInBytes);
+  void addStepDebugEntry(BuilderContext &BC, Instruction *Inst);
+  uint32_t UAVDumpingGroundOffset();
+  template<typename ReturnType>
+  void addStepEntryForType(DebugShaderModifierRecordType RecordType, BuilderContext &BC, Instruction *Inst);
+
+};
+
+// Reads the pass options that configure the instrumentation:
+//   "parameter0", "parameter1", "parameter2" - values identifying the invocation of interest
+//   "UAVSize"                                - size of the debug UAV
+// Options with any other name are ignored.
+void DxilDebugInstrumentation::applyOptions(PassOptions O) {
+  for (const auto & option : O) {
+    // The option value's data() is not guaranteed to be NUL-terminated, so copy exactly size()
+    // characters into an owning string before handing it to the C-string parsing routines.
+    const std::string value(option.second.data(), option.second.size());
+    if (0 == option.first.compare("parameter0")) {
+      m_Parameters.Parameters[0] = atoi(value.c_str());
+    }
+    else if (0 == option.first.compare("parameter1")) {
+      m_Parameters.Parameters[1] = atoi(value.c_str());
+    }
+    else if (0 == option.first.compare("parameter2")) {
+      m_Parameters.Parameters[2] = atoi(value.c_str());
+    }
+    else if (0 == option.first.compare("UAVSize")) {
+      // Note: std::stoull throws if the value does not start with a number.
+      m_UAVSize = std::stoull(value);
+    }
+  }
+}
+
+// Returns the offset at which the dumping ground begins: the configured UAV size less
+// DebugBufferDumpingGroundSize, narrowed to 32 bits.
+uint32_t DxilDebugInstrumentation::UAVDumpingGroundOffset() {
+  const uint64_t DumpingGroundStart = m_UAVSize - DebugBufferDumpingGroundSize;
+  return static_cast<uint32_t>(DumpingGroundStart);
+}
+
+
+// Ensures the input signature contains the system values needed to identify an invocation, adding
+// any that are missing, and returns the signature element IDs of those values:
+//   pixel shaders  - SV_Position
+//   vertex shaders - SV_VertexID and SV_InstanceID
+//   compute shaders need nothing here, because thread ID is not part of the input signature.
+// Other shader kinds are not expected (asserted).
+DxilDebugInstrumentation::SystemValueIndices DxilDebugInstrumentation::addRequiredSystemValues(BuilderContext &BC) {
+  SystemValueIndices SVIndices{};
+
+  hlsl::DxilSignature & InputSignature = BC.DM.GetInputSignature();
+
+  auto & InputElements = InputSignature.GetElements();
+
+  auto ShaderModel = BC.DM.GetShaderModel();
+  switch (ShaderModel->GetKind()) {
+  case DXIL::ShaderKind::Pixel: {
+    auto Existing_SV_Position = std::find_if(
+      InputElements.begin(), InputElements.end(),
+      [](const std::unique_ptr<DxilSignatureElement> & Element) {
+      return Element->GetSemantic()->GetKind() == hlsl::DXIL::SemanticKind::Position; });
+
+    // SV_Position, if present, has to have full mask, so we needn't worry
+    // about the shader having selected components that don't include x or y.
+    // If not present, we add it.
+    if (Existing_SV_Position == InputElements.end()) {
+      auto Added_SV_Position = std::make_unique<DxilSignatureElement>(DXIL::SigPointKind::PSIn);
+      Added_SV_Position->Initialize("Position", hlsl::CompType::getF32(), hlsl::DXIL::InterpolationMode::Linear, 1, 4);
+      Added_SV_Position->AppendSemanticIndex(0);
+      Added_SV_Position->SetSigPointKind(DXIL::SigPointKind::PSIn);
+      Added_SV_Position->SetKind(hlsl::DXIL::SemanticKind::Position);
+
+      auto index = InputSignature.AppendElement(std::move(Added_SV_Position));
+      SVIndices.PixelShader.Position = InputElements[index]->GetID();
+    }
+    else {
+      SVIndices.PixelShader.Position = Existing_SV_Position->get()->GetID();
+    }
+  }
+  break;
+  case DXIL::ShaderKind::Vertex: {
+    {
+      auto Existing_SV_VertexId = std::find_if(
+        InputElements.begin(), InputElements.end(),
+        [](const std::unique_ptr<DxilSignatureElement> & Element) {
+        return Element->GetSemantic()->GetKind() == hlsl::DXIL::SemanticKind::VertexID; });
+
+      if (Existing_SV_VertexId == InputElements.end()) {
+        auto Added_SV_VertexId = std::make_unique<DxilSignatureElement>(DXIL::SigPointKind::VSIn);
+        // SV_VertexID is an unsigned integer system value, and addVertexShaderProlog loads it with
+        // the i32 overload of LoadInput, so the element is declared as U32 rather than F32.
+        Added_SV_VertexId->Initialize("VertexId", hlsl::CompType::getU32(), hlsl::DXIL::InterpolationMode::Undefined, 1, 1);
+        Added_SV_VertexId->AppendSemanticIndex(0);
+        Added_SV_VertexId->SetSigPointKind(DXIL::SigPointKind::VSIn);
+        Added_SV_VertexId->SetKind(hlsl::DXIL::SemanticKind::VertexID);
+
+        auto index = InputSignature.AppendElement(std::move(Added_SV_VertexId));
+        SVIndices.VertexShader.VertexId = InputElements[index]->GetID();
+      }
+      else {
+        SVIndices.VertexShader.VertexId = Existing_SV_VertexId->get()->GetID();
+      }
+    }
+    {
+      auto Existing_SV_InstanceId = std::find_if(
+        InputElements.begin(), InputElements.end(),
+        [](const std::unique_ptr<DxilSignatureElement> & Element) {
+        return Element->GetSemantic()->GetKind() == hlsl::DXIL::SemanticKind::InstanceID; });
+
+      if (Existing_SV_InstanceId == InputElements.end()) {
+        auto Added_SV_InstanceId = std::make_unique<DxilSignatureElement>(DXIL::SigPointKind::VSIn);
+        // As for SV_VertexID: an unsigned integer system value that is loaded as i32.
+        Added_SV_InstanceId->Initialize("InstanceId", hlsl::CompType::getU32(), hlsl::DXIL::InterpolationMode::Undefined, 1, 1);
+        Added_SV_InstanceId->AppendSemanticIndex(0);
+        Added_SV_InstanceId->SetSigPointKind(DXIL::SigPointKind::VSIn);
+        Added_SV_InstanceId->SetKind(hlsl::DXIL::SemanticKind::InstanceID);
+
+        auto index = InputSignature.AppendElement(std::move(Added_SV_InstanceId));
+        SVIndices.VertexShader.InstanceId = InputElements[index]->GetID();
+      }
+      else {
+        SVIndices.VertexShader.InstanceId = Existing_SV_InstanceId->get()->GetID();
+      }
+    }
+  }
+  break;
+  case DXIL::ShaderKind::Compute:
+    // Compute thread Id is not in the input signature
+  break;
+  default:
+    assert(false); // guaranteed by runOnModule
+  }
+
+  return SVIndices;
+}
+
+// Emits the compute-shader selection test: reads the three thread ID components and compares each
+// with the corresponding requested value. Returns the i1 value that is true only when all three match.
+Value * DxilDebugInstrumentation::addComputeShaderProlog(BuilderContext &BC) {
+  auto ThreadIdFunc = BC.HlslOP->GetOpFunc(DXIL::OpCode::ThreadId, Type::getInt32Ty(BC.Ctx));
+  Constant* ThreadIdOpcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::ThreadId);
+
+  // Emits a ThreadId call for one component (0 = x, 1 = y, 2 = z).
+  auto LoadThreadIdComponent = [&](unsigned Component, const char * Name) {
+    return BC.Builder.CreateCall(ThreadIdFunc, { ThreadIdOpcode, BC.HlslOP->GetU32Const(Component) }, Name);
+  };
+
+  // Emits an equality test of a loaded component against the requested value.
+  auto MatchesRequested = [&](Value * Actual, unsigned Requested, const char * Name) {
+    return BC.Builder.CreateICmpEQ(Actual, BC.HlslOP->GetU32Const(Requested), Name);
+  };
+
+  // All three loads are emitted before any of the comparisons.
+  auto X = LoadThreadIdComponent(0, "ThreadIdX");
+  auto Y = LoadThreadIdComponent(1, "ThreadIdY");
+  auto Z = LoadThreadIdComponent(2, "ThreadIdZ");
+
+  auto XMatches = MatchesRequested(X, m_Parameters.ComputeShader.ThreadIdX, "CompareToThreadIdX");
+  auto YMatches = MatchesRequested(Y, m_Parameters.ComputeShader.ThreadIdY, "CompareToThreadIdY");
+  auto ZMatches = MatchesRequested(Z, m_Parameters.ComputeShader.ThreadIdZ, "CompareToThreadIdZ");
+
+  auto XAndYMatch = BC.Builder.CreateAnd(XMatches, YMatches, "CompareXAndY");
+  return BC.Builder.CreateAnd(XAndYMatch, ZMatches, "CompareAll");
+}
+
+// Emits the vertex-shader selection test: loads the vertex ID and instance ID inputs (signature
+// element IDs supplied in SVIndices) and compares them with the requested values. Returns the i1
+// value that is true only when both match.
+Value * DxilDebugInstrumentation::addVertexShaderProlog(BuilderContext &BC, SystemValueIndices SVIndices) {
+  Type * Int32Ty = Type::getInt32Ty(BC.Ctx);
+  Constant* RowZero = BC.HlslOP->GetU32Const(0);
+  Constant* ColumnZero = BC.HlslOP->GetI8Const(0);
+  UndefValue* UndefI32 = UndefValue::get(Int32Ty);
+
+  auto LoadInputFunc = BC.HlslOP->GetOpFunc(DXIL::OpCode::LoadInput, Int32Ty);
+  Constant* LoadInputOpcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::LoadInput);
+
+  // Emits a LoadInput call for row 0, column 0 of the given signature element.
+  auto LoadSystemValue = [&](unsigned ElementId, const char * Name) {
+    Constant* ElementIdArg = BC.HlslOP->GetU32Const(ElementId);
+    return BC.Builder.CreateCall(LoadInputFunc,
+    { LoadInputOpcode, ElementIdArg, RowZero /*row*/, ColumnZero /*column*/, UndefI32 }, Name);
+  };
+
+  auto LoadedVertexId = LoadSystemValue(SVIndices.VertexShader.VertexId, "VertId");
+  auto LoadedInstanceId = LoadSystemValue(SVIndices.VertexShader.InstanceId, "InstanceId");
+
+  // Test both against the vertex and instance requested by the invoker of the pass.
+  auto VertexMatches = BC.Builder.CreateICmpEQ(
+    LoadedVertexId, BC.HlslOP->GetU32Const(m_Parameters.VertexShader.VertexId), "CompareToVertId");
+  auto InstanceMatches = BC.Builder.CreateICmpEQ(
+    LoadedInstanceId, BC.HlslOP->GetU32Const(m_Parameters.VertexShader.InstanceId), "CompareToInstanceId");
+
+  return BC.Builder.CreateAnd(VertexMatches, InstanceMatches, "CompareBoth");
+}
+
+// Emits the pixel-shader selection test: loads the x and y components of the position input
+// (signature element ID supplied in SVIndices), converts them to unsigned integers and compares
+// them with the requested pixel coordinates. Returns the i1 value that is true only when both match.
+Value * DxilDebugInstrumentation::addPixelShaderProlog(BuilderContext &BC, SystemValueIndices SVIndices) {
+  Type * Int32Ty = Type::getInt32Ty(BC.Ctx);
+  Constant* RowZero = BC.HlslOP->GetU32Const(0);
+  UndefValue* UndefI32 = UndefValue::get(Int32Ty);
+
+  auto LoadInputFunc = BC.HlslOP->GetOpFunc(DXIL::OpCode::LoadInput, Type::getFloatTy(BC.Ctx));
+  Constant* LoadInputOpcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::LoadInput);
+  Constant* PositionElementId = BC.HlslOP->GetU32Const(SVIndices.PixelShader.Position);
+
+  // Emits a float LoadInput call for row 0 and the given column of the position element.
+  auto LoadPositionColumn = [&](char Column, const char * Name) {
+    return BC.Builder.CreateCall(LoadInputFunc,
+    { LoadInputOpcode, PositionElementId, RowZero /*row*/, BC.HlslOP->GetI8Const(Column) /*column*/, UndefI32 }, Name);
+  };
+
+  // Both loads are emitted before either conversion.
+  auto XPos = LoadPositionColumn(0, "XPos");
+  auto YPos = LoadPositionColumn(1, "YPos");
+
+  // Convert the floating-point position to unsigned integer pixel indices.
+  Value * XIndex = BC.Builder.CreateCast(Instruction::CastOps::FPToUI, XPos, Int32Ty, "XIndex");
+  Value * YIndex = BC.Builder.CreateCast(Instruction::CastOps::FPToUI, YPos, Int32Ty, "YIndex");
+
+  // Compare to the expected pixel position.
+  auto XMatches = BC.Builder.CreateICmpEQ(XIndex, BC.HlslOP->GetU32Const(m_Parameters.PixelShader.X), "CompareToX");
+  auto YMatches = BC.Builder.CreateICmpEQ(YIndex, BC.HlslOP->GetU32Const(m_Parameters.PixelShader.Y), "CompareToY");
+
+  return BC.Builder.CreateAnd(XMatches, YMatches, "ComparePos");
+}
+
+// Adds a raw-buffer UAV for the debug trace to the module's resources and emits a CreateHandle call
+// for it at the builder's current insertion point. The resulting call is stored in m_HandleForUAV.
+void DxilDebugInstrumentation::addUAV(BuilderContext &BC)
+{
+  // Set up a UAV with structure of a single int
+  // The new UAV's ID is the count of UAVs already in the module.
+  unsigned int UAVResourceHandle = static_cast<unsigned int>(BC.DM.GetUAVs().size());
+  SmallVector<llvm::Type*, 1> Elements{ Type::getInt32Ty(BC.Ctx) };
+  llvm::StructType *UAVStructTy = llvm::StructType::create(Elements, "PIX_DebugUAV_Type");
+  std::unique_ptr<DxilResource> pUAV = llvm::make_unique<DxilResource>();
+  pUAV->SetGlobalName("PIX_DebugUAVName");
+  pUAV->SetGlobalSymbol(UndefValue::get(UAVStructTy->getPointerTo()));
+  pUAV->SetID(UAVResourceHandle);
+  pUAV->SetSpaceID((unsigned int)-2); // This is the reserved-for-tools register space
+  pUAV->SetSampleCount(1);
+  pUAV->SetGloballyCoherent(false);
+  pUAV->SetHasCounter(false);
+  pUAV->SetCompType(CompType::getI32());
+  pUAV->SetLowerBound(0);
+  pUAV->SetRangeSize(1);
+  pUAV->SetKind(DXIL::ResourceKind::RawBuffer);
+  pUAV->SetRW(true);
+
+  auto ID = BC.DM.AddUAV(std::move(pUAV));
+  // The ID handed back by the module is expected to equal the one assigned above.
+  assert(ID == UAVResourceHandle);
+
+  BC.DM.m_ShaderFlags.SetEnableRawAndStructuredBuffers(true);
+
+  // Create handle for the newly-added UAV
+  Function* CreateHandleOpFunc = BC.HlslOP->GetOpFunc(DXIL::OpCode::CreateHandle, Type::getVoidTy(BC.Ctx));
+  Constant* CreateHandleOpcodeArg = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::CreateHandle);
+  Constant* UAVVArg = BC.HlslOP->GetI8Const(static_cast<std::underlying_type<DxilResourceBase::Class>::type>(DXIL::ResourceClass::UAV));
+  Constant* MetaDataArg = BC.HlslOP->GetU32Const(ID); // position of the metadata record in the corresponding metadata list
+  Constant* IndexArg = BC.HlslOP->GetU32Const(0); // 
+  Constant* FalseArg = BC.HlslOP->GetI1Const(0); // non-uniform resource index: false
+  m_HandleForUAV = BC.Builder.CreateCall(CreateHandleOpFunc,
+  { CreateHandleOpcodeArg, UAVVArg, MetaDataArg, IndexArg, FalseArg }, "PIX_DebugUAV_Handle");
+}
+
+// Emits the per-shader-kind "is this the invocation of interest" test, then derives from it the
+// values used for branchless UAV addressing: m_OffsetMultiplicand, m_OffsetAddend and m_OffsetMask.
+// The test result itself is kept in m_SelectionCriterion.
+void DxilDebugInstrumentation::addInvocationSelectionProlog(BuilderContext &BC, SystemValueIndices SVIndices) {
+  auto ShaderModel = BC.DM.GetShaderModel();
+
+  // NOTE(review): this is left unassigned on the default path below. That path is asserted to be
+  // unreachable, but the assert only guards builds in which asserts are enabled.
+  Value * ParameterTestResult;
+  switch (ShaderModel->GetKind()) {
+  case DXIL::ShaderKind::Pixel:
+    ParameterTestResult = addPixelShaderProlog(BC, SVIndices);
+    break;
+    case DXIL::ShaderKind::Vertex:
+      ParameterTestResult = addVertexShaderProlog(BC, SVIndices);
+    break;
+    case DXIL::ShaderKind::Compute:
+      ParameterTestResult = addComputeShaderProlog(BC);
+    break;
+  default:
+    assert(false); // guaranteed by runOnModule
+  }
+
+  // This is a convenient place to calculate the values that modify the UAV offset for invocations of interest and for
+  // UAV size.
+  // Multiplicand: the i1 test result widened to i32, i.e. 1 when of interest and 0 otherwise.
+  m_OffsetMultiplicand = BC.Builder.CreateCast(Instruction::CastOps::ZExt, ParameterTestResult, Type::getInt32Ty(BC.Ctx), "OffsetMultiplicand");
+  // Addend: dumping-ground offset * (1 - multiplicand), i.e. 0 when of interest.
+  auto InverseOffsetMultiplicand = BC.Builder.CreateSub(BC.HlslOP->GetU32Const(1), m_OffsetMultiplicand, "ComplementOfMultiplicand");
+  m_OffsetAddend = BC.Builder.CreateMul(BC.HlslOP->GetU32Const(UAVDumpingGroundOffset()), InverseOffsetMultiplicand, "OffsetAddend");
+  // Mask: dumping-ground offset minus one, ANDed with the counter value in reserveDebugEntrySpace.
+  m_OffsetMask = BC.HlslOP->GetU32Const(UAVDumpingGroundOffset() - 1);
+
+  m_SelectionCriterion = ParameterTestResult;
+}
+
+// Emits an atomic add on the counter at byte offset 0 of the debug UAV to reserve SpaceInBytes bytes
+// for the next record, and computes the offset at which that record is to be written. The offset is
+// left in m_CurrentIndex and the reserved size in m_RemainingReservedSpaceInBytes. Both must be clear
+// on entry (asserted): a previous reservation has to be fully consumed first.
+void DxilDebugInstrumentation::reserveDebugEntrySpace(BuilderContext &BC, uint32_t SpaceInBytes) {
+  assert(m_CurrentIndex == nullptr);
+  assert(m_RemainingReservedSpaceInBytes == 0);
+
+  m_RemainingReservedSpaceInBytes = SpaceInBytes;
+
+  // Insert the UAV increment instruction:
+  Function* AtomicOpFunc = BC.HlslOP->GetOpFunc(OP::OpCode::AtomicBinOp, Type::getInt32Ty(BC.Ctx));
+  Constant* AtomicBinOpcode = BC.HlslOP->GetU32Const((unsigned)OP::OpCode::AtomicBinOp);
+  Constant* AtomicAdd = BC.HlslOP->GetU32Const((unsigned)DXIL::AtomicBinOpCode::Add);
+  Constant* Zero32Arg = BC.HlslOP->GetU32Const(0);
+  UndefValue* UndefArg = UndefValue::get(Type::getInt32Ty(BC.Ctx));
+
+  // so inc will be zero for uninteresting invocations:
+  // The product (size * multiplicand) is emitted once per distinct size and reused afterwards.
+  // NOTE(review): the cached multiply is emitted at whatever insertion point the builder has when
+  // a size is first requested - confirm that it dominates every later use.
+  Value * IncrementForThisInvocation;
+  auto findIncrementInstruction = m_IncrementInstructionBySize.find(SpaceInBytes);
+  if (findIncrementInstruction == m_IncrementInstructionBySize.end()) {
+    Constant* Increment = BC.HlslOP->GetU32Const(SpaceInBytes);
+    auto it = m_IncrementInstructionBySize.emplace(
+      SpaceInBytes, BC.Builder.CreateMul(Increment, m_OffsetMultiplicand, "IncrementForThisInvocation"));
+    findIncrementInstruction = it.first;
+  }
+  IncrementForThisInvocation = findIncrementInstruction->second;
+
+  auto PreviousValue = BC.Builder.CreateCall(AtomicOpFunc, {
+    AtomicBinOpcode,// i32, ; opcode
+    m_HandleForUAV, // %dx.types.Handle, ; resource handle
+    AtomicAdd,      // i32, ; binary operation code : EXCHANGE, IADD, AND, OR, XOR, IMIN, IMAX, UMIN, UMAX
+    Zero32Arg,      // i32, ; coordinate c0: index in bytes
+    UndefArg,       // i32, ; coordinate c1 (unused)
+    UndefArg,       // i32, ; coordinate c2 (unused)
+    IncrementForThisInvocation,      // i32); increment value
+  }, "UAVIncResult");
+
+  // The result of the first increment emitted by this pass doubles as the invocation identifier.
+  if (m_InvocationId == nullptr)
+  {
+      m_InvocationId = PreviousValue;
+  }
+
+  // Keep the offset below the dumping ground by masking the counter value.
+  auto MaskedForLimit = BC.Builder.CreateAnd(PreviousValue, m_OffsetMask, "MaskedForUAVLimit");
+  // The return value will either end up being itself (multiplied by one and added with zero)
+  // or the "dump uninteresting things here" value of (UAVSize - a bit).
+  auto MultipliedForInterest = BC.Builder.CreateMul(MaskedForLimit, m_OffsetMultiplicand, "MultipliedForInterest");
+  auto AddedForInterest = BC.Builder.CreateAdd(MultipliedForInterest, m_OffsetAddend, "AddedForInterest");
+  m_CurrentIndex = AddedForInterest;
+}
+
+// Emits IR that appends TheValue to the debug record currently being written
+// to the UAV. Values that cannot be stored as a single 32-bit element are
+// first rewritten and fed back through this function:
+//   double    -> SplitDouble, then the low dword followed by the high dword
+//   i64       -> low 32 bits, then high 32 bits
+//   i16 / i1  -> zero-extended to i32
+//   half      -> converted to float
+// i32 and float values are written with a single-component BufferStore at
+// m_CurrentIndex, which consumes 4 bytes of the space reserved for the record.
+//
+// BC       - builder/module context; instructions are emitted at BC.Builder's
+//            current insertion point.
+// TheValue - the IR value to record.
+void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC, Value * TheValue) {
+  assert(m_RemainingReservedSpaceInBytes > 0);
+
+  auto TheValueTypeID = TheValue->getType()->getTypeID();
+  if (TheValueTypeID == Type::TypeID::DoubleTyID) {
+    Function* SplitDouble = BC.HlslOP->GetOpFunc(OP::OpCode::SplitDouble, TheValue->getType());
+    Constant* SplitDoubleOpcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::SplitDouble);
+    auto SplitDoubleIntruction = BC.Builder.CreateCall(SplitDouble, { SplitDoubleOpcode, TheValue }, "SplitDouble");
+    auto LowBits = BC.Builder.CreateExtractValue(SplitDoubleIntruction, 0, "LowBits");
+    auto HighBits = BC.Builder.CreateExtractValue(SplitDoubleIntruction, 1, "HighBits");
+    addDebugEntryValue(BC, LowBits);
+    addDebugEntryValue(BC, HighBits);
+  }
+  else if (TheValueTypeID == Type::TypeID::IntegerTyID && TheValue->getType()->getIntegerBitWidth() == 64) {
+    auto LowBits = BC.Builder.CreateTrunc(TheValue, Type::getInt32Ty(BC.Ctx), "LowBits");
+    auto ShiftedBits = BC.Builder.CreateLShr(TheValue, 32, "ShiftedBits");
+    auto HighBits = BC.Builder.CreateTrunc(ShiftedBits, Type::getInt32Ty(BC.Ctx), "HighBits");
+    addDebugEntryValue(BC, LowBits);
+    addDebugEntryValue(BC, HighBits);
+  }
+  else if (TheValueTypeID == Type::TypeID::IntegerTyID &&
+    (TheValue->getType()->getIntegerBitWidth() == 16 || TheValue->getType()->getIntegerBitWidth() == 1)) {
+    auto As32 = BC.Builder.CreateZExt(TheValue, Type::getInt32Ty(BC.Ctx), "As32");
+    addDebugEntryValue(BC, As32);
+  }
+  else if (TheValueTypeID == Type::TypeID::HalfTyID) {
+    auto AsFloat = BC.Builder.CreateFPCast(TheValue, Type::getFloatTy(BC.Ctx), "AsFloat");
+    addDebugEntryValue(BC, AsFloat);
+  }
+  else {
+    // Choose the filler for the three unused store components, matching the
+    // type of the value being stored.
+    // NOTE(review): integers of widths other than 1/16/32/64 would reach this
+    // path with an i32 filler - confirm such values cannot occur here.
+    Constant* ZeroArg = nullptr;
+    if (TheValueTypeID == Type::TypeID::IntegerTyID) {
+      ZeroArg = BC.HlslOP->GetU32Const(0);
+    }
+    else if (TheValueTypeID == Type::TypeID::FloatTyID) {
+      ZeroArg = BC.HlslOP->GetFloatConst(0.f);
+    }
+    else {
+      // The above are the only two valid types for a UAV store.
+      // Bail out instead of emitting a call with an unset operand when
+      // asserts are compiled out.
+      assert(false);
+      return;
+    }
+
+    Function* StoreValue = BC.HlslOP->GetOpFunc(OP::OpCode::BufferStore, TheValue->getType());
+    Constant* StoreValueOpcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::BufferStore);
+    Constant* Zero32Arg = BC.HlslOP->GetU32Const(0);
+    Constant* WriteMask_X = BC.HlslOP->GetI8Const(1);
+    (void)BC.Builder.CreateCall(StoreValue, {
+      StoreValueOpcode, // i32 opcode
+      m_HandleForUAV,   // %dx.types.Handle, ; resource handle
+      m_CurrentIndex,   // i32 c0: index in bytes into UAV
+      Zero32Arg,        // i32 c1: unused
+      TheValue,
+      ZeroArg,          // unused values
+      ZeroArg,          // unused values
+      ZeroArg,          // unused values
+      WriteMask_X
+    });
+
+    m_RemainingReservedSpaceInBytes -= 4;
+    assert(m_RemainingReservedSpaceInBytes < 1024);  // check for underflow
+
+    if (m_RemainingReservedSpaceInBytes != 0) {
+      // More of this record follows: advance to the next dword.
+      m_CurrentIndex = BC.Builder.CreateAdd(m_CurrentIndex, BC.HlslOP->GetU32Const(4));
+    }
+    else {
+      // The reserved space is used up; drop the index so it cannot be reused.
+      m_CurrentIndex = nullptr;
+    }
+  }
+}
+
+// Writes the "invocation start" record: a header dword describing the record
+// followed by the invocation id.
+void DxilDebugInstrumentation::addInvocationStartMarker(BuilderContext &BC) {
+  DebugShaderModifierRecordHeader StartRecord{ 0 };
+  reserveDebugEntrySpace(BC, sizeof(StartRecord));
+
+  // Fill in the header fields, then emit the packed 32-bit form.
+  auto &Details = StartRecord.Header.Details;
+  Details.SizeDwords = DebugShaderModifierRecordPayloadSizeDwords(sizeof(StartRecord));
+  Details.Flags = 0;
+  Details.Type = DebugShaderModifierRecordTypeInvocationStartMarker;
+
+  Constant* HeaderDword = BC.HlslOP->GetU32Const(StartRecord.Header.u32Header);
+  addDebugEntryValue(BC, HeaderDword);
+  addDebugEntryValue(BC, m_InvocationId);
+}
+
+// Writes one "step" record for Inst: header dword, invocation id, the running
+// instruction index (post-incremented here) and, unless the record type is the
+// void step, the value produced by Inst.
+//
+// ReturnType selects the record layout used to size the reservation.
+template<typename ReturnType>
+void DxilDebugInstrumentation::addStepEntryForType(DebugShaderModifierRecordType RecordType, BuilderContext &BC, Instruction *Inst) {
+  DebugShaderModifierRecordDXILStep<ReturnType> StepRecord = {};
+  reserveDebugEntrySpace(BC, sizeof(StepRecord));
+
+  StepRecord.Header.Details.SizeDwords = DebugShaderModifierRecordPayloadSizeDwords(sizeof(StepRecord));
+  StepRecord.Header.Details.Type = static_cast<uint8_t>(RecordType);
+
+  Constant* HeaderDword = BC.HlslOP->GetU32Const(StepRecord.Header.u32Header);
+  addDebugEntryValue(BC, HeaderDword);
+  addDebugEntryValue(BC, m_InvocationId);
+
+  Constant* StepIndex = BC.HlslOP->GetU32Const(m_InstructionIndex++);
+  addDebugEntryValue(BC, StepIndex);
+
+  // Void steps carry no result payload.
+  if (RecordType == DebugShaderModifierRecordTypeDXILStepVoid) {
+    return;
+  }
+  addDebugEntryValue(BC, Inst);
+}
+
+// Emits the step record for a single instruction, choosing the record type
+// from the instruction's result type:
+//   void / struct -> void step (no value recorded)
+//   float / half  -> float step
+//   i64           -> uint64 step
+//   other ints    -> uint32 step
+//   double        -> double step
+// PHI nodes are not instrumented. Any other result type is unsupported and
+// asserts.
+void DxilDebugInstrumentation::addStepDebugEntry(BuilderContext &BC, Instruction *Inst) {
+  if (Inst->getOpcode() == Instruction::OtherOps::PHI) {
+    return;
+  }
+
+  Type *ResultType = Inst->getType();
+
+  switch (ResultType->getTypeID()) {
+  case Type::TypeID::StructTyID:
+  case Type::TypeID::VoidTyID:
+    addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid, BC, Inst);
+    break;
+  case Type::TypeID::FloatTyID:
+  case Type::TypeID::HalfTyID:
+    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC, Inst);
+    break;
+  case Type::TypeID::IntegerTyID:
+    if (ResultType->getIntegerBitWidth() == 64) {
+      addStepEntryForType<uint64_t>(DebugShaderModifierRecordTypeDXILStepUint64, BC, Inst);
+    }
+    else {
+      addStepEntryForType<uint32_t>(DebugShaderModifierRecordTypeDXILStepUint32, BC, Inst);
+    }
+    break;
+  case Type::TypeID::DoubleTyID:
+    addStepEntryForType<double>(DebugShaderModifierRecordTypeDXILStepDouble, BC, Inst);
+    break;
+  case Type::TypeID::FP128TyID:
+  case Type::TypeID::LabelTyID:
+  case Type::TypeID::MetadataTyID:
+  case Type::TypeID::FunctionTyID:
+  case Type::TypeID::ArrayTyID:
+  case Type::TypeID::PointerTyID:
+  case Type::TypeID::VectorTyID:
+  default:
+    // Also covers TypeID values not listed above, so every unsupported result
+    // type is reported the same way instead of being skipped silently.
+    assert(false);
+    break;
+  }
+
+}
+
+// Pass entry point. For pixel, vertex and compute shaders, adds the debug UAV
+// and the invocation-selection prolog at the top of the entry function, then
+// instruments every instruction that was present before the pass ran.
+// Returns false (module untouched) for any other shader kind, true otherwise.
+bool DxilDebugInstrumentation::runOnModule(Module &M) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  LLVMContext & Ctx = M.getContext();
+  OP *HlslOP = DM.GetOP();
+
+  const auto Kind = DM.GetShaderModel()->GetKind();
+  const bool IsInstrumentedKind =
+    Kind == DXIL::ShaderKind::Pixel ||
+    Kind == DXIL::ShaderKind::Vertex ||
+    Kind == DXIL::ShaderKind::Compute;
+  if (!IsInstrumentedKind) {
+    return false;
+  }
+
+  Function *EntryFunction = DM.GetEntryFunction();
+
+  // Snapshot the original instructions first, so that the instrumentation
+  // added below is not itself instrumented.
+  std::vector<Instruction*> OriginalInstructions;
+  for (inst_iterator It = inst_begin(EntryFunction), End = inst_end(EntryFunction); It != End; ++It) {
+    OriginalInstructions.push_back(&*It);
+  }
+
+  // Branchless instrumentation requires taking care of a few things:
+  // -Each invocation of the shader will be either of interest or not of interest
+  //    -If of interest, the offset into the output UAV will be as expected
+  //    -If not, the offset is forced to (UAVsize) - (Small Amount), and that output is ignored by the CPU-side code.
+  // -The invocation of interest may overflow the UAV. This is handled by taking the modulus of the
+  //  output index. Overflow is then detected on the CPU side by checking for the presence of a canary
+  //  value at (UAVSize) - (Small Amount) * 2 (which is actually a conservative definition of overflow).
+  //
+
+  Instruction* firstInsertionPt = EntryFunction->getEntryBlock().getFirstInsertionPt();
+  IRBuilder<> Builder(firstInsertionPt);
+
+  BuilderContext BC{ M, DM, Ctx, HlslOP, Builder };
+
+  addUAV(BC);
+  auto SystemValues = addRequiredSystemValues(BC);
+  addInvocationSelectionProlog(BC, SystemValues);
+  addInvocationStartMarker(BC);
+
+  // Instrument original instructions:
+  for (Instruction *Inst : OriginalInstructions) {
+    // An instruction with a result cannot be a terminator, so its value is
+    // recorded right after it. A void instruction might be a terminator, so
+    // its instrumentation HAS to go in front of it.
+    const bool HasResult = Inst->getType()->getTypeID() != Type::TypeID::VoidTyID;
+    Instruction *InsertBefore = HasResult ? Inst->getNextNode() : Inst;
+
+    IRBuilder<> StepBuilder(InsertBefore);
+    BuilderContext StepBC{ BC.M, BC.DM, BC.Ctx, BC.HlslOP, StepBuilder };
+    addStepDebugEntry(StepBC, Inst);
+  }
+
+  DM.ReEmitDxilResources();
+
+  return true;
+}
+
+char DxilDebugInstrumentation::ID = 0; // Definition of the pass's static ID member.
+
+ModulePass *llvm::createDxilDebugInstrumentationPass() { // Factory declared in DxilGenerationPass.h; caller receives a newly allocated pass.
+  return new DxilDebugInstrumentation();
+}
+
+INITIALIZE_PASS(DxilDebugInstrumentation, "hlsl-dxil-debug-instrumentation", "HLSL DXIL debug instrumentation for PIX", false, false) // "hlsl-dxil-debug-instrumentation" is the option name the pix tests pass to opt.

+ 29 - 0
tools/clang/test/HLSL/pix/DebugBasic.hlsl

@@ -0,0 +1,29 @@
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+
+// Check that the basic starting header is present:
+
+// CHECK: %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: %XPos = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK: %YPos = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef)
+// CHECK: %XIndex = fptoui float %XPos to i32
+// CHECK: %YIndex = fptoui float %YPos to i32
+// CHECK: %CompareToX = icmp eq i32 %XIndex, 0
+// CHECK: %CompareToY = icmp eq i32 %YIndex, 0
+// CHECK: %ComparePos = and i1 %CompareToX, %CompareToY
+// CHECK: %OffsetMultiplicand = zext i1 %ComparePos to i32
+// CHECK: %ComplementOfMultiplicand = sub i32 1, %OffsetMultiplicand
+// CHECK: %OffsetAddend = mul i32 983040, %ComplementOfMultiplicand
+// CHECK: %IncrementForThisInvocation = mul i32 8, %OffsetMultiplicand
+
+// Check the first instruction was instrumented:
+// CHECK: %UAVIncResult = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0, i32 0, i32 undef, i32 undef, i32 %IncrementForThisInvocation)
+// CHECK: %MaskedForUAVLimit = and i32 %UAVIncResult, 983039
+// CHECK: %MultipliedForInterest = mul i32 %MaskedForUAVLimit, %OffsetMultiplicand
+// CHECK: %AddedForInterest = add i32 %MultipliedForInterest, %OffsetAddend
+// CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest, i32 0, i32 0, i32 0, i32 0, i32 0, i8 1)
+
+
+[RootSignature("")]
+float4 main() : SV_Target { // Entry point with no declared inputs; compiled as ps_6_0 by the RUN line.
+    return float4(0,0,0,0); // Constant output: the expectations above only look at the added instrumentation.
+}

+ 18 - 0
tools/clang/test/HLSL/pix/DebugCSParameters.hlsl

@@ -0,0 +1,18 @@
+// RUN: %dxc -Emain -Tcs_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,parameter0=10,parameter1=20,parameter2=30 | %FileCheck %s
+
+// Check that the CS thread IDs are added properly
+
+// CHECK: %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: %ThreadIdX = call i32 @dx.op.threadId.i32(i32 93, i32 0)
+// CHECK: %ThreadIdY = call i32 @dx.op.threadId.i32(i32 93, i32 1)
+// CHECK: %ThreadIdZ = call i32 @dx.op.threadId.i32(i32 93, i32 2)
+// CHECK: %CompareToThreadIdX = icmp eq i32 %ThreadIdX, 10
+// CHECK: %CompareToThreadIdY = icmp eq i32 %ThreadIdY, 20
+// CHECK: %CompareToThreadIdZ = icmp eq i32 %ThreadIdZ, 30
+// CHECK: %CompareXAndY = and i1 %CompareToThreadIdX, %CompareToThreadIdY
+// CHECK: %CompareAll = and i1 %CompareXAndY, %CompareToThreadIdZ
+
+[RootSignature("")]
+[numthreads(4, 4, 4)]
+void main() { // Empty compute entry point: the expectations above only cover the thread-id comparison prolog.
+}

+ 53 - 0
tools/clang/test/HLSL/pix/DebugFlowControl.hlsl

@@ -0,0 +1,53 @@
+// RUN: %dxc -EFlowControlPS -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+
+// Check that flow control constructs don't break the instrumentation.
+
+// check instrumentation for one branch. 
+
+// CHECK:  %UAVIncResult15 = call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle %PIX_DebugUAV_Handle, i32 0, i32 0, i32 undef, i32 undef, i32 %IncrementForThisInvocation1)
+// CHECK:  %MaskedForUAVLimit16 = and i32 %UAVIncResult15, 983039
+// CHECK:  %MultipliedForInterest17 = mul i32 %MaskedForUAVLimit16, %OffsetMultiplicand
+// CHECK:  %AddedForInterest18 = add i32 %MultipliedForInterest17, %OffsetAddend
+// CHECK:  call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %PIX_DebugUAV_Handle, i32 %AddedForInterest18, i32 0, i32 64257, i32 0, i32 0, i32 0, i8 1)
+// CHECK:  switch i32
+// CHECK:    i32 0, label 
+// CHECK:    i32 32, label
+// CHECK:  ]
+
+int i32;
+float f32;
+
+float4 Vectorize(float f) // Returns (f/128, f/128, f/128, 1); called from the `case 32` arm of FlowControlPS.
+{
+  return float4((float)f / 128.f, (float)f / 128.f, (float)f / 128.f, 1.f);
+}
+
+float4 FlowControlPS() : SV_Target // Entry point named by -E on the RUN line; exercises switch, if/else and a loop.
+{
+  float4 ret = { 0,0,0,1 };
+  switch (i32) // Two-case switch; corresponds to the `switch i32` / `i32 0` / `i32 32` expectations above.
+  {
+  case 0:
+    ret = float4(1, 0, 1, 1);
+    break;
+  case 32:
+    ret = Vectorize(f32);
+    break;
+  }
+
+  if (i32 > 10) // Two-way branch on the same global.
+  {
+    ret.r += 0.1f;
+  }
+  else
+  {
+    ret.g += 0.1f;
+  }
+
+  for (uint i = 0; i < 3; ++i) // Fixed three-iteration loop.
+  {
+    ret.b += (float)i32 / 10.f;
+  }
+
+  return ret;
+}

+ 17 - 0
tools/clang/test/HLSL/pix/DebugPSParameters.hlsl

@@ -0,0 +1,17 @@
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,parameter0=1,parameter1=2 | %FileCheck %s
+
+// Check that the pixel position parameters (parameter0 = X, parameter1 = Y) are parsed and present:
+
+// CHECK: %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: %XPos = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK: %YPos = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef)
+// CHECK: %XIndex = fptoui float %XPos to i32
+// CHECK: %YIndex = fptoui float %YPos to i32
+// CHECK: %CompareToX = icmp eq i32 %XIndex, 1
+// CHECK: %CompareToY = icmp eq i32 %YIndex, 2
+
+
+[RootSignature("")]
+float4 main() : SV_Target{ // Entry point with no declared inputs; compiled as ps_6_0 by the RUN line.
+  return float4(0,0,0,0); // Constant output: only the X/Y comparison prolog is under test.
+}

+ 15 - 0
tools/clang/test/HLSL/pix/DebugPreexistingSVInstance.hlsl

@@ -0,0 +1,15 @@
+// RUN: %dxc -Emain -Tvs_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+
+// Check that the SV_InstanceId check is present:
+
+// CHECK: %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: %VertId = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+// CHECK: %InstanceId = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK: %CompareToVertId = icmp eq i32 %VertId, 0
+// CHECK: %CompareToInstanceId = icmp eq i32 %InstanceId, 0
+// CHECK: %CompareBoth = and i1 %CompareToVertId, %CompareToInstanceId
+
+[RootSignature("")]
+float4 main(uint id : SV_InstanceId) : SV_Position{ // The shader already declares SV_InstanceId itself (the "preexisting" case).
+    return float4(id,0,0,0);
+}

+ 18 - 0
tools/clang/test/HLSL/pix/DebugPreexistingSVPosition.hlsl

@@ -0,0 +1,18 @@
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+
+// Check that the basic SV_Position check is present:
+
+// CHECK: %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: %XPos = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK: %YPos = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef)
+// CHECK: %XIndex = fptoui float %XPos to i32
+// CHECK: %YIndex = fptoui float %YPos to i32
+// CHECK: %CompareToX = icmp eq i32 %XIndex, 0
+// CHECK: %CompareToY = icmp eq i32 %YIndex, 0
+// CHECK: %ComparePos = and i1 %CompareToX, %CompareToY
+
+
+[RootSignature("")]
+float4 main(float4 pos : SV_Position) : SV_Target { // The shader already declares SV_Position itself (the "preexisting" case).
+    return pos;
+}

+ 15 - 0
tools/clang/test/HLSL/pix/DebugPreexistingSVVertex.hlsl

@@ -0,0 +1,15 @@
+// RUN: %dxc -Emain -Tvs_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation | %FileCheck %s
+
+// Check that the vertex id check is present:
+
+// CHECK:  %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK:  %VertId = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK:  %InstanceId = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+// CHECK:  %CompareToVertId = icmp eq i32 %VertId, 0
+// CHECK:  %CompareToInstanceId = icmp eq i32 %InstanceId, 0
+// CHECK:  %CompareBoth = and i1 %CompareToVertId, %CompareToInstanceId
+
+[RootSignature("")]
+float4 main(uint id : SV_VertexId) : SV_Position{ // The shader already declares SV_VertexId itself (the "preexisting" case).
+    return float4(id,0,0,0);
+}

+ 13 - 0
tools/clang/test/HLSL/pix/DebugUAVSize.hlsl

@@ -0,0 +1,13 @@
+// RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,UAVSize=100000 | %FileCheck %s
+
+// Check that the UAV size is reflected in the instrumentation. (Should be passed-in size - 64k)
+// (The offset here is the "dumping ground" for non-interesting invocations)
+// 100,000 - 65,536 = 34,464
+
+// CHECK: %OffsetAddend = mul i32 34464, %ComplementOfMultiplicand
+
+
+[RootSignature("")]
+float4 main() : SV_Target { // Entry point with no declared inputs; compiled as ps_6_0 by the RUN line.
+    return float4(0,0,0,0); // Constant output: only the UAVSize-derived offset constant is under test.
+}

+ 17 - 0
tools/clang/test/HLSL/pix/DebugVSParameters.hlsl

@@ -0,0 +1,17 @@
+// RUN: %dxc -Emain -Tvs_6_0 %s | %opt -S -hlsl-dxil-debug-instrumentation,parameter0=1,parameter1=2 | %FileCheck %s
+
+// Check that the instance and vertex id are parsed and present:
+
+// CHECK: %PIX_DebugUAV_Handle = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: %VertId = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK: %InstanceId = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+// CHECK: %CompareToVertId = icmp eq i32 %VertId, 1
+// CHECK: %CompareToInstanceId = icmp eq i32 %InstanceId, 2
+// CHECK: %CompareBoth = and i1 %CompareToVertId, %CompareToInstanceId
+// CHECK: %OffsetMultiplicand = zext i1 %CompareBoth to i32
+
+
+[RootSignature("")]
+float4 main() : SV_Position{ // Entry point with no declared inputs; compiled as vs_6_0 by the RUN line.
+  return float4(0,0,0,0); // Constant output: only the vertex/instance id comparison prolog is under test.
+}

+ 45 - 0
tools/clang/unittests/HLSL/CompilerTest.cpp

@@ -430,6 +430,15 @@ public:
   TEST_METHOD(PixConstantColorFromCB)
   TEST_METHOD(PixConstantColorFromCBint)
   TEST_METHOD(PixForceEarlyZ)
+  TEST_METHOD(PixDebugBasic)
+  TEST_METHOD(PixDebugUAVSize)
+  TEST_METHOD(PixDebugPSParameters)
+  TEST_METHOD(PixDebugVSParameters)
+  TEST_METHOD(PixDebugCSParameters)
+  TEST_METHOD(PixDebugFlowControl)
+  TEST_METHOD(PixDebugPreexistingSVPosition)
+  TEST_METHOD(PixDebugPreexistingSVVertex)
+  TEST_METHOD(PixDebugPreexistingSVInstance)
 
   TEST_METHOD(CodeGenAbs1)
   TEST_METHOD(CodeGenAbs2)
@@ -2866,6 +2875,42 @@ TEST_F(CompilerTest, PixForceEarlyZ) {
   CodeGenTestCheck(L"pix\\forceEarlyZ.hlsl");
 }
 
+TEST_F(CompilerTest, PixDebugBasic) { // Basic starting header of the instrumentation.
+  CodeGenTestCheck(L"pix\\DebugBasic.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugUAVSize) { // UAVSize option is reflected in the emitted offset.
+  CodeGenTestCheck(L"pix\\DebugUAVSize.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugPSParameters) { // parameter0/parameter1 appear in the pixel X/Y comparisons.
+  CodeGenTestCheck(L"pix\\DebugPSParameters.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugVSParameters) { // parameter0/parameter1 appear in the vertex/instance id comparisons.
+  CodeGenTestCheck(L"pix\\DebugVSParameters.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugCSParameters) { // parameter0..2 appear in the thread id comparisons.
+  CodeGenTestCheck(L"pix\\DebugCSParameters.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugFlowControl) { // Flow control constructs do not break the instrumentation.
+  CodeGenTestCheck(L"pix\\DebugFlowControl.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugPreexistingSVPosition) { // Shader that already declares SV_Position.
+  CodeGenTestCheck(L"pix\\DebugPreexistingSVPosition.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugPreexistingSVVertex) { // Shader that already declares SV_VertexId.
+  CodeGenTestCheck(L"pix\\DebugPreexistingSVVertex.hlsl");
+}
+
+TEST_F(CompilerTest, PixDebugPreexistingSVInstance) { // Shader that already declares SV_InstanceId.
+  CodeGenTestCheck(L"pix\\DebugPreexistingSVInstance.hlsl");
+}
+
 TEST_F(CompilerTest, CodeGenAbs1) {
   CodeGenTestCheck(L"..\\CodeGenHLSL\\abs1.hlsl");
 }

+ 5 - 0
utils/hct/hctdb.py

@@ -1282,6 +1282,11 @@ class db_dxil(object):
             {'n':'constant-alpha','t':'float','c':1}])
         add_pass('hlsl-dxil-remove-discards', 'DxilRemoveDiscards', 'HLSL DXIL Remove all discard instructions', [])
         add_pass('hlsl-dxil-force-early-z', 'DxilForceEarlyZ', 'HLSL DXIL Force the early Z global flag, if shader has no discard calls', [])
+        add_pass('hlsl-dxil-debug-instrumentation', 'DxilDebugInstrumentation', 'HLSL DXIL debug instrumentation for PIX', [
+            {'n':'UAVSize','t':'int','c':1},
+            {'n':'parameter0','t':'int','c':1},
+            {'n':'parameter1','t':'int','c':1},
+            {'n':'parameter2','t':'int','c':1}])
         add_pass('hlsl-dxil-reduce-msaa-to-single', 'DxilReduceMSAAToSingleSample', 'HLSL DXIL Reduce all MSAA reads to single-sample reads', [])
         add_pass('hlsl-dxilfinalize', 'DxilFinalizeModule', 'HLSL DXIL Finalize Module', [])
         add_pass('hlsl-dxilemit', 'DxilEmitMetadata', 'HLSL DXIL Metadata Emit', [])