O3DE
/
DirectXShaderCompiler
zrkadlo https://github.com/o3de/DirectXShaderCompiler


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020
							///////////////////////////////////////////////////////////////////////////////
//                                                                           //
// DxilDebugInstrumentation.cpp                                              //
// Copyright (C) Microsoft Corporation. All rights reserved.                 //
// This file is distributed under the University of Illinois Open Source     //
// License. See LICENSE.TXT for details.                                     //
//                                                                           //
// Adds instrumentation that enables shader debugging in PIX                 //
//                                                                           //
///////////////////////////////////////////////////////////////////////////////

#include "dxc/DXIL/DxilModule.h"
#include "dxc/DXIL/DxilOperations.h"
#include "dxc/DXIL/DxilUtil.h"
#include "dxc/DxilPIXPasses/DxilPIXPasses.h"
#include "dxc/DxilPIXPasses/DxilPIXVirtualRegisters.h"
#include "dxc/HLSL/DxilGenerationPass.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"

#include "PixPassHelpers.h"

using namespace llvm;
using namespace hlsl;

// Overview of instrumentation:
//
// In summary, instructions are added that cause a "trace" of the execution of
// the shader to be written out to a UAV. This trace is then used by a debugger
// application to provide a post-mortem debugging experience that reconstructs
// the execution history of the shader.
//
// The trace is only required for a particular shader instance of interest, and
// a branchless mechanism is used to write the trace either to an incrementing
// location within the UAV, or to a "dumping ground" area at the top of the UAV
// if the instance is not of interest.
//
// The following modifications are made:
//
// First, instructions are added to the top of the entry point function that
// implement the following:
// -  Examine the input variables that define the instance of the shader that is
// running. This will
//    be SV_Position for pixel shaders, SV_Vertex+SV_Instance for vertex
//    shaders, thread id for compute shaders etc. If these system values need to
//    be added to the shader, then they are also added to the input signature,
//    if appropriate.
// -  Compare the above variables with the instance of interest defined by the
// invoker of this pass.
//    Deduce two values: a multiplicand and an addend that together allow a
//    branchless calculation of the offset into the UAV at which to write via
//    "offset = offset * multiplicand + addend." If the instance is NOT of
//    interest, the multiplicand is zero and the addend is sizeof(UAV)-(a little
//    bit), causing writes for uninteresting invocations to end up at the top of
//    the UAV. Otherwise the multiplicand is 1 and the addend is 0.
// -  Calculate an "instance identifier". Even with the above instance
// identification, several invocations may
//    end up matching the selection criteria. Specifically, this happens during
//    a draw call in which many triangles overlap the pixel of interest. More on
//    this below.
//
// During execution, the instrumentation for most instructions causes data to be
// emitted to the UAV. The index at which data is written is identified by
// treating the first uint32 of the UAV as an index which is atomically
// incremented by the instrumentation. The very first value of this counter that
// is encountered by each invocation is used as the "instance identifier"
// mentioned above. That instance identifier is written out with each packet,
// since many pixel shaders executing in parallel will emit interleaved packets,
// and the debugger application uses the identifiers to group packets from each
// separate invocation together.
//
// If an instruction has a non-void and primitive return type, i.e. isn't a
// struct, then the instrumentation will write that value out to the UAV as well
// as part of the "step" data packet.
//
// The limiting size of the UAV is enforced in a branchless way by ANDing the
// offset with a precomputed value that is sizeof(UAV)-64. The actual size of
// the UAV allocated by the caller is required to be a power of two plus 64 for
// this reason. The caller detects UAV overrun by examining a canary value close
// to the end of the power-of-two size of the UAV. If this value has been
// overwritten, the debug session is deemed to have overflowed the UAV. The
// caller will then allocate a UAV that is twice the size and try again, up to a
// predefined maximum.

// Keep these in sync with the same-named value in the debugger application's
// WinPixShaderUtils.h

// Size in bytes of the region at the end of the UAV; UAVDumpingGroundOffset()
// subtracts it from the UAV size to find where that region begins.
constexpr uint64_t DebugBufferDumpingGroundSize = 64u * 1024u;
// Distance in bytes from the start of the dumping ground to the atomically
// incremented counter (half of the dumping ground). The actual max size per
// record is much smaller than this, but generosity costs nothing here.
constexpr size_t CounterOffsetBeyondUsefulData =
    static_cast<size_t>(DebugBufferDumpingGroundSize / 2);

// These definitions echo those in the debugger application's
// debugshaderrecord.h file
// Record type tags stored in the 8-bit Type field of a record header. The
// numeric values are spelled out because they are shared with the debugger
// application and must not drift.
enum DebugShaderModifierRecordType {
  DebugShaderModifierRecordTypeInvocationStartMarker = 0,
  DebugShaderModifierRecordTypeStep = 1,
  DebugShaderModifierRecordTypeEvent = 2,
  DebugShaderModifierRecordTypeInputRegister = 3,
  DebugShaderModifierRecordTypeReadRegister = 4,
  DebugShaderModifierRecordTypeWrittenRegister = 5,
  DebugShaderModifierRecordTypeRegisterRelativeIndex0 = 6,
  DebugShaderModifierRecordTypeRegisterRelativeIndex1 = 7,
  DebugShaderModifierRecordTypeRegisterRelativeIndex2 = 8,
  // DXIL step records occupy the top of the 8-bit range.
  DebugShaderModifierRecordTypeDXILStepVoid = 251,
  DebugShaderModifierRecordTypeDXILStepFloat = 252,
  DebugShaderModifierRecordTypeDXILStepUint32 = 253,
  DebugShaderModifierRecordTypeDXILStepUint64 = 254,
  DebugShaderModifierRecordTypeDXILStepDouble = 255,
};

// These structs echo those in the debugger application's debugshaderrecord.h
// file, but are recapitulated here because the originals use unnamed unions
// which are disallowed by DXCompiler's build.
//
// All record layouts below are packed to 4 bytes so that 64-bit payloads do
// not introduce padding between dwords.
#pragma pack(push, 4)

// Two-dword prefix: one bit-packed header word followed by a UID word.
struct DebugShaderModifierRecordHeader {
  union {
    struct {
      uint32_t SizeDwords : 4; // payload size, in dwords
      uint32_t Flags : 4;
      uint32_t Type : 8; // a DebugShaderModifierRecordType value
      uint32_t HeaderPayload : 16;
    } Details;
    uint32_t u32Header; // the same 32 bits viewed as a single word
  } Header;
  uint32_t UID;
};

// Prefix shared by every DXIL step record: same first two dwords as the
// generic header (the upper 16 header bits are named Opcode here), followed
// by the instruction offset.
struct DebugShaderModifierRecordDXILStepBase {
  union {
    struct {
      uint32_t SizeDwords : 4;
      uint32_t Flags : 4;
      uint32_t Type : 8;
      uint32_t Opcode : 16;
    } Details;
    uint32_t u32Header;
  } Header;
  uint32_t UID;
  uint32_t InstructionOffset;
};

// Step record for an instruction producing a value of type T: the base
// prefix, then the value itself, then a dword packing the value ordinal base
// and index as two 16-bit fields.
template <class T>
struct DebugShaderModifierRecordDXILStep
    : DebugShaderModifierRecordDXILStepBase {
  T ReturnValue;
  union {
    struct {
      uint32_t ValueOrdinalBase : 16;
      uint32_t ValueOrdinalIndex : 16;
    } Details;
    uint32_t u32ValueOrdinal;
  } ValueOrdinal;
};

// Step record for an instruction with no value: just the base prefix.
template <>
struct DebugShaderModifierRecordDXILStep<void>
    : DebugShaderModifierRecordDXILStepBase {};

#pragma pack(pop)

// Converts a record's total size in bytes into the number of dwords that
// follow the common DebugShaderModifierRecordHeader. The caller must pass a
// size that is at least sizeof(DebugShaderModifierRecordHeader); smaller
// values wrap around because the subtraction is unsigned.
uint32_t
DebugShaderModifierRecordPayloadSizeDwords(size_t recordTotalSizeBytes) {
  const size_t payloadBytes =
      recordTotalSizeBytes - sizeof(DebugShaderModifierRecordHeader);
  return static_cast<uint32_t>(payloadBytes / sizeof(uint32_t));
}

// Module pass that adds PIX shader-debugging instrumentation: a prolog that
// decides whether the running invocation is the one of interest, followed by
// per-instruction records written to a UAV.
class DxilDebugInstrumentation : public ModulePass {

private:
  // Values of the "parameter0".."parameter2" pass options (see applyOptions).
  // The named views give the same three slots a meaning per shader kind.
  union ParametersAllTogether {
    unsigned Parameters[3];
    struct PixelShaderParameters {
      unsigned X;
      unsigned Y;
    } PixelShader;
    struct VertexShaderParameters {
      unsigned VertexId;
      unsigned InstanceId;
    } VertexShader;
    struct ComputeShaderParameters {
      unsigned ThreadIdX;
      unsigned ThreadIdY;
      unsigned ThreadIdZ;
    } ComputeShader;
    struct GeometryShaderParameters {
      unsigned PrimitiveId;
      unsigned InstanceId;
    } GeometryShader;
  } m_Parameters = {{0, 0, 0}};

  // Input-signature element IDs produced by addRequiredSystemValues and
  // consumed by the per-shader-kind prologs.
  union SystemValueIndices {
    struct PixelShaderParameters {
      unsigned Position;
    } PixelShader;
    struct VertexShaderParameters {
      unsigned VertexId;
      unsigned InstanceId;
    } VertexShader;
    struct GeometryShaderParameters {
      unsigned PrimitiveId;
      unsigned InstanceId;
    } GeometryShader;
  };

  // Size of the debug UAV in bytes ("UAVSize" pass option).
  uint64_t m_UAVSize = 1024 * 1024;
  // Boolean result of the invocation-selection prolog.
  Value *m_SelectionCriterion = nullptr;
  // Resource handle used by every atomic and store emitted by this pass.
  CallInst *m_HandleForUAV = nullptr;
  // Result of the first counter increment emitted by reserveDebugEntrySpace;
  // written into every record as its identifier.
  Value *m_InvocationId = nullptr;

  // Together these two values allow branchless writing to the UAV. An
  // invocation of the shader is either of interest or not (e.g. it writes to
  // the pixel the user selected for debugging or it doesn't). If not of
  // interest, debugging output will still occur, but it will be relegated to
  // the very top few bytes of the UAV. Invocations of interest, by contrast,
  // will be written to the UAV at sequentially increasing offsets.

  // This value will either be one or zero (one if the invocation is of
  // interest, zero otherwise)
  Value *m_OffsetMultiplicand = nullptr;
  // This will either be zero (if the invocation is of interest) or
  // (UAVSize)-(SmallValue) if not.
  Value *m_OffsetAddend = nullptr;

  // Mask ANDed with the counter value to keep offsets below the dumping
  // ground.
  Constant *m_OffsetMask = nullptr;

  // Byte offset of the atomically incremented counter within the UAV.
  Constant *m_CounterOffset = nullptr;

  // Bundle of the objects every instrumentation helper needs.
  struct BuilderContext {
    Module &M;
    DxilModule &DM;
    LLVMContext &Ctx;
    OP *HlslOP;
    IRBuilder<> &Builder;
  };

  // Bytes still to be written for the record currently being emitted, and
  // the UAV offset value for the next dword (null when no record is open).
  uint32_t m_RemainingReservedSpaceInBytes = 0;
  Value *m_CurrentIndex = nullptr;

public:
  static char ID; // Pass identification, replacement for typeid
  explicit DxilDebugInstrumentation() : ModulePass(ID) {}
  const char *getPassName() const override {
    return "Add PIX debug instrumentation";
  }
  void applyOptions(PassOptions O) override;
  bool runOnModule(Module &M) override;

private:
  SystemValueIndices addRequiredSystemValues(BuilderContext &BC);
  void addInvocationSelectionProlog(BuilderContext &BC,
                                    SystemValueIndices SVIndices);
  Value *addPixelShaderProlog(BuilderContext &BC, SystemValueIndices SVIndices);
  Value *addGeometryShaderProlog(BuilderContext &BC,
                                 SystemValueIndices SVIndices);
  Value *addDispatchedShaderProlog(BuilderContext &BC);
  Value *addVertexShaderProlog(BuilderContext &BC,
                               SystemValueIndices SVIndices);
  void addDebugEntryValue(BuilderContext &BC, Value *TheValue);
  void addInvocationStartMarker(BuilderContext &BC);
  // The size is a byte count: callers pass sizeof(record) and the definition
  // stores it in m_RemainingReservedSpaceInBytes.
  void reserveDebugEntrySpace(BuilderContext &BC, uint32_t SpaceInBytes);
  void addStoreStepDebugEntry(BuilderContext &BC, StoreInst *Inst);
  void addStepDebugEntry(BuilderContext& BC, Instruction* Inst);
  void addStepDebugEntryValue(BuilderContext &BC, std::uint32_t InstNum,
                              Value *V, std::uint32_t ValueOrdinal,
                              Value *ValueOrdinalIndex);
  uint32_t UAVDumpingGroundOffset();
  template <typename ReturnType>
  void addStepEntryForType(DebugShaderModifierRecordType RecordType,
                           BuilderContext &BC, std::uint32_t InstNum, Value *V,
                           std::uint32_t ValueOrdinal,
                           Value *ValueOrdinalIndex);
};

// Reads the pass options: "parameter0".."parameter2" fill the three selection
// parameter slots (default 0) and "UAVSize" sets the UAV size in bytes
// (default 1 MiB).
void DxilDebugInstrumentation::applyOptions(PassOptions O) {
  const char *const ParameterOptionNames[3] = {"parameter0", "parameter1",
                                               "parameter2"};
  for (unsigned Slot = 0; Slot < 3; ++Slot) {
    GetPassOptionUnsigned(O, ParameterOptionNames[Slot],
                          &m_Parameters.Parameters[Slot], 0);
  }
  GetPassOptionUInt64(O, "UAVSize", &m_UAVSize, 1024 * 1024);
}

// Byte offset at which the dumping ground begins: the UAV size minus the
// dumping-ground size, truncated to 32 bits.
uint32_t DxilDebugInstrumentation::UAVDumpingGroundOffset() {
  const uint64_t OffsetInBytes = m_UAVSize - DebugBufferDumpingGroundSize;
  return static_cast<uint32_t>(OffsetInBytes);
}

// Ensures the input signature carries the system values the selection prolog
// needs for the current shader kind, and returns their element IDs.
//   Vertex: SV_VertexID and SV_InstanceID are looked up, or appended if absent.
//   Pixel:  SV_Position is looked up, or appended if absent.
//   Compute/Amplification/Mesh/Geometry: nothing is added; the returned
//   indices stay zero-initialized.
DxilDebugInstrumentation::SystemValueIndices
DxilDebugInstrumentation::addRequiredSystemValues(BuilderContext &BC) {
  hlsl::DxilSignature &Inputs = BC.DM.GetInputSignature();
  auto &Elements = Inputs.GetElements();

  // Returns the ID of the input element whose semantic is Kind. When the
  // signature has no such element, a 1-row element with the given name,
  // interpolation mode and column count is appended first.
  // NOTE(review): appended elements always use CompType F32, including the
  // vertex/instance IDs that are later loaded with the i32 LoadInput
  // overload -- confirm this is intended.
  auto FindOrAppend = [&](hlsl::DXIL::SemanticKind Kind,
                          DXIL::SigPointKind SigPoint, const char *Name,
                          hlsl::DXIL::InterpolationMode Interpolation,
                          unsigned Columns) -> unsigned {
    auto Found = std::find_if(
        Elements.begin(), Elements.end(),
        [Kind](const std::unique_ptr<DxilSignatureElement> &Element) {
          return Element->GetSemantic()->GetKind() == Kind;
        });
    if (Found != Elements.end()) {
      return Found->get()->GetID();
    }

    auto Added = llvm::make_unique<DxilSignatureElement>(SigPoint);
    Added->Initialize(Name, hlsl::CompType::getF32(), Interpolation, 1,
                      Columns);
    Added->AppendSemanticIndex(0);
    Added->SetSigPointKind(SigPoint);
    Added->SetKind(Kind);

    auto Index = Inputs.AppendElement(std::move(Added));
    return Elements[Index]->GetID();
  };

  SystemValueIndices SVIndices{};

  switch (BC.DM.GetShaderModel()->GetKind()) {
  case DXIL::ShaderKind::Vertex:
    SVIndices.VertexShader.VertexId = FindOrAppend(
        hlsl::DXIL::SemanticKind::VertexID, DXIL::SigPointKind::VSIn,
        "VertexId", hlsl::DXIL::InterpolationMode::Undefined, 1);
    SVIndices.VertexShader.InstanceId = FindOrAppend(
        hlsl::DXIL::SemanticKind::InstanceID, DXIL::SigPointKind::VSIn,
        "InstanceId", hlsl::DXIL::InterpolationMode::Undefined, 1);
    break;
  case DXIL::ShaderKind::Pixel:
    // SV_Position, if present, has to have full mask, so there is no need to
    // worry about the shader having selected components that exclude x or y.
    SVIndices.PixelShader.Position = FindOrAppend(
        hlsl::DXIL::SemanticKind::Position, DXIL::SigPointKind::PSIn,
        "Position", hlsl::DXIL::InterpolationMode::Linear, 4);
    break;
  case DXIL::ShaderKind::Amplification:
  case DXIL::ShaderKind::Mesh:
  case DXIL::ShaderKind::Compute:
    // Dispatch* thread Id is not in the input signature
    break;
  case DXIL::ShaderKind::Geometry:
    // GS Instance Id and Primitive Id are not in the input signature
    break;
  default:
    assert(false); // guaranteed by runOnModule
  }

  return SVIndices;
}

// Emits the selection test for compute, amplification and mesh shaders: reads
// the three ThreadId components and returns an i1 that is true only when all
// of them equal the requested thread id held in m_Parameters.ComputeShader.
Value *DxilDebugInstrumentation::addDispatchedShaderProlog(BuilderContext &BC) {
  OP *const HlslOP = BC.HlslOP;
  IRBuilder<> &Builder = BC.Builder;

  auto *ThreadIdFn =
      HlslOP->GetOpFunc(DXIL::OpCode::ThreadId, Type::getInt32Ty(BC.Ctx));
  Constant *ThreadIdOpcode =
      HlslOP->GetU32Const(static_cast<unsigned>(DXIL::OpCode::ThreadId));

  auto ReadComponent = [&](unsigned Component, const char *Name) {
    return Builder.CreateCall(
        ThreadIdFn, {ThreadIdOpcode, HlslOP->GetU32Const(Component)}, Name);
  };
  auto EqualsExpected = [&](Value *Actual, unsigned Expected,
                            const char *Name) {
    return Builder.CreateICmpEQ(Actual, HlslOP->GetU32Const(Expected), Name);
  };

  // All three reads are emitted before any comparison.
  Value *IdX = ReadComponent(0, "ThreadIdX");
  Value *IdY = ReadComponent(1, "ThreadIdY");
  Value *IdZ = ReadComponent(2, "ThreadIdZ");

  // Compare to expected thread ID
  Value *MatchX = EqualsExpected(IdX, m_Parameters.ComputeShader.ThreadIdX,
                                 "CompareToThreadIdX");
  Value *MatchY = EqualsExpected(IdY, m_Parameters.ComputeShader.ThreadIdY,
                                 "CompareToThreadIdY");
  Value *MatchZ = EqualsExpected(IdZ, m_Parameters.ComputeShader.ThreadIdZ,
                                 "CompareToThreadIdZ");

  Value *MatchXY = Builder.CreateAnd(MatchX, MatchY, "CompareXAndY");
  return Builder.CreateAnd(MatchXY, MatchZ, "CompareAll");
}

// Emits the selection test for vertex shaders: loads the vertex-id and
// instance-id inputs (element IDs taken from SVIndices.VertexShader) and
// returns an i1 that is true when both equal the values requested in
// m_Parameters.VertexShader.
Value *
DxilDebugInstrumentation::addVertexShaderProlog(BuilderContext &BC,
                                                SystemValueIndices SVIndices) {
  OP *const HlslOP = BC.HlslOP;
  IRBuilder<> &Builder = BC.Builder;

  Constant *Row0 = HlslOP->GetU32Const(0);
  Constant *Column0 = HlslOP->GetI8Const(0);
  UndefValue *UndefI32 = UndefValue::get(Type::getInt32Ty(BC.Ctx));

  auto *LoadInputFn =
      HlslOP->GetOpFunc(DXIL::OpCode::LoadInput, Type::getInt32Ty(BC.Ctx));
  Constant *LoadInputOpcode =
      HlslOP->GetU32Const(static_cast<unsigned>(DXIL::OpCode::LoadInput));

  // Loads row 0, column 0 of the input element with the given ID.
  auto LoadScalarInput = [&](unsigned ElementId, const char *Name) {
    Constant *ElementIdArg = HlslOP->GetU32Const(ElementId);
    return Builder.CreateCall(LoadInputFn,
                              {LoadInputOpcode, ElementIdArg, Row0 /*row*/,
                               Column0 /*column*/, UndefI32},
                              Name);
  };

  Value *LoadedVertexId =
      LoadScalarInput(SVIndices.VertexShader.VertexId, "VertId");
  Value *LoadedInstanceId =
      LoadScalarInput(SVIndices.VertexShader.InstanceId, "InstanceId");

  // Compare to expected vertex ID and instance ID
  Value *VertexMatches = Builder.CreateICmpEQ(
      LoadedVertexId, HlslOP->GetU32Const(m_Parameters.VertexShader.VertexId),
      "CompareToVertId");
  Value *InstanceMatches = Builder.CreateICmpEQ(
      LoadedInstanceId,
      HlslOP->GetU32Const(m_Parameters.VertexShader.InstanceId),
      "CompareToInstanceId");

  return Builder.CreateAnd(VertexMatches, InstanceMatches, "CompareBoth");
}

// Emits the selection test for geometry shaders. The primitive id is always
// compared against m_Parameters.GeometryShader.PrimitiveId. The GS instance
// id is read and compared as well only when the module declares more than one
// GS instance. SVIndices is not used here.
Value *DxilDebugInstrumentation::addGeometryShaderProlog(
    BuilderContext &BC, SystemValueIndices SVIndices) {
  OP *const HlslOP = BC.HlslOP;
  IRBuilder<> &Builder = BC.Builder;
  Type *const Int32Ty = Type::getInt32Ty(BC.Ctx);

  auto *PrimitiveIdFn = HlslOP->GetOpFunc(DXIL::OpCode::PrimitiveID, Int32Ty);
  Constant *PrimitiveIdOpcode =
      HlslOP->GetU32Const(static_cast<unsigned>(DXIL::OpCode::PrimitiveID));
  Value *PrimitiveId =
      Builder.CreateCall(PrimitiveIdFn, {PrimitiveIdOpcode}, "PrimId");

  Value *PrimitiveMatches = Builder.CreateICmpEQ(
      PrimitiveId,
      HlslOP->GetU32Const(m_Parameters.GeometryShader.PrimitiveId),
      "CompareToPrimId");

  // Without GS instancing the primitive id alone selects the invocation.
  const bool UsesInstancing = BC.DM.GetGSInstanceCount() > 1;
  if (!UsesInstancing) {
    return PrimitiveMatches;
  }

  auto *InstanceIdFn = HlslOP->GetOpFunc(DXIL::OpCode::GSInstanceID, Int32Ty);
  Constant *InstanceIdOpcode =
      HlslOP->GetU32Const(static_cast<unsigned>(DXIL::OpCode::GSInstanceID));
  Value *InstanceId =
      Builder.CreateCall(InstanceIdFn, {InstanceIdOpcode}, "GSInstanceId");

  // Compare to expected GS instance ID and combine with the primitive test
  Value *InstanceMatches = Builder.CreateICmpEQ(
      InstanceId, HlslOP->GetU32Const(m_Parameters.GeometryShader.InstanceId),
      "CompareToInstanceId");

  return Builder.CreateAnd(PrimitiveMatches, InstanceMatches, "CompareBoth");
}

// Emits the selection test for pixel shaders: loads columns 0 and 1 of the
// position input (element ID from SVIndices.PixelShader.Position), converts
// them to unsigned integers and returns an i1 that is true when both equal
// the X/Y requested in m_Parameters.PixelShader.
Value *
DxilDebugInstrumentation::addPixelShaderProlog(BuilderContext &BC,
                                               SystemValueIndices SVIndices) {
  OP *const HlslOP = BC.HlslOP;
  IRBuilder<> &Builder = BC.Builder;
  Type *const Int32Ty = Type::getInt32Ty(BC.Ctx);

  Constant *Row0 = HlslOP->GetU32Const(0);
  Constant *Column0 = HlslOP->GetI8Const(0);
  Constant *Column1 = HlslOP->GetI8Const(1);
  UndefValue *UndefI32 = UndefValue::get(Int32Ty);

  auto *LoadInputFn =
      HlslOP->GetOpFunc(DXIL::OpCode::LoadInput, Type::getFloatTy(BC.Ctx));
  Constant *LoadInputOpcode =
      HlslOP->GetU32Const(static_cast<unsigned>(DXIL::OpCode::LoadInput));
  Constant *PositionId = HlslOP->GetU32Const(SVIndices.PixelShader.Position);

  auto LoadPositionColumn = [&](Constant *Column, const char *Name) {
    return Builder.CreateCall(LoadInputFn,
                              {LoadInputOpcode, PositionId, Row0 /*row*/,
                               Column /*column*/, UndefI32},
                              Name);
  };

  // Both loads are emitted first, then both conversions.
  Value *PosX = LoadPositionColumn(Column0, "XPos");
  Value *PosY = LoadPositionColumn(Column1, "YPos");

  // Convert SV_POSITION to UINT
  Value *PixelX = Builder.CreateFPToUI(PosX, Int32Ty, "XIndex");
  Value *PixelY = Builder.CreateFPToUI(PosY, Int32Ty, "YIndex");

  // Compare to expected pixel position
  Value *MatchX = Builder.CreateICmpEQ(
      PixelX, HlslOP->GetU32Const(m_Parameters.PixelShader.X), "CompareToX");
  Value *MatchY = Builder.CreateICmpEQ(
      PixelY, HlslOP->GetU32Const(m_Parameters.PixelShader.Y), "CompareToY");

  return Builder.CreateAnd(MatchX, MatchY, "ComparePos");
}

// Emits the per-shader-kind selection test and derives from it the values
// used for branchless UAV addressing:
//   m_OffsetMultiplicand = zext(test)            (1 if of interest, else 0)
//   m_OffsetAddend       = dumpingGroundOffset * (1 - multiplicand)
//   m_OffsetMask         = dumpingGroundOffset - 1
//   m_CounterOffset      = dumpingGroundOffset + CounterOffsetBeyondUsefulData
// The raw test result is kept in m_SelectionCriterion.
void DxilDebugInstrumentation::addInvocationSelectionProlog(
    BuilderContext &BC, SystemValueIndices SVIndices) {
  Value *IsInvocationOfInterest = nullptr;

  switch (BC.DM.GetShaderModel()->GetKind()) {
  case DXIL::ShaderKind::Pixel:
    IsInvocationOfInterest = addPixelShaderProlog(BC, SVIndices);
    break;
  case DXIL::ShaderKind::Vertex:
    IsInvocationOfInterest = addVertexShaderProlog(BC, SVIndices);
    break;
  case DXIL::ShaderKind::Geometry:
    IsInvocationOfInterest = addGeometryShaderProlog(BC, SVIndices);
    break;
  case DXIL::ShaderKind::Compute:
  case DXIL::ShaderKind::Amplification:
  case DXIL::ShaderKind::Mesh:
    IsInvocationOfInterest = addDispatchedShaderProlog(BC);
    break;
  default:
    assert(false); // guaranteed by runOnModule
  }

  OP *const HlslOP = BC.HlslOP;
  IRBuilder<> &Builder = BC.Builder;
  const uint32_t DumpingGroundOffset = UAVDumpingGroundOffset();

  // This is a convenient place to calculate the values that modify the UAV
  // offset for invocations of interest and for UAV size.
  m_OffsetMultiplicand = Builder.CreateZExt(
      IsInvocationOfInterest, Type::getInt32Ty(BC.Ctx), "OffsetMultiplicand");
  Value *OneMinusMultiplicand = Builder.CreateSub(
      HlslOP->GetU32Const(1), m_OffsetMultiplicand, "ComplementOfMultiplicand");
  m_OffsetAddend =
      Builder.CreateMul(HlslOP->GetU32Const(DumpingGroundOffset),
                        OneMinusMultiplicand, "OffsetAddend");

  m_OffsetMask = HlslOP->GetU32Const(DumpingGroundOffset - 1);
  m_CounterOffset =
      HlslOP->GetU32Const(DumpingGroundOffset + CounterOffsetBeyondUsefulData);

  m_SelectionCriterion = IsInvocationOfInterest;
}

// Opens a new record of SpaceInBytes bytes. Emits an atomic add on the UAV
// counter (the increment is multiplied by m_OffsetMultiplicand, so it is zero
// for invocations that are not of interest) and turns the counter's previous
// value into the write offset for the record's first dword:
//   offset = (previous & m_OffsetMask) * m_OffsetMultiplicand + m_OffsetAddend
// The first counter value ever obtained is remembered as m_InvocationId.
// No record may already be open when this is called.
void DxilDebugInstrumentation::reserveDebugEntrySpace(BuilderContext &BC,
                                                      uint32_t SpaceInBytes) {
  assert(m_CurrentIndex == nullptr);
  assert(m_RemainingReservedSpaceInBytes == 0);

  m_RemainingReservedSpaceInBytes = SpaceInBytes;

  OP *const HlslOP = BC.HlslOP;
  IRBuilder<> &Builder = BC.Builder;
  Type *const Int32Ty = Type::getInt32Ty(BC.Ctx);

  // Insert the UAV increment instruction:
  Function *AtomicFn = HlslOP->GetOpFunc(OP::OpCode::AtomicBinOp, Int32Ty);
  Constant *AtomicOpcode =
      HlslOP->GetU32Const(static_cast<unsigned>(OP::OpCode::AtomicBinOp));
  Constant *AddOperation =
      HlslOP->GetU32Const(static_cast<unsigned>(DXIL::AtomicBinOpCode::Add));
  UndefValue *UnusedCoordinate = UndefValue::get(Int32Ty);

  // so inc will be zero for uninteresting invocations:
  Value *EffectiveIncrement =
      Builder.CreateMul(HlslOP->GetU32Const(SpaceInBytes), m_OffsetMultiplicand,
                        "IncrementForThisInvocation");

  Value *AtomicArgs[] = {
      AtomicOpcode,       // i32: opcode
      m_HandleForUAV,     // %dx.types.Handle: resource handle
      AddOperation,       // i32: binary operation code (IADD here)
      m_CounterOffset,    // i32: coordinate c0, index in bytes
      UnusedCoordinate,   // i32: coordinate c1 (unused)
      UnusedCoordinate,   // i32: coordinate c2 (unused)
      EffectiveIncrement, // i32: increment value
  };
  auto CounterBeforeAdd =
      Builder.CreateCall(AtomicFn, AtomicArgs, "UAVIncResult");

  if (m_InvocationId == nullptr) {
    m_InvocationId = CounterBeforeAdd;
  }

  // Clamp to the usable part of the UAV, then either keep the offset
  // (multiplied by one, plus zero) or redirect it to the "dump uninteresting
  // things here" location (multiplied by zero, plus the dumping-ground offset).
  Value *Clamped =
      Builder.CreateAnd(CounterBeforeAdd, m_OffsetMask, "MaskedForUAVLimit");
  Value *Scaled =
      Builder.CreateMul(Clamped, m_OffsetMultiplicand, "MultipliedForInterest");
  m_CurrentIndex = Builder.CreateAdd(Scaled, m_OffsetAddend, "AddedForInterest");
}

// Appends TheValue to the record opened by reserveDebugEntrySpace.
//   double     -> split with SplitDouble and written as low dword, high dword
//   i64        -> written as low dword, high dword
//   i16 / i1   -> zero-extended to i32 and written
//   half       -> converted to float and written
//   otherwise  -> written directly with one BufferStore (integer or float)
// Each direct store consumes 4 bytes of the reservation and advances
// m_CurrentIndex by 4; when the reservation reaches zero m_CurrentIndex is
// reset to null so the next record can be opened.
void DxilDebugInstrumentation::addDebugEntryValue(BuilderContext &BC,
                                                  Value *TheValue) {
  assert(m_RemainingReservedSpaceInBytes > 0);

  IRBuilder<> &Builder = BC.Builder;
  Type *const ValueTy = TheValue->getType();
  Type *const Int32Ty = Type::getInt32Ty(BC.Ctx);

  if (ValueTy->isDoubleTy()) {
    Function *SplitFn = BC.HlslOP->GetOpFunc(OP::OpCode::SplitDouble, ValueTy);
    Constant *SplitOpcode = BC.HlslOP->GetU32Const(
        static_cast<unsigned>(DXIL::OpCode::SplitDouble));
    auto Halves =
        Builder.CreateCall(SplitFn, {SplitOpcode, TheValue}, "SplitDouble");
    auto Low = Builder.CreateExtractValue(Halves, 0, "LowBits");
    auto High = Builder.CreateExtractValue(Halves, 1, "HighBits");
    addDebugEntryValue(BC, Low);
    addDebugEntryValue(BC, High);
    return;
  }

  if (ValueTy->isIntegerTy(64)) {
    auto Low = Builder.CreateTrunc(TheValue, Int32Ty, "LowBits");
    auto Shifted = Builder.CreateLShr(TheValue, 32, "ShiftedBits");
    auto High = Builder.CreateTrunc(Shifted, Int32Ty, "HighBits");
    addDebugEntryValue(BC, Low);
    addDebugEntryValue(BC, High);
    return;
  }

  if (ValueTy->isIntegerTy(16) || ValueTy->isIntegerTy(1)) {
    addDebugEntryValue(BC, Builder.CreateZExt(TheValue, Int32Ty, "As32"));
    return;
  }

  if (ValueTy->isHalfTy()) {
    addDebugEntryValue(
        BC, Builder.CreateFPCast(TheValue, Type::getFloatTy(BC.Ctx), "AsFloat"));
    return;
  }

  // Direct store of a single dword-sized value.
  Function *StoreFn = BC.HlslOP->GetOpFunc(OP::OpCode::BufferStore, ValueTy);
  Constant *StoreOpcode =
      BC.HlslOP->GetU32Const(static_cast<unsigned>(DXIL::OpCode::BufferStore));
  UndefValue *UnusedCoordinate = UndefValue::get(Int32Ty);

  // The three unused value channels must be undef of the store's value type.
  UndefValue *UnusedChannel = nullptr;
  if (ValueTy->isIntegerTy()) {
    UnusedChannel = UndefValue::get(Int32Ty);
  } else if (ValueTy->isFloatTy()) {
    UnusedChannel = UndefValue::get(Type::getFloatTy(BC.Ctx));
  } else {
    // The above are the only two valid types for a UAV store
    assert(false);
  }

  Value *StoreArgs[] = {
      StoreOpcode,               // i32 opcode
      m_HandleForUAV,            // %dx.types.Handle: resource handle
      m_CurrentIndex,            // i32 c0: index in bytes into UAV
      UnusedCoordinate,          // i32 c1: unused
      TheValue,                  // value channel 0
      UnusedChannel,             // unused value
      UnusedChannel,             // unused value
      UnusedChannel,             // unused value
      BC.HlslOP->GetI8Const(1)}; // write mask: first channel only
  (void)Builder.CreateCall(StoreFn, StoreArgs);

  m_RemainingReservedSpaceInBytes -= 4;
  assert(m_RemainingReservedSpaceInBytes < 1024); // check for underflow

  if (m_RemainingReservedSpaceInBytes == 0) {
    m_CurrentIndex = nullptr;
  } else {
    m_CurrentIndex =
        Builder.CreateAdd(m_CurrentIndex, BC.HlslOP->GetU32Const(4));
  }
}

// Emits the "invocation start" record into the debug output: reserves room
// for one DebugShaderModifierRecordHeader, then writes the packed header
// dword followed by the invocation id.
void DxilDebugInstrumentation::addInvocationStartMarker(BuilderContext &BC) {
  DebugShaderModifierRecordHeader StartRecord{{{0, 0, 0, 0}}, 0};

  // Space must be reserved before any value of the record is written.
  reserveDebugEntrySpace(BC, sizeof(StartRecord));

  // Fill in the header fields; they are read back below as a single dword
  // through u32Header.
  StartRecord.Header.Details.SizeDwords =
      DebugShaderModifierRecordPayloadSizeDwords(sizeof(StartRecord));
  StartRecord.Header.Details.Flags = 0;
  StartRecord.Header.Details.Type =
      DebugShaderModifierRecordTypeInvocationStartMarker;

  Constant *PackedHeader =
      BC.HlslOP->GetU32Const(StartRecord.Header.u32Header);
  addDebugEntryValue(BC, PackedHeader);
  addDebugEntryValue(BC, m_InvocationId);
}

// Emits one "DXIL step" record whose payload layout is
// DebugShaderModifierRecordDXILStep<ReturnType>.
//
// Every record carries the packed header dword, the invocation id and the
// instruction number. Unless RecordType is the void step type, the record
// additionally carries the value V and one dword that encodes
// (ValueOrdinal << 16) | (ValueOrdinalIndex & 0xFFFF).
//
//   RecordType        - record type stored in the header.
//   BC                - builder context used to emit the stores.
//   InstNum           - instruction number written into the record.
//   V                 - value to record (not written for void steps).
//   ValueOrdinal      - compile-time ordinal, placed in the upper 16 bits.
//   ValueOrdinalIndex - IR value masked to 16 bits, placed in the lower bits.
template <typename ReturnType>
void DxilDebugInstrumentation::addStepEntryForType(
    DebugShaderModifierRecordType RecordType, BuilderContext &BC,
    std::uint32_t InstNum, Value *V, std::uint32_t ValueOrdinal,
    Value *ValueOrdinalIndex) {
  DebugShaderModifierRecordDXILStep<ReturnType> StepRecord = {};
  reserveDebugEntrySpace(BC, sizeof(StepRecord));

  StepRecord.Header.Details.SizeDwords =
      DebugShaderModifierRecordPayloadSizeDwords(sizeof(StepRecord));
  StepRecord.Header.Details.Type = static_cast<uint8_t>(RecordType);

  // Fixed part of the record: header, invocation id, instruction number.
  Constant *PackedHeader = BC.HlslOP->GetU32Const(StepRecord.Header.u32Header);
  addDebugEntryValue(BC, PackedHeader);
  addDebugEntryValue(BC, m_InvocationId);
  addDebugEntryValue(BC, BC.HlslOP->GetU32Const(InstNum));

  // Void steps have no value payload.
  if (RecordType == DebugShaderModifierRecordTypeDXILStepVoid) {
    return;
  }

  addDebugEntryValue(BC, V);

  // The ordinal is known at compile time, so it is shifted here; the index is
  // an IR value, so the mask and the OR are emitted as instructions.
  Value *OrdinalInHighBits = BC.HlslOP->GetU32Const(ValueOrdinal << 16);
  Value *IndexInLowBits = BC.Builder.CreateAnd(
      ValueOrdinalIndex, BC.HlslOP->GetU32Const(0xFFFF), "ValueOrdinalIndex");
  Value *OrdinalAndIndex =
      BC.Builder.CreateOr(OrdinalInHighBits, IndexInLowBits, "ValueOrdinal");
  addDebugEntryValue(BC, OrdinalAndIndex);
}

// Emits a step record for a store instruction, recording the stored value.
// Nothing is emitted when the store carries no PixAllocaRegWrite annotation,
// carries no PIX instruction number, or stores the result of an
// allocate-ray-query instruction.
void DxilDebugInstrumentation::addStoreStepDebugEntry(BuilderContext &BC,
                                                      StoreInst *Inst) {
  std::uint32_t RegBase;
  std::uint32_t RegSizeIgnored; // required by FromInst, not used here
  llvm::Value *RegIndex;
  const bool HasAllocaRegWrite = pix_dxil::PixAllocaRegWrite::FromInst(
      Inst, &RegBase, &RegSizeIgnored, &RegIndex);
  if (!HasAllocaRegWrite) {
    return;
  }

  std::uint32_t InstructionNumber;
  const bool HasInstNum =
      pix_dxil::PixDxilInstNum::FromInst(Inst, &InstructionNumber);
  if (!HasInstNum) {
    return;
  }

  Value *StoredValue = Inst->getValueOperand();
  if (PIXPassHelpers::IsAllocateRayQueryInstruction(StoredValue)) {
    return;
  }

  addStepDebugEntryValue(BC, InstructionNumber, StoredValue, RegBase,
                         RegIndex);
}

// Emits a step record for a single instruction.
//  - PHI nodes and allocate-ray-query instructions are ignored here.
//  - Stores are delegated to addStoreStepDebugEntry.
//  - Any other instruction is recorded only when it carries both a PIX
//    register number and a PIX instruction number; its own result is the
//    recorded value, with a constant zero as the ordinal index.
void DxilDebugInstrumentation::addStepDebugEntry(BuilderContext &BC,
                                                 Instruction *Inst) {
  if (llvm::isa<llvm::PHINode>(Inst) ||
      PIXPassHelpers::IsAllocateRayQueryInstruction(Inst)) {
    return;
  }

  if (auto *Store = llvm::dyn_cast<llvm::StoreInst>(Inst)) {
    addStoreStepDebugEntry(BC, Store);
    return;
  }

  std::uint32_t Register;
  const bool HasRegister = pix_dxil::PixDxilReg::FromInst(Inst, &Register);
  if (!HasRegister) {
    return;
  }

  std::uint32_t InstructionNumber;
  const bool HasInstNum =
      pix_dxil::PixDxilInstNum::FromInst(Inst, &InstructionNumber);
  if (!HasInstNum) {
    return;
  }

  Value *ZeroIndex = BC.Builder.getInt32(0);
  addStepDebugEntryValue(BC, InstructionNumber, Inst, Register, ZeroIndex);
}

// Chooses the step record layout that matches the LLVM type of V and emits
// it through addStepEntryForType:
//   void / struct  -> void step (no value payload)
//   float / half   -> float step
//   integer        -> uint64 step when 64 bits wide, uint32 step otherwise
//   double         -> double step
//   pointer        -> nothing is emitted
// Every other type id trips an assert.
void DxilDebugInstrumentation::addStepDebugEntryValue(
    BuilderContext &BC, std::uint32_t InstNum, Value *V,
    std::uint32_t ValueOrdinal, Value *ValueOrdinalIndex) {
  Type *const ValueTy = V->getType();

  switch (ValueTy->getTypeID()) {
  case Type::TypeID::PointerTyID:
    // Pointer computations are deliberately not recorded: they are an
    // implementation detail (lookup tables and the like) that means little to
    // the user, and their type is awkward for a UI to present. The
    // instructions that later dereference the pointer are instrumented and
    // show the meaningful loaded value.
    break;

  case Type::TypeID::VoidTyID:
  case Type::TypeID::StructTyID:
    addStepEntryForType<void>(DebugShaderModifierRecordTypeDXILStepVoid, BC,
                              InstNum, V, ValueOrdinal, ValueOrdinalIndex);
    break;

  case Type::TypeID::HalfTyID:
  case Type::TypeID::FloatTyID:
    // Half values are recorded using the float step layout as well.
    addStepEntryForType<float>(DebugShaderModifierRecordTypeDXILStepFloat, BC,
                               InstNum, V, ValueOrdinal, ValueOrdinalIndex);
    break;

  case Type::TypeID::DoubleTyID:
    addStepEntryForType<double>(DebugShaderModifierRecordTypeDXILStepDouble, BC,
                                InstNum, V, ValueOrdinal, ValueOrdinalIndex);
    break;

  case Type::TypeID::IntegerTyID:
    if (ValueTy->getIntegerBitWidth() != 64) {
      addStepEntryForType<uint32_t>(DebugShaderModifierRecordTypeDXILStepUint32,
                                    BC, InstNum, V, ValueOrdinal,
                                    ValueOrdinalIndex);
    } else {
      addStepEntryForType<uint64_t>(DebugShaderModifierRecordTypeDXILStepUint64,
                                    BC, InstNum, V, ValueOrdinal,
                                    ValueOrdinalIndex);
    }
    break;

  // Types that are not expected to reach this function.
  case Type::TypeID::X86_FP80TyID:
  case Type::TypeID::FP128TyID:
  case Type::TypeID::PPC_FP128TyID:
  case Type::TypeID::LabelTyID:
  case Type::TypeID::MetadataTyID:
  case Type::TypeID::X86_MMXTyID:
  case Type::TypeID::FunctionTyID:
  case Type::TypeID::ArrayTyID:
  case Type::TypeID::VectorTyID:
    assert(false);
    break;
  }
}

// Pass entry point. Instruments the module's entry function so that it writes
// PIX debug records to a UAV:
//   1. creates the debug UAV handle, the required system values, the
//      invocation-selection prolog and the invocation start marker at the
//      first non-alloca insertion point;
//   2. for every block that begins with PHI nodes, splits each incoming edge
//      with a new "PIXDebugN" block that records the value flowing into each
//      PHI along that edge;
//   3. adds a step record for every instruction that existed in the entry
//      function before instrumentation began.
// Returns false, leaving the module untouched, for shader kinds other than
// amplification, mesh, vertex, geometry, pixel and compute; returns true
// otherwise.
bool DxilDebugInstrumentation::runOnModule(Module &M) {
  DxilModule &DM = M.GetOrCreateDxilModule();
  LLVMContext &Ctx = M.getContext();
  OP *HlslOP = DM.GetOP();

  // Only the shader kinds listed here are instrumented.
  auto ShaderModel = DM.GetShaderModel();
  switch (ShaderModel->GetKind()) {
  case DXIL::ShaderKind::Amplification:
  case DXIL::ShaderKind::Mesh:
  case DXIL::ShaderKind::Vertex:
  case DXIL::ShaderKind::Geometry:
  case DXIL::ShaderKind::Pixel:
  case DXIL::ShaderKind::Compute:
    break;
  default:
    return false;
  }

  // First record pointers to all instructions in the function:
  // (This snapshot is taken before anything is inserted, so the final loop
  // below visits only original instructions and never the instrumentation
  // added by this pass.)
  std::vector<Instruction *> AllInstructions;
  for (inst_iterator I = inst_begin(DM.GetEntryFunction()),
                     E = inst_end(DM.GetEntryFunction());
       I != E; ++I) {
    AllInstructions.push_back(&*I);
  }

  // Branchless instrumentation requires taking care of a few things:
  // -Each invocation of the shader will be either of interest or not of
  // interest
  //    -If of interest, the offset into the output UAV will be as expected
  //    -If not, the offset is forced to (UAVsize) - (Small Amount), and that
  //    output is ignored by the CPU-side code.
  // -The invocation of interest may overflow the UAV. This is handled by taking
  // the modulus of the
  //  output index. Overflow is then detected on the CPU side by checking for
  //  the presence of a canary value at (UAVSize) - (Small Amount) * 2 (which is
  //  actually a conservative definition of overflow).
  //

  // The prolog (UAV handle, system values, invocation selection, start marker)
  // is emitted at the first non-alloca insertion point of the entry function.
  Instruction *firstInsertionPt =
      dxilutil::FirstNonAllocaInsertionPt(DM.GetEntryFunction());
  IRBuilder<> Builder(firstInsertionPt);

  BuilderContext BC{M, DM, Ctx, HlslOP, Builder};

  m_HandleForUAV = PIXPassHelpers::CreateUAV(BC.DM, BC.Builder, 0, "PIX_DebugUAV_Handle");

  auto SystemValues = addRequiredSystemValues(BC);
  addInvocationSelectionProlog(BC, SystemValues);
  addInvocationStartMarker(BC);

  // Explicitly name new blocks in order to provide stable names for testing purposes
  int NewBlockCounter = 0;

  // PHI instrumentation: records cannot be placed next to a PHI itself, so
  // each incoming edge is routed through a freshly created block that records
  // the incoming value and then branches to the original block.
  auto Fn = DM.GetEntryFunction();
  auto &Blocks = Fn->getBasicBlockList();
  for (auto &CurrentBlock : Blocks) {
    // One incoming (value, phi, operand index) triple of a PHI in CurrentBlock.
    struct ValueAndPhi {
      Value *Val;
      PHINode *Phi;
      unsigned Index;
    };

    // Incoming PHI operands of CurrentBlock, grouped by the predecessor block
    // they arrive from.
    // NOTE(review): this map is ordered by BasicBlock pointer value, so the
    // order in which new blocks are created (and hence their PIXDebugN
    // numbering) follows pointer ordering - confirm this meets the "stable
    // names" intent stated above.
    std::map<BasicBlock *, std::vector<ValueAndPhi>> InsertableEdges;
    auto &Is = CurrentBlock.getInstList();
    for (auto &Inst : Is) {
      // Only the run of PHIs at the start of the block is examined.
      if (Inst.getOpcode() != Instruction::OtherOps::PHI) {
        break;
      }
      PHINode &PN = llvm::cast<PHINode>(Inst);
      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
        BasicBlock *PhiBB = PN.getIncomingBlock(i);
        Value *PhiVal = PN.getIncomingValue(i);
        InsertableEdges[PhiBB].push_back({PhiVal, &PN, i});
      }
    }

    for (auto &InsertableEdge : InsertableEdges) {
      // The new block is created in the same function as the predecessor.
      // This Builder intentionally shadows the function-level one and inserts
      // into the new block.
      auto *NewBlock = BasicBlock::Create(Ctx, "PIXDebug" + std::to_string(NewBlockCounter++),
                                          InsertableEdge.first->getParent());
      IRBuilder<> Builder(NewBlock);

      auto *PreviousBlock = InsertableEdge.first;

      // Modify all successor operands of the terminator in the previous block
      // that match the current block to point to the new block:
      TerminatorInst *terminator = PreviousBlock->getTerminator();
      unsigned NumSuccessors = terminator->getNumSuccessors();
      for (unsigned SuccessorIndex = 0; SuccessorIndex < NumSuccessors;
           ++SuccessorIndex) {
        auto *CurrentSuccessor = terminator->getSuccessor(SuccessorIndex);
        if (CurrentSuccessor == &CurrentBlock) {
          terminator->setSuccessor(SuccessorIndex, NewBlock);
        }
      }

      // Modify the Phis and add debug instrumentation
      for (auto &ValueNPhi : InsertableEdge.second) {
        // Modify the phi to refer to the new block:
        ValueNPhi.Phi->setIncomingBlock(ValueNPhi.Index, NewBlock);

        // Add instrumentation to the new block
        // (PHIs without a PIX register number or instruction number are still
        // retargeted above, but nothing is recorded for them.)
        std::uint32_t RegNum;
        if (!pix_dxil::PixDxilReg::FromInst(ValueNPhi.Phi, &RegNum)) {
          continue;
        }

        std::uint32_t InstNum;
        if (!pix_dxil::PixDxilInstNum::FromInst(ValueNPhi.Phi, &InstNum)) {
          continue;
        }

        // Shadows the outer BC so that the record is emitted with the new
        // block's builder. The recorded value is the incoming value, tagged
        // with the PHI's register and instruction numbers.
        BuilderContext BC{M, DM, Ctx, HlslOP, Builder};
        addStepDebugEntryValue(BC, InstNum, ValueNPhi.Val, RegNum,
                               BC.Builder.getInt32(0));
      }

      // Add a branch to the new block to point to the current block
      Builder.CreateBr(&CurrentBlock);
    }
  }

  // Instrument original instructions:
  for (auto &Inst : AllInstructions) {
    // Instrumentation goes after the instruction if it is not a terminator.
    // Otherwise, Instrumentation goes prior to the instruction.
    if (!Inst->isTerminator()) {
      // Insert before the instruction that follows this one.
      IRBuilder<> Builder(Inst->getNextNode());
      BuilderContext BC2{BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder};
      addStepDebugEntry(BC2, Inst);
    } else {
      // Insert before this instruction
      IRBuilder<> Builder(Inst);
      BuilderContext BC2{BC.M, BC.DM, BC.Ctx, BC.HlslOP, Builder};
      addStepDebugEntry(BC2, Inst);
    }
  }

  // NOTE(review): presumably needed so the UAV created above is reflected in
  // the module's emitted resource metadata - confirm against DxilModule.
  DM.ReEmitDxilResources();

  return true;
}

// Out-of-class definition of the pass's static ID member, initialized to 0.
// NOTE(review): by LLVM pass convention the address of this member, not its
// value, identifies the pass - confirm against the class declaration.
char DxilDebugInstrumentation::ID = 0;

// Factory for the PIX debug instrumentation pass. Returns a newly allocated
// DxilDebugInstrumentation as a ModulePass pointer.
ModulePass *llvm::createDxilDebugInstrumentationPass() {
  ModulePass *const NewPass = new DxilDebugInstrumentation();
  return NewPass;
}

// Registers the pass with the argument name "hlsl-dxil-debug-instrumentation"
// and the description string below.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS - confirm against the
// macro definition.
INITIALIZE_PASS(DxilDebugInstrumentation, "hlsl-dxil-debug-instrumentation",
                "HLSL DXIL debug instrumentation for PIX", false, false)