/////////////////////////////////////////////////////////////////////////////// // // // DxilAddPixelHitInstrumentation.cpp // // Copyright (C) Microsoft Corporation. All rights reserved. // // This file is distributed under the University of Illinois Open Source // // License. See LICENSE.TXT for details. // // // // Provides a pass to add instrumentation to retrieve mesh shader output. // // Used by PIX. // // // /////////////////////////////////////////////////////////////////////////////// #include "dxc/DXIL/DxilOperations.h" #include "dxc/DXIL/DxilUtil.h" #include "dxc/DXIL/DxilInstructions.h" #include "dxc/DXIL/DxilModule.h" #include "dxc/DxilPIXPasses/DxilPIXPasses.h" #include "dxc/HLSL/DxilGenerationPass.h" #include "dxc/HLSL/DxilSpanAllocator.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Transforms/Utils/Local.h" #include #ifdef _WIN32 #include #endif #include "PixPassHelpers.h" // Keep these in sync with the same-named value in the debugger application's // WinPixShaderUtils.h constexpr uint64_t DebugBufferDumpingGroundSize = 64 * 1024; // The actual max size per record is much smaller than this, but it never // hurts to be generous. constexpr size_t CounterOffsetBeyondUsefulData = DebugBufferDumpingGroundSize / 2; // Keep these in sync with the same-named values in PIX's MeshShaderOutput.cpp constexpr uint32_t triangleIndexIndicator = 1; constexpr uint32_t int32ValueIndicator = 2; constexpr uint32_t floatValueIndicator = 3; constexpr uint32_t int16ValueIndicator = 4; constexpr uint32_t float16ValueIndicator = 5; using namespace llvm; using namespace hlsl; class DxilPIXMeshShaderOutputInstrumentation : public ModulePass { public: static char ID; // Pass identification, replacement for typeid explicit DxilPIXMeshShaderOutputInstrumentation() : ModulePass(ID) {} const char *getPassName() const override { return "DXIL mesh shader output instrumentation"; } void applyOptions(PassOptions O) override; bool runOnModule(Module &M) override; private: CallInst *m_OutputUAV = nullptr; int m_RemainingReservedSpaceInBytes = 0; Constant *m_OffsetMask = nullptr; uint64_t m_UAVSize = 1024 * 1024; struct BuilderContext { Module &M; DxilModule &DM; LLVMContext &Ctx; OP *HlslOP; IRBuilder<> &Builder; }; Value *insertInstructionsToCalculateFlattenedGroupIdXandY(BuilderContext &BC); Value *insertInstructionsToCalculateGroupIdZ(BuilderContext &BC); Value *reserveDebugEntrySpace(BuilderContext &BC, uint32_t SpaceInBytes); uint32_t UAVDumpingGroundOffset(); Value *writeDwordAndReturnNewOffset(BuilderContext &BC, Value *TheOffset, Value *TheValue); template void Instrument(BuilderContext &BC, T... values); }; void DxilPIXMeshShaderOutputInstrumentation::applyOptions(PassOptions O) { GetPassOptionUInt64(O, "UAVSize", &m_UAVSize, 1024 * 1024); } uint32_t DxilPIXMeshShaderOutputInstrumentation::UAVDumpingGroundOffset() { return static_cast(m_UAVSize - DebugBufferDumpingGroundSize); } Value *DxilPIXMeshShaderOutputInstrumentation:: insertInstructionsToCalculateFlattenedGroupIdXandY(BuilderContext &BC) { Constant *Zero32Arg = BC.HlslOP->GetU32Const(0); Constant *One32Arg = BC.HlslOP->GetU32Const(1); auto GroupIdFunc = BC.HlslOP->GetOpFunc(DXIL::OpCode::GroupId, Type::getInt32Ty(BC.Ctx)); Constant *Opcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::GroupId); auto GroupIdX = BC.Builder.CreateCall(GroupIdFunc, {Opcode, Zero32Arg}, "GroupIdX"); auto GroupIdY = BC.Builder.CreateCall(GroupIdFunc, {Opcode, One32Arg}, "GroupIdY"); // Spec requires that no group id index is greater than 64k, so we can // combine two into one 32-bit value: auto YShifted = BC.Builder.CreateShl(GroupIdY, 16); return BC.Builder.CreateAdd(YShifted, GroupIdX); } Value *DxilPIXMeshShaderOutputInstrumentation:: insertInstructionsToCalculateGroupIdZ(BuilderContext &BC) { Constant *Two32Arg = BC.HlslOP->GetU32Const(2); auto GroupIdFunc = BC.HlslOP->GetOpFunc(DXIL::OpCode::GroupId, Type::getInt32Ty(BC.Ctx)); Constant *Opcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::GroupId); return BC.Builder.CreateCall(GroupIdFunc, {Opcode, Two32Arg}, "GroupIdZ"); } Value *DxilPIXMeshShaderOutputInstrumentation::reserveDebugEntrySpace( BuilderContext &BC, uint32_t SpaceInBytes) { // Check the previous caller didn't reserve too much space: assert(m_RemainingReservedSpaceInBytes == 0); // Check that the caller didn't ask for so much memory that it will // overwrite the offset counter: assert(m_RemainingReservedSpaceInBytes < (int)CounterOffsetBeyondUsefulData); m_RemainingReservedSpaceInBytes = SpaceInBytes; // Insert the UAV increment instruction: Function *AtomicOpFunc = BC.HlslOP->GetOpFunc(OP::OpCode::AtomicBinOp, Type::getInt32Ty(BC.Ctx)); Constant *AtomicBinOpcode = BC.HlslOP->GetU32Const((unsigned)OP::OpCode::AtomicBinOp); Constant *AtomicAdd = BC.HlslOP->GetU32Const((unsigned)DXIL::AtomicBinOpCode::Add); Constant *OffsetArg = BC.HlslOP->GetU32Const(UAVDumpingGroundOffset() + CounterOffsetBeyondUsefulData); UndefValue *UndefArg = UndefValue::get(Type::getInt32Ty(BC.Ctx)); Constant *Increment = BC.HlslOP->GetU32Const(SpaceInBytes); auto *PreviousValue = BC.Builder.CreateCall( AtomicOpFunc, { AtomicBinOpcode, // i32, ; opcode m_OutputUAV, // %dx.types.Handle, ; resource handle AtomicAdd, // i32, ; binary operation code : EXCHANGE, IADD, AND, OR, // XOR, IMIN, IMAX, UMIN, UMAX OffsetArg, // i32, ; coordinate c0: index in bytes UndefArg, // i32, ; coordinate c1 (unused) UndefArg, // i32, ; coordinate c2 (unused) Increment, // i32); increment value }, "UAVIncResult"); return BC.Builder.CreateAnd(PreviousValue, m_OffsetMask, "MaskedForUAVLimit"); } Value *DxilPIXMeshShaderOutputInstrumentation::writeDwordAndReturnNewOffset( BuilderContext &BC, Value *TheOffset, Value *TheValue) { Function *StoreValue = BC.HlslOP->GetOpFunc(OP::OpCode::BufferStore, Type::getInt32Ty(BC.Ctx)); Constant *StoreValueOpcode = BC.HlslOP->GetU32Const((unsigned)DXIL::OpCode::BufferStore); UndefValue *Undef32Arg = UndefValue::get(Type::getInt32Ty(BC.Ctx)); Constant *WriteMask_X = BC.HlslOP->GetI8Const(1); (void)BC.Builder.CreateCall( StoreValue, {StoreValueOpcode, // i32 opcode m_OutputUAV, // %dx.types.Handle, ; resource handle TheOffset, // i32 c0: index in bytes into UAV Undef32Arg, // i32 c1: unused TheValue, Undef32Arg, // unused values Undef32Arg, // unused values Undef32Arg, // unused values WriteMask_X}); m_RemainingReservedSpaceInBytes -= sizeof(uint32_t); assert(m_RemainingReservedSpaceInBytes >= 0); // or else the caller didn't reserve enough space return BC.Builder.CreateAdd( TheOffset, BC.HlslOP->GetU32Const(static_cast(sizeof(uint32_t)))); } template void DxilPIXMeshShaderOutputInstrumentation::Instrument(BuilderContext &BC, T... values) { llvm::SmallVector Values( {static_cast(values)...}); const uint32_t DwordCount = Values.size(); llvm::Value *byteOffset = reserveDebugEntrySpace(BC, DwordCount * sizeof(uint32_t)); for (llvm::Value *V : Values) { byteOffset = writeDwordAndReturnNewOffset(BC, byteOffset, V); } } bool DxilPIXMeshShaderOutputInstrumentation::runOnModule(Module &M) { DxilModule &DM = M.GetOrCreateDxilModule(); LLVMContext &Ctx = M.getContext(); OP *HlslOP = DM.GetOP(); Instruction *firstInsertionPt = dxilutil::FirstNonAllocaInsertionPt(DM.GetEntryFunction()); IRBuilder<> Builder(firstInsertionPt); BuilderContext BC{M, DM, Ctx, HlslOP, Builder}; m_OffsetMask = BC.HlslOP->GetU32Const(UAVDumpingGroundOffset() - 1); m_OutputUAV = PIXPassHelpers::CreateUAV(DM, Builder, 0, "PIX_DebugUAV_Handle"); auto GroupIdXandY = insertInstructionsToCalculateFlattenedGroupIdXandY(BC); auto GroupIdZ = insertInstructionsToCalculateGroupIdZ(BC); auto F = HlslOP->GetOpFunc(DXIL::OpCode::EmitIndices, Type::getVoidTy(Ctx)); auto FunctionUses = F->uses(); for (auto FI = FunctionUses.begin(); FI != FunctionUses.end();) { auto &FunctionUse = *FI++; auto FunctionUser = FunctionUse.getUser(); auto Call = cast(FunctionUser); IRBuilder<> Builder2(Call); BuilderContext BC2{M, DM, Ctx, HlslOP, Builder2}; Instrument(BC2, BC2.HlslOP->GetI32Const(triangleIndexIndicator), GroupIdXandY, GroupIdZ, Call->getOperand(1), Call->getOperand(2), Call->getOperand(3), Call->getOperand(4)); } struct OutputType { Type *type; uint32_t tag; }; SmallVector StoreVertexOutputOverloads { {Type::getInt32Ty(Ctx), int32ValueIndicator}, {Type::getInt16Ty(Ctx), int16ValueIndicator}, {Type::getFloatTy(Ctx), floatValueIndicator}, {Type::getHalfTy(Ctx), float16ValueIndicator} }; for (auto const &Overload : StoreVertexOutputOverloads) { F = HlslOP->GetOpFunc(DXIL::OpCode::StoreVertexOutput, Overload.type); FunctionUses = F->uses(); for (auto FI = FunctionUses.begin(); FI != FunctionUses.end();) { auto &FunctionUse = *FI++; auto FunctionUser = FunctionUse.getUser(); auto Call = cast(FunctionUser); IRBuilder<> Builder2(Call); BuilderContext BC2{M, DM, Ctx, HlslOP, Builder2}; // Expand column index to 32 bits: auto ColumnIndex = BC2.Builder.CreateCast( Instruction::ZExt, Call->getOperand(3), Type::getInt32Ty(Ctx)); // Coerce actual value to int32 Value *CoercedValue = Call->getOperand(4); if (Overload.tag == floatValueIndicator) { CoercedValue = BC2.Builder.CreateCast( Instruction::BitCast, CoercedValue, Type::getInt32Ty(Ctx)); } else if (Overload.tag == float16ValueIndicator) { auto * HalfInt = BC2.Builder.CreateCast( Instruction::BitCast, CoercedValue, Type::getInt16Ty(Ctx)); CoercedValue = BC2.Builder.CreateCast( Instruction::ZExt, HalfInt, Type::getInt32Ty(Ctx)); } else if (Overload.tag == int16ValueIndicator) { CoercedValue = BC2.Builder.CreateCast( Instruction::ZExt, CoercedValue, Type::getInt32Ty(Ctx)); } Instrument( BC2, BC2.HlslOP->GetI32Const(Overload.tag), GroupIdXandY, GroupIdZ, Call->getOperand(1), Call->getOperand(2), ColumnIndex, CoercedValue, Call->getOperand(5)); } } DM.ReEmitDxilResources(); return true; } char DxilPIXMeshShaderOutputInstrumentation::ID = 0; ModulePass *llvm::createDxilDxilPIXMeshShaderOutputInstrumentation() { return new DxilPIXMeshShaderOutputInstrumentation(); } INITIALIZE_PASS(DxilPIXMeshShaderOutputInstrumentation, "hlsl-dxil-pix-meshshader-output-instrumentation", "DXIL mesh shader output instrumentation for PIX", false, false)