瀏覽代碼

Support instruction counts in reflection (#2930)

Tex Riddell 5 年之前
父節點
當前提交
ee442e01bc

+ 67 - 0
include/dxc/DXIL/DxilCounters.h

@@ -0,0 +1,67 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilCounters.h                                                            //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Counters for Dxil instructions types.                                     //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <stdint.h>
+
+namespace llvm {
+  class Module;
+  class StringRef;
+}
+
+namespace hlsl {
+
+// Plain collection of 32-bit tallies, one per counter name. Every field starts
+// at zero. The field list between the BEGIN/END markers is produced from the
+// <py> expression on the line above it, so it is kept verbatim here.
+struct DxilCounters {
+  // <py::lines('OPCODE-COUNTERS')>['uint32_t %s = 0;' % c for c in hctdb_instrhelp.get_counters()]</py>
+  // OPCODE-COUNTERS:BEGIN
+  uint32_t array_local_bytes = 0;
+  uint32_t array_local_ldst = 0;
+  uint32_t array_static_bytes = 0;
+  uint32_t array_static_ldst = 0;
+  uint32_t array_tgsm_bytes = 0;
+  uint32_t array_tgsm_ldst = 0;
+  uint32_t atomic = 0;
+  uint32_t barrier = 0;
+  uint32_t branches = 0;
+  uint32_t fence = 0;
+  uint32_t floats = 0;
+  uint32_t gs_cut = 0;
+  uint32_t gs_emit = 0;
+  uint32_t insts = 0;
+  uint32_t ints = 0;
+  uint32_t sig_ld = 0;
+  uint32_t sig_st = 0;
+  uint32_t tex_bias = 0;
+  uint32_t tex_cmp = 0;
+  uint32_t tex_grad = 0;
+  uint32_t tex_load = 0;
+  uint32_t tex_norm = 0;
+  uint32_t tex_store = 0;
+  uint32_t uints = 0;
+  // OPCODE-COUNTERS:END
+
+  // Returns array_local_bytes + array_static_bytes + array_tgsm_bytes.
+  // Const so it can be called through a const DxilCounters reference.
+  // The sum is computed in uint32_t and wraps on overflow.
+  uint32_t AllArrayBytes() const {
+    return array_local_bytes
+      + array_static_bytes
+      + array_tgsm_bytes;
+  }
+  // Returns array_local_ldst + array_static_ldst + array_tgsm_ldst.
+  // Const so it can be called through a const DxilCounters reference.
+  // The sum is computed in uint32_t and wraps on overflow.
+  uint32_t AllArrayAccesses() const {
+    return array_local_ldst
+      + array_static_ldst
+      + array_tgsm_ldst;
+  }
+};
+
+// Walks module M and adds what it finds to `counters`. Existing values in
+// `counters` are incremented, not reset, so pass a freshly initialized
+// DxilCounters to get totals for M alone.
+void CountInstructions(llvm::Module &M, DxilCounters& counters);
+// Returns a pointer to the field of `counters` whose name equals `name`
+// (exact, case-sensitive match), or nullptr when no counter has that name.
+uint32_t *LookupByName(llvm::StringRef name, DxilCounters& counters);
+
+} // namespace hlsl

+ 12 - 0
include/dxc/DXIL/DxilMetadataHelper.h

@@ -55,6 +55,7 @@ class RootSignatureHandle;
 struct DxilFunctionProps;
 class DxilSubobjects;
 class DxilSubobject;
+struct DxilCounters;
 
 // Additional debug information for SROA'ed array variables,
 // where adjacent elements in DXIL might not have been adjacent
@@ -89,6 +90,10 @@ public:
   static const char kDxilIntermediateOptionsMDName[];
   static const unsigned kDxilIntermediateOptionsFlags = 0;  // Unique element ID.
 
+  // DxilCounters
+  static const char kDxilCountersMDName[];
+  // !{!"<counter>", i32 <count>, !"<counter>", i32 <count>, ...}
+
   // Entry points.
   static const char kDxilEntryPointsMDName[];
 
@@ -431,6 +436,10 @@ public:
   // Extra metadata present
   bool HasExtraMetadata() { return m_bExtraMetadata; }
 
+  // Instruction Counters
+  void EmitDxilCounters(const DxilCounters &counters);
+  void LoadDxilCounters(DxilCounters &counters) const;
+
   // Shader specific.
 private:
   llvm::MDTuple *EmitDxilGSState(DXIL::InputPrimitive Primitive, unsigned MaxVertexCount, 
@@ -473,6 +482,9 @@ private:
 
   llvm::MDTuple *EmitDxilASState(const unsigned *NumThreads, unsigned payloadSizeInBytes);
   void LoadDxilASState(const llvm::MDOperand &MDO, unsigned *NumThreads, unsigned &payloadSizeInBytes);
+
+  void AddCounterIfNonZero(uint32_t value, llvm::StringRef name, std::vector<llvm::Metadata*> &MDVals);
+  void LoadCounterMD(const llvm::MDOperand &MDName, const llvm::MDOperand &MDValue, DxilCounters &counters) const;
 public:
   // Utility functions.
   static bool IsKnownNamedMetaData(const llvm::NamedMDNode &Node);

+ 3 - 0
include/dxc/DXIL/DxilModule.h

@@ -181,6 +181,9 @@ public:
   /// Return true if non-fatal metadata error was detected.
   bool HasMetadataErrors();
 
+  void EmitDxilCounters();
+  void LoadDxilCounters(DxilCounters &counters) const;
+
   /// Check if a Named meta data node is known by dxil module.
   static bool IsKnownNamedMetaData(llvm::NamedMDNode &Node);
 

+ 1 - 0
lib/DXIL/CMakeLists.txt

@@ -3,6 +3,7 @@
 add_llvm_library(LLVMDXIL
   DxilCBuffer.cpp
   DxilCompType.cpp
+  DxilCounters.cpp
   DxilInterpolationMode.cpp
   DxilMetadataHelper.cpp
   DxilModule.cpp

+ 406 - 0
lib/DXIL/DxilCounters.cpp

@@ -0,0 +1,406 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilCounters.cpp                                                           //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/DXIL/DxilCounters.h"
+#include "dxc/Support/Global.h"
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include "dxc/DXIL/DxilOperations.h"
+#include "dxc/DXIL/DxilInstructions.h"
+
+using namespace llvm;
+using namespace hlsl;
+using namespace hlsl::DXIL;
+
+namespace hlsl {
+
+namespace {
+
+// Classification of a pointer value as computed by GetPointerInfo(): which kind
+// of memory it refers to and whether that memory is an array.
+struct PointerInfo {
+  // Kind of storage a pointer was traced back to. Unknown is the default when
+  // no other category was assigned.
+  enum class MemType : unsigned {
+    Unknown = 0,
+    Global_Static,
+    Global_TGSM,
+    Alloca
+  };
+
+  // Two bits are enough for the four MemType enumerators (values 0..3).
+  MemType memType : 2;
+  // True when the pointee type of the classified pointer is an array type.
+  bool isArray : 1;
+
+  // Starts as "unknown memory, not an array".
+  PointerInfo() :
+    memType(MemType::Unknown),
+    isArray(false)
+  {}
+};
+
+typedef DenseMap<Value*, PointerInfo> PointerInfoMap;
+
+// Classifies the memory that pointer value V refers to, memoizing results.
+//
+//   V          - pointer-typed value; its pointee type is inspected.
+//   ptrInfoMap - cache of already classified pointers; gains an entry for V.
+//
+// Global variables and allocas are classified directly. GEPs, bitcasts and
+// address-space casts take over the full classification (memType and isArray)
+// of the pointer they are derived from. Everything else, PHI nodes included,
+// keeps MemType::Unknown.
+PointerInfo GetPointerInfo(Value* V, PointerInfoMap &ptrInfoMap) {
+  auto it = ptrInfoMap.find(V);
+  if (it != ptrInfoMap.end())
+    return it->second;
+
+  // Work on a local copy: the recursive calls below insert into ptrInfoMap,
+  // which may reallocate its storage and invalidate any reference into it.
+  PointerInfo PI;
+  Type *Ty = V->getType()->getPointerElementType();
+  PI.isArray = Ty->isArrayTy();
+
+  // Register V before recursing so a lookup of V from a nested call finds an
+  // entry instead of recursing again.
+  ptrInfoMap[V] = PI;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+    if (GV->getType()->getPointerAddressSpace() == DXIL::kTGSMAddrSpace)
+      PI.memType = PointerInfo::MemType::Global_TGSM;
+    else if (!GV->isConstant() &&
+             GV->getLinkage() == GlobalVariable::LinkageTypes::InternalLinkage &&
+             GV->getType()->getPointerAddressSpace() == DXIL::kDefaultAddrSpace)
+      PI.memType = PointerInfo::MemType::Global_Static;
+  } else if (isa<AllocaInst>(V)) {
+    PI.memType = PointerInfo::MemType::Alloca;
+  } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+    PI = GetPointerInfo(GEP->getPointerOperand(), ptrInfoMap);
+  } else if (BitCastOperator *BC = dyn_cast<BitCastOperator>(V)) {
+    PI = GetPointerInfo(BC->getOperand(0), ptrInfoMap);
+  } else if (AddrSpaceCastInst *AC = dyn_cast<AddrSpaceCastInst>(V)) {
+    PI = GetPointerInfo(AC->getOperand(0), ptrInfoMap);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    // Constant-expression address-space cast. ConstantExpr::getOpcode() uses
+    // the Instruction opcode numbering, so compare against
+    // Instruction::AddrSpaceCast (not the C-API LLVMOpcode value), and follow
+    // CE's own operand: AC from the previous condition is null on this path.
+    if (CE->getOpcode() == Instruction::AddrSpaceCast)
+      PI = GetPointerInfo(CE->getOperand(0), ptrInfoMap);
+  //} else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+  //  for (auto it = PN->value_op_begin(), e = PN->value_op_end(); it != e; ++it) {
+  //    PI = GetPointerInfo(*it, ptrInfoMap);
+  //    if (PI.memType != PointerInfo::MemType::Unknown)
+  //      break;
+  //  }
+  }
+
+  // Look the slot up again (it may have moved) and store the final answer.
+  ptrInfoMap[V] = PI;
+  return PI;
+}
+
+// Pair of one-bit flags describing a value; both start out false.
+struct ValueInfo {
+  bool isCbuffer : 1;
+  bool isConstant : 1;
+
+  ValueInfo() : isCbuffer(false), isConstant(false) {}
+
+  // Returns a ValueInfo in which a flag is set only if it is set in both
+  // *this and other.
+  ValueInfo Combine(const ValueInfo &other) const {
+    ValueInfo both;
+    if (isCbuffer && other.isCbuffer)
+      both.isCbuffer = true;
+    if (isConstant && other.isConstant)
+      both.isConstant = true;
+    return both;
+  }
+};
+
+typedef SmallDenseMap<Value*, ValueInfo, 16> ValueInfoMap;
+
+// Computes the ValueInfo flags for V, memoizing results in valueInfoMap.
+//
+//   - Constants are marked isConstant.
+//   - Calls to the CBufferLoad / CBufferLoadLegacy DXIL ops are marked isCbuffer.
+//   - A compare combines (flag-wise AND) the info of its two operands.
+//   - An extractelement takes the info of its vector operand.
+//   - Anything else keeps both flags false.
+ValueInfo GetValueInfo(Value* V, ValueInfoMap &valueInfoMap) {
+  auto it = valueInfoMap.find(V);
+  if (it != valueInfoMap.end())
+    return it->second;
+
+  // Work on a local copy: the recursive calls below insert into valueInfoMap,
+  // which may grow and invalidate any reference into it.
+  ValueInfo VI;
+
+  // Register V before recursing so a nested lookup of V finds an entry.
+  valueInfoMap[V] = VI;
+
+  if (isa<Constant>(V)) {
+    VI.isConstant = true;
+  } else if (CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (hlsl::OP::IsDxilOpFuncCallInst(CI)) {
+      OpCode opcode = (OpCode)llvm::cast<llvm::ConstantInt>(CI->getOperand(0))->getZExtValue();
+      if (opcode == OpCode::CBufferLoad || opcode == OpCode::CBufferLoadLegacy)
+        VI.isCbuffer = true;
+    }
+  } else if (CmpInst *CMP = dyn_cast<CmpInst>(V)) {
+    VI = GetValueInfo(CMP->getOperand(0), valueInfoMap).Combine(
+         GetValueInfo(CMP->getOperand(1), valueInfoMap));
+  } else if (ExtractElementInst *EE = dyn_cast<ExtractElementInst>(V)) {
+    VI = GetValueInfo(EE->getVectorOperand(), valueInfoMap);
+  }
+  // TODO: fill out more as necessary
+
+  // Look the slot up again (it may have moved) and store the final answer.
+  valueInfoMap[V] = VI;
+  return VI;
+}
+
+/*<py>
+
+def tab_lines(text):
+  return ['  ' + line for line in text.splitlines()]
+
+def gen_count_dxil_op(counter):
+  return (['bool CountDxilOp_%s(unsigned op) {' % counter] +
+          tab_lines(
+            hctdb_instrhelp.get_instrs_pred("op", hctdb_instrhelp.counter_pred(counter, True))) +
+          ['}'])
+
+def gen_count_llvm_op(counter):
+  return (['bool CountLlvmOp_%s(unsigned op) {' % counter] +
+          tab_lines(
+            hctdb_instrhelp.get_instrs_pred("op", hctdb_instrhelp.counter_pred(counter, False), 'llvm_id')) +
+          ['}'])
+
+def gen_counter_functions():
+  lines = ['// Counter functions for Dxil ops:']
+  for counter in hctdb_instrhelp.get_dxil_op_counters():
+    lines += gen_count_dxil_op(counter)
+  lines.append('// Counter functions for llvm ops:')
+  for counter in hctdb_instrhelp.get_llvm_op_counters():
+    lines += gen_count_llvm_op(counter)
+  return lines
+
+</py>*/
+
+// Opcode predicates: each CountDxilOp_<counter> / CountLlvmOp_<counter> below
+// takes an opcode number and returns true when that opcode is listed for
+// <counter>. The comment inside each function names the opcodes it accepts.
+// NOTE(review): the text between the BEGIN/END markers appears to be emitted
+// from gen_counter_functions() in the <py> block above; presumably changes
+// belong in the generator rather than here - confirm before hand-editing.
+// <py::lines('OPCODE-COUNTERS')>gen_counter_functions()</py>
+// OPCODE-COUNTERS:BEGIN
+// Counter functions for Dxil ops:
+bool CountDxilOp_atomic(unsigned op) {
+  // Instructions: BufferUpdateCounter=70, AtomicBinOp=78,
+  // AtomicCompareExchange=79
+  return op == 70 || (78 <= op && op <= 79);
+}
+bool CountDxilOp_barrier(unsigned op) {
+  // Instructions: Barrier=80
+  return op == 80;
+}
+bool CountDxilOp_floats(unsigned op) {
+  // Instructions: FAbs=6, Saturate=7, IsNaN=8, IsInf=9, IsFinite=10,
+  // IsNormal=11, Cos=12, Sin=13, Tan=14, Acos=15, Asin=16, Atan=17, Hcos=18,
+  // Hsin=19, Htan=20, Exp=21, Frc=22, Log=23, Sqrt=24, Rsqrt=25, Round_ne=26,
+  // Round_ni=27, Round_pi=28, Round_z=29, FMax=35, FMin=36, Fma=47, Dot2=54,
+  // Dot3=55, Dot4=56, Dot2AddHalf=162
+  return (6 <= op && op <= 29) || (35 <= op && op <= 36) || op == 47 || (54 <= op && op <= 56) || op == 162;
+}
+bool CountDxilOp_gs_cut(unsigned op) {
+  // Instructions: CutStream=98, EmitThenCutStream=99
+  return (98 <= op && op <= 99);
+}
+bool CountDxilOp_gs_emit(unsigned op) {
+  // Instructions: EmitStream=97, EmitThenCutStream=99
+  return op == 97 || op == 99;
+}
+bool CountDxilOp_ints(unsigned op) {
+  // Instructions: IMax=37, IMin=38, IMul=41, IMad=48, Ibfe=51,
+  // Dot4AddI8Packed=163
+  return (37 <= op && op <= 38) || op == 41 || op == 48 || op == 51 || op == 163;
+}
+bool CountDxilOp_sig_ld(unsigned op) {
+  // Instructions: LoadInput=4, LoadOutputControlPoint=103, LoadPatchConstant=104
+  return op == 4 || (103 <= op && op <= 104);
+}
+bool CountDxilOp_sig_st(unsigned op) {
+  // Instructions: StoreOutput=5, StorePatchConstant=106, StoreVertexOutput=171,
+  // StorePrimitiveOutput=172
+  return op == 5 || op == 106 || (171 <= op && op <= 172);
+}
+bool CountDxilOp_tex_bias(unsigned op) {
+  // Instructions: SampleBias=61
+  return op == 61;
+}
+bool CountDxilOp_tex_cmp(unsigned op) {
+  // Instructions: SampleCmp=64, SampleCmpLevelZero=65, TextureGatherCmp=74
+  return (64 <= op && op <= 65) || op == 74;
+}
+bool CountDxilOp_tex_grad(unsigned op) {
+  // Instructions: SampleGrad=63
+  return op == 63;
+}
+bool CountDxilOp_tex_load(unsigned op) {
+  // Instructions: TextureLoad=66, BufferLoad=68, RawBufferLoad=139
+  return op == 66 || op == 68 || op == 139;
+}
+bool CountDxilOp_tex_norm(unsigned op) {
+  // Instructions: Sample=60, SampleLevel=62, TextureGather=73
+  return op == 60 || op == 62 || op == 73;
+}
+bool CountDxilOp_tex_store(unsigned op) {
+  // Instructions: TextureStore=67, BufferStore=69, RawBufferStore=140,
+  // WriteSamplerFeedback=174, WriteSamplerFeedbackBias=175,
+  // WriteSamplerFeedbackLevel=176, WriteSamplerFeedbackGrad=177
+  return op == 67 || op == 69 || op == 140 || (174 <= op && op <= 177);
+}
+bool CountDxilOp_uints(unsigned op) {
+  // Instructions: Bfrev=30, Countbits=31, FirstbitLo=32, FirstbitHi=33,
+  // FirstbitSHi=34, UMax=39, UMin=40, UMul=42, UDiv=43, UAddc=44, USubb=45,
+  // UMad=49, Msad=50, Ubfe=52, Bfi=53, Dot4AddU8Packed=164
+  return (30 <= op && op <= 34) || (39 <= op && op <= 40) || (42 <= op && op <= 45) || (49 <= op && op <= 50) || (52 <= op && op <= 53) || op == 164;
+}
+// Counter functions for llvm ops:
+bool CountLlvmOp_atomic(unsigned op) {
+  // Instructions: AtomicCmpXchg=31, AtomicRMW=32
+  return (31 <= op && op <= 32);
+}
+bool CountLlvmOp_fence(unsigned op) {
+  // Instructions: Fence=30
+  return op == 30;
+}
+bool CountLlvmOp_floats(unsigned op) {
+  // Instructions: FAdd=9, FSub=11, FMul=13, FDiv=16, FRem=19, FPToUI=36,
+  // FPToSI=37, UIToFP=38, SIToFP=39, FPTrunc=40, FPExt=41, FCmp=47
+  return op == 9 || op == 11 || op == 13 || op == 16 || op == 19 || (36 <= op && op <= 41) || op == 47;
+}
+bool CountLlvmOp_ints(unsigned op) {
+  // Instructions: Add=8, Sub=10, Mul=12, SDiv=15, SRem=18, AShr=22, Trunc=33,
+  // SExt=35, ICmp=46
+  return op == 8 || op == 10 || op == 12 || op == 15 || op == 18 || op == 22 || op == 33 || op == 35 || op == 46;
+}
+bool CountLlvmOp_uints(unsigned op) {
+  // Instructions: UDiv=14, URem=17, Shl=20, LShr=21, And=23, Or=24, Xor=25,
+  // ZExt=34
+  return op == 14 || op == 17 || (20 <= op && op <= 21) || (23 <= op && op <= 25) || op == 34;
+}
+// OPCODE-COUNTERS:END
+
+// Increments every field of `counters` whose CountDxilOp_* predicate accepts
+// the DXIL opcode number `op`. The checks are independent, so one opcode can
+// bump more than one counter (99 is accepted by both the gs_cut and gs_emit
+// predicates). Opcodes accepted by no predicate leave `counters` untouched.
+void CountDxilOp(unsigned op, DxilCounters &counters) {
+  // <py::lines('COUNT-DXIL-OPS')>['if (CountDxilOp_%s(op)) ++counters.%s;' % (c,c) for c in hctdb_instrhelp.get_dxil_op_counters()]</py>
+  // COUNT-DXIL-OPS:BEGIN
+  if (CountDxilOp_atomic(op)) ++counters.atomic;
+  if (CountDxilOp_barrier(op)) ++counters.barrier;
+  if (CountDxilOp_floats(op)) ++counters.floats;
+  if (CountDxilOp_gs_cut(op)) ++counters.gs_cut;
+  if (CountDxilOp_gs_emit(op)) ++counters.gs_emit;
+  if (CountDxilOp_ints(op)) ++counters.ints;
+  if (CountDxilOp_sig_ld(op)) ++counters.sig_ld;
+  if (CountDxilOp_sig_st(op)) ++counters.sig_st;
+  if (CountDxilOp_tex_bias(op)) ++counters.tex_bias;
+  if (CountDxilOp_tex_cmp(op)) ++counters.tex_cmp;
+  if (CountDxilOp_tex_grad(op)) ++counters.tex_grad;
+  if (CountDxilOp_tex_load(op)) ++counters.tex_load;
+  if (CountDxilOp_tex_norm(op)) ++counters.tex_norm;
+  if (CountDxilOp_tex_store(op)) ++counters.tex_store;
+  if (CountDxilOp_uints(op)) ++counters.uints;
+  // COUNT-DXIL-OPS:END
+}
+
+// Increments every field of `counters` whose CountLlvmOp_* predicate accepts
+// the LLVM instruction opcode number `op` (atomic, fence, floats, ints, uints).
+// Opcodes accepted by no predicate leave `counters` untouched.
+void CountLlvmOp(unsigned op, DxilCounters &counters) {
+  // <py::lines('COUNT-LLVM-OPS')>['if (CountLlvmOp_%s(op)) ++counters.%s;' % (c,c) for c in hctdb_instrhelp.get_llvm_op_counters()]</py>
+  // COUNT-LLVM-OPS:BEGIN
+  if (CountLlvmOp_atomic(op)) ++counters.atomic;
+  if (CountLlvmOp_fence(op)) ++counters.fence;
+  if (CountLlvmOp_floats(op)) ++counters.floats;
+  if (CountLlvmOp_ints(op)) ++counters.ints;
+  if (CountLlvmOp_uints(op)) ++counters.uints;
+  // COUNT-LLVM-OPS:END
+}
+
+} // namespace
+
+// Adds the statistics of module M to `counters` (values are accumulated, never
+// reset here):
+//   - allocation size of array-typed globals, split into static and TGSM;
+//   - for every instruction in every defined function: the instruction total,
+//     array alloca sizes, DXIL op categories, array load/store counts by memory
+//     kind, conditional branches, and LLVM opcode categories for the rest.
+void CountInstructions(llvm::Module &M, DxilCounters& counters) {
+  const DataLayout &DL = M.getDataLayout();
+  PointerInfoMap ptrInfoMap;
+
+  // Pass 1: bytes occupied by global arrays.
+  for (auto &Global : M.globals()) {
+    const PointerInfo GlobalPI = GetPointerInfo(&Global, ptrInfoMap);
+    if (!GlobalPI.isArray)
+      continue;
+    const uint32_t NumBytes =
+        DL.getTypeAllocSize(Global.getType()->getPointerElementType());
+    if (GlobalPI.memType == PointerInfo::MemType::Global_Static)
+      counters.array_static_bytes += NumBytes;
+    else if (GlobalPI.memType == PointerInfo::MemType::Global_TGSM)
+      counters.array_tgsm_bytes += NumBytes;
+  }
+
+  // Pass 2: per-instruction tallies. Each instruction is handled by exactly
+  // one of the cases below.
+  for (auto &F : M.functions()) {
+    if (F.isDeclaration())
+      continue;
+    for (auto &BB : F) {
+      for (auto &Inst : BB) {
+        Instruction *I = &Inst;
+        ++counters.insts;
+
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+          // Bytes used by alloca'd arrays.
+          Type *AllocTy = AI->getType()->getPointerElementType();
+          if (AllocTy->isArrayTy())
+            counters.array_local_bytes += DL.getTypeAllocSize(AllocTy);
+          continue;
+        }
+
+        if (CallInst *CI = dyn_cast<CallInst>(I)) {
+          // Only DXIL op calls are categorized; operand 0 carries the opcode.
+          if (hlsl::OP::IsDxilOpFuncCallInst(CI)) {
+            const unsigned DxilOpcode = static_cast<unsigned>(
+                llvm::cast<llvm::ConstantInt>(I->getOperand(0))->getZExtValue());
+            CountDxilOp(DxilOpcode, counters);
+          }
+          continue;
+        }
+
+        if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+          // Loads/stores that reach into an array, split by memory kind.
+          Value *Ptr = isa<LoadInst>(I)
+                           ? cast<LoadInst>(I)->getPointerOperand()
+                           : cast<StoreInst>(I)->getPointerOperand();
+          const PointerInfo AccessPI = GetPointerInfo(Ptr, ptrInfoMap);
+          if (AccessPI.isArray) {
+            if (AccessPI.memType == PointerInfo::MemType::Alloca)
+              ++counters.array_local_ldst;
+            else if (AccessPI.memType == PointerInfo::MemType::Global_Static)
+              ++counters.array_static_ldst;
+            else if (AccessPI.memType == PointerInfo::MemType::Global_TGSM)
+              ++counters.array_tgsm_ldst;
+          }
+          continue;
+        }
+
+        if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+          // TODO: More sophisticated analysis to separate dynamic from static branching?
+          if (BI->getNumSuccessors() > 1)
+            ++counters.branches;
+          continue;
+        }
+
+        // Everything else is categorized by its LLVM opcode.
+        CountLlvmOp(I->getOpcode(), counters);
+      }
+    }
+  }
+}
+
+// One row of the name -> field lookup table: a counter name and the
+// pointer-to-member selecting the matching uint32_t field of DxilCounters.
+struct CounterOffsetByName {
+  StringRef name;
+  uint32_t DxilCounters::*ptr;
+};
+
+// Must be sorted case-sensitive:
+// (LookupByName() searches this table with std::lower_bound, which requires
+// the rows to be ordered by name.)
+static const CounterOffsetByName CountersByName[] = {
+  // <py::lines('COUNTER-MEMBER-PTRS')>['{ "%s", &DxilCounters::%s },' % (c,c) for c in hctdb_instrhelp.get_counters()]</py>
+  // COUNTER-MEMBER-PTRS:BEGIN
+  { "array_local_bytes", &DxilCounters::array_local_bytes },
+  { "array_local_ldst", &DxilCounters::array_local_ldst },
+  { "array_static_bytes", &DxilCounters::array_static_bytes },
+  { "array_static_ldst", &DxilCounters::array_static_ldst },
+  { "array_tgsm_bytes", &DxilCounters::array_tgsm_bytes },
+  { "array_tgsm_ldst", &DxilCounters::array_tgsm_ldst },
+  { "atomic", &DxilCounters::atomic },
+  { "barrier", &DxilCounters::barrier },
+  { "branches", &DxilCounters::branches },
+  { "fence", &DxilCounters::fence },
+  { "floats", &DxilCounters::floats },
+  { "gs_cut", &DxilCounters::gs_cut },
+  { "gs_emit", &DxilCounters::gs_emit },
+  { "insts", &DxilCounters::insts },
+  { "ints", &DxilCounters::ints },
+  { "sig_ld", &DxilCounters::sig_ld },
+  { "sig_st", &DxilCounters::sig_st },
+  { "tex_bias", &DxilCounters::tex_bias },
+  { "tex_cmp", &DxilCounters::tex_cmp },
+  { "tex_grad", &DxilCounters::tex_grad },
+  { "tex_load", &DxilCounters::tex_load },
+  { "tex_norm", &DxilCounters::tex_norm },
+  { "tex_store", &DxilCounters::tex_store },
+  { "uints", &DxilCounters::uints },
+  // COUNTER-MEMBER-PTRS:END
+};
+
+// Finds the counter called `name` inside `counters`.
+// Returns a pointer to that uint32_t field, or nullptr when `name` does not
+// exactly match any entry of CountersByName. Uses a binary search, which
+// relies on CountersByName being sorted by name.
+uint32_t *LookupByName(llvm::StringRef name, DxilCounters& counters) {
+  const CounterOffsetByName *const First = CountersByName;
+  const CounterOffsetByName *const Last = CountersByName + _countof(CountersByName);
+
+  // First entry whose name is not less than the requested one.
+  const CounterOffsetByName *Found = std::lower_bound(
+      First, Last, name,
+      [](const CounterOffsetByName &Entry, llvm::StringRef Key) {
+        return Entry.name < Key;
+      });
+
+  if (Found == Last || Found->name != name)
+    return nullptr;
+  return &(counters.*(Found->ptr));
+}
+
+
+} // namespace hlsl

+ 70 - 0
lib/DXIL/DxilMetadataHelper.cpp

@@ -11,6 +11,7 @@
 #include "dxc/DXIL/DxilMetadataHelper.h"
 #include "dxc/DXIL/DxilShaderModel.h"
 #include "dxc/DXIL/DxilCBuffer.h"
+#include "dxc/DXIL/DxilCounters.h"
 #include "dxc/DXIL/DxilResource.h"
 #include "dxc/DXIL/DxilSampler.h"
 #include "dxc/DXIL/DxilSignatureElement.h"
@@ -67,6 +68,9 @@ const char DxilMDHelper::kDxilSourceDefinesMDName[]                   = "dx.sour
 const char DxilMDHelper::kDxilSourceMainFileNameMDName[]              = "dx.source.mainFileName";
 const char DxilMDHelper::kDxilSourceArgsMDName[]                      = "dx.source.args";
 
+// This is reflection-only metadata
+const char DxilMDHelper::kDxilCountersMDName[]                        = "dx.counters";
+
 static std::array<const char *, 7> DxilMDNames = { {
   DxilMDHelper::kDxilVersionMDName,
   DxilMDHelper::kDxilShaderModelMDName,
@@ -2059,6 +2063,72 @@ void DxilMDHelper::LoadDxilASState(const MDOperand &MDO, unsigned *NumThreads, u
   payloadSizeInBytes = ConstMDToUint32(pTupleMD->getOperand(kDxilASStatePayloadSizeInBytes));
 }
 
+// Appends the pair (MDString name, i32 value) to MDVals, unless value is zero,
+// in which case nothing is appended.
+void DxilMDHelper::AddCounterIfNonZero(uint32_t value, StringRef name, vector<Metadata*> &MDVals) {
+  if (value == 0)
+    return;
+  MDVals.push_back(MDString::get(m_Ctx, name));
+  MDVals.push_back(Uint32ToConstMD(value));
+}
+
+// Replaces the module's "dx.counters" named metadata with the non-zero values
+// of `counters`, stored as one node of alternating name / i32 operands.
+// When every counter is zero the named metadata is removed and not re-created.
+void DxilMDHelper::EmitDxilCounters(const DxilCounters &counters) {
+  // Remove whatever an earlier emit left behind.
+  if (NamedMDNode *pStaleMD = m_pModule->getNamedMetadata(kDxilCountersMDName))
+    m_pModule->eraseNamedMetadata(pStaleMD);
+
+  vector<Metadata*> MDVals;
+  // <py::lines('OPCODE-COUNTERS')>['AddCounterIfNonZero(counters.%s, "%s", MDVals);' % (c,c) for c in hctdb_instrhelp.get_counters()]</py>
+  // OPCODE-COUNTERS:BEGIN
+  AddCounterIfNonZero(counters.array_local_bytes, "array_local_bytes", MDVals);
+  AddCounterIfNonZero(counters.array_local_ldst, "array_local_ldst", MDVals);
+  AddCounterIfNonZero(counters.array_static_bytes, "array_static_bytes", MDVals);
+  AddCounterIfNonZero(counters.array_static_ldst, "array_static_ldst", MDVals);
+  AddCounterIfNonZero(counters.array_tgsm_bytes, "array_tgsm_bytes", MDVals);
+  AddCounterIfNonZero(counters.array_tgsm_ldst, "array_tgsm_ldst", MDVals);
+  AddCounterIfNonZero(counters.atomic, "atomic", MDVals);
+  AddCounterIfNonZero(counters.barrier, "barrier", MDVals);
+  AddCounterIfNonZero(counters.branches, "branches", MDVals);
+  AddCounterIfNonZero(counters.fence, "fence", MDVals);
+  AddCounterIfNonZero(counters.floats, "floats", MDVals);
+  AddCounterIfNonZero(counters.gs_cut, "gs_cut", MDVals);
+  AddCounterIfNonZero(counters.gs_emit, "gs_emit", MDVals);
+  AddCounterIfNonZero(counters.insts, "insts", MDVals);
+  AddCounterIfNonZero(counters.ints, "ints", MDVals);
+  AddCounterIfNonZero(counters.sig_ld, "sig_ld", MDVals);
+  AddCounterIfNonZero(counters.sig_st, "sig_st", MDVals);
+  AddCounterIfNonZero(counters.tex_bias, "tex_bias", MDVals);
+  AddCounterIfNonZero(counters.tex_cmp, "tex_cmp", MDVals);
+  AddCounterIfNonZero(counters.tex_grad, "tex_grad", MDVals);
+  AddCounterIfNonZero(counters.tex_load, "tex_load", MDVals);
+  AddCounterIfNonZero(counters.tex_norm, "tex_norm", MDVals);
+  AddCounterIfNonZero(counters.tex_store, "tex_store", MDVals);
+  AddCounterIfNonZero(counters.uints, "uints", MDVals);
+  // OPCODE-COUNTERS:END
+
+  // Nothing non-zero to record: leave the module without the named node.
+  if (MDVals.empty())
+    return;
+
+  NamedMDNode *pCountersMD = m_pModule->getOrInsertNamedMetadata(kDxilCountersMDName);
+  pCountersMD->addOperand(MDNode::get(m_Ctx, MDVals));
+}
+
+// Decodes one (name, value) metadata pair and stores the value in the field
+// of `counters` with that name. Names that LookupByName() does not recognize
+// are ignored.
+void DxilMDHelper::LoadCounterMD(const MDOperand &MDName, const MDOperand &MDValue, DxilCounters &counters) const {
+  const StringRef counterName = StringMDToStringRef(MDName);
+  const uint32_t counterValue = ConstMDToUint32(MDValue);
+  if (uint32_t *pSlot = LookupByName(counterName, counters))
+    *pSlot = counterValue;
+}
+
+// Fills `counters` from the module's "dx.counters" named metadata.
+// All fields are zeroed first; counters not present in the metadata (or the
+// metadata being absent altogether) therefore read back as zero.
+void DxilMDHelper::LoadDxilCounters(DxilCounters &counters) const {
+  ZeroMemory(&counters, sizeof(counters));
+
+  NamedMDNode *pDxilCountersMD = m_pModule->getNamedMetadata(kDxilCountersMDName);
+  // A named node without operands carries no counter list; asking for
+  // operand 0 would be out of range.
+  if (!pDxilCountersMD || pDxilCountersMD->getNumOperands() == 0)
+    return;
+
+  MDNode *pMDCounters = pDxilCountersMD->getOperand(0);
+  if (!pMDCounters)
+    return;
+
+  // Operands come in (name, value) pairs. Stop before a trailing unpaired
+  // operand so getOperand(i+1) never reads past the end.
+  const unsigned numOperands = pMDCounters->getNumOperands();
+  for (unsigned i = 0; i + 1 < numOperands; i += 2) {
+    LoadCounterMD(pMDCounters->getOperand(i), pMDCounters->getOperand(i+1), counters);
+  }
+}
+
+
 //
 // DxilExtraPropertyHelper methods.
 //

+ 12 - 0
lib/DXIL/DxilModule.cpp

@@ -18,6 +18,7 @@
 #include "dxc/DXIL/DxilEntryProps.h"
 #include "dxc/DXIL/DxilSubobject.h"
 #include "dxc/DXIL/DxilInstructions.h"
+#include "dxc/DXIL/DxilCounters.h"
 
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -1304,6 +1305,7 @@ void DxilModule::ClearDxilMetadata(Module &M) {
       name == DxilMDHelper::kDxilTypeSystemMDName ||
       name == DxilMDHelper::kDxilViewIdStateMDName ||
       name == DxilMDHelper::kDxilSubobjectsMDName ||
+      name == DxilMDHelper::kDxilCountersMDName ||
       name.startswith(DxilMDHelper::kDxilTypeSystemHelperVariablePrefix)) {
       nodes.push_back(&b);
     }
@@ -1560,6 +1562,16 @@ void DxilModule::ReEmitDxilResources() {
   EmitDxilMetadata();
 }
 
+// Counts the instructions of the wrapped llvm::Module from scratch and writes
+// the result into the module's counter metadata via the metadata helper.
+void DxilModule::EmitDxilCounters() {
+  DxilCounters moduleCounters;  // every field starts at zero
+  hlsl::CountInstructions(*m_pModule, moduleCounters);
+  m_pMDHelper->EmitDxilCounters(moduleCounters);
+}
+// Reads the counter metadata back into `counters`; forwards to
+// DxilMDHelper::LoadDxilCounters.
+void DxilModule::LoadDxilCounters(DxilCounters &counters) const {
+  m_pMDHelper->LoadDxilCounters(counters);
+}
+
+
 template <typename TResource>
 static bool
 StripResourcesReflection(std::vector<std::unique_ptr<TResource>> &vec) {

+ 2 - 0
lib/DxilContainer/DxilContainerAssembler.cpp

@@ -34,6 +34,7 @@
 #include "dxc/Support/dxcapi.impl.h"
 #include "dxc/DxilContainer/DxilPipelineStateValidation.h"
 #include "dxc/DxilContainer/DxilRuntimeReflection.h"
+#include "dxc/DXIL/DxilCounters.h"
 #include <algorithm>
 #include <functional>
 
@@ -1698,6 +1699,7 @@ void hlsl::SerializeDxilContainerForModule(DxilModule *pModule,
     // 0,0 = Not meant to be validated, support latest
     pModule->SetValidatorVersion(0, 0);
     pModule->ReEmitDxilResources();
+    pModule->EmitDxilCounters();
 
     reflectionModule.reset(llvm::CloneModule(pModule->GetModule()));
 

+ 117 - 66
lib/HLSL/DxilContainerReflection.cpp

@@ -29,6 +29,7 @@
 #include "dxc/DXIL/DxilPDB.h"
 #include "dxc/DXIL/DxilUtil.h"
 #include "dxc/HLSL/HLMatrixType.h"
+#include "dxc/DXIL/DxilCounters.h"
 
 #include <unordered_set>
 #include "llvm/ADT/SetVector.h"
@@ -138,12 +139,15 @@ private:
   std::vector<D3D12_SIGNATURE_PARAMETER_DESC>     m_OutputSignature;
   std::vector<D3D12_SIGNATURE_PARAMETER_DESC>     m_PatchConstantSignature;
   std::vector<std::unique_ptr<char[]>>            m_UpperCaseNames;
+  D3D12_SHADER_DESC m_Desc = {};
+
   void SetCBufferUsage();
   void CreateReflectionObjectsForSignature(
       const DxilSignature &Sig,
       std::vector<D3D12_SIGNATURE_PARAMETER_DESC> &Descs);
   LPCSTR CreateUpperCase(LPCSTR pValue);
   void MarkUsedSignatureElements();
+  void InitDesc();
 public:
   PublicAPI m_PublicAPI;
   void SetPublicAPI(PublicAPI value) { m_PublicAPI = value; }
@@ -273,6 +277,13 @@ HRESULT CreateDxilLibraryReflection(const DxilPartHeader *pModulePart, const Dxi
 _Use_decl_annotations_
 HRESULT DxilContainerReflection::Load(IDxcBlob *pContainer) {
 
+  if (pContainer == nullptr) {
+    m_container.Release();
+    m_pHeader = nullptr;
+    m_headerLen = 0;
+    return S_OK;
+  }
+
   CComPtr<IDxcBlob> pPDBContainer;
   {
     DxcThreadMalloc DxcMalloc(m_pMalloc);
@@ -283,13 +294,6 @@ HRESULT DxilContainerReflection::Load(IDxcBlob *pContainer) {
     }
   }
 
-  if (pContainer == nullptr) {
-    m_container.Release();
-    m_pHeader = nullptr;
-    m_headerLen = 0;
-    return S_OK;
-  }
-
   uint32_t bufLen = pContainer->GetBufferSize();
   const DxilContainerHeader *pHeader =
       IsDxilContainerLike(pContainer->GetBufferPointer(), bufLen);
@@ -2085,6 +2089,9 @@ HRESULT DxilShaderReflection::Load(const DxilPartHeader *pModulePart,
     CreateReflectionObjectsForSignature(m_pDxilModule->GetPatchConstOrPrimSignature(), m_PatchConstantSignature);
     if (!m_bUsageInMetadata)
       MarkUsedSignatureElements();
+
+    InitDesc();
+
     return S_OK;
   }
   CATCH_CPP_RETURN_HRESULT();
@@ -2092,65 +2099,8 @@ HRESULT DxilShaderReflection::Load(const DxilPartHeader *pModulePart,
 
 _Use_decl_annotations_
 HRESULT DxilShaderReflection::GetDesc(D3D12_SHADER_DESC *pDesc) {
-  IFR(ZeroMemoryToOut(pDesc));
-  const DxilModule &M = *m_pDxilModule;
-  const ShaderModel *pSM = M.GetShaderModel();
-
-  pDesc->Version = EncodeVersion(pSM->GetKind(), pSM->GetMajor(), pSM->GetMinor());
-
-  // Unset:  LPCSTR                  Creator;                     // Creator string
-  // Unset:  UINT                    Flags;                       // Shader compilation/parse flags
-
-  pDesc->ConstantBuffers = m_CBs.size();
-  pDesc->BoundResources = m_Resources.size();
-  pDesc->InputParameters = m_InputSignature.size();
-  pDesc->OutputParameters = m_OutputSignature.size();
-  pDesc->PatchConstantParameters = m_PatchConstantSignature.size();
-
-  // Unset:  UINT                    InstructionCount;            // Number of emitted instructions
-  // Unset:  UINT                    TempRegisterCount;           // Number of temporary registers used 
-  // Unset:  UINT                    TempArrayCount;              // Number of temporary arrays used
-  // Unset:  UINT                    DefCount;                    // Number of constant defines 
-  // Unset:  UINT                    DclCount;                    // Number of declarations (input + output)
-  // Unset:  UINT                    TextureNormalInstructions;   // Number of non-categorized texture instructions
-  // Unset:  UINT                    TextureLoadInstructions;     // Number of texture load instructions
-  // Unset:  UINT                    TextureCompInstructions;     // Number of texture comparison instructions
-  // Unset:  UINT                    TextureBiasInstructions;     // Number of texture bias instructions
-  // Unset:  UINT                    TextureGradientInstructions; // Number of texture gradient instructions
-  // Unset:  UINT                    FloatInstructionCount;       // Number of floating point arithmetic instructions used
-  // Unset:  UINT                    IntInstructionCount;         // Number of signed integer arithmetic instructions used
-  // Unset:  UINT                    UintInstructionCount;        // Number of unsigned integer arithmetic instructions used
-  // Unset:  UINT                    StaticFlowControlCount;      // Number of static flow control instructions used
-  // Unset:  UINT                    DynamicFlowControlCount;     // Number of dynamic flow control instructions used
-  // Unset:  UINT                    MacroInstructionCount;       // Number of macro instructions used
-  // Unset:  UINT                    ArrayInstructionCount;       // Number of array instructions used
-  // Unset:  UINT                    CutInstructionCount;         // Number of cut instructions used
-  // Unset:  UINT                    EmitInstructionCount;        // Number of emit instructions used
-
-  pDesc->GSOutputTopology = (D3D_PRIMITIVE_TOPOLOGY)M.GetStreamPrimitiveTopology();
-  pDesc->GSMaxOutputVertexCount = M.GetMaxVertexCount();
-
-  if (pSM->IsHS())
-    pDesc->InputPrimitive = (D3D_PRIMITIVE)(D3D_PRIMITIVE_1_CONTROL_POINT_PATCH + M.GetInputControlPointCount() - 1);
-  else
-    pDesc->InputPrimitive = (D3D_PRIMITIVE)M.GetInputPrimitive();
-
-  pDesc->cGSInstanceCount = M.GetGSInstanceCount();
-
-  if (pSM->IsHS())
-    pDesc->cControlPoints = M.GetOutputControlPointCount();
-  else if (pSM->IsDS())
-    pDesc->cControlPoints = M.GetInputControlPointCount();
-
-  pDesc->HSOutputPrimitive = (D3D_TESSELLATOR_OUTPUT_PRIMITIVE)M.GetTessellatorOutputPrimitive();
-  pDesc->HSPartitioning = (D3D_TESSELLATOR_PARTITIONING)M.GetTessellatorPartitioning();
-  pDesc->TessellatorDomain = (D3D_TESSELLATOR_DOMAIN)M.GetTessellatorDomain();
-
-  // instruction counts
-  // Unset:  UINT cBarrierInstructions;                           // Number of barrier instructions in a compute shader
-  // Unset:  UINT cInterlockedInstructions;                       // Number of interlocked instructions
-  // Unset:  UINT cTextureStoreInstructions;                      // Number of texture writes
-
+  if (nullptr == pDesc) return E_POINTER;
+  memcpy(pDesc, &m_Desc, sizeof(D3D12_SHADER_DESC));
   return S_OK;
 }
 
@@ -2240,6 +2190,107 @@ void DxilShaderReflection::MarkUsedSignatureElements() {
   }
 }
 
+void DxilShaderReflection::InitDesc() {  // Fills the cached m_Desc from m_pDxilModule.
+  D3D12_SHADER_DESC *pDesc = &m_Desc;
+
+  const DxilModule &M = *m_pDxilModule;
+  const ShaderModel *pSM = M.GetShaderModel();
+  // Version is encoded from the shader kind and the shader model major/minor numbers.
+  pDesc->Version = EncodeVersion(pSM->GetKind(), pSM->GetMajor(), pSM->GetMinor());
+  // Creator: first operand of the first "llvm.ident" node, when it is an MDString; otherwise Creator is not assigned here.
+  Module *pModule = M.GetModule();
+  if (NamedMDNode *pIdentMD = pModule->getNamedMetadata("llvm.ident")) {
+    if (pIdentMD->getNumOperands()) {
+      if (MDNode *pMDList = pIdentMD->getOperand(0)) {
+        if (pMDList->getNumOperands()) {
+          if (MDString *pMDString = dyn_cast_or_null<MDString>(pMDList->getOperand(0))) {
+            pDesc->Creator = pMDString->getString().data();  // NOTE(review): points into the MDString's storage; presumably valid only while the module is alive - confirm
+          }
+        }
+      }
+    }
+  }
+
+  // Unset:  UINT                    Flags;                       // Shader compilation/parse flags
+  // Counts below are the sizes of the corresponding reflection member containers.
+  pDesc->ConstantBuffers = m_CBs.size();
+  pDesc->BoundResources = m_Resources.size();
+  pDesc->InputParameters = m_InputSignature.size();
+  pDesc->OutputParameters = m_OutputSignature.size();
+  pDesc->PatchConstantParameters = m_PatchConstantSignature.size();
+  // GS output fields are assigned from the module regardless of shader stage (no stage check here).
+  pDesc->GSOutputTopology = (D3D_PRIMITIVE_TOPOLOGY)M.GetStreamPrimitiveTopology();
+  pDesc->GSMaxOutputVertexCount = M.GetMaxVertexCount();
+  // HS: N-control-point patch, computed as an offset from the 1-control-point patch enum value; other stages: the module's input primitive.
+  if (pSM->IsHS())
+    pDesc->InputPrimitive = (D3D_PRIMITIVE)(D3D_PRIMITIVE_1_CONTROL_POINT_PATCH + M.GetInputControlPointCount() - 1);
+  else
+    pDesc->InputPrimitive = (D3D_PRIMITIVE)M.GetInputPrimitive();
+
+  pDesc->cGSInstanceCount = M.GetGSInstanceCount();
+  // cControlPoints: output control point count for HS, input control point count for DS; not assigned here for other stages.
+  if (pSM->IsHS())
+    pDesc->cControlPoints = M.GetOutputControlPointCount();
+  else if (pSM->IsDS())
+    pDesc->cControlPoints = M.GetInputControlPointCount();
+  // Tessellator settings are copied from the module for every stage (no stage check here).
+  pDesc->HSOutputPrimitive = (D3D_TESSELLATOR_OUTPUT_PRIMITIVE)M.GetTessellatorOutputPrimitive();
+  pDesc->HSPartitioning = (D3D_TESSELLATOR_PARTITIONING)M.GetTessellatorPartitioning();
+  pDesc->TessellatorDomain = (D3D_TESSELLATOR_DOMAIN)M.GetTessellatorDomain();
+
+  // Instruction counts only roughly track some fxc counters
+  DxilCounters counters = {};  // every counter field starts at 0
+  m_pDxilModule->LoadDxilCounters(counters);  // NOTE(review): presumably fills `counters` from the module - confirm in DxilModule
+
+  // UINT InstructionCount;               // Num llvm instructions in all functions
+  // UINT TempArrayCount;                 // Number of bytes used in arrays (alloca + static global)
+  // UINT DynamicFlowControlCount;        // Number of branches with more than one successor for now
+  // UINT ArrayInstructionCount;          // Number of load/store on arrays for now
+  pDesc->InstructionCount = counters.insts;
+  pDesc->TempArrayCount = counters.AllArrayBytes();
+  pDesc->DynamicFlowControlCount = counters.branches;
+  pDesc->ArrayInstructionCount = counters.AllArrayAccesses();
+
+  // UINT FloatInstructionCount;          // Number of floating point arithmetic instructions used
+  // UINT IntInstructionCount;            // Number of signed integer arithmetic instructions used
+  // UINT UintInstructionCount;           // Number of unsigned integer arithmetic instructions used
+  pDesc->FloatInstructionCount = counters.floats;
+  pDesc->IntInstructionCount = counters.ints;
+  pDesc->UintInstructionCount = counters.uints;
+
+  // UINT TextureNormalInstructions;      // Number of non-categorized texture instructions
+  // UINT TextureLoadInstructions;        // Number of texture load instructions
+  // UINT TextureCompInstructions;        // Number of texture comparison instructions
+  // UINT TextureBiasInstructions;        // Number of texture bias instructions
+  // UINT TextureGradientInstructions;    // Number of texture gradient instructions
+  pDesc->TextureNormalInstructions = counters.tex_norm;
+  pDesc->TextureLoadInstructions = counters.tex_load;
+  pDesc->TextureCompInstructions = counters.tex_cmp;
+  pDesc->TextureBiasInstructions = counters.tex_bias;
+  pDesc->TextureGradientInstructions = counters.tex_grad;
+
+  // UINT CutInstructionCount;            // Number of cut instructions used
+  // UINT EmitInstructionCount;           // Number of emit instructions used
+  pDesc->CutInstructionCount = counters.gs_cut;
+  pDesc->EmitInstructionCount = counters.gs_emit;
+
+  // UINT cBarrierInstructions;           // Number of barrier instructions in a compute shader
+  // UINT cInterlockedInstructions;       // Number of interlocked instructions
+  // UINT cTextureStoreInstructions;      // Number of texture writes
+  pDesc->cBarrierInstructions = counters.barrier;
+  pDesc->cInterlockedInstructions = counters.atomic;
+  pDesc->cTextureStoreInstructions = counters.tex_store;
+
+  // The remaining D3D12_SHADER_DESC fields listed below are intentionally not assigned by this function.
+  // Unset:  UINT TempRegisterCount;      // Don't know how to map this for SSA (not going to do reg allocation here)
+  // Unset:  UINT DefCount;               // Not sure what to map this to
+  // Unset:  UINT DclCount;               // Number of declarations (input + output)
+  // TODO: map to used input + output signature rows?
+  // Unset:  UINT StaticFlowControlCount; // Number of static flow control instructions used
+  // This used to map to flow control using special int/bool constant registers in DX9.
+  // Unset:  UINT MacroInstructionCount;  // Number of macro instructions used
+  // Macro instructions are a <= DX9 concept.
+}
+
 _Use_decl_annotations_
 ID3D12ShaderReflectionConstantBuffer* DxilShaderReflection::GetConstantBufferByIndex(UINT Index) {
   return DxilModuleReflection::_GetConstantBufferByIndex(Index);

+ 17 - 0
tools/clang/test/HLSLFileCheck/d3dreflect/tbuffer-16bit.hlsl

@@ -42,6 +42,23 @@ float main(int i : A) : SV_TARGET
 // CHECK-NEXT:     BoundResources: 1
 // CHECK-NEXT:     InputParameters: 1
 // CHECK-NEXT:     OutputParameters: 1
+// CHECK-NEXT:     InstructionCount: 49
+// CHECK-NEXT:     TempArrayCount: 0
+// CHECK-NEXT:     DynamicFlowControlCount: 0
+// CHECK-NEXT:     ArrayInstructionCount: 0
+// CHECK-NEXT:     TextureNormalInstructions: 0
+// CHECK-NEXT:     TextureLoadInstructions: 6
+// CHECK-NEXT:     TextureCompInstructions: 0
+// CHECK-NEXT:     TextureBiasInstructions: 0
+// CHECK-NEXT:     TextureGradientInstructions: 0
+// CHECK-NEXT:     FloatInstructionCount: 12
+// CHECK-NEXT:     IntInstructionCount: 6
+// CHECK-NEXT:     UintInstructionCount: 7
+// CHECK-NEXT:     CutInstructionCount: 0
+// CHECK-NEXT:     EmitInstructionCount: 0
+// CHECK-NEXT:     cBarrierInstructions: 0
+// CHECK-NEXT:     cInterlockedInstructions: 0
+// CHECK-NEXT:     cTextureStoreInstructions: 0
 // CHECK-NEXT:   Constant Buffers:
 // CHECK-NEXT:     ID3D12ShaderReflectionConstantBuffer:
 // CHECK-NEXT:       D3D12_SHADER_BUFFER_DESC: Name: tb

+ 17 - 0
tools/clang/test/HLSLFileCheck/d3dreflect/tbuffer.hlsl

@@ -54,6 +54,23 @@ float main(int i : A) : SV_TARGET
 // CHECK-NEXT:     BoundResources: 1
 // CHECK-NEXT:     InputParameters: 1
 // CHECK-NEXT:     OutputParameters: 1
+// CHECK-NEXT:     InstructionCount: 26
+// CHECK-NEXT:     TempArrayCount: 0
+// CHECK-NEXT:     DynamicFlowControlCount: 0
+// CHECK-NEXT:     ArrayInstructionCount: 0
+// CHECK-NEXT:     TextureNormalInstructions: 0
+// CHECK-NEXT:     TextureLoadInstructions: 5
+// CHECK-NEXT:     TextureCompInstructions: 0
+// CHECK-NEXT:     TextureBiasInstructions: 0
+// CHECK-NEXT:     TextureGradientInstructions: 0
+// CHECK-NEXT:     FloatInstructionCount: 8
+// CHECK-NEXT:     IntInstructionCount: 2
+// CHECK-NEXT:     UintInstructionCount: 0
+// CHECK-NEXT:     CutInstructionCount: 0
+// CHECK-NEXT:     EmitInstructionCount: 0
+// CHECK-NEXT:     cBarrierInstructions: 0
+// CHECK-NEXT:     cInterlockedInstructions: 0
+// CHECK-NEXT:     cTextureStoreInstructions: 0
 // CHECK-NEXT:   Constant Buffers:
 // CHECK-NEXT:     ID3D12ShaderReflectionConstantBuffer:
 // CHECK-NEXT:       D3D12_SHADER_BUFFER_DESC: Name: tb

+ 2 - 3
tools/clang/test/HLSLFileCheck/hlsl/objects/FeedbackTexture/feedback-reflect.hlsl

@@ -33,13 +33,12 @@ float main() : SV_Target
 // CHECK: ID3D12ShaderReflection:
 // CHECK-NEXT:   D3D12_SHADER_BUFFER_DESC:
 // CHECK-NEXT:     Shader Version: Pixel 6.5
-// CHECK-NEXT:     Creator: <nullptr>
-// CHECK-NEXT:     Flags: 0
+// CHECK:     Flags: 0
 // CHECK-NEXT:     ConstantBuffers: 0
 // CHECK-NEXT:     BoundResources: 7
 // CHECK-NEXT:     InputParameters: 0
 // CHECK-NEXT:     OutputParameters: 1
-// CHECK-NEXT:   Bound Resources:
+// CHECK:   Bound Resources:
 // CHECK-NEXT:     D3D12_SHADER_BUFFER_DESC: Name: samp
 // CHECK-NEXT:       Type: D3D_SIT_SAMPLER
 // CHECK-NEXT:       uID: 0

+ 15 - 0
tools/clang/test/HLSLFileCheck/samples/MiniEngine/BlurCS.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -T cs_6_0 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK: groupId
 // CHECK: threadIdInGroup
@@ -137,3 +138,17 @@ void main( uint3 Gid : SV_GroupID, uint3 GTid : SV_GroupThreadID, uint3 DTid : S
 	//
 	BlurVertically(DTid.xy, (GTid.y << 3) + GTid.x);
 }
+
+// Note: TGSM is counted as part of temp array for now.
+// REFL: TempArrayCount: 1536
+// REFL: DynamicFlowControlCount: 0
+// REFL: ArrayInstructionCount: 54
+// REFL: TextureLoadInstructions: 4
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: CutInstructionCount: 0
+// REFL: EmitInstructionCount: 0
+// REFL: cBarrierInstructions: 2
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 1

+ 19 - 0
tools/clang/test/HLSLFileCheck/samples/MiniEngine/MagnifyPixelsPS.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -T ps_6_0 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK: sampleLevel
 
@@ -32,3 +33,21 @@ float3 main( float4 position : SV_Position, float2 uv : TexCoord0 ) : SV_Target0
 	float2 ScaledUV = ScaleFactor * (uv - 0.5) + 0.5;
 	return ColorTex.SampleLevel(PointSampler, ScaledUV, 0);
 }
+
+// REFL: InstructionCount: 21
+// REFL: TempArrayCount: 0
+// REFL: DynamicFlowControlCount: 0
+// REFL: ArrayInstructionCount: 0
+// REFL: TextureNormalInstructions: 1
+// REFL: TextureLoadInstructions: 0
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: FloatInstructionCount: 6
+// REFL: IntInstructionCount: 0
+// REFL: UintInstructionCount: 0
+// REFL: CutInstructionCount: 0
+// REFL: EmitInstructionCount: 0
+// REFL: cBarrierInstructions: 0
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 0

+ 17 - 0
tools/clang/test/HLSLFileCheck/samples/SubD11_SmoothPS.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -T ps_6_0 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK: sample
 // CHECK: dot3
@@ -227,3 +228,19 @@ float4 main( PS_INPUT Input ) : SV_TARGET
     
     return float4( fLightColor, 1 );
 }
+
+// REFL: TempArrayCount: 0
+// REFL: DynamicFlowControlCount: 4
+// REFL: ArrayInstructionCount: 0
+// REFL: TextureNormalInstructions: 3
+// REFL: TextureLoadInstructions: 0
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: IntInstructionCount: 2
+// REFL: UintInstructionCount: 0
+// REFL: CutInstructionCount: 0
+// REFL: EmitInstructionCount: 0
+// REFL: cBarrierInstructions: 0
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 0

+ 19 - 0
tools/clang/test/HLSLFileCheck/shader_targets/geometry/multiStreamGS.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T gs_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -T gs_6_0 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK:; Output signature:
 // CHECK:;
@@ -68,3 +69,21 @@ void main(point float4 array[1] : COORD, inout PointStream<MyStruct> OutputStrea
   OutputStream2.Append(a);
   OutputStream2.RestartStrip();
 }
+
+// REFL: InstructionCount: 84
+// REFL: TempArrayCount: 0
+// REFL: DynamicFlowControlCount: 1
+// REFL: ArrayInstructionCount: 0
+// REFL: TextureNormalInstructions: 0
+// REFL: TextureLoadInstructions: 0
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: FloatInstructionCount: 11
+// REFL: IntInstructionCount: 1
+// REFL: UintInstructionCount: 0
+// REFL: CutInstructionCount: 4
+// REFL: EmitInstructionCount: 4
+// REFL: cBarrierInstructions: 0
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 0

+ 16 - 0
tools/clang/test/HLSLFileCheck/shader_targets/hull/FloatMaxtessfactorHs.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T hs_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -T hs_6_0 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK: float 3.000000e+00}
 
@@ -68,3 +69,18 @@ HSFoo main( InputPatch<HSFoo_Input, 32> p,
     output.d = p[i].qq + r;
     return output;
 }
+
+// REFL: TempArrayCount: 16
+// REFL: DynamicFlowControlCount: 1
+// REFL: ArrayInstructionCount: 5
+// REFL: TextureNormalInstructions: 0
+// REFL: TextureLoadInstructions: 0
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: UintInstructionCount: 0
+// REFL: CutInstructionCount: 0
+// REFL: EmitInstructionCount: 0
+// REFL: cBarrierInstructions: 0
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 0

+ 18 - 0
tools/clang/test/HLSLFileCheck/shader_targets/mesh/amplification.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T as_6_5 %s | FileCheck %s
+// RUN: %dxc -E main -T as_6_5 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK: dx.op.dispatchMesh.struct.Payload
 
@@ -20,3 +21,20 @@ void main()
     pld.color[1] = 8.0;
     DispatchMesh(NUM_THREADS, 1, 1, pld);
 }
+
+// REFL: TempArrayCount: 0
+// REFL: DynamicFlowControlCount: 0
+// REFL: ArrayInstructionCount: 0
+// REFL: TextureNormalInstructions: 0
+// REFL: TextureLoadInstructions: 0
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: FloatInstructionCount: 0
+// REFL: IntInstructionCount: 0
+// REFL: UintInstructionCount: 0
+// REFL: CutInstructionCount: 0
+// REFL: EmitInstructionCount: 0
+// REFL: cBarrierInstructions: 0
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 0

+ 19 - 1
tools/clang/test/HLSLFileCheck/shader_targets/mesh/mesh.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -E main -T ms_6_5 %s | FileCheck %s
+// RUN: %dxc -E main -T ms_6_5 %s | %D3DReflect %s | FileCheck -check-prefix=REFL %s
 
 // CHECK: dx.op.getMeshPayload.struct.MeshPayload(i32 170)
 // CHECK: dx.op.setMeshOutputCounts(i32 168, i32 32, i32 16)
@@ -78,4 +79,21 @@ void main(
       prims[tig / 3] = op;
     }
     verts[tig] = ov;
-}
+}
+
+// REFL: TempArrayCount: 64
+// REFL: DynamicFlowControlCount: 1
+// REFL: ArrayInstructionCount: 2
+// REFL: TextureNormalInstructions: 0
+// REFL: TextureLoadInstructions: 0
+// REFL: TextureCompInstructions: 0
+// REFL: TextureBiasInstructions: 0
+// REFL: TextureGradientInstructions: 0
+// REFL: FloatInstructionCount: 0
+// REFL: IntInstructionCount: 5
+// REFL: UintInstructionCount: 3
+// REFL: CutInstructionCount: 0
+// REFL: EmitInstructionCount: 0
+// REFL: cBarrierInstructions: 0
+// REFL: cInterlockedInstructions: 0
+// REFL: cTextureStoreInstructions: 0

+ 19 - 1
tools/clang/unittests/HLSLTestLib/D3DReflectionDumper.cpp

@@ -452,7 +452,25 @@ void D3DReflectionDumper::Dump(D3D12_SHADER_DESC &Desc) {
     WriteLn("cControlPoints: ", Desc.cControlPoints);
     DumpEnum("TessellatorDomain", Desc.TessellatorDomain);
   }
-  // TODO
+  // Instruction Counts
+  WriteLn("InstructionCount: ", Desc.InstructionCount);
+  WriteLn("TempArrayCount: ", Desc.TempArrayCount);
+  WriteLn("DynamicFlowControlCount: ", Desc.DynamicFlowControlCount);
+  WriteLn("ArrayInstructionCount: ", Desc.ArrayInstructionCount);
+  WriteLn("TextureNormalInstructions: ", Desc.TextureNormalInstructions);
+  WriteLn("TextureLoadInstructions: ", Desc.TextureLoadInstructions);
+  WriteLn("TextureCompInstructions: ", Desc.TextureCompInstructions);
+  WriteLn("TextureBiasInstructions: ", Desc.TextureBiasInstructions);
+  WriteLn("TextureGradientInstructions: ", Desc.TextureGradientInstructions);
+  WriteLn("FloatInstructionCount: ", Desc.FloatInstructionCount);
+  WriteLn("IntInstructionCount: ", Desc.IntInstructionCount);
+  WriteLn("UintInstructionCount: ", Desc.UintInstructionCount);
+  WriteLn("CutInstructionCount: ", Desc.CutInstructionCount);
+  WriteLn("EmitInstructionCount: ", Desc.EmitInstructionCount);
+  WriteLn("cBarrierInstructions: ", Desc.cBarrierInstructions);
+  WriteLn("cInterlockedInstructions: ", Desc.cInterlockedInstructions);
+  WriteLn("cTextureStoreInstructions: ", Desc.cTextureStoreInstructions);
+
   Dedent();
 }
 void D3DReflectionDumper::Dump(D3D12_FUNCTION_DESC &Desc) {

+ 189 - 89
utils/hct/hctdb.py

@@ -23,6 +23,20 @@ all_stages = (
     'amplification',
     )
 
+# These counters aren't collected directly from instructions,
+# so they need to be added manually so they can be accessed
+# with custom code in DxilCounters.cpp.
+extra_counters = [
+    'insts',
+    'branches',
+    'array_tgsm_bytes',
+    'array_static_bytes',
+    'array_local_bytes',
+    'array_tgsm_ldst',
+    'array_static_ldst',
+    'array_local_ldst',
+    ]
+
 class db_dxil_enum_value(object):
     "A representation for a value in an enumeration type"
     def __init__(self, name, value, doc):
@@ -77,6 +91,7 @@ class db_dxil_inst(object):
         self.is_dxil_op = self.dxil_op != "" # whether this is a DXIL operation
         self.is_reserved = self.dxil_class == "Reserved"
         self.shader_model_translated = () # minimum shader model required with translation by linker
+        self.props = {}                 # extra properties
 
     def __str__(self):
         return self.name
@@ -159,6 +174,9 @@ class db_dxil(object):
         self.name_idx = {}      # DXIL instructions by name
         self.enum_idx = {}      # enumerations by name
         self.dxil_version_info = {}
+        # list of counters for instructions and dxil ops,
+        # starting with extra ones specified here
+        self.counters = extra_counters
 
         self.populate_llvm_instructions()
         self.call_instr = self.get_instr_by_llvm_name("CallInst")
@@ -173,6 +191,7 @@ class db_dxil(object):
         self.build_valrules()
         self.build_semantics()
         self.build_indices()
+        self.populate_counters()
 
     def __str__(self):
         return '\n'.join(str(i) for i in self.instr)
@@ -451,50 +470,50 @@ class db_dxil(object):
         self.add_llvm_instr("TERM", 6, "Resume", "ResumeInst", "resumes the propagation of an exception", "", [])
         self.add_llvm_instr("TERM", 7, "Unreachable", "UnreachableInst", "is unreachable", "", [])
 
-        self.add_llvm_instr("BINARY",  8, "Add"  , "BinaryOperator", "returns the sum of its two operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY",  9, "FAdd" , "BinaryOperator", "returns the sum of its two operands", oload_float_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 10, "Sub"  , "BinaryOperator", "returns the difference of its two operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 11, "FSub" , "BinaryOperator", "returns the difference of its two operands", oload_float_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 12, "Mul"  , "BinaryOperator", "returns the product of its two operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 13, "FMul" , "BinaryOperator", "returns the product of its two operands", oload_float_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 14, "UDiv" , "BinaryOperator", "returns the quotient of its two unsigned operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 15, "SDiv" , "BinaryOperator", "returns the quotient of its two signed operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 16, "FDiv" , "BinaryOperator", "returns the quotient of its two operands", oload_float_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 17, "URem" , "BinaryOperator", "returns the remainder from the unsigned division of its two operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 18, "SRem" , "BinaryOperator", "returns the remainder from the signed division of its two operands", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 19, "FRem" , "BinaryOperator", "returns the remainder from the division of its two operands", oload_float_arith, oload_binary_params)
-
-        self.add_llvm_instr("BINARY", 20, "Shl", "BinaryOperator", "shifts left (logical)", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 21, "LShr", "BinaryOperator", "shifts right (logical), with zero bit fill", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 22, "AShr", "BinaryOperator", "shifts right (arithmetic), with 'a' operand sign bit fill", oload_int_arith, oload_binary_params)
-        self.add_llvm_instr("BINARY", 23, "And", "BinaryOperator", "returns a  bitwise logical and of its two operands", oload_int_arith_b, oload_binary_params)
-        self.add_llvm_instr("BINARY", 24, "Or", "BinaryOperator", "returns a bitwise logical or of its two operands", oload_int_arith_b, oload_binary_params)
-        self.add_llvm_instr("BINARY", 25, "Xor", "BinaryOperator", "returns a bitwise logical xor of its two operands", oload_int_arith_b, oload_binary_params)
+        self.add_llvm_instr("BINARY",  8, "Add"  , "BinaryOperator", "returns the sum of its two operands", oload_int_arith, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("BINARY",  9, "FAdd" , "BinaryOperator", "returns the sum of its two operands", oload_float_arith, oload_binary_params, counters=('floats',))
+        self.add_llvm_instr("BINARY", 10, "Sub"  , "BinaryOperator", "returns the difference of its two operands", oload_int_arith, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("BINARY", 11, "FSub" , "BinaryOperator", "returns the difference of its two operands", oload_float_arith, oload_binary_params, counters=('floats',))
+        self.add_llvm_instr("BINARY", 12, "Mul"  , "BinaryOperator", "returns the product of its two operands", oload_int_arith, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("BINARY", 13, "FMul" , "BinaryOperator", "returns the product of its two operands", oload_float_arith, oload_binary_params, counters=('floats',))
+        self.add_llvm_instr("BINARY", 14, "UDiv" , "BinaryOperator", "returns the quotient of its two unsigned operands", oload_int_arith, oload_binary_params, counters=('uints',))
+        self.add_llvm_instr("BINARY", 15, "SDiv" , "BinaryOperator", "returns the quotient of its two signed operands", oload_int_arith, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("BINARY", 16, "FDiv" , "BinaryOperator", "returns the quotient of its two operands", oload_float_arith, oload_binary_params, counters=('floats',))
+        self.add_llvm_instr("BINARY", 17, "URem" , "BinaryOperator", "returns the remainder from the unsigned division of its two operands", oload_int_arith, oload_binary_params, counters=('uints',))
+        self.add_llvm_instr("BINARY", 18, "SRem" , "BinaryOperator", "returns the remainder from the signed division of its two operands", oload_int_arith, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("BINARY", 19, "FRem" , "BinaryOperator", "returns the remainder from the division of its two operands", oload_float_arith, oload_binary_params, counters=('floats',))
+
+        self.add_llvm_instr("BINARY", 20, "Shl", "BinaryOperator", "shifts left (logical)", oload_int_arith, oload_binary_params, counters=('uints',))
+        self.add_llvm_instr("BINARY", 21, "LShr", "BinaryOperator", "shifts right (logical), with zero bit fill", oload_int_arith, oload_binary_params, counters=('uints',))
+        self.add_llvm_instr("BINARY", 22, "AShr", "BinaryOperator", "shifts right (arithmetic), with 'a' operand sign bit fill", oload_int_arith, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("BINARY", 23, "And", "BinaryOperator", "returns a  bitwise logical and of its two operands", oload_int_arith_b, oload_binary_params, counters=('uints',))
+        self.add_llvm_instr("BINARY", 24, "Or", "BinaryOperator", "returns a bitwise logical or of its two operands", oload_int_arith_b, oload_binary_params, counters=('uints',))
+        self.add_llvm_instr("BINARY", 25, "Xor", "BinaryOperator", "returns a bitwise logical xor of its two operands", oload_int_arith_b, oload_binary_params, counters=('uints',))
 
         self.add_llvm_instr("MEMORY", 26, "Alloca", "AllocaInst", "allocates memory on the stack frame of the currently executing function", "", [])
         self.add_llvm_instr("MEMORY", 27, "Load", "LoadInst", "reads from memory", "", [])
         self.add_llvm_instr("MEMORY", 28, "Store", "StoreInst", "writes to memory", "", [])
         self.add_llvm_instr("MEMORY", 29, "GetElementPtr", "GetElementPtrInst", "gets the address of a subelement of an aggregate value", "", [])
-        self.add_llvm_instr("MEMORY", 30, "Fence", "FenceInst", "introduces happens-before edges between operations", "", [])
-        self.add_llvm_instr("MEMORY", 31, "AtomicCmpXchg", "AtomicCmpXchgInst" , "atomically modifies memory", "", [])
-        self.add_llvm_instr("MEMORY", 32, "AtomicRMW", "AtomicRMWInst", "atomically modifies memory", "", [])
-
-        self.add_llvm_instr("CAST", 33, "Trunc", "TruncInst", "truncates an integer", oload_int_arith_b, oload_cast_params)
-        self.add_llvm_instr("CAST", 34, "ZExt", "ZExtInst", "zero extends an integer", oload_int_arith_b, oload_cast_params)
-        self.add_llvm_instr("CAST", 35, "SExt", "SExtInst", "sign extends an integer", oload_int_arith_b, oload_cast_params)
-        self.add_llvm_instr("CAST", 36, "FPToUI", "FPToUIInst", "converts a floating point to UInt", oload_all_arith, oload_cast_params)
-        self.add_llvm_instr("CAST", 37, "FPToSI", "FPToSIInst", "converts a floating point to SInt", oload_all_arith, oload_cast_params)
-        self.add_llvm_instr("CAST", 38, "UIToFP", "UIToFPInst", "converts a UInt to floating point", oload_all_arith, oload_cast_params)
-        self.add_llvm_instr("CAST", 39, "SIToFP" , "SIToFPInst", "converts a SInt to floating point", oload_all_arith, oload_cast_params)
-        self.add_llvm_instr("CAST", 40, "FPTrunc", "FPTruncInst", "truncates a floating point", oload_float_arith, oload_cast_params)
-        self.add_llvm_instr("CAST", 41, "FPExt", "FPExtInst", "extends a floating point", oload_float_arith, oload_cast_params)
+        self.add_llvm_instr("MEMORY", 30, "Fence", "FenceInst", "introduces happens-before edges between operations", "", [], counters=('fence',))
+        self.add_llvm_instr("MEMORY", 31, "AtomicCmpXchg", "AtomicCmpXchgInst" , "atomically modifies memory", "", [], counters=('atomic',))
+        self.add_llvm_instr("MEMORY", 32, "AtomicRMW", "AtomicRMWInst", "atomically modifies memory", "", [], counters=('atomic',))
+
+        self.add_llvm_instr("CAST", 33, "Trunc", "TruncInst", "truncates an integer", oload_int_arith_b, oload_cast_params, counters=('ints',))
+        self.add_llvm_instr("CAST", 34, "ZExt", "ZExtInst", "zero extends an integer", oload_int_arith_b, oload_cast_params, counters=('uints',))
+        self.add_llvm_instr("CAST", 35, "SExt", "SExtInst", "sign extends an integer", oload_int_arith_b, oload_cast_params, counters=('ints',))
+        self.add_llvm_instr("CAST", 36, "FPToUI", "FPToUIInst", "converts a floating point to UInt", oload_all_arith, oload_cast_params, counters=('floats',))
+        self.add_llvm_instr("CAST", 37, "FPToSI", "FPToSIInst", "converts a floating point to SInt", oload_all_arith, oload_cast_params, counters=('floats',))
+        self.add_llvm_instr("CAST", 38, "UIToFP", "UIToFPInst", "converts a UInt to floating point", oload_all_arith, oload_cast_params, counters=('floats',))
+        self.add_llvm_instr("CAST", 39, "SIToFP" , "SIToFPInst", "converts a SInt to floating point", oload_all_arith, oload_cast_params, counters=('floats',))
+        self.add_llvm_instr("CAST", 40, "FPTrunc", "FPTruncInst", "truncates a floating point", oload_float_arith, oload_cast_params, counters=('floats',))
+        self.add_llvm_instr("CAST", 41, "FPExt", "FPExtInst", "extends a floating point", oload_float_arith, oload_cast_params, counters=('floats',))
         self.add_llvm_instr("CAST", 42, "PtrToInt", "PtrToIntInst", "converts a pointer to integer", "i", oload_cast_params)
         self.add_llvm_instr("CAST", 43, "IntToPtr", "IntToPtrInst", "converts an integer to Pointer", "i", oload_cast_params)
         self.add_llvm_instr("CAST", 44, "BitCast", "BitCastInst", "performs a bit-preserving type cast", oload_all_arith, oload_cast_params)
         self.add_llvm_instr("CAST", 45, "AddrSpaceCast", "AddrSpaceCastInst", "casts a value addrspace", "", oload_cast_params)
 
-        self.add_llvm_instr("OTHER", 46, "ICmp", "ICmpInst", "compares integers", oload_int_arith_b, oload_binary_params)
-        self.add_llvm_instr("OTHER", 47, "FCmp", "FCmpInst", "compares floating points", oload_float_arith, oload_binary_params)
+        self.add_llvm_instr("OTHER", 46, "ICmp", "ICmpInst", "compares integers", oload_int_arith_b, oload_binary_params, counters=('ints',))
+        self.add_llvm_instr("OTHER", 47, "FCmp", "FCmpInst", "compares floating points", oload_float_arith, oload_binary_params, counters=('floats',))
         self.add_llvm_instr("OTHER", 48, "PHI", "PHINode", "is a PHI node instruction", "", [])
         self.add_llvm_instr("OTHER", 49, "Call", "CallInst", "calls a function", "", [])
         self.add_llvm_instr("OTHER", 50, "Select", "SelectInst", "selects an instruction", "", [])
@@ -539,48 +558,68 @@ class db_dxil(object):
             db_dxil_param(2, "u32", "inputSigId", "input signature element ID"),
             db_dxil_param(3, "u32", "rowIndex", "row index relative to element"),
             db_dxil_param(4, "u8", "colIndex", "column index relative to element"),
-            db_dxil_param(5, "i32", "gsVertexAxis", "gsVertexAxis")])
+            db_dxil_param(5, "i32", "gsVertexAxis", "gsVertexAxis")],
+            counters=('sig_ld',))
         next_op_idx += 1
         self.add_dxil_op("StoreOutput", next_op_idx, "StoreOutput", "stores the value to shader output", "hfwi", "", [ # note, cannot store bit even though load supports it
             retvoid_param,
             db_dxil_param(2, "u32", "outputSigId", "output signature element ID"),
             db_dxil_param(3, "u32", "rowIndex", "row index relative to element"),
             db_dxil_param(4, "u8", "colIndex", "column index relative to element"),
-            db_dxil_param(5, "$o", "value", "value to store")])
-        next_op_idx += 1
+            db_dxil_param(5, "$o", "value", "value to store")],
+            counters=('sig_st',))
+        next_op_idx += 1
+
+        def UFI(name, **mappings):
+            """Choose a counter name for an op from the prefix of its name.
+
+            The op name is upper-cased before any comparison.  Each keyword in
+            `mappings` is a prefix (compared exactly as given against the
+            upper-cased name) and its value is the counter to return when that
+            prefix matches; these overrides are tried first.  Without a
+            matching override, a leading 'F' gives 'floats', a leading 'U'
+            gives 'uints', and anything else gives 'ints'.
+            """
+            upper_name = name.upper()
+            # Caller-supplied prefixes win over the default F/U rules below.
+            for prefix, counter in mappings.items():
+                if upper_name.startswith(prefix):
+                    return counter
+            if upper_name.startswith('F'):
+                return 'floats'
+            if upper_name.startswith('U'):
+                return 'uints'
+            return 'ints'
 
         # Unary float operations are regular.
         for i in "FAbs,Saturate".split(","):
             self.add_dxil_op(i, next_op_idx, "Unary", "returns the " + i, "hfd", "rn", [
                 db_dxil_param(0, "$o", "", "operation result"),
-                db_dxil_param(2, "$o", "value", "input value")])
+                db_dxil_param(2, "$o", "value", "input value")],
+                counters=('floats',))
             next_op_idx += 1
         for i in "IsNaN,IsInf,IsFinite,IsNormal".split(","):
             self.add_dxil_op(i, next_op_idx, "IsSpecialFloat", "returns the " + i, "hf", "rn", [
                 db_dxil_param(0, "i1", "", "operation result"),
-                db_dxil_param(2, "$o", "value", "input value")])
+                db_dxil_param(2, "$o", "value", "input value")],
+                counters=('floats',))
             next_op_idx += 1
         for i in "Cos,Sin,Tan,Acos,Asin,Atan,Hcos,Hsin,Htan,Exp,Frc,Log,Sqrt,Rsqrt,Round_ne,Round_ni,Round_pi,Round_z".split(","):
             self.add_dxil_op(i, next_op_idx, "Unary", "returns the " + i, "hf", "rn", [
                 db_dxil_param(0, "$o", "", "operation result"),
-                db_dxil_param(2, "$o", "value", "input value")])
+                db_dxil_param(2, "$o", "value", "input value")],
+                counters=('floats',))
             next_op_idx += 1
 
         # Unary int operations are regular.
         for i in "Bfrev".split(","):
             self.add_dxil_op(i, next_op_idx, "Unary", "returns the reverse bit pattern of the input value", "wil", "rn", [
                 db_dxil_param(0, "$o", "", "operation result"),
-                db_dxil_param(2, "$o", "value", "input value")])
+                db_dxil_param(2, "$o", "value", "input value")],
+                counters=('uints',))
             next_op_idx += 1
         for i in "Countbits,FirstbitLo".split(","):
             self.add_dxil_op(i, next_op_idx, "UnaryBits", "returns the " + i, "wil", "rn", [
                 db_dxil_param(0, "i32", "", "operation result"),
-                db_dxil_param(2, "$o", "value", "input value")])
+                db_dxil_param(2, "$o", "value", "input value")],
+                counters=('uints',))
             next_op_idx += 1
         for i in "FirstbitHi,FirstbitSHi".split(","):
             self.add_dxil_op(i, next_op_idx, "UnaryBits", "returns src != 0? (BitWidth-1 - " + i + ") : -1", "wil", "rn", [
                 db_dxil_param(0, "i32", "", "operation result"),
-                db_dxil_param(2, "$o", "value", "input value")])
+                db_dxil_param(2, "$o", "value", "input value")],
+                counters=('uints',))
             next_op_idx += 1
 
         # Binary float operations
@@ -588,7 +627,8 @@ class db_dxil(object):
             self.add_dxil_op(i, next_op_idx, "Binary", "returns the " + i + " of the input values", "hfd", "rn", [
                 db_dxil_param(0, "$o", "", "operation result"),
                 db_dxil_param(2, "$o", "a", "input value"),
-                db_dxil_param(3, "$o", "b", "input value")])
+                db_dxil_param(3, "$o", "b", "input value")],
+                counters=('floats',))
             next_op_idx += 1
 
         # Binary int operations
@@ -596,7 +636,8 @@ class db_dxil(object):
             self.add_dxil_op(i, next_op_idx, "Binary", "returns the " + i + " of the input values", "wil", "rn", [
                 db_dxil_param(0, "$o", "", "operation result"),
                 db_dxil_param(2, "$o", "a", "input value"),
-                db_dxil_param(3, "$o", "b", "input value")])
+                db_dxil_param(3, "$o", "b", "input value")],
+                counters=(UFI(i),))
             next_op_idx += 1
 
         # Binary int operations with two outputs
@@ -604,7 +645,8 @@ class db_dxil(object):
             self.add_dxil_op(i, next_op_idx, "BinaryWithTwoOuts", "returns the " + i + " of the input values", "i", "rn", [
                 db_dxil_param(0, "twoi32", "", "operation result"),
                 db_dxil_param(2, "$o", "a", "input value"),
-                db_dxil_param(3, "$o", "b", "input value")])
+                db_dxil_param(3, "$o", "b", "input value")],
+                counters=(UFI(i),))
             next_op_idx += 1
 
         # Binary int operations with carry
@@ -612,7 +654,8 @@ class db_dxil(object):
             self.add_dxil_op(i, next_op_idx, "BinaryWithCarryOrBorrow", "returns the " + i + " of the input values", "i", "rn", [
                 db_dxil_param(0, "i32c", "", "operation result with carry/borrow value"),
                 db_dxil_param(2, "$o", "a", "input value"),
-                db_dxil_param(3, "$o", "b", "input value")])
+                db_dxil_param(3, "$o", "b", "input value")],
+                counters=('uints',))
             next_op_idx += 1
 
         # Tertiary float.
@@ -626,7 +669,8 @@ class db_dxil(object):
             db_dxil_param(0, "$o", "", "the double-precision fused multiply-addition of parameters a * b + c, accurate to 0.5 units of least precision (ULP)"),
             db_dxil_param(2, "$o", "a", "first value for FMA, the first factor"),
             db_dxil_param(3, "$o", "b", "second value for FMA, the second factor"),
-            db_dxil_param(4, "$o", "c", "third value for FMA, the addend")])
+            db_dxil_param(4, "$o", "c", "third value for FMA, the addend")],
+            counters=('floats',))
         next_op_idx += 1
 
         # Tertiary int.
@@ -635,14 +679,16 @@ class db_dxil(object):
                 db_dxil_param(0, "$o", "", "the operation result"),
                 db_dxil_param(2, "$o", "a", "first value for FMA, the first factor"),
                 db_dxil_param(3, "$o", "b", "second value for FMA, the second factor"),
-                db_dxil_param(4, "$o", "c", "third value for FMA, the addend")])
+                db_dxil_param(4, "$o", "c", "third value for FMA, the addend")],
+                counters=(UFI(i),))
             next_op_idx += 1
         for i in "Msad,Ibfe,Ubfe".split(","):
             self.add_dxil_op(i, next_op_idx, "Tertiary", "performs an integral " + i, "il", "rn", [
                 db_dxil_param(0, "$o", "", "the operation result"),
                 db_dxil_param(2, "$o", "a", "first value for FMA, the first factor"),
                 db_dxil_param(3, "$o", "b", "second value for FMA, the second factor"),
-                db_dxil_param(4, "$o", "c", "third value for FMA, the addend")])
+                db_dxil_param(4, "$o", "c", "third value for FMA, the addend")],
+                counters=(UFI(i, M='uints'),))
             next_op_idx += 1
 
         # Quaternary
@@ -651,7 +697,8 @@ class db_dxil(object):
             db_dxil_param(2, "$o", "width", "the bitfield width to take from the value"),
             db_dxil_param(3, "$o", "offset", "the bitfield offset to replace in the value"),
             db_dxil_param(4, "$o", "value", "the number the bits are taken from"),
-            db_dxil_param(5, "$o", "replacedValue", "the number with bits to be replaced")])
+            db_dxil_param(5, "$o", "replacedValue", "the number with bits to be replaced")],
+            counters=('uints',))
         next_op_idx += 1
 
         # Dot
@@ -660,7 +707,8 @@ class db_dxil(object):
             db_dxil_param(2, "$o", "ax", "the first component of the first vector"),
             db_dxil_param(3, "$o", "ay", "the second component of the first vector"),
             db_dxil_param(4, "$o", "bx", "the first component of the second vector"),
-            db_dxil_param(5, "$o", "by", "the second component of the second vector")])
+            db_dxil_param(5, "$o", "by", "the second component of the second vector")],
+            counters=('floats',))
         next_op_idx += 1
         self.add_dxil_op("Dot3", next_op_idx, "Dot3", "three-dimensional vector dot-product", "hf", "rn", [
             db_dxil_param(0, "$o", "", "the operation result"),
@@ -669,7 +717,8 @@ class db_dxil(object):
             db_dxil_param(4, "$o", "az", "the third component of the first vector"),
             db_dxil_param(5, "$o", "bx", "the first component of the second vector"),
             db_dxil_param(6, "$o", "by", "the second component of the second vector"),
-            db_dxil_param(7, "$o", "bz", "the third component of the second vector")])
+            db_dxil_param(7, "$o", "bz", "the third component of the second vector")],
+            counters=('floats',))
         next_op_idx += 1
         self.add_dxil_op("Dot4", next_op_idx, "Dot4", "four-dimensional vector dot-product", "hf", "rn", [
             db_dxil_param(0, "$o", "", "the operation result"),
@@ -680,7 +729,8 @@ class db_dxil(object):
             db_dxil_param(6, "$o", "bx", "the first component of the second vector"),
             db_dxil_param(7, "$o", "by", "the second component of the second vector"),
             db_dxil_param(8, "$o", "bz", "the third component of the second vector"),
-            db_dxil_param(9, "$o", "bw", "the fourth component of the second vector")])
+            db_dxil_param(9, "$o", "bw", "the fourth component of the second vector")],
+            counters=('floats',))
         next_op_idx += 1
 
         # Resources.
@@ -713,7 +763,8 @@ class db_dxil(object):
             db_dxil_param(8, "i32", "offset0", "optional offset, applicable to Texture1D, Texture1DArray, and as part of offset1"),
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
             db_dxil_param(10, "i32", "offset2", "optional offset, applicable to Texture3D"),
-            db_dxil_param(11, "f", "clamp", "clamp value")])
+            db_dxil_param(11, "f", "clamp", "clamp value")],
+            counters=('tex_norm',))
         next_op_idx += 1
         self.add_dxil_op("SampleBias", next_op_idx, "SampleBias", "samples a texture after applying the input bias to the mipmap level", "hf", "ro", [
             db_dxil_param(0, "$r", "", "the sampled value"),
@@ -727,7 +778,8 @@ class db_dxil(object):
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
             db_dxil_param(10, "i32", "offset2", "optional offset, applicable to Texture3D"),
             db_dxil_param(11, "f", "bias", "bias value"),
-            db_dxil_param(12, "f", "clamp", "clamp value")])
+            db_dxil_param(12, "f", "clamp", "clamp value")],
+            counters=('tex_bias',))
         next_op_idx += 1
         self.add_dxil_op("SampleLevel", next_op_idx, "SampleLevel", "samples a texture using a mipmap-level offset", "hf", "ro", [
             db_dxil_param(0, "$r", "", "the sampled value"),
@@ -740,7 +792,8 @@ class db_dxil(object):
             db_dxil_param(8, "i32", "offset0", "optional offset, applicable to Texture1D, Texture1DArray, and as part of offset1"),
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
             db_dxil_param(10, "i32", "offset2", "optional offset, applicable to Texture3D"),
-            db_dxil_param(11, "f", "LOD", "level of detail, biggest map if less than or equal to zero; fraction used to interpolate across levels")])
+            db_dxil_param(11, "f", "LOD", "level of detail, biggest map if less than or equal to zero; fraction used to interpolate across levels")],
+            counters=('tex_norm',))
         next_op_idx += 1
         self.add_dxil_op("SampleGrad", next_op_idx, "SampleGrad", "samples a texture using a gradient to influence the way the sample location is calculated", "hf", "ro", [
             db_dxil_param(0, "$r", "", "the sampled value"),
@@ -759,7 +812,8 @@ class db_dxil(object):
             db_dxil_param(14, "f", "ddy0", "rate of change of the texture coordinate in the y direction"),
             db_dxil_param(15, "f", "ddy1", "rate of change of the texture coordinate in the y direction"),
             db_dxil_param(16, "f", "ddy2", "rate of change of the texture coordinate in the y direction"),
-            db_dxil_param(17, "f", "clamp", "clamp value")])
+            db_dxil_param(17, "f", "clamp", "clamp value")],
+            counters=('tex_grad',))
         next_op_idx += 1
         self.add_dxil_op("SampleCmp", next_op_idx, "SampleCmp", "samples a texture and compares a single component against the specified comparison value", "hf", "ro", [
             db_dxil_param(0, "$r", "", "the value for the constant buffer variable"),
@@ -773,7 +827,8 @@ class db_dxil(object):
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
             db_dxil_param(10, "i32", "offset2", "optional offset, applicable to Texture3D"),
             db_dxil_param(11, "f", "compareValue", "the value to compare with"),
-            db_dxil_param(12, "f", "clamp", "clamp value")])
+            db_dxil_param(12, "f", "clamp", "clamp value")],
+            counters=('tex_cmp',))
         next_op_idx += 1
         self.add_dxil_op("SampleCmpLevelZero", next_op_idx, "SampleCmpLevelZero", "samples a texture and compares a single component against the specified comparison value", "hf", "ro", [
             db_dxil_param(0, "$r", "", "the value for the constant buffer variable"),
@@ -786,7 +841,8 @@ class db_dxil(object):
             db_dxil_param(8, "i32", "offset0", "optional offset, applicable to Texture1D, Texture1DArray, and as part of offset1"),
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
             db_dxil_param(10, "i32", "offset2", "optional offset, applicable to Texture3D"),
-            db_dxil_param(11, "f", "compareValue", "the value to compare with")])
+            db_dxil_param(11, "f", "compareValue", "the value to compare with")],
+            counters=('tex_cmp',))
         next_op_idx += 1
         self.add_dxil_op("TextureLoad", next_op_idx, "TextureLoad", "reads texel data without any filtering or sampling", "hfwi", "ro", [
             db_dxil_param(0, "$r", "", "the loaded value"),
@@ -797,7 +853,8 @@ class db_dxil(object):
             db_dxil_param(6, "i32", "coord2", "coordinate"),
             db_dxil_param(7, "i32", "offset0", "optional offset"),
             db_dxil_param(8, "i32", "offset1", "optional offset"),
-            db_dxil_param(9, "i32", "offset2", "optional offset")])
+            db_dxil_param(9, "i32", "offset2", "optional offset")],
+            counters=('tex_load',))
         next_op_idx += 1
         self.add_dxil_op("TextureStore", next_op_idx, "TextureStore", "reads texel data without any filtering or sampling", "hfwi", "", [
             db_dxil_param(0, "v", "", ""),
@@ -809,13 +866,15 @@ class db_dxil(object):
             db_dxil_param(7, "$o", "value1", "value"),
             db_dxil_param(8, "$o", "value2", "value"),
             db_dxil_param(9, "$o", "value3", "value"),
-            db_dxil_param(10,"i8", "mask", "written value mask")])
+            db_dxil_param(10,"i8", "mask", "written value mask")],
+            counters=('tex_store',))
         next_op_idx += 1
         self.add_dxil_op("BufferLoad", next_op_idx, "BufferLoad", "reads from a TypedBuffer", "hfwi", "ro", [
             db_dxil_param(0, "$r", "", "the loaded value"),
             db_dxil_param(2, "res", "srv", "handle of TypedBuffer SRV to sample"),
             db_dxil_param(3, "i32", "index", "element index"),
-            db_dxil_param(4, "i32", "wot", "coordinate")])
+            db_dxil_param(4, "i32", "wot", "coordinate")],
+            counters=('tex_load',))
         next_op_idx += 1
         self.add_dxil_op("BufferStore", next_op_idx, "BufferStore", "writes to a RWTypedBuffer", "hfwi", "", [
             db_dxil_param(0, "v", "", ""),
@@ -826,12 +885,14 @@ class db_dxil(object):
             db_dxil_param(6, "$o", "value1", "value"),
             db_dxil_param(7, "$o", "value2", "value"),
             db_dxil_param(8, "$o", "value3", "value"),
-            db_dxil_param(9, "i8", "mask", "written value mask")])
+            db_dxil_param(9, "i8", "mask", "written value mask")],
+            counters=('tex_store',))
         next_op_idx += 1
         self.add_dxil_op("BufferUpdateCounter", next_op_idx, "BufferUpdateCounter", "atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV", "v", "", [
             db_dxil_param(0, "i32", "", "the new value in the buffer"),
             db_dxil_param(2, "res", "uav", "handle to a structured buffer UAV with the count or append flag"),
-            db_dxil_param(3, "i8", "inc", "1 to increase, 0 to decrease")])
+            db_dxil_param(3, "i8", "inc", "1 to increase, 0 to decrease")],
+            counters=('atomic',))
         next_op_idx += 1
         self.add_dxil_op("CheckAccessFullyMapped", next_op_idx, "CheckAccessFullyMapped", "determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource", "i", "ro", [
             db_dxil_param(0, "i1", "", "nonzero if all values accessed mapped tiles in a tiled resource"),
@@ -852,7 +913,8 @@ class db_dxil(object):
             db_dxil_param(7, "f", "coord3", "coordinate, defined only for TextureCubeArray"),
             db_dxil_param(8, "i32", "offset0", "optional offset, applicable to Texture1D, Texture1DArray, and as part of offset1"),
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
-            db_dxil_param(10, "i32", "channel", "channel to sample")])
+            db_dxil_param(10, "i32", "channel", "channel to sample")],
+            counters=('tex_norm',))
         next_op_idx += 1
         self.add_dxil_op("TextureGatherCmp", next_op_idx, "TextureGatherCmp", "same as TextureGather, except this instrution performs comparison on texels, similar to SampleCmp", "hfwi", "ro", [
             db_dxil_param(0, "$r", "", "gathered texels"),
@@ -865,7 +927,8 @@ class db_dxil(object):
             db_dxil_param(8, "i32", "offset0", "optional offset, applicable to Texture1D, Texture1DArray, and as part of offset1"),
             db_dxil_param(9, "i32", "offset1", "optional offset, applicable to Texture2D, Texture2DArray, and as part of offset2"),
             db_dxil_param(10, "i32", "channel", "channel to sample"),
-            db_dxil_param(11, "f", "compareVale", "value to compare with")])
+            db_dxil_param(11, "f", "compareVale", "value to compare with")],
+            counters=('tex_cmp',))
         next_op_idx += 1
 
         self.add_dxil_op("Texture2DMSGetSamplePosition", next_op_idx, "Texture2DMSGetSamplePosition", "gets the position of the specified sample", "v", "ro", [
@@ -889,7 +952,8 @@ class db_dxil(object):
             db_dxil_param(4, "i32", "offset0", "offset in elements"),
             db_dxil_param(5, "i32", "offset1", "offset"),
             db_dxil_param(6, "i32", "offset2", "offset"),
-            db_dxil_param(7, "i32", "newValue", "new value")])
+            db_dxil_param(7, "i32", "newValue", "new value")],
+            counters=('atomic',))
         next_op_idx += 1
         self.add_dxil_op("AtomicCompareExchange", next_op_idx, "AtomicCompareExchange", "atomic compare and exchange to memory", "i", "", [
             db_dxil_param(0, "i32", "", "the original value in the location updated"),
@@ -898,13 +962,15 @@ class db_dxil(object):
             db_dxil_param(4, "i32", "offset1", "offset"),
             db_dxil_param(5, "i32", "offset2", "offset"),
             db_dxil_param(6, "i32", "compareValue", "value to compare for exchange"),
-            db_dxil_param(7, "i32", "newValue", "new value")])
+            db_dxil_param(7, "i32", "newValue", "new value")],
+            counters=('atomic',))
         next_op_idx += 1
 
         # Synchronization.
         self.add_dxil_op("Barrier", next_op_idx, "Barrier", "inserts a memory barrier in the shader", "v", "nd", [
             retvoid_param,
-            db_dxil_param(2, "i32", "barrierMode", "a mask of DXIL::BarrierMode values", is_const=True)])
+            db_dxil_param(2, "i32", "barrierMode", "a mask of DXIL::BarrierMode values", is_const=True)],
+            counters=('barrier',))
         next_op_idx += 1
 
         # Pixel shader
@@ -988,15 +1054,18 @@ class db_dxil(object):
         # Geometry shader
         self.add_dxil_op("EmitStream", next_op_idx, "EmitStream", "emits a vertex to a given stream", "v", "", [
             retvoid_param,
-            db_dxil_param(2, "i8", "streamId", "target stream ID for operation")])
+            db_dxil_param(2, "i8", "streamId", "target stream ID for operation")],
+            counters=('gs_emit',))
         next_op_idx += 1
         self.add_dxil_op("CutStream", next_op_idx, "CutStream", "completes the current primitive topology at the specified stream", "v", "", [
             retvoid_param,
-            db_dxil_param(2, "i8", "streamId", "target stream ID for operation")])
+            db_dxil_param(2, "i8", "streamId", "target stream ID for operation")],
+            counters=('gs_cut',))
         next_op_idx += 1
         self.add_dxil_op("EmitThenCutStream", next_op_idx, "EmitThenCutStream", "equivalent to an EmitStream followed by a CutStream", "v", "", [
             retvoid_param,
-            db_dxil_param(2, "i8", "streamId", "target stream ID for operation")])
+            db_dxil_param(2, "i8", "streamId", "target stream ID for operation")],
+            counters=('gs_emit','gs_cut'))
         next_op_idx += 1
         self.add_dxil_op("GSInstanceID", next_op_idx, "GSInstanceID", "GSInstanceID", "i", "rn", [
             db_dxil_param(0, "i32", "", "result")])
@@ -1019,13 +1088,15 @@ class db_dxil(object):
             db_dxil_param(2, "i32", "inputSigId", "input signature element ID"),
             db_dxil_param(3, "i32", "row", "row, relative to the element"),
             db_dxil_param(4, "i8", "col", "column, relative to the element"),
-            db_dxil_param(5, "i32", "index", "vertex/point index")])
+            db_dxil_param(5, "i32", "index", "vertex/point index")],
+            counters=('sig_ld',))
         next_op_idx += 1
         self.add_dxil_op("LoadPatchConstant", next_op_idx, "LoadPatchConstant", "LoadPatchConstant", "hfwi", "rn", [
             db_dxil_param(0, "$o", "", "result"),
             db_dxil_param(2, "i32", "inputSigId", "input signature element ID"),
             db_dxil_param(3, "i32", "row", "row, relative to the element"),
-            db_dxil_param(4, "i8", "col", "column, relative to the element")])
+            db_dxil_param(4, "i8", "col", "column, relative to the element")],
+            counters=('sig_ld',))
         next_op_idx += 1
 
         # Domain shader.
@@ -1040,7 +1111,8 @@ class db_dxil(object):
             db_dxil_param(2, "i32", "outputSigID", "output signature element ID"),
             db_dxil_param(3, "i32", "row", "row, relative to the element"),
             db_dxil_param(4, "i8", "col", "column, relative to the element"),
-            db_dxil_param(5, "$o", "value", "value to store")])
+            db_dxil_param(5, "$o", "value", "value to store")],
+            counters=('sig_st',))
         next_op_idx += 1
         self.add_dxil_op("OutputControlPointID", next_op_idx, "OutputControlPointID", "OutputControlPointID", "i", "rn", [
             db_dxil_param(0, "i32", "", "result")])
@@ -1219,7 +1291,8 @@ class db_dxil(object):
             db_dxil_param(3, "i32", "index", "element index for StructuredBuffer, or byte offset for ByteAddressBuffer"),
             db_dxil_param(4, "i32", "elementOffset", "offset into element for StructuredBuffer, or undef for ByteAddressBuffer"),
             db_dxil_param(5, "i8", "mask", "loading value mask", is_const=True),
-            db_dxil_param(6, "i32", "alignment", "relative load access alignment", is_const=True)])
+            db_dxil_param(6, "i32", "alignment", "relative load access alignment", is_const=True)],
+            counters=('tex_load',))
         next_op_idx += 1
 
         self.add_dxil_op("RawBufferStore", next_op_idx, "RawBufferStore", "writes to a RWByteAddressBuffer or RWStructuredBuffer", "hfwidl", "", [
@@ -1232,7 +1305,8 @@ class db_dxil(object):
             db_dxil_param(7, "$o", "value2", "value"),
             db_dxil_param(8, "$o", "value3", "value"),
             db_dxil_param(9, "i8", "mask", "mask of contiguous components stored starting at first component (valid: 1, 3, 7, 15)", is_const=True),
-            db_dxil_param(10, "i32", "alignment", "relative store access alignment", is_const=True)])
+            db_dxil_param(10, "i32", "alignment", "relative store access alignment", is_const=True)],
+            counters=('tex_store',))
         next_op_idx += 1
 
         # End of DXIL 1.2 opcodes.
@@ -1365,21 +1439,24 @@ class db_dxil(object):
             db_dxil_param(3, "h", "ax", "the first component of the first vector"),
             db_dxil_param(4, "h", "ay", "the second component of the first vector"),
             db_dxil_param(5, "h", "bx", "the first component of the second vector"),
-            db_dxil_param(6, "h", "by", "the second component of the second vector")])
+            db_dxil_param(6, "h", "by", "the second component of the second vector")],
+            counters=('floats',))
         next_op_idx += 1
 
         self.add_dxil_op("Dot4AddI8Packed", next_op_idx, "Dot4AddPacked", "signed dot product of 4 x i8 vectors packed into i32, with accumulate to i32", "i", "rn", [
             db_dxil_param(0, "i32", "", "accumulated result"),
             db_dxil_param(2, "i32", "acc", "input accumulator"),
             db_dxil_param(3, "i32", "a", "first packed 4 x i8 for dot product"),
-            db_dxil_param(4, "i32", "b", "second packed 4 x i8 for dot product")])
+            db_dxil_param(4, "i32", "b", "second packed 4 x i8 for dot product")],
+            counters=('ints',))
         next_op_idx += 1
 
         self.add_dxil_op("Dot4AddU8Packed", next_op_idx, "Dot4AddPacked", "unsigned dot product of 4 x u8 vectors packed into i32, with accumulate to i32", "i", "rn", [
             db_dxil_param(0, "i32", "", "accumulated result"),
             db_dxil_param(2, "i32", "acc", "input accumulator"),
             db_dxil_param(3, "i32", "a", "first packed 4 x u8 for dot product"),
-            db_dxil_param(4, "i32", "b", "second packed 4 x u8 for dot product")])
+            db_dxil_param(4, "i32", "b", "second packed 4 x u8 for dot product")],
+            counters=('uints',))
         next_op_idx += 1
 
         # End of DXIL 1.4 opcodes.
@@ -1439,7 +1516,8 @@ class db_dxil(object):
             db_dxil_param(3, "u32", "rowIndex", "row index relative to element"),
             db_dxil_param(4, "u8", "colIndex", "column index relative to element"),
             db_dxil_param(5, "$o", "value", "value to store"),
-            db_dxil_param(6, "u32", "vertexIndex", "vertex index")])
+            db_dxil_param(6, "u32", "vertexIndex", "vertex index")],
+            counters=('sig_st',))
         next_op_idx += 1
         self.add_dxil_op("StorePrimitiveOutput", next_op_idx, "StorePrimitiveOutput", "stores the value to mesh shader primitive output", "hfwi", "", [
             retvoid_param,
@@ -1447,7 +1525,8 @@ class db_dxil(object):
             db_dxil_param(3, "u32", "rowIndex", "row index relative to element"),
             db_dxil_param(4, "u8", "colIndex", "column index relative to element"),
             db_dxil_param(5, "$o", "value", "value to store"),
-            db_dxil_param(6, "u32", "primitiveIndex", "primitive index")])
+            db_dxil_param(6, "u32", "primitiveIndex", "primitive index")],
+            counters=('sig_st',))
         next_op_idx += 1
 
         # Amplification Shader
@@ -1469,7 +1548,8 @@ class db_dxil(object):
             db_dxil_param(6, "f", "c1", "coordinate c1"),
             db_dxil_param(7, "f", "c2", "coordinate c2"),
             db_dxil_param(8, "f", "c3", "coordinate c3"),
-            db_dxil_param(9, "f", "clamp", "clamp")])
+            db_dxil_param(9, "f", "clamp", "clamp")],
+            counters=('tex_store',))
         next_op_idx += 1
         self.add_dxil_op("WriteSamplerFeedbackBias", next_op_idx, "WriteSamplerFeedbackBias", "updates a feedback texture for a sampling operation with a bias on the mipmap level", "v", "", [
             db_dxil_param(0, "v", "", ""),
@@ -1481,7 +1561,8 @@ class db_dxil(object):
             db_dxil_param(7, "f", "c2", "coordinate c2"),
             db_dxil_param(8, "f", "c3", "coordinate c3"),
             db_dxil_param(9, "f", "bias", "bias in [-16.f,15.99f]"),
-            db_dxil_param(10, "f", "clamp", "clamp")])
+            db_dxil_param(10, "f", "clamp", "clamp")],
+            counters=('tex_store',))
         next_op_idx += 1
         self.add_dxil_op("WriteSamplerFeedbackLevel", next_op_idx, "WriteSamplerFeedbackLevel", "updates a feedback texture for a sampling operation with a mipmap-level offset", "v", "", [
             db_dxil_param(0, "v", "", ""),
@@ -1492,7 +1573,8 @@ class db_dxil(object):
             db_dxil_param(6, "f", "c1", "coordinate c1"),
             db_dxil_param(7, "f", "c2", "coordinate c2"),
             db_dxil_param(8, "f", "c3", "coordinate c3"),
-            db_dxil_param(9, "f", "lod", "LOD")])
+            db_dxil_param(9, "f", "lod", "LOD")],
+            counters=('tex_store',))
         next_op_idx += 1
         self.add_dxil_op("WriteSamplerFeedbackGrad", next_op_idx, "WriteSamplerFeedbackGrad", "updates a feedback texture for a sampling operation with explicit gradients", "v", "", [
             db_dxil_param(0, "v", "", ""),
@@ -1509,7 +1591,8 @@ class db_dxil(object):
             db_dxil_param(12, "f", "ddy0", "rate of change of coordinate c0 in the y direction"),
             db_dxil_param(13, "f", "ddy1", "rate of change of coordinate c1 in the y direction"),
             db_dxil_param(14, "f", "ddy2", "rate of change of coordinate c2 in the y direction"),
-            db_dxil_param(15, "f", "clamp", "clamp")])
+            db_dxil_param(15, "f", "clamp", "clamp")],
+            counters=('tex_store',))
         next_op_idx += 1
 
         # RayQuery
@@ -2565,27 +2648,43 @@ class db_dxil(object):
             valrule_enum.values.append(vrval)
         self.enums.append(valrule_enum)
 
+    def populate_counters(self):
+        # Gather the counter names attached to instructions (via their
+        # 'counters' prop) into two sets, split by whether the instruction
+        # has a truthy dxil_opid, then rebuild self.counters as the sorted
+        # union of its previous contents and both sets.
+        # NOTE(review): a dxil_opid of 0 is falsy, so such an instruction's
+        # counters land in llvm_op_counters -- confirm opcode 0 never has any.
+        llvm_names = self.llvm_op_counters = set()
+        dxil_names = self.dxil_op_counters = set()
+        for inst in self.instr:
+            # Instructions without props, or without a 'counters' entry,
+            # contribute nothing.
+            inst_counters = getattr(inst, 'props', {}).get('counters', ())
+            bucket = dxil_names if inst.dxil_opid else llvm_names
+            bucket.update(inst_counters)
+        self.counters = sorted(set(self.counters) | llvm_names | dxil_names)
+
     def add_valrule(self, name, desc):
         # Append a validation rule that uses desc for both its error message
         # and its doc text; the current rule count is the second argument.
         rule_count = len(self.val_rules)
         new_rule = db_dxil_valrule(name, rule_count, err_msg=desc, doc=desc)
         self.val_rules.append(new_rule)
 
     def add_valrule_msg(self, name, desc, err_msg):
         # Append a validation rule whose doc text (desc) and error message
         # (err_msg) differ; the current rule count is the second argument.
         rule_count = len(self.val_rules)
         new_rule = db_dxil_valrule(name, rule_count, err_msg=err_msg, doc=desc)
         self.val_rules.append(new_rule)
 
-    def add_llvm_instr(self, kind, llvm_id, name, llvm_name, doc, oload_types, op_params):
+    def add_llvm_instr(self, kind, llvm_id, name, llvm_name, doc, oload_types, op_params, **props):
+        # Build a db_dxil_inst for an LLVM instruction and append it to
+        # self.instr. Any extra keyword arguments are kept on the
+        # instruction as a dict in i.props.
         i = db_dxil_inst(name, llvm_id=llvm_id, llvm_name=llvm_name, doc=doc, ops=op_params, oload_types=oload_types)
+        # kind selects at most one classification flag; any other value
+        # leaves the flags untouched.
         if kind == "TERM": i.is_bb_terminator=True
         if kind == "BINARY": i.is_binary=True
         if kind == "MEMORY": i.is_memory=True
         if kind == "CAST": i.is_cast=True
+        # props is always a dict here (empty when no extra keywords are given).
+        i.props = props
         self.instr.append(i)
 
-    def add_dxil_op(self, name, code_id, code_class, doc, oload_types, fn_attr, op_params):
+    def add_dxil_op(self, name, code_id, code_class, doc, oload_types, fn_attr, op_params, **props):
+        # Build a db_dxil_inst for a DXIL operation and append it to
+        # self.instr. The instruction reuses the llvm_id and llvm_name of
+        # self.call_instr, and name is used as the dxil_op value. Any extra
+        # keyword arguments are kept on the instruction as a dict in i.props.
         # The return value is parameter 0, insert the opcode as 1.
+        # Note: this modifies the caller's op_params list in place.
         op_params.insert(1, self.opcode_param)
         i = db_dxil_inst(name,
                          llvm_id=self.call_instr.llvm_id, llvm_name=self.call_instr.llvm_name,
                          dxil_op=name, dxil_opid=code_id, doc=doc, ops=op_params, dxil_class=code_class,
                          oload_types=oload_types, fn_attr=fn_attr)
+        # props is always a dict here (empty when no extra keywords are given).
+        i.props = props
         self.instr.append(i)
     def add_dxil_op_reserved(self, name, code_id):
         # The return value is parameter 0, insert the opcode as 1.
@@ -3004,6 +3103,7 @@ class db_hlsl(object):
         add_attr_arg("Unroll", "l", "Unroll the loop until it stops executing or a max count", [{"name":"Count", "type":"int"}])
         self.attributes = attributes
 
+
 if __name__ == "__main__":
     db = db_dxil()
     print(db)

+ 18 - 0
utils/hct/hctdb_instrhelp.py

@@ -811,6 +811,21 @@ def get_instrs_pred(varname, pred, attr_name="dxil_opid"):
     result += "\n"
     return result
 
+def counter_pred(name, dxil_op=True):
+    """Build a predicate selecting instructions tagged with counter `name`.
+
+    name    -- counter name looked up in the instruction's props['counters'].
+    dxil_op -- compared for equality against the instruction's is_dxil_op.
+
+    The returned function takes one instruction and returns True or False.
+    """
+    def pred(i):
+        if dxil_op != i.is_dxil_op:
+            return False
+        # Default to None so an instruction that never had props assigned is
+        # treated as having no counters instead of raising AttributeError.
+        props = getattr(i, 'props', None)
+        return bool(props) and 'counters' in props and name in props['counters']
+    return pred
+
+def get_counters():
+    "Return the counters attribute of the database object from get_db_dxil()."
+    return get_db_dxil().counters
+def get_llvm_op_counters():
+    "Return the entries of db.counters, in order, that are also in db.llvm_op_counters."
+    db = get_db_dxil()
+    return list(filter(lambda counter: counter in db.llvm_op_counters, db.counters))
+def get_dxil_op_counters():
+    "Return the entries of db.counters, in order, that are also in db.dxil_op_counters."
+    db = get_db_dxil()
+    return list(filter(lambda counter: counter in db.dxil_op_counters, db.counters))
+
 def get_instrs_rst():
     "Create an rst table of allowed LLVM instructions."
     db = get_db_dxil()
@@ -1423,6 +1438,9 @@ if __name__ == "__main__":
             'include/dxc/HlslIntrinsicOp.h',
             'tools/clang/tools/dxcompiler/dxcdisassembler.cpp',
             'include/dxc/DXIL/DxilSigPoint.inl',
+            'include/dxc/DXIL/DxilCounters.h',
+            'lib/DXIL/DxilCounters.cpp',
+            'lib/DXIL/DxilMetadataHelper.cpp',
             ]
         for relative_file_path in files:
             RunCodeTagUpdate(pj(hlsl_src_dir, relative_file_path))