Browse Source

Merged PR 90: Enable 64bit rawBufferLoad/Store.

Enable 64bit rawBufferLoad/Store.
Also fix 16bit offset for buffer matrix load/store.
And flat copy on matrix array with different major.
Xiang_Li (XBox) 7 years ago
parent
commit
d09e2746aa

+ 272 - 1
lib/HLSL/DxilGenerationPass.cpp

@@ -1415,13 +1415,113 @@ INITIALIZE_PASS(DxilLegalizeEvalOperations,
 // and will be truncated to their corresponding types after loading / before storing.
 namespace {
 
+// Create { v0, v1 } from { v0.lo, v0.hi, v1.lo, v1.hi }
+void Make64bitResultForLoad(Type *EltTy, ArrayRef<Value *> resultElts32,
+                            unsigned size, MutableArrayRef<Value *> resultElts,
+                            hlsl::OP *hlslOP, IRBuilder<> &Builder) {
+  Type *i64Ty = Builder.getInt64Ty();
+  Type *doubleTy = Builder.getDoubleTy();
+  if (EltTy == doubleTy) {
+    Function *makeDouble =
+        hlslOP->GetOpFunc(DXIL::OpCode::MakeDouble, doubleTy);
+    Value *makeDoubleOpArg =
+        Builder.getInt32((unsigned)DXIL::OpCode::MakeDouble);
+    for (unsigned i = 0; i < size; i++) {
+      Value *lo = resultElts32[2 * i];
+      Value *hi = resultElts32[2 * i + 1];
+      Value *V = Builder.CreateCall(makeDouble, {makeDoubleOpArg, lo, hi});
+      resultElts[i] = V;
+    }
+  } else {
+    for (unsigned i = 0; i < size; i++) {
+      Value *lo = resultElts32[2 * i];
+      Value *hi = resultElts32[2 * i + 1];
+      lo = Builder.CreateZExt(lo, i64Ty);
+      hi = Builder.CreateZExt(hi, i64Ty);
+      hi = Builder.CreateShl(hi, 32);
+      resultElts[i] = Builder.CreateOr(lo, hi);
+    }
+  }
+}
+
+// Split { v0, v1 } to { v0.lo, v0.hi, v1.lo, v1.hi }
+void Split64bitValForStore(Type *EltTy, ArrayRef<Value *> vals, unsigned size,
+                           MutableArrayRef<Value *> vals32, hlsl::OP *hlslOP,
+                           IRBuilder<> &Builder) {
+  Type *i32Ty = Builder.getInt32Ty();
+  Type *doubleTy = Builder.getDoubleTy();
+  Value *undefI32 = UndefValue::get(i32Ty);
+
+  if (EltTy == doubleTy) {
+    Function *dToU = hlslOP->GetOpFunc(DXIL::OpCode::SplitDouble, doubleTy);
+    Value *dToUOpArg = Builder.getInt32((unsigned)DXIL::OpCode::SplitDouble);
+    for (unsigned i = 0; i < size; i++) {
+      if (isa<UndefValue>(vals[i])) {
+        vals32[2 * i] = undefI32;
+        vals32[2 * i + 1] = undefI32;
+      } else {
+        Value *retVal = Builder.CreateCall(dToU, {dToUOpArg, vals[i]});
+        Value *lo = Builder.CreateExtractValue(retVal, 0);
+        Value *hi = Builder.CreateExtractValue(retVal, 1);
+        vals32[2 * i] = lo;
+        vals32[2 * i + 1] = hi;
+      }
+    }
+  } else {
+    for (unsigned i = 0; i < size; i++) {
+      if (isa<UndefValue>(vals[i])) {
+        vals32[2 * i] = undefI32;
+        vals32[2 * i + 1] = undefI32;
+      } else {
+        Value *lo = Builder.CreateTrunc(vals[i], i32Ty);
+        Value *hi = Builder.CreateLShr(vals[i], 32);
+        hi = Builder.CreateTrunc(hi, i32Ty);
+        vals32[2 * i] = lo;
+        vals32[2 * i + 1] = hi;
+      }
+    }
+  }
+}
+
 class DxilTranslateRawBuffer : public ModulePass {
 public:
   static char ID;
   explicit DxilTranslateRawBuffer() : ModulePass(ID) {}
   bool runOnModule(Module &M) {
     unsigned major, minor;
-    M.GetDxilModule().GetDxilVersion(major, minor);
+    DxilModule &DM = M.GetDxilModule();
+    DM.GetDxilVersion(major, minor);
+    OP *hlslOP = DM.GetOP();
+    // Split 64bit for shader model less than 6.3.
+    if (major == 1 && minor <= 2) {
+      for (auto F = M.functions().begin(); F != M.functions().end();) {
+        Function *func = &*(F++);
+        DXIL::OpCodeClass opClass;
+        if (hlslOP->GetOpCodeClass(func, opClass)) {
+          if (opClass == DXIL::OpCodeClass::RawBufferLoad) {
+            Type *ETy =
+                hlslOP->GetOverloadType(DXIL::OpCode::RawBufferLoad, func);
+
+            bool is64 =
+                ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
+            if (is64) {
+              ReplaceRawBufferLoad64Bit(func, ETy, M);
+              func->eraseFromParent();
+            }
+          } else if (opClass == DXIL::OpCodeClass::RawBufferStore) {
+            Type *ETy =
+                hlslOP->GetOverloadType(DXIL::OpCode::RawBufferStore, func);
+
+            bool is64 =
+                ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
+            if (is64) {
+              ReplaceRawBufferStore64Bit(func, ETy, M);
+              func->eraseFromParent();
+            }
+          }
+        }
+      }
+    }
     if (major == 1 && minor < 2) {
       for (auto F = M.functions().begin(), E = M.functions().end(); F != E;) {
         Function *func = &*(F++);
@@ -1454,6 +1554,8 @@ private:
   // Replace RawBufferLoad/Store to BufferLoad/Store for DXIL < 1.2
   void ReplaceRawBufferLoad(Function *F, Module &M);
   void ReplaceRawBufferStore(Function *F, Module &M);
+  void ReplaceRawBufferLoad64Bit(Function *F, Type *EltTy, Module &M);
+  void ReplaceRawBufferStore64Bit(Function *F, Type *EltTy, Module &M);
   // Replace RawBufferLoad/Store of min-precision types to have its actual storage size
   void ReplaceMinPrecisionRawBufferLoad(Function *F, Module &M);
   void ReplaceMinPrecisionRawBufferStore(Function *F, Module &M);
@@ -1491,6 +1593,96 @@ void DxilTranslateRawBuffer::ReplaceRawBufferLoad(Function *F,
   }
 }
 
+void DxilTranslateRawBuffer::ReplaceRawBufferLoad64Bit(Function *F, Type *EltTy, Module &M) {
+  OP *hlslOP = M.GetDxilModule().GetOP();
+  Function *bufLd = hlslOP->GetOpFunc(DXIL::OpCode::RawBufferLoad,
+                                      Type::getInt32Ty(M.getContext()));
+  for (auto U = F->user_begin(), E = F->user_end(); U != E;) {
+    User *user = *(U++);
+    if (CallInst *CI = dyn_cast<CallInst>(user)) {
+      IRBuilder<> Builder(CI);
+      SmallVector<Value *, 4> args(CI->arg_operands());
+
+      Value *offset = CI->getArgOperand(
+          DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx);
+
+      unsigned size = 0;
+      bool bNeedStatus = false;
+      for (User *U : CI->users()) {
+        ExtractValueInst *Elt = cast<ExtractValueInst>(U);
+        DXASSERT(Elt->getNumIndices() == 1, "else invalid use for resRet");
+        unsigned idx = Elt->getIndices()[0];
+        if (idx == 4) {
+          bNeedStatus = true;
+        } else {
+          size = std::max(size, idx+1);
+        }
+      }
+      unsigned maskHi = 0;
+      unsigned maskLo = 0;
+      switch (size) {
+      case 1:
+        maskLo = 3;
+        break;
+      case 2:
+        maskLo = 0xf;
+        break;
+      case 3:
+        maskLo = 0xf;
+        maskHi = 3;
+        break;
+      case 4:
+        maskLo = 0xf;
+        maskHi = 0xf;
+        break;
+      }
+
+      args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
+          Builder.getInt8(maskLo);
+      Value *resultElts[5] = {nullptr, nullptr, nullptr, nullptr, nullptr};
+      CallInst *newLd = Builder.CreateCall(bufLd, args);
+
+      Value *resultElts32[8];
+      unsigned eltBase = 0;
+      for (unsigned i = 0; i < size; i++) {
+        if (i == 2) {
+          // Advance the offset by 4*4 bytes to load the upper components.
+          args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
+              Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
+          args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
+              Builder.getInt8(maskHi);
+          newLd = Builder.CreateCall(bufLd, args);
+          eltBase = 4;
+        }
+        unsigned resBase = 2 * i;
+        resultElts32[resBase] =
+            Builder.CreateExtractValue(newLd, resBase - eltBase);
+        resultElts32[resBase + 1] =
+            Builder.CreateExtractValue(newLd, resBase + 1 - eltBase);
+      }
+
+      Make64bitResultForLoad(EltTy, resultElts32, size, resultElts, hlslOP, Builder);
+      if (bNeedStatus) {
+        resultElts[4] = Builder.CreateExtractValue(newLd, 4);
+      }
+      for (auto it = CI->user_begin(); it != CI->user_end(); ) {
+        ExtractValueInst *Elt = cast<ExtractValueInst>(*(it++));
+        DXASSERT(Elt->getNumIndices() == 1, "else invalid use for resRet");
+        unsigned idx = Elt->getIndices()[0];
+        if (!Elt->user_empty()) {
+          Value *newElt = resultElts[idx];
+          Elt->replaceAllUsesWith(newElt);
+        }
+        Elt->eraseFromParent();
+      }
+
+      CI->eraseFromParent();
+    } else {
+      DXASSERT(false, "function can only be used with call instructions.");
+    }
+  }
+}
+
 void DxilTranslateRawBuffer::ReplaceRawBufferStore(Function *F,
   Module &M) {
   OP *op = M.GetDxilModule().GetOP();
@@ -1515,6 +1707,85 @@ void DxilTranslateRawBuffer::ReplaceRawBufferStore(Function *F,
   }
 }
 
+void DxilTranslateRawBuffer::ReplaceRawBufferStore64Bit(Function *F, Type *ETy,
+                                                        Module &M) {
+  OP *hlslOP = M.GetDxilModule().GetOP();
+  Function *newFunction = hlslOP->GetOpFunc(hlsl::DXIL::OpCode::RawBufferStore,
+                                            Type::getInt32Ty(M.getContext()));
+  for (auto U = F->user_begin(), E = F->user_end(); U != E;) {
+    User *user = *(U++);
+    if (CallInst *CI = dyn_cast<CallInst>(user)) {
+      IRBuilder<> Builder(CI);
+      SmallVector<Value *, 4> args(CI->arg_operands());
+      Value *vals[4] = {
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal0OpIdx),
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal1OpIdx),
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal2OpIdx),
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal3OpIdx)};
+      ConstantInt *cMask = cast<ConstantInt>(
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreMaskOpIdx));
+      Value *undefI32 = UndefValue::get(Builder.getInt32Ty());
+      Value *vals32[8] = {undefI32, undefI32, undefI32, undefI32,
+                          undefI32, undefI32, undefI32, undefI32};
+
+      unsigned maskLo = 0;
+      unsigned maskHi = 0;
+      unsigned size = 0;
+      unsigned mask = cMask->getLimitedValue();
+      switch (mask) {
+      case 1:
+        maskLo = 3;
+        size = 1;
+        break;
+      case 3:
+        maskLo = 15;
+        size = 2;
+        break;
+      case 7:
+        maskLo = 15;
+        maskHi = 3;
+        size = 3;
+        break;
+      case 15:
+        maskLo = 15;
+        maskHi = 15;
+        size = 4;
+        break;
+      default:
+        DXASSERT(0, "invalid mask");
+      }
+
+      Split64bitValForStore(ETy, vals, size, vals32, hlslOP, Builder);
+      args[DXIL::OperandIndex::kRawBufferStoreMaskOpIdx] =
+          Builder.getInt8(maskLo);
+      args[DXIL::OperandIndex::kRawBufferStoreVal0OpIdx] = vals32[0];
+      args[DXIL::OperandIndex::kRawBufferStoreVal1OpIdx] = vals32[1];
+      args[DXIL::OperandIndex::kRawBufferStoreVal2OpIdx] = vals32[2];
+      args[DXIL::OperandIndex::kRawBufferStoreVal3OpIdx] = vals32[3];
+
+      Builder.CreateCall(newFunction, args);
+
+      if (maskHi) {
+        Value *offset = args[DXIL::OperandIndex::kRawBufferStoreElementOffsetOpIdx];
+        // Advance the offset by 4*4 bytes to store the upper components.
+        offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
+        args[DXIL::OperandIndex::kRawBufferStoreElementOffsetOpIdx] = offset;
+        args[DXIL::OperandIndex::kRawBufferStoreMaskOpIdx] =
+            Builder.getInt8(maskHi);
+        args[DXIL::OperandIndex::kRawBufferStoreVal0OpIdx] = vals32[4];
+        args[DXIL::OperandIndex::kRawBufferStoreVal1OpIdx] = vals32[5];
+        args[DXIL::OperandIndex::kRawBufferStoreVal2OpIdx] = vals32[6];
+        args[DXIL::OperandIndex::kRawBufferStoreVal3OpIdx] = vals32[7];
+
+        Builder.CreateCall(newFunction, args);
+      }
+      CI->eraseFromParent();
+    } else {
+      DXASSERT(false, "function can only be used with call instructions.");
+    }
+  }
+}
+
 void DxilTranslateRawBuffer::ReplaceMinPrecisionRawBufferLoad(Function *F,
                                                               Module &M) {
   OP *Op = M.GetDxilModule().GetOP();

+ 2 - 2
lib/HLSL/DxilOperations.cpp

@@ -262,8 +262,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
   {  OC::ViewID,                  "ViewID",                   OCC::ViewID,                   "viewID",                     false, false, false, false, false, false, false,  true, false, false, false, Attribute::ReadNone, },
 
   // Resources                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,  obj,  function attribute
-  {  OC::RawBufferLoad,           "RawBufferLoad",            OCC::RawBufferLoad,            "rawBufferLoad",              false,  true,  true, false, false, false,  true,  true, false, false, false, Attribute::ReadOnly, },
-  {  OC::RawBufferStore,          "RawBufferStore",           OCC::RawBufferStore,           "rawBufferStore",             false,  true,  true, false, false, false,  true,  true, false, false, false, Attribute::None,     },
+  {  OC::RawBufferLoad,           "RawBufferLoad",            OCC::RawBufferLoad,            "rawBufferLoad",              false,  true,  true,  true, false, false,  true,  true,  true, false, false, Attribute::ReadOnly, },
+  {  OC::RawBufferStore,          "RawBufferStore",           OCC::RawBufferStore,           "rawBufferStore",             false,  true,  true,  true, false, false,  true,  true,  true, false, false, Attribute::None,     },
 
   // Raytracing object space uint System Values                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,  obj,  function attribute
   {  OC::InstanceID,              "InstanceID",               OCC::InstanceID,               "instanceID",                 false, false, false, false, false, false, false,  true, false, false, false, Attribute::ReadNone, },

+ 4 - 0
lib/HLSL/DxilValidation.cpp

@@ -2583,6 +2583,7 @@ static bool IsLLVMInstructionAllowedForLib(Instruction &I, ValidationContext &Va
   switch (I.getOpcode()) {
   case Instruction::InsertElement:
   case Instruction::ExtractElement:
+  case Instruction::ShuffleVector:
     return true;
   case Instruction::Unreachable:
     if (Instruction *Prev = I.getPrevNode()) {
@@ -2676,6 +2677,9 @@ static void ValidateFunctionBody(Function *F, ValidationContext &ValCtx) {
           if (InsertElementInst *InsertInst = dyn_cast<InsertElementInst>(&I)) {
             legalUndef = op == I.getOperand(0);
           }
+          if (ShuffleVectorInst *Shuf = dyn_cast<ShuffleVectorInst>(&I)) {
+            legalUndef = op == I.getOperand(1);
+          }
 
           if (!legalUndef)
             ValCtx.EmitInstrError(&I,

+ 61 - 174
lib/HLSL/HLOperationLower.cpp

@@ -3122,42 +3122,25 @@ static uint8_t GetRawBufferMaskFromIOP(IntrinsicOp IOP, hlsl::OP *OP) {
 }
 
 static Constant *GetRawBufferMaskForETy(Type *Ty, unsigned NumComponents, hlsl::OP *OP) {
-  Type *ETy = Ty->getScalarType();
-  bool is64 = ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
   unsigned mask = 0;
-  if (is64) {
-    switch (NumComponents) {
-    case 0:
-      break;
-    case 1:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
-      break;
-    case 2:
-      mask = DXIL::kCompMask_All;
-      break;
-    default:
-      DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
-    }
-  }
-  else {
-    switch (NumComponents) {
-    case 0:
-      break;
-    case 1:
-      mask = DXIL::kCompMask_X;
-      break;
-    case 2:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
-      break;
-    case 3:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y | DXIL::kCompMask_Z;
-      break;
-    case 4:
-      mask = DXIL::kCompMask_All;
-      break;
-    default:
-      DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
-    }
+
+  switch (NumComponents) {
+  case 0:
+    break;
+  case 1:
+    mask = DXIL::kCompMask_X;
+    break;
+  case 2:
+    mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
+    break;
+  case 3:
+    mask = DXIL::kCompMask_X | DXIL::kCompMask_Y | DXIL::kCompMask_Z;
+    break;
+  case 4:
+    mask = DXIL::kCompMask_All;
+    break;
+  default:
+    DXASSERT(false, "Cannot load more than 4 components.");
   }
   return OP->GetI8Const(mask);
 }
@@ -3200,8 +3183,10 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
     return;
   }
 
+  bool isTyped = opcode == OP::OpCode::TextureLoad ||
+                 RK == DxilResource::Kind::TypedBuffer;
   bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (is64) {
+  if (is64 && isTyped) {
     EltTy = i32Ty;
   }
 
@@ -3278,7 +3263,7 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
       Builder.CreateCall(F, loadArgs, OP->GetOpCodeName(opcode));
 
   Value *retValNew = nullptr;
-  if (!is64) {
+  if (!is64 || !isTyped) {
     retValNew = ScalarizeResRet(Ty, ResRet, Builder);
   } else {
     unsigned size = numComponents;
@@ -3380,13 +3365,16 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     break;
   }
 
+  bool isTyped = opcode == OP::OpCode::TextureStore ||
+                 RK == DxilResource::Kind::TypedBuffer;
+
   Type *i32Ty = Builder.getInt32Ty();
   Type *i64Ty = Builder.getInt64Ty();
   Type *doubleTy = Builder.getDoubleTy();
   Type *EltTy = Ty->getScalarType();
   Constant *Alignment = OP->GetI32Const(OP->GetAllocSizeForType(EltTy));
   bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (is64) {
+  if (is64 && isTyped) {
     EltTy = i32Ty;
   }
 
@@ -3434,8 +3422,6 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   }
 
   // values
-  bool isTyped = opcode == OP::OpCode::TextureStore ||
-                 RK == DxilResource::Kind::TypedBuffer;
   uint8_t mask = 0;
   if (Ty->isVectorTy()) {
     unsigned vecSize = Ty->getVectorNumElements();
@@ -3470,7 +3456,7 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     }
   }
 
-  if (is64) {
+  if (is64 && isTyped) {
     unsigned size = 1;
     if (Ty->isVectorTy()) {
       size = Ty->getVectorNumElements();
@@ -5736,57 +5722,23 @@ void GenerateStructBufLd(Value *handle, Value *bufIdx, Value *offset,
   DXASSERT(resultElts.size() <= 4,
            "buffer load cannot load more than 4 values");
 
+  Function *dxilF = OP->GetOpFunc(opcode, EltTy);
+  Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents, OP);
+  Value *Args[] = {OP->GetU32Const((unsigned)opcode),
+                   handle,
+                   bufIdx,
+                   offset,
+                   mask,
+                   alignment};
+  Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
 
-  Type *i64Ty = Builder.getInt64Ty();
-  Type *doubleTy = Builder.getDoubleTy();
-  bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-
-  if (!is64) {
-    Function *dxilF = OP->GetOpFunc(opcode, EltTy);
-    Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents, OP);
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode), handle, bufIdx, offset, mask, alignment};
-    Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-
-    for (unsigned i = 0; i < resultElts.size(); i++) {
-      resultElts[i] = Builder.CreateExtractValue(Ld, i);
-    }
-
-    // status
-    UpdateStatus(Ld, status, Builder, OP);
-    return;
-  } else {
-    // 64 bit.
-    Function *dxilF = OP->GetOpFunc(opcode, Builder.getInt32Ty());
-    Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents < 2 ? NumComponents : 2, OP);
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode), handle, bufIdx, offset, mask, alignment};
-    Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-    Value *resultElts32[8];
-    unsigned size = resultElts.size();
-    unsigned eltBase = 0;
-    for (unsigned i = 0; i < size; i++) {
-      if (i == 2) {
-        // Update offset 4 by 4 bytes.
-        Args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
-            Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-        // Update Mask
-        Args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
-          GetRawBufferMaskForETy(EltTy, NumComponents < 3 ? 0 : NumComponents - 2, OP);
-        Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-        eltBase = 4;
-      }
-      unsigned resBase = 2 * i;
-      resultElts32[resBase] = Builder.CreateExtractValue(Ld, resBase - eltBase);
-      resultElts32[resBase + 1] =
-          Builder.CreateExtractValue(Ld, resBase + 1 - eltBase);
-    }
-
-    Make64bitResultForLoad(EltTy, resultElts32, size, resultElts, OP, Builder);
-
-    // status
-    UpdateStatus(Ld, status, Builder, OP);
-
-    return;
+  for (unsigned i = 0; i < resultElts.size(); i++) {
+    resultElts[i] = Builder.CreateExtractValue(Ld, i);
   }
+
+  // status
+  UpdateStatus(Ld, status, Builder, OP);
+  return;
 }
 
 void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
@@ -5794,85 +5746,19 @@ void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
                          ArrayRef<Value *> vals, uint8_t mask, Constant *alignment) {
   OP::OpCode opcode = OP::OpCode::RawBufferStore;
   DXASSERT(vals.size() == 4, "buffer store need 4 values");
-  Type *i64Ty = Builder.getInt64Ty();
-  Type *doubleTy = Builder.getDoubleTy();
-  bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (!is64) {
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                     handle,
-                     bufIdx,
-                     offset,
-                     vals[0],
-                     vals[1],
-                     vals[2],
-                     vals[3],
-                     OP->GetU8Const(mask),
-                     alignment};
-    Function *dxilF = OP->GetOpFunc(opcode, EltTy);
-    Builder.CreateCall(dxilF, Args);
-  } else {
-    Type *i32Ty = Builder.getInt32Ty();
-    Function *dxilF = OP->GetOpFunc(opcode, i32Ty);
-
-    Value *undefI32 = UndefValue::get(i32Ty);
-    Value *vals32[8] = {undefI32, undefI32, undefI32, undefI32,
-                        undefI32, undefI32, undefI32, undefI32};
-
-    unsigned maskLo = 0;
-    unsigned maskHi = 0;
-    unsigned size = 0;
-    switch (mask) {
-    case 1:
-      maskLo = 3;
-      size = 1;
-      break;
-    case 3:
-      maskLo = 15;
-      size = 2;
-      break;
-    case 7:
-      maskLo = 15;
-      maskHi = 3;
-      size = 3;
-      break;
-    case 15:
-      maskLo = 15;
-      maskHi = 15;
-      size = 4;
-      break;
-    default:
-      DXASSERT(0, "invalid mask");
-    }
 
-    Split64bitValForStore(EltTy, vals, size, vals32, OP, Builder);
-
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                     handle,
-                     bufIdx,
-                     offset,
-                     vals32[0],
-                     vals32[1],
-                     vals32[2],
-                     vals32[3],
-                     OP->GetU8Const(maskLo),
-                     alignment};
-    Builder.CreateCall(dxilF, Args);
-    if (maskHi) {
-      // Update offset 4 by 4 bytes.
-      offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-      Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                       handle,
-                       bufIdx,
-                       offset,
-                       vals32[4],
-                       vals32[5],
-                       vals32[6],
-                       vals32[7],
-                       OP->GetU8Const(maskHi),
-                       alignment};
-      Builder.CreateCall(dxilF, Args);
-    }
-  }
+  Value *Args[] = {OP->GetU32Const((unsigned)opcode),
+                   handle,
+                   bufIdx,
+                   offset,
+                   vals[0],
+                   vals[1],
+                   vals[2],
+                   vals[3],
+                   OP->GetU8Const(mask),
+                   alignment};
+  Function *dxilF = OP->GetOpFunc(opcode, EltTy);
+  Builder.CreateCall(dxilF, Args);
 }
 
 Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
@@ -5881,7 +5767,8 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                                bool colMajor, const DataLayout &DL) {
   unsigned col, row;
   Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
-  Constant* alignment = OP->GetI32Const(DL.getTypeAllocSize(EltTy));
+  unsigned EltSize = DL.getTypeAllocSize(EltTy);
+  Constant* alignment = OP->GetI32Const(EltSize);
 
   Value *offset = baseOffset;
   if (baseOffset == nullptr)
@@ -5896,7 +5783,7 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     GenerateStructBufLd(handle, bufIdx, offset, status, EltTy, ResultElts, OP, Builder, 3, alignment);
     for (unsigned i = 0; i < rest; i++)
       elts[i] = ResultElts[i];
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * rest));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(EltSize * rest));
   }
 
   for (unsigned i = rest; i < matSize; i += 4) {
@@ -5908,7 +5795,7 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     elts[i + 3] = ResultElts[3];
 
     // Update offset by 4*4bytes.
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * 4));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 
   return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
@@ -5919,7 +5806,8 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                              Value *val, bool colMajor, const DataLayout &DL) {
   unsigned col, row;
   Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
-  Constant *Alignment = OP->GetI32Const(DL.getTypeAllocSize(EltTy));
+  unsigned EltSize = DL.getTypeAllocSize(EltTy);
+  Constant *Alignment = OP->GetI32Const(EltSize);
   Value *offset = baseOffset;
   if (baseOffset == nullptr)
     offset = OP->GetU32Const(0);
@@ -5955,7 +5843,7 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                         {elts[i], elts[i + 1], elts[i + 2], elts[i + 3]}, mask,
                         Alignment);
     // Update offset by 4*4bytes.
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * 4));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 }
 
@@ -6214,7 +6102,6 @@ void TranslateStructBufSubscriptUser(Instruction *user, Value *handle,
       }
       userCall->eraseFromParent();
     } else if (group == HLOpcodeGroup::HLMatLoadStore)
-      // TODO: support 64 bit.
       TranslateStructBufMatLdSt(userCall, handle, OP, status, bufIdx,
                                 baseOffset, DL);
     else if (group == HLOpcodeGroup::HLSubscript) {

+ 41 - 14
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -763,13 +763,16 @@ MDNode *CGMSHLSLRuntime::GetOrAddResTypeMD(QualType resTy) {
   }
 }
 
-void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
-    DxilFieldAnnotation &fieldAnnotation, QualType fieldTy,
-    bool bDefaultRowMajor) {
-  QualType Ty = fieldTy;
-  if (Ty->isReferenceType())
-    Ty = Ty.getNonReferenceType();
+namespace {
+MatrixOrientation GetMatrixMajor(QualType Ty, bool bDefaultRowMajor) {
+  DXASSERT(hlsl::IsHLSLMatType(Ty), "");
+  bool bIsRowMajor = bDefaultRowMajor;
+  HasHLSLMatOrientation(Ty, &bIsRowMajor);
+  return bIsRowMajor ? MatrixOrientation::RowMajor
+                          : MatrixOrientation::ColumnMajor;
+}
 
+QualType GetArrayEltType(QualType Ty) {
   // Get element type.
   if (Ty->isArrayType()) {
     while (isa<clang::ArrayType>(Ty)) {
@@ -777,15 +780,26 @@ void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
       Ty = ATy->getElementType();
     }
   }
+  return Ty;
+}
+
+} // namespace
+
+void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
+    DxilFieldAnnotation &fieldAnnotation, QualType fieldTy,
+    bool bDefaultRowMajor) {
+  QualType Ty = fieldTy;
+  if (Ty->isReferenceType())
+    Ty = Ty.getNonReferenceType();
+
+  // Get element type.
+  Ty = GetArrayEltType(Ty);
 
   QualType EltTy = Ty;
   if (hlsl::IsHLSLMatType(Ty)) {
     DxilMatrixAnnotation Matrix;
-    bool bRowMajor = bDefaultRowMajor;
-    HasHLSLMatOrientation(Ty, &bRowMajor);
-    Matrix.Orientation = bRowMajor ? MatrixOrientation::RowMajor
-                                   : MatrixOrientation::ColumnMajor;
 
+    Matrix.Orientation = GetMatrixMajor(Ty, bDefaultRowMajor);
 
     hlsl::GetHLSLMatRowColCount(Ty, Matrix.Rows, Matrix.Cols);
     fieldAnnotation.SetMatrixAnnotation(Matrix);
@@ -6269,11 +6283,24 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF,
     clang::QualType DestTy) {
   llvm::Type *SrcPtrTy = SrcPtr->getType()->getPointerElementType();
   llvm::Type *DestPtrTy = DestPtr->getType()->getPointerElementType();
+
+  bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
   if (SrcPtrTy == DestPtrTy) {
-    // Memcpy if type is match.
-    unsigned size = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);
-    CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, size, 1);
-    return;
+    bool bMatArrayRotate = false;
+    if (HLMatrixLower::IsMatrixArrayPointer(SrcPtr->getType())) {
+      QualType SrcEltTy = GetArrayEltType(SrcTy);
+      QualType DestEltTy = GetArrayEltType(DestTy);
+      if (GetMatrixMajor(SrcEltTy, bDefaultRowMajor) !=
+          GetMatrixMajor(DestEltTy, bDefaultRowMajor)) {
+        bMatArrayRotate = true;
+      }
+    }
+    if (!bMatArrayRotate) {
+      // Memcpy if type is match.
+      unsigned size = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);
+      CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, size, 1);
+      return;
+    }
   } else if (HLModule::IsHLSLObjectType(dxilutil::GetArrayEltTy(SrcPtrTy)) &&
              HLModule::IsHLSLObjectType(dxilutil::GetArrayEltTy(DestPtrTy))) {
     unsigned sizeSrc = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);

+ 56 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry4.hlsl

@@ -0,0 +1,56 @@
+// RUN: %dxc -T lib_6_3   %s | FileCheck %s
+
+// Make sure a major change on a function call works and the offset is correct.
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i8 15, i32 4)
+
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.3.4*
+// CHECK: [[RET:%.*]] = call %class.matrix.float.4.3 @"\01?mat_test
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.4.3*
+// CHECK: store %class.matrix.float.4.3 [[RET]], %class.matrix.float.4.3* 
+
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80, i8 15, i32 4)
+
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.3.4*
+// CHECK: [[RET2:%.*]] = call %class.matrix.float.4.3 @"\01?mat_test
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.4.3*
+// CHECK: store %class.matrix.float.4.3 [[RET2]], %class.matrix.float.4.3* 
+
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+
+
+
+float4x3 mat_test(inout float3x4 m);
+
+
+cbuffer A {
+column_major float3x4 cm;
+row_major    float3x4 rm;
+column_major float3x2 cma[2];
+row_major    float3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major float3x4 cm;
+  row_major    float3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  column_major float3x4 cm;
+  row_major    float3x4 rm;
+  float4x3 tm = mat_test(uav0[i.x].cm);
+  row_major float4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 44 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry5.hlsl

@@ -0,0 +1,44 @@
+// RUN: %dxc -T lib_6_3 -enable-16bit-types  %s | FileCheck %s
+
+// Make sure a half matrix has a correct offset.
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 8, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, i8 15, i32 2)
+
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 8, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 24, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 40, i8 15, i32 2)
+
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 24, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 40, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+
+half4x3 mat_test(inout half3x4 m);
+
+
+cbuffer A {
+column_major half3x4 cm;
+row_major    half3x4 rm;
+column_major half3x2 cma[2];
+row_major    half3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major half3x4 cm;
+  row_major    half3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+half3 mainx() : SV_Target {
+  column_major half3x4 cm;
+  row_major    half3x4 rm;
+  half4x3 tm = mat_test(uav0[i.x].cm);
+  half4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 44 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry6.hlsl

@@ -0,0 +1,44 @@
+// RUN: %dxc -T lib_6_3  %s | FileCheck %s
+
+// Make sure a double matrix has a correct offset.
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, i8 15, i32 8)
+
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160, i8 15, i32 8)
+
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+
+double4x3 mat_test(inout double3x4 m);
+
+
+cbuffer A {
+column_major double3x4 cm;
+row_major    double3x4 rm;
+column_major double3x2 cma[2];
+row_major    double3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;
+  double4x3 tm = mat_test(uav0[i.x].cm);
+  double4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 85 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry7.hlsl

@@ -0,0 +1,85 @@
+// RUN: %dxc -T lib_6_1  %s | FileCheck %s
+
+// Make sure a double matrix is flattened and has a correct offset.
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48)
+// CHECK: makeDouble.f64(i32 101,
+
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 112)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 144)
+// CHECK: makeDouble.f64(i32 101,
+
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 176)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 112, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 144, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 176, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+
+double4x3 mat_test(inout double3x4 m);
+
+
+cbuffer A {
+column_major double3x4 cm;
+row_major    double3x4 rm;
+column_major double3x2 cma[2];
+row_major    double3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;
+  double4x3 tm = mat_test(uav0[i.x].cm);
+  double4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 28 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry8.hlsl

@@ -0,0 +1,28 @@
+// RUN: %dxc -T lib_6_3  %s | FileCheck %s
+
+// Make sure a major change on a matrix array works.
+// CHECK: bitcast [12 x float]* %4 to [2 x %class.matrix.float.3.2]*
+// CHECK: bitcast [12 x float]* %4 to [2 x %class.matrix.float.3.2]*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+
+
+struct MA {
+  float2x3 ma[2];
+};
+
+MA mat_test2( float3x2 m[2], int idx);
+
+cbuffer A {
+column_major float3x2 cma[2];
+row_major    float3x2 rma[2];
+uint3 i;
+};
+
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  MA ma = mat_test2(cma, i.x);
+  MA ma2 = mat_test2(rma, i.y);
+  return ma.ma[i.z][i.y] + ma2.ma[i.z][i.y];
+}

+ 16 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_param8.hlsl

@@ -0,0 +1,16 @@
+// RUN: %dxc -T lib_6_3  %s | FileCheck %s
+
+// Make sure return matrix struct works.
+// CHECK: bitcast %class.matrix.float.3.2* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.3.2* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+
+struct MA {
+  float2x3 ma[2];
+};
+
+MA mat_test2( float3x2 m[2], int idx) {
+  MA ma = { { transpose(m[0]), transpose(m[1])}};
+  return ma;
+}

+ 1 - 1
tools/clang/test/CodeGenHLSL/raw_buf4.hlsl

@@ -28,7 +28,7 @@
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float undef, float undef, i8 3, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float undef, i8 7, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 4)
-// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 3, i32 8)
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 8)
 // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 15, i32 8)
 
 ByteAddressBuffer buf1;

+ 1 - 1
tools/clang/test/CodeGenHLSL/raw_buf5.hlsl

@@ -41,7 +41,7 @@
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float undef, float undef, i8 3, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float undef, i8 7, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 4)
-// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 3, i32 8)
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 8)
 // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 15, i32 8)
 
 ByteAddressBuffer buf1;

+ 2 - 2
utils/hct/hctdb.py

@@ -1154,7 +1154,7 @@ class db_dxil(object):
         # End of DXIL 1.1 opcodes.
         self.set_op_count_for_version(1, 1, next_op_idx)
 
-        self.add_dxil_op("RawBufferLoad", next_op_idx, "RawBufferLoad", "reads from a raw buffer and structured buffer", "hfwi", "ro", [
+        self.add_dxil_op("RawBufferLoad", next_op_idx, "RawBufferLoad", "reads from a raw buffer and structured buffer", "hfwidl", "ro", [
             db_dxil_param(0, "$r", "", "the loaded value"),
             db_dxil_param(2, "res", "srv", "handle of TypedBuffer SRV to sample"),
             db_dxil_param(3, "i32", "index", "element index for StructuredBuffer, or byte offset for ByteAddressBuffer"),
@@ -1163,7 +1163,7 @@ class db_dxil(object):
             db_dxil_param(6, "i32", "alignment", "relative load access alignment", is_const=True)])
         next_op_idx += 1
 
-        self.add_dxil_op("RawBufferStore", next_op_idx, "RawBufferStore", "writes to a RWByteAddressBuffer or RWStructuredBuffer", "hfwi", "", [
+        self.add_dxil_op("RawBufferStore", next_op_idx, "RawBufferStore", "writes to a RWByteAddressBuffer or RWStructuredBuffer", "hfwidl", "", [
             db_dxil_param(0, "v", "", ""),
             db_dxil_param(2, "res", "uav", "handle of UAV to store to"),
             db_dxil_param(3, "i32", "index", "element index for StructuredBuffer, or byte offset for ByteAddressBuffer"),