Browse Source

Merged PR 90: Enable 64bit rawBufferLoad/Store.

Enable 64bit rawBufferLoad/Store.
Also fix 16bit offset for buffer matrix load/store.
And flat copy on matrix array with different major.
Xiang_Li (XBox) 7 years ago
parent
commit
d09e2746aa

+ 272 - 1
lib/HLSL/DxilGenerationPass.cpp

@@ -1415,13 +1415,113 @@ INITIALIZE_PASS(DxilLegalizeEvalOperations,
 // and will be truncated to their corresponding types after loading / before storing.
 namespace {
 
+// Create { v0, v1 } from { v0.lo, v0.hi, v1.lo, v1.hi }
+void Make64bitResultForLoad(Type *EltTy, ArrayRef<Value *> resultElts32,
+                            unsigned size, MutableArrayRef<Value *> resultElts,
+                            hlsl::OP *hlslOP, IRBuilder<> &Builder) {
+  Type *i64Ty = Builder.getInt64Ty();
+  Type *doubleTy = Builder.getDoubleTy();
+  if (EltTy == doubleTy) {
+    Function *makeDouble =
+        hlslOP->GetOpFunc(DXIL::OpCode::MakeDouble, doubleTy);
+    Value *makeDoubleOpArg =
+        Builder.getInt32((unsigned)DXIL::OpCode::MakeDouble);
+    for (unsigned i = 0; i < size; i++) {
+      Value *lo = resultElts32[2 * i];
+      Value *hi = resultElts32[2 * i + 1];
+      Value *V = Builder.CreateCall(makeDouble, {makeDoubleOpArg, lo, hi});
+      resultElts[i] = V;
+    }
+  } else {
+    for (unsigned i = 0; i < size; i++) {
+      Value *lo = resultElts32[2 * i];
+      Value *hi = resultElts32[2 * i + 1];
+      lo = Builder.CreateZExt(lo, i64Ty);
+      hi = Builder.CreateZExt(hi, i64Ty);
+      hi = Builder.CreateShl(hi, 32);
+      resultElts[i] = Builder.CreateOr(lo, hi);
+    }
+  }
+}
+
+// Split { v0, v1 } to { v0.lo, v0.hi, v1.lo, v1.hi }
+void Split64bitValForStore(Type *EltTy, ArrayRef<Value *> vals, unsigned size,
+                           MutableArrayRef<Value *> vals32, hlsl::OP *hlslOP,
+                           IRBuilder<> &Builder) {
+  Type *i32Ty = Builder.getInt32Ty();
+  Type *doubleTy = Builder.getDoubleTy();
+  Value *undefI32 = UndefValue::get(i32Ty);
+
+  if (EltTy == doubleTy) {
+    Function *dToU = hlslOP->GetOpFunc(DXIL::OpCode::SplitDouble, doubleTy);
+    Value *dToUOpArg = Builder.getInt32((unsigned)DXIL::OpCode::SplitDouble);
+    for (unsigned i = 0; i < size; i++) {
+      if (isa<UndefValue>(vals[i])) {
+        vals32[2 * i] = undefI32;
+        vals32[2 * i + 1] = undefI32;
+      } else {
+        Value *retVal = Builder.CreateCall(dToU, {dToUOpArg, vals[i]});
+        Value *lo = Builder.CreateExtractValue(retVal, 0);
+        Value *hi = Builder.CreateExtractValue(retVal, 1);
+        vals32[2 * i] = lo;
+        vals32[2 * i + 1] = hi;
+      }
+    }
+  } else {
+    for (unsigned i = 0; i < size; i++) {
+      if (isa<UndefValue>(vals[i])) {
+        vals32[2 * i] = undefI32;
+        vals32[2 * i + 1] = undefI32;
+      } else {
+        Value *lo = Builder.CreateTrunc(vals[i], i32Ty);
+        Value *hi = Builder.CreateLShr(vals[i], 32);
+        hi = Builder.CreateTrunc(hi, i32Ty);
+        vals32[2 * i] = lo;
+        vals32[2 * i + 1] = hi;
+      }
+    }
+  }
+}
+
 class DxilTranslateRawBuffer : public ModulePass {
 public:
   static char ID;
   explicit DxilTranslateRawBuffer() : ModulePass(ID) {}
   bool runOnModule(Module &M) {
     unsigned major, minor;
-    M.GetDxilModule().GetDxilVersion(major, minor);
+    DxilModule &DM = M.GetDxilModule();
+    DM.GetDxilVersion(major, minor);
+    OP *hlslOP = DM.GetOP();
+    // Split 64bit for shader model less than 6.3.
+    if (major == 1 && minor <= 2) {
+      for (auto F = M.functions().begin(); F != M.functions().end();) {
+        Function *func = &*(F++);
+        DXIL::OpCodeClass opClass;
+        if (hlslOP->GetOpCodeClass(func, opClass)) {
+          if (opClass == DXIL::OpCodeClass::RawBufferLoad) {
+            Type *ETy =
+                hlslOP->GetOverloadType(DXIL::OpCode::RawBufferLoad, func);
+
+            bool is64 =
+                ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
+            if (is64) {
+              ReplaceRawBufferLoad64Bit(func, ETy, M);
+              func->eraseFromParent();
+            }
+          } else if (opClass == DXIL::OpCodeClass::RawBufferStore) {
+            Type *ETy =
+                hlslOP->GetOverloadType(DXIL::OpCode::RawBufferStore, func);
+
+            bool is64 =
+                ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
+            if (is64) {
+              ReplaceRawBufferStore64Bit(func, ETy, M);
+              func->eraseFromParent();
+            }
+          }
+        }
+      }
+    }
     if (major == 1 && minor < 2) {
       for (auto F = M.functions().begin(), E = M.functions().end(); F != E;) {
         Function *func = &*(F++);
@@ -1454,6 +1554,8 @@ private:
   // Replace RawBufferLoad/Store to BufferLoad/Store for DXIL < 1.2
   void ReplaceRawBufferLoad(Function *F, Module &M);
   void ReplaceRawBufferStore(Function *F, Module &M);
+  void ReplaceRawBufferLoad64Bit(Function *F, Type *EltTy, Module &M);
+  void ReplaceRawBufferStore64Bit(Function *F, Type *EltTy, Module &M);
   // Replace RawBufferLoad/Store of min-precision types to have its actual storage size
   void ReplaceMinPrecisionRawBufferLoad(Function *F, Module &M);
   void ReplaceMinPrecisionRawBufferStore(Function *F, Module &M);
@@ -1491,6 +1593,96 @@ void DxilTranslateRawBuffer::ReplaceRawBufferLoad(Function *F,
   }
 }
 
+void DxilTranslateRawBuffer::ReplaceRawBufferLoad64Bit(Function *F, Type *EltTy, Module &M) {
+  OP *hlslOP = M.GetDxilModule().GetOP();
+  Function *bufLd = hlslOP->GetOpFunc(DXIL::OpCode::RawBufferLoad,
+                                      Type::getInt32Ty(M.getContext()));
+  for (auto U = F->user_begin(), E = F->user_end(); U != E;) {
+    User *user = *(U++);
+    if (CallInst *CI = dyn_cast<CallInst>(user)) {
+      IRBuilder<> Builder(CI);
+      SmallVector<Value *, 4> args(CI->arg_operands());
+
+      Value *offset = CI->getArgOperand(
+          DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx);
+
+      unsigned size = 0;
+      bool bNeedStatus = false;
+      for (User *U : CI->users()) {
+        ExtractValueInst *Elt = cast<ExtractValueInst>(U);
+        DXASSERT(Elt->getNumIndices() == 1, "else invalid use for resRet");
+        unsigned idx = Elt->getIndices()[0];
+        if (idx == 4) {
+          bNeedStatus = true;
+        } else {
+          size = std::max(size, idx+1);
+        }
+      }
+      unsigned maskHi = 0;
+      unsigned maskLo = 0;
+      switch (size) {
+      case 1:
+        maskLo = 3;
+        break;
+      case 2:
+        maskLo = 0xf;
+        break;
+      case 3:
+        maskLo = 0xf;
+        maskHi = 3;
+        break;
+      case 4:
+        maskLo = 0xf;
+        maskHi = 0xf;
+        break;
+      }
+
+      args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
+          Builder.getInt8(maskLo);
+      Value *resultElts[5] = {nullptr, nullptr, nullptr, nullptr, nullptr};
+      CallInst *newLd = Builder.CreateCall(bufLd, args);
+
+      Value *resultElts32[8];
+      unsigned eltBase = 0;
+      for (unsigned i = 0; i < size; i++) {
+        if (i == 2) {
+          // Advance the offset by 4*4 bytes to load the upper components.
+          args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
+              Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
+          args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
+              Builder.getInt8(maskHi);
+          newLd = Builder.CreateCall(bufLd, args);
+          eltBase = 4;
+        }
+        unsigned resBase = 2 * i;
+        resultElts32[resBase] =
+            Builder.CreateExtractValue(newLd, resBase - eltBase);
+        resultElts32[resBase + 1] =
+            Builder.CreateExtractValue(newLd, resBase + 1 - eltBase);
+      }
+
+      Make64bitResultForLoad(EltTy, resultElts32, size, resultElts, hlslOP, Builder);
+      if (bNeedStatus) {
+        resultElts[4] = Builder.CreateExtractValue(newLd, 4);
+      }
+      for (auto it = CI->user_begin(); it != CI->user_end(); ) {
+        ExtractValueInst *Elt = cast<ExtractValueInst>(*(it++));
+        DXASSERT(Elt->getNumIndices() == 1, "else invalid use for resRet");
+        unsigned idx = Elt->getIndices()[0];
+        if (!Elt->user_empty()) {
+          Value *newElt = resultElts[idx];
+          Elt->replaceAllUsesWith(newElt);
+        }
+        Elt->eraseFromParent();
+      }
+
+      CI->eraseFromParent();
+    } else {
+      DXASSERT(false, "function can only be used with call instructions.");
+    }
+  }
+}
+
 void DxilTranslateRawBuffer::ReplaceRawBufferStore(Function *F,
   Module &M) {
   OP *op = M.GetDxilModule().GetOP();
@@ -1515,6 +1707,85 @@ void DxilTranslateRawBuffer::ReplaceRawBufferStore(Function *F,
   }
 }
 
+void DxilTranslateRawBuffer::ReplaceRawBufferStore64Bit(Function *F, Type *ETy,
+                                                        Module &M) {
+  OP *hlslOP = M.GetDxilModule().GetOP();
+  Function *newFunction = hlslOP->GetOpFunc(hlsl::DXIL::OpCode::RawBufferStore,
+                                            Type::getInt32Ty(M.getContext()));
+  for (auto U = F->user_begin(), E = F->user_end(); U != E;) {
+    User *user = *(U++);
+    if (CallInst *CI = dyn_cast<CallInst>(user)) {
+      IRBuilder<> Builder(CI);
+      SmallVector<Value *, 4> args(CI->arg_operands());
+      Value *vals[4] = {
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal0OpIdx),
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal1OpIdx),
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal2OpIdx),
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreVal3OpIdx)};
+      ConstantInt *cMask = cast<ConstantInt>(
+          CI->getArgOperand(DXIL::OperandIndex::kRawBufferStoreMaskOpIdx));
+      Value *undefI32 = UndefValue::get(Builder.getInt32Ty());
+      Value *vals32[8] = {undefI32, undefI32, undefI32, undefI32,
+                          undefI32, undefI32, undefI32, undefI32};
+
+      unsigned maskLo = 0;
+      unsigned maskHi = 0;
+      unsigned size = 0;
+      unsigned mask = cMask->getLimitedValue();
+      switch (mask) {
+      case 1:
+        maskLo = 3;
+        size = 1;
+        break;
+      case 3:
+        maskLo = 15;
+        size = 2;
+        break;
+      case 7:
+        maskLo = 15;
+        maskHi = 3;
+        size = 3;
+        break;
+      case 15:
+        maskLo = 15;
+        maskHi = 15;
+        size = 4;
+        break;
+      default:
+        DXASSERT(0, "invalid mask");
+      }
+
+      Split64bitValForStore(ETy, vals, size, vals32, hlslOP, Builder);
+      args[DXIL::OperandIndex::kRawBufferStoreMaskOpIdx] =
+          Builder.getInt8(maskLo);
+      args[DXIL::OperandIndex::kRawBufferStoreVal0OpIdx] = vals32[0];
+      args[DXIL::OperandIndex::kRawBufferStoreVal1OpIdx] = vals32[1];
+      args[DXIL::OperandIndex::kRawBufferStoreVal2OpIdx] = vals32[2];
+      args[DXIL::OperandIndex::kRawBufferStoreVal3OpIdx] = vals32[3];
+
+      Builder.CreateCall(newFunction, args);
+
+      if (maskHi) {
+        Value *offset = args[DXIL::OperandIndex::kRawBufferStoreElementOffsetOpIdx];
+        // Advance the offset by 4*4 bytes to store the upper components.
+        offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
+        args[DXIL::OperandIndex::kRawBufferStoreElementOffsetOpIdx] = offset;
+        args[DXIL::OperandIndex::kRawBufferStoreMaskOpIdx] =
+            Builder.getInt8(maskHi);
+        args[DXIL::OperandIndex::kRawBufferStoreVal0OpIdx] = vals32[4];
+        args[DXIL::OperandIndex::kRawBufferStoreVal1OpIdx] = vals32[5];
+        args[DXIL::OperandIndex::kRawBufferStoreVal2OpIdx] = vals32[6];
+        args[DXIL::OperandIndex::kRawBufferStoreVal3OpIdx] = vals32[7];
+
+        Builder.CreateCall(newFunction, args);
+      }
+      CI->eraseFromParent();
+    } else {
+      DXASSERT(false, "function can only be used with call instructions.");
+    }
+  }
+}
+
 void DxilTranslateRawBuffer::ReplaceMinPrecisionRawBufferLoad(Function *F,
                                                               Module &M) {
   OP *Op = M.GetDxilModule().GetOP();

+ 2 - 2
lib/HLSL/DxilOperations.cpp

@@ -262,8 +262,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
   {  OC::ViewID,                  "ViewID",                   OCC::ViewID,                   "viewID",                     false, false, false, false, false, false, false,  true, false, false, false, Attribute::ReadNone, },
 
   // Resources                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,  obj,  function attribute
-  {  OC::RawBufferLoad,           "RawBufferLoad",            OCC::RawBufferLoad,            "rawBufferLoad",              false,  true,  true, false, false, false,  true,  true, false, false, false, Attribute::ReadOnly, },
-  {  OC::RawBufferStore,          "RawBufferStore",           OCC::RawBufferStore,           "rawBufferStore",             false,  true,  true, false, false, false,  true,  true, false, false, false, Attribute::None,     },
+  {  OC::RawBufferLoad,           "RawBufferLoad",            OCC::RawBufferLoad,            "rawBufferLoad",              false,  true,  true,  true, false, false,  true,  true,  true, false, false, Attribute::ReadOnly, },
+  {  OC::RawBufferStore,          "RawBufferStore",           OCC::RawBufferStore,           "rawBufferStore",             false,  true,  true,  true, false, false,  true,  true,  true, false, false, Attribute::None,     },
 
   // Raytracing object space uint System Values                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,  obj,  function attribute
   {  OC::InstanceID,              "InstanceID",               OCC::InstanceID,               "instanceID",                 false, false, false, false, false, false, false,  true, false, false, false, Attribute::ReadNone, },

+ 4 - 0
lib/HLSL/DxilValidation.cpp

@@ -2583,6 +2583,7 @@ static bool IsLLVMInstructionAllowedForLib(Instruction &I, ValidationContext &Va
   switch (I.getOpcode()) {
   case Instruction::InsertElement:
   case Instruction::ExtractElement:
+  case Instruction::ShuffleVector:
     return true;
   case Instruction::Unreachable:
     if (Instruction *Prev = I.getPrevNode()) {
@@ -2676,6 +2677,9 @@ static void ValidateFunctionBody(Function *F, ValidationContext &ValCtx) {
           if (InsertElementInst *InsertInst = dyn_cast<InsertElementInst>(&I)) {
             legalUndef = op == I.getOperand(0);
           }
+          if (ShuffleVectorInst *Shuf = dyn_cast<ShuffleVectorInst>(&I)) {
+            legalUndef = op == I.getOperand(1);
+          }
 
           if (!legalUndef)
             ValCtx.EmitInstrError(&I,

+ 61 - 174
lib/HLSL/HLOperationLower.cpp

@@ -3122,42 +3122,25 @@ static uint8_t GetRawBufferMaskFromIOP(IntrinsicOp IOP, hlsl::OP *OP) {
 }
 
 static Constant *GetRawBufferMaskForETy(Type *Ty, unsigned NumComponents, hlsl::OP *OP) {
-  Type *ETy = Ty->getScalarType();
-  bool is64 = ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
   unsigned mask = 0;
-  if (is64) {
-    switch (NumComponents) {
-    case 0:
-      break;
-    case 1:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
-      break;
-    case 2:
-      mask = DXIL::kCompMask_All;
-      break;
-    default:
-      DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
-    }
-  }
-  else {
-    switch (NumComponents) {
-    case 0:
-      break;
-    case 1:
-      mask = DXIL::kCompMask_X;
-      break;
-    case 2:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
-      break;
-    case 3:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y | DXIL::kCompMask_Z;
-      break;
-    case 4:
-      mask = DXIL::kCompMask_All;
-      break;
-    default:
-      DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
-    }
+
+  switch (NumComponents) {
+  case 0:
+    break;
+  case 1:
+    mask = DXIL::kCompMask_X;
+    break;
+  case 2:
+    mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
+    break;
+  case 3:
+    mask = DXIL::kCompMask_X | DXIL::kCompMask_Y | DXIL::kCompMask_Z;
+    break;
+  case 4:
+    mask = DXIL::kCompMask_All;
+    break;
+  default:
+    DXASSERT(false, "Cannot load more than 4 components.");
   }
   return OP->GetI8Const(mask);
 }
@@ -3200,8 +3183,10 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
     return;
   }
 
+  bool isTyped = opcode == OP::OpCode::TextureLoad ||
+                 RK == DxilResource::Kind::TypedBuffer;
   bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (is64) {
+  if (is64 && isTyped) {
     EltTy = i32Ty;
   }
 
@@ -3278,7 +3263,7 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
       Builder.CreateCall(F, loadArgs, OP->GetOpCodeName(opcode));
 
   Value *retValNew = nullptr;
-  if (!is64) {
+  if (!is64 || !isTyped) {
     retValNew = ScalarizeResRet(Ty, ResRet, Builder);
   } else {
     unsigned size = numComponents;
@@ -3380,13 +3365,16 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     break;
   }
 
+  bool isTyped = opcode == OP::OpCode::TextureStore ||
+                 RK == DxilResource::Kind::TypedBuffer;
+
   Type *i32Ty = Builder.getInt32Ty();
   Type *i64Ty = Builder.getInt64Ty();
   Type *doubleTy = Builder.getDoubleTy();
   Type *EltTy = Ty->getScalarType();
   Constant *Alignment = OP->GetI32Const(OP->GetAllocSizeForType(EltTy));
   bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (is64) {
+  if (is64 && isTyped) {
     EltTy = i32Ty;
   }
 
@@ -3434,8 +3422,6 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   }
 
   // values
-  bool isTyped = opcode == OP::OpCode::TextureStore ||
-                 RK == DxilResource::Kind::TypedBuffer;
   uint8_t mask = 0;
   if (Ty->isVectorTy()) {
     unsigned vecSize = Ty->getVectorNumElements();
@@ -3470,7 +3456,7 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     }
   }
 
-  if (is64) {
+  if (is64 && isTyped) {
     unsigned size = 1;
     if (Ty->isVectorTy()) {
       size = Ty->getVectorNumElements();
@@ -5736,57 +5722,23 @@ void GenerateStructBufLd(Value *handle, Value *bufIdx, Value *offset,
   DXASSERT(resultElts.size() <= 4,
            "buffer load cannot load more than 4 values");
 
+  Function *dxilF = OP->GetOpFunc(opcode, EltTy);
+  Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents, OP);
+  Value *Args[] = {OP->GetU32Const((unsigned)opcode),
+                   handle,
+                   bufIdx,
+                   offset,
+                   mask,
+                   alignment};
+  Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
 
-  Type *i64Ty = Builder.getInt64Ty();
-  Type *doubleTy = Builder.getDoubleTy();
-  bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-
-  if (!is64) {
-    Function *dxilF = OP->GetOpFunc(opcode, EltTy);
-    Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents, OP);
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode), handle, bufIdx, offset, mask, alignment};
-    Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-
-    for (unsigned i = 0; i < resultElts.size(); i++) {
-      resultElts[i] = Builder.CreateExtractValue(Ld, i);
-    }
-
-    // status
-    UpdateStatus(Ld, status, Builder, OP);
-    return;
-  } else {
-    // 64 bit.
-    Function *dxilF = OP->GetOpFunc(opcode, Builder.getInt32Ty());
-    Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents < 2 ? NumComponents : 2, OP);
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode), handle, bufIdx, offset, mask, alignment};
-    Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-    Value *resultElts32[8];
-    unsigned size = resultElts.size();
-    unsigned eltBase = 0;
-    for (unsigned i = 0; i < size; i++) {
-      if (i == 2) {
-        // Update offset 4 by 4 bytes.
-        Args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
-            Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-        // Update Mask
-        Args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
-          GetRawBufferMaskForETy(EltTy, NumComponents < 3 ? 0 : NumComponents - 2, OP);
-        Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-        eltBase = 4;
-      }
-      unsigned resBase = 2 * i;
-      resultElts32[resBase] = Builder.CreateExtractValue(Ld, resBase - eltBase);
-      resultElts32[resBase + 1] =
-          Builder.CreateExtractValue(Ld, resBase + 1 - eltBase);
-    }
-
-    Make64bitResultForLoad(EltTy, resultElts32, size, resultElts, OP, Builder);
-
-    // status
-    UpdateStatus(Ld, status, Builder, OP);
-
-    return;
+  for (unsigned i = 0; i < resultElts.size(); i++) {
+    resultElts[i] = Builder.CreateExtractValue(Ld, i);
   }
+
+  // status
+  UpdateStatus(Ld, status, Builder, OP);
+  return;
 }
 
 void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
@@ -5794,85 +5746,19 @@ void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
                          ArrayRef<Value *> vals, uint8_t mask, Constant *alignment) {
   OP::OpCode opcode = OP::OpCode::RawBufferStore;
   DXASSERT(vals.size() == 4, "buffer store need 4 values");
-  Type *i64Ty = Builder.getInt64Ty();
-  Type *doubleTy = Builder.getDoubleTy();
-  bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (!is64) {
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                     handle,
-                     bufIdx,
-                     offset,
-                     vals[0],
-                     vals[1],
-                     vals[2],
-                     vals[3],
-                     OP->GetU8Const(mask),
-                     alignment};
-    Function *dxilF = OP->GetOpFunc(opcode, EltTy);
-    Builder.CreateCall(dxilF, Args);
-  } else {
-    Type *i32Ty = Builder.getInt32Ty();
-    Function *dxilF = OP->GetOpFunc(opcode, i32Ty);
-
-    Value *undefI32 = UndefValue::get(i32Ty);
-    Value *vals32[8] = {undefI32, undefI32, undefI32, undefI32,
-                        undefI32, undefI32, undefI32, undefI32};
-
-    unsigned maskLo = 0;
-    unsigned maskHi = 0;
-    unsigned size = 0;
-    switch (mask) {
-    case 1:
-      maskLo = 3;
-      size = 1;
-      break;
-    case 3:
-      maskLo = 15;
-      size = 2;
-      break;
-    case 7:
-      maskLo = 15;
-      maskHi = 3;
-      size = 3;
-      break;
-    case 15:
-      maskLo = 15;
-      maskHi = 15;
-      size = 4;
-      break;
-    default:
-      DXASSERT(0, "invalid mask");
-    }
 
-    Split64bitValForStore(EltTy, vals, size, vals32, OP, Builder);
-
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                     handle,
-                     bufIdx,
-                     offset,
-                     vals32[0],
-                     vals32[1],
-                     vals32[2],
-                     vals32[3],
-                     OP->GetU8Const(maskLo),
-                     alignment};
-    Builder.CreateCall(dxilF, Args);
-    if (maskHi) {
-      // Update offset 4 by 4 bytes.
-      offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-      Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                       handle,
-                       bufIdx,
-                       offset,
-                       vals32[4],
-                       vals32[5],
-                       vals32[6],
-                       vals32[7],
-                       OP->GetU8Const(maskHi),
-                       alignment};
-      Builder.CreateCall(dxilF, Args);
-    }
-  }
+  Value *Args[] = {OP->GetU32Const((unsigned)opcode),
+                   handle,
+                   bufIdx,
+                   offset,
+                   vals[0],
+                   vals[1],
+                   vals[2],
+                   vals[3],
+                   OP->GetU8Const(mask),
+                   alignment};
+  Function *dxilF = OP->GetOpFunc(opcode, EltTy);
+  Builder.CreateCall(dxilF, Args);
 }
 
 Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
@@ -5881,7 +5767,8 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                                bool colMajor, const DataLayout &DL) {
   unsigned col, row;
   Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
-  Constant* alignment = OP->GetI32Const(DL.getTypeAllocSize(EltTy));
+  unsigned EltSize = DL.getTypeAllocSize(EltTy);
+  Constant* alignment = OP->GetI32Const(EltSize);
 
   Value *offset = baseOffset;
   if (baseOffset == nullptr)
@@ -5896,7 +5783,7 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     GenerateStructBufLd(handle, bufIdx, offset, status, EltTy, ResultElts, OP, Builder, 3, alignment);
     for (unsigned i = 0; i < rest; i++)
       elts[i] = ResultElts[i];
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * rest));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(EltSize * rest));
   }
 
   for (unsigned i = rest; i < matSize; i += 4) {
@@ -5908,7 +5795,7 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     elts[i + 3] = ResultElts[3];
 
     // Update offset by 4*4bytes.
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * 4));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 
   return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
@@ -5919,7 +5806,8 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                              Value *val, bool colMajor, const DataLayout &DL) {
   unsigned col, row;
   Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
-  Constant *Alignment = OP->GetI32Const(DL.getTypeAllocSize(EltTy));
+  unsigned EltSize = DL.getTypeAllocSize(EltTy);
+  Constant *Alignment = OP->GetI32Const(EltSize);
   Value *offset = baseOffset;
   if (baseOffset == nullptr)
     offset = OP->GetU32Const(0);
@@ -5955,7 +5843,7 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                         {elts[i], elts[i + 1], elts[i + 2], elts[i + 3]}, mask,
                         Alignment);
     // Update offset by 4*4bytes.
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * 4));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 }
 
@@ -6214,7 +6102,6 @@ void TranslateStructBufSubscriptUser(Instruction *user, Value *handle,
       }
       userCall->eraseFromParent();
     } else if (group == HLOpcodeGroup::HLMatLoadStore)
-      // TODO: support 64 bit.
       TranslateStructBufMatLdSt(userCall, handle, OP, status, bufIdx,
                                 baseOffset, DL);
     else if (group == HLOpcodeGroup::HLSubscript) {

+ 41 - 14
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -763,13 +763,16 @@ MDNode *CGMSHLSLRuntime::GetOrAddResTypeMD(QualType resTy) {
   }
 }
 
-void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
-    DxilFieldAnnotation &fieldAnnotation, QualType fieldTy,
-    bool bDefaultRowMajor) {
-  QualType Ty = fieldTy;
-  if (Ty->isReferenceType())
-    Ty = Ty.getNonReferenceType();
+namespace {
+MatrixOrientation GetMatrixMajor(QualType Ty, bool bDefaultRowMajor) {
+  DXASSERT(hlsl::IsHLSLMatType(Ty), "");
+  bool bIsRowMajor = bDefaultRowMajor;
+  HasHLSLMatOrientation(Ty, &bIsRowMajor);
+  return bIsRowMajor ? MatrixOrientation::RowMajor
+                          : MatrixOrientation::ColumnMajor;
+}
 
+QualType GetArrayEltType(QualType Ty) {
   // Get element type.
   if (Ty->isArrayType()) {
     while (isa<clang::ArrayType>(Ty)) {
@@ -777,15 +780,26 @@ void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
       Ty = ATy->getElementType();
     }
   }
+  return Ty;
+}
+
+} // namespace
+
+void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
+    DxilFieldAnnotation &fieldAnnotation, QualType fieldTy,
+    bool bDefaultRowMajor) {
+  QualType Ty = fieldTy;
+  if (Ty->isReferenceType())
+    Ty = Ty.getNonReferenceType();
+
+  // Get element type.
+  Ty = GetArrayEltType(Ty);
 
   QualType EltTy = Ty;
   if (hlsl::IsHLSLMatType(Ty)) {
     DxilMatrixAnnotation Matrix;
-    bool bRowMajor = bDefaultRowMajor;
-    HasHLSLMatOrientation(Ty, &bRowMajor);
-    Matrix.Orientation = bRowMajor ? MatrixOrientation::RowMajor
-                                   : MatrixOrientation::ColumnMajor;
 
+    Matrix.Orientation = GetMatrixMajor(Ty, bDefaultRowMajor);
 
     hlsl::GetHLSLMatRowColCount(Ty, Matrix.Rows, Matrix.Cols);
     fieldAnnotation.SetMatrixAnnotation(Matrix);
@@ -6269,11 +6283,24 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF,
     clang::QualType DestTy) {
   llvm::Type *SrcPtrTy = SrcPtr->getType()->getPointerElementType();
   llvm::Type *DestPtrTy = DestPtr->getType()->getPointerElementType();
+
+  bool bDefaultRowMajor = m_pHLModule->GetHLOptions().bDefaultRowMajor;
   if (SrcPtrTy == DestPtrTy) {
-    // Memcpy if type is match.
-    unsigned size = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);
-    CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, size, 1);
-    return;
+    bool bMatArrayRotate = false;
+    if (HLMatrixLower::IsMatrixArrayPointer(SrcPtr->getType())) {
+      QualType SrcEltTy = GetArrayEltType(SrcTy);
+      QualType DestEltTy = GetArrayEltType(DestTy);
+      if (GetMatrixMajor(SrcEltTy, bDefaultRowMajor) !=
+          GetMatrixMajor(DestEltTy, bDefaultRowMajor)) {
+        bMatArrayRotate = true;
+      }
+    }
+    if (!bMatArrayRotate) {
+      // Memcpy if type is match.
+      unsigned size = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);
+      CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, size, 1);
+      return;
+    }
   } else if (HLModule::IsHLSLObjectType(dxilutil::GetArrayEltTy(SrcPtrTy)) &&
              HLModule::IsHLSLObjectType(dxilutil::GetArrayEltTy(DestPtrTy))) {
     unsigned sizeSrc = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);

+ 56 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry4.hlsl

@@ -0,0 +1,56 @@
+// RUN: %dxc -T lib_6_3   %s | FileCheck %s
+
+// Make sure a major change on a function call works and the offset is correct.
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i8 15, i32 4)
+
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.3.4*
+// CHECK: [[RET:%.*]] = call %class.matrix.float.4.3 @"\01?mat_test
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.4.3*
+// CHECK: store %class.matrix.float.4.3 [[RET]], %class.matrix.float.4.3* 
+
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, i8 15, i32 4)
+// CHECK: dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80, i8 15, i32 4)
+
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.3.4*
+// CHECK: [[RET2:%.*]] = call %class.matrix.float.4.3 @"\01?mat_test
+// CHECK: bitcast <12 x float>* {{.*}} to %class.matrix.float.4.3*
+// CHECK: store %class.matrix.float.4.3 [[RET2]], %class.matrix.float.4.3* 
+
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+// CHECK: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i8 15, i32 4)
+
+
+
+float4x3 mat_test(inout float3x4 m);
+
+
+cbuffer A {
+column_major float3x4 cm;
+row_major    float3x4 rm;
+column_major float3x2 cma[2];
+row_major    float3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major float3x4 cm;
+  row_major    float3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  column_major float3x4 cm;
+  row_major    float3x4 rm;
+  float4x3 tm = mat_test(uav0[i.x].cm);
+  row_major float4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 44 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry5.hlsl

@@ -0,0 +1,44 @@
+// RUN: %dxc -T lib_6_3 -enable-16bit-types  %s | FileCheck %s
+
+// Make sure a half matrix has a correct offset.
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 8, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, i8 15, i32 2)
+
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 8, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 24, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i8 15, i32 2)
+// CHECK:rawBufferLoad.f16(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 40, i8 15, i32 2)
+
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 24, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+// CHECK:rawBufferStore.f16(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 40, half {{.*}}, half {{.*}}, half {{.*}}, half {{.*}}, i8 15, i32 2)
+
+half4x3 mat_test(inout half3x4 m);
+
+
+cbuffer A {
+column_major half3x4 cm;
+row_major    half3x4 rm;
+column_major half3x2 cma[2];
+row_major    half3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major half3x4 cm;
+  row_major    half3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+half3 mainx() : SV_Target {
+  column_major half3x4 cm;
+  row_major    half3x4 rm;
+  half4x3 tm = mat_test(uav0[i.x].cm);
+  half4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 44 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry6.hlsl

@@ -0,0 +1,44 @@
+// RUN: %dxc -T lib_6_3  %s | FileCheck %s
+
+// Make sure a double matrix has a correct offset.
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, i8 15, i32 8)
+
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128, i8 15, i32 8)
+// CHECK:rawBufferLoad.f64(i32 139, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160, i8 15, i32 8)
+
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+// CHECK:rawBufferStore.f64(i32 140, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160, double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}}, i8 15, i32 8)
+
+double4x3 mat_test(inout double3x4 m);
+
+
+cbuffer A {
+column_major double3x4 cm;
+row_major    double3x4 rm;
+column_major double3x2 cma[2];
+row_major    double3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;
+  double4x3 tm = mat_test(uav0[i.x].cm);
+  double4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 85 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry7.hlsl

@@ -0,0 +1,85 @@
+// RUN: %dxc -T lib_6_1  %s | FileCheck %s
+
+// Make sure a double matrix is flattened and has a correct offset.
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48)
+// CHECK: makeDouble.f64(i32 101,
+
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 0, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 16, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 32, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 48, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 64, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 80, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 112)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 144)
+// CHECK: makeDouble.f64(i32 101,
+
+
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160)
+// CHECK:dx.op.bufferLoad.i32(i32 68, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 176)
+// CHECK: makeDouble.f64(i32 101,
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 96, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 112, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 128, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 144, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+// CHECK: splitDouble.f64(i32 102,
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 160, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+// CHECK: dx.op.bufferStore.i32(i32 69, %dx.types.Handle {{.*}}, i32 {{.*}}, i32 176, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i8 15)
+
+
+
+double4x3 mat_test(inout double3x4 m);
+
+
+cbuffer A {
+column_major double3x4 cm;
+row_major    double3x4 rm;
+column_major double3x2 cma[2];
+row_major    double3x3 rma[2];
+uint3 i;
+};
+
+struct matMajor {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;  
+};
+RWStructuredBuffer<matMajor> uav0;
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  column_major double3x4 cm;
+  row_major    double3x4 rm;
+  double4x3 tm = mat_test(uav0[i.x].cm);
+  double4x3 tm2 = mat_test(uav0[i.y].rm);
+  return tm[i.x] + tm2[i.y];
+}

+ 28 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_entry8.hlsl

@@ -0,0 +1,28 @@
+// RUN: %dxc -T lib_6_3  %s | FileCheck %s
+
+// Make sure a major change on a matrix array works.
+// CHECK: bitcast [12 x float]* %4 to [2 x %class.matrix.float.3.2]*
+// CHECK: bitcast [12 x float]* %4 to [2 x %class.matrix.float.3.2]*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+
+
+struct MA {
+  float2x3 ma[2];
+};
+
+MA mat_test2( float3x2 m[2], int idx);
+
+cbuffer A {
+column_major float3x2 cma[2];
+row_major    float3x2 rma[2];
+uint3 i;
+};
+
+
+[shader("pixel")]
+float3 mainx() : SV_Target {
+  MA ma = mat_test2(cma, i.x);
+  MA ma2 = mat_test2(rma, i.y);
+  return ma.ma[i.z][i.y] + ma2.ma[i.z][i.y];
+}

+ 16 - 0
tools/clang/test/CodeGenHLSL/quick-test/lib_mat_param8.hlsl

@@ -0,0 +1,16 @@
+// RUN: %dxc -T lib_6_3  %s | FileCheck %s
+
+// Make sure return matrix struct works.
+// CHECK: bitcast %class.matrix.float.3.2* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.3.2* {{.*}} to <6 x float>*
+// CHECK: bitcast %class.matrix.float.2.3* {{.*}} to <6 x float>*
+
+struct MA {
+  float2x3 ma[2];
+};
+
+MA mat_test2( float3x2 m[2], int idx) {
+  MA ma = { { transpose(m[0]), transpose(m[1])}};
+  return ma;
+}

+ 1 - 1
tools/clang/test/CodeGenHLSL/raw_buf4.hlsl

@@ -28,7 +28,7 @@
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float undef, float undef, i8 3, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float undef, i8 7, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 4)
-// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 3, i32 8)
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 8)
 // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 15, i32 8)
 
 ByteAddressBuffer buf1;

+ 1 - 1
tools/clang/test/CodeGenHLSL/raw_buf5.hlsl

@@ -41,7 +41,7 @@
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float undef, float undef, i8 3, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float undef, i8 7, i32 4)
 // CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 4)
-// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 3, i32 8)
+// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 8)
 // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buf2_UAV_rawbuf, i32 1, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i8 15, i32 8)
 
 ByteAddressBuffer buf1;

+ 2 - 2
utils/hct/hctdb.py

@@ -1154,7 +1154,7 @@ class db_dxil(object):
         # End of DXIL 1.1 opcodes.
         self.set_op_count_for_version(1, 1, next_op_idx)
 
-        self.add_dxil_op("RawBufferLoad", next_op_idx, "RawBufferLoad", "reads from a raw buffer and structured buffer", "hfwi", "ro", [
+        self.add_dxil_op("RawBufferLoad", next_op_idx, "RawBufferLoad", "reads from a raw buffer and structured buffer", "hfwidl", "ro", [
             db_dxil_param(0, "$r", "", "the loaded value"),
             db_dxil_param(2, "res", "srv", "handle of TypedBuffer SRV to sample"),
             db_dxil_param(3, "i32", "index", "element index for StructuredBuffer, or byte offset for ByteAddressBuffer"),
@@ -1163,7 +1163,7 @@ class db_dxil(object):
             db_dxil_param(6, "i32", "alignment", "relative load access alignment", is_const=True)])
         next_op_idx += 1
 
-        self.add_dxil_op("RawBufferStore", next_op_idx, "RawBufferStore", "writes to a RWByteAddressBuffer or RWStructuredBuffer", "hfwi", "", [
+        self.add_dxil_op("RawBufferStore", next_op_idx, "RawBufferStore", "writes to a RWByteAddressBuffer or RWStructuredBuffer", "hfwidl", "", [
             db_dxil_param(0, "v", "", ""),
             db_dxil_param(2, "res", "uav", "handle of UAV to store to"),
             db_dxil_param(3, "i32", "index", "element index for StructuredBuffer, or byte offset for ByteAddressBuffer"),