Browse Source

Fix issues with matrix (append|consume) for (Append|Consume)StructuredBuffer (#3460)

Vishal Sharma 4 years ago
parent
commit
e2116a0eeb

+ 126 - 2
tools/clang/lib/CodeGen/CGHLSLMSFinishCodeGen.cpp

@@ -20,6 +20,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
@@ -292,6 +293,35 @@ void ReplaceBoolVectorSubscript(Function *F) {
   }
 }
 
+// Looks up the field annotation that describes the matrix element type of a
+// resource templated on a matrix type.
+// Example:-
+// AppendStructuredBuffer<float4x4> abuf;
+// Returns the field annotation of the float4x4 in the above decl.
+//
+//   F                - function whose call sites (users) are inspected.
+//   argOpIdx         - index of the call operand that holds the resource.
+//   matAnnotationIdx - index of the field to fetch from the struct annotation
+//                      of the resource type.
+//
+// Returns the first field annotation found that carries a matrix annotation,
+// or nullptr when no call site provides one.
+static DxilFieldAnnotation* GetTemplatedResMatAnnotation(Function *F, unsigned argOpIdx,
+  unsigned matAnnotationIdx) {
+  for (User* U : F->users()) {
+    CallInst* CI = dyn_cast<CallInst>(U);
+    if (!CI || argOpIdx >= CI->getNumArgOperands())
+      continue;
+
+    // The resource may be passed by pointer; look through it to the struct.
+    Type* resArgTy = CI->getArgOperand(argOpIdx)->getType();
+    if (resArgTy->isPointerTy())
+      resArgTy = resArgTy->getPointerElementType();
+
+    StructType* resStructTy = dyn_cast<StructType>(resArgTy);
+    if (!resStructTy)
+      continue;
+
+    DxilTypeSystem& TS = F->getParent()->GetHLModule().GetTypeSystem();
+    auto *SA = TS.GetStructAnnotation(resStructTy);
+    // Skip call sites whose struct has no annotation or too few fields instead
+    // of dereferencing a null pointer / indexing out of range.
+    if (!SA || matAnnotationIdx >= SA->GetNumFields())
+      continue;
+
+    DxilFieldAnnotation& FA = SA->GetFieldAnnotation(matAnnotationIdx);
+    if (FA.HasMatrixAnnotation())
+      return &FA;
+  }
+
+  return nullptr;
+}
+
 // Add function body for intrinsic if possible.
 Function *CreateOpFunction(llvm::Module &M, Function *F,
                            llvm::FunctionType *funcTy, HLOpcodeGroup group,
@@ -370,6 +400,10 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
       Value *subscript =
           Builder.CreateCall(subscriptFunc, {subscriptOpArg, thisArg, counter});
 
+      constexpr unsigned kArgIdx = 0;
+      constexpr unsigned kMatAnnotationIdx = 0;
+      DxilFieldAnnotation* MatAnnotation = HLMatrixType::isa(valTy) ? 
+        GetTemplatedResMatAnnotation(F, kArgIdx, kMatAnnotationIdx) : nullptr;
       if (bAppend) {
         Argument *valArg = argIter;
         // Buf[counter] = val;
@@ -377,8 +411,53 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
           unsigned size = M.getDataLayout().getTypeAllocSize(
               subscript->getType()->getPointerElementType());
           Builder.CreateMemCpy(subscript, valArg, size, 1);
-        } else {
-          Value *storedVal = valArg;
+        } else if (MatAnnotation) {
+          // If the to-be-stored value is a matrix then we need to generate
+          // an HL matrix store which is then handled appropriately in HLMatrixLowerPass.
+          bool isRowMajor = MatAnnotation->GetMatrixAnnotation().Orientation == MatrixOrientation::RowMajor;
+          Value* matStoreVal = valArg;
+
+          // The in-reg matrix orientation is always row-major.
+          // If the in-memory matrix orientation is col-major, then we
+          // need to change the orientation to col-major before storing
+          // to memory
+          if (!isRowMajor) {
+            unsigned castOpCode = (unsigned)HLCastOpcode::RowMatrixToColMatrix;
+
+            // Construct signature of the function that is used for converting
+            // orientation of a matrix from row-major to col-major.
+            FunctionType* MatCastFnType = FunctionType::get(
+              matStoreVal->getType(), { Builder.getInt32Ty(), matStoreVal->getType() },
+              /* isVarArg */ false);
+
+            // Create the conversion function.
+            Function* MatCastFn = GetOrCreateHLFunction(
+              M, MatCastFnType, HLOpcodeGroup::HLCast, castOpCode);
+            Value* MatCastOpCode = ConstantInt::get(Builder.getInt32Ty(), castOpCode);
+
+            // Insert call to the conversion function.
+            matStoreVal = Builder.CreateCall(MatCastFn, { MatCastOpCode, matStoreVal });
+          }
+
+          unsigned storeOpCode = isRowMajor ? (unsigned) HLMatLoadStoreOpcode::RowMatStore
+            : (unsigned) HLMatLoadStoreOpcode::ColMatStore;
+
+          // Construct signature of the function that is used for storing
+          // the matrix value to the memory.
+          FunctionType* MatStFnType = FunctionType::get(
+            Builder.getVoidTy(), { Builder.getInt32Ty(), subscriptTy, matStoreVal->getType() },
+            /* isVarArg */ false);
+
+          // Create the matrix store function.
+          Function* MatStFn = GetOrCreateHLFunction(
+            M, MatStFnType, HLOpcodeGroup::HLMatLoadStore, storeOpCode);
+          Value* MatStOpCode = ConstantInt::get(Builder.getInt32Ty(), storeOpCode);
+
+          // Insert call to the matrix store function.
+          Builder.CreateCall(MatStFn, { MatStOpCode, subscript, matStoreVal });
+        }
+        else {
+          Value* storedVal = valArg;
           // Convert to memory representation
           if (isBoolScalarOrVector)
             storedVal = Builder.CreateZExt(
@@ -390,6 +469,51 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
         // return Buf[counter];
         if (valTy->isPointerTy())
           Builder.CreateRet(subscript);
+        else if (MatAnnotation) {
+          // If the to-be-loaded value is a matrix then we need to generate
+          // an HL matrix load which is then handled appropriately in HLMatrixLowerPass.
+          bool isRowMajor = MatAnnotation->GetMatrixAnnotation().Orientation == MatrixOrientation::RowMajor;
+
+          unsigned loadOpCode = isRowMajor ? (unsigned)HLMatLoadStoreOpcode::RowMatLoad
+            : (unsigned)HLMatLoadStoreOpcode::ColMatLoad;
+
+          // Construct signature of the function that is used for loading
+          // the matrix value from the memory.
+          FunctionType* MatLdFnType = FunctionType::get(valTy, { Builder.getInt32Ty(), subscriptTy },
+            /* isVarArg */ false);
+
+          // Create the matrix load function.
+          Function* MatLdFn = GetOrCreateHLFunction(
+            M, MatLdFnType, HLOpcodeGroup::HLMatLoadStore, loadOpCode);
+          Value* MatStOpCode = ConstantInt::get(Builder.getInt32Ty(), loadOpCode);
+
+          // Insert call to the matrix load function.
+          Value *matLdVal = Builder.CreateCall(MatLdFn, { MatStOpCode, subscript });
+
+          // The in-reg matrix orientation is always row-major.
+          // If the in-memory matrix orientation is col-major, then we
+          // need to change the orientation to row-major after loading
+          // from memory.
+          if (!isRowMajor) {
+            unsigned castOpCode = (unsigned)HLCastOpcode::ColMatrixToRowMatrix;
+
+            // Construct signature of the function that is used for converting
+            // orientation of a matrix from col-major to row-major.
+            FunctionType* MatCastFnType = FunctionType::get(
+              matLdVal->getType(), { Builder.getInt32Ty(), matLdVal->getType() },
+              /* isVarArg */ false);
+
+            // Create the conversion function.
+            Function* MatCastFn = GetOrCreateHLFunction(
+              M, MatCastFnType, HLOpcodeGroup::HLCast, castOpCode);
+            Value* MatCastOpCode = ConstantInt::get(Builder.getInt32Ty(), castOpCode);
+
+            // Insert call to the conversion function.
+            matLdVal = Builder.CreateCall(MatCastFn, { MatCastOpCode, matLdVal });
+
+          }
+          Builder.CreateRet(matLdVal);
+        }
         else {
           Value *retVal = Builder.CreateLoad(subscript);
           // Convert to register representation

+ 99 - 0
tools/clang/test/HLSLFileCheck/hlsl/objects/AppendStructuredBuffer/append_mat_col_major.hlsl

@@ -0,0 +1,99 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool1x1 -DMAT1x1=1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float1x2 -DMAT1x2=1 %s | FileCheck %s -check-prefix=CHK_MAT1x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int2x1 -DMAT2x1=1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint2x2 -DMAT2x2=1 %s | FileCheck %s -check-prefix=CHK_MAT2x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint16_t2x3 -DMAT2x3=1 %s | FileCheck %s -check-prefix=CHK_MAT2x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int16_t3x2 -DMAT3x2=1 %s | FileCheck %s -check-prefix=CHK_MAT3x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float16_t3x3 -DMAT3x3=1 %s | FileCheck %s -check-prefix=CHK_MAT3x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float3x4 -DMAT3x4=1 %s | FileCheck %s -check-prefix=CHK_MAT3x4
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool4x3 -DMAT4x3=1 %s | FileCheck %s -check-prefix=CHK_MAT4x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float4x4 -DMAT4x4=1 %s | FileCheck %s -check-prefix=CHK_MAT4x4
+
+AppendStructuredBuffer<TY> buf;
+
+void main()
+{
+    // Build one constant matrix m; each RUN line defines exactly one MATrxc macro (and the matching TY).
+#ifdef MAT1x1
+    bool1x1 m = bool1x1(1);
+#endif    
+
+#ifdef MAT1x2
+    float1x2 m = float1x2(1, 2);
+#endif
+
+#ifdef MAT2x1
+    int2x1 m = int2x1(1, 2);
+#endif
+
+#ifdef MAT2x2
+    uint2x2 m = uint2x2(1, 2, 3, 4);
+#endif
+
+#ifdef MAT2x3
+    uint16_t2x3 m = uint16_t2x3(1, 2, 3, 4, 5, 6);
+#endif
+
+#ifdef MAT3x2
+    int16_t3x2 m = int16_t3x2(1, 2, 3, 4, 5, 6);
+#endif
+
+#ifdef MAT3x3
+    float16_t3x3 m = float16_t3x3(1, 2, 3, 4, 5, 6, 7, 8, 9);
+#endif
+
+#ifdef MAT3x4
+    float3x4 m = float3x4(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+#endif
+
+#ifdef MAT4x3
+    bool4x3 m = bool4x3(1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1);
+#endif
+
+#ifdef MAT4x4                          
+    float4x4 m = float4x4(float4(1, 2, 3, 4), float4(5, 6, 7, 8), float4(9, 10, 11, 12), float4(13, 14, 15, 16));  
+#endif
+    // Expected output per type: a counter update, then raw buffer stores listing the elements column by column (-Zpc).
+// CHK_MAT1x1: dx.op.bufferUpdateCounter
+// CHK_MAT1x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i8 1, i32 4)
+
+// CHK_MAT1x2: dx.op.bufferUpdateCounter
+// CHK_MAT1x2: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i8 3, i32 4)
+
+// CHK_MAT2x1: dx.op.bufferUpdateCounter
+// CHK_MAT2x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 2, i32 undef, i32 undef, i8 3, i32 4)
+
+// CHK_MAT2x2: dx.op.bufferUpdateCounter
+// CHK_MAT2x2: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 3, i32 2, i32 4, i8 15, i32 4)
+
+// CHK_MAT2x3: dx.op.bufferUpdateCounter
+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 4, i16 2, i16 5, i8 15, i32 2)
+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 3, i16 6, i16 undef, i16 undef, i8 3, i32 2)
+
+// CHK_MAT3x2: dx.op.bufferUpdateCounter
+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 3, i16 5, i16 2, i8 15, i32 2)
+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 4, i16 6, i16 undef, i16 undef, i8 3, i32 2)
+
+// CHK_MAT3x3: dx.op.bufferUpdateCounter
+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, half 0xH3C00, half 0xH4400, half 0xH4700, half 0xH4000, i8 15, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, half 0xH4500, half 0xH4800, half 0xH4200, half 0xH4600, i8 15, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, half 0xH4880, half undef, half undef, half undef, i8 1, i32 2)
+
+// CHK_MAT3x4: dx.op.bufferUpdateCounter
+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 2.000000e+00, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 6.000000e+00, float 1.000000e+01, float 3.000000e+00, float 7.000000e+00, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 1.100000e+01, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, i8 15, i32 4)
+
+// CHK_MAT4x3: dx.op.bufferUpdateCounter
+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 0, i32 0, i32 1, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, i32 0, i32 1, i32 1, i32 1, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, i32 0, i32 0, i32 1, i32 1, i8 15, i32 4)
+
+// CHK_MAT4x4: dx.op.bufferUpdateCounter  
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 1.300000e+01, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 2.000000e+00, float 6.000000e+00, float 1.000000e+01, float 1.400000e+01, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 3.000000e+00, float 7.000000e+00, float 1.100000e+01, float 1.500000e+01, i8 15, i32 4) 
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 48, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, float 1.600000e+01, i8 15, i32 4)
+  
+    buf.Append(m); // the Append call whose lowering is checked above
+}

+ 99 - 0
tools/clang/test/HLSLFileCheck/hlsl/objects/AppendStructuredBuffer/append_mat_row_major.hlsl

@@ -0,0 +1,99 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool1x1 -DMAT1x1=1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float1x2 -DMAT1x2=1 %s | FileCheck %s -check-prefix=CHK_MAT1x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int2x1 -DMAT2x1=1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint2x2 -DMAT2x2=1 %s | FileCheck %s -check-prefix=CHK_MAT2x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint16_t2x3 -DMAT2x3=1 %s | FileCheck %s -check-prefix=CHK_MAT2x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int16_t3x2 -DMAT3x2=1 %s | FileCheck %s -check-prefix=CHK_MAT3x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float16_t3x3 -DMAT3x3=1 %s | FileCheck %s -check-prefix=CHK_MAT3x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float3x4 -DMAT3x4=1 %s | FileCheck %s -check-prefix=CHK_MAT3x4
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool4x3 -DMAT4x3=1 %s | FileCheck %s -check-prefix=CHK_MAT4x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float4x4 -DMAT4x4=1 %s | FileCheck %s -check-prefix=CHK_MAT4x4
+
+AppendStructuredBuffer<TY> buf;
+
+void main()
+{
+    // Build one constant matrix m; each RUN line defines exactly one MATrxc macro (and the matching TY).
+#ifdef MAT1x1
+    bool1x1 m = bool1x1(1);
+#endif    
+
+#ifdef MAT1x2
+    float1x2 m = float1x2(1, 2);
+#endif
+
+#ifdef MAT2x1
+    int2x1 m = int2x1(1, 2);
+#endif
+
+#ifdef MAT2x2
+    uint2x2 m = uint2x2(1, 2, 3, 4);
+#endif
+
+#ifdef MAT2x3
+    uint16_t2x3 m = uint16_t2x3(1, 2, 3, 4, 5, 6);
+#endif
+
+#ifdef MAT3x2
+    int16_t3x2 m = int16_t3x2(1, 2, 3, 4, 5, 6);
+#endif
+
+#ifdef MAT3x3
+    float16_t3x3 m = float16_t3x3(1, 2, 3, 4, 5, 6, 7, 8, 9);
+#endif
+
+#ifdef MAT3x4
+    float3x4 m = float3x4(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+#endif
+
+#ifdef MAT4x3
+    bool4x3 m = bool4x3(1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1);
+#endif
+
+#ifdef MAT4x4                          
+    float4x4 m = float4x4(float4(1, 2, 3, 4), float4(5, 6, 7, 8), float4(9, 10, 11, 12), float4(13, 14, 15, 16));  
+#endif
+    // Expected output per type: a counter update, then raw buffer stores listing the elements row by row (-Zpr).
+// CHK_MAT1x1: dx.op.bufferUpdateCounter
+// CHK_MAT1x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i8 1, i32 4)
+
+// CHK_MAT1x2: dx.op.bufferUpdateCounter
+// CHK_MAT1x2: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i8 3, i32 4)
+
+// CHK_MAT2x1: dx.op.bufferUpdateCounter
+// CHK_MAT2x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 2, i32 undef, i32 undef, i8 3, i32 4)
+
+// CHK_MAT2x2: dx.op.bufferUpdateCounter
+// CHK_MAT2x2: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 2, i32 3, i32 4, i8 15, i32 4)
+
+// CHK_MAT2x3: dx.op.bufferUpdateCounter
+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 2, i16 3, i16 4, i8 15, i32 2)
+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 5, i16 6, i16 undef, i16 undef, i8 3, i32 2)
+
+// CHK_MAT3x2: dx.op.bufferUpdateCounter
+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 2, i16 3, i16 4, i8 15, i32 2)
+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 5, i16 6, i16 undef, i16 undef, i8 3, i32 2)
+
+// CHK_MAT3x3: dx.op.bufferUpdateCounter
+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400, i8 15, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, half 0xH4500, half 0xH4600, half 0xH4700, half 0xH4800, i8 15, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, half 0xH4880, half undef, half undef, half undef, i8 1, i32 2)
+
+// CHK_MAT3x4: dx.op.bufferUpdateCounter
+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01, i8 15, i32 4)
+
+// CHK_MAT4x3: dx.op.bufferUpdateCounter
+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 0, i32 0, i32 0, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, i32 1, i32 0, i32 0, i32 1, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, i32 1, i32 1, i32 1, i32 1, i8 15, i32 4)
+
+// CHK_MAT4x4: dx.op.bufferUpdateCounter  
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, i8 15, i32 4)  
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00, i8 15, i32 4)  
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01, i8 15, i32 4)  
+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 48, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01, float 1.600000e+01, i8 15, i32 4)  
+  
+    buf.Append(m); // the Append call whose lowering is checked above
+}

+ 70 - 0
tools/clang/test/HLSLFileCheck/hlsl/objects/ConsumeStructuredBuffer/consume_mat_col_major.hlsl

@@ -0,0 +1,70 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool1x1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float1x2 %s | FileCheck %s -check-prefix=CHK_MAT1x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int2x1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint2x2 %s | FileCheck %s -check-prefix=CHK_MAT2x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint16_t2x3 %s | FileCheck %s -check-prefix=CHK_MAT2x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int16_t3x2 %s | FileCheck %s -check-prefix=CHK_MAT3x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float16_t3x3 %s | FileCheck %s -check-prefix=CHK_MAT3x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float3x4 %s | FileCheck %s -check-prefix=CHK_MAT3x4
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool4x3 %s | FileCheck %s -check-prefix=CHK_MAT4x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float4x4 %s | FileCheck %s -check-prefix=CHK_MAT4x4
+
+ConsumeStructuredBuffer<TY> buf;
+
+TY main() : OUT
+{
+    // Returns one matrix consumed from buf (TY comes from -DTY; built with -Zpc); expected raw buffer loads per type follow.
+// CHK_MAT1x1: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT1x1: i32 %{{.*}}, i32 0, i8 1, i32 4)
+
+// CHK_MAT1x2: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT1x2: i32 %{{.*}}, i32 0, i8 3, i32 4)
+
+// CHK_MAT2x1: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT2x1: i32 %{{.*}}, i32 0, i8 3, i32 4)
+
+// CHK_MAT2x2: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT2x2: i32 %{{.*}}, i32 0, i8 15, i32 4)
+
+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT2x3: i32 %{{.*}}, i32 0, i8 3, i32 2)
+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT2x3: i32 %{{.*}}, i32 4, i8 15, i32 2)
+
+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT3x2: i32 %{{.*}}, i32 0, i8 3, i32 2)
+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT3x2: i32 %{{.*}}, i32 4, i8 15, i32 2)
+
+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
+// CHK_MAT3x3: i32 %{{.*}}, i32 0, i8 1, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
+// CHK_MAT3x3: i32 %{{.*}}, i32 2, i8 15, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
+// CHK_MAT3x3: i32 %{{.*}}, i32 10, i8 15, i32 2)
+
+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT3x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT3x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT3x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
+
+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT4x3: i32 %{{.*}}, i32 0, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT4x3: i32 %{{.*}}, i32 16, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT4x3: i32 %{{.*}}, i32 32, i8 15, i32 4)
+
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 48, i8 15, i32 4)
+  
+    return buf.Consume(); // the Consume call whose lowering is checked above
+}

+ 70 - 0
tools/clang/test/HLSLFileCheck/hlsl/objects/ConsumeStructuredBuffer/consume_mat_row_major.hlsl

@@ -0,0 +1,70 @@
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool1x1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float1x2 %s | FileCheck %s -check-prefix=CHK_MAT1x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int2x1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint2x2 %s | FileCheck %s -check-prefix=CHK_MAT2x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint16_t2x3 %s | FileCheck %s -check-prefix=CHK_MAT2x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int16_t3x2 %s | FileCheck %s -check-prefix=CHK_MAT3x2
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float16_t3x3 %s | FileCheck %s -check-prefix=CHK_MAT3x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float3x4 %s | FileCheck %s -check-prefix=CHK_MAT3x4
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool4x3 %s | FileCheck %s -check-prefix=CHK_MAT4x3
+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float4x4 %s | FileCheck %s -check-prefix=CHK_MAT4x4
+
+ConsumeStructuredBuffer<TY> buf;
+
+TY main() : OUT
+{
+    // Returns one matrix consumed from buf (TY comes from -DTY; built with -Zpr); expected raw buffer loads per type follow.
+// CHK_MAT1x1: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT1x1: i32 %{{.*}}, i32 0, i8 1, i32 4)
+
+// CHK_MAT1x2: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT1x2: i32 %{{.*}}, i32 0, i8 3, i32 4)
+
+// CHK_MAT2x1: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT2x1: i32 %{{.*}}, i32 0, i8 3, i32 4)
+
+// CHK_MAT2x2: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT2x2: i32 %{{.*}}, i32 0, i8 15, i32 4)
+
+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT2x3: i32 %{{.*}}, i32 0, i8 3, i32 2)
+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT2x3: i32 %{{.*}}, i32 4, i8 15, i32 2)
+
+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT3x2: i32 %{{.*}}, i32 0, i8 3, i32 2)
+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
+// CHK_MAT3x2: i32 %{{.*}}, i32 4, i8 15, i32 2)
+
+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
+// CHK_MAT3x3: i32 %{{.*}}, i32 0, i8 1, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
+// CHK_MAT3x3: i32 %{{.*}}, i32 2, i8 15, i32 2)
+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
+// CHK_MAT3x3: i32 %{{.*}}, i32 10, i8 15, i32 2)
+
+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT3x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT3x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT3x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
+
+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT4x3: i32 %{{.*}}, i32 0, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT4x3: i32 %{{.*}}, i32 16, i8 15, i32 4)
+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
+// CHK_MAT4x3: i32 %{{.*}}, i32 32, i8 15, i32 4)
+
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
+// CHK_MAT4x4: i32 %{{.*}}, i32 48, i8 15, i32 4)
+  
+    return buf.Consume(); // the Consume call whose lowering is checked above
+}