4 years ago · e2116a0eeb
--- a/tools/clang/lib/CodeGen/CGHLSLMSFinishCodeGen.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMSFinishCodeGen.cpp
@@ -20,6 +20,7 @@
 
				 #include "llvm/IR/Instructions.h"
			
 
				 #include "llvm/IR/Module.h"
			
 
				 #include "llvm/IR/Type.h"
			
 
				+#include "llvm/IR/DerivedTypes.h"
			
 
				 #include "llvm/Transforms/Utils/Cloning.h"
			
 
				 #include "llvm/Transforms/Utils/ValueMapper.h"
			
 
				 
			
@@ -292,6 +293,35 @@ void ReplaceBoolVectorSubscript(Function *F) {
 
				   }
			
 
				 }
			
 
				 
			
 
				+// Looks up the matrix field annotation for the element type of a templated
			
 
				+// resource, e.g. the float4x4 in "AppendStructuredBuffer<float4x4> abuf;".
			
 
				+// For each call to F, operand argOpIdx is treated as the resource and field
			
 
				+// matAnnotationIdx of its struct annotation is inspected. Returns the first
			
 
				+// such field annotation that carries matrix info, or nullptr if none does.
			
 
				+static DxilFieldAnnotation* GetTemplatedResMatAnnotation(Function *F, unsigned argOpIdx,
			
 
				+  unsigned matAnnotationIdx) {
			
 
				+  for (User* U : F->users()) {
			
 
				+    CallInst* CI = dyn_cast<CallInst>(U);
			
 
				+    if (!CI || argOpIdx >= CI->getNumArgOperands())
			
 
				+      continue;
			
 
				+    Type* resArgTy = CI->getArgOperand(argOpIdx)->getType();
			
 
				+    if (resArgTy->isPointerTy())
			
 
				+      resArgTy = resArgTy->getPointerElementType();
			
 
				+    StructType* resStructTy = dyn_cast<StructType>(resArgTy);
			
 
				+    if (!resStructTy)
			
 
				+      continue;
			
 
				+    DxilTypeSystem& TS = F->getParent()->GetHLModule().GetTypeSystem();
			
 
				+    auto *SA = TS.GetStructAnnotation(resStructTy);
			
 
				+    // Unannotated struct or out-of-range field index: nothing to inspect here.
			
 
				+    if (!SA || matAnnotationIdx >= SA->GetNumFields())
			
 
				+      continue;
			
 
				+    DxilFieldAnnotation& FA = SA->GetFieldAnnotation(matAnnotationIdx);
			
 
				+    if (FA.HasMatrixAnnotation())
			
 
				+      return &FA;
			
 
				+  }
			
 
				+  return nullptr;
			
 
				+}
			
 
				+
			
 
				 // Add function body for intrinsic if possible.
			
 
				 Function *CreateOpFunction(llvm::Module &M, Function *F,
			
 
				                            llvm::FunctionType *funcTy, HLOpcodeGroup group,
			
@@ -370,6 +400,10 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
 
				       Value *subscript =
			
 
				           Builder.CreateCall(subscriptFunc, {subscriptOpArg, thisArg, counter});
			
 
				 
			
 
				+      constexpr unsigned kArgIdx = 0;
			
 
				+      constexpr unsigned kMatAnnotationIdx = 0;
			
 
				+      DxilFieldAnnotation* MatAnnotation = HLMatrixType::isa(valTy) ? 
			
 
				+        GetTemplatedResMatAnnotation(F, kArgIdx, kMatAnnotationIdx) : nullptr;
			
 
				       if (bAppend) {
			
 
				         Argument *valArg = argIter;
			
 
				         // Buf[counter] = val;
			
@@ -377,8 +411,53 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
 
				           unsigned size = M.getDataLayout().getTypeAllocSize(
			
 
				               subscript->getType()->getPointerElementType());
			
 
				           Builder.CreateMemCpy(subscript, valArg, size, 1);
			
 
				-        } else {
			
 
				-          Value *storedVal = valArg;
			
 
				+        } else if (MatAnnotation) {
			
 
				+          // If the to-be-stored value is a matrix then we need to generate
			
 
				+          // an HL matrix store which is then handled appropriately in HLMatrixLowerPass.
			
 
				+          bool isRowMajor = MatAnnotation->GetMatrixAnnotation().Orientation == MatrixOrientation::RowMajor;
			
 
				+          Value* matStoreVal = valArg;
			
 
				+
			
 
				+          // The in-reg matrix orientation is always row-major.
			
 
				+          // If the in-memory matrix orientation is col-major, then we
			
 
				+          // need to change the orientation to col-major before storing
			
 
				+          // to memory
			
 
				+          if (!isRowMajor) {
			
 
				+            unsigned castOpCode = (unsigned)HLCastOpcode::RowMatrixToColMatrix;
			
 
				+
			
 
				+            // Construct signature of the function that is used for converting
			
 
				+            // orientation of a matrix from row-major to col-major.
			
 
				+            FunctionType* MatCastFnType = FunctionType::get(
			
 
				+              matStoreVal->getType(), { Builder.getInt32Ty(), matStoreVal->getType() },
			
 
				+              /* isVarArg */ false);
			
 
				+
			
 
				+            // Create the conversion function.
			
 
				+            Function* MatCastFn = GetOrCreateHLFunction(
			
 
				+              M, MatCastFnType, HLOpcodeGroup::HLCast, castOpCode);
			
 
				+            Value* MatCastOpCode = ConstantInt::get(Builder.getInt32Ty(), castOpCode);
			
 
				+
			
 
				+            // Insert call to the conversion function.
			
 
				+            matStoreVal = Builder.CreateCall(MatCastFn, { MatCastOpCode, matStoreVal });
			
 
				+          }
			
 
				+
			
 
				+          unsigned storeOpCode = isRowMajor ? (unsigned) HLMatLoadStoreOpcode::RowMatStore
			
 
				+            : (unsigned) HLMatLoadStoreOpcode::ColMatStore;
			
 
				+
			
 
				+          // Construct signature of the function that is used for storing
			
 
				+          // the matrix value to the memory.
			
 
				+          FunctionType* MatStFnType = FunctionType::get(
			
 
				+            Builder.getVoidTy(), { Builder.getInt32Ty(), subscriptTy, matStoreVal->getType() },
			
 
				+            /* isVarArg */ false);
			
 
				+
			
 
				+          // Create the matrix store function.
			
 
				+          Function* MatStFn = GetOrCreateHLFunction(
			
 
				+            M, MatStFnType, HLOpcodeGroup::HLMatLoadStore, storeOpCode);
			
 
				+          Value* MatStOpCode = ConstantInt::get(Builder.getInt32Ty(), storeOpCode);
			
 
				+
			
 
				+          // Insert call to the matrix store function.
			
 
				+          Builder.CreateCall(MatStFn, { MatStOpCode, subscript, matStoreVal });
			
 
				+        }
			
 
				+        else {
			
 
				+          Value* storedVal = valArg;
			
 
				           // Convert to memory representation
			
 
				           if (isBoolScalarOrVector)
			
 
				             storedVal = Builder.CreateZExt(
			
@@ -390,6 +469,51 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
 
				         // return Buf[counter];
			
 
				         if (valTy->isPointerTy())
			
 
				           Builder.CreateRet(subscript);
			
 
				+        else if (MatAnnotation) {
			
 
				+          // If the to-be-loaded value is a matrix then we need to generate
			
 
				+          // an HL matrix load which is then handled appropriately in HLMatrixLowerPass.
			
 
				+          bool isRowMajor = MatAnnotation->GetMatrixAnnotation().Orientation == MatrixOrientation::RowMajor;
			
 
				+
			
 
				+          unsigned loadOpCode = isRowMajor ? (unsigned)HLMatLoadStoreOpcode::RowMatLoad
			
 
				+            : (unsigned)HLMatLoadStoreOpcode::ColMatLoad;
			
 
				+
			
 
				+          // Construct signature of the function that is used for loading
			
 
				+          // the matrix value from the memory.
			
 
				+          FunctionType* MatLdFnType = FunctionType::get(valTy, { Builder.getInt32Ty(), subscriptTy },
			
 
				+            /* isVarArg */ false);
			
 
				+
			
 
				+          // Create the matrix load function.
			
 
				+          Function* MatLdFn = GetOrCreateHLFunction(
			
 
				+            M, MatLdFnType, HLOpcodeGroup::HLMatLoadStore, loadOpCode);
			
 
				+          Value* MatStOpCode = ConstantInt::get(Builder.getInt32Ty(), loadOpCode);
			
 
				+
			
 
				+          // Insert call to the matrix load function.
			
 
				+          Value *matLdVal = Builder.CreateCall(MatLdFn, { MatStOpCode, subscript });
			
 
				+
			
 
				+          // The in-reg matrix orientation is always row-major.
			
 
				+          // If the in-memory matrix orientation is col-major, then we
			
 
				+          // need to change the orientation to row-major after loading
			
 
				+          // from memory.
			
 
				+          if (!isRowMajor) {
			
 
				+            unsigned castOpCode = (unsigned)HLCastOpcode::ColMatrixToRowMatrix;
			
 
				+
			
 
				+            // Construct signature of the function that is used for converting
			
 
				+            // orientation of a matrix from col-major to row-major.
			
 
				+            FunctionType* MatCastFnType = FunctionType::get(
			
 
				+              matLdVal->getType(), { Builder.getInt32Ty(), matLdVal->getType() },
			
 
				+              /* isVarArg */ false);
			
 
				+
			
 
				+            // Create the conversion function.
			
 
				+            Function* MatCastFn = GetOrCreateHLFunction(
			
 
				+              M, MatCastFnType, HLOpcodeGroup::HLCast, castOpCode);
			
 
				+            Value* MatCastOpCode = ConstantInt::get(Builder.getInt32Ty(), castOpCode);
			
 
				+
			
 
				+            // Insert call to the conversion function.
			
 
				+            matLdVal = Builder.CreateCall(MatCastFn, { MatCastOpCode, matLdVal });
			
 
				+
			
 
				+          }
			
 
				+          Builder.CreateRet(matLdVal);
			
 
				+        }
			
 
				         else {
			
 
				           Value *retVal = Builder.CreateLoad(subscript);
			
 
				           // Convert to register representation
			
--- a/tools/clang/test/HLSLFileCheck/hlsl/objects/AppendStructuredBuffer/append_mat_col_major.hlsl
+++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/AppendStructuredBuffer/append_mat_col_major.hlsl
@@ -0,0 +1,99 @@
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool1x1 -DMAT1x1=1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float1x2 -DMAT1x2=1 %s | FileCheck %s -check-prefix=CHK_MAT1x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int2x1 -DMAT2x1=1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint2x2 -DMAT2x2=1 %s | FileCheck %s -check-prefix=CHK_MAT2x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint16_t2x3 -DMAT2x3=1 %s | FileCheck %s -check-prefix=CHK_MAT2x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int16_t3x2 -DMAT3x2=1 %s | FileCheck %s -check-prefix=CHK_MAT3x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float16_t3x3 -DMAT3x3=1 %s | FileCheck %s -check-prefix=CHK_MAT3x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float3x4 -DMAT3x4=1 %s | FileCheck %s -check-prefix=CHK_MAT3x4
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool4x3 -DMAT4x3=1 %s | FileCheck %s -check-prefix=CHK_MAT4x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float4x4 -DMAT4x4=1 %s | FileCheck %s -check-prefix=CHK_MAT4x4
			
 
				+
			
 
				+AppendStructuredBuffer<TY> buf;
			
 
				+
			
 
				+void main()
			
 
				+{
			
 
				+    // One matrix shape is built per RUN line; -DTY and the matching -DMATrxc macro select it.
			
 
				+#ifdef MAT1x1
			
 
				+    bool1x1 m = bool1x1(1);
			
 
				+#endif    
			
 
				+
			
 
				+#ifdef MAT1x2
			
 
				+    float1x2 m = float1x2(1, 2);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT2x1
			
 
				+    int2x1 m = int2x1(1, 2);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT2x2
			
 
				+    uint2x2 m = uint2x2(1, 2, 3, 4);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT2x3
			
 
				+    uint16_t2x3 m = uint16_t2x3(1, 2, 3, 4, 5, 6);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT3x2
			
 
				+    int16_t3x2 m = int16_t3x2(1, 2, 3, 4, 5, 6);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT3x3
			
 
				+    float16_t3x3 m = float16_t3x3(1, 2, 3, 4, 5, 6, 7, 8, 9);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT3x4
			
 
				+    float3x4 m = float3x4(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT4x3
			
 
				+    bool4x3 m = bool4x3(1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT4x4                          
			
 
				+    float4x4 m = float4x4(float4(1, 2, 3, 4), float4(5, 6, 7, 8), float4(9, 10, 11, 12), float4(13, 14, 15, 16));  
			
 
				+#endif
			
 
				+    // Expected output for Append(m) - a counter update, then raw buffer stores listing m's elements in column-major order.
			
 
				+// CHK_MAT1x1: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT1x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i8 1, i32 4)
			
 
				+
			
 
				+// CHK_MAT1x2: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT1x2: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x1: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT2x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 2, i32 undef, i32 undef, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x2: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT2x2: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 3, i32 2, i32 4, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x3: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 4, i16 2, i16 5, i8 15, i32 2)
			
 
				+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 3, i16 6, i16 undef, i16 undef, i8 3, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x2: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 3, i16 5, i16 2, i8 15, i32 2)
			
 
				+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 4, i16 6, i16 undef, i16 undef, i8 3, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x3: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, half 0xH3C00, half 0xH4400, half 0xH4700, half 0xH4000, i8 15, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, half 0xH4500, half 0xH4800, half 0xH4200, half 0xH4600, i8 15, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, half 0xH4880, half undef, half undef, half undef, i8 1, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x4: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 2.000000e+00, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 6.000000e+00, float 1.000000e+01, float 3.000000e+00, float 7.000000e+00, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 1.100000e+01, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x3: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 0, i32 0, i32 1, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, i32 0, i32 1, i32 1, i32 1, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, i32 0, i32 0, i32 1, i32 1, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x4: dx.op.bufferUpdateCounter  
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 1.300000e+01, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 2.000000e+00, float 6.000000e+00, float 1.000000e+01, float 1.400000e+01, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 3.000000e+00, float 7.000000e+00, float 1.100000e+01, float 1.500000e+01, i8 15, i32 4) 
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 48, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, float 1.600000e+01, i8 15, i32 4)
			
 
				+  
			
 
				+    buf.Append(m);
			
 
				+}
			
--- a/tools/clang/test/HLSLFileCheck/hlsl/objects/AppendStructuredBuffer/append_mat_row_major.hlsl
+++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/AppendStructuredBuffer/append_mat_row_major.hlsl
@@ -0,0 +1,99 @@
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool1x1 -DMAT1x1=1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float1x2 -DMAT1x2=1 %s | FileCheck %s -check-prefix=CHK_MAT1x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int2x1 -DMAT2x1=1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint2x2 -DMAT2x2=1 %s | FileCheck %s -check-prefix=CHK_MAT2x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint16_t2x3 -DMAT2x3=1 %s | FileCheck %s -check-prefix=CHK_MAT2x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int16_t3x2 -DMAT3x2=1 %s | FileCheck %s -check-prefix=CHK_MAT3x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float16_t3x3 -DMAT3x3=1 %s | FileCheck %s -check-prefix=CHK_MAT3x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float3x4 -DMAT3x4=1 %s | FileCheck %s -check-prefix=CHK_MAT3x4
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool4x3 -DMAT4x3=1 %s | FileCheck %s -check-prefix=CHK_MAT4x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float4x4 -DMAT4x4=1 %s | FileCheck %s -check-prefix=CHK_MAT4x4
			
 
				+
			
 
				+AppendStructuredBuffer<TY> buf;
			
 
				+
			
 
				+void main()
			
 
				+{
			
 
				+    // One matrix shape is built per RUN line; -DTY and the matching -DMATrxc macro select it.
			
 
				+#ifdef MAT1x1
			
 
				+    bool1x1 m = bool1x1(1);
			
 
				+#endif    
			
 
				+
			
 
				+#ifdef MAT1x2
			
 
				+    float1x2 m = float1x2(1, 2);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT2x1
			
 
				+    int2x1 m = int2x1(1, 2);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT2x2
			
 
				+    uint2x2 m = uint2x2(1, 2, 3, 4);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT2x3
			
 
				+    uint16_t2x3 m = uint16_t2x3(1, 2, 3, 4, 5, 6);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT3x2
			
 
				+    int16_t3x2 m = int16_t3x2(1, 2, 3, 4, 5, 6);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT3x3
			
 
				+    float16_t3x3 m = float16_t3x3(1, 2, 3, 4, 5, 6, 7, 8, 9);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT3x4
			
 
				+    float3x4 m = float3x4(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT4x3
			
 
				+    bool4x3 m = bool4x3(1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef MAT4x4                          
			
 
				+    float4x4 m = float4x4(float4(1, 2, 3, 4), float4(5, 6, 7, 8), float4(9, 10, 11, 12), float4(13, 14, 15, 16));  
			
 
				+#endif
			
 
				+    // Expected output for Append(m) - a counter update, then raw buffer stores listing m's elements in row-major order.
			
 
				+// CHK_MAT1x1: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT1x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i8 1, i32 4)
			
 
				+
			
 
				+// CHK_MAT1x2: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT1x2: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x1: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT2x1: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 2, i32 undef, i32 undef, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x2: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT2x2: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 2, i32 3, i32 4, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x3: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 2, i16 3, i16 4, i8 15, i32 2)
			
 
				+// CHK_MAT2x3: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 5, i16 6, i16 undef, i16 undef, i8 3, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x2: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i16 1, i16 2, i16 3, i16 4, i8 15, i32 2)
			
 
				+// CHK_MAT3x2: dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, i16 5, i16 6, i16 undef, i16 undef, i8 3, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x3: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400, i8 15, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 8, half 0xH4500, half 0xH4600, half 0xH4700, half 0xH4800, i8 15, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, half 0xH4880, half undef, half undef, half undef, i8 1, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x4: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x3: dx.op.bufferUpdateCounter
			
 
				+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, i32 1, i32 0, i32 0, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, i32 1, i32 0, i32 0, i32 1, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, i32 1, i32 1, i32 1, i32 1, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x4: dx.op.bufferUpdateCounter  
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 0, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, i8 15, i32 4)  
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 16, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00, i8 15, i32 4)  
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 32, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01, i8 15, i32 4)  
			
 
				+// CHK_MAT4x4: dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %{{.*}}, i32 %{{.*}}, i32 48, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01, float 1.600000e+01, i8 15, i32 4)  
			
 
				+  
			
 
				+    buf.Append(m);
			
 
				+}
			
--- a/tools/clang/test/HLSLFileCheck/hlsl/objects/ConsumeStructuredBuffer/consume_mat_col_major.hlsl
+++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/ConsumeStructuredBuffer/consume_mat_col_major.hlsl
@@ -0,0 +1,70 @@
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool1x1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float1x2 %s | FileCheck %s -check-prefix=CHK_MAT1x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int2x1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint2x2 %s | FileCheck %s -check-prefix=CHK_MAT2x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=uint16_t2x3 %s | FileCheck %s -check-prefix=CHK_MAT2x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=int16_t3x2 %s | FileCheck %s -check-prefix=CHK_MAT3x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float16_t3x3 %s | FileCheck %s -check-prefix=CHK_MAT3x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float3x4 %s | FileCheck %s -check-prefix=CHK_MAT3x4
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=bool4x3 %s | FileCheck %s -check-prefix=CHK_MAT4x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpc -DTY=float4x4 %s | FileCheck %s -check-prefix=CHK_MAT4x4
			
 
				+
			
 
				+ConsumeStructuredBuffer<TY> buf;
			
 
				+
			
 
				+TY main() : OUT
			
 
				+{
			
 
				+    // Expects Consume() to become raw buffer loads. NOTE(review) - the patterns below pin only each load's trailing operands, not element order; confirm orientation is covered elsewhere.
			
 
				+// CHK_MAT1x1: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT1x1: i32 %{{.*}}, i32 0, i8 1, i32 4)
			
 
				+
			
 
				+// CHK_MAT1x2: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT1x2: i32 %{{.*}}, i32 0, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x1: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT2x1: i32 %{{.*}}, i32 0, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x2: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT2x2: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT2x3: i32 %{{.*}}, i32 0, i8 3, i32 2)
			
 
				+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT2x3: i32 %{{.*}}, i32 4, i8 15, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT3x2: i32 %{{.*}}, i32 0, i8 3, i32 2)
			
 
				+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT3x2: i32 %{{.*}}, i32 4, i8 15, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
			
 
				+// CHK_MAT3x3: i32 %{{.*}}, i32 0, i8 1, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
			
 
				+// CHK_MAT3x3: i32 %{{.*}}, i32 2, i8 15, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
			
 
				+// CHK_MAT3x3: i32 %{{.*}}, i32 10, i8 15, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT3x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT3x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT3x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT4x3: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT4x3: i32 %{{.*}}, i32 16, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT4x3: i32 %{{.*}}, i32 32, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 48, i8 15, i32 4)
			
 
				+  
			
 
				+    return buf.Consume();
			
 
				+}
			
--- a/tools/clang/test/HLSLFileCheck/hlsl/objects/ConsumeStructuredBuffer/consume_mat_row_major.hlsl
+++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/ConsumeStructuredBuffer/consume_mat_row_major.hlsl
@@ -0,0 +1,70 @@
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool1x1 %s | FileCheck %s -check-prefix=CHK_MAT1x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float1x2 %s | FileCheck %s -check-prefix=CHK_MAT1x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int2x1 %s | FileCheck %s -check-prefix=CHK_MAT2x1
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint2x2 %s | FileCheck %s -check-prefix=CHK_MAT2x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=uint16_t2x3 %s | FileCheck %s -check-prefix=CHK_MAT2x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=int16_t3x2 %s | FileCheck %s -check-prefix=CHK_MAT3x2
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float16_t3x3 %s | FileCheck %s -check-prefix=CHK_MAT3x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float3x4 %s | FileCheck %s -check-prefix=CHK_MAT3x4
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=bool4x3 %s | FileCheck %s -check-prefix=CHK_MAT4x3
			
 
				+// RUN: %dxc -E main -T vs_6_2 -enable-16bit-types -Zpr -DTY=float4x4 %s | FileCheck %s -check-prefix=CHK_MAT4x4
			
 
				+
			
 
				+ConsumeStructuredBuffer<TY> buf;
			
 
				+
			
 
				+TY main() : OUT
			
 
				+{
			
 
				+    // Expects Consume() to become raw buffer loads. NOTE(review) - the patterns below pin only each load's trailing operands, not element order; confirm orientation is covered elsewhere.
			
 
				+// CHK_MAT1x1: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT1x1: i32 %{{.*}}, i32 0, i8 1, i32 4)
			
 
				+
			
 
				+// CHK_MAT1x2: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT1x2: i32 %{{.*}}, i32 0, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x1: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT2x1: i32 %{{.*}}, i32 0, i8 3, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x2: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT2x2: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT2x3: i32 %{{.*}}, i32 0, i8 3, i32 2)
			
 
				+// CHK_MAT2x3: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT2x3: i32 %{{.*}}, i32 4, i8 15, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT3x2: i32 %{{.*}}, i32 0, i8 3, i32 2)
			
 
				+// CHK_MAT3x2: dx.op.rawBufferLoad.i16(i32 139, 
			
 
				+// CHK_MAT3x2: i32 %{{.*}}, i32 4, i8 15, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
			
 
				+// CHK_MAT3x3: i32 %{{.*}}, i32 0, i8 1, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
			
 
				+// CHK_MAT3x3: i32 %{{.*}}, i32 2, i8 15, i32 2)
			
 
				+// CHK_MAT3x3: dx.op.rawBufferLoad.f16(i32 139, 
			
 
				+// CHK_MAT3x3: i32 %{{.*}}, i32 10, i8 15, i32 2)
			
 
				+
			
 
				+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT3x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT3x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
			
 
				+// CHK_MAT3x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT3x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT4x3: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT4x3: i32 %{{.*}}, i32 16, i8 15, i32 4)
			
 
				+// CHK_MAT4x3: dx.op.rawBufferLoad.i32(i32 139, 
			
 
				+// CHK_MAT4x3: i32 %{{.*}}, i32 32, i8 15, i32 4)
			
 
				+
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 0, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 16, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 32, i8 15, i32 4)
			
 
				+// CHK_MAT4x4: dx.op.rawBufferLoad.f32(i32 139, 
			
 
				+// CHK_MAT4x4: i32 %{{.*}}, i32 48, i8 15, i32 4)
			
 
				+  
			
 
				+    return buf.Consume();
			
 
				+}