Browse Source

Respect matrix orientation when doing store op in RWByteAddressBuffer (#3484)

Vishal Sharma 4 years ago
parent
commit
c3ffca66af

+ 11 - 0
tools/clang/lib/CodeGen/CGCall.cpp

@@ -3612,6 +3612,17 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
   llvm::CallSite CS;
   if (!InvokeDest) {
+    // HLSL changes begin
+    // When storing a matrix to a RWByteAddressBuffer, insert a cast so that the
+    // value's orientation matches its declared in-memory (row/column-major) layout.
+    if (getLangOpts().HLSL && CGM.getHLSLRuntime().NeedHLSLMartrixCastForStoreOp(TargetDecl, IRCallArgs)) {
+      llvm::SmallVector<clang::QualType, 16> tyList;
+      for (CallArgList::const_iterator I = CallArgs.begin(), E = CallArgs.end(); I != E; ++I) {
+        tyList.emplace_back(I->Ty);
+      }
+      CGM.getHLSLRuntime().EmitHLSLMartrixCastForStoreOp(*this, IRCallArgs, tyList);
+    }
+    // HLSL changes end
     CS = Builder.CreateCall(Callee, IRCallArgs);
   } else {
     llvm::BasicBlock *Cont = createBasicBlock("invoke.cont");

+ 59 - 0
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -302,6 +302,11 @@ public:
   void MarkLoopStmt(CodeGenFunction &CGF, BasicBlock *loopContinue,
                      BasicBlock *loopExit) override;
   void MarkScopeEnd(CodeGenFunction &CGF) override;
+  bool NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
+    llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) override;
+  void EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
+    SmallVector<llvm::Value*, 16>& IRCallArgs,
+    llvm::SmallVector<clang::QualType, 16>& ArgTys) override;
   /// Get or add constant to the program
   HLCBuffer &GetOrCreateCBuffer(HLSLBufferDecl *D);
 };
@@ -4947,6 +4952,60 @@ void CGMSHLSLRuntime::EmitHLSLMatrixStore(CGBuilderTy &Builder, Value *Val,
                                  Val->getType(), {DestPtr, Val}, TheModule);
 }
 
+bool CGMSHLSLRuntime::NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
+  llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) {
+
+  const clang::FunctionDecl* FD = dyn_cast<clang::FunctionDecl>(TD);
+
+  unsigned opcode = 0;
+  StringRef group;
+  if (!hlsl::GetIntrinsicOp(FD, opcode, group))
+    return false;
+
+  if (opcode != (unsigned)hlsl::IntrinsicOp::MOP_Store)
+    return false;
+
+  // Note that the store op is not yet an HL op. It's just a call to the mangled
+  // RWByteAddressBuffer store function, so adjust the store-value operand index.
+  const unsigned storeValOpIdx = HLOperandIndex::kStoreValOpIdx - 1;
+
+  if (storeValOpIdx >= IRCallArgs.size()) {
+    DXASSERT_NOMSG(storeValOpIdx < IRCallArgs.size());
+    return false;
+  }
+
+  return HLMatrixType::isa(IRCallArgs[storeValOpIdx]->getType());
+}
+
+void CGMSHLSLRuntime::EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
+  SmallVector<llvm::Value*, 16>& IRCallArgs,
+  llvm::SmallVector<clang::QualType, 16>& ArgTys) {
+
+  // Note that the store op is not yet an HL op. It's just a call to the mangled
+  // RWByteAddressBuffer store function, so adjust the store-value operand index.
+  const unsigned storeValOpIdx = HLOperandIndex::kStoreValOpIdx - 1;
+
+  if (storeValOpIdx >= IRCallArgs.size() ||
+    storeValOpIdx >= ArgTys.size()) {
+    DXASSERT_NOMSG(storeValOpIdx < IRCallArgs.size());
+    DXASSERT_NOMSG(storeValOpIdx < ArgTys.size());
+    return;
+  }
+
+  if (!hlsl::IsHLSLMatType(ArgTys[storeValOpIdx]))
+    return;
+
+  bool isRowMajor =
+    hlsl::IsHLSLMatRowMajor(ArgTys[storeValOpIdx], m_pHLModule->GetHLOptions().bDefaultRowMajor);
+
+  if (!isRowMajor) {
+    IRCallArgs[storeValOpIdx] = EmitHLSLMatrixOperationCallImp(
+      CGF.Builder, HLOpcodeGroup::HLCast,
+      static_cast<unsigned>(HLCastOpcode::RowMatrixToColMatrix),
+      IRCallArgs[storeValOpIdx]->getType(), { IRCallArgs[storeValOpIdx] }, TheModule);
+  }
+}
+
 Value *CGMSHLSLRuntime::EmitHLSLMatrixLoad(CodeGenFunction &CGF, Value *Ptr,
                                            QualType Ty) {
   return EmitHLSLMatrixLoad(CGF.Builder, Ptr, Ty);

+ 7 - 0
tools/clang/lib/CodeGen/CGHLSLRuntime.h

@@ -145,6 +145,13 @@ public:
                              llvm::BasicBlock *loopExit) = 0;
 
   virtual void MarkScopeEnd(CodeGenFunction &CGF) = 0;
+
+  virtual bool NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
+                              llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) = 0;
+
+  virtual void EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
+                              llvm::SmallVector<llvm::Value*, 16>& IRCallArgs,
+                              llvm::SmallVector<clang::QualType, 16>& ArgTys) = 0;
 };
 
 /// Create an instance of a HLSL runtime class.

+ 17 - 17
tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/rwbab_incomplete_mat_store_const_init_zpc.hlsl

@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST1=1 %s | FileCheck %s -check-prefix=CHK_TEST1
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST2=1 %s | FileCheck %s -check-prefix=CHK_TEST2
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST3=1 %s | FileCheck %s -check-prefix=CHK_TEST3
-// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST4=1 %s | FileCheck %s -check-prefix=CHK_TEST4 | XFail Github #3423
+// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST4=1 %s | FileCheck %s -check-prefix=CHK_TEST4
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST5=1 %s | FileCheck %s -check-prefix=CHK_TEST5
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST6=1 %s | FileCheck %s -check-prefix=CHK_TEST6
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST7=1 %s | FileCheck %s -check-prefix=CHK_TEST7
@@ -33,49 +33,49 @@ void main()
   float2x2 t = {1,2,3,4};
 #elif TEST5  
   // CHK_TEST5: dx.op.rawBufferStore.f32
-  // CHK_TEST5: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST5: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 2.000000e+00, float 5.000000e+00
   // CHK_TEST5: dx.op.rawBufferStore.f32
-  // CHK_TEST5: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00
+  // CHK_TEST5: i32 16, i32 undef, float 3.000000e+00, float 6.000000e+00
   float2x3 t = {1,2,3,4,5,6};
 #elif TEST6
   // CHK_TEST6: dx.op.rawBufferStore.f32
-  // CHK_TEST6: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST6: i32 0, i32 undef, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00, float 2.000000e+00
   // CHK_TEST6: dx.op.rawBufferStore.f32
-  // CHK_TEST6: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00
+  // CHK_TEST6: i32 16, i32 undef, float 4.000000e+00, float 6.000000e+00
   float3x2 t = {1,2,3,4,5,6};
 #elif TEST7  
   // CHK_TEST7: dx.op.rawBufferStore.f32
-  // CHK_TEST7: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST7: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 7.000000e+00, float 2.000000e+00
   // CHK_TEST7: dx.op.rawBufferStore.f32
-  // CHK_TEST7: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST7: i32 16, i32 undef, float 5.000000e+00, float 8.000000e+00, float 3.000000e+00, float 6.000000e+00
   // CHK_TEST7: dx.op.rawBufferStore.f32
   // CHK_TEST7: i32 32, i32 undef, float 9.000000e+00
   float3x3 t = {1,2,3,4,5,6,7,8,9};
 #elif TEST8  
   // CHK_TEST8: dx.op.rawBufferStore.f32
-  // CHK_TEST8: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST8: i32 0, i32 undef, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 2.000000e+00
   // CHK_TEST8: dx.op.rawBufferStore.f32
-  // CHK_TEST8: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST8: i32 16, i32 undef, float 6.000000e+00, float 1.000000e+01, float 3.000000e+00, float 7.000000e+00
   // CHK_TEST8: dx.op.rawBufferStore.f32
-  // CHK_TEST8: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
+  // CHK_TEST8: i32 32, i32 undef, float 1.100000e+01, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01
   float3x4 t = {1,2,3,4,5,6,7,8,9,10,11,12};
 #elif TEST9  
   // CHK_TEST9: dx.op.rawBufferStore.f32
-  // CHK_TEST9: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST9: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 7.000000e+00, float 1.000000e+01
   // CHK_TEST9: dx.op.rawBufferStore.f32
-  // CHK_TEST9: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST9: i32 16, i32 undef, float 2.000000e+00, float 5.000000e+00, float 8.000000e+00, float 1.100000e+01
   // CHK_TEST9: dx.op.rawBufferStore.f32
-  // CHK_TEST9: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
+  // CHK_TEST9: i32 32, i32 undef, float 3.000000e+00, float 6.000000e+00, float 9.000000e+00, float 1.200000e+01
   float4x3 t = {1,2,3,4,5,6,7,8,9,10,11,12};
 #else
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST10: i32 0, i32 undef, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 1.300000e+01
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST10: i32 16, i32 undef, float 2.000000e+00, float 6.000000e+00, float 1.000000e+01, float 1.400000e+01
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
+  // CHK_TEST10: i32 32, i32 undef, float 3.000000e+00, float 7.000000e+00, float 1.100000e+01, float 1.500000e+01
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 48, i32 undef, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01, float 1.600000e+01
+  // CHK_TEST10: i32 48, i32 undef, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, float 1.600000e+01
 	float4x4 t = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
 #endif
 	buffer.Store(0, t);