Browse Source

Respect matrix orientation when doing store op in RWByteAddressBuffer (#3484)

Vishal Sharma 4 years ago
parent
commit
c3ffca66af

+ 11 - 0
tools/clang/lib/CodeGen/CGCall.cpp

@@ -3612,6 +3612,17 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
   llvm::CallSite CS;
   if (!InvokeDest) {
+    // HLSL changes begin
+    // When storing a matrix to a RWByteAddressBuffer, insert a cast so that the
+    // value's orientation matches its declared in-memory (row/column-major) layout.
+    if (getLangOpts().HLSL && CGM.getHLSLRuntime().NeedHLSLMartrixCastForStoreOp(TargetDecl, IRCallArgs)) {
+      llvm::SmallVector<clang::QualType, 16> tyList;
+      for (CallArgList::const_iterator I = CallArgs.begin(), E = CallArgs.end(); I != E; ++I) {
+        tyList.emplace_back(I->Ty);
+      }
+      CGM.getHLSLRuntime().EmitHLSLMartrixCastForStoreOp(*this, IRCallArgs, tyList);
+    }
+    // HLSL changes end
     CS = Builder.CreateCall(Callee, IRCallArgs);
   } else {
     llvm::BasicBlock *Cont = createBasicBlock("invoke.cont");

+ 59 - 0
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -302,6 +302,11 @@ public:
   void MarkLoopStmt(CodeGenFunction &CGF, BasicBlock *loopContinue,
                      BasicBlock *loopExit) override;
   void MarkScopeEnd(CodeGenFunction &CGF) override;
+  bool NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
+    llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) override;
+  void EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
+    SmallVector<llvm::Value*, 16>& IRCallArgs,
+    llvm::SmallVector<clang::QualType, 16>& ArgTys) override;
   /// Get or add constant to the program
   HLCBuffer &GetOrCreateCBuffer(HLSLBufferDecl *D);
 };
@@ -4947,6 +4952,60 @@ void CGMSHLSLRuntime::EmitHLSLMatrixStore(CGBuilderTy &Builder, Value *Val,
                                  Val->getType(), {DestPtr, Val}, TheModule);
 }
 
+bool CGMSHLSLRuntime::NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
+  llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) {
+
+  const clang::FunctionDecl* FD = dyn_cast<clang::FunctionDecl>(TD);
+
+  unsigned opcode = 0;
+  StringRef group;
+  if (!hlsl::GetIntrinsicOp(FD, opcode, group))
+    return false;
+
+  if (opcode != (unsigned)hlsl::IntrinsicOp::MOP_Store)
+    return false;
+
+  // Note that the store op is not yet an HL op. It's just a call to the mangled
+  // RWByteAddressBuffer store function, so adjust the store-value operand index.
+  const unsigned storeValOpIdx = HLOperandIndex::kStoreValOpIdx - 1;
+
+  if (storeValOpIdx >= IRCallArgs.size()) {
+    DXASSERT_NOMSG(storeValOpIdx < IRCallArgs.size());
+    return false;
+  }
+
+  return HLMatrixType::isa(IRCallArgs[storeValOpIdx]->getType());
+}
+
+void CGMSHLSLRuntime::EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
+  SmallVector<llvm::Value*, 16>& IRCallArgs,
+  llvm::SmallVector<clang::QualType, 16>& ArgTys) {
+
+  // Note that the store op is not yet an HL op. It's just a call to the mangled
+  // RWByteAddressBuffer store function, so adjust the store-value operand index.
+  const unsigned storeValOpIdx = HLOperandIndex::kStoreValOpIdx - 1;
+
+  if (storeValOpIdx >= IRCallArgs.size() ||
+    storeValOpIdx >= ArgTys.size()) {
+    DXASSERT_NOMSG(storeValOpIdx < IRCallArgs.size());
+    DXASSERT_NOMSG(storeValOpIdx < ArgTys.size());
+    return;
+  }
+
+  if (!hlsl::IsHLSLMatType(ArgTys[storeValOpIdx]))
+    return;
+
+  bool isRowMajor =
+    hlsl::IsHLSLMatRowMajor(ArgTys[storeValOpIdx], m_pHLModule->GetHLOptions().bDefaultRowMajor);
+
+  if (!isRowMajor) {
+    IRCallArgs[storeValOpIdx] = EmitHLSLMatrixOperationCallImp(
+      CGF.Builder, HLOpcodeGroup::HLCast,
+      static_cast<unsigned>(HLCastOpcode::RowMatrixToColMatrix),
+      IRCallArgs[storeValOpIdx]->getType(), { IRCallArgs[storeValOpIdx] }, TheModule);
+  }
+}
+
 Value *CGMSHLSLRuntime::EmitHLSLMatrixLoad(CodeGenFunction &CGF, Value *Ptr,
                                            QualType Ty) {
   return EmitHLSLMatrixLoad(CGF.Builder, Ptr, Ty);

+ 7 - 0
tools/clang/lib/CodeGen/CGHLSLRuntime.h

@@ -145,6 +145,13 @@ public:
                              llvm::BasicBlock *loopExit) = 0;
 
   virtual void MarkScopeEnd(CodeGenFunction &CGF) = 0;
+
+  virtual bool NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
+                              llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) = 0;
+
+  virtual void EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
+                              llvm::SmallVector<llvm::Value*, 16>& IRCallArgs,
+                              llvm::SmallVector<clang::QualType, 16>& ArgTys) = 0;
 };
 
 /// Create an instance of a HLSL runtime class.

+ 17 - 17
tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/rwbab_incomplete_mat_store_const_init_zpc.hlsl

@@ -1,7 +1,7 @@
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST1=1 %s | FileCheck %s -check-prefix=CHK_TEST1
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST2=1 %s | FileCheck %s -check-prefix=CHK_TEST2
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST3=1 %s | FileCheck %s -check-prefix=CHK_TEST3
-// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST4=1 %s | FileCheck %s -check-prefix=CHK_TEST4 | XFail Github #3423
+// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST4=1 %s | FileCheck %s -check-prefix=CHK_TEST4
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST5=1 %s | FileCheck %s -check-prefix=CHK_TEST5
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST6=1 %s | FileCheck %s -check-prefix=CHK_TEST6
 // RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST7=1 %s | FileCheck %s -check-prefix=CHK_TEST7
@@ -33,49 +33,49 @@ void main()
   float2x2 t = {1,2,3,4};
 #elif TEST5  
   // CHK_TEST5: dx.op.rawBufferStore.f32
-  // CHK_TEST5: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST5: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 2.000000e+00, float 5.000000e+00
   // CHK_TEST5: dx.op.rawBufferStore.f32
-  // CHK_TEST5: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00
+  // CHK_TEST5: i32 16, i32 undef, float 3.000000e+00, float 6.000000e+00
   float2x3 t = {1,2,3,4,5,6};
 #elif TEST6
   // CHK_TEST6: dx.op.rawBufferStore.f32
-  // CHK_TEST6: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST6: i32 0, i32 undef, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00, float 2.000000e+00
   // CHK_TEST6: dx.op.rawBufferStore.f32
-  // CHK_TEST6: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00
+  // CHK_TEST6: i32 16, i32 undef, float 4.000000e+00, float 6.000000e+00
   float3x2 t = {1,2,3,4,5,6};
 #elif TEST7  
   // CHK_TEST7: dx.op.rawBufferStore.f32
-  // CHK_TEST7: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST7: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 7.000000e+00, float 2.000000e+00
   // CHK_TEST7: dx.op.rawBufferStore.f32
-  // CHK_TEST7: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST7: i32 16, i32 undef, float 5.000000e+00, float 8.000000e+00, float 3.000000e+00, float 6.000000e+00
   // CHK_TEST7: dx.op.rawBufferStore.f32
   // CHK_TEST7: i32 32, i32 undef, float 9.000000e+00
   float3x3 t = {1,2,3,4,5,6,7,8,9};
 #elif TEST8  
   // CHK_TEST8: dx.op.rawBufferStore.f32
-  // CHK_TEST8: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST8: i32 0, i32 undef, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 2.000000e+00
   // CHK_TEST8: dx.op.rawBufferStore.f32
-  // CHK_TEST8: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST8: i32 16, i32 undef, float 6.000000e+00, float 1.000000e+01, float 3.000000e+00, float 7.000000e+00
   // CHK_TEST8: dx.op.rawBufferStore.f32
-  // CHK_TEST8: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
+  // CHK_TEST8: i32 32, i32 undef, float 1.100000e+01, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01
   float3x4 t = {1,2,3,4,5,6,7,8,9,10,11,12};
 #elif TEST9  
   // CHK_TEST9: dx.op.rawBufferStore.f32
-  // CHK_TEST9: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST9: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 7.000000e+00, float 1.000000e+01
   // CHK_TEST9: dx.op.rawBufferStore.f32
-  // CHK_TEST9: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST9: i32 16, i32 undef, float 2.000000e+00, float 5.000000e+00, float 8.000000e+00, float 1.100000e+01
   // CHK_TEST9: dx.op.rawBufferStore.f32
-  // CHK_TEST9: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
+  // CHK_TEST9: i32 32, i32 undef, float 3.000000e+00, float 6.000000e+00, float 9.000000e+00, float 1.200000e+01
   float4x3 t = {1,2,3,4,5,6,7,8,9,10,11,12};
 #else
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
+  // CHK_TEST10: i32 0, i32 undef, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 1.300000e+01
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
+  // CHK_TEST10: i32 16, i32 undef, float 2.000000e+00, float 6.000000e+00, float 1.000000e+01, float 1.400000e+01
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
+  // CHK_TEST10: i32 32, i32 undef, float 3.000000e+00, float 7.000000e+00, float 1.100000e+01, float 1.500000e+01
   // CHK_TEST10: dx.op.rawBufferStore.f32
-  // CHK_TEST10: i32 48, i32 undef, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01, float 1.600000e+01
+  // CHK_TEST10: i32 48, i32 undef, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, float 1.600000e+01
 	float4x4 t = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
 #endif
 	buffer.Store(0, t);