Преглед изворни кода

Implement Fxc's mul-only code gen pattern for pow function (#1564)

Vishal Sharma пре 7 година
родитељ
комит
953ba46999

+ 2 - 0
include/dxc/HLSL/DxilUtil.h

@@ -14,6 +14,7 @@
 #include <string>
 #include <string>
 #include <memory>
 #include <memory>
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
 
 
 namespace llvm {
 namespace llvm {
 class Type;
 class Type;
@@ -92,6 +93,7 @@ namespace dxilutil {
   void PrintDiagnosticHandler(const llvm::DiagnosticInfo &DI, void *Context);
   void PrintDiagnosticHandler(const llvm::DiagnosticInfo &DI, void *Context);
   // Returns true if type contains HLSL Object type (resource)
   // Returns true if type contains HLSL Object type (resource)
   bool ContainsHLSLObjectType(llvm::Type *Ty);
   bool ContainsHLSLObjectType(llvm::Type *Ty);
+  bool IsSplat(llvm::ConstantDataVector *cdv);
 }
 }
 
 
 }
 }

+ 4 - 3
include/dxc/HLSL/HLModule.h

@@ -49,7 +49,7 @@ class RootSignatureHandle;
 struct HLOptions {
 struct HLOptions {
   HLOptions()
   HLOptions()
       : bDefaultRowMajor(false), bIEEEStrict(false), bDisableOptimizations(false),
       : bDefaultRowMajor(false), bIEEEStrict(false), bDisableOptimizations(false),
-        bLegacyCBufferLoad(false), PackingStrategy(0), bBackCompatMode(0), unused(0) {
+        bLegacyCBufferLoad(false), PackingStrategy(0), bDX9CompatMode(0), bFXCCompatMode(0), unused(0) {
   }
   }
   uint32_t GetHLOptionsRaw() const;
   uint32_t GetHLOptionsRaw() const;
   void SetHLOptionsRaw(uint32_t data);
   void SetHLOptionsRaw(uint32_t data);
@@ -61,8 +61,9 @@ struct HLOptions {
   unsigned PackingStrategy         : 2;
   unsigned PackingStrategy         : 2;
   static_assert((unsigned)DXIL::PackingStrategy::Invalid < 4, "otherwise 2 bits is not enough to store PackingStrategy");
   static_assert((unsigned)DXIL::PackingStrategy::Invalid < 4, "otherwise 2 bits is not enough to store PackingStrategy");
   unsigned bUseMinPrecision        : 1;
   unsigned bUseMinPrecision        : 1;
-  unsigned bBackCompatMode         : 1;
-  unsigned unused                  : 23;
+  unsigned bDX9CompatMode          : 1;
+  unsigned bFXCCompatMode          : 1;
+  unsigned unused                  : 22;
 };
 };
 
 
 typedef std::unordered_map<const llvm::Function *, std::unique_ptr<DxilFunctionProps>> DxilFunctionPropsMap;
 typedef std::unordered_map<const llvm::Function *, std::unique_ptr<DxilFunctionProps>> DxilFunctionPropsMap;

+ 2 - 1
include/dxc/Support/HLSLOptions.h

@@ -132,7 +132,8 @@ public:
   bool AvoidFlowControl = false;     // OPT_Gfa
   bool AvoidFlowControl = false;     // OPT_Gfa
   bool PreferFlowControl = false;    // OPT_Gfp
   bool PreferFlowControl = false;    // OPT_Gfp
   bool EnableStrictMode = false;     // OPT_Ges
   bool EnableStrictMode = false;     // OPT_Ges
-  bool EnableBackCompatMode = false;     // OPT_Gec
+  bool EnableDX9CompatMode = false;     // OPT_Gec
+  bool EnableFXCCompatMode = false;     // internal flag
   unsigned long HLSLVersion = 0; // OPT_hlsl_version (2015-2018)
   unsigned long HLSLVersion = 0; // OPT_hlsl_version (2015-2018)
   bool Enable16BitTypes = false; // OPT_enable_16bit_types
   bool Enable16BitTypes = false; // OPT_enable_16bit_types
   bool OptDump = false; // OPT_ODump - dump optimizer commands
   bool OptDump = false; // OPT_ODump - dump optimizer commands

+ 7 - 3
lib/DxcSupport/HLSLOptions.cpp

@@ -363,10 +363,10 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
     }
     }
   }
   }
 
 
-  opts.EnableBackCompatMode = Args.hasFlag(OPT_Gec, OPT_INVALID, false);
+  opts.EnableDX9CompatMode = Args.hasFlag(OPT_Gec, OPT_INVALID, false);
   llvm::StringRef ver = Args.getLastArgValue(OPT_hlsl_version);
   llvm::StringRef ver = Args.getLastArgValue(OPT_hlsl_version);
   if (ver.empty()) {
   if (ver.empty()) {
-    if (opts.EnableBackCompatMode)
+    if (opts.EnableDX9CompatMode)
       opts.HLSLVersion = 2016; // Default to max supported version with /Gec flag
       opts.HLSLVersion = 2016; // Default to max supported version with /Gec flag
     else
     else
       opts.HLSLVersion = 2018; // Default to latest version
       opts.HLSLVersion = 2018; // Default to latest version
@@ -393,11 +393,15 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
     return 1;
     return 1;
   }
   }
 
 
-  if (opts.EnableBackCompatMode && opts.HLSLVersion > 2016) {
+  if (opts.EnableDX9CompatMode && opts.HLSLVersion > 2016) {
     errors << "/Gec is not supported with HLSLVersion " << opts.HLSLVersion;
     errors << "/Gec is not supported with HLSLVersion " << opts.HLSLVersion;
     return 1;
     return 1;
   }
   }
 
 
+  if (opts.HLSLVersion <= 2016) {
+    opts.EnableFXCCompatMode = true;
+  }
+
   // AssemblyCodeHex not supported (Fx)
   // AssemblyCodeHex not supported (Fx)
   // OutputLibrary not supported (Fl)
   // OutputLibrary not supported (Fl)
   opts.AssemblyCode = Args.getLastArgValue(OPT_Fc);
   opts.AssemblyCode = Args.getLastArgValue(OPT_Fc);

+ 13 - 0
lib/HLSL/DxilUtil.cpp

@@ -394,6 +394,19 @@ bool ContainsHLSLObjectType(llvm::Type *Ty) {
   return false;
   return false;
 }
 }
 
 
+// Based on the implementation available in LLVM's trunk:
+// http://llvm.org/doxygen/Constants_8cpp_source.html#l02734
+bool IsSplat(llvm::ConstantDataVector *cdv) {
+  const char *Base = cdv->getRawDataValues().data();
+
+  // Compare elements 1+ to the 0'th element.
+  unsigned EltSize = cdv->getElementByteSize();
+  for (unsigned i = 1, e = cdv->getNumElements(); i != e; ++i)
+    if (memcmp(Base, Base + i * EltSize, EltSize))
+      return false;
+
+  return true;
+}
 
 
 }
 }
 }
 }

+ 119 - 7
lib/HLSL/HLOperationLower.cpp

@@ -27,6 +27,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Module.h"
+#include "llvm/ADT/APSInt.h"
 
 
 using namespace llvm;
 using namespace llvm;
 using namespace hlsl;
 using namespace hlsl;
@@ -2113,21 +2114,132 @@ Value *TranslateStep(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   return Builder.CreateSelect(cond, zero, one);
   return Builder.CreateSelect(cond, zero, one);
 }
 }
 
 
-Value *TranslatePow(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
-                    HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
-  hlsl::OP *hlslOP = &helper.hlslOP;
-  Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx);
-  Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
-  IRBuilder<> Builder(CI);
+// Returns true if pow can be implemented using Fxc's mul-only code gen pattern.
+// Fxc uses the below rules when choosing mul-only code gen pattern to implement pow function.
+// Rule 1: Applicable only to power values in the range [INT32_MIN, INT32_MAX]
+// Rule 2: The maximum number of mul ops needed shouldn't exceed (2n+1) or (n+1) based on whether the power
+//         is a positive or a negative value. Here "n" is the number of scalar elements in power.
+// Rule 3: Power must be an exact value.
+// +----------+---------------------+------------------+
+// | BaseType | IsExponentPositive  | MaxMulOpsAllowed |
+// +----------+---------------------+------------------+
+// | float4x4 | True                |               33 |
+// | float4x4 | False               |               17 |
+// | float4x2 | True                |               17 |
+// | float4x2 | False               |                9 |
+// | float2x4 | True                |               17 |
+// | float2x4 | False               |                9 |
+// | float4   | True                |                9 |
+// | float4   | False               |                5 |
+// | float2   | True                |                5 |
+// | float2   | False               |                3 |
+// | float    | True                |                3 |
+// | float    | False               |                2 |
+// +----------+---------------------+------------------+
+
+bool CanUseFxcMulOnlyPatternForPow(IRBuilder<>& Builder, Value *x, Value *pow, int32_t& powI) {
+  // Applicable only when power is a literal.
+  if (!isa<ConstantDataVector>(pow) && !isa<ConstantFP>(pow)) {
+    return false;
+  }
+
+  // Only apply this code gen on splat values.
+  if (ConstantDataVector *cdv = dyn_cast<ConstantDataVector>(pow)) {
+    if (!hlsl::dxilutil::IsSplat(cdv)) {
+      return false;
+    }
+  }
+
+  APFloat powAPF = isa<ConstantDataVector>(pow) ?
+    cast<ConstantDataVector>(pow)->getElementAsAPFloat(0) : // should be a splat value
+    cast<ConstantFP>(pow)->getValueAPF();
+  APSInt powAPS(32, false);
+  bool isExact = false;
+  // Try converting float value of power to integer and also check if the float value is exact.
+  APFloat::opStatus status = powAPF.convertToInteger(powAPS, APFloat::rmTowardZero, &isExact);
+  if (status == APFloat::opStatus::opOK && isExact) {
+    powI = powAPS.getExtValue();
+    uint32_t powU = abs(powI);
+    int setBitCount = 0;
+    int maxBitSetPos = -1;
+    for (int i = 0; i < 32; i++) {
+      if ((powU >> i) & 1) {
+        setBitCount++;
+        maxBitSetPos = i;
+      }
+    }
+
+    DXASSERT(maxBitSetPos <= 30, "msb should always be zero.");
+    unsigned numElem = isa<ConstantDataVector>(pow) ? x->getType()->getVectorNumElements() : 1;
+    int mulOpThreshold = powI < 0 ? numElem + 1 : 2 * numElem + 1;
+    int mulOpNeeded = maxBitSetPos + setBitCount - 1;
+    return mulOpNeeded <= mulOpThreshold;
+  }
+
+  return false;
+}
+
+Value *TranslatePowUsingFxcMulOnlyPattern(IRBuilder<>& Builder, Value *x, const int32_t y) {
+  uint32_t absY = abs(y);
+  // If y is zero then always return 1.
+  if (absY == 0) {
+    return ConstantFP::get(x->getType(), 1);
+  }
+
+  int lastSetPos = -1;
+  Value *result = nullptr;
+  Value *mul = nullptr;
+  for (int i = 0; i < 32; i++) {
+    if ((absY >> i) & 1) {
+      for (int j = i; j > lastSetPos; j--) {
+        if (!mul) {
+          mul = x;
+        } else {
+          mul = Builder.CreateFMul(mul, mul);
+        }
+      }
+
+      result = (result == nullptr) ? mul : Builder.CreateFMul(result, mul);
+      lastSetPos = i;
+    }
+  }
+
+  // Compute reciprocal for negative power values.
+  if (y < 0) {
+    Value* constOne = ConstantFP::get(x->getType(), 1);
+    result = Builder.CreateFDiv(constOne, result);
+  }
+
+  return result;
+}
+
+Value *TranslatePowImpl(hlsl::OP *hlslOP, IRBuilder<>& Builder, Value *x, Value *y, bool isFXCCompatMode = false) {
+  // As applicable implement pow using only mul ops as done by Fxc.
+  int32_t p=0;
+  if (isFXCCompatMode && CanUseFxcMulOnlyPatternForPow(Builder, x, y, p)) {
+    return TranslatePowUsingFxcMulOnlyPattern(Builder, x, p);
+  }
+
+  // Default to log-mul-exp pattern if previous scenarios don't apply.
   // t = log(x);
   // t = log(x);
   Value *logX =
   Value *logX =
-      TrivialDxilUnaryOperation(DXIL::OpCode::Log, x, hlslOP, Builder);
+    TrivialDxilUnaryOperation(DXIL::OpCode::Log, x, hlslOP, Builder);
   // t = y * t;
   // t = y * t;
   Value *mulY = Builder.CreateFMul(logX, y);
   Value *mulY = Builder.CreateFMul(logX, y);
   // pow = exp(t);
   // pow = exp(t);
   return TrivialDxilUnaryOperation(DXIL::OpCode::Exp, mulY, hlslOP, Builder);
   return TrivialDxilUnaryOperation(DXIL::OpCode::Exp, mulY, hlslOP, Builder);
 }
 }
 
 
+Value *TranslatePow(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                    HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+  Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx);
+  Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
+  bool isFXCCompatMode = CI->getModule()->GetHLModule().GetHLOptions().bFXCCompatMode;
+  IRBuilder<> Builder(CI);
+  return TranslatePowImpl(hlslOP,Builder,x,y,isFXCCompatMode);
+}
+
 Value *TranslateFaceforward(CallInst *CI, IntrinsicOp IOP, OP::OpCode op,
 Value *TranslateFaceforward(CallInst *CI, IntrinsicOp IOP, OP::OpCode op,
                             HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
                             HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
   hlsl::OP *hlslOP = &helper.hlslOP;
   hlsl::OP *hlslOP = &helper.hlslOP;

+ 1 - 1
lib/HLSL/HLSignatureLower.cpp

@@ -231,7 +231,7 @@ void HLSignatureLower::ProcessArgument(Function *func,
   }
   }
 
 
   //  back-compat mode - remap obsolete semantics
   //  back-compat mode - remap obsolete semantics
-  if (HLM.GetHLOptions().bBackCompatMode && paramAnnotation.HasSemanticString()) {
+  if (HLM.GetHLOptions().bDX9CompatMode && paramAnnotation.HasSemanticString()) {
     hlsl::RemapObsoleteSemantic(paramAnnotation, sigPoint->GetKind(), HLM.GetCtx());
     hlsl::RemapObsoleteSemantic(paramAnnotation, sigPoint->GetKind(), HLM.GetCtx());
   }
   }
 
 

+ 2 - 1
tools/clang/include/clang/Basic/LangOptions.h

@@ -156,7 +156,8 @@ public:
   unsigned RootSigMinor;
   unsigned RootSigMinor;
   bool IsHLSLLibrary;
   bool IsHLSLLibrary;
   bool UseMinPrecision; // use min precision, not native precision.
   bool UseMinPrecision; // use min precision, not native precision.
-  bool EnableBackCompatMode;
+  bool EnableDX9CompatMode;
+  bool EnableFXCCompatMode;
   // HLSL Change Ends
   // HLSL Change Ends
 
 
   bool SPIRV = false;  // SPIRV Change
   bool SPIRV = false;  // SPIRV Change

+ 6 - 5
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -385,7 +385,8 @@ CGMSHLSLRuntime::CGMSHLSLRuntime(CodeGenModule &CGM)
   opts.PackingStrategy = CGM.getCodeGenOpts().HLSLSignaturePackingStrategy;
   opts.PackingStrategy = CGM.getCodeGenOpts().HLSLSignaturePackingStrategy;
 
 
   opts.bUseMinPrecision = CGM.getLangOpts().UseMinPrecision;
   opts.bUseMinPrecision = CGM.getLangOpts().UseMinPrecision;
-  opts.bBackCompatMode = CGM.getLangOpts().EnableBackCompatMode;
+  opts.bDX9CompatMode = CGM.getLangOpts().EnableDX9CompatMode;
+  opts.bFXCCompatMode = CGM.getLangOpts().EnableFXCCompatMode;
 
 
   m_pHLModule->SetHLOptions(opts);
   m_pHLModule->SetHLOptions(opts);
   m_pHLModule->SetAutoBindingSpace(CGM.getCodeGenOpts().HLSLDefaultSpace);
   m_pHLModule->SetAutoBindingSpace(CGM.getCodeGenOpts().HLSLDefaultSpace);
@@ -1559,7 +1560,7 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
   SourceLocation retTySemanticLoc = SetSemantic(FD, retTyAnnotation);
   SourceLocation retTySemanticLoc = SetSemantic(FD, retTyAnnotation);
   retTyAnnotation.SetParamInputQual(DxilParamInputQual::Out);
   retTyAnnotation.SetParamInputQual(DxilParamInputQual::Out);
   if (isEntry) {
   if (isEntry) {
-    if (CGM.getLangOpts().EnableBackCompatMode && retTyAnnotation.HasSemanticString()) {
+    if (CGM.getLangOpts().EnableDX9CompatMode && retTyAnnotation.HasSemanticString()) {
       RemapObsoleteSemantic(retTyAnnotation, /*isPatchConstantFunction*/ false);
       RemapObsoleteSemantic(retTyAnnotation, /*isPatchConstantFunction*/ false);
     }
     }
     CheckParameterAnnotation(retTySemanticLoc, retTyAnnotation,
     CheckParameterAnnotation(retTySemanticLoc, retTyAnnotation,
@@ -1840,7 +1841,7 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
 
 
     paramAnnotation.SetParamInputQual(dxilInputQ);
     paramAnnotation.SetParamInputQual(dxilInputQ);
     if (isEntry) {
     if (isEntry) {
-      if (CGM.getLangOpts().EnableBackCompatMode && paramAnnotation.HasSemanticString()) {
+      if (CGM.getLangOpts().EnableDX9CompatMode && paramAnnotation.HasSemanticString()) {
         RemapObsoleteSemantic(paramAnnotation, /*isPatchConstantFunction*/ false);
         RemapObsoleteSemantic(paramAnnotation, /*isPatchConstantFunction*/ false);
       }
       }
       CheckParameterAnnotation(paramSemanticLoc, paramAnnotation,
       CheckParameterAnnotation(paramSemanticLoc, paramAnnotation,
@@ -1941,7 +1942,7 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
 }
 }
 
 
 void CGMSHLSLRuntime::RemapObsoleteSemantic(DxilParameterAnnotation &paramInfo, bool isPatchConstantFunction) {
 void CGMSHLSLRuntime::RemapObsoleteSemantic(DxilParameterAnnotation &paramInfo, bool isPatchConstantFunction) {
-  DXASSERT(CGM.getLangOpts().EnableBackCompatMode, "should be used only in back-compat mode");
+  DXASSERT(CGM.getLangOpts().EnableDX9CompatMode, "should be used only in back-compat mode");
 
 
   const ShaderModel *SM = m_pHLModule->GetShaderModel();
   const ShaderModel *SM = m_pHLModule->GetShaderModel();
   DXIL::SigPointKind sigPointKind = SigPointFromInputQual(paramInfo.GetParamInputQual(), SM->GetKind(), isPatchConstantFunction);
   DXIL::SigPointKind sigPointKind = SigPointFromInputQual(paramInfo.GetParamInputQual(), SM->GetKind(), isPatchConstantFunction);
@@ -4577,7 +4578,7 @@ void CGMSHLSLRuntime::FinishCodeGen() {
     // In back-compat mode (with /Gec flag) create a static global for each const global
     // In back-compat mode (with /Gec flag) create a static global for each const global
     // to allow writing to it.
     // to allow writing to it.
     // TODO: Verify the behavior of static globals in hull shader
     // TODO: Verify the behavior of static globals in hull shader
-    if(CGM.getLangOpts().EnableBackCompatMode && CGM.getLangOpts().HLSLVersion <= 2016)
+    if(CGM.getLangOpts().EnableDX9CompatMode && CGM.getLangOpts().HLSLVersion <= 2016)
       CreateWriteEnabledStaticGlobals(m_pHLModule->GetModule(), m_pHLModule->GetEntryFunction());
       CreateWriteEnabledStaticGlobals(m_pHLModule->GetModule(), m_pHLModule->GetEntryFunction());
     if (m_pHLModule->GetShaderModel()->IsHS()) {
     if (m_pHLModule->GetShaderModel()->IsHS()) {
       SetPatchConstantFunction(Entry);
       SetPatchConstantFunction(Entry);

+ 1 - 1
tools/clang/lib/Parse/ParseDecl.cpp

@@ -2177,7 +2177,7 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS,
   // global variable can be inside a global structure as a static member.
   // global variable can be inside a global structure as a static member.
   // Check if the global is a static member and skip global const pass.
   // Check if the global is a static member and skip global const pass.
   // in backcompat mode, the check for global const is deferred to later stage in CGMSHLSLRuntime::FinishCodeGen()
   // in backcompat mode, the check for global const is deferred to later stage in CGMSHLSLRuntime::FinishCodeGen()
-  bool CheckGlobalConst = getLangOpts().HLSL && getLangOpts().EnableBackCompatMode && getLangOpts().HLSLVersion <= 2016 ? false : true;
+  bool CheckGlobalConst = getLangOpts().HLSL && getLangOpts().EnableDX9CompatMode && getLangOpts().HLSLVersion <= 2016 ? false : true;
   if (NestedNameSpecifier *nameSpecifier = D.getCXXScopeSpec().getScopeRep()) {
   if (NestedNameSpecifier *nameSpecifier = D.getCXXScopeSpec().getScopeRep()) {
     if (nameSpecifier->getKind() == NestedNameSpecifier::SpecifierKind::TypeSpec) {
     if (nameSpecifier->getKind() == NestedNameSpecifier::SpecifierKind::TypeSpec) {
       const Type *type = D.getCXXScopeSpec().getScopeRep()->getAsType();
       const Type *type = D.getCXXScopeSpec().getScopeRep()->getAsType();

+ 35 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-check-count01.hlsl

@@ -0,0 +1,35 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fdiv
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+
+float4 main (float x1 : A, float4x4 x2 : B, float2 x3 : C, float4 x4 : D) : SV_Target
+{
+    float p1 = 8.0;
+    float4x4 p2 =         {57.0, 57.0, 57.0, 57.0,
+                           57.0, 57.0, 57.0, 57.0,
+                           57.0, 57.0, 57.0, 57.0,
+                           57.0, 57.0, 57.0, 57.0};
+    float2 p3 = float2(-5.0,-5.0);
+    float4 p4 = float4(17.0,17.0,17.0,17.0);
+
+    return float4(pow(x1, p1), pow(x2, p2)[0][0], pow(x3, p3)[0], pow(x4, p4)[0]);
+}

+ 18 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-check-count02.hlsl

@@ -0,0 +1,18 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+// CHECK: fmul
+
+float2 main (float4 x : A) : SV_Target
+{
+    float2 y = float2(11.0,11.0);
+    return pow(x, y);
+}

+ 38 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-correctness.hlsl

@@ -0,0 +1,38 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+
+// Verify the mul-only pattern implemented to support Fxc compatibility.
+
+// 2.0^8.0.
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 2.560000e+02)
+
+// 2.0^57.0 = 144115188075855872 (0x4380000000000000)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 0x4380000000000000)
+
+// 2.0^-5.0
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 3.125000e-02)
+
+//2.0^17.0
+// call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.310720e+05)
+
+float4 main () : SV_Target
+{
+    float x1 = 2.0;
+    float p1 = 8.0;
+
+    float4x4 x2 = {2.0, 2.0, 2.0, 2.0,
+                           2.0, 2.0, 2.0, 2.0,
+                           2.0, 2.0, 2.0, 2.0,
+                           2.0, 2.0, 2.0, 2.0};
+    float4x4 p2 = {57.0, 57.0, 57.0, 57.0,
+                           57.0, 57.0, 57.0, 57.0,
+                           57.0, 57.0, 57.0, 57.0,
+                           57.0, 57.0, 57.0, 57.0};
+
+    float2 x3 = float2(2.0,2.0);
+    float2 p3 = float2(-5.0,-5.0);
+
+    float4 x4 = float4(2.0,2.0,2.0,2.0);
+    float4 p4 = float4(17.0,17.0,17.0,17.0);
+
+    return float4(pow(x1, p1), pow(x2, p2)[0][0], pow(x3, p3)[0], pow(x4, p4)[0]);
+}

+ 27 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-criteria01.hlsl

@@ -0,0 +1,27 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+// dxc should use log-mul-exp pattern to implement all scenarios listed below.
+
+// CHECK: Log
+// CHECK: Exp
+// CHECK: Log
+// CHECK: Exp
+// CHECK: Log
+// CHECK: Exp
+// CHECK: Log
+// CHECK: Exp
+// CHECK: Log
+// CHECK: Exp
+
+float main (float4x4 a : A, float b : B, float4 c: C) : SV_Target
+{
+    float4x4 p1 = {2.0, 2.0, 3.0, 2.0,
+                  2.0, 2.0, 2.0, 2.0,
+                  2.0, 2.0, 2.0, 2.0,
+                  2.0, 2.0, -1.0, 2.0,}; // not a splat vector
+    float4 p2 = {2.33, 2.33, 2.33, 2.33}; // a splat vector but not exact
+    float p3 = 2.001; // not an exact value
+    float p4 = 4294967296.0; // value greater than int max
+    float p5 = 7; // exceeds the mulop threshold criteria for float
+
+    return pow(a,p1)[0][0] + pow(b,p2)[0] + pow(a,p3)[0][0] + pow(a,p4)[0][0] + pow(c,p4)[0] + pow(b,p5);
+}

+ 17 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-criteria02.hlsl

@@ -0,0 +1,17 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+// dxc should use the mul-only pattern (not log-mul-exp) to implement all scenarios listed below.
+
+// CHECK-NOT: Log
+// CHECK-NOT: Exp
+
+float main (float4x4 a : A, float b : B, float4 c: C) : SV_Target
+{
+    float4x4 p1 = {2.0, 2.0, 2.0, 2.0,
+                  2.0, 2.0, 2.0, 2.0,
+                  2.0, 2.0, 2.0, 2.0,
+                  2.0, 2.0, 2.0, 2.0,}; // a splat
+    float4 p2 = {9, 9, 9, 9}; // another splat
+    float p3 = 8; // meets the threshold criteria
+
+    return pow(a,p1)[0][0] + pow(b,p2)[0] + pow(a,p3)[0][0];
+}

+ 14 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-lit-types.hlsl

@@ -0,0 +1,14 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+// check that different float literals are being considered for mul-only code gen for pow.
+// CHECK-NOT: Log
+// CHECK-NOT: Exp
+
+float main ( float a : A, float4x4 b: B, float4 c: C, float2 d: D) : SV_Target
+{
+    return pow(a, 8.0f) + 
+           pow(d, 14.0h)[0] +
+           pow(c, 384.0H)[0] +
+           pow(c, -32.0F)[0] +
+           pow(b, -131072.0L)[0][0] +
+           pow(b, 1073741824.0L)[0][0];
+}

+ 11 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-one-as-power.hlsl

@@ -0,0 +1,11 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %{{[a-z0-9]+.*[a-z0-9]*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %{{[a-z0-9]+.*[a-z0-9]*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %{{[a-z0-9]+.*[a-z0-9]*}})
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float %{{[a-z0-9]+.*[a-z0-9]*}})
+
+float4 main ( float a : A, float2 b : B, float4 c: C, float4x4 d: D) : SV_Target
+{
+    return float4(pow(a, 1), pow(b, float2(1.00,1.00))[0], pow(c, float4(1.00,1.00,1.00,1.00))[2], pow(d, 1.00)[1][2]);
+}

+ 11 - 0
tools/clang/test/CodeGenHLSL/quick-test/pow-mulonly-zero-as-power.hlsl

@@ -0,0 +1,11 @@
+// RUN: %dxc -HV 2016 -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+00)
+
+float4 main ( float a : A, float2 b : B, float4 c: C, float4x4 d: D) : SV_Target
+{
+    return float4(pow(a, 0), pow(b, float2(0.00,0.00))[0], pow(c, -0.00)[2], pow(d, 0.00)[1][2]);
+}

+ 3 - 2
tools/clang/tools/dxcompiler/dxcompilerobj.cpp

@@ -800,7 +800,7 @@ public:
     compiler.createSourceManager(compiler.getFileManager());
     compiler.createSourceManager(compiler.getFileManager());
     compiler.setTarget(
     compiler.setTarget(
         TargetInfo::CreateTargetInfo(compiler.getDiagnostics(), targetOptions));
         TargetInfo::CreateTargetInfo(compiler.getDiagnostics(), targetOptions));
-    if (Opts.EnableBackCompatMode) {
+    if (Opts.EnableDX9CompatMode) {
       auto const ID = compiler.getDiagnostics().getCustomDiagID(clang::DiagnosticsEngine::Warning, "/Gec flag is a deprecated functionality.");
       auto const ID = compiler.getDiagnostics().getCustomDiagID(clang::DiagnosticsEngine::Warning, "/Gec flag is a deprecated functionality.");
       compiler.getDiagnostics().Report(ID);
       compiler.getDiagnostics().Report(ID);
     }
     }
@@ -855,7 +855,8 @@ public:
     compiler.getLangOpts().RootSigMajor = 1;
     compiler.getLangOpts().RootSigMajor = 1;
     compiler.getLangOpts().RootSigMinor = rootSigMinor;
     compiler.getLangOpts().RootSigMinor = rootSigMinor;
     compiler.getLangOpts().HLSLVersion = (unsigned) Opts.HLSLVersion;
     compiler.getLangOpts().HLSLVersion = (unsigned) Opts.HLSLVersion;
-    compiler.getLangOpts().EnableBackCompatMode = Opts.EnableBackCompatMode;
+    compiler.getLangOpts().EnableDX9CompatMode = Opts.EnableDX9CompatMode;
+    compiler.getLangOpts().EnableFXCCompatMode = Opts.EnableFXCCompatMode;
 
 
     compiler.getLangOpts().UseMinPrecision = !Opts.Enable16BitTypes;
     compiler.getLangOpts().UseMinPrecision = !Opts.Enable16BitTypes;
 
 

+ 2 - 1
tools/clang/tools/libclang/dxcrewriteunused.cpp

@@ -129,7 +129,8 @@ void SetupCompilerForRewrite(CompilerInstance &compiler,
   compiler.getDiagnostics().setIgnoreAllWarnings(!opts.OutputWarnings);
   compiler.getDiagnostics().setIgnoreAllWarnings(!opts.OutputWarnings);
   compiler.getLangOpts().HLSLVersion = (unsigned)opts.HLSLVersion;
   compiler.getLangOpts().HLSLVersion = (unsigned)opts.HLSLVersion;
   compiler.getLangOpts().UseMinPrecision = !opts.Enable16BitTypes;
   compiler.getLangOpts().UseMinPrecision = !opts.Enable16BitTypes;
-  compiler.getLangOpts().EnableBackCompatMode = opts.EnableBackCompatMode;
+  compiler.getLangOpts().EnableDX9CompatMode = opts.EnableDX9CompatMode;
+  compiler.getLangOpts().EnableFXCCompatMode = opts.EnableFXCCompatMode;
 
 
   PreprocessorOptions &PPOpts = compiler.getPreprocessorOpts();
   PreprocessorOptions &PPOpts = compiler.getPreprocessorOpts();
   if (rewrite != nullptr) {
   if (rewrite != nullptr) {