Forráskód Böngészése

Merged PR 98: Adding a DxrFallbackCompiler.dll containing IDxcDxrFallbackCompiler implementation + tests

This adds an isolated component *IDxcDxrFallbackCompiler* that is used by the D3D12 Raytracing Fallback Compiler. The component is responsible for linking together DXR shaders into a state machine capable of emulating function invocations including recursion, patching HLSL intrinsics, export renaming, and emulating Shader Record support. This component is compiled to a DxrFallbackCompiler.dll that gets consumed by D3D12 Raytracing Fallback Compiler apps.

For deeper implementation details, I will be fleshing out lib\DxrFallback\readme.md

Some leftover work still required:

1. Currently the Fallback Layer tests compile to an exe that must be manually run. This should be refactored to conform with existing DXIL unittests.
Christopher Wallis 7 éve
szülő
commit
74520bb662
58 módosított fájl, 15314 hozzáadás és 1 törlés
  1. 99 0
      include/dxc/DxrFallback/DxrFallbackCompiler.h
  2. 20 0
      include/dxc/HLSL/DxilFallbackLayerPass.h
  3. 113 0
      include/dxc/dxcdxrfallbackcompiler.h
  4. 1 0
      lib/CMakeLists.txt
  5. 14 0
      lib/DxrFallback/CMakeLists.txt
  6. 864 0
      lib/DxrFallback/DxrFallbackCompiler.cpp
  7. 148 0
      lib/DxrFallback/FunctionBuilder.h
  8. 16 0
      lib/DxrFallback/LLVMBuild.txt
  9. 122 0
      lib/DxrFallback/LLVMUtils.cpp
  10. 34 0
      lib/DxrFallback/LLVMUtils.h
  11. 337 0
      lib/DxrFallback/LiveValues.cpp
  12. 81 0
      lib/DxrFallback/LiveValues.h
  13. 356 0
      lib/DxrFallback/Reducibility.cpp
  14. 10 0
      lib/DxrFallback/Reducibility.h
  15. 1797 0
      lib/DxrFallback/StateFunctionTransform.cpp
  16. 295 0
      lib/DxrFallback/StateFunctionTransform.h
  17. 26 0
      lib/DxrFallback/readme.md
  18. 1974 0
      lib/DxrFallback/runtime.h
  19. 62 0
      lib/DxrFallback/runtime/rewriteRuntime.py
  20. 658 0
      lib/DxrFallback/runtime/runtime.c
  21. 9 0
      lib/DxrFallback/runtime/script.cmd
  22. 1 0
      lib/HLSL/CMakeLists.txt
  23. 1155 0
      lib/HLSL/DxilPatchShaderRecordBindings.cpp
  24. 75 0
      lib/HLSL/DxilPatchShaderRecordBindingsShared.h
  25. 2 1
      lib/LLVMBuild.txt
  26. 1 0
      tools/clang/tools/CMakeLists.txt
  27. 68 0
      tools/clang/tools/dxrfallbackcompiler/CMakeLists.txt
  28. 104 0
      tools/clang/tools/dxrfallbackcompiler/DXCompiler.cpp
  29. 4 0
      tools/clang/tools/dxrfallbackcompiler/DXCompiler.def
  30. 14 0
      tools/clang/tools/dxrfallbackcompiler/DXCompiler.rc
  31. 55 0
      tools/clang/tools/dxrfallbackcompiler/dxcapi.cpp
  32. 778 0
      tools/clang/tools/dxrfallbackcompiler/dxcdxrfallbackcompiler.cpp
  33. 266 0
      tools/clang/tools/dxrfallbackcompiler/dxcutil.cpp
  34. 70 0
      tools/clang/tools/dxrfallbackcompiler/dxcutil.h
  35. 269 0
      tools/clang/tools/dxrfallbackcompiler/dxcvalidator.cpp
  36. 68 0
      tools/clang/tools/dxrfallbackcompiler/dxillib.cpp
  37. 42 0
      tools/clang/tools/dxrfallbackcompiler/dxillib.h
  38. 1 0
      tools/clang/unittests/CMakeLists.txt
  39. 73 0
      tools/clang/unittests/DxrFallback/CMakeLists.txt
  40. 110 0
      tools/clang/unittests/DxrFallback/DXSampleHelper.h
  41. 15 0
      tools/clang/unittests/DxrFallback/ShaderTester.h
  42. 542 0
      tools/clang/unittests/DxrFallback/ShaderTesterImpl.cpp
  43. 55 0
      tools/clang/unittests/DxrFallback/ShaderTesterImpl.h
  44. 1946 0
      tools/clang/unittests/DxrFallback/d3dx12.h
  45. 1 0
      tools/clang/unittests/DxrFallback/defaultTestFilePath.h.in
  46. 71 0
      tools/clang/unittests/DxrFallback/testFiles/HLSLRayTracingInternalPrototypes.h
  47. 83 0
      tools/clang/unittests/DxrFallback/testFiles/HLSLRayTracingPrototypes.h
  48. 25 0
      tools/clang/unittests/DxrFallback/testFiles/testLib.h
  49. 59 0
      tools/clang/unittests/DxrFallback/testFiles/testLib.hlsl
  50. 380 0
      tools/clang/unittests/DxrFallback/testFiles/testShader1.hlsl
  51. 228 0
      tools/clang/unittests/DxrFallback/testFiles/testShader2.hlsl
  52. 47 0
      tools/clang/unittests/DxrFallback/testFiles/testShader3.hlsl
  53. 69 0
      tools/clang/unittests/DxrFallback/testFiles/testShader4.hlsl
  54. 84 0
      tools/clang/unittests/DxrFallback/testFiles/testShader5.hlsl
  55. 97 0
      tools/clang/unittests/DxrFallback/testFiles/testTraversal.h
  56. 281 0
      tools/clang/unittests/DxrFallback/testFiles/testTraversal.hlsl
  57. 165 0
      tools/clang/unittests/DxrFallback/testFiles/testTraversal2.hlsl
  58. 974 0
      tools/clang/unittests/DxrFallback/test_DxrFallback.cpp

+ 99 - 0
include/dxc/DxrFallback/DxrFallbackCompiler.h

@@ -0,0 +1,99 @@
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+  class CallInst;
+  class Function;
+  class Module;
+  class Type;
+}
+
+// Combines DXIL raytracing shaders together into a compute shader.
+//
+// The incoming module should contain the following functions if the corresponding
+// intrinsics are called by the specified shaders:
+//    Fallback_TraceRay() 
+//    Fallback_Ignore()
+//    Fallback_AcceptHitAndEndSearch()
+//    Fallback_ReportHit()
+//
+// Fallback_TraceRay() will be called with the original arguments, substituting
+// the offset of the payload on the stack for the actual payload. 
+// Fallback_TraceRay() will also be used to replace calls to TraceRayTest().
+//
+// ReportHit() returns a boolean. But to handle the abort of the intersection
+// shader when AcceptHitAndEndSearch() is called we need a third return value.
+// Fallback_ReportHit() should return an integer < 0 for end search, 0 for ignore, 
+// and > 0 for accept.
+//
+// The module should also contain a single call to Fallback_Scheduler() in the
+// entry shader for the raytracing compute shader.
+//
+// resizeStack() needs to be called after inlining everything in the compute 
+// shader.
+//
+// Currently the main scheduling loop and the implementation for intrinsic 
+// functions come from an internal runtime module.
+// Transforms a module of DXIL raytracing shaders for the raytracing fallback:
+// compile() lowers the shaders into state functions and link() wires those
+// state functions into the "fb_Fallback_Scheduler" function.
+class DxrFallbackCompiler
+{
+public:
+  // Maps a state ID to the name of the state function implementing it.
+  typedef std::map<int, std::string> IntToFuncNameMap;
+
+  // If findCalledShaders is true, then the list of shaderNames is expanded to 
+  // include shader functions (functions with attribute "exp-shader") that are 
+  // called by functions in shaderNames. Shader entry state IDs are still
+  // returned only for those originally in shaderNames. findCalledShaders is
+  // used for testing.
+  //
+  // shaderNames is copied, so the caller's vector does not need to outlive
+  // this object.
+  DxrFallbackCompiler(llvm::Module* module, const std::vector<std::string>& shaderNames, unsigned maxAttributeSize, unsigned stackSizeInBytes, bool findCalledShaders = false);
+
+  // 0 - no debug output
+  // 1 - dump initial combined module, compiled module, and final linked module
+  // 2 - dump intermediate stages of SFT to console
+  // 3 - dump intermediate stages of SFT to file
+  void setDebugOutputLevel(int val);
+
+  // Returns the entry state id for each of shaderNames. The transformations 
+  // are performed in place on the module.
+  // If pCachedMap is non-null, compile() fills it with stateId -> state
+  // function name, and link() uses it to look the state functions up again.
+  // When pCachedMap is null, link() instead looks for functions named
+  // "<shaderName>.ss_<n>" starting at each entry state id.
+  void compile(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap);
+  void link(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap);
+  // TODO: Ideally we would run this after inlining everything at the end of compile.
+  // Until we figure out how to do this, we will call the function after the final link.
+  static void resizeStack(llvm::Function* F, unsigned stackSizeInBytes);
+private:
+  typedef std::map<int, llvm::Function*> IntToFuncMap;
+  typedef std::map<std::string, llvm::Function*> StringToFuncMap;
+
+  llvm::Module* m_module = nullptr;
+  // Held by value (not by reference) so that a temporary or short-lived vector
+  // passed to the constructor cannot leave this member dangling.
+  const std::vector<std::string> m_entryShaderNames;
+  unsigned m_stackSizeInBytes = 0;
+  unsigned m_maxAttributeSize = 0;
+  bool m_findCalledShaders = false;
+  int m_debugOutputLevel = 0;
+
+  // Cleaned shader name -> function, for the shaders being compiled.
+  StringToFuncMap m_shaderMap;
+
+  void initShaderMap(std::vector<std::string>& shaderNames);
+  void linkRuntime();
+  void lowerAnyHitControlFlowFuncs();
+  void lowerReportHit();
+  void lowerTraceRay(llvm::Type* runtimeDataArgTy);
+  void createStateFunctions(IntToFuncMap& stateFunctionMap, std::vector<int>& shaderEntryStateIds, std::vector<unsigned int>& shaderStackSizes, int baseStateId, const std::vector<std::string>& shaderNames, llvm::Type* runtimeDataArgTy);
+  void createLaunchParams(llvm::Function* func);
+  void createStack(llvm::Function* func);
+  void createStateDispatch(llvm::Function* func, const IntToFuncMap& stateFunctionMap, llvm::Type* runtimeDataArgTy);
+  void lowerIntrinsics();
+
+  llvm::Type* getRuntimeDataArgType();
+  llvm::Function* createDispatchFunction(const IntToFuncMap &stateFunctionMap, llvm::Type* runtimeDataArgTy);
+
+  // These functions return calls only in shaders in m_shaderMap.
+  std::vector<llvm::CallInst*> getCallsInShadersToFunction(const std::string& funcName);
+  std::vector<llvm::CallInst*> getCallsInShadersToFunctionWithPrefix(const std::string& funcNamePrefix);
+
+};

+ 20 - 0
include/dxc/HLSL/DxilFallbackLayerPass.h

@@ -0,0 +1,20 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilFallbackLayerPass.h                                                   //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// This file provides passes used by the Ray Tracing Fallback Layer          //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+namespace llvm {
+    // Forward declarations so that this header can be included on its own,
+    // without relying on the includer having pulled in the LLVM pass headers.
+    class ModulePass;
+    class PassRegistry;
+
+    // Factory functions for the passes used by the Ray Tracing Fallback Layer.
+    ModulePass *createDxilUpdateMetadataPass();
+    ModulePass *createDxilPatchShaderRecordBindingsPass();
+
+    // Pass-registry initialization hooks for the passes above.
+    void initializeDxilUpdateMetadataPass(llvm::PassRegistry&);
+    void initializeDxilPatchShaderRecordBindingsPass(llvm::PassRegistry&);
+}

+ 113 - 0
include/dxc/dxcdxrfallbackcompiler.h

@@ -0,0 +1,113 @@
+
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcdxrfallbackcompiler.h                                                  //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides declarations for the DXR Fallback Compiler API.                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef __DXC_DXR_FALLBACK_COMPILER_API__
+#define __DXC_DXR_FALLBACK_COMPILER_API__
+#include "dxcapi.h"
+
+enum class ShaderType : unsigned int // Shader category stored in DxcShaderInfo::Type; values are implicit, starting at 0.
+{
+    Raygen,
+    AnyHit,
+    ClosestHit,
+    Intersection,
+    Miss,
+    Callable,
+    Lib,
+};
+
+struct DxcShaderInfo // Per-shader record; Compile() writes and Link() reads an array of these, one per entry of pShaderNames.
+{
+    UINT32 Identifier; // NOTE(review): presumably the shader's entry state ID -- confirm against the Compile() implementation.
+    UINT32 StackSize;  // NOTE(review): presumably in bytes -- confirm.
+    ShaderType Type;
+};
+
+struct DxcShaderBytecode // Pointer/size pair passed to RenameAndLink(), PatchShaderBindingTables() and Compile().
+{
+    LPBYTE pData;
+    UINT32 Size; // NOTE(review): presumably the byte count of pData -- confirm.
+};
+
+struct DxcExportDesc // Element type of the pExports array passed to RenameAndLink().
+{
+    LPCWSTR ExportToRename; // NOTE(review): presumably the existing export name -- confirm.
+    LPCWSTR ExportName;     // NOTE(review): presumably the name it is renamed to -- confirm.
+};
+
+struct __declspec(uuid("76bb3c85-006d-4b72-9e10-63cd97df57f0"))
+  IDxcDxrFallbackCompiler : public IUnknown {
+
+  // If set to true then shaders not listed in pShaderNames in Compile() but
+  // called by shaders in pShaderNames are added to the final compute shader.
+  // Otherwise these are considered errors. This is intended for testing purposes.
+  virtual HRESULT STDMETHODCALLTYPE SetFindCalledShaders(bool val) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE SetDebugOutput(int val) = 0; // NOTE(review): presumably the 0-3 level documented on DxrFallbackCompiler::setDebugOutputLevel -- confirm.
+
+  virtual HRESULT STDMETHODCALLTYPE RenameAndLink(
+      _In_count_(libCount) DxcShaderBytecode *pLibs,
+      UINT32 libCount,
+      _In_count_(ExportCount) DxcExportDesc *pExports,
+      UINT32 ExportCount,
+      _COM_Outptr_ IDxcOperationResult **ppResult
+  ) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE PatchShaderBindingTables(
+      _In_ const LPCWSTR pEntryName,
+      _In_ DxcShaderBytecode *pShaderBytecode,
+      _In_ void *pShaderInfo,
+      _COM_Outptr_ IDxcOperationResult **ppResult
+  ) = 0;
+
+  // Compiles libs together to create a raytracing compute shader. One of the libs
+  // should be the fallback implementation lib that defines functions like
+  // Fallback_TraceRay(), Fallback_ReportHit(), etc. Fallback_TraceRay() should
+  // be one of the shader names so that it gets included in the compile.
+  virtual HRESULT STDMETHODCALLTYPE Compile(
+    _In_count_(libCount) DxcShaderBytecode *pLibs,                  // Array of libraries containing shaders
+    UINT32 libCount,                                        // Number of libraries containing shaders
+    _In_count_(shaderCount) const LPCWSTR *pShaderNames,    // Array of shader names to compile
+    _Out_writes_(shaderCount) DxcShaderInfo *pShaderInfo,   // Array of shaderInfo corresponding to pShaderNames (output)
+    UINT32 shaderCount,                                     // Number of shaders to compile
+    UINT32 maxAttributeSize,
+    _COM_Outptr_ IDxcOperationResult **ppResult             // Compiler output status, buffer, and errors
+  ) = 0;
+
+  virtual HRESULT STDMETHODCALLTYPE Link(
+      _In_ const LPCWSTR pEntryName,                          // Name of entry function, null if compiling a collection
+      _In_count_(libCount) IDxcBlob **pLibs,                  // Array of libraries containing shaders
+      UINT32 libCount,                                        // Number of libraries containing shaders
+      _In_count_(shaderCount) const LPCWSTR *pShaderNames,    // Array of shader names to link
+      _In_count_(shaderCount) DxcShaderInfo *pShaderInfo,   // Array of shaderInfo corresponding to pShaderNames (input)
+      UINT32 shaderCount,                                     // Number of shaders to link
+      UINT32 maxAttributeSize,
+      UINT32 stackSizeInBytes,                                // Continuation stack size. Use 0 for default.
+      _COM_Outptr_ IDxcOperationResult **ppResult             // Compiler output status, buffer, and errors
+  ) = 0;
+};
+
+// {76bb3c85-006d-4b72-9e10-63cd97df57f0}
+__declspec(selectany) extern const GUID CLSID_DxcDxrFallbackCompiler = {
+  0x76bb3c85,
+  0x006d,
+  0x4b72,
+{ 0x9e, 0x10, 0x63, 0xcd, 0x97, 0xdf, 0x57, 0xf0 }
+};
+
+typedef HRESULT(__stdcall *DxcCreateDxrFallbackCompilerProc)(
+  _In_ REFCLSID   rclsid,
+  _In_ REFIID     riid,
+  _Out_ LPVOID*   ppv
+  );
+
+#endif

+ 1 - 0
lib/CMakeLists.txt

@@ -22,3 +22,4 @@ add_subdirectory(ProfileData)
 # add_subdirectory(LibDriver) # HLSL Change
 add_subdirectory(DxcSupport) # HLSL Change
 add_subdirectory(HLSL) # HLSL Change
+add_subdirectory(DxrFallback) # HLSL Change

+ 14 - 0
lib/DxrFallback/CMakeLists.txt

@@ -0,0 +1,14 @@
+add_llvm_library(LLVMDxrFallback
+  DxrFallbackCompiler.cpp
+  FunctionBuilder.h
+  LiveValues.cpp
+  LiveValues.h
+  LLVMUtils.cpp
+  LLVMUtils.h
+  Reducibility.h
+  Reducibility.cpp
+  StateFunctionTransform.cpp
+  StateFunctionTransform.h
+)
+
+# The generated intrinsics headers must exist before this library is compiled.
+# The LLVM target is spelled "intrinsics_gen"; a misspelled name does not
+# establish the dependency.
+add_dependencies(LLVMDxrFallback intrinsics_gen)

+ 864 - 0
lib/DxrFallback/DxrFallbackCompiler.cpp

@@ -0,0 +1,864 @@
+#include "dxc/DxrFallback/DxrFallbackCompiler.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/Unicode.h"
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/dxcapi.h"
+#include "dxc/dxcdxrfallbackcompiler.h"
+#include "dxc/Support/dxcapi.use.h"
+#include "dxc/Support/dxcapi.impl.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/DxilLinker.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilInstructions.h"
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "FunctionBuilder.h"
+#include "LLVMUtils.h"
+#include "runtime.h"
+#include "StateFunctionTransform.h"
+
+#include <queue>
+
+using namespace hlsl;
+using namespace llvm;
+
+// Collects, in module order, every function of `module` whose name starts
+// with `prefix`.
+static std::vector<Function*> getFunctionsWithPrefix(Module* module, const std::string& prefix)
+{
+  std::vector<Function*> matches;
+  for (Function& candidate : module->functions())
+  {
+    if (candidate.getName().startswith(prefix))
+      matches.push_back(&candidate);
+  }
+  return matches;
+}
+
+
+// Inlines `call` into its caller. If the callee is only declared in the
+// caller's module, its body is first cloned from `Fimpl` (the implementation,
+// possibly living in another module) and the callee is made internal.
+// Returns the result of InlineFunction(), or false when `call` is an indirect
+// call and therefore has no callee to define.
+static bool inlineFunc(CallInst* call, Function* Fimpl)
+{
+  // Note. LLVM inlining may not be sufficient if the function references DX 
+  // resources because the corresponding metadata is not created if the function
+  // comes from another module.
+
+  // Make sure that we have a definition for the called function in this module
+  Function* F = call->getCalledFunction();
+  if (!F)
+    return false; // indirect call: getCalledFunction() yields null
+
+  Module* dstM = F->getParent();
+  if (F->isDeclaration())
+  {
+    // Map called functions in impl module to functions in this one (because the
+    // cloning step doesn't do this automatically)
+    ValueToValueMapTy VMap;
+    for (auto& I : inst_range(Fimpl))
+    {
+      CallInst* c = dyn_cast<CallInst>(&I);
+      if (!c)
+        continue;
+
+      // Indirect calls have no Function to remap; already-mapped callees are
+      // skipped so each one is declared only once.
+      Function* calledFimpl = c->getCalledFunction();
+      if (!calledFimpl || VMap.count(calledFimpl))
+        continue;
+
+      Constant* calledF = dstM->getOrInsertFunction(calledFimpl->getName(), calledFimpl->getFunctionType(), calledFimpl->getAttributes());
+      VMap[calledFimpl] = calledF;
+    }
+
+    // Map arguments
+    for (auto SI = Fimpl->arg_begin(), SE = Fimpl->arg_end(), DI = F->arg_begin(); SI != SE; ++SI, ++DI)
+      VMap[SI] = DI;
+
+    SmallVector<ReturnInst*, 4> returns;
+    CloneFunctionInto(F, Fimpl, VMap, true, returns);
+    F->setLinkage(GlobalValue::InternalLinkage);
+  }
+
+  InlineFunctionInfo IFI;
+  return InlineFunction(call, IFI, false);
+}
+
+
+// Strips the decoration from names of the form "\x1?<name>@@<suffix>" and
+// returns just <name>. Names without that prefix, or without "@@", are
+// returned unchanged.
+static std::string cleanName(StringRef name)
+{
+  const bool hasPrefix = name.startswith("\x1?");
+  const size_t suffixPos = hasPrefix ? name.find("@@") : name.npos;
+  if (suffixPos == name.npos)
+    return name;
+
+  // Skip the two prefix characters and stop right before "@@".
+  std::string undecorated = name.substr(2, suffixPos - 2);
+  return undecorated;
+}
+
+
+// Looks up (declaring it if necessary) a function in `module` with the same
+// name and type as F. The result is whatever dyn_cast<Function> yields for
+// the constant handed back by the module.
+static inline Function* getOrInsertFunction(Module* module, Function* F)
+{
+  Constant* inserted = module->getOrInsertFunction(F->getName(), F->getFunctionType());
+  return dyn_cast<Function>(inserted);
+}
+
+
+// Map lookup that never inserts: returns the value stored under `key`, or
+// `defaultVal` (null by default) when the key is absent.
+template<typename K, typename V>
+V get(std::map<K, V>& theMap, const K& key, V defaultVal = static_cast<V>(nullptr))
+{
+  const auto found = theMap.find(key);
+  return (found != theMap.end()) ? found->second : defaultVal;
+}
+
+
+DxrFallbackCompiler::DxrFallbackCompiler(llvm::Module* module, const std::vector<std::string>& shaderNames, unsigned maxAttributeSize, unsigned stackSizeInBytes, bool findCalledShaders /*= false*/)
+  : m_module(module)
+  , m_entryShaderNames(shaderNames)
+  , m_stackSizeInBytes(stackSizeInBytes)
+  , m_maxAttributeSize(maxAttributeSize)
+  , m_findCalledShaders(findCalledShaders)
+{} // Only records its arguments; the module is not touched until compile() or link() runs.
+
+// Compile step: links in the runtime, lowers the hit-control, ReportHit and
+// TraceRay calls, then creates the state functions. When pCachedMap is
+// non-null it receives stateId -> state function name for every state
+// function created.
+void DxrFallbackCompiler::compile(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap)
+{
+  // Work on a private copy: initShaderMap() may append called shaders to it.
+  std::vector<std::string> allShaderNames(m_entryShaderNames);
+  initShaderMap(allShaderNames);
+
+  // The runtime must be linked in first because it supplies the runtime data type.
+  linkRuntime();
+  Type* const runtimeDataTy = getRuntimeDataArgType();
+
+  // Bring intrinsic and shader calls to function scope and repair control flow.
+  lowerAnyHitControlFlowFuncs();
+  lowerReportHit();
+  lowerTraceRay(runtimeDataTy);
+
+  // stateID -> state function. The base id is arbitrary; 1000 just makes the
+  // state ids easy to recognize.
+  IntToFuncMap stateFunctions;
+  const int firstStateId = 1000;
+  createStateFunctions(stateFunctions, shaderEntryStateIds, shaderStackSizes, firstStateId, allShaderNames, runtimeDataTy);
+
+  if (!pCachedMap)
+    return;
+
+  for (auto& idAndFunc : stateFunctions)
+    (*pCachedMap)[idAndFunc.first] = idAndFunc.second->getName().str();
+}
+
+// Link step: rebuilds the stateId -> state function map and uses it to finish
+// the "fb_Fallback_Scheduler" function (launch params, state dispatch, stack),
+// then lowers the remaining intrinsics.
+//
+// With a non-null pCachedMap the functions are looked up by the cached names.
+// Otherwise, for each entry shader i, functions named "<name>.ss_0",
+// "<name>.ss_1", ... are collected until one is missing and are assigned the
+// ids shaderEntryStateIds[i] + 0, + 1, ...
+// shaderStackSizes is not used by this function.
+void DxrFallbackCompiler::link(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap)
+{
+    IntToFuncMap stateFunctionMap; // stateID -> state function
+    if (pCachedMap)
+    {
+        // Iterate by reference: each entry holds a std::string.
+        for (const auto& entry : *pCachedMap)
+        {
+            stateFunctionMap[entry.first] = m_module->getFunction(entry.second);
+        }
+    }
+    else
+    {
+        assert(shaderEntryStateIds.size() <= m_entryShaderNames.size() && "More entry state ids than entry shader names");
+        for (size_t i = 0; i < shaderEntryStateIds.size(); i++)
+        {
+            const int baseStateId = shaderEntryStateIds[i];
+            for (int substateIndex = 0; ; ++substateIndex)
+            {
+                const std::string substateName = m_entryShaderNames[i] + ".ss_" + std::to_string(substateIndex);
+
+                // Look the function up once and reuse the result.
+                Function* function = m_module->getFunction(substateName);
+                if (!function) break;
+                stateFunctionMap[baseStateId + substateIndex] = function;
+            }
+        }
+    }
+    
+    // Fix up scheduler
+    Function* schedulerFunc = m_module->getFunction("fb_Fallback_Scheduler");
+    assert(schedulerFunc && "fb_Fallback_Scheduler not found");
+    createLaunchParams(schedulerFunc);
+
+    Type* runtimeDataArgTy = getRuntimeDataArgType();
+    createStateDispatch(schedulerFunc, stateFunctionMap, runtimeDataArgTy);
+    createStack(schedulerFunc);
+
+    lowerIntrinsics();
+}
+
+
+void DxrFallbackCompiler::setDebugOutputLevel(int val)
+{
+  m_debugOutputLevel = val; // Stored as given; no range check is applied here.
+}
+
+// A function counts as a shader if it carries the "exp-shader" attribute, or
+// if the DXIL module has function props for it that report a ray shader.
+static bool isShader(Function* F)
+{
+  if (!F->hasFnAttribute("exp-shader"))
+  {
+    DxilModule& dxilModule = F->getParent()->GetDxilModule();
+    return dxilModule.HasDxilFunctionProps(F) && dxilModule.GetDxilFunctionProps(F).IsRay();
+  }
+  return true;
+}
+
+// Returns the ray shader kind of F: RayGeneration for functions tagged
+// "exp-shader", the kind recorded in the DXIL function props for ray shaders,
+// and Invalid for everything else.
+DXIL::ShaderKind getRayShaderKind(Function* F)
+{
+  if (F->hasFnAttribute("exp-shader"))
+    return DXIL::ShaderKind::RayGeneration;
+
+  DxilModule& dxilModule = F->getParent()->GetDxilModule();
+  const bool isRayShader = dxilModule.HasDxilFunctionProps(F) && dxilModule.GetDxilFunctionProps(F).IsRay();
+  if (!isRayShader)
+    return DXIL::ShaderKind::Invalid;
+
+  return dxilModule.GetDxilFunctionProps(F).shaderKind;
+}
+
+
+// Decides whether an intrinsic read in F should see the "pending" value
+// rather than the committed one. Any-hit shaders always use pending values;
+// intersection shaders do too, except for "rayTCurrent", which stays
+// committed. Functions without DXIL function props never use pending values.
+static bool shouldUsePendingValue(Function* F, StringRef instrinsicName)
+{
+  DxilModule& dxilModule = F->getParent()->GetDxilModule();
+  if (!dxilModule.HasDxilFunctionProps(F))
+    return false;
+
+  const hlsl::DxilFunctionProps& shaderProps = dxilModule.GetDxilFunctionProps(F);
+  if (shaderProps.IsAnyHit())
+    return true;
+  if (shaderProps.IsIntersection())
+    return instrinsicName != "rayTCurrent";
+  return false;
+}
+
+// Fills m_shaderMap with cleaned-name -> function for every name in
+// shaderNames (null when no defined shader has that name) and strips NoInline
+// from every function in the module.
+//
+// When m_findCalledShaders is set, the call graph is walked from the given
+// shaders, and every defined shader reached that is not yet in m_shaderMap is
+// added to both m_shaderMap and shaderNames.
+void DxrFallbackCompiler::initShaderMap(std::vector<std::string>& shaderNames)
+{
+  // Clean names and collect every shader defined in the module
+  StringToFuncMap allShadersMap;
+  for (Function& F : m_module->functions())
+  {
+    if (isShader(&F) && !F.isDeclaration())
+      allShadersMap[cleanName(F.getName())] = &F;
+
+    F.removeFnAttr(Attribute::NoInline);
+  }
+
+
+  // Use find() rather than operator[] so that unknown names do not add null
+  // entries to allShadersMap. Missing shaders are still recorded as null in
+  // m_shaderMap.
+  for (auto& name : shaderNames)
+  {
+    auto found = allShadersMap.find(name);
+    m_shaderMap[name] = (found != allShadersMap.end()) ? found->second : nullptr;
+  }
+
+
+  if (!m_findCalledShaders)
+    return;
+
+
+  // Create a map from shader name to CallGraphNode. This must cover ALL
+  // defined shaders, not just the requested ones: the traversal below looks
+  // up nodes for shaders that are not in m_shaderMap yet.
+  CallGraph callGraph(*m_module);
+  std::map<std::string, CallGraphNode*> allShaderNodes;
+  for (auto& kv : allShadersMap)
+    allShaderNodes[kv.first] = callGraph[kv.second];
+
+  // Start traversing the call graph from given shaderNames. Names without a
+  // defined shader have no node and are skipped.
+  std::deque<CallGraphNode*> workList;
+  for (auto& name : shaderNames)
+  {
+    auto node = allShaderNodes.find(name);
+    if (node != allShaderNodes.end())
+      workList.push_back(node->second);
+  }
+  while (!workList.empty())
+  {
+    CallGraphNode* cur = workList.front();
+    workList.pop_front();
+    for (size_t i = 0; i < cur->size(); ++i)
+    {
+      Function* nextFunc = (*cur)[i]->getFunction();
+      if (!nextFunc || !isShader(nextFunc))
+        continue;
+
+      const std::string nextName = cleanName(nextFunc->getName());
+      if (m_shaderMap.count(nextName) != 0) // already in the shaderMap
+        continue;
+
+      // A shader that is only declared has no node; there is nothing to add.
+      auto nextNode = allShaderNodes.find(nextName);
+      if (nextNode == allShaderNodes.end())
+        continue;
+
+      workList.push_back(nextNode->second);
+      shaderNames.push_back(nextName);
+      m_shaderMap[nextName] = nextNode->second->getFunction();
+    }
+  }
+}
+
+// Parses the built-in runtime (getRuntimeString()) in the module's context
+// and links it into m_module. A link failure only trips the assert.
+void DxrFallbackCompiler::linkRuntime()
+{
+  Linker moduleLinker(m_module);
+  std::unique_ptr<Module> runtime = loadModuleFromAsmString(m_module->getContext(), getRuntimeString());
+  const bool failed = moduleLinker.linkInModule(runtime.get());
+  assert(!failed && "Error linking runtime");
+  UNREFERENCED_PARAMETER(failed);
+}
+
+// Replaces the terminator of the block containing `call` with a "ret void"
+// (the call is expected to be followed directly by "unreachable"), then
+// inlines F at the call site.
+static void inlineFuncAndAddRet(CallInst* call, Function*F)
+{
+  BasicBlock* callBlock = call->getParent();
+  ReplaceInstWithInst(callBlock->getTerminator(), ReturnInst::Create(call->getContext()));
+
+  const bool inlined = inlineFunc(call, F);
+  assert(inlined);
+  UNREFERENCED_PARAMETER(inlined);
+}
+
+// Replaces calls to the dx.op.ignoreHit and dx.op.acceptHitAndEndSearch
+// intrinsics in the shaders with inlined copies of the corresponding
+// Fallback_* implementations, each followed by a return.
+void DxrFallbackCompiler::lowerAnyHitControlFlowFuncs()
+{
+  // IgnoreHit()
+  const std::vector<CallInst*> ignoreHitCalls = getCallsInShadersToFunction("dx.op.ignoreHit");
+  if (!ignoreHitCalls.empty())
+  {
+    Function* ignoreHitImpl = m_module->getFunction("\x1?Fallback_IgnoreHit@@YAXXZ");
+    assert(ignoreHitImpl && "IgnoreHit() implementation not found");
+    for (CallInst* site : ignoreHitCalls)
+      inlineFuncAndAddRet(site, ignoreHitImpl);
+  }
+
+  // AcceptHitAndEndSearch()
+  const std::vector<CallInst*> acceptHitCalls = getCallsInShadersToFunction("dx.op.acceptHitAndEndSearch");
+  if (!acceptHitCalls.empty())
+  {
+    Function* acceptHitImpl = m_module->getFunction("\x1?Fallback_AcceptHitAndEndSearch@@YAXXZ");
+    assert(acceptHitImpl && "AcceptHitAndEndSearch() implementation not found");
+    for (CallInst* site : acceptHitCalls)
+      inlineFuncAndAddRet(site, acceptHitImpl);
+  }
+}
+
+void DxrFallbackCompiler::lowerReportHit() // Rewrites every dx.op.reportHit* call in the shaders into a call to Fallback_ReportHit plus explicit control flow.
+{
+  std::vector<CallInst*> callsToReportHit = getCallsInShadersToFunctionWithPrefix("dx.op.reportHit");
+  if (callsToReportHit.empty())
+    return;
+
+  Function* reportHitFunc = m_module->getFunction("\x1?Fallback_ReportHit@@YAHMI@Z");
+  assert(reportHitFunc && "ReportHit() implementation not found");
+
+  LLVMContext& C = m_module->getContext();
+  for (CallInst* call : callsToReportHit)
+  {
+    // Pass the attribute argument to a Fallback_SetPendingAttr() call placed before the original call
+    Instruction* insertBefore = call;
+    hlsl::DxilInst_ReportHit reportHitCall(call);
+
+    Value* attr = reportHitCall.get_Attributes();
+    Function* setPendingAttrFunc = FunctionBuilder(m_module, "\x1?Fallback_SetPendingAttr@@").voidTy().type(attr->getType(), "attr").build();
+    CallInst::Create(setPendingAttrFunc, { attr }, "", insertBefore);
+
+    // Call the implementation with (THit, HitKind); its integer result drives the control flow below
+    CallInst* callImpl = CallInst::Create(reportHitFunc, { reportHitCall.get_THit(), reportHitCall.get_HitKind() }, "reportHit.result", insertBefore);
+    Value* result = callImpl;
+
+    // Result < 0 ==> return from the shader (end search)
+    Value* zero = makeInt32(0, C);
+    Value* ltz = new ICmpInst(insertBefore, CmpInst::ICMP_SLT, result, zero, "endSearch");
+    BasicBlock* prevBlock = call->getParent();
+    BasicBlock* retBlock = prevBlock->splitBasicBlock(call, "endSearch"); // retBlock now starts at the original call
+    BasicBlock* nextBlock = retBlock->splitBasicBlock(call, "afterReportHit"); // the call moves on into nextBlock; retBlock keeps only its branch
+    ReplaceInstWithInst(prevBlock->getTerminator(), BranchInst::Create(retBlock, nextBlock, ltz));
+    ReplaceInstWithInst(retBlock->getTerminator(), ReturnInst::Create(C));
+
+    // Result > 0 ==> accepted; this boolean replaces every use of the original call's result
+    Value* gtz = new ICmpInst(insertBefore, CmpInst::ICMP_SGT, result, zero, "accepted");
+    call->replaceAllUsesWith(gtz);
+
+    bool success = inlineFunc(callImpl, reportHitFunc);
+    assert(success);
+    (void)success;
+
+    call->eraseFromParent(); // the original intrinsic call has no remaining uses
+  }
+}
+
+void DxrFallbackCompiler::lowerTraceRay(Type* runtimeDataArgTy) // Replaces dx.op.traceRay* (or, failing that, TraceRayTest) calls in the shaders with calls to the single Fallback_TraceRay implementation.
+{
+  std::vector<CallInst*> callsToTraceRay = getCallsInShadersToFunctionWithPrefix("dx.op.traceRay");
+  if (callsToTraceRay.empty())
+  {
+    // TODO: It might be worth dropping this from the tests eventually
+    callsToTraceRay = getCallsInShadersToFunctionWithPrefix("\x1?TraceRayTest@@");
+    if (callsToTraceRay.empty())
+      return;
+  }
+
+  std::vector<Function*> traceRayImpl = getFunctionsWithPrefix(m_module, "\x1?Fallback_TraceRay@@");
+  assert(traceRayImpl.size() == 1 && "Could not find Fallback_TraceRay() implementation");
+
+  enum { CLOSEST_HIT = 0, MISS = 1 }; // indices into the save/restore arrays below
+  Function* traceRaySave[] = { m_module->getFunction("traceRaySave_ClosestHit"), m_module->getFunction("traceRaySave_Miss") };
+  Function* traceRayRestore[] = { m_module->getFunction("traceRayRestore_ClosestHit"), m_module->getFunction("traceRayRestore_Miss") };
+  assert(traceRaySave[CLOSEST_HIT] && traceRayRestore[CLOSEST_HIT] && traceRaySave[MISS] && traceRayRestore[MISS] &&
+    "Could not find TraceRay spill functions");
+
+  Function* dummyRuntimeDataArgFunc = StateFunctionTransform::createDummyRuntimeDataArgFunc(m_module, runtimeDataArgTy);
+  assert(dummyRuntimeDataArgFunc && "dummyRuntimeDataArg function could not be created.");
+
+  // Process calls
+  LLVMContext& C = m_module->getContext();
+  Type* int32Ty = Type::getInt32Ty(C);
+  std::map<FunctionType*, Function*> movePayloadToStackFuncs; // one movePayloadToStack function per function type
+  std::map<Function*, AllocaInst*> funcToSpillAlloca; // one spill alloca per calling shader, shared by all of its TraceRay calls
+  for (CallInst* call : callsToTraceRay)
+  {
+    Instruction* insertBefore = call;
+
+    
+    // Spill runtime data values, if necessary (closesthit and miss shaders)
+    Function* caller = call->getParent()->getParent();
+    DXIL::ShaderKind kind = getRayShaderKind(caller);
+    if (kind == DXIL::ShaderKind::ClosestHit || kind == DXIL::ShaderKind::Miss)
+    {
+      int sh = (kind == DXIL::ShaderKind::ClosestHit) ? CLOSEST_HIT : MISS;
+      AllocaInst* spillAlloca = get(funcToSpillAlloca, caller);
+      if (!spillAlloca)
+      {
+        Argument* spillAllocaArg = (++traceRaySave[sh]->arg_begin()); // second parameter of the save function
+        Type* spillAllocaTy = spillAllocaArg->getType()->getPointerElementType();
+        spillAlloca = new AllocaInst(spillAllocaTy, "spill.alloca", caller->getEntryBlock().begin()); // placed at the top of the caller's entry block
+        funcToSpillAlloca[caller] = spillAlloca;
+      }
+      
+      // Create calls. SFT will inline them.
+      Value* runtimeDataArg = CallInst::Create(dummyRuntimeDataArgFunc, "runtimeData", insertBefore);
+      CallInst::Create(traceRaySave[sh], {runtimeDataArg, spillAlloca}, "", insertBefore); // save before the trace
+      CallInst::Create(traceRayRestore[sh], {runtimeDataArg, spillAlloca}, "", getInstructionAfter(call)); // restore after the trace
+    }
+
+    
+    // Get the payload offset to pass to trace implementation
+    //hlsl::DxilInst_TraceRay traceRayCall(call);
+    // TODO: Avoiding the intrinsic to support the test's use of TraceRayTest
+    Value* payload = call->getOperand(call->getNumArgOperands() - 1); // the payload is the last call argument
+    FunctionType* funcType = FunctionType::get(int32Ty, { payload->getType() }, false);
+    Function* movePayloadToStackFunc = getOrCreateFunction("movePayloadToStack", m_module, funcType, movePayloadToStackFuncs);
+    Value* newPayloadOffset = CallInst::Create(movePayloadToStackFunc, { payload }, "new.payload.offset", insertBefore);
+
+    // Call implementation
+    unsigned i = 0;
+    if (call->getCalledFunction()->getName().startswith("dx.op"))
+      i += 2; // skip intrinsic number and acceleration structure (for now)
+    std::vector<Value*> args;
+    for (; i < call->getNumArgOperands() - 1; ++i) // forward everything except the trailing payload argument
+      args.push_back(call->getArgOperand(i));
+    args.push_back(newPayloadOffset); // the payload is passed as its stack offset instead
+    CallInst::Create(traceRayImpl[0], args, "", insertBefore);
+
+    call->eraseFromParent();
+  }
+}
+
+// Builds the list of parameter semantics that is handed to
+// StateFunctionTransform::setParameterInfo() for a shader of the given kind:
+//   - AnyHit / ClosestHit: payload followed by attribute
+//   - Miss:                payload only
+//   - anything else:       PST_NONE entries, one per operand of F
+static std::vector<StateFunctionTransform::ParameterSemanticType> getParameterTypes(Function* F, DXIL::ShaderKind shaderKind)
+{
+  typedef StateFunctionTransform::ParameterSemanticType SemanticType;
+
+  switch (shaderKind)
+  {
+  case DXIL::ShaderKind::AnyHit:
+  case DXIL::ShaderKind::ClosestHit:
+    return std::vector<SemanticType>{ StateFunctionTransform::PST_PAYLOAD, StateFunctionTransform::PST_ATTRIBUTE };
+
+  case DXIL::ShaderKind::Miss:
+    return std::vector<SemanticType>{ StateFunctionTransform::PST_PAYLOAD };
+
+  default:
+    // NOTE(review): getNumOperands() counts the operands of the Function
+    // value itself, not its formal arguments - presumably arg_size() was
+    // intended here. Confirm against StateFunctionTransform before changing.
+    return std::vector<SemanticType>(F->getNumOperands(), StateFunctionTransform::PST_NONE);
+  }
+}
+
+// Adds the global symbol of every entry of 'list' to 'out'.
+template <typename ResourceList>
+static void insertGlobalSymbols(const ResourceList& list, std::set<Value*>& out)
+{
+  for (auto& res : list)
+    out.insert(res->GetGlobalSymbol());
+}
+
+// Gathers the global symbols of all constant buffers, UAVs, SRVs and samplers
+// registered in the DXIL module into 'resources'.
+static void collectResources(DxilModule& DM, std::set<Value*>& resources)
+{
+  insertGlobalSymbols(DM.GetCBuffers(), resources);
+  insertGlobalSymbols(DM.GetUAVs(), resources);
+  insertGlobalSymbols(DM.GetSRVs(), resources);
+  insertGlobalSymbols(DM.GetSamplers(), resources);
+}
+
+
+// Runs the StateFunctionTransform on every shader listed in 'shaderNames'.
+//
+// Outputs:
+//   stateFunctionMap    - receives each generated state function, keyed by a
+//                         state id assigned consecutively from 'baseStateId'
+//   shaderEntryStateIds - per shader, the state id of its first state function
+//   shaderStackSizes    - per shader, the stack size reported by the transform
+// Both vectors are cleared first and end up parallel to 'shaderNames'.
+void DxrFallbackCompiler::createStateFunctions(
+  IntToFuncMap& stateFunctionMap,
+  std::vector<int>& shaderEntryStateIds,
+  std::vector<unsigned int>& shaderStackSizes,
+  int baseStateId,
+  const std::vector<std::string>& shaderNames,
+  Type* runtimeDataArgTy
+)
+{
+  // Report shaders that have no function; this is diagnostic only.
+  for (auto& entry : m_shaderMap)
+  {
+    if (!entry.second)
+      errs() << "Function not found for shader " << entry.first << "\n";
+  }
+
+  DxilModule& dxilModule = m_module->GetOrCreateDxilModule();
+  std::set<Value*> resourceGlobals;
+  collectResources(dxilModule, resourceGlobals);
+
+  shaderEntryStateIds.clear();
+  shaderStackSizes.clear();
+
+  int nextStateId = baseStateId;
+  for (auto& shaderName : shaderNames)
+  {
+    Function* shaderFunc = m_shaderMap[shaderName];
+    StateFunctionTransform transform(shaderFunc, shaderNames, runtimeDataArgTy);
+
+    // Debug output is driven by the compiler's debug level.
+    if (m_debugOutputLevel >= 2)
+      transform.setVerbose(true);
+    if (m_debugOutputLevel >= 3)
+      transform.setDumpFilename("dump.ll");
+
+    if (shaderName == "Fallback_TraceRay")
+      transform.setAttributeSize(m_maxAttributeSize);
+
+    const DXIL::ShaderKind kind = getRayShaderKind(shaderFunc);
+    if (kind != DXIL::ShaderKind::Invalid)
+    {
+      const bool isClosestHit = (kind == DXIL::ShaderKind::ClosestHit);
+      transform.setParameterInfo(getParameterTypes(shaderFunc, kind), isClosestHit);
+    }
+    transform.setResourceGlobals(resourceGlobals);
+
+    std::vector<Function*> generated;
+    UINT stackSize = 0;
+    transform.run(generated, stackSize);
+
+    // The entry state of the shader is the first id handed out below.
+    shaderEntryStateIds.push_back(nextStateId);
+    shaderStackSizes.push_back(stackSize);
+    for (Function* stateFunc : generated)
+    {
+      stateFunctionMap[nextStateId] = stateFunc;
+      ++nextStateId;
+      if (dxilModule.HasDxilFunctionProps(shaderFunc)) {
+        dxilModule.CloneDxilEntryProps(shaderFunc, stateFunc);
+      }
+    }
+  }
+
+  StateFunctionTransform::finalizeStateIds(m_module, shaderEntryStateIds);
+}
+
+// Replaces the call to the placeholder "rewrite_setLaunchParams" with a call
+// to "fb_Fallback_SetLaunchParams".
+//
+// The replacement call receives the runtime data pointer and the two
+// dimensions taken from the placeholder call (operands 0, 1 and 2), plus the
+// x/y thread id and the flattened thread-in-group index, which are read
+// through freshly inserted dx.op calls. The placeholder call and the
+// placeholder function are erased afterwards.
+//
+// 'func' is only used to locate the module.
+void DxrFallbackCompiler::createLaunchParams(Function* func)
+{
+  Module* module = func->getParent();
+  Function* rewrite_setLaunchParams = module->getFunction("rewrite_setLaunchParams");
+  assert(rewrite_setLaunchParams && "rewrite_setLaunchParams is missing from the module");
+  assert(!rewrite_setLaunchParams->user_empty() && "rewrite_setLaunchParams is never called");
+  CallInst* call = dyn_cast<CallInst>(*rewrite_setLaunchParams->user_begin());
+  assert(call && "rewrite_setLaunchParams has a non-call user");
+
+  LLVMContext& context = module->getContext();
+  Instruction* insertBefore = call;
+
+  // Thread id, components 0 (x) and 1 (y).
+  Function* DTidFunc = FunctionBuilder(module, "dx.op.threadId.i32").i32().i32().i32().build();
+  Value* DTidx = CallInst::Create(DTidFunc, { makeInt32((int)hlsl::OP::OpCode::ThreadId, context), makeInt32(0, context) }, "DTidx", insertBefore);
+  Value* DTidy = CallInst::Create(DTidFunc, { makeInt32((int)hlsl::OP::OpCode::ThreadId, context), makeInt32(1, context) }, "DTidy", insertBefore);
+
+  Value* dimx = call->getArgOperand(1);
+  Value* dimy = call->getArgOperand(2);
+
+  // Use the opcode enum (value 96) instead of a bare literal, matching the
+  // ThreadId calls above.
+  Function* groupIndexFunc = FunctionBuilder(module, "dx.op.flattenedThreadIdInGroup.i32").i32().i32().build();
+  Value* groupIndex = CallInst::Create(groupIndexFunc, { makeInt32((int)hlsl::OP::OpCode::FlattenedThreadIdInGroup, context) }, "groupIndex", insertBefore);
+
+  Function* fb_setLaunchParams = module->getFunction("fb_Fallback_SetLaunchParams");
+  assert(fb_setLaunchParams && "fb_Fallback_SetLaunchParams is missing from the module");
+  Value* runtimeDataArg = call->getArgOperand(0);
+  CallInst::Create(fb_setLaunchParams, { runtimeDataArg, DTidx, DTidy, dimx, dimy, groupIndex }, "", insertBefore);
+
+  call->eraseFromParent();
+  rewrite_setLaunchParams->eraseFromParent();
+}
+
+// Generates the state dispatch function and substitutes it for every use of
+// the placeholder "rewrite_dispatch", which is then removed from the module.
+// 'func' is only used to locate the module.
+void DxrFallbackCompiler::createStateDispatch(Function* func, const IntToFuncMap& stateFunctionMap, Type* runtimeDataArgTy)
+{
+  Module* owningModule = func->getParent();
+  Function* generatedDispatch = createDispatchFunction(stateFunctionMap, runtimeDataArgTy);
+
+  Function* placeholder = owningModule->getFunction("rewrite_dispatch");
+  placeholder->replaceAllUsesWith(generatedDispatch);
+  placeholder->eraseFromParent();
+}
+
+// Replaces the two stack placeholders emitted by the runtime:
+//   - the call to "rewrite_createStack" becomes an alloca named "theStack"
+//   - the call to "rewrite_getStackSize" becomes the constant stack size
+// Both placeholder functions are erased from the module.
+void DxrFallbackCompiler::createStack(Function* func)
+{
+  // Ideally the stack would get its final size right here, but DXIL doesn't
+  // allow bitcasts between objects of different sizes. The alloca therefore
+  // keeps the default type from the runtime and the accesses are replaced
+  // later.
+  Function* createStackStub = m_module->getFunction("rewrite_createStack");
+  CallInst* createStackCall = dyn_cast<CallInst>(*createStackStub->user_begin());
+  Type* defaultStackTy = createStackCall->getType()->getPointerElementType();
+  AllocaInst* stack = new AllocaInst(defaultStackTy, "theStack", createStackCall);
+  stack->setAlignment(sizeof(int));
+  createStackCall->replaceAllUsesWith(stack);
+  createStackCall->eraseFromParent();
+  createStackStub->eraseFromParent();
+
+  // No size requested: fall back to the element count of the default stack.
+  if (m_stackSizeInBytes == 0)
+    m_stackSizeInBytes = stack->getType()->getPointerElementType()->getArrayNumElements() * sizeof(int);
+
+  Function* getStackSizeStub = m_module->getFunction("rewrite_getStackSize");
+  CallInst* getStackSizeCall = dyn_cast<CallInst>(*getStackSizeStub->user_begin());
+  LLVMContext& ctx = func->getContext();
+  getStackSizeCall->replaceAllUsesWith(makeInt32(m_stackSizeInBytes, ctx));
+  getStackSizeCall->eraseFromParent();
+  getStackSizeStub->eraseFromParent();
+}
+
+// Workaround for the awkward <3 x float> code that vanilla clang emits in the
+// runtime: a 3-element vector argument is passed as three scalars instead.
+//
+// If 'arg' is a vector with exactly three elements, appends one
+// extractelement per component (inserted before 'insertBefore') to 'args' and
+// returns true. Otherwise leaves 'args' untouched and returns false.
+static bool expandFloat3(std::vector<Value*>& args, Value* arg, Instruction* insertBefore)
+{
+  VectorType* vecTy = dyn_cast<VectorType>(arg->getType());
+  const bool isVec3 = vecTy && vecTy->getVectorNumElements() == 3;
+  if (!isVec3)
+    return false;
+
+  static const char* const componentNames[] = { "vec.x", "vec.y", "vec.z" };
+  LLVMContext& ctx = arg->getContext();
+  for (int component = 0; component < 3; ++component)
+  {
+    Value* index = makeInt32(component, ctx);
+    args.push_back(ExtractElementInst::Create(arg, index, componentNames[component], insertBefore));
+  }
+
+  return true;
+}
+
+// If 'arg' has the struct type "class.matrix.float.3.4", reinterprets it as a
+// <12 x float> and appends that vector to 'args', returning true. The value is
+// round-tripped through a temporary alloca placed at the top of the entry
+// block; the store, bitcast and load are inserted before 'insertBefore'.
+// Returns false (and leaves 'args' untouched) for any other type.
+static bool float3x4ToFloat12(std::vector<Value*>& args, Value* arg, Instruction* insertBefore)
+{
+  Type* argTy = arg->getType();
+  StructType* structTy = dyn_cast<StructType>(argTy);
+  if (structTy == nullptr)
+    return false;
+  if (structTy->getName() != "class.matrix.float.3.4")
+    return false;
+
+  Function* enclosing = insertBefore->getParent()->getParent();
+  BasicBlock& entry = enclosing->getEntryBlock();
+  AllocaInst* slot = new AllocaInst(argTy, "tmp", entry.begin());
+  new StoreInst(arg, slot, insertBefore);
+
+  VectorType* float12Ty = VectorType::get(Type::getFloatTy(arg->getContext()), 12);
+  Value* float12Ptr = new BitCastInst(slot, float12Ty->getPointerTo(), "vec12.ptr", insertBefore);
+  args.push_back(new LoadInst(float12Ptr, "vec12.", insertBefore));
+
+  return true;
+}
+
+// Redirects calls to fallback intrinsics and DXIL operations to their runtime
+// implementations, i.e. the module functions whose names start with "fb_".
+//
+// For every "fb_Fallback_<X>" / "fb_dxop_<X>" function, the calls to the
+// first function whose name starts with "\x1?Fallback_<X>" / "dx.op.<X>" are
+// rewritten:
+//   - the caller's first argument (the runtime data) is prepended, except for
+//     Fallback_Scheduler; calls from functions without the "state_function"
+//     attribute are then left alone
+//   - the DXIL opcode operand is dropped for dx.op calls
+//   - 3-element vectors and float3x4 matrix structs are repacked (see
+//     expandFloat3 / float3x4ToFloat12)
+//   - when a "pending" variant exists and shouldUsePendingValue() selects it
+//     for the caller, the pending variant is called instead
+void DxrFallbackCompiler::lowerIntrinsics()
+{
+  std::vector<Function*> intrinsics = getFunctionsWithPrefix(m_module, "fb_");
+  assert(intrinsics.size() > 0);
+
+
+  // Replace intrinsics in anyhit shaders with their pending versions.
+  // First index the pending variants by the intrinsic name they stand in for.
+  LLVMContext& C = m_module->getContext();
+  std::map<std::string, Function*> pendingIntrinsics;
+  std::string pendingPrefixes[] = { "fb_dxop_pending_",  "fb_Fallback_Pending" };
+  for (auto& F : intrinsics)
+  {
+    std::string intrinsicName;
+    if (F->getName().startswith(pendingPrefixes[0]))
+      intrinsicName = F->getName().substr(pendingPrefixes[0].length());
+    else if (F->getName().startswith(pendingPrefixes[1]))
+      intrinsicName = "Fallback_" + F->getName().substr(pendingPrefixes[1].length()).str();
+    else
+      continue;
+
+    pendingIntrinsics[intrinsicName] = F;
+  }
+
+  for (Function* func : intrinsics)
+  {
+    StringRef intrinsicName;
+    std::string name;
+    bool isDxilOp = false;
+    if (func->getName().startswith("fb_Fallback_"))
+    {
+      intrinsicName = func->getName().substr(3); // after the "fb_" prefix
+      name = "\x1?" + intrinsicName.str();
+    }
+    else if (func->getName().startswith("fb_dxop_"))
+    {
+      intrinsicName = func->getName().substr(8);
+      name = "dx.op." + intrinsicName.str();
+      isDxilOp = true;
+    }
+    else
+    {
+      assert(0 && "Bad intrinsic");
+      // With asserts compiled out 'name' would still be empty here, and an
+      // empty prefix can match any function in the module. Skip the function
+      // instead of rewriting calls to an unrelated callee.
+      continue;
+    }
+    std::vector<Function*> calledFunc = getFunctionsWithPrefix(m_module, name);
+    if (calledFunc.empty())
+      continue;
+    std::vector<CallInst*> calls = getCallsToFunction(calledFunc[0]);
+    if (calls.empty())
+      continue;
+
+
+    bool needsRuntimeDataArg = (intrinsicName != "Fallback_Scheduler");
+    Function* pendingFunc = get(pendingIntrinsics, intrinsicName.str());
+    // Resolved lazily, at most once per intrinsic.
+    Function* funcInModule = nullptr;
+    Function* pendingFuncInModule = nullptr;
+    for (CallInst* call : calls)
+    {
+      Function* caller = call->getParent()->getParent();
+      if (needsRuntimeDataArg && !caller->hasFnAttribute("state_function"))
+        continue;
+
+      Function* F = nullptr;
+      if (pendingFunc && shouldUsePendingValue(caller, intrinsicName))
+      {
+        if (!pendingFuncInModule)
+          pendingFuncInModule = getOrInsertFunction(m_module, pendingFunc);
+        F = pendingFuncInModule;
+      }
+      else
+      {
+        if (!funcInModule)
+          funcInModule = getOrInsertFunction(m_module, func);
+        F = funcInModule;
+      }
+
+      // insert runtime data and the rest of the arguments
+      std::vector<Value*> args;
+      if (needsRuntimeDataArg)
+        args.push_back(caller->arg_begin());
+      int argIdx = 0;
+      for (auto& arg : call->arg_operands())
+      {
+        if (argIdx++ == 0 && isDxilOp)
+          continue; // skip the intrinsic number
+        if (!expandFloat3(args, arg, call) && !float3x4ToFloat12(args, arg, call))
+          args.push_back(arg);
+      }
+
+      CallInst* newCall = CallInst::Create(F, args, "", call);
+      // Only a non-void replacement can take over the uses of the old call.
+      if (F->getFunctionType()->getReturnType() != Type::getVoidTy(C))
+      {
+        newCall->takeName(call);
+        call->replaceAllUsesWith(newCall);
+      }
+      call->eraseFromParent();
+    }
+  }
+}
+
+// Returns the type of the runtime data argument, read off the first parameter
+// of the runtime function "stackIntPtr". The runtime is assumed to have been
+// linked into the module already.
+Type* DxrFallbackCompiler::getRuntimeDataArgType()
+{
+  Function* knownRuntimeFunc = m_module->getFunction("stackIntPtr");
+  Function::arg_iterator firstArg = knownRuntimeFunc->arg_begin();
+  return firstArg->getType();
+}
+
+// Builds "i32 dispatch(runtimeData, i32 stateID)": a switch over 'stateID'
+// with one case per entry of 'stateFunctionMap'. Each case calls the matching
+// state function with 'runtimeData' and returns its result (the next state
+// id). Unknown state ids return -3.
+Function* DxrFallbackCompiler::createDispatchFunction(const IntToFuncMap &stateFunctionMap, Type* runtimeDataArgTy)
+{
+  LLVMContext& ctx = m_module->getContext();
+  FunctionType* stateFuncTy = FunctionType::get(Type::getInt32Ty(ctx), { runtimeDataArgTy }, false);
+
+  Function* dispatchFunc = FunctionBuilder(m_module, "dispatch").i32().type(runtimeDataArgTy, "runtimeData").i32("stateID").build();
+  Function::arg_iterator argIt = dispatchFunc->arg_begin();
+  Value* runtimeDataArg = argIt;
+  Value* stateIdArg = ++argIt;
+
+  BasicBlock* entryBlock = BasicBlock::Create(ctx, "entry", dispatchFunc);
+  BasicBlock* badBlock = BasicBlock::Create(ctx, "badStateID", dispatchFunc);
+
+  // Default target of the switch.
+  IRBuilder<> builder(badBlock);
+  builder.CreateRet(makeInt32(-3, ctx)); // return an error value
+
+  builder.SetInsertPoint(entryBlock);
+  SwitchInst* stateSwitch = builder.CreateSwitch(stateIdArg, badBlock, stateFunctionMap.size());
+
+  for (auto& kv : stateFunctionMap)
+  {
+    const int stateId = kv.first;
+    Function* stateFunc = kv.second;
+
+    Value* callee = m_module->getOrInsertFunction(stateFunc->getName(), stateFuncTy);
+
+    // Case blocks are placed in front of the error block.
+    BasicBlock* caseBlock = BasicBlock::Create(ctx, "state_" + Twine(stateId) + "." + stateFunc->getName(), dispatchFunc, badBlock);
+    builder.SetInsertPoint(caseBlock);
+    builder.CreateRet(builder.CreateCall(callee, { runtimeDataArg }, "nextStateId"));
+
+    stateSwitch->addCase(makeInt32(stateId, ctx), caseBlock);
+  }
+
+  return dispatchFunc;
+}
+
+// Returns the call instructions that use the function named 'funcName' and
+// whose enclosing function (after cleanName()) is a key of m_shaderMap.
+// Returns an empty vector if the module has no such function.
+std::vector<CallInst*> DxrFallbackCompiler::getCallsInShadersToFunction(const std::string& funcName)
+{
+  std::vector<CallInst*> result;
+
+  if (Function* callee = m_module->getFunction(funcName))
+  {
+    for (User* user : callee->users())
+    {
+      if (CallInst* callInst = dyn_cast<CallInst>(user))
+      {
+        Function* enclosing = callInst->getParent()->getParent();
+        if (m_shaderMap.find(cleanName(enclosing->getName())) != m_shaderMap.end())
+          result.push_back(callInst);
+      }
+    }
+  }
+
+  return result;
+}
+
+// Same as getCallsInShadersToFunction(), but considers every function whose
+// name starts with 'funcNamePrefix'.
+std::vector<CallInst*> DxrFallbackCompiler::getCallsInShadersToFunctionWithPrefix(const std::string& funcNamePrefix)
+{
+  std::vector<CallInst*> result;
+  for (Function* callee : getFunctionsWithPrefix(m_module, funcNamePrefix))
+  {
+    for (User* user : callee->users())
+    {
+      CallInst* callInst = dyn_cast<CallInst>(user);
+      if (callInst == nullptr)
+        continue;
+
+      // Only keep calls made from one of the known shaders.
+      Function* enclosing = callInst->getParent()->getParent();
+      if (m_shaderMap.count(cleanName(enclosing->getName())) != 0)
+        result.push_back(callInst);
+    }
+  }
+  return result;
+}
+
+// Replaces the alloca whose name starts with "theStack" in the entry block of
+// F by an i32 array alloca holding sizeInBytes / sizeof(int) elements. Does
+// nothing if no such alloca exists. Every user of the old stack is expected to
+// be a GEP; each one is recreated on top of the new alloca.
+void DxrFallbackCompiler::resizeStack(Function* F, unsigned sizeInBytes)
+{
+  // Locate the existing stack
+  AllocaInst* oldStack = nullptr;
+  for (auto& inst : F->getEntryBlock().getInstList())
+  {
+    AllocaInst* candidate = dyn_cast<AllocaInst>(&inst);
+    if (!candidate || !candidate->getName().startswith("theStack"))
+      continue;
+    oldStack = candidate;
+    break;
+  }
+  if (oldStack == nullptr)
+    return;
+
+  // Allocate the replacement right in front of the old one and steal its name
+  LLVMContext& ctx = F->getContext();
+  ArrayType* resizedTy = ArrayType::get(Type::getInt32Ty(ctx), sizeInBytes / sizeof(int));
+  AllocaInst* resizedStack = new AllocaInst(resizedTy, "", oldStack);
+  resizedStack->takeName(oldStack);
+
+  // replaceAllUsesWith() cannot be used because the type changes, so every
+  // GEP is rebuilt. The iterator is advanced before the current GEP is erased.
+  auto userIt = oldStack->user_begin();
+  const auto userEnd = oldStack->user_end();
+  while (userIt != userEnd)
+  {
+    GetElementPtrInst* oldGep = dyn_cast<GetElementPtrInst>(*userIt++);
+    assert(oldGep && "theStack has non-gep user.");
+
+    std::vector<Value*> indices(oldGep->idx_begin(), oldGep->idx_end());
+    GetElementPtrInst* rebuiltGep = GetElementPtrInst::CreateInBounds(resizedStack, indices, "", oldGep);
+    rebuiltGep->takeName(oldGep);
+    oldGep->replaceAllUsesWith(rebuiltGep);
+    oldGep->eraseFromParent();
+  }
+
+  oldStack->eraseFromParent();
+}

+ 148 - 0
lib/DxrFallback/FunctionBuilder.h

@@ -0,0 +1,148 @@
+#pragma once
+
+#include "llvm/IR/Module.h"
+
+#include <string>
+#include <vector>
+
+//==============================================================================
+// Simplifies the creation of functions.
+//
+// To create a function 'void foo( userType, i32, float* )' use the following
+// code:
+//   FunctionBuilder(module, "foo").voidTy().type(userType).i32().floatPtr().build()
+//
+// The first type specified is the return type; every following type adds one
+// parameter. A return type must be specified before calling build().
+//
+// Each type is recorded together with an (optionally empty) argument name.
+// The name recorded for the return type is ignored.
+class FunctionBuilder
+{
+public:
+  FunctionBuilder(llvm::Module* module, const std::string& name)
+    : m_context(module->getContext())
+    , m_module(module)
+    , m_name(name)
+  {}
+
+  FunctionBuilder& voidTy() { return type(llvm::Type::getVoidTy(m_context)); }
+
+  FunctionBuilder& floatTy(const std::string& argName = "") { return type(llvm::Type::getFloatTy(m_context), argName); }
+  FunctionBuilder& floatPtr(const std::string& argName = "") { return type(llvm::Type::getFloatPtrTy(m_context), argName); }
+  FunctionBuilder& doubleTy(const std::string& argName = "") { return type(llvm::Type::getDoubleTy(m_context), argName); }
+  FunctionBuilder& doublePtr(const std::string& argName = "") { return type(llvm::Type::getDoublePtrTy(m_context), argName); }
+
+  FunctionBuilder& i32(const std::string& argName = "") { return type(llvm::Type::getInt32Ty(m_context), argName); }
+  FunctionBuilder& i32Ptr(const std::string& argName = "") { return type(llvm::Type::getInt32PtrTy(m_context), argName); }
+  FunctionBuilder& i16(const std::string& argName = "") { return type(llvm::Type::getInt16Ty(m_context), argName); }
+  FunctionBuilder& i16Ptr(const std::string& argName = "") { return type(llvm::Type::getInt16PtrTy(m_context), argName); }
+  FunctionBuilder& i8(const std::string& argName = "") { return type(llvm::Type::getInt8Ty(m_context), argName); }
+  FunctionBuilder& i8Ptr(const std::string& argName = "") { return type(llvm::Type::getInt8PtrTy(m_context), argName); }
+  FunctionBuilder& i1(const std::string& argName = "") { return type(llvm::Type::getInt1Ty(m_context), argName); }
+  FunctionBuilder& i1Ptr(const std::string& argName = "") { return type(llvm::Type::getInt1PtrTy(m_context), argName); }
+
+  // Appends an arbitrary type with an optional argument name.
+  FunctionBuilder& type(llvm::Type* ty, const std::string& argName = "")
+  {
+    m_argNames.push_back(argName);
+    m_types.push_back(ty);
+    return *this;
+  }
+
+  // Appends several types at once. 'argNames' may be empty (all arguments
+  // stay unnamed) or provide one name per type; missing names are treated as
+  // empty and surplus names are ignored.
+  FunctionBuilder& types(const std::vector<llvm::Type*>& ty, const std::vector<std::string>& argNames)
+  {
+    // m_argNames must stay in lockstep with m_types: build() consumes exactly
+    // one name per parameter. Previously non-empty 'argNames' were dropped,
+    // which made build() read past the end of m_argNames.
+    for (size_t i = 0; i < ty.size(); ++i)
+      m_argNames.push_back(i < argNames.size() ? argNames[i] : std::string());
+    m_types.insert(m_types.end(), ty.begin(), ty.end());
+    return *this;
+  }
+
+  // Gets or inserts the function in the module and applies the recorded
+  // argument names to its parameters.
+  llvm::Function* build()
+  {
+    using namespace llvm;
+
+    Type*        retTy = m_types[0];
+    AttributeSet attributes;
+    Type**       argsBegin = (&m_types[0]) + 1;
+    Type**       argsEnd = argsBegin + m_types.size() - 1;
+    Constant*    funcC =
+      m_module->getOrInsertFunction(m_name, FunctionType::get(retTy, ArrayRef<Type*>(argsBegin, argsEnd), false), attributes);
+    Function* func = cast<Function>(funcC);
+
+    // Skip the name slot that belongs to the return type.
+    std::string* argNamePtr = m_argNames.data() + 1;
+    for (auto& arg : func->args())
+      arg.setName(*argNamePtr++);
+
+    return func;
+  }
+
+private:
+  llvm::LLVMContext&       m_context;
+  llvm::Module*            m_module = nullptr;
+  std::string              m_name;
+  std::vector<std::string> m_argNames;  // parallel to m_types
+  std::vector<llvm::Type*> m_types;     // [0] is the return type
+
+  // forbidden
+  FunctionBuilder() = delete;
+  FunctionBuilder(const FunctionBuilder&) = delete;
+};

+ 16 - 0
lib/DxrFallback/LLVMBuild.txt

@@ -0,0 +1,16 @@
+; Copyright (C) Microsoft Corporation. All rights reserved.
+; This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DxrFallback
+parent = Libraries
+required_libraries = Core Support

+ 122 - 0
lib/DxrFallback/LLVMUtils.cpp

@@ -0,0 +1,122 @@
+#include "llvm/Analysis/CFGPrinter.h"  // needed for DOTGraphTraits<const Function*>
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/GraphWriter.h"
+
+
+using namespace llvm;
+
+// Collects the call instructions among the users of 'callee'. When 'caller'
+// is non-null only calls located inside that function are returned. A null
+// 'callee' yields an empty result.
+std::vector<CallInst*> getCallsToFunction(Function* callee, const Function* caller)
+{
+  std::vector<CallInst*> result;
+  if (!callee)
+    return result;
+
+  for (User* user : callee->users())
+  {
+    // Uses that are not calls are of no interest
+    if (CallInst* callInst = dyn_cast<CallInst>(user))
+    {
+      assert(callInst->getCalledFunction() == callee);
+
+      const bool anyCaller = (caller == nullptr);
+      if (anyCaller || callInst->getParent()->getParent() == caller)
+        result.push_back(callInst);
+    }
+  }
+  return result;
+}
+
+// Returns the i32 constant with value 'val' in the given context.
+ConstantInt* makeInt32(int val, LLVMContext& context)
+{
+  IntegerType* int32Ty = Type::getInt32Ty(context);
+  return ConstantInt::get(int32Ty, val);
+}
+
+// Returns the position that follows 'inst' in its basic block.
+Instruction* getInstructionAfter(Instruction* inst)
+{
+  BasicBlock::iterator next(inst);
+  ++next;
+  return next;
+}
+
+// Parses the IR file 'filename' into a module. On failure the diagnostic is
+// printed to errs() and the process exits with status 1.
+std::unique_ptr<Module> loadModuleFromAsmFile(LLVMContext& context, const std::string& filename)
+{
+  SMDiagnostic diagnostic;
+  std::unique_ptr<Module> parsed = parseIRFile(filename, diagnostic, context);
+  if (parsed)
+    return parsed;
+
+  diagnostic.print(filename.c_str(), errs());
+  exit(1);
+}
+
+// Parses IR held in 'str' into a module. The parse diagnostic is discarded;
+// the result of parseIR() is returned as is.
+std::unique_ptr<Module> loadModuleFromAsmString(LLVMContext& context, const std::string& str)
+{
+  SMDiagnostic diagnostic;
+  MemoryBufferRef source(str, "id");
+  return parseIR(source, diagnostic, context);
+}
+
+// Writes the textual IR of 'module' to 'filename'. If the file cannot be
+// opened or written, an error is printed to errs() and the process exits with
+// status 1.
+void saveModuleToAsmFile(const llvm::Module* module, const std::string& filename)
+{
+  std::error_code EC;
+  raw_fd_ostream out(filename, EC, sys::fs::F_Text);
+  // A failed open is reported through EC, and a stream that failed to open
+  // must not be written to or closed.
+  if (EC)
+  {
+    errs() << "Error saving to " << filename << "\n";
+    exit(1);
+  }
+
+  module->print(out, 0);
+  out.close();
+
+  // Write failures are reported by the stream itself.
+  if (out.has_error())
+  {
+    errs() << "Error saving to " << filename << "\n";
+    exit(1);
+  }
+}
+
+
+// Writes the control flow graph of F to "cfg.<function name>.<suffix>.dot".
+// If the file cannot be opened or written, an error is printed to errs() and
+// the process exits with status 1.
+void dumpCFG(const Function* F, const std::string& suffix)
+{
+  std::string filename = ("cfg." + F->getName() + "." + suffix + ".dot").str();
+
+  std::error_code EC;
+  raw_fd_ostream out(filename, EC, sys::fs::F_Text);
+  // A failed open is reported through EC, and a stream that failed to open
+  // must not be written to or closed.
+  if (EC)
+  {
+    errs() << "Error saving to " << filename << "\n";
+    exit(1);
+  }
+
+  errs() << "Writing '" << filename << "'...\n";
+  WriteGraph(out, F, true, F->getName());
+  out.close();
+
+  // Write failures are reported by the stream itself.
+  if (out.has_error())
+  {
+    errs() << "Error saving to " << filename << "\n";
+    exit(1);
+  }
+}
+
+// Returns the function cached for 'funcType' in 'typeToFuncMap'. If there is
+// none yet, a function named '<name><current map size>' is obtained from the
+// module, cached under 'funcType' and returned.
+Function* getOrCreateFunction(const std::string& name, Module* module, FunctionType* funcType, std::map<FunctionType*, Function*>& typeToFuncMap)
+{
+  auto cached = typeToFuncMap.find(funcType);
+  if (cached == typeToFuncMap.end())
+  {
+    // The numerical suffix keeps the names of the per-type variants distinct
+    const std::string suffixedName = name + std::to_string(typeToFuncMap.size());
+    Function* created = dyn_cast<Function>(module->getOrInsertFunction(suffixedName, funcType));
+    typeToFuncMap[funcType] = created;
+    return created;
+  }
+
+  return cached->second;
+}
+
+// Runs the given passes, in order, on the single function F using a legacy
+// function pass manager created for F's module.
+void runPasses(llvm::Function* F, const std::vector<llvm::Pass*>& passes)
+{
+  legacy::FunctionPassManager passManager(F->getParent());
+  for (size_t i = 0; i < passes.size(); ++i)
+    passManager.add(passes[i]);
+
+  passManager.doInitialization();
+  passManager.run(*F);
+  passManager.doFinalization();
+}

+ 34 - 0
lib/DxrFallback/LLVMUtils.h

@@ -0,0 +1,34 @@
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+  class CallInst;
+  class ConstantInt;
+  class Function;
+  class FunctionType;
+  class Instruction;
+  class LLVMContext;
+  class Module;
+  class Pass;
+}
+
+// Returns the call instructions among the users of 'callee'. If 'caller' is
+// non-null, only calls inside that function are returned. A null 'callee'
+// yields an empty vector.
+std::vector<llvm::CallInst*> getCallsToFunction(llvm::Function* callee, const llvm::Function* caller = nullptr);
+
+// Returns the function cached for 'funcType' in 'typeToFuncMap', or obtains
+// one from the module under 'name' plus a numerical suffix and caches it.
+llvm::Function* getOrCreateFunction(const std::string& name, llvm::Module* module, llvm::FunctionType* funcType, std::map<llvm::FunctionType*, llvm::Function*>& typeToFuncMap);
+
+// Returns the i32 constant with value 'val'.
+llvm::ConstantInt* makeInt32(int val, llvm::LLVMContext& context);
+
+// Returns the position following 'inst' in its basic block.
+llvm::Instruction* getInstructionAfter(llvm::Instruction* inst);
+
+// Module (de)serialization helpers. The file variants print an error and exit
+// the process with status 1 on failure.
+std::unique_ptr<llvm::Module> loadModuleFromAsmFile(llvm::LLVMContext& context, const std::string& filename);
+std::unique_ptr<llvm::Module> loadModuleFromAsmString(llvm::LLVMContext& context, const std::string& str);
+void saveModuleToAsmFile(const llvm::Module* module, const std::string& filename);
+
+// Writes the CFG of F to "cfg.<function name>.<suffix>.dot".
+void dumpCFG(const llvm::Function* F, const std::string& suffix);
+
+// Runs the given passes on a single function.
+void runPasses(llvm::Function*, const std::vector<llvm::Pass*>& passes);

+ 337 - 0
lib/DxrFallback/LiveValues.cpp

@@ -0,0 +1,337 @@
+#include "LiveValues.h"
+
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+// Replaces every instruction of 'iset' that appears as a key in 'imap' by the
+// instruction it is mapped to.
+static void applyMapping(InstructionSetVector& iset, llvm::DenseMap<llvm::Instruction *, llvm::Instruction *>& imap)
+{
+  // The imap will probably have few entries, so they are applied to the iset
+  // one at a time.
+  for (auto& mapping : imap)
+  {
+    if (iset.count(mapping.first) == 0)
+      continue;
+
+    iset.remove(mapping.first);
+    iset.insert(mapping.second);
+  }
+}
+
+// Computes the liveness of values at a given set of instructions. Roughly
+// based on Algorithms 6 & 7 from the paper "Computing Liveness Sets for SSA-
+// Form Programs" by Brander et al., 2011.
+//
+// The constructor only records where liveness is requested; the analysis
+// itself is performed by run().
+LiveValues::LiveValues(ArrayRef<Instruction*> computeLiveAt)
+{
+  const unsigned int numLocations = computeLiveAt.size();
+  m_liveSets.resize(numLocations);
+
+  // Index the requested locations and remember the blocks that contain them
+  unsigned int index = 0;
+  for (Instruction* location : computeLiveAt)
+  {
+    m_computeLiveAtIndex.insert(std::make_pair(location, index));
+    m_activeBlocks.insert(location->getParent());
+    ++index;
+  }
+
+  // The analyzed function is the one containing the first location
+  if (numLocations > 0)
+  {
+      m_function = computeLiveAt[0]->getParent()->getParent();
+  }
+}
+
+// Walks the instructions in [begin, end) and records 'value' as live at every
+// instruction in that range for which liveness was requested.
+void LiveValues::markLiveRange(Instruction* value, BasicBlock::iterator begin, BasicBlock::iterator end)
+{
+  BasicBlock* block = begin->getParent();
+
+  // Blocks without a requested location have nothing to mark
+  if (m_activeBlocks.count(block) == 0)
+    return;
+
+  for (BasicBlock::iterator it = begin; it != end; ++it)
+  {
+    if (m_computeLiveAtIndex.count(it) == 0)
+      continue;
+
+    const unsigned int locationIndex = m_computeLiveAtIndex[it];
+    m_liveSets[locationIndex].insert(value);
+    m_allLiveSet.insert(value);
+    // Keep the reverse mapping as well: for each value, where it is live.
+    m_liveAtIndices[value].insert(locationIndex);
+  }
+}
+
+// Marks 'def' as live on every path that leads backwards from one of its uses
+// to its definition.
+//
+//   def     - the instruction defining the value
+//   use     - one use of 'def'
+//   scanned - blocks already marked from top to bottom for 'def'; they are
+//             skipped here and updated as new blocks get fully marked
+void LiveValues::upAndMark(Instruction* def, Use& use, BlockSet& scanned)
+{
+  // Determine the starting point for the backwards search.
+  // (Remember that Use represents an edge between the definition of a value and its use)
+  // If the user of the use is a phi node, we start the search from the terminator
+  // of the preceding block.
+  // This avoids going through loop back-edges in cases like these:
+  //                 |
+  //                 | (y)
+  //                 v
+  //          -----------------
+  //     (x)  | z = phi(x, y) |
+  //    ----> | ...           |
+  //    |     | x = z + 1     |
+  //    |     -----------------
+  //    |             |
+  //    |             |
+  //    |             |
+  //    |             v
+  //    |     -----------------
+  //    |     |               |
+  //    ------| INDIRECT CALL |
+  //          |               |
+  //          -----------------
+  //                  | (Start the search for the definition of x (backwards) from here!)
+  //                  v
+  //
+  // Notice that here x is live across the call. This case is tricky because the def comes 'after'
+  // the use. The def still dominates the use because phi nodes logically use their input values on the
+  // edges, i.e. on the terminator of the preceding blocks.
+  //
+  // This has the advantage of being able to traverse edges strictly backwards.
+
+  Instruction* startingPoint = dyn_cast<Instruction>(use.getUser());
+  if (PHINode* usePHI = dyn_cast<PHINode>(startingPoint))
+  {
+    BasicBlock* predecessor = usePHI->getIncomingBlock(use);
+    startingPoint = predecessor->getTerminator();
+  }
+
+  BasicBlock* startingPointBB = startingPoint->getParent();
+  BasicBlock* defBB = def->getParent();
+
+  // Start a bottom-up search from startingPoint to the definition of the current value, driven
+  // by an explicit worklist.
+  // Mark all the code ranges that we encounter on the way as having the current value 'live'.
+  // 'scanned' contains the blocks that we have scanned to the bottom of the block and that we
+  // already know to have the current value 'live'.
+
+  SmallVector<BasicBlock*, 16> worklist;
+  worklist.push_back(startingPointBB);
+
+  BlockSet visited;
+
+  while (!worklist.empty())
+  {
+    BasicBlock* B = worklist.pop_back_val();
+
+    if (scanned.count(B) != 0)
+      continue;
+
+    // We have reached the block that contains the definition of the value. We are done for this
+    // branch of the search.
+    if (B == defBB)
+    {
+      if (defBB == startingPointBB)
+      {
+        // If the first block that we visit is also the last, mark only the range of instructions
+        // between the def and the starting point.
+        //    -----------------
+        //    |               |
+        //    | x = // def    |  <--
+        //    |               |    !
+        //    |               |    ! This is the range in which x is live.
+        //    |               |    !
+        //    | = x // use    |  <--
+        //    |               |
+        //    -----------------
+
+        markLiveRange(def, ++BasicBlock::iterator(def), BasicBlock::iterator(startingPoint));
+      }
+      else
+      {
+        // The def block was reached from a later block: the value is live from just after the
+        // def to the end of the block.
+        markLiveRange(def, ++BasicBlock::iterator(def), defBB->end());
+        scanned.insert(B);
+      }
+    }
+    else
+    {
+      if (B == startingPointBB)
+      {
+        // We are in the starting-point block.
+        // This can mean two things:
+        // 1. We are in the first iteration, mark the range between begin and starting point as
+        // live.
+        if (visited.count(B) == 0)
+        {
+          markLiveRange(def, B->begin(), BasicBlock::iterator(startingPoint));
+        }
+        // 2. We came back here because the starting point is in a loop.
+        // In this case mark the whole block as live range and don't come back anymore.
+        else
+        {
+          markLiveRange(def, B->begin(), B->end());
+          scanned.insert(B);
+        }
+
+        // The if statement above handles situations like this:
+        //         BB0
+        //        -----------------
+        //        | x = ...       |
+        //        -----------------
+        //                |
+        //                |
+        //                |
+        //         BB1    v
+        //        -----------------<--                     <--
+        //        |               |  !                       !
+        //  ----->|               |  ! First range marked    !
+        //  |     |               |  !                       !
+        //  |     | ... = x       |<--                       ! Second and final range marked
+        //  |     |               |                          !
+        //  |     | INDIRECT CALL |                          !
+        //  |     |               |                          !
+        //  |     -----------------                        <--
+        //  |              |
+        //  ---------------
+        // x is defined outside a loop and used inside a loop. This means that it is live inside the
+        // whole loop.
+        // So, we first mark the range from the use of x to the top of BB1 and, when we visit BB1
+        // again (because BB1 is a predecessor of BB1) we mark the whole block as live range.
+        // <rant>
+        // This case could have been handled much more easily and efficiently if we had access to
+        // the LLVM LoopInfo analysis pass.
+        // We could have done the following: x is used in a loop and defined outside of it => mark
+        // the whole loop body as live range.
+        // </rant>
+      }
+      else
+      {
+        // We are in an intermediate block on the way to the definition: mark all of it as live range.
+        markLiveRange(def, B->begin(), B->end());
+        scanned.insert(B);
+      }
+
+      visited.insert(B);
+
+      // Continue the search in all predecessors of this block.
+      for (pred_iterator P = pred_begin(B), PE = pred_end(B); P != PE; ++P)
+      {
+        worklist.push_back(*P);
+      }
+    }
+  }
+}
+
+// Runs the liveness computation: for every instruction in m_function, each of
+// its uses is handed to upAndMark(). Returns immediately when
+// m_computeLiveAtIndex is empty.
+void LiveValues::run()
+{
+  if (m_computeLiveAtIndex.empty())
+    return;
+
+  for (Instruction& candidate : inst_range(m_function))
+  {
+    Instruction* def = &candidate;
+    assert(def->getParent()->getParent() == m_function);
+
+    // One set of already-scanned blocks per definition, shared by all its uses.
+    BlockSet scanned;
+    for (Use& use : def->uses())
+    {
+      Instruction* user = cast<Instruction>(use.getUser());
+      assert(user->getParent()->getParent() == m_function);
+      (void)user; // only referenced by the assert above
+
+      upAndMark(def, use, scanned);
+    }
+  }
+}
+
+// Applies the instruction mapping imap to the set of all live values and to
+// each per-index live set.
+// NOTE(review): m_liveAtIndices is not touched here - confirm that is intended.
+void LiveValues::remapLiveValues(llvm::DenseMap<llvm::Instruction*, llvm::Instruction*>& imap)
+{
+  applyMapping(m_allLiveSet, imap);
+
+  for (InstructionSetVector& perIndexSet : m_liveSets)
+  {
+    applyMapping(perIndexSet, imap);
+  }
+}
+
+// Returns the set of indices recorded for value in m_liveAtIndices, or nullptr
+// when the value has no entry.
+const LiveValues::Indices* LiveValues::getIndicesWhereLive(const Value* value) const
+{
+  auto found = m_liveAtIndices.find(value);
+  return (found != m_liveAtIndices.end()) ? &found->second : nullptr;
+}
+
+// Marks value as live at every index contained in indices.
+// A null indices pointer (e.g. the result of getIndicesWhereLive() for an
+// unknown value) is treated as "no indices".
+void LiveValues::setIndicesWhereLive(Value* value, const Indices* indices)
+{
+  if (!indices)
+    return;
+
+  // Iterate over a copy: indices may point into m_liveAtIndices, and
+  // setLiveAtIndex() can insert a new key into that DenseMap, which would
+  // invalidate the set we are iterating over.
+  const Indices snapshot = *indices;
+  for (unsigned int idx : snapshot)
+    setLiveAtIndex(value, idx, true);
+}
+
+// Returns true when the two values share no live index. A value without any
+// recorded indices is considered disjoint from everything.
+bool LiveValues::liveInDisjointRegions(const Value* valueA, const Value* valueB) const
+{
+  const Indices* liveA = getIndicesWhereLive(valueA);
+  if (liveA == nullptr)
+    return true;
+
+  const Indices* liveB = getIndicesWhereLive(valueB);
+  if (liveB == nullptr)
+    return true;
+
+  bool sharesIndex = false;
+  for (auto it = liveA->begin(), end = liveA->end(); it != end && !sharesIndex; ++it)
+    sharesIndex = liveB->count(*it) != 0;
+
+  return !sharesIndex;
+}
+
+// Manually marks (live == true) or unmarks (live == false) value as live at
+// the given index, keeping m_liveAtIndices, m_liveSets and m_allLiveSet in
+// sync. value must be an Instruction.
+void LiveValues::setLiveAtIndex(Value* value, unsigned int index, bool live)
+{
+  // Valid indices are [0, size): index is used to subscript m_liveSets below.
+  assert(index < m_computeLiveAtIndex.size());
+  Instruction* inst = cast<Instruction>(value);
+  if (live)
+  {
+    m_liveAtIndices[value].insert(index);
+    m_liveSets[index].insert(inst);
+    m_allLiveSet.insert(inst);
+  }
+  else
+  {
+    m_liveAtIndices[value].remove(index);
+    m_liveSets[index].remove(inst);
+    // Only drop the value from the global set once it is live nowhere.
+    if (m_liveAtIndices[value].empty())
+      m_allLiveSet.remove(inst);
+  }
+}
+
+// Marks or unmarks value as live at every index in
+// [0, m_computeLiveAtIndex.size()). value must be an Instruction.
+void LiveValues::setLiveAtAllIndices(llvm::Value* value, bool live)
+{
+  Instruction* inst = cast<Instruction>(value);
+  const unsigned int numIndices = m_computeLiveAtIndex.size();
+
+  if (!live)
+  {
+    for (unsigned int index = 0; index != numIndices; ++index)
+    {
+      m_liveAtIndices[value].remove(index);
+      m_liveSets[index].remove(inst);
+    }
+    // Drop the value from the global set once no index lists it.
+    if (m_liveAtIndices[value].empty())
+      m_allLiveSet.remove(inst);
+    return;
+  }
+
+  for (unsigned int index = 0; index != numIndices; ++index)
+  {
+    m_liveAtIndices[value].insert(index);
+    m_liveSets[index].insert(inst);
+  }
+  m_allLiveSet.insert(inst);
+}
+
+// Returns true if value is recorded as live at the given index.
+bool LiveValues::getLiveAtIndex(const Value* value, unsigned int index) const
+{
+  // Valid indices are [0, size), matching the range setLiveAtAllIndices() walks.
+  assert(index < m_computeLiveAtIndex.size());
+  auto it = m_liveAtIndices.find(value);
+  if (it == m_liveAtIndices.end())
+    return false;
+  return (it->second.count(index) != 0);
+}

+ 81 - 0
lib/DxrFallback/LiveValues.h

@@ -0,0 +1,81 @@
+#pragma once
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/BasicBlock.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace llvm
+{
+  class AllocaInst;
+  class BasicBlock;
+  class Function;
+  class Instruction;
+  class Use;
+  class Value;
+}
+
+// Ordered set of basic blocks.
+typedef std::set<llvm::BasicBlock*> BasicBlockSet;
+// Set of instructions that also preserves insertion order.
+typedef llvm::SetVector<llvm::Instruction*> InstructionSetVector;
+
+// Compute live values at specified instructions.
+// Each instruction passed to the constructor is identified by an unsigned
+// index; the per-index results are queried with that index.
+class LiveValues
+{
+public:
+  LiveValues(llvm::ArrayRef<llvm::Instruction*> computeLiveAt);
+
+  // Compute live values at specified instructions (computeLiveAt)
+  void run();
+
+  // Returns all values that are live at the index.
+  // No bounds check is performed on index.
+  const InstructionSetVector& getLiveValues(unsigned int index) const { return m_liveSets[index]; }
+
+  // Returns all live values, excluding allocas.
+  const InstructionSetVector& getAllLiveValues() const { return m_allLiveSet; }
+
+  // Update the live sets using the map
+  void remapLiveValues(llvm::DenseMap<llvm::Instruction*, llvm::Instruction*>& imap);
+
+  typedef llvm::SetVector<unsigned int> Indices;
+
+  // Return all indices at which the given value is live.
+  // Returns nullptr if nothing is recorded for the value.
+  const Indices* getIndicesWhereLive(const llvm::Value* value) const;
+
+
+  // For the two given values, check if they are both live at any of the
+  // marker instructions. This does not perform a true "lifetime overlap"
+  // test, it considers values to be disjoint if they have disjoint sets of
+  // markers.
+  // For example, value A is live at call sites 0, 1, 2, value B is live at
+  // 3, 4, where A is used for the last time between 2 and 3 and B is defined
+  // before that use. A and B will be considered "disjoint" in the sense of
+  // this method, even though the lifetimes of their values overlap.
+  bool liveInDisjointRegions(const llvm::Value* valueA, const llvm::Value* valueB) const;
+
+  // Return true if the given value is live at the given index.
+  bool getLiveAtIndex(const llvm::Value* value, unsigned int index) const;
+
+  // Update the analysis manually. Use only if you know exactly what you are
+  // doing and document the reason thoroughly.
+  // The implementations cast value to llvm::Instruction.
+  void setLiveAtIndex(llvm::Value* value, unsigned int index, bool live);
+  void setLiveAtAllIndices(llvm::Value* value, bool live);
+  void setIndicesWhereLive(llvm::Value* value, const Indices* indices);
+
+
+private:
+  // Function being analyzed.
+  llvm::Function*                   m_function = nullptr;
+  // Live instructions per index (see getLiveValues).
+  std::vector<InstructionSetVector> m_liveSets;
+  // Union of everything that is live at some index.
+  InstructionSetVector              m_allLiveSet;
+  // NOTE(review): purpose not visible from this header - confirm in LiveValues.cpp.
+  llvm::SmallSet<llvm::BasicBlock*, 8>             m_activeBlocks;
+  // Presumably maps each computeLiveAt instruction to its index - TODO confirm.
+  llvm::DenseMap<llvm::Instruction*, unsigned int> m_computeLiveAtIndex;
+  // For each value, the indices at which it is live.
+  llvm::DenseMap<const llvm::Value*, Indices>      m_liveAtIndices;
+
+  typedef llvm::SmallSet<llvm::BasicBlock*, 8> BlockSet;
+
+  void markLiveRange(llvm::Instruction* value, llvm::BasicBlock::iterator begin, llvm::BasicBlock::iterator end);
+  void upAndMark(llvm::Instruction* v, llvm::Use& use, BlockSet& scanned);
+};

+ 356 - 0
lib/DxrFallback/Reducibility.cpp

@@ -0,0 +1,356 @@
+#include "Reducibility.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include "LLVMUtils.h"
+
+#include <fstream>
+#include <vector>
+#include <map>
+
+#define DBGS errs
+//#define DBGS dbgs
+
+using namespace llvm;
+
+// A node of the graph that makeReducible() reduces: a group of basic blocks
+// together with its incoming and outgoing edges.
+struct Node
+{
+  SetVector<Node*> in;
+  SetVector<Node*> out;
+  SetVector<BasicBlock*> blocks; // block 0 dominates all others in this node
+  size_t numInstructions = 0;
+
+  Node() = default;
+
+  // Creates a node that initially holds only B.
+  Node(BasicBlock* B)
+  {
+    insert(B);
+  }
+
+  // Adds B to this node and adds its instruction count to numInstructions.
+  void insert(BasicBlock* B)
+  {
+    blocks.insert(B);
+    numInstructions += B->size();
+  }
+};
+
+
+// Debug helper: writes the node graph to filename in Graphviz "dot" format.
+// Progress and consistency diagnostics go to DBGS(). Edges whose endpoints are
+// not in nodes are reported and not drawn.
+static void printDotGraph(const std::vector<Node*>& nodes, const std::string& filename)
+{
+  DBGS() << "Writing " << filename << " ...";
+  std::ofstream out(filename);
+  if (!out)
+  {
+    DBGS() << "FAILED\n";
+    return;
+  }
+
+  // Give nodes a numerical index to make the output cleaner
+  std::map<Node*, size_t> idxMap;
+  for (size_t i = 0; i < nodes.size(); ++i)
+    idxMap[nodes[i]] = i;
+
+
+  // Error check - make sure that all the out/in nodes are in the map
+  for (Node* N : nodes)
+  {
+    for (Node* P : N->in)
+    {
+      if (idxMap.find(P) == idxMap.end())
+        DBGS() << "MISSING INPUT NODE\n";
+      if (P->out.count(N) == 0)
+        DBGS() << "MISSING OUTGOING EDGE FROM PREDECESSOR.\n";
+    }
+    for (Node* S : N->out)
+    {
+      if (idxMap.find(S) == idxMap.end())
+        DBGS() << "MISSING OUTPUT NODE\n";
+      if (S->in.count(N) == 0)
+        DBGS() << "MISSING INCOMING EDGE FROM SUCCESSOR.\n";
+    }
+  }
+
+
+  // Print header
+  out << "digraph g {\n";
+  out << "node [\n";
+  out << "  fontsize = \"12\"\n";
+  out << "  labeljust = \"l\"\n";
+  out << "]\n";
+
+  for (size_t i = 0; i < nodes.size(); ++i)
+  {
+    Node* N = nodes[i];
+
+    // node
+    out << "  N" << i << " [shape=record,label=\"";
+    for (BasicBlock* B : N->blocks)
+      out << B->getName().str() << "\\n";
+    out << "\"];\n";
+
+    // out edges
+    for (Node* S : N->out)
+    {
+      // Look the successor up without inserting: operator[] would silently
+      // map an unknown node to index 0 and draw a bogus edge to N0.
+      auto target = idxMap.find(S);
+      if (target == idxMap.end())
+        continue; // already reported as MISSING OUTPUT NODE above
+      out << "  N" << i << " -> N" << target->second << ";\n";
+    }
+
+    // in edges
+    //for( Node* P : N->in )
+    //  out << "  N" << idxMap[P] << " -> N" << i << " [style=dotted];\n";
+  }
+
+  out << "}\n";
+
+  DBGS() << "\n";
+}
+
+// Debug helper: dumps the graph to "red.<function name>_<step>.dot".
+// Takes the vector by const reference to avoid copying it on every dump.
+static void printDotGraph(const std::vector<Node*>& nodes, Function* F, int step)
+{
+  printDotGraph(nodes, ("red." + F->getName() + "_" + std::to_string(step) + ".dot").str());
+}
+
+
+// Node splitting step used by makeReducible().
+// Detaches the last predecessor P of N, creates a new node Np that holds
+// clones of N's basic blocks, and redirects P to Np. Np gets the same
+// successors as N. On the first split the whole function is run through
+// reg2mem so that the cloning below does not have to deal with phis.
+// Returns the newly allocated Np (allocated with new; the caller receives the
+// raw pointer). bbToNode is not used by the active code path.
+static Node* split(Node* N, std::map<BasicBlock*, Node*>& bbToNode, bool firstSplit)
+{
+  // Remove one predecessor P from N
+  assert(N->in.size() > 1);
+  Node* P = N->in.pop_back_val();
+  P->out.remove(N);
+
+  // Point P to the clone of N, Np
+  Node* Np = new Node();
+  P->out.insert(Np);
+  Np->in.insert(P);
+
+  // Copy successors of N to Np
+  for (Node* S : N->out)
+  {
+    Np->out.insert(S);
+    S->in.insert(Np);
+  }
+
+#if 1
+  // Run reg2mem on the whole function so we don't have to deal with phis
+  if (firstSplit)
+  {
+    runPasses(N->blocks[0]->getParent(), {
+      createDemoteRegisterToMemoryPass()
+    });
+  }
+
+
+  // Clone N into Np
+  // Cloned blocks get the name suffix ".c" and are added to the same function.
+  ValueToValueMapTy VMap;
+  for (BasicBlock* B : N->blocks)
+  {
+    BasicBlock* Bp = CloneBasicBlock(B, VMap, ".c", B->getParent());
+    Np->insert(Bp);
+    VMap[B] = Bp;
+  }
+  // Make the cloned instructions refer to cloned values/blocks where a clone exists.
+  for (BasicBlock* B : Np->blocks)
+    for (Instruction& I : *B)
+      RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+  // Remap terminators of P from N to Np
+  for (BasicBlock* B : P->blocks)
+    RemapInstruction(B->getTerminator(), VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+#else
+  // Disabled alternative that keeps phis instead of running reg2mem.
+  // Clone N into Np
+  ValueToValueMapTy VMap;
+  for (BasicBlock* B : N->blocks)
+  {
+    BasicBlock* Bp = CloneBasicBlock(B, VMap, ".c", B->getParent());
+    Np->insert(Bp);
+    VMap[B] = Bp;
+  }
+  for (BasicBlock* B : Np->blocks)
+    for (Instruction& I : *B)
+      RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+
+  // Remove incoming values from phis in Np that don't come from actual predecessors
+  BasicBlock* NpEntry = Np->blocks[0];
+  std::set<BasicBlock*> predSet(pred_begin(NpEntry), pred_end(NpEntry));
+  auto I = NpEntry->begin();
+  while (PHINode* phi = dyn_cast<PHINode>(I++))
+  {
+    if (phi->getNumIncomingValues() == predSet.size())
+      continue;
+    for (unsigned i = 0; i < phi->getNumIncomingValues(); )
+    {
+      BasicBlock* B = phi->getIncomingBlock(i);
+      if (!predSet.count(B))
+      {
+        phi->removeIncomingValue(B);
+        continue;
+      }
+      ++i;
+    }
+  }
+
+
+  // Remove phi references to P in N. (Do this before remapping terminators.)
+  BasicBlock* Nentry = N->blocks[0];
+  for (BasicBlock* PB : predecessors(Nentry))
+  {
+    if (P->blocks.count(PB))
+      Nentry->removePredecessor(PB);
+  }
+
+  // Remap terminators of P from N to Np
+  for (BasicBlock* B : P->blocks)
+    RemapInstruction(B->getTerminator(), VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+
+  // Update phis in successors of Np.
+  // There are several cases for a value Vs reaching S. Vs may be defined in N and
+  // a clone Vsp in Np or only passing through one or the other. Furthermore, Vs may 
+  // either appear in a phi in the entry block of S or not.
+  // 1) Vs defined in N (and clone Vsp in Np) and in phi:
+  //    Add incoming value [Vsp, Bp] for cloned value Vsp from predecessor basic
+  //    block Bp in Np wherever [Vs, B] appears
+  // 2) Vs defined in N (and clone Vsp in Np) and not in phi:
+  //    Add phi [Vs,B],[Vsp,Bp] if Vs reaches a use in or through S
+  // 3) Vs passing through N or Np and in phi
+  //    Change [Vs,B] to [Vs,Bp] in phis in S if Vs reached S through P
+  // 4) Vs passing through N or Np and not in a phi
+  //    Do nothing
+  // 
+  // TODO: Only 1) is implemented below and it isn't checking for definition in N
+  for (Node* S : Np->out)
+  {
+    BasicBlock* Sentry = S->blocks[0];
+    auto I = Sentry->begin();
+    while (PHINode* phi = dyn_cast<PHINode>(I++))
+    {
+      for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i)
+      {
+        BasicBlock* B = phi->getIncomingBlock(i);
+        if (N->blocks.count(B))
+        {
+          Value* V = phi->getIncomingValue(i);
+          Value* Vp = VMap[V];
+          if (!Vp)
+            Vp = V; // Def not in N
+          BasicBlock* Bp = dyn_cast<BasicBlock>(VMap[B]);
+          phi->addIncoming(Vp, Bp);
+        }
+      }
+    }
+  }
+#endif
+
+  return Np;
+}
+
+// Makes the control flow graph of F reducible by node splitting.
+// The CFG is mirrored in a graph of Nodes that is repeatedly reduced (self
+// edges removed, isolated nodes dropped, single-predecessor nodes folded into
+// their predecessor). Whenever no reduction applies and nodes remain, the
+// smallest node with more than one predecessor is duplicated via split().
+// Returns the number of splits
+int makeReducible(Function* F)
+{
+  // Break critical edges now in case we need to run reg2mem in split().
+  // reg2mem would otherwise break critical edges itself, and the CFG needs to
+  // remain unchanged while the node graph below mirrors it.
+  runPasses(F, {
+    createBreakCriticalEdgesPass()
+  });
+
+  // initialize nodes
+  std::vector<Node*> nodes;
+  std::map<BasicBlock*, Node*> bbToNode;
+  for (BasicBlock& B : *F)
+  {
+    nodes.push_back(new Node(&B));
+    bbToNode[&B] = nodes.back();
+  }
+
+  // initialize edges
+  for (Node* N : nodes)
+  {
+    for (BasicBlock* B : successors(N->blocks[0]))
+    {
+      Node* BN = bbToNode[B];
+      N->out.insert(BN);
+      BN->in.insert(N);
+    }
+  }
+
+  int step = 0;
+  bool print = false;
+  if (print) printDotGraph(nodes, F, step++);
+
+  int numSplits = 0;
+  while (!nodes.empty())
+  {
+    bool changed;
+    do
+    {
+      // It might more efficient to use a worklist based implementation instead
+      // of iterating over the vector.
+      changed = false;
+      for (size_t i = 0; i < nodes.size(); )
+      {
+        Node* N = nodes[i];
+
+        // Remove self references
+        if (N->in.count(N))
+        {
+          N->in.remove(N);
+          N->out.remove(N);
+          changed = true;
+        }
+
+        // Remove singletons
+        if (N->in.size() == 0 && N->out.size() == 0)
+        {
+          nodes.erase(nodes.begin() + i);
+          // No edges left, so nothing in the graph refers to N any more.
+          delete N;
+          changed = true;
+          if (print) printDotGraph(nodes, F, step++);
+          continue;
+        }
+
+        // Remove nodes with only one incoming edge
+        if (N->in.size() == 1)
+        {
+          // fold into predecessor
+          Node* P = N->in.back();
+          P->blocks.insert(N->blocks.begin(), N->blocks.end());
+          P->out.remove(N);
+          for (Node* S : N->out)
+          {
+            S->in.remove(N);
+            P->out.insert(S);
+            S->in.insert(P);
+          }
+          P->numInstructions += N->numInstructions;
+          nodes.erase(nodes.begin() + i);
+          // All edges to and from N were rewired above; release it.
+          delete N;
+          changed = true;
+          if (print) printDotGraph(nodes, F, step++);
+          continue;
+        }
+
+        i++;
+      }
+    } while (changed);
+
+    if (!nodes.empty())
+    {
+      // Duplicate the smallest node with more than one incoming edge. Better 
+      // methods exist for picking the node to split, e.g. "Making Graphs Reducible
+      // with Controlled Node Splitting" by Janssen and Corporaal.
+      const size_t noCandidate = ~static_cast<size_t>(0);
+      size_t idxMin = noCandidate;
+      for (size_t i = 0; i < nodes.size(); ++i)
+      {
+        if (nodes[i]->in.size() <= 1)
+          continue;
+
+        if (idxMin == noCandidate || nodes[i]->numInstructions < nodes[idxMin]->numInstructions)
+          idxMin = i;
+      }
+      // The reduction above folds every node with exactly one predecessor, so
+      // a non-empty graph is expected to contain a node with several.
+      assert(idxMin != noCandidate && "no node with more than one predecessor");
+      nodes.push_back(split(nodes[idxMin], bbToNode, numSplits == 0));
+      numSplits++;
+      if (print) printDotGraph(nodes, F, step++);
+    }
+  }
+  return numSplits;
+}

+ 10 - 0
lib/DxrFallback/Reducibility.h

@@ -0,0 +1,10 @@
+#pragma once
+
+namespace llvm
+{
+  class Function;
+}
+
+// Analyzes the reducibility of the control flow graph of F and uses node splitting
+// to make an irreducible CFG reducible. Returns the number of node splits.
+int makeReducible(llvm::Function* F);

+ 1797 - 0
lib/DxrFallback/StateFunctionTransform.cpp

@@ -0,0 +1,1797 @@
+#include "StateFunctionTransform.h"
+
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include "FunctionBuilder.h"
+#include "LiveValues.h"
+#include "LLVMUtils.h"
+#include "Reducibility.h"
+
+#define DBGS dbgs
+//#define DBGS errs
+
+using namespace llvm;
+
+// Mangled function names, each starting with the byte \x1 followed by '?'.
+// NOTE(review): presumably used to recognize calls to the fallback runtime
+// functions - confirm against their uses further down the file.
+static const char* CALL_INDIRECT_NAME = "\x1?Fallback_CallIndirect@@YAXH@Z";
+// Only a prefix: names beginning with this string are meant to match.
+static const char* SET_PENDING_ATTR_PREFIX = "\x1?Fallback_SetPendingAttr@@";
+
+
+// Builds a std::string from a printf-style format string and its arguments.
+// Returns an empty string when the formatted length is not positive.
+inline std::string stringf(const char* fmt, ...)
+{
+  // First pass: measure the formatted length (without the terminating NUL).
+  va_list measureArgs;
+  va_start(measureArgs, fmt);
+#ifdef WIN32
+  const int length = _vscprintf(fmt, measureArgs);
+#else
+  const int length = vsnprintf(0, 0, fmt, measureArgs);
+#endif
+  va_end(measureArgs);
+
+  std::string formatted;
+  if (length <= 0)
+    return formatted;
+
+  // Second pass: format into the correctly sized buffer.
+  formatted.resize(length);
+  va_list writeArgs;
+  va_start(writeArgs, fmt);
+  vsnprintf((char*)formatted.data(), length + 1, fmt, writeArgs);
+  va_end(writeArgs);
+  return formatted;
+}
+
+
+// Strips the mangling decoration from names of the form "\x1?<name>@@...",
+// returning just <name>. Names that do not start with "\x1?" or contain no
+// "@@" are returned unchanged.
+static std::string cleanName(StringRef name)
+{
+  const size_t end = name.startswith("\x1?") ? name.find("@@") : name.npos;
+  if (end == name.npos)
+    return name;
+
+  // Skip the two-character prefix and keep everything up to the first "@@".
+  return name.substr(2, end - 2);
+}
+
+
+// Utility to append the suffix to the name of the value, but returns
+// an empty string if name is empty.  This is to avoid names like ".ptr".
+// If the name ends with '.' and the suffix starts with '.', only one dot is
+// kept. An empty suffix returns the name unchanged.
+static std::string addSuffix(StringRef valueName, StringRef suffix)
+{
+  if (valueName.empty())
+    return std::string();
+
+  // suffix.front() must not be called on an empty StringRef.
+  if (!suffix.empty() && valueName.back() == '.' && suffix.front() == '.') // avoid double dots
+    return (valueName + suffix.substr(1)).str();
+
+  return (valueName + suffix).str();
+}
+
+
+// Cuts name at the last occurrence of suffix, dropping the suffix and
+// everything after it. Returns name unchanged if suffix does not occur.
+static std::string stripSuffix(StringRef name, StringRef suffix)
+{
+  const size_t cut = name.rfind(suffix);
+  const StringRef kept = (cut == name.npos) ? name : name.substr(0, cut);
+  return kept.str();
+}
+
+
+// Cuts name at the first occurrence of suffixStart, dropping it and
+// everything after it. Returns name unchanged if suffixStart does not occur.
+static std::string stripAfter(StringRef name, StringRef suffixStart)
+{
+  const size_t cut = name.find(suffixStart);
+  const StringRef kept = (cut == name.npos) ? name : name.substr(0, cut);
+  return kept.str();
+}
+
+
+// Returns filename with str inserted in front of its last '.'; when filename
+// contains no '.', str is appended instead.
+static std::string insertBeforeExtension(const std::string& filename, const std::string& str)
+{
+  const size_t dot = filename.rfind('.');
+  if (dot == std::string::npos)
+    return filename + str;
+
+  return filename.substr(0, dot) + str + filename.substr(dot);
+}
+
+
+// Inserts "-<functionName>-<id>-<suffix>" before the extension in baseName.
+// The "-<functionName>" part is omitted when functionName is empty; id is
+// printed with at least two digits.
+static std::string createDumpPath(
+  const std::string& baseName,
+  unsigned           id,
+  const std::string& suffix,
+  const std::string& functionName)
+{
+  std::string s;
+  if (!functionName.empty())
+    s = "-" + functionName;
+  // id is unsigned, so format it with %u rather than %d.
+  s += stringf("-%02u-", id) + suffix;
+  return insertBeforeExtension(baseName, s);
+}
+
+
+// Rounds offset up to the alignment required by inst: the explicit alignment
+// of an alloca when it has one, otherwise the preferred alignment of the
+// instruction's type according to DL.
+static uint64_t align(uint64_t offset, Instruction* inst, DataLayout& DL)
+{
+  AllocaInst* allocaInst = dyn_cast<AllocaInst>(inst);
+  unsigned alignment = allocaInst ? allocaInst->getAlignment() : 0;
+
+  // Zero means "not specified": fall back to the data layout.
+  if (alignment == 0)
+    alignment = DL.getPrefTypeAlignment(inst->getType());
+
+  return RoundUpToAlignment(offset, alignment);
+}
+
+
+// Returns ptr viewed as a pointer to targetPtrElemType in ptr's address space.
+// If ptr already has that type it is returned as is; otherwise a bitcast named
+// after ptr is inserted before insertBefore.
+template <class T>  // T can be Value* or Instruction*
+T createCastForStack(T ptr, llvm::Type* targetPtrElemType, llvm::Instruction* insertBefore)
+{
+  const unsigned addrSpace = ptr->getType()->getPointerAddressSpace();
+  llvm::PointerType* wantedType = llvm::PointerType::get(targetPtrElemType, addrSpace);
+
+  if (ptr->getType() != wantedType)
+    return new llvm::BitCastInst(ptr, wantedType, ptr->getName(), insertBefore);
+
+  return ptr;
+}
+
+
+// Converts val to an i32 value. i32 values are returned unchanged, i1 values
+// are zero-extended, and everything else is bitcast. New instructions are
+// inserted before insertBefore and named "<val name>.int".
+static Value* createCastToInt(Value* val, Instruction* insertBefore)
+{
+  LLVMContext& ctx = val->getContext();
+  Type* srcTy = val->getType();
+  Type* i32Ty = Type::getInt32Ty(ctx);
+
+  if (srcTy == i32Ty)
+    return val;
+
+  if (srcTy == Type::getInt1Ty(ctx))
+    return new ZExtInst(val, i32Ty, addSuffix(val->getName(), ".int"), insertBefore);
+
+  return new BitCastInst(val, i32Ty, addSuffix(val->getName(), ".int"), insertBefore);
+}
+
+
+// Converts the i32 value intVal to type ty. For i32 the value is returned
+// unchanged. Otherwise intVal is renamed to "<name>.int" and the new value
+// takes over the original name: an i1 is produced by a signed "> 0" compare,
+// any other type by a bitcast. New instructions go before insertBefore.
+static Value* createCastFromInt(Value* intVal, Type* ty, Instruction* insertBefore)
+{
+  LLVMContext& ctx = intVal->getContext();
+  if (ty == Type::getInt32Ty(ctx))
+    return intVal;
+
+  // Hand the original name over to the converted value.
+  std::string resultName = intVal->getName();
+  intVal->setName(addSuffix(resultName, ".int"));
+
+  const bool wantsBool = (ty == Type::getInt1Ty(ctx));
+  if (!wantsBool)
+    return new BitCastInst(intVal, ty, resultName, insertBefore);
+
+  // Create boolean with compare
+  return new ICmpInst(insertBefore, CmpInst::ICMP_SGT, intVal, makeInt32(0, intVal->getContext()), resultName);
+}
+
+
+// Debugging aid: names every unnamed, non-void instruction of func "v".
+static void dbgNameUnnamedVals(Function* func)
+{
+  for (inst_iterator it = inst_begin(func), end = inst_end(func); it != end; ++it)
+  {
+    Instruction& inst = *it;
+    if (inst.hasName() || inst.getType()->isVoidTy())
+      continue;
+
+    inst.setName("v"); // LLVM will uniquify the name by adding a numeric suffix
+  }
+}
+
+
+// Returns an iterator to the first non-alloca instruction of the entry block,
+// scanning from the top (allocas are assumed to be grouped at the top).
+static BasicBlock::iterator afterEntryBlockAllocas(Function* function)
+{
+  BasicBlock& entry = function->getEntryBlock();
+  BasicBlock::iterator it = entry.begin();
+  for (; isa<AllocaInst>(it); ++it)
+  {
+    // skip leading allocas
+  }
+  return it;
+}
+
+
+// Collects entryBlock and every block reachable from it through terminator
+// successors. Blocks are visited depth-first using an explicit stack, and each
+// block appears once in the result.
+static BasicBlockVector getReachableBlocks(BasicBlock* entryBlock)
+{
+  BasicBlockVector        reachable;
+  ::BasicBlockSet         seen;
+  std::deque<BasicBlock*> pending;
+
+  seen.insert(entryBlock);
+  pending.push_back(entryBlock);
+
+  while (!pending.empty())
+  {
+    BasicBlock* current = pending.back();
+    pending.pop_back();
+    reachable.push_back(current);
+
+    // Queue every successor that has not been seen yet.
+    for (BasicBlock* next : successors(current))
+    {
+      if (seen.insert(next).second)
+        pending.push_back(next);
+    }
+  }
+
+  return reachable;
+}
+
+
+// Creates a new function with the same arguments and attributes as oldFunction
+// The new function has the same return type, linkage and name, and no body.
+// No module is passed to Function::Create. VMap is filled with the mapping
+// from each old argument to the corresponding new argument.
+static Function* cloneFunctionPrototype(const Function* oldFunction, ValueToValueMapTy& VMap)
+{
+  // Rebuild the function type from the old argument types.
+  std::vector<Type*> argTypes;
+  for (auto I = oldFunction->arg_begin(), E = oldFunction->arg_end(); I != E; ++I)
+    argTypes.push_back(I->getType());
+
+  FunctionType* FTy = FunctionType::get(oldFunction->getFunctionType()->getReturnType(), argTypes,
+    oldFunction->getFunctionType()->isVarArg());
+  Function* newFunction = Function::Create(FTy, oldFunction->getLinkage(), oldFunction->getName());
+
+  // Copy argument names and record old -> new argument in VMap.
+  Function::arg_iterator destI = newFunction->arg_begin();
+  for (auto I = oldFunction->arg_begin(), E = oldFunction->arg_end(); I != E; ++I, ++destI)
+  {
+    destI->setName(I->getName());
+    VMap[I] = destI;
+  }
+
+  // Copy parameter attributes. Parameter attribute indices are 1-based, hence
+  // getArgNo() + 1.
+  AttributeSet oldAttrs = oldFunction->getAttributes();
+  for (auto I = oldFunction->arg_begin(), E = oldFunction->arg_end(); I != E; ++I)
+  {
+    if (Argument* Anew = dyn_cast<Argument>(VMap[I]))
+    {
+      AttributeSet attrs = oldAttrs.getParamAttributes(I->getArgNo() + 1);
+      if (attrs.getNumSlots() > 0)
+        Anew->addAttr(attrs);
+    }
+  }
+
+  // Copy return-value and function-level attributes.
+  newFunction->setAttributes(newFunction->getAttributes().addAttributes(newFunction->getContext(), AttributeSet::ReturnIndex,
+    oldAttrs.getRetAttributes()));
+  newFunction->setAttributes(newFunction->getAttributes().addAttributes(newFunction->getContext(), AttributeSet::FunctionIndex,
+    oldAttrs.getFnAttributes()));
+  return newFunction;
+}
+
+
+// Creates a new function by cloning blocks reachable from entryBlock
+// The new function gets the prototype of entryBlock's parent function. The
+// clone of entryBlock is inserted first, making it the new entry block.
+// VMap receives the mapping from old arguments, blocks and instructions to
+// their clones.
+static Function* cloneBlocksReachableFrom(BasicBlock* entryBlock, ValueToValueMapTy& VMap)
+{
+  Function* oldFunction = entryBlock->getParent();
+  Function* newFunction = cloneFunctionPrototype(oldFunction, VMap);
+
+  // Insert a clone of the entry block into the function.
+  BasicBlock* newEntry = CloneBasicBlock(entryBlock, VMap, "", newFunction);
+  VMap[entryBlock] = newEntry;
+
+  // Clone all other blocks.
+  BasicBlockVector blocks = getReachableBlocks(entryBlock);
+  for (auto block : blocks)
+  {
+    if (block == entryBlock)
+      continue;
+    BasicBlock* clonedBlock = CloneBasicBlock(block, VMap, "", newFunction);
+    VMap[block] = clonedBlock;
+  }
+
+  // Remap new instructions to reference blocks and instructions of the new function.
+  // Operands without an entry in VMap are left untouched (RF_IgnoreMissingEntries).
+  for (auto block : blocks)
+  {
+    auto clonedBlock = cast<BasicBlock>(VMap[block]);
+    for (BasicBlock::iterator I = clonedBlock->begin(); I != clonedBlock->end(); ++I)
+    {
+      RemapInstruction(I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+    }
+  }
+
+  // Remove phi operands incoming from blocks that are not present in the new function anymore.
+  for (auto& block : *newFunction)
+  {
+    PHINode* firstPHI = dyn_cast<PHINode>(block.begin());
+    if (firstPHI == nullptr)
+      continue; // phi instructions only at beginning
+
+    // Create set of actual predecessors
+    // Quick exit: if the first phi already has as many incoming values as the
+    // block has distinct predecessors, the block is left alone.
+    BasicBlockSet preds(pred_begin(&block), pred_end(&block));
+    if (preds.size() == firstPHI->getNumIncomingValues())
+      continue;
+
+    // Remove phi incoming blocks not in preds
+    for (auto iter = block.begin(); isa<PHINode>(iter); ++iter)
+    {
+      std::vector<unsigned int> toRemove;
+      PHINode*                  phi = cast<PHINode>(iter);
+      for (unsigned int op = 0, opEnd = phi->getNumIncomingValues(); op != opEnd; ++op)
+      {
+        BasicBlock* pred = phi->getIncomingBlock(op);
+        if (preds.count(pred) == 0)
+        {
+          toRemove.push_back(op);
+        }
+      }
+      // Remove from the highest index down so the remaining indices stay valid.
+      // The 'false' argument keeps a phi that ends up without operands.
+      for (auto I = toRemove.rbegin(), E = toRemove.rend(); I != E; ++I)
+        phi->removeIncomingValue(*I, false);
+    }
+  }
+
+  return newFunction;
+}
+
+
+// oldVal must be a call. Every call in caller to the function called by oldVal
+// is replaced with newVal and erased; the called function itself is erased
+// afterwards if it has no uses left.
+static void replaceValAndRemoveUnusedDummyFunc(Value* oldVal, Value* newVal, Function* caller)
+{
+  CallInst* call = dyn_cast<CallInst>(oldVal);
+  assert(call != nullptr && "Must be a call");
+  Function* dummyFunc = call->getCalledFunction();
+
+  for (CallInst* site : getCallsToFunction(dummyFunc, caller))
+  {
+    site->replaceAllUsesWith(newVal);
+    site->eraseFromParent();
+  }
+
+  const bool stillUsed = (dummyFunc->getNumUses() != 0);
+  if (!stillUsed)
+    dummyFunc->eraseFromParent();
+}
+
+
+// Stores the sign-extended integer value of val in constant and returns true.
+// Returns false, leaving constant untouched, if val is not a ConstantInt or is
+// wider than 32 bits.
+static bool getConstantValue(int& constant, const Value* val)
+{
+  if (const ConstantInt* CI = dyn_cast<ConstantInt>(val))
+  {
+    if (CI->getBitWidth() <= 32)
+    {
+      constant = static_cast<int>(CI->getSExtValue());
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns the sign-extended integer value of val, which must be a ConstantInt
+// of at most 32 bits (checked by assert only).
+static int getConstantValue(const Value* val)
+{
+  const ConstantInt* constantInt = dyn_cast<ConstantInt>(val);
+  assert(constantInt && constantInt->getBitWidth() <= 32);
+  return static_cast<int>(constantInt->getSExtValue());
+}
+
+
+// State threaded through the recursive store() below. Pointer members default
+// to nullptr so a partially filled StoreInfo never holds indeterminate values.
+struct StoreInfo
+{
+  Function* stackIntPtrFunc = nullptr; // called to obtain the pointer for a stack slot
+  Value* runtimeDataArg = nullptr;     // first argument passed to stackIntPtrFunc
+  Value* baseOffset = nullptr;         // second argument passed to stackIntPtrFunc
+  Instruction* insertBefore = nullptr; // new instructions are inserted before this one
+
+  Value* val = nullptr;                // value (or pointer to the value) being stored
+  std::vector<Value*> idxList;         // GEP indices of the element currently visited
+};
+
+// Takes the offset at which to store the next value.
+// Returns the next available offset.
+// Recursively flattens ty: structs and arrays are walked element by element,
+// pointers are followed to their pointee, vectors are split into elements, and
+// every remaining scalar is converted to i32 and written to one stack slot.
+// SI.idxList holds the GEP indices of the element currently being visited.
+static int store(int offset, StoreInfo& SI, Type* ty)
+{
+  if (StructType* STy = dyn_cast<StructType>(ty))
+  {
+    SI.idxList.push_back(nullptr);
+    int elIdx = 0;
+    for (auto& elTy : STy->elements())
+    {
+      SI.idxList.back() = makeInt32(elIdx++, ty->getContext());
+      offset = store(offset, SI, elTy);
+    }
+    SI.idxList.pop_back();
+  }
+  else if (ArrayType* ATy = dyn_cast<ArrayType>(ty))
+  {
+    Type* elTy = ATy->getArrayElementType();
+    SI.idxList.push_back(nullptr);
+    for (int elIdx = 0; elIdx < (int)ATy->getArrayNumElements(); ++elIdx)
+    {
+      SI.idxList.back() = makeInt32(elIdx, ty->getContext());
+      offset = store(offset, SI, elTy);
+    }
+    SI.idxList.pop_back();
+  }
+  else if (PointerType* PTy = dyn_cast<PointerType>(ty))
+  {
+    // Store what the pointer points to: index 0 selects the pointee.
+    SI.idxList.push_back(makeInt32(0, ty->getContext()));
+    offset = store(offset, SI, PTy->getPointerElementType());
+    SI.idxList.pop_back();
+  }
+  else
+  {
+    // Leaf (scalar or vector). With a non-empty index list SI.val is a
+    // pointer, so load the addressed element first.
+    Value* val = SI.val;
+    if (!SI.idxList.empty())
+    {
+      Value* gep = GetElementPtrInst::CreateInBounds(SI.val, SI.idxList, "", SI.insertBefore);
+      val = new LoadInst(gep, "", SI.insertBefore);
+    }
+    if (VectorType* VTy = dyn_cast<VectorType>(ty))
+    {
+      // Recurse per element with an empty index list and the element as the
+      // value. Both SI.idxList and SI.val are restored afterwards, so that
+      // later siblings in an enclosing aggregate still address the original
+      // base value.
+      std::vector<Value*>idxList = std::move(SI.idxList);
+      SI.idxList.clear();
+      Value* savedVal = SI.val;
+      Type* elTy = VTy->getVectorElementType();
+      for (int elIdx = 0; elIdx < (int)VTy->getVectorNumElements(); ++elIdx)
+      {
+        Value* idxVal = makeInt32(elIdx, ty->getContext());
+        Value* el = ExtractElementInst::Create(val, idxVal, "", SI.insertBefore);
+        SI.val = el;
+        offset = store(offset, SI, elTy);
+      }
+      SI.val = savedVal;
+      SI.idxList = std::move(idxList);
+    }
+    else
+    {
+      // One i32 stack slot per scalar.
+      Value* idxVal = makeInt32(offset, val->getContext());
+      Value* intVal = createCastToInt(val, SI.insertBefore);
+      Value* intPtr = CallInst::Create(SI.stackIntPtrFunc, { SI.runtimeDataArg, SI.baseOffset, idxVal }, addSuffix(val->getName(), ".ptr"), SI.insertBefore);
+      new StoreInst(intVal, intPtr, SI.insertBefore);
+      offset += 1;
+    }
+  }
+  return offset;
+}
+
+// Writes val to the stack starting at slot baseOffset + offset, flattening
+// aggregates and vectors into consecutive slots. For pointer values the
+// pointed-to data is stored. Returns the first slot offset after the data
+// written. Instructions are inserted before insertBefore.
+static int store(Value* val, Function* stackIntPtrFunc, Value* runtimeDataArg, Value* baseOffset, int offset, Instruction* insertBefore)
+{
+  StoreInfo info;
+  info.val = val;
+  info.insertBefore = insertBefore;
+  info.baseOffset = baseOffset;
+  info.runtimeDataArg = runtimeDataArg;
+  info.stackIntPtrFunc = stackIntPtrFunc;
+
+  Type* valueType = val->getType();
+  return store(offset, info, valueType);
+}
+
+
+// Reads a value of type ty back from the stack. Scalars occupy the single slot
+// idx; vectors are rebuilt element by element from consecutive slots starting
+// at idx, which must then be a constant. Each slot is loaded as an integer and
+// converted with createCastFromInt(). The result is named name; instructions
+// are inserted before insertBefore.
+static Value* load(llvm::Function* m_stackIntPtrFunc, Value* runtimeDataArg, Value* offset, Value* idx, const std::string& name, Type* ty, Instruction* insertBefore)
+{
+  VectorType* VTy = dyn_cast<VectorType>(ty);
+  if (!VTy)
+  {
+    // Scalar: one slot.
+    Value* slotPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, offset, idx }, addSuffix(name, ".ptr"), insertBefore);
+    Value* slotVal = new LoadInst(slotPtr, name, insertBefore);
+    return createCastFromInt(slotVal, ty, insertBefore);
+  }
+
+  // Vector: one slot per element, inserted into an initially undef vector.
+  LLVMContext& C = ty->getContext();
+  const int firstSlot = getConstantValue(idx);
+  Type* elTy = VTy->getVectorElementType();
+  Value* result = UndefValue::get(VTy);
+  const int numElements = (int)VTy->getVectorNumElements();
+  for (int i = 0; i < numElements; ++i)
+  {
+    const std::string elName = stringf("el%d.", i);
+    Value* slotPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, offset, makeInt32(firstSlot + i, C) }, elName + "ptr", insertBefore);
+    Value* slotVal = new LoadInst(slotPtr, elName, insertBefore);
+    Value* element = createCastFromInt(slotVal, elTy, insertBefore);
+    result = InsertElementInst::Create(result, element, makeInt32(i, C), "tmpvec", insertBefore);
+  }
+  result->setName(name);
+  return result;
+}
+
+// Demotes `inst` to a stack slot with DemoteRegToStack unless it has already
+// been demoted, and records the value <-> alloca association in both maps.
+// Nothing is recorded when DemoteRegToStack returns null.
+static void reg2Mem(DenseMap<Instruction*, AllocaInst*>& valToAlloca, DenseMap<AllocaInst*, Instruction*>& allocaToVal, Instruction* inst)
+{
+  // Already handled on an earlier call.
+  if (valToAlloca.count(inst) != 0)
+    return;
+
+  AllocaInst* slot = DemoteRegToStack(*inst, false);
+  if (!slot)
+    return;
+
+  valToAlloca[inst] = slot;
+  allocaToVal[slot] = inst;
+}
+
+
+// Utility class for rematerializing values at a callsite
+class Rematerializer
+{
+public:
+  // allocaToVal: allocas of reg2mem'd live values mapped back to the value.
+  // liveHere:    values live at the callsite this instance serves.
+  // resources:   values for resources (SRVs, UAVs, etc.); loads from them can
+  //              be rematerialized.
+  // All three are stored by reference, so they must outlive this object.
+  Rematerializer(
+    DenseMap<AllocaInst*, Instruction*>& allocaToVal,
+    const InstructionSetVector& liveHere,
+    const std::set<Value*>& resources
+  )
+    : m_allocaToVal(allocaToVal)
+    , m_liveHere(liveHere)
+    , m_resources(resources)
+  {}
+
+
+  // Returns true if inst can be rematerialized: calls to a known set of helper
+  // functions (matched by name prefix), loads from a resource (directly or
+  // through a GEP), and GEPs.
+  bool canRematerialize(Instruction* inst)
+  {
+    if (CallInst* call = dyn_cast<CallInst>(inst))
+    {
+      // getCalledFunction() is null for indirect calls; those cannot be
+      // identified by name and are therefore not rematerializable.
+      Function* callee = call->getCalledFunction();
+      if (!callee)
+        return false;
+
+      StringRef funcName = callee->getName();
+      if (funcName.startswith("dummyStackFrameSize"))
+        return true;
+      if (funcName.startswith("stack.ptr"))
+        return true;
+      if (funcName.startswith("stack.load"))
+        return true;
+      if (funcName.startswith("dx.op.createHandle"))
+        return true;
+    }
+    else if (LoadInst* loadInst = dyn_cast<LoadInst>(inst))
+    {
+      Value* op = loadInst->getOperand(0);
+      if (GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(op)) // for descriptor tables
+        op = gep->getOperand(0);
+      if (m_resources.count(op))
+        return true;
+    }
+    else if (GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(inst))
+    {
+      assert(gep->hasAllConstantIndices() && "Unhandled non-constant index"); // Should have been changed to stack.ptr
+      return true;
+    }
+
+    return false;
+  }
+
+
+  // Rematerialize the given instruction and its dependency graph, adding
+  // any nonrematerializable values that are live in the function, but not
+  // at this callsite to the work list to ensure that their values are restored.
+  //
+  // workList is taken by reference: entries appended here (including from
+  // recursive calls) must reach the caller's list, otherwise the values they
+  // name would never be restored at this callsite.
+  //
+  // Returns the clone inserted before insertBefore, a previously created clone
+  // for the same instruction, or inst itself when inst is an alloca.
+  Instruction* rematerialize(Instruction* inst, std::vector<Instruction *>& workList, Instruction* insertBefore, int depth = 0)
+  {
+    // Signal if we hit a complex case. Deep rematerialization needs more analysis.
+    // To make this robust we would need to make it possible to run the current
+    // value through the live value handling pipeline: figure out where it is live,
+    // reg2mem, save/restore at appropriate callsites, etc.
+    assert(depth < 8);
+
+    // Reuse an already rematerialized value?
+    auto rematIt = m_rematMap.find(inst);
+    if (rematIt != m_rematMap.end())
+      return rematIt->second;
+
+    // Handle allocas
+    if (AllocaInst* alloc = dyn_cast<AllocaInst>(inst))
+    {
+      assert(depth > 0); // Should only be an operand to another rematerialized value
+      auto valIt = m_allocaToVal.find(alloc);
+      if (valIt != m_allocaToVal.end()) // Is it a value that is live at some callsite (and reg2mem'd)?
+      {
+        Instruction* val = valIt->second;
+        if (canRematerialize(val))
+        {
+          // Rematerialize here and store to the alloca. We may have already rematerialized a load
+          // from the alloca. Any future uses will use the rematerialized value directly.
+          Instruction* remat = rematerialize(val, workList, insertBefore, depth + 1);
+          new StoreInst(remat, alloc, insertBefore);
+        }
+        else
+        {
+          // Value has to be restored, but rematerialization may have extended
+          // the liveness of this value to this callsite. Make sure it gets restored.
+          if (!m_liveHere.count(val))
+            workList.push_back(val);
+        }
+      }
+
+      // Allocas are not cloned.
+      return inst;
+    }
+
+    Instruction* clone = inst->clone();
+    clone->setName(addSuffix(inst->getName(), ".remat"));
+    for (unsigned i = 0; i < inst->getNumOperands(); ++i)
+    {
+      Value* op = inst->getOperand(i);
+      if (Instruction* opInst = dyn_cast<Instruction>(op))
+        clone->setOperand(i, rematerialize(opInst, workList, insertBefore, depth + 1));
+      else
+        clone->setOperand(i, op);
+    }
+    clone->insertBefore(insertBefore); // insert after any instructions cloned for operands
+    m_rematMap[inst] = clone;
+    return clone;
+  }
+
+
+  // Returns the clone previously produced by rematerialize() for val, or
+  // nullptr if val has not been rematerialized by this object.
+  Instruction* getRematerializedValueFor(Instruction* val)
+  {
+    auto it = m_rematMap.find(val);
+    if (it != m_rematMap.end())
+      return it->second;
+    else
+      return nullptr;
+  }
+
+
+private:
+  DenseMap<Instruction*, Instruction*> m_rematMap;    // Map instructions to their rematerialized counterparts
+  DenseMap<AllocaInst*, Instruction*>& m_allocaToVal; // Map allocas for reg2mem'd live values back to the value
+  const InstructionSetVector& m_liveHere;             // Values live at this callsite
+  const std::set<Value*>& m_resources;                // Values for resources like SRVs, UAVs, etc.
+};
+
+
+
+// Binds the transform to `func` and records the position of the function's
+// cleaned name within `candidateFuncNames` as m_functionIdx. The name is
+// expected to be present (asserted).
+StateFunctionTransform::StateFunctionTransform(Function* func, const std::vector<std::string>& candidateFuncNames, Type* runtimeDataArgTy)
+  : m_function(func)
+  , m_candidateFuncNames(candidateFuncNames)
+  , m_runtimeDataArgTy(runtimeDataArgTy)
+{
+  m_functionName = cleanName(m_function->getName());
+
+  auto first = m_candidateFuncNames.begin();
+  auto last = m_candidateFuncNames.end();
+  auto pos = std::find(first, last, m_functionName);
+  assert(pos != last);
+  m_functionIdx = pos - first;
+}
+
+// Records the attribute size, in bytes, used by later stages of the transform.
+void StateFunctionTransform::setAttributeSize(int size)
+{
+  this->m_attributeSizeInBytes = size;
+}
+
+// Records the semantic type of each function parameter and whether attribute
+// parameters should be read from the committed (true) or pending (false)
+// attribute offset.
+void StateFunctionTransform::setParameterInfo(const std::vector<ParameterSemanticType>& paramTypes, bool useCommittedAttr)
+{
+  this->m_useCommittedAttr = useCommittedAttr;
+  this->m_paramTypes = paramTypes;
+}
+
+// Remembers the set of resource values. Only the address of `resources` is
+// stored, so the set must stay alive for as long as the transform uses it.
+void StateFunctionTransform::setResourceGlobals(const std::set<llvm::Value*>& resources)
+{
+  const std::set<llvm::Value*>* resourceSet = &resources;
+  m_resources = resourceSet;
+}
+
+// Builds, through FunctionBuilder, the placeholder function
+// "dummyRuntimeDataArg" in `module` with type `runtimeDataArgTy`.
+Function* StateFunctionTransform::createDummyRuntimeDataArgFunc(Module* module, Type* runtimeDataArgTy)
+{
+  Function* dummyFunc = FunctionBuilder(module, "dummyRuntimeDataArg").type(runtimeDataArgTy).build();
+  return dummyFunc;
+}
+
+// Stores the verbose flag.
+void StateFunctionTransform::setVerbose(bool val)
+{
+  this->m_verbose = val;
+}
+
+// Stores a copy of the dump file name.
+void StateFunctionTransform::setDumpFilename(const std::string& dumpFilename)
+{
+  this->m_dumpFilename = dumpFilename;
+}
+// Runs the transform stages in order, printing the function(s) after each one; stateFunctions is filled by createSubstateFunctions and shaderStackSize is passed to preserveLiveValuesAcrossCallsites.
+void StateFunctionTransform::run(std::vector<Function*>& stateFunctions, _Out_ unsigned int &shaderStackSize)
+{
+  printFunction("Initial");
+  // Stage 1: preparatory passes, call-site discovery and helper declarations.
+  init();
+  printFunction("AfterInit");
+  // Stage 2: stack/trace frames, argument frames and the new function signature.
+  changeCallingConvention();
+  printFunction("AfterCallingConvention");
+  // Stage 3: save/restore or rematerialize values that are live across call sites.
+  preserveLiveValuesAcrossCallsites(shaderStackSize);
+  printFunction("AfterPreserveLiveValues");
+  // Stage 4: split the function into one state function per substate.
+  createSubstateFunctions(stateFunctions);
+  printFunctions(stateFunctions, "AfterSubstateFunctions");
+  // Stage 5: lower the stack helper calls.
+  lowerStackFuncs();
+  printFunctions(stateFunctions, "AfterLowerStackFuncs");
+}
+
+// Resolves every call to the placeholder function "dummyStateId" in `module`
+// to the constant candidateFuncEntryStateIds[functionIdx] + substate, where
+// both values are read from the call's two operands. The resolved calls and
+// the placeholder function itself are then erased. Does nothing when the
+// module has no such function.
+void StateFunctionTransform::finalizeStateIds(llvm::Module* module, const std::vector<int>& candidateFuncEntryStateIds)
+{
+  Function* dummyStateIdFunc = module->getFunction("dummyStateId");
+  if (dummyStateIdFunc == nullptr)
+    return;
+
+  LLVMContext& ctx = module->getContext();
+
+  // Collect first, erase afterwards, so the user list is not edited mid-walk.
+  std::vector<Instruction*> resolvedCalls;
+  for (User* user : dummyStateIdFunc->users())
+  {
+    if (CallInst* stateIdCall = dyn_cast<CallInst>(user))
+    {
+      int funcIdx = 0;
+      int substateIdx = 0;
+      getConstantValue(funcIdx, stateIdCall->getArgOperand(0));
+      getConstantValue(substateIdx, stateIdCall->getArgOperand(1));
+
+      int resolvedId = candidateFuncEntryStateIds[funcIdx] + substateIdx;
+      stateIdCall->replaceAllUsesWith(makeInt32(resolvedId, ctx));
+      resolvedCalls.push_back(stateIdCall);
+    }
+  }
+
+  for (Instruction* dead : resolvedCalls)
+    dead->eraseFromParent();
+  dummyStateIdFunc->eraseFromParent();
+}
+// Prepares m_function: cleans its name, runs preparatory passes, collects call sites/returns, declares helper functions and inserts calls to them after the entry-block allocas, then lowers SetPendingAttr calls to stack stores.
+void StateFunctionTransform::init()
+{
+  Module* module = m_function->getParent();
+  m_function->setName(cleanName(m_function->getName()));
+
+  // Run preparatory passes
+  runPasses(m_function, {
+    //createBreakCriticalEdgesPass(),
+    //createLoopSimplifyPass(),
+    //createLCSSAPass(),
+    createPromoteMemoryToRegisterPass()
+  });
+
+  // Make debugging a little easier by giving things names
+  dbgNameUnnamedVals(m_function);
+
+  // Fills m_setPendingAttrCalls, m_movePayloadToStackCalls, m_callSites, m_callSiteFunctionIdx and m_returns.
+  findCallSitesIntrinsicsAndReturns();
+
+
+  // Create a bunch of functions that we are going to need
+  m_stackIntPtrFunc = FunctionBuilder(module, "stackIntPtr").i32Ptr().type(m_runtimeDataArgTy, "runtimeData").i32("baseOffset").i32("offset").build();
+  // Placeholder for the runtime data pointer; changeFunctionSignature() replaces it with the function's argument.
+  Instruction* insertBefore = afterEntryBlockAllocas(m_function);
+  Function* runtimeDataArgFunc = createDummyRuntimeDataArgFunc(module, m_runtimeDataArgTy);
+  m_runtimeDataArg = CallInst::Create(runtimeDataArgFunc, "runtimeData", insertBefore);
+  // Placeholder for the stack frame size; rewriteDummyStackSize() replaces it with a constant.
+  Function* stackFrameSizeFunc = FunctionBuilder(module, "dummyStackFrameSize").i32().build();
+  m_stackFrameSizeVal = CallInst::Create(stackFrameSizeFunc, "stackFrame.size", insertBefore);
+
+  // TODO only create the values that are actually needed
+  Function* payloadOffsetFunc = FunctionBuilder(module, "payloadOffset").i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  m_payloadOffset = CallInst::Create(payloadOffsetFunc, { m_runtimeDataArg }, "payload.offset", insertBefore);
+
+  Function* committedAttrOffsetFunc = FunctionBuilder(module, "committedAttrOffset").i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  m_committedAttrOffset = CallInst::Create(committedAttrOffsetFunc, { m_runtimeDataArg }, "committedAttr.offset", insertBefore);
+
+  Function* pendingAttrOffsetFunc = FunctionBuilder(module, "pendingAttrOffset").i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  m_pendingAttrOffset = CallInst::Create(pendingAttrOffsetFunc, { m_runtimeDataArg }, "pendingAttr.offset", insertBefore);
+
+  Function* stackFrameOffsetFunc = FunctionBuilder(module, "stackFrameOffset").i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  m_stackFrameOffset = CallInst::Create(stackFrameOffsetFunc, { m_runtimeDataArg }, "stackFrame.offset", insertBefore);
+
+
+  // lower SetPendingAttr() now
+  for (CallInst* call : m_setPendingAttrCalls)
+  {
+    // Get the current pending attribute offset. It can change when a hit is committed
+    Instruction* insertBefore = call; // shadows the entry-block insertion point above
+    Value* currentPendingAttrOffset = CallInst::Create(pendingAttrOffsetFunc, { m_runtimeDataArg }, "cur.pendingAttr.offset", insertBefore);
+    Value* attr = call->getArgOperand(0);
+    createStackStore(currentPendingAttrOffset, attr, 0, insertBefore);
+    call->eraseFromParent();
+  }
+}
+
+// Walks every instruction of m_function and sorts the interesting ones into
+// member lists: SetPendingAttr calls, movePayloadToStack calls, call sites
+// (indirect calls and calls to candidate functions) and return instructions.
+// For calls to candidate functions the callee's candidate index is appended to
+// m_callSiteFunctionIdx.
+void StateFunctionTransform::findCallSitesIntrinsicsAndReturns()
+{
+  // Name -> index lookup table for the candidate functions.
+  std::map<std::string, int> candidateIndexByName;
+  for (int idx = 0; idx < (int)m_candidateFuncNames.size(); ++idx)
+    candidateIndexByName[m_candidateFuncNames[idx]] = idx;
+
+  for (auto& I : inst_range(m_function))
+  {
+    if (ReturnInst* ret = dyn_cast<ReturnInst>(&I))
+    {
+      m_returns.push_back(ret);
+      continue;
+    }
+
+    CallInst* call = dyn_cast<CallInst>(&I);
+    if (!call)
+      continue;
+
+    StringRef calleeName = call->getCalledFunction()->getName();
+    if (calleeName.startswith(SET_PENDING_ATTR_PREFIX))
+    {
+      m_setPendingAttrCalls.push_back(call);
+      continue;
+    }
+    if (calleeName.startswith("movePayloadToStack"))
+    {
+      m_movePayloadToStackCalls.push_back(call);
+      continue;
+    }
+    if (calleeName == CALL_INDIRECT_NAME)
+    {
+      m_callSites.push_back(call);
+      continue;
+    }
+
+    // Anything else only matters when it targets a candidate function.
+    auto found = candidateIndexByName.find(cleanName(calleeName));
+    if (found == candidateIndexByName.end())
+      continue;
+
+    assert(call->getCalledFunction()->getReturnType() == Type::getVoidTy(call->getContext()) && "Continuations with returns not supported");
+    m_callSites.push_back(call);
+    m_callSiteFunctionIdx.push_back(found->second);
+  }
+}
+
+// Switches the function to the stack-based calling convention: allocates the
+// stack frame (when there are call sites or an attribute size was set) and the
+// trace frame (when an attribute size was set), then builds the argument
+// frames and rewrites the function signature.
+void StateFunctionTransform::changeCallingConvention()
+{
+  const bool hasAttributeSize = m_attributeSizeInBytes >= 0;
+
+  if (hasAttributeSize || !m_callSites.empty())
+    allocateStackFrame();
+
+  if (hasAttributeSize)
+    allocateTraceFrame();
+
+  createArgFrames();
+  changeFunctionSignature();
+}
+
+// True when `inst` is a call whose callee name starts with "stack.ptr".
+static bool isCallToStackPtr(Value* inst)
+{
+  if (CallInst* call = dyn_cast<CallInst>(inst))
+  {
+    if (call->getCalledFunction()->getName().startswith("stack.ptr"))
+      return true;
+  }
+  return false;
+}
+
+// For every live GEP whose base pointer is an alloca, makes the alloca live at
+// the same indices as the GEP. Live pointers that are allocas or stack.ptr
+// calls, and GEPs based on a stack.ptr call, are left alone; any other live
+// pointer is unexpected (asserted).
+static void extendAllocaLifetimes(LiveValues& lv)
+{
+  for (Instruction* live : lv.getAllLiveValues())
+  {
+    // Only pointer-typed values that are neither allocas nor stack pointers.
+    if (!live->getType()->isPointerTy() || isa<AllocaInst>(live) || isCallToStackPtr(live))
+      continue;
+
+    GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(live);
+    assert(gep && "Unhandled live pointer");
+    if (isCallToStackPtr(gep->getPointerOperand()))
+      continue;
+
+    AllocaInst* baseAlloca = dyn_cast<AllocaInst>(gep->getPointerOperand());
+    assert(baseAlloca && "GEP of non-alloca pointer");
+
+    // TODO: We need to set indices of the uses of the gep, not the gep itself
+    const LiveValues::Indices* gepLive = lv.getIndicesWhereLive(gep);
+    const LiveValues::Indices* allocaLive = lv.getIndicesWhereLive(baseAlloca);
+    const bool needsUpdate = !allocaLive || *allocaLive != *gepLive;
+    if (needsUpdate)
+      lv.setIndicesWhereLive(baseAlloca, gepLive);
+  }
+}
+
+
+// Makes every value that is live across a call site survive the call. Live
+// allocas are moved into the stack frame; the remaining live values are
+// demoted to allocas and then, per call site, either saved to and restored
+// from the stack or rematerialized after the call. Finishes by replacing the
+// dummy stack-frame size with the computed size.
+//
+// shaderStackSize receives the computed frame size in bytes; it is 0 when the
+// function has no call sites.
+void StateFunctionTransform::preserveLiveValuesAcrossCallsites(_Out_ unsigned int &shaderStackSize)
+{
+  if (m_callSites.empty())
+  {
+    // No stack frame. Nothing to do, but the _Out_ parameter must still be
+    // written so the caller never reads an unset value.
+    rewriteDummyStackSize(0);
+    shaderStackSize = 0;
+    return;
+  }
+
+  SetVector<Instruction*> stackOffsets;
+  stackOffsets.insert(m_stackFrameOffset);
+  if (m_payloadOffset && !m_payloadOffset->user_empty())
+    stackOffsets.insert(m_payloadOffset);
+  if (m_committedAttrOffset && !m_committedAttrOffset->user_empty())
+    stackOffsets.insert(m_committedAttrOffset);
+  if (m_pendingAttrOffset && !m_pendingAttrOffset->user_empty())
+    stackOffsets.insert(m_pendingAttrOffset);
+
+  // Do liveness analysis
+  ArrayRef<Instruction*> instructions((Instruction**)m_callSites.data(), m_callSites.size());
+  LiveValues lv(instructions);
+  lv.run();
+
+  // Make sure alloca lifetimes match their uses
+  extendAllocaLifetimes(lv);
+
+  // Make sure stack offsets get included
+  for (auto o : stackOffsets)
+    lv.setLiveAtAllIndices(o, true);
+
+  // Add payload allocas, if any
+  for (CallInst* call : m_movePayloadToStackCalls)
+  {
+    if (AllocaInst* payloadAlloca = dyn_cast<AllocaInst>(call->getArgOperand(0)))
+      lv.setLiveAtAllIndices(payloadAlloca, true);
+  }
+
+  printSet(lv.getAllLiveValues(), "live values");
+
+
+
+  //
+  // Carve up the stack frame.
+  //
+  uint64_t offsetInBytes = 0;
+
+  // ... argument frame
+  offsetInBytes += m_maxCallerArgFrameSizeInBytes;
+
+
+  // ... live allocas.
+  Module* module = m_function->getParent();
+  DataLayout DL(module);
+  DenseMap<Instruction*, Instruction*> allocaToStack;
+  Instruction* insertBefore = getInstructionAfter(m_stackFrameOffset);
+  for (Instruction* inst : lv.getAllLiveValues())
+  {
+    AllocaInst* alloc = dyn_cast<AllocaInst>(inst);
+    if (!alloc)
+      continue;
+
+    // Allocate a slot in the stack frame for the alloca
+    offsetInBytes = align(offsetInBytes, inst, DL);
+    Instruction* stackAlloca = createStackPtr(m_stackFrameOffset, alloc, offsetInBytes, insertBefore);
+    alloc->replaceAllUsesWith(stackAlloca);
+    allocaToStack[inst] = stackAlloca;
+
+    offsetInBytes += DL.getTypeAllocSize(alloc->getAllocatedType());
+  }
+  lv.remapLiveValues(allocaToStack); // replace old allocas with stackAllocas
+  for (auto& kv : allocaToStack)
+    kv.first->eraseFromParent(); // delete old allocas
+
+  // Set payload offsets now that they are all on the stack
+  for (CallInst* call : m_movePayloadToStackCalls)
+  {
+    CallInst* payloadStackPtr = dyn_cast<CallInst>(call->getArgOperand(0));
+    // Check for null before dereferencing so a non-call operand trips the assert.
+    assert(payloadStackPtr && payloadStackPtr->getCalledFunction()->getName().startswith("stack.ptr"));
+    Value* baseOffset = payloadStackPtr->getArgOperand(0);
+    Value* idx = payloadStackPtr->getArgOperand(1);
+    Value* payloadOffset = BinaryOperator::Create(Instruction::Add, baseOffset, idx, "", call);
+    call->replaceAllUsesWith(payloadOffset);
+    payloadOffset->takeName(call);
+    call->eraseFromParent();
+  }
+  //printFunction("AfterStackAllocas");
+
+
+  // ... saves/restores for each call site
+  // Create allocas for live values. This makes it easier to generate code because
+  // we don't have to maintain the use-def chains of SSA form. We can just
+  // load/store from/to the alloca for a particular value. A subsequent mem2reg
+  // pass will rebuild the SSA form.
+  DenseMap<Instruction*, AllocaInst*> valToAlloca;
+  DenseMap<AllocaInst*, Instruction*> allocaToVal;
+  for (Instruction* inst : lv.getAllLiveValues())
+    reg2Mem(valToAlloca, allocaToVal, inst);
+  //printFunction("AfterReg2Mem");
+
+  // Every call site reuses the same save area, starting at baseOffsetInBytes.
+  uint64_t baseOffsetInBytes = offsetInBytes;
+  uint64_t maxOffsetInBytes = offsetInBytes;
+  for (size_t i = 0; i < m_callSites.size(); ++i)
+  {
+    offsetInBytes = baseOffsetInBytes;
+
+    const InstructionSetVector& liveHere = lv.getLiveValues(i);
+    std::vector<Instruction*> workList(liveHere.begin(), liveHere.end());
+    std::set<Instruction*> visited;
+    Rematerializer R(allocaToVal, liveHere, *m_resources);
+    Instruction* saveInsertBefore = m_callSites[i];
+    Instruction* restoreInsertBefore = getInstructionAfter(m_callSites[i]);
+    Instruction* rematInsertBefore = nullptr; // create only if needed
+
+    // Rematerialize stack offsets after the continuation before other restores
+    for (Instruction* inst : stackOffsets)
+    {
+      visited.insert(inst);
+      Instruction* remat = R.rematerialize(inst, workList, restoreInsertBefore);
+      new StoreInst(remat, valToAlloca[inst], restoreInsertBefore);
+    }
+    Instruction* saveStackFrameOffset = new LoadInst(valToAlloca[m_stackFrameOffset], "stackFrame.offset", saveInsertBefore);
+    Instruction* restoreStackFrameOffset = R.getRematerializedValueFor(m_stackFrameOffset);
+
+    while (!workList.empty())
+    {
+      Instruction* inst = workList.back();
+      workList.pop_back();
+      if (!visited.insert(inst).second)
+        continue;
+
+      if (!R.canRematerialize(inst))
+      {
+        // Save before the call, restore after it.
+        assert(!inst->getType()->isPointerTy() && "Can not save pointers");
+
+        offsetInBytes = align(offsetInBytes, inst, DL);
+        AllocaInst* alloca = valToAlloca[inst];
+
+        Value* saveVal = new LoadInst(alloca, addSuffix(inst->getName(), ".save"), saveInsertBefore);
+        createStackStore(saveStackFrameOffset, saveVal, offsetInBytes, saveInsertBefore);
+
+        Value* restoreVal = createStackLoad(restoreStackFrameOffset, inst, offsetInBytes, restoreInsertBefore);
+        new StoreInst(restoreVal, alloca, restoreInsertBefore);
+
+        offsetInBytes += DL.getTypeAllocSize(inst->getType());
+      }
+      else if (R.getRematerializedValueFor(inst) == nullptr)
+      {
+        if (!rematInsertBefore)
+        {
+          // Create a new block after restores for rematerialized values. This
+          // ensures that we can use restored values (through their allocas) even
+          // if we haven't generated the actual restore yet.
+          rematInsertBefore = restoreInsertBefore->getParent()->splitBasicBlock(restoreInsertBefore, "remat_begin")->begin();
+          restoreInsertBefore = m_callSites[i]->getParent()->getTerminator();
+        }
+        Instruction* remat = R.rematerialize(inst, workList, rematInsertBefore);
+        new StoreInst(remat, valToAlloca[inst], rematInsertBefore);
+      }
+    }
+
+    // Take the max offset over all call sites
+    maxOffsetInBytes = std::max(maxOffsetInBytes, offsetInBytes);
+  }
+
+
+  // ... traceFrame (if any)
+  maxOffsetInBytes += m_traceFrameSizeInBytes;
+
+
+  // Set the stack size
+  rewriteDummyStackSize(maxOffsetInBytes);
+  shaderStackSize = maxOffsetInBytes;
+}
+
+// Splits m_function into one state function per substate (number of call
+// sites + 1), stores them in `stateFunctions`, tags each with the
+// "state_function" attribute and finally erases the original function.
+void StateFunctionTransform::createSubstateFunctions(std::vector<Function*>& stateFunctions)
+{
+  // The runtime perf of split() depends on the number of blocks in the function.
+  // Simplifying the CFG before the split helps reduce the cost of that operation.
+  runPasses(m_function, {
+    createCFGSimplificationPass()
+  });
+
+  const size_t numStates = m_callSites.size() + 1;
+  stateFunctions.resize(numStates);
+
+  BasicBlockVector entryBlocks = replaceCallSites();
+  for (size_t state = 0; state < numStates; ++state)
+  {
+    Function* stateFunc = split(m_function, entryBlocks[state], state);
+    stateFunctions[state] = stateFunc;
+
+    // The attribute lets later code detect an intrinsic that is not called
+    // from a state function and so has no access to the runtimeData pointer.
+    stateFunc->addFnAttr("state_function", "true");
+  }
+
+  // The base function has been fully replaced by the state functions.
+  m_function->eraseFromParent();
+  m_function = nullptr;
+}
+
+// Inserts a call to "stackFramePush" immediately before m_stackFrameOffset and
+// a call to "stackFramePop" before every recorded return. Both receive the
+// runtime data argument and the (still placeholder) stack frame size.
+void StateFunctionTransform::allocateStackFrame()
+{
+  Module* mod = m_function->getParent();
+
+  // Push: placed ahead of the stack frame offset query.
+  Function* pushFunc = FunctionBuilder(mod, "stackFramePush").voidTy().type(m_runtimeDataArgTy, "runtimeData").i32("size").build();
+  Instruction* pushInsertPt = m_stackFrameOffset;
+  m_stackFramePush = CallInst::Create(pushFunc, { m_runtimeDataArg, m_stackFrameSizeVal }, "", pushInsertPt);
+
+  // Pop: one per return.
+  Function* popFunc = FunctionBuilder(mod, "stackFramePop").voidTy().type(m_runtimeDataArgTy, "runtimeData").i32("size").build();
+  for (Instruction* retInst : m_returns)
+    CallInst::Create(popFunc, { m_runtimeDataArg, m_stackFrameSizeVal }, "", retInst);
+}
+
+// Computes the trace frame size (committed + pending attributes plus the two
+// old attribute offsets) and inserts a "traceFramePush" call after the
+// entry-block allocas and a "traceFramePop" call before every recorded return.
+// The attribute size must have been set and must be a whole number of ints,
+// because the runtime is told the size in ints.
+void StateFunctionTransform::allocateTraceFrame()
+{
+  assert(m_attributeSizeInBytes >= 0 && "Attribute size has not been specified");
+  // The division below would otherwise silently drop the trailing bytes.
+  assert(m_attributeSizeInBytes % sizeof(int) == 0 && "Attribute size must be a multiple of sizeof(int)");
+
+  m_traceFrameSizeInBytes =
+      2 * m_attributeSizeInBytes // committed and pending attributes
+    + 2 * sizeof(int);           // old committed/pending attribute offsets
+  int attrSizeInInts = m_attributeSizeInBytes / sizeof(int);
+
+  // Push the trace frame first thing so that the runtime
+  // can do setup relative to the entry stack offset.
+  Module* module = m_function->getParent();
+  Instruction* insertBefore = afterEntryBlockAllocas(m_function);
+  Value* attrSize = makeInt32(attrSizeInInts, module->getContext());
+  Function* traceFramePushFunc = FunctionBuilder(module, "traceFramePush").voidTy().type(m_runtimeDataArgTy, "runtimeData").i32("attrSize").build();
+  CallInst::Create(traceFramePushFunc, { m_runtimeDataArg, attrSize }, "", insertBefore);
+
+  // Pop the trace frame just before returns.
+  Function* traceFramePopFunc = FunctionBuilder(module, "traceFramePop").voidTy().type(m_runtimeDataArgTy, "runtimeData").build();
+  for (Instruction* insertBefore : m_returns)
+    CallInst::Create(traceFramePopFunc, { m_runtimeDataArg }, "", insertBefore);
+}
+// Placeholder predicate: currently reports every operand as a temporary alloca without inspecting `op` (see TODO).
+bool isTemporaryAlloca(Value* op)
+{
+  // TODO: Need to do some analysis to figure this out. We can put the alloca on
+  // the caller stack if:
+  //  there is only a single callsite OR
+  //  if no callsite between stores/loads and this callsite
+  return true;
+}
+// Moves argument passing onto the stack: this function's own arguments become stack pointers/loads, and each call site's arguments are stored into the caller arg frame after the return state id; also computes m_maxCallerArgFrameSizeInBytes.
+void StateFunctionTransform::createArgFrames()
+{
+  Module* module = m_function->getParent();
+  DataLayout DL(module);
+  Instruction* stackAllocaInsertBefore = getInstructionAfter(m_stackFrameOffset);
+
+  // Retrieve this function's arguments from the stack
+  if (m_function->getFunctionType()->getNumParams() > 0)
+  {
+    if (m_paramTypes.empty())
+      m_paramTypes.assign(m_function->getFunctionType()->getNumParams(), PST_NONE); // assume standard argument types
+
+    static_assert(PST_COUNT == 3, "Expected 3 parameter semantic types");
+    int offsetInBytes[PST_COUNT] = { 0, 0, 0 };
+    Value* baseOffset[PST_COUNT] = { nullptr, nullptr, nullptr };
+    // Resolve the base offset once per semantic type that actually occurs in m_paramTypes.
+    Instruction* insertBefore = stackAllocaInsertBefore;
+    for (auto pst : m_paramTypes)
+    {
+      if (baseOffset[pst])
+        continue;
+
+      if (pst == PST_NONE)
+      {
+        baseOffset[pst] = BinaryOperator::Create(Instruction::Add, m_stackFrameOffset, m_stackFrameSizeVal, "callerArgFrame.offset", insertBefore);
+        offsetInBytes[pst] = sizeof(int); // skip the first element in caller arg frame (returnStateID)
+      }
+      else if (pst == PST_PAYLOAD)
+      {
+        baseOffset[pst] = m_payloadOffset;
+      }
+      else if (pst == PST_ATTRIBUTE)
+      {
+        baseOffset[pst] = (m_useCommittedAttr) ? m_committedAttrOffset : m_pendingAttrOffset;
+      }
+      else
+      {
+        assert(0 && "Bad parameter type");
+      }
+    }
+    // Replace every formal argument with a stack pointer (pointer args) or a stack load (all other args).
+    int argIdx = 0;
+    for (auto& arg : m_function->args())
+    {
+      ParameterSemanticType pst = m_paramTypes[argIdx];
+      Value* val = nullptr;
+      if (arg.getType()->isPointerTy())
+      {
+        // Assume that pointed to memory is on the stack.
+        val = createStackPtr(baseOffset[pst], &arg, offsetInBytes[pst], insertBefore);
+        offsetInBytes[pst] += DL.getTypeAllocSize(arg.getType()->getPointerElementType());
+      }
+      else
+      {
+        val = createStackLoad(baseOffset[pst], &arg, offsetInBytes[pst], insertBefore);
+        offsetInBytes[pst] += DL.getTypeAllocSize(arg.getType());
+      }
+
+      // Replace use of the argument with the loaded value
+      if (arg.hasName())
+        val->takeName(&arg);
+      else
+        val->setName("arg" + std::to_string(argIdx));
+      arg.replaceAllUsesWith(val);
+
+      argIdx++;
+    }
+  }
+
+
+  // Process function arguments for each call site
+  m_maxCallerArgFrameSizeInBytes = 0;
+  for (size_t i = 0; i < m_callSites.size(); ++i)
+  {
+    int offsetInBytes = 0;
+    CallInst* call = m_callSites[i];
+    FunctionType* FT = call->getCalledFunction()->getFunctionType();
+    StringRef calledFuncName = call->getCalledFunction()->getName();
+
+    Instruction* insertBefore = call;
+
+    // Set the return stateId (next substate of this function)
+    int nextSubstate = i + 1;
+    Value* nextStateId = getDummyStateId(m_functionIdx, nextSubstate, insertBefore);
+    createStackStore(m_stackFrameOffset, nextStateId, offsetInBytes, insertBefore);
+    offsetInBytes += DL.getTypeAllocSize(nextStateId->getType());
+    if (FT->getNumParams() && calledFuncName != CALL_INDIRECT_NAME)
+    {
+      for (unsigned index = 0; index < FT->getNumParams(); ++index)
+      {
+        // Save the argument from the argFrame
+        Value* op = call->getArgOperand(index);
+        Type* opTy = op->getType();
+        if (opTy->isPointerTy())
+        {
+          // TODO: Until we have callable shaders we should not get here except
+          // in tests.
+          if (isTemporaryAlloca(op))
+          {
+            // We can just replace the alloca with space in the arg frame
+            assert(isa<AllocaInst>(op));
+            Value* stackAlloca = createStackPtr(m_stackFrameOffset, op, offsetInBytes, stackAllocaInsertBefore);
+            op->replaceAllUsesWith(stackAlloca);
+            cast<AllocaInst>(op)->eraseFromParent();
+          }
+          else
+          {
+            // copy in/out
+            assert(0);
+          }
+          offsetInBytes += DL.getTypeAllocSize(opTy->getPointerElementType());
+        }
+        else
+        {
+          createStackStore(m_stackFrameOffset, op, offsetInBytes, insertBefore);
+          offsetInBytes += DL.getTypeAllocSize(opTy);
+        }
+
+        // Replace use of the argument with undef
+        call->setArgOperand(index, UndefValue::get(opTy));
+
+      }
+    }
+    // Track the largest caller arg frame over all call sites.
+    if (offsetInBytes > m_maxCallerArgFrameSizeInBytes)
+      m_maxCallerArgFrameSizeInBytes = offsetInBytes;
+  }
+}
+
+// Replaces m_function with a new function "<name>_tmp" that takes the runtime
+// data pointer and returns an i32, moving the old body into it. The dummy
+// runtime data value is replaced by the real argument, and every recorded
+// return is rewritten to return the state id loaded from slot 0 at the stack
+// frame offset.
+void StateFunctionTransform::changeFunctionSignature()
+{
+  // Build the replacement and move all basic blocks across.
+  Function* replacement = FunctionBuilder(m_function->getParent(), m_functionName + "_tmp").i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  replacement->getBasicBlockList().splice(replacement->begin(), m_function->getBasicBlockList());
+  m_function = replacement;
+
+  // Swap the placeholder runtime data value for the real first argument.
+  Value* realRuntimeData = m_function->arg_begin();
+  replaceValAndRemoveUnusedDummyFunc(m_runtimeDataArg, realRuntimeData, m_function);
+  m_runtimeDataArg = realRuntimeData;
+
+  LLVMContext& ctx = m_function->getContext();
+  Value* slotZero = makeInt32(0, ctx);
+  CallInst* frameOffsetAtRet = m_stackFrameOffset;
+  for (ReturnInst*& oldRet : m_returns)
+  {
+    Instruction* insertPt = oldRet;
+
+    // When a stack frame was pushed, query the stack frame offset again right
+    // at the return instead of reusing the earlier value.
+    if (m_stackFramePush)
+      frameOffsetAtRet = CallInst::Create(m_stackFrameOffset->getCalledFunction(), { m_runtimeDataArg }, "ret.stackFrame.offset", insertPt);
+
+    Instruction* stateIdPtr = CallInst::Create(m_stackIntPtrFunc, { m_runtimeDataArg, frameOffsetAtRet, slotZero }, "ret.stateId.ptr", insertPt);
+    Value* stateId = new LoadInst(stateIdPtr, "ret.stateId", insertPt);
+
+    ReturnInst* stateIdRet = ReturnInst::Create(ctx, stateId);
+    ReplaceInstWithInst(oldRet, stateIdRet);
+    oldRet = stateIdRet; // keep m_returns pointing at the live instruction
+  }
+}
+
+
+// Replaces the placeholder stack frame size value with a constant holding the
+// frame size in ints. `frameSizeInBytes` must be a multiple of sizeof(int).
+void StateFunctionTransform::rewriteDummyStackSize(uint64_t frameSizeInBytes)
+{
+  assert(frameSizeInBytes % sizeof(int) == 0);
+
+  LLVMContext& ctx = m_function->getContext();
+  Value* sizeInInts = makeInt32(frameSizeInBytes / sizeof(int), ctx);
+  replaceValAndRemoveUnusedDummyFunc(m_stackFrameSizeVal, sizeInInts, m_function);
+  m_stackFrameSizeVal = sizeInInts;
+}
+
+// Converts a byte offset (a multiple of sizeof(int)) to an int index constant.
+// When `baseOffset` is non-null an add of the constant and `baseOffset` is
+// inserted before `insertBefore` and returned instead.
+static inline Value* toIntIndex(int offsetInBytes, Value* baseOffset, Instruction* insertBefore)
+{
+  assert(offsetInBytes % sizeof(int) == 0);
+
+  Value* constIndex = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+  if (!baseOffset)
+    return constIndex;
+
+  return BinaryOperator::Create(Instruction::Add, constIndex, baseOffset, "", insertBefore);
+}
+
+// Inserts, before `insertBefore`, a call to the "stack.store" helper typed for
+// `val`, passing (val, baseOffset, offsetInBytes / sizeof(int)). The byte
+// offset must be a multiple of sizeof(int).
+void StateFunctionTransform::createStackStore(Value* baseOffset, Value* val, int offsetInBytes, Instruction* insertBefore)
+{
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* slotIndex = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+
+  Value* callArgs[] = { val, baseOffset, slotIndex };
+  Type* paramTys[] = { val->getType(), baseOffset->getType(), slotIndex->getType() };
+  Type* voidTy = Type::getVoidTy(val->getContext());
+  FunctionType* storeTy = FunctionType::get(voidTy, paramTys, false);
+
+  Function* storeFunc = getOrCreateFunction("stack.store", insertBefore->getModule(), storeTy, m_stackStoreFuncs);
+  CallInst::Create(storeFunc, callArgs, "", insertBefore);
+}
+
+// Inserts, before `insertBefore`, a call to the "stack.load" helper returning
+// the type of `val`, passing (baseOffset, offsetInBytes / sizeof(int)). The
+// result is named after `val` with a ".restore" suffix. The byte offset must
+// be a multiple of sizeof(int).
+Instruction* StateFunctionTransform::createStackLoad(Value* baseOffset, Value* val, int offsetInBytes, Instruction* insertBefore)
+{
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* slotIndex = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+
+  Value* callArgs[] = { baseOffset, slotIndex };
+  Type* paramTys[] = { baseOffset->getType(), slotIndex->getType() };
+  FunctionType* loadTy = FunctionType::get(val->getType(), paramTys, false);
+
+  Function* loadFunc = getOrCreateFunction("stack.load", insertBefore->getModule(), loadTy, m_stackLoadFuncs);
+  Instruction* restored = CallInst::Create(loadFunc, callArgs, addSuffix(val->getName(), ".restore"), insertBefore);
+  return restored;
+}
+
+// Inserts, before `insertBefore`, a call to the "stack.ptr" helper returning
+// `valTy`, passing (baseOffset, intIndex). The call is left unnamed.
+Instruction* StateFunctionTransform::createStackPtr(Value* baseOffset, Type* valTy, Value* intIndex, Instruction* insertBefore)
+{
+  Value* callArgs[] = { baseOffset, intIndex };
+  Type* paramTys[] = { baseOffset->getType(), intIndex->getType() };
+  FunctionType* ptrFuncTy = FunctionType::get(valTy, paramTys, false);
+
+  Function* ptrFunc = getOrCreateFunction("stack.ptr", insertBefore->getModule(), ptrFuncTy, m_stackPtrFuncs);
+  return CallInst::Create(ptrFunc, callArgs, "", insertBefore);
+}
+
+// Convenience overload: builds a "stack.ptr" call of val's type at the constant
+// index offsetInBytes / sizeof(int) and transfers val's name to the new call.
+Instruction* StateFunctionTransform::createStackPtr(Value* baseOffset, Value* val, int offsetInBytes, Instruction* insertBefore)
+{
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* slotIndex = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+
+  Instruction* stackPtr = createStackPtr(baseOffset, val->getType(), slotIndex, insertBefore);
+  stackPtr->takeName(val);
+  return stackPtr;
+}
+
+// Returns true if val is a direct call to a function whose name starts with
+// "stack.ptr" (the placeholder functions created by createStackPtr()).
+static bool isStackIntPtr(Value* val)
+{
+  CallInst* call = dyn_cast<CallInst>(val);
+  if (!call)
+    return false;
+
+  // getCalledFunction() returns null for indirect calls; such a call cannot be
+  // a stack.ptr placeholder, so report false instead of dereferencing null.
+  Function* callee = call->getCalledFunction();
+  return callee && callee->getName().startswith("stack.ptr");
+}
+
+// Emits instructions (inserted before gep) that compute the offset addressed
+// by gep, counted in ints rather than bytes, and returns the resulting value.
+// The accumulation starts from the value makeInt32(0) and adds one term per
+// non-zero GEP index.
+//
+// This code is adapted from GetElementPtrInst::accumulateConstantOffset().
+// TODO: Use a single function for both constant and dynamic offsets? Could do
+// some constant folding along the way for dynamic offsets.
+// NOTE(review): byte sizes are divided by sizeof(int) with integer division, so
+// this presumably assumes every field offset and stride is a multiple of
+// sizeof(int) - confirm.
+Value* accumulateDynamicOffset(GetElementPtrInst* gep, const DataLayout &DL)
+{
+  LLVMContext& C = gep->getContext();
+  Instruction* insertBefore = gep;
+  Value* offset = makeInt32(0, C);
+  for (gep_type_iterator GTI = gep_type_begin(gep), GTE = gep_type_end(gep); GTI != GTE; ++GTI)
+  {
+    // A constant zero index contributes nothing to the offset.
+    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+    if (OpC && OpC->isZero())
+      continue;
+
+    // Handle a struct index, which adds its field offset to the pointer.
+    Value* elementOffset = nullptr;
+    if (StructType *STy = dyn_cast<StructType>(*GTI))
+    {
+      assert(OpC && "Structure indices must be constant");
+      unsigned ElementIdx = OpC->getZExtValue();
+      const StructLayout *SL = DL.getStructLayout(STy);
+      elementOffset = makeInt32(SL->getElementOffset(ElementIdx) / sizeof(int), C);
+    }
+    else
+    {
+      // For array or vector indices, scale the index by the size of the type.
+      Value* stride = makeInt32(DL.getTypeAllocSize(GTI.getIndexedType()) / sizeof(int), C);
+      elementOffset = BinaryOperator::Create(Instruction::Mul, GTI.getOperand(), stride, "elOffs", insertBefore);
+    }
+
+    // Running sum of all index contributions seen so far.
+    offset = BinaryOperator::Create(Instruction::Add, offset, elementOffset, "offs", insertBefore);
+  }
+  return offset;
+}
+
+
+// Returns offsetVal plus the offset addressed by gep, counted in ints. The sum
+// is emitted as an add instruction named "offs" placed immediately before gep.
+static Value* accumulateGepOffset(GetElementPtrInst* gep, Value* offsetVal)
+{
+  Module* module = gep->getModule();
+  const DataLayout& layout = module->getDataLayout();
+
+  // Use the constant byte offset when all indices are constant; otherwise
+  // fall back to emitting the offset computation as instructions.
+  APInt constByteOffset(layout.getPointerSizeInBits(), 0);
+  Value* gepOffsetVal = gep->accumulateConstantOffset(layout, constByteOffset)
+    ? makeInt32((int)constByteOffset.getZExtValue() / sizeof(int), module->getContext())
+    : accumulateDynamicOffset(gep, layout);
+
+  return BinaryOperator::Create(Instruction::Add, offsetVal, gepOffsetVal, "offs", gep);
+}
+
+// Turn GEPs on a stack.ptr of aggregate type into stack.ptrs of scalar type.
+//
+// val is the pointer whose users are rewritten, baseOffset is forwarded
+// unchanged to every stack.ptr that gets created, and offsetVal is the offset
+// (in ints) of val itself. Each GEP user is replaced according to the type it
+// points to and then erased:
+//   - aggregate: recurse, using the GEP's accumulated offset
+//   - vector:    scalarizeVectorStackAccess()
+//   - otherwise: a single stack.ptr call of the GEP's type
+// Users that are neither calls nor GEPs are left untouched.
+void StateFunctionTransform::flattenGepsOnValue(Value* val, Value* baseOffset, Value* offsetVal)
+{
+  for (auto U = val->user_begin(), UE = val->user_end(); U != UE;)
+  {
+    // Advance before the user is (possibly) erased below.
+    User* user = *U++;
+    if (CallInst* call = dyn_cast<CallInst>(user))
+    {
+      // Inline the call to expose GEPs and restart the loop, since inlining
+      // changes the user list of val.
+      // NOTE(review): the result is only checked by assert; if inlining fails
+      // in a build without asserts the loop restarts on the same call - confirm
+      // that every call reaching here is inlinable.
+      InlineFunctionInfo IFI;
+      bool success = InlineFunction(call, IFI, false);
+      assert(success);
+      (void)success; 
+
+      U = val->user_begin();
+      UE = val->user_end();
+      continue;
+    }
+
+    GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(user);
+    if (!gep)
+      continue;
+
+    // Offset of the GEP result relative to baseOffset, in ints.
+    Value* elementOffsetVal = accumulateGepOffset(gep, offsetVal);
+    Type* gepElTy = gep->getType()->getPointerElementType();
+    if (gepElTy->isAggregateType())
+    {
+      // flatten geps on this gep
+      flattenGepsOnValue(gep, baseOffset, elementOffsetVal);
+    }
+    else if (isa<VectorType>(gepElTy))
+      scalarizeVectorStackAccess(gep, baseOffset, elementOffsetVal);
+    else 
+    {
+      Value* ptr = createStackPtr(baseOffset, gep->getType(), elementOffsetVal, gep);
+      ptr->takeName(gep); // could use a name that encodes the gep type and indices
+      gep->replaceAllUsesWith(ptr);
+    }
+
+    // All three branches above have dealt with the GEP's users.
+    gep->eraseFromParent();
+  }
+}
+
+
+// Rewrites loads and stores of a whole vector through vecPtr as per-element
+// accesses. One scalar stack.ptr is created per vector element, at consecutive
+// int offsets starting at offsetVal; every load user becomes a chain of element
+// loads + insertelement, every store user a chain of extractelement + element
+// stores. Users other than loads and stores trigger an assert. vecPtr itself
+// is not erased here.
+void StateFunctionTransform::scalarizeVectorStackAccess(Instruction* vecPtr, Value* baseOffset, Value* offsetVal)
+{
+  Type* VTy = vecPtr->getType()->getPointerElementType();
+  Type* elTy = VTy->getVectorElementType();
+  LLVMContext& C = vecPtr->getContext();
+  const unsigned numElements = VTy->getVectorNumElements();
+
+  std::vector<Value*> elPtrs;
+  elPtrs.reserve(numElements);
+  Value* curOffsetVal = offsetVal;
+  Value* one = makeInt32(1, C);
+  offsetVal->setName("offs0.");
+  for (unsigned i = 0; i < numElements; ++i)
+  {
+    // TODO: If offsetVal is a constant we could just create constants instead of add instructions
+    // The explicit int casts below keep the argument type in agreement with
+    // the %d conversion in the format strings passed to stringf().
+    if (i > 0)
+      curOffsetVal = BinaryOperator::Create(Instruction::Add, curOffsetVal, one, stringf("offs%d.", static_cast<int>(i)), vecPtr);
+    elPtrs.push_back(createStackPtr(baseOffset, elTy->getPointerTo(), curOffsetVal, vecPtr));
+    elPtrs.back()->setName(addSuffix(vecPtr->getName(), stringf(".el%d.", static_cast<int>(i))));
+  }
+
+  // Scalarize load/stores
+  for (auto U = vecPtr->user_begin(), UE = vecPtr->user_end(); U != UE;)
+  {
+    // Advance first: the user is erased below.
+    User* user = *U++;
+    if (LoadInst* load = dyn_cast<LoadInst>(user))
+    {
+      // Rebuild the vector value from the individually loaded elements.
+      Value* vec = UndefValue::get(VTy);
+      for (size_t i = 0; i < elPtrs.size(); ++i)
+      {
+        Value* el = new LoadInst(elPtrs[i], stringf("el%d.", static_cast<int>(i)), load);
+        vec = InsertElementInst::Create(vec, el, makeInt32(i, C), "vec", load);
+      }
+      load->replaceAllUsesWith(vec);
+      load->eraseFromParent();
+    }
+    else if (StoreInst* store = dyn_cast<StoreInst>(user))
+    {
+      // Split the stored vector into elements and store each one separately.
+      Value* vec = store->getOperand(0);
+      for (size_t i = 0; i < elPtrs.size(); ++i)
+      {
+        Value* el = ExtractElementInst::Create(vec, makeInt32(i, C), stringf("el%d.", static_cast<int>(i)), store);
+        new StoreInst(el, elPtrs[i], store);
+      }
+      store->eraseFromParent();
+    }
+    else
+    {
+      assert(0 && "Unhandled user");
+    }
+  }
+}
+
+
+// Replaces every call to the stack.store / stack.load / stack.ptr placeholder
+// functions (collected in m_stackStoreFuncs, m_stackLoadFuncs and
+// m_stackPtrFuncs) with code built on m_stackIntPtrFunc, then erases the
+// placeholder functions. In every case the runtime data value handed to the
+// lowered code is the first argument of the function containing the call.
+void StateFunctionTransform::lowerStackFuncs()
+{
+  LLVMContext& C = m_stackIntPtrFunc->getContext();
+  const DataLayout& DL = m_stackIntPtrFunc->getParent()->getDataLayout();
+
+  // stack.store functions
+  for (auto& kv : m_stackStoreFuncs)
+  {
+    Function* F = kv.second;
+    for (auto U = F->user_begin(); U != F->user_end(); )
+    {
+      // Advance before the call is erased at the end of the iteration.
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      // Operands follow the order used by createStackStore():
+      // (value, base offset, constant int index).
+      Value* runtimeDataArg = call->getParent()->getParent()->arg_begin();
+      Value* val = call->getArgOperand(0);
+      Value* offset = call->getArgOperand(1);
+      int idx = getConstantValue(call->getArgOperand(2));
+
+      Instruction* insertBefore = call;
+      if (isStackIntPtr(val))
+      {
+        // Copy from one part of the stack to another, one int at a time. The
+        // number of ints is the alloc size of the pointee type / sizeof(int).
+        CallInst* valCall = dyn_cast<CallInst>(val);
+        Value* srcOffset = valCall->getArgOperand(0);
+        int srcIdx = getConstantValue(valCall->getArgOperand(1));
+        Value* dstOffset = offset;
+        int dstIdx = idx;
+        int intCount = (int)DL.getTypeAllocSize(val->getType()->getPointerElementType()) / sizeof(int);
+        for (int i = 0; i < intCount; ++i)
+        {
+          std::string idxStr = stringf("%d.", i);
+          Value* srcPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, srcOffset, makeInt32(srcIdx + i, C) }, addSuffix(val->getName(), ".ptr" + idxStr), insertBefore);
+          Value* dstPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, dstOffset, makeInt32(dstIdx + i, C) }, "dst.ptr" + idxStr, insertBefore);
+          Value* intVal = new LoadInst(srcPtr, "copy.val" + idxStr, insertBefore);
+          new StoreInst(intVal, dstPtr, insertBefore);
+        }
+      }
+      else
+      {
+        // Any other value is written through the store() helper.
+        store(val, m_stackIntPtrFunc, runtimeDataArg, offset, idx, insertBefore);
+      }
+
+      call->eraseFromParent();
+    }
+    F->eraseFromParent();
+  }
+
+  // stack.load functions
+  for (auto& kv : m_stackLoadFuncs)
+  {
+    Function* F = kv.second;
+    for (auto U = F->user_begin(); U != F->user_end(); )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      // Recover the original value name by dropping the ".restore" suffix added
+      // in createStackLoad(), and clear the call's name before the replacement
+      // value is created under that name.
+      std::string name = stripSuffix(call->getName(), ".restore");
+      call->setName("");
+      Value* runtimeDataArg = call->getParent()->getParent()->arg_begin();
+      Value* offset = call->getArgOperand(0);
+      Value* idx = call->getArgOperand(1);
+
+      Instruction* insertBefore = call;
+      Value* val = load(m_stackIntPtrFunc, runtimeDataArg, offset, idx, name, call->getType(), insertBefore);
+      call->replaceAllUsesWith(val);
+      call->eraseFromParent();
+    }
+    F->eraseFromParent();
+  }
+
+
+  // Scalarize accesses based on a stack.ptr func. Only placeholders returning a
+  // pointer to an aggregate are handled here; the functions themselves are
+  // erased by the final loop below.
+  for (auto& kv : m_stackPtrFuncs)
+  {
+    Function* F = kv.second;
+    if (!F->getReturnType()->getPointerElementType()->isAggregateType())
+      continue;
+    for (auto U = F->user_begin(), UE = F->user_end(); U != UE; )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      Value* offset = call->getArgOperand(0);
+      Value* idx = call->getArgOperand(1);
+      flattenGepsOnValue(call, offset, idx);
+      call->eraseFromParent();
+    }
+  }
+
+
+  // stack.ptr functions: each remaining call becomes a call to
+  // m_stackIntPtrFunc, bitcast to the placeholder's return type when needed.
+  for (auto& kv : m_stackPtrFuncs)
+  {
+    Function* F = kv.second;
+    for (auto U = F->user_begin(); U != F->user_end(); )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      std::string name = call->getName();
+      Value* runtimeDataArg = call->getParent()->getParent()->arg_begin();
+      Value* offset = call->getArgOperand(0);
+      Value* idx = call->getArgOperand(1);
+
+      Instruction* insertBefore = call;
+      Value* ptr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, offset, idx }, addSuffix(name, ".ptr"), insertBefore);
+      if (ptr->getType() != call->getType())
+        ptr = new BitCastInst(ptr, call->getType(), "", insertBefore);
+      // The final value (call or bitcast) inherits the placeholder call's name.
+      ptr->takeName(call);
+      call->replaceAllUsesWith(ptr);
+      call->eraseFromParent();
+    }
+    F->eraseFromParent();
+  }
+}
+
+// Builds the substate function that starts at substateEntryBlock.
+//
+// The blocks reachable from substateEntryBlock are cloned into a new function,
+// which is appended to baseFunc's module and named
+// "<m_functionName>.ss_<substateIndex>". For every substate other than 0, each
+// alloca in baseFunc's entry block that is used inside the new function is
+// cloned into the new function's entry block and those uses are redirected to
+// the clone. Finally makeReducible() and the verifier + mem2reg passes are run
+// on the new function, which is then returned.
+Function* StateFunctionTransform::split(Function* baseFunc, BasicBlock* substateEntryBlock, int substateIndex)
+{
+  ValueToValueMapTy VMap;
+  Function*         substateFunc = cloneBlocksReachableFrom(substateEntryBlock, VMap);
+  Module*           module = baseFunc->getParent();
+  module->getFunctionList().push_back(substateFunc);
+  substateFunc->setName(m_functionName + ".ss_" + std::to_string(substateIndex));
+
+  if (substateIndex != 0)
+  {
+    // Collect allocas from entry block
+    SmallVector<Instruction*, 16> allocasToClone;
+    for (auto& I : baseFunc->getEntryBlock().getInstList())
+    {
+      if (isa<AllocaInst>(&I))
+        allocasToClone.push_back(&I);
+    }
+
+    // Clone collected allocas
+    BasicBlock* newEntryBlock = &substateFunc->getEntryBlock();
+    for (auto I : allocasToClone)
+    {
+      // Collect users of original instruction in substateFunc. dyn_cast yields
+      // null for a user that is not an instruction; such users cannot belong
+      // to substateFunc, so they are skipped rather than dereferenced.
+      std::vector<Instruction*> users;
+      for (auto U : I->users())
+      {
+        Instruction* inst = dyn_cast<Instruction>(U);
+        if (inst && inst->getParent()->getParent() == substateFunc)
+          users.push_back(inst);
+      }
+
+      if (users.empty())
+        continue;
+
+      // Clone instruction
+      Instruction* clone = I->clone();
+      if (I->hasName())
+        clone->setName(I->getName());
+      clone->insertBefore(newEntryBlock->getFirstInsertionPt()); // allocas first in entry block
+      RemapInstruction(clone, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+      // Replaces uses
+      for (auto user : users)
+        user->replaceUsesOfWith(I, clone);
+    }
+  }
+
+  //printFunction( substateFunc, substateFunc->getName().str() + "-BeforeSplittingOpt", m_dumpId++ );
+
+  makeReducible(substateFunc);
+
+  // Undo the reg2mem done in preserveLiveValuesAcrossCallSites()
+  runPasses(substateFunc, {
+    createVerifierPass(),
+    createPromoteMemoryToRegisterPass()
+  });
+
+  //printFunction( substateFunc, substateFunc->getName().str() + "-AfterSplitting", m_dumpId++ );
+
+  return substateFunc;
+}
+
+// Splits m_function at every recorded call site and turns each call into a
+// return of the next state ID. The result lists the entry block of every
+// substate: element 0 is the function's entry block (renamed to
+// "<m_functionName>.BB0"), element i + 1 is the block that follows call site i.
+BasicBlockVector StateFunctionTransform::replaceCallSites()
+{
+  LLVMContext& ctx = m_function->getContext();
+
+  BasicBlock* entryBlock = &m_function->getEntryBlock();
+  entryBlock->setName(m_functionName + ".BB0");
+
+  BasicBlockVector entryBlocks;
+  entryBlocks.push_back(entryBlock);
+
+  // Every call site ends its block; the code after the call starts a new substate.
+  const size_t numCallSites = m_callSites.size();
+  for (size_t siteIdx = 0; siteIdx != numCallSites; ++siteIdx)
+  {
+    CallInst*   site = m_callSites[siteIdx];
+    BasicBlock* siteBlock = site->getParent();
+    StringRef   calleeName = site->getCalledFunction()->getName();
+
+    entryBlocks.push_back(
+      siteBlock->splitBasicBlock(site->getNextNode(), m_functionName + ".BB" + std::to_string(siteIdx + 1) + ".from."
+        + cleanName(calleeName)));
+
+    // An indirect call carries the target state ID as its first argument; for
+    // any other callee a placeholder for the callee's entry state is emitted
+    // just before the call.
+    Value* nextStateId = (calleeName == CALL_INDIRECT_NAME)
+      ? site->getArgOperand(0)
+      : getDummyStateId(m_callSiteFunctionIdx[siteIdx], 0, site);
+
+    // The block's terminator becomes a return of the state ID, and the call goes away.
+    ReplaceInstWithInst(site->getParent()->getTerminator(), ReturnInst::Create(ctx, nextStateId));
+    site->eraseFromParent();
+  }
+
+  return entryBlocks;
+}
+
+// Emits, before insertBefore, a call named "stateId" to the "dummyStateId"
+// placeholder function with the constants (functionIdx, substate) and returns
+// that call. The placeholder function is built on first use and cached in
+// m_dummyStateIdFunc.
+llvm::Value* StateFunctionTransform::getDummyStateId(int functionIdx, int substate, llvm::Instruction* insertBefore)
+{
+  if (m_dummyStateIdFunc == nullptr)
+    m_dummyStateIdFunc = FunctionBuilder(m_function->getParent(), "dummyStateId").i32().i32("functionIdx").i32("substate").build();
+
+  LLVMContext& ctx = insertBefore->getContext();
+  Value* callArgs[] = { makeInt32(functionIdx, ctx), makeInt32(substate, ctx) };
+  return CallInst::Create(m_dummyStateIdFunc, callArgs, "stateId", insertBefore);
+}
+
+// Returns the stream that diagnostic output should be written to.
+// With no dump filename configured this is DBGS(). Otherwise a file stream is
+// heap-allocated for the path produced by createDumpPath(); if the file cannot
+// be opened, a message is written to DBGS() and DBGS() is returned instead.
+// A returned stream other than DBGS() was allocated with new here.
+raw_ostream& StateFunctionTransform::getOutputStream(const std::string functionName, const std::string& suffix, unsigned int dumpId)
+{
+  if (m_dumpFilename.empty())
+    return DBGS();
+
+  const std::string dumpPath = createDumpPath(m_dumpFilename, dumpId, suffix, functionName);
+  std::error_code openError;
+  raw_ostream* fileStream = new raw_fd_ostream(dumpPath, openError, sys::fs::OpenFlags::F_None);
+  if (!openError)
+    return *fileStream;
+
+  // Opening failed: report it, drop the unusable stream and fall back to DBGS().
+  DBGS() << "Failed to open " << dumpPath << " for writing sft output. " << openError.message() << "\n";
+  delete fileStream;
+  return DBGS();
+}
+
+// When verbose output is enabled, writes a banner containing suffix followed
+// by the IR of function to the stream chosen by getOutputStream(). A stream
+// other than DBGS() is deleted afterwards.
+void StateFunctionTransform::printFunction(const Function* function, const std::string& suffix, unsigned int dumpId)
+{
+  if (m_verbose)
+  {
+    raw_ostream& os = getOutputStream(m_functionName, suffix, dumpId);
+    os << "; ########################### " << suffix << "\n"
+       << *function << "\n";
+
+    const bool ownsStream = (&os != &DBGS());
+    if (ownsStream)
+      delete &os;
+  }
+}
+
+// Prints m_function under the next dump ID. The ID counter advances on every
+// call, whether or not verbose output is enabled.
+void StateFunctionTransform::printFunction(const std::string& suffix)
+{
+  const unsigned int dumpId = m_dumpId++;
+  printFunction(m_function, suffix, dumpId);
+}
+
+// When verbose output is enabled, writes a banner containing suffix followed
+// by the IR of every function in funcs to a single stream, consuming one dump
+// ID. A stream other than DBGS() is deleted afterwards.
+void StateFunctionTransform::printFunctions(const std::vector<Function*>& funcs, const char* suffix)
+{
+  if (m_verbose)
+  {
+    raw_ostream& os = getOutputStream(m_functionName, suffix, m_dumpId++);
+    os << "; ########################### " << suffix << "\n";
+    for (Function* func : funcs)
+    {
+      os << *func << "\n";
+    }
+
+    const bool ownsStream = (&os != &DBGS());
+    if (ownsStream)
+      delete &os;
+  }
+}
+
+// When verbose output is enabled, writes a banner containing suffix followed
+// by the IR of the whole module to the stream chosen by getOutputStream()
+// (using "module" in place of a function name), consuming one dump ID.
+void StateFunctionTransform::printModule(const Module* module, const std::string& suffix)
+{
+  if (!m_verbose)
+    return;
+
+  raw_ostream& out = getOutputStream("module", suffix, m_dumpId++);
+  out << "; ########################### " << suffix << "\n";
+  out << *module << "\n";
+
+  // getOutputStream() hands back a heap-allocated file stream whenever it does
+  // not return DBGS(); release it here, as printFunction()/printFunctions() do,
+  // so the stream object is not leaked.
+  if (&out != &DBGS())
+    delete &out;
+}
+
+// When verbose output is enabled, writes to DBGS() an optional heading (msg),
+// one line per instruction in vals giving the alloc size of its type in bytes
+// and the instruction itself, and a final line with the instruction count and
+// the sum of the sizes.
+void StateFunctionTransform::printSet(const InstructionSetVector& vals, const char* msg)
+{
+  if (!m_verbose)
+    return;
+
+  raw_ostream& out = DBGS();
+  if (msg)
+    out << msg << " --------------------\n";
+
+  uint64_t totalBytes = 0;
+  if (!vals.empty())
+  {
+    Module*    module = m_function->getParent();
+    DataLayout DL(module);
+    for (const Instruction* inst : vals)
+    {
+      const uint64_t size = DL.getTypeAllocSize(inst->getType());
+      // size is 64 bits wide, so it is passed as unsigned long long with a
+      // matching %llu conversion instead of being handed to %d.
+      out << stringf("%3lluB: ", static_cast<unsigned long long>(size)) << *inst << '\n';
+      totalBytes += size;
+    }
+  }
+  out << "Count:" << vals.size() << "  Bytes:" << totalBytes << "\n\n";
+}

+ 295 - 0
lib/DxrFallback/StateFunctionTransform.h

@@ -0,0 +1,295 @@
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+// Forward declarations of the LLVM types that appear in this header only as
+// pointers or references, so the corresponding LLVM headers are not included here.
+namespace llvm
+{
+  class AllocaInst;
+  class BasicBlock;
+  class CallInst;
+  class Function;
+  class FunctionType;
+  class Instruction;
+  class Module;
+  class raw_ostream;
+  class ReturnInst;
+  class StructType;
+  class Type;
+  class Value;
+}
+
+class LiveValues;
+
+// List of basic blocks; used as the result type of replaceCallSites().
+typedef std::vector<llvm::BasicBlock*>  BasicBlockVector;
+// Insertion-ordered set of instructions; used by printSet().
+typedef llvm::SetVector<llvm::Instruction*> InstructionSetVector;
+
+
+//==============================================================================
+// Transforms the given function into a number of state functions to be 
+// used in a state machine. 
+//
+// State functions have the following signature: 
+//    int (<RuntimeDataTy> runtimeData). 
+// They take a runtime data argument with a given type used by the runtime and 
+// return the state ID of the next state. If the function contains calls to other  
+// candidate functions that are to be transformed into state functions, the 
+// function is split into multiple substate functions at call sites and the calls 
+// are replaced with continuations. For example candidate funcA() calling candidate 
+// funcB():
+//   void funcA(int param0)
+//   {
+//      // code moved to funcA_ss0()
+//      int foo = 10;
+//      ...
+//
+//      funcB(arg0, arg1); 
+//
+//      // code moved to funcA_ss1()
+//      int bar = someFunc(foo);
+//      
+//   } 
+// will be split into two substate functions, funcA_ss0() and funcA_ss1(). 
+// funcA_ss0() pushes the stateID for funcA_ss1() onto the stack, and
+// returns the state ID for the entry substate of funcB, funcB_ss0(). 
+// A substate of funcB will eventually pop the stack and return the state ID
+// for funcA_ss1(). funcA_ss1() in turn pops the stack to get the state ID
+// placed there by its caller. 
+//
+// If candidate functions, like funcB(), have arguments they are moved to the stack.
+// Any values that are live across continuations, like foo in this example,
+// must also be saved to the stack before the continuation and restored before use. 
+// Some values, like DXIL buffer handles should not be saved and must be 
+// rematerialized after a continuation. The stack frame in a state function has
+// the following layout:
+//   
+//   |               |
+//   +---------------+  
+//   | argN          |  
+//   | ...           |   
+//   | arg0          |  
+//   | returnStateID | caller arg frame
+//   +---------------+ <-- entry stack pointer
+//   |               |
+//   | saved values  |
+//   |               |
+//   +---------------+
+//   | argN          |
+//   | ...           |
+//   | arg0          |
+//   | returnStateID | callee arg frame
+//   +---------------+ <-- stack frame pointer
+//           |
+//           V stack grows downward towards smaller addresses
+//
+// The return state ID is stored at the base of the argument frame, followed by
+// function arguments, if any. The saved values follow the argument frame. Instead
+// of adjusting the size of the stack frame for the saved values and argument
+// frames of each continuation a single allocation is made with enough space to
+// accommodate all continuations in the function.
+//
+// Several placeholder functions are used during the process of the state function
+// transform to break dependency cycles. A placeholder for the runtime data pointer
+// is used to allocate the stack frame before the function signature is changed
+// and the pointer parameter is created. The stack frame is also allocated before
+// its size has been determined, so a placeholder is used. The state IDs corresponding
+// to function entry substates may also not be known before the transform has been 
+// run on all the candidate functions. Therefore a placeholder is used for state 
+// IDs as well. These are replaced by calling StateFunctionTransform::finalizeStateIds()
+// after all the candidate functions have been transformed.
+//
+// If the intrinsic Internal_CallIndirect(int stateId) appears in the body of
+// the function then it is treated as a continuation with a transition to the
+// specified stateId.
+//
+// When an attribute size is specified, space is allocated on the stack frame for
+// committed/pending attributes, as well as the previous offsets for the committed/
+// pending attributes. The attribute size should be set if the 
+// function is TraceRay(). The payload offset needs to be set by the caller. The 
+// stack frame for TraceRay() has the following layout:
+//
+//   |                         |
+//   +-------------------------+ 
+//   |                         |
+//   | TraceRay() args         |
+//   |                         |
+//   +-------------------------+
+//   | returnStateID           | caller arg frame
+//   +-------------------------+ <-- entry stack offset
+//   | old committed attr offs |
+//   | old pending attr offset |
+//   +-------------------------+ 
+//   |                         |
+//   | committed attributes    |
+//   |                         |
+//   +-------------------------+ <-- new committed attribute offset
+//   |                         |
+//   | pending attributes      |
+//   |                         |
+//   +-------------------------+ <-- new pending attribute offset
+//   |                         |
+//   | saved values            |
+//   |                         |
+//   +-------------------------+
+//   | argN                    |
+//   | ...                     |
+//   | arg0                    |
+//   | returnStateID           | callee arg frame
+//   +-------------------------+ <-- stack frame offset
+//      
+// The arguments to some functions (e.g. closesthit, anyhit, and miss shaders)
+// come from the payload or attributes. The positions of these arguments can be 
+// specified to SFT, which will redirect the defs from the args to corresponding
+// values on the stack.
+//
+// The following runtime (LLVM) functions are used by SFT (all sizes and offsets
+// are in terms of ints):
+//   void stackFramePush(<RuntimeDataTy> runtimeData, i32 size)
+//   void stackFramePop(<RuntimeDataTy> runtimeData, i32 size)
+//
+//   i32 stackFrameOffset(<RuntimeDataTy> runtimeData)
+//   i32 payloadOffset(<RuntimeDataTy> runtimeData) 
+//   i32 committedAttrOffset(<RuntimeDataTy> runtimeData)
+//   i32 pendingAttrOffset(<RuntimeDataTy> runtimeData)
+//
+//   i32* stackIntPtr(<RuntimeDataTy> runtimeData, i32 baseOffset, i32 offset)
+//   
+// Called before/after stackFramePush()/stackFramePop():
+//   void traceFramePush(<RuntimeDataTy> runtimeData, i32 attrSize) 
+//   void traceFramePop(<RuntimeDataTy> runtimeData)               
+
+// Drives the state function transform for a single function. Typical use, as
+// suggested by the public interface: construct with the function and the list
+// of candidate function names, optionally call the set*() methods, then run().
+// After all candidate functions have been transformed, the static
+// finalizeStateIds() replaces the placeholder state IDs.
+class StateFunctionTransform
+{
+public:
+  // How a function parameter is supplied to the transformed function.
+  enum ParameterSemanticType
+  {
+    PST_NONE = 0,
+    PST_PAYLOAD,
+    PST_ATTRIBUTE,
+
+    PST_COUNT
+  };
+
+  // func is the function to be transformed. candidateFuncNames is a list of all 
+  // functions which have been or will be transformed to state functions, 
+  // including func. The runtimeDataArgTy is the type to use for the first argument
+  // in state functions.
+  StateFunctionTransform(llvm::Function* func, const std::vector<std::string>& candidateFuncNames, llvm::Type* runtimeDataArgTy);
+
+  // Optional parameters to be specified before run()
+  void setAttributeSize(int sizeInBytes); // needed for TraceRay()
+  void setParameterInfo(const std::vector<ParameterSemanticType>& paramTypes, bool useCommittedAttr = true);
+  void setResourceGlobals(const std::set<llvm::Value*>& resources);
+
+  static llvm::Function* createDummyRuntimeDataArgFunc(llvm::Module* M, llvm::Type* runtimeDataArgTy);
+
+  // Generates state functions from func into the same module. The original function
+  // is left only as a declaration.
+  void run(std::vector<llvm::Function*>& stateFunctions, _Out_ unsigned int &shaderStackSize);
+
+  // candidateFuncEntryStateIds corresponding to the candidateFuncNames passed to
+  // the constructor. stateIDs are computed as candidateFuncEntryStateIds[functionIdx]
+  // + substateIdx, where functionIdx and substateIdx come from the arguments to
+  // the placeholder stateID function.
+  static void finalizeStateIds(llvm::Module* module, const std::vector<int>& candidateFuncEntryStateIds);
+
+  // Outputs detailed diagnostic information if set to true.
+  void setVerbose(bool val);
+
+  void setDumpFilename(const std::string& dumpFilename);
+
+
+private:
+  // Function to transform
+  llvm::Function* m_function = nullptr;
+
+  // Name of the function to transform
+  std::string m_functionName;
+
+  // Index of the function to transform in m_candidateFuncNames
+  int m_functionIdx = 0;
+
+  // candidateFuncNames is a list of all functions which have been or will 
+  // be transformed to state functions. Used to create function index used
+  // by the stateID placeholder function.
+  // NOTE(review): stored as a reference, presumably bound to the constructor
+  // argument - the caller's vector must then outlive this object; confirm.
+  const std::vector<std::string>& m_candidateFuncNames;
+
+  llvm::Type* m_runtimeDataArgTy = nullptr;
+  llvm::Value* m_runtimeDataArg = nullptr;     // set in init() and changeFunctionSignature()
+  llvm::Value* m_stackFrameSizeVal = nullptr;  // set in init() and preserveLiveValuesAcrossCallsites()
+
+  int m_attributeSizeInBytes = -1;
+  std::vector<ParameterSemanticType> m_paramTypes;
+  bool m_useCommittedAttr = false;
+  // Initialized to null like every other pointer member, so that the pointer
+  // never holds an indeterminate value when setResourceGlobals() is not called.
+  const std::set<llvm::Value*>* m_resources = nullptr;
+
+  std::vector<llvm::CallInst*> m_callSites;
+  std::vector<int> m_callSiteFunctionIdx;
+  std::vector<llvm::CallInst*> m_movePayloadToStackCalls;
+  std::vector<llvm::CallInst*> m_setPendingAttrCalls;
+  std::vector<llvm::ReturnInst*> m_returns;
+
+  bool m_verbose = false;
+  std::string m_dumpFilename;
+  unsigned int m_dumpId = 0;
+
+  llvm::Function* m_stackIntPtrFunc = nullptr;
+
+  llvm::CallInst* m_stackFramePush = nullptr;
+  llvm::CallInst* m_stackFrameOffset = nullptr;
+  llvm::CallInst* m_payloadOffset = nullptr;          // Offset at beginning of function
+  llvm::CallInst* m_committedAttrOffset = nullptr;    // Offset at beginning of function
+  llvm::CallInst* m_pendingAttrOffset = nullptr;      // Offset at beginning of function
+
+  // Placeholder function taking constant values functionIdx and substate. 
+  // These are later translated to a stateId by finalizeStateIds().
+  llvm::Function* m_dummyStateIdFunc = nullptr;
+
+  int m_maxCallerArgFrameSizeInBytes = 0;
+  int m_traceFrameSizeInBytes = 0;
+
+  // Functions used to abstract stack operations. These make intermediate stages
+  // in the transform a little bit cleaner. 
+  std::map<llvm::FunctionType*, llvm::Function*> m_stackStoreFuncs;
+  std::map<llvm::FunctionType*, llvm::Function*> m_stackLoadFuncs;
+  std::map<llvm::FunctionType*, llvm::Function*> m_stackPtrFuncs;
+
+  // Main stages of the transformation 
+  void init();
+  void findCallSitesIntrinsicsAndReturns();
+  void changeCallingConvention();
+  void preserveLiveValuesAcrossCallsites(_Out_ unsigned int &shaderStackSize);
+  void createSubstateFunctions(std::vector<llvm::Function*>& stateFunctions);
+  void lowerStackFuncs();
+
+  llvm::Value* getDummyStateId(int functionIdx, int substate, llvm::Instruction* insertBefore);
+
+  void allocateStackFrame();
+  void allocateTraceFrame();
+  void createArgFrames();
+  void changeFunctionSignature();
+
+  // Helpers that emit calls to the stack.store / stack.load / stack.ptr
+  // placeholder functions. Byte offsets must be multiples of sizeof(int).
+  void createStackStore(llvm::Value* baseOffset, llvm::Value* val, int offsetInBytes, llvm::Instruction* insertBefore);
+  llvm::Instruction* createStackLoad(llvm::Value* baseOffset, llvm::Value* val, int offsetInBytes, llvm::Instruction* insertBefore);
+  llvm::Instruction* createStackPtr(llvm::Value* baseOffset, llvm::Value* val, int offsetInBytes, llvm::Instruction* insertBefore);
+  llvm::Instruction* createStackPtr(llvm::Value* baseOffset, llvm::Type* valTy, llvm::Value* intIndex, llvm::Instruction* insertBefore);
+  void rewriteDummyStackSize(uint64_t frameSizeInBytes);
+
+  BasicBlockVector replaceCallSites();
+  llvm::Function* split(llvm::Function* baseFunc, llvm::BasicBlock* subStateEntryBlock, int substateIndex);
+
+  void flattenGepsOnValue(llvm::Value* val, llvm::Value* baseOffset, llvm::Value* offset);
+  void scalarizeVectorStackAccess(llvm::Instruction* vecPtr, llvm::Value* baseOffset, llvm::Value* offsetVal);
+
+  // Diagnostic printing functions
+  llvm::raw_ostream& getOutputStream(const std::string functionName, const std::string& suffix, unsigned int dumpId);
+  void printFunction(const llvm::Function* function, const std::string& suffix, unsigned int dumpId);
+  void printFunction(const std::string& suffix);
+  void printFunctions(const std::vector<llvm::Function*>& funcs, const char* suffix);
+  void printModule(const llvm::Module* module, const std::string& suffix);
+  void printSet(const InstructionSetVector& vals, const char* msg = nullptr);
+};

+ 26 - 0
lib/DxrFallback/readme.md

@@ -0,0 +1,26 @@
+# DXR Fallback Compiler
+The DXR Fallback Compiler is a specialized compiler that's a part of the [D3D12 Raytracing Fallback Layer](https://github.com/Microsoft/DirectX-Graphics-Samples/tree/master/Libraries/D3D12RaytracingFallback). The purpose of the DXR Fallback Compiler is to take input DXR shader libs and link them into a single compute shader that is runnable on DX12 hardware (even without DXR driver support).
+
+## Building the DXR Fallback Compiler
+In order to build the DXR Fallback Compiler in Visual Studio, simply build the dxrfallbackcompiler project in the *Clang Libraries* folder.
+
+## Using with the D3D12 Raytracing Fallback Layer
+To use the DXR Fallback Compiler with the [DirectX Graphics Samples](https://github.com/Microsoft/DirectX-Graphics-Samples/blob/master/Samples/Desktop/D3D12Raytracing/readme.md), build a dxrfallbackcompiler.dll using the Build instructions and place the output dll in Samples/Desktop/D3D12Raytracing/tools/x64. 
+
+If you're incorporating the Fallback Layer into your own personal project, you need to ensure that the dll is either alongside your executable or in the working directory.
+
+## Overview
+Note that the overview below and all subsequent documentation assume familiarity with the DirectX Raytracing API.
+
+The DXR Fallback Compiler addresses several challenges that native DX12 compute shaders are not normally capable of handling:
+ * Combining multiple orthogonal shaders into a single large compute shader
+ * Uses of all new DXR HLSL intrinsics
+ * Invocation of another shader in the middle of shader code - *i.e. TraceRay and CallShader*
+ * Recursive invocations of shader calls
+
+These challenges are handled by abstractly viewing GPU execution of a DXR pipeline as State Machine traversal, where each shader is transformed into one or more state functions. Further technical details are described in the header of [StateFunctionTransform.h](StateFunctionTransform.h).
+
+## Building runtime.h
+1. Download LLVM 3.7: http://releases.llvm.org/3.7.0/LLVM-3.7.0-win64.exe
+2. If necessary, adjust BINPATH in script.cmd so that it points to your LLVM binaries.
+3. Run script.cmd; it should output a patched runtime.h.

+ 1974 - 0
lib/DxrFallback/runtime.h

@@ -0,0 +1,1974 @@
+
+// This file was generated by compiling the following source (runtime.c) as follows:
+//    clang -S -emit-llvm -target nvptx runtime.c
+//    opt -S -mem2reg runtime.ll -o runtime.opt.ll
+// The datalayout of the resulting LLVM IR is then stripped and replaced with one
+// that is compatible with DXIL.
+
+// runtime.c
+#if 0 
+#include <stddef.h>
+
+static const int STACK_SIZE_IN_BYTES = 1024;
+
+typedef float float3 __attribute__((vector_size(3*sizeof(float))));
+typedef float float4 __attribute__((vector_size(4*sizeof(float))));
+typedef float float12 __attribute__((vector_size(12*sizeof(float))));
+typedef float (M3x4)[12];
+typedef int   (StackType)[STACK_SIZE_IN_BYTES/sizeof(int)];
+typedef unsigned char byte;
+
+
+typedef struct RuntimeDataStruct
+{
+  int DispatchRaysIndex[2];
+  int DispatchRaysDimensions[2];
+
+  float RayTMin;
+  float RayTCurrent;
+  unsigned RayFlags;
+  float WorldRayOrigin[3];
+  float WorldRayDirection[3];
+  float ObjectRayOrigin[3];
+  float ObjectRayDirection[3];
+  M3x4 ObjectToWorld;
+  M3x4 WorldToObject;
+
+  unsigned PrimitiveIndex;
+  unsigned InstanceIndex;
+  unsigned InstanceID;
+  unsigned HitKind;
+  unsigned ShaderRecordOffset;
+
+
+  // Pending hit values - accessed in anyHit and intersection shaders before a hit has been committed
+  float PendingRayTCurrent;
+  unsigned PendingPrimitiveIndex;
+  unsigned PendingInstanceIndex;
+  unsigned PendingInstanceID;
+  unsigned PendingHitKind;
+  unsigned PendingShaderRecordOffset; 
+
+  int GroupIndex; 
+  int AnyHitResult;
+  int AnyHitStateId;  // Originally temporary. We needed to avoid resource usage
+                      // in ReportHit() because of linking issues so we set the value here first. 
+                      // May be worth retaining to cache the value when fetching the intersection 
+                      // stateId (fetch them both at once). 
+
+  int PayloadOffset;            
+  int CommittedAttrOffset;      
+  int PendingAttrOffset;        
+  
+  int StackOffset; // offset from the start of the stack
+  StackType* Stack;
+} RuntimeData;
+
+typedef RuntimeData* RuntimeDataType;
+
+typedef struct TraceRaySpills_ClosestHit
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+  float ObjectRayOrigin[3];      
+  float ObjectRayDirection[3];   
+
+  unsigned PrimitiveIndex;       
+  unsigned InstanceIndex;        
+  unsigned InstanceID;           
+  unsigned HitKind;              
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_ClosestHit;
+
+typedef struct TraceRaySpills_Miss
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+            
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_Miss;
+
+
+#define REF(x) (runtimeData->x)
+#define REF_FLT(x) (runtimeData->x)
+#define REF_STACK(offset) ((*runtimeData->Stack)[runtimeData->StackOffset + offset])
+#define REF_FLT_OFS(x, offset) (runtimeData->x[offset])
+
+// Return next stateID
+int rewrite_dispatch(RuntimeDataType runtimeData, int stateID);
+void* rewrite_setLaunchParams(RuntimeDataType runtimeData, unsigned dimx, unsigned dimy);
+unsigned rewrite_getStackSize(void);
+StackType* rewrite_createStack(void);
+
+void stackInit(RuntimeDataType runtimeData, StackType* theStack, unsigned stackSize) // Attaches theStack to runtimeData and resets the stack-related offsets.
+{
+  REF(Stack) = theStack;
+  REF(StackOffset) = stackSize/sizeof(int) - 1; // offsets index int slots, so stackSize is divided by sizeof(int); this is the last slot if stackSize is the stack's size in bytes
+  REF(PayloadOffset)       = 1111; // recognizable bogus values
+  REF(CommittedAttrOffset) = 2222;
+  REF(PendingAttrOffset)   = 3333;
+}
+
+void stackFramePush(RuntimeDataType runtimeData, int size)
+{
+  REF(StackOffset) -= size;
+}
+
+void stackFramePop(RuntimeDataType runtimeData, int size)
+{ 
+  REF(StackOffset) += size;
+}
+
+int stackFrameOffset(RuntimeDataType runtimeData)
+{
+  return REF(StackOffset);
+}
+
+int payloadOffset(RuntimeDataType runtimeData)
+{
+  return REF(PayloadOffset);
+}
+
+int committedAttrOffset(RuntimeDataType runtimeData)
+{
+  return REF(CommittedAttrOffset);
+}
+
+int pendingAttrOffset(RuntimeDataType runtimeData)
+{
+  return REF(PendingAttrOffset);
+}
+
+int* stackIntPtr(RuntimeDataType runtimeData, int baseOffset, int offset)
+{
+  return &(*runtimeData->Stack)[baseOffset + offset];
+}
+
+
+void traceFramePush(RuntimeDataType runtimeData, int attrSize) // Saves the current attribute offsets on the stack and assigns new ones; StackOffset itself is not modified here.
+{
+  // Save the old committed and pending attribute offsets in the two slots just below StackOffset
+  REF_STACK(-1) = REF(CommittedAttrOffset);
+  REF_STACK(-2) = REF(PendingAttrOffset);
+
+  // Set new offsets: committed attributes start attrSize slots below the two saved slots, pending attributes a further attrSize slots below that
+  REF(CommittedAttrOffset) = REF(StackOffset) - 2 - attrSize; 
+  REF(PendingAttrOffset)   = REF(StackOffset) - 2 - 2 * attrSize; 
+}
+
+void traceFramePop(RuntimeDataType runtimeData) // Counterpart of traceFramePush: reads back the two slots that traceFramePush wrote relative to StackOffset.
+{
+  // Restore the old attribute offsets (StackOffset must hold the same value it had at the matching traceFramePush)
+  REF(CommittedAttrOffset) = REF_STACK(-1); 
+  REF(PendingAttrOffset) = REF_STACK(-2);
+}
+
+void traceRaySave_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  spills->RayFlags              = REF(RayFlags);
+  spills->RayTCurrent           = REF_FLT(RayTCurrent);
+  spills->RayTMin               = REF_FLT(RayTMin);
+  spills->WorldRayOrigin[0]     = REF_FLT(WorldRayOrigin[0]);
+  spills->WorldRayOrigin[1]     = REF_FLT(WorldRayOrigin[1]);
+  spills->WorldRayOrigin[2]     = REF_FLT(WorldRayOrigin[2]);
+  spills->WorldRayDirection[0]  = REF_FLT(WorldRayDirection[0]);
+  spills->WorldRayDirection[1]  = REF_FLT(WorldRayDirection[1]);
+  spills->WorldRayDirection[2]  = REF_FLT(WorldRayDirection[2]);
+  spills->ObjectRayOrigin[0]    = REF_FLT(ObjectRayOrigin[0]);
+  spills->ObjectRayOrigin[1]    = REF_FLT(ObjectRayOrigin[1]);
+  spills->ObjectRayOrigin[2]    = REF_FLT(ObjectRayOrigin[2]);
+  spills->ObjectRayDirection[0] = REF_FLT(ObjectRayDirection[0]);
+  spills->ObjectRayDirection[1] = REF_FLT(ObjectRayDirection[1]);
+  spills->ObjectRayDirection[2] = REF_FLT(ObjectRayDirection[2]);
+
+  spills->PrimitiveIndex      = REF(PrimitiveIndex);       
+  spills->InstanceIndex       = REF(InstanceIndex);        
+  spills->InstanceID          = REF(InstanceID);           
+  spills->HitKind             = REF(HitKind);              
+  spills->ShaderRecordOffset  = REF(ShaderRecordOffset);  
+}
+
+void traceRayRestore_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  REF(RayFlags)                  = spills->RayFlags;               
+  REF_FLT(RayTCurrent)           = spills->RayTCurrent;            
+  REF_FLT(RayTMin)               = spills->RayTMin;                
+  REF_FLT(WorldRayOrigin[0])     = spills->WorldRayOrigin[0];      
+  REF_FLT(WorldRayOrigin[1])     = spills->WorldRayOrigin[1];      
+  REF_FLT(WorldRayOrigin[2])     = spills->WorldRayOrigin[2];      
+  REF_FLT(WorldRayDirection[0])  = spills->WorldRayDirection[0];   
+  REF_FLT(WorldRayDirection[1])  = spills->WorldRayDirection[1];   
+  REF_FLT(WorldRayDirection[2])  = spills->WorldRayDirection[2];   
+  REF_FLT(ObjectRayOrigin[0])    = spills->ObjectRayOrigin[0];     
+  REF_FLT(ObjectRayOrigin[1])    = spills->ObjectRayOrigin[1];     
+  REF_FLT(ObjectRayOrigin[2])    = spills->ObjectRayOrigin[2];     
+  REF_FLT(ObjectRayDirection[0]) = spills->ObjectRayDirection[0];  
+  REF_FLT(ObjectRayDirection[1]) = spills->ObjectRayDirection[1];  
+  REF_FLT(ObjectRayDirection[2]) = spills->ObjectRayDirection[2];  
+
+  REF(PrimitiveIndex)     = spills->PrimitiveIndex;          
+  REF(InstanceIndex)      = spills->InstanceIndex;           
+  REF(InstanceID)         = spills->InstanceID;              
+  REF(HitKind)            = spills->HitKind;                 
+  REF(ShaderRecordOffset) = spills->ShaderRecordOffset;    
+}
+
+void traceRaySave_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  spills->RayFlags              = REF(RayFlags);
+  spills->RayTCurrent           = REF_FLT(RayTCurrent);
+  spills->RayTMin               = REF_FLT(RayTMin);
+  spills->WorldRayOrigin[0]     = REF_FLT(WorldRayOrigin[0]);
+  spills->WorldRayOrigin[1]     = REF_FLT(WorldRayOrigin[1]);
+  spills->WorldRayOrigin[2]     = REF_FLT(WorldRayOrigin[2]);
+  spills->WorldRayDirection[0]  = REF_FLT(WorldRayDirection[0]);
+  spills->WorldRayDirection[1]  = REF_FLT(WorldRayDirection[1]);
+  spills->WorldRayDirection[2]  = REF_FLT(WorldRayDirection[2]);
+
+  spills->ShaderRecordOffset    = REF(ShaderRecordOffset);  
+}
+
+void traceRayRestore_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  REF(RayFlags)                  = spills->RayFlags;               
+  REF_FLT(RayTCurrent)           = spills->RayTCurrent;            
+  REF_FLT(RayTMin)               = spills->RayTMin;                
+  REF_FLT(WorldRayOrigin[0])     = spills->WorldRayOrigin[0];      
+  REF_FLT(WorldRayOrigin[1])     = spills->WorldRayOrigin[1];      
+  REF_FLT(WorldRayOrigin[2])     = spills->WorldRayOrigin[2];      
+  REF_FLT(WorldRayDirection[0])  = spills->WorldRayDirection[0];   
+  REF_FLT(WorldRayDirection[1])  = spills->WorldRayDirection[1];   
+  REF_FLT(WorldRayDirection[2])  = spills->WorldRayDirection[2];   
+
+  REF(ShaderRecordOffset) = spills->ShaderRecordOffset;    
+}
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Intrinsics for the fallback layer
+//
+//////////////////////////////////////////////////////////////////////////
+
+void fb_Fallback_Scheduler(int initialStateId, unsigned dimx, unsigned dimy) // Runs the state machine: dispatches states starting at initialStateId until a negative state ID comes back.
+{
+  StackType* theStack = rewrite_createStack();
+  RuntimeData theRuntimeData;
+  RuntimeDataType runtimeData = &theRuntimeData;
+
+  rewrite_setLaunchParams(runtimeData, dimx, dimy);
+  if(REF(DispatchRaysIndex[0]) >= REF(DispatchRaysDimensions[0]) || // index lies outside the dispatch dimensions: nothing to do
+     REF(DispatchRaysIndex[1]) >= REF(DispatchRaysDimensions[1]))
+  { 
+    return;
+  }
+
+
+  // Set final return stateID into reserved area at stack top
+  unsigned stackSize = rewrite_getStackSize();
+  stackInit(runtimeData, theStack, stackSize);
+  int stackFrameOffs = stackFrameOffset(runtimeData);
+  *stackIntPtr(runtimeData, stackFrameOffs, 0) = -1; // -1 is negative, so it fails the stateId >= 0 test of the loop below
+
+  int stateId = initialStateId;
+  int count = 0; // NOTE(review): count is never read or updated in this function
+  while( stateId >= 0 )
+  {
+    stateId = rewrite_dispatch(runtimeData, stateId); // rewrite_dispatch returns the next state ID
+  }
+}
+
+void fb_Fallback_SetLaunchParams(RuntimeDataType runtimeData, unsigned DTidx, unsigned DTidy, unsigned dimx, unsigned dimy, unsigned groupIndex)
+{ 
+  REF(DispatchRaysIndex[0]) = DTidx;
+  REF(DispatchRaysIndex[1]) = DTidy;
+  REF(DispatchRaysDimensions[0]) = dimx;
+  REF(DispatchRaysDimensions[1]) = dimy;
+
+  REF(GroupIndex) = groupIndex;
+}
+
+int fb_Fallback_TraceRayBegin(RuntimeDataType runtimeData, unsigned rayFlags, float ox, float oy, float oz, float tmin, float dx, float dy, float dz, float tmax, int newPayloadOffset) // Stores the new ray's parameters and payload offset in runtimeData; returns the previous payload offset.
+{ 
+  REF(RayFlags) = rayFlags;
+  REF_FLT(WorldRayOrigin[0]) = ox;
+  REF_FLT(WorldRayOrigin[1]) = oy;
+  REF_FLT(WorldRayOrigin[2]) = oz;
+  REF_FLT(WorldRayDirection[0]) = dx;
+  REF_FLT(WorldRayDirection[1]) = dy;
+  REF_FLT(WorldRayDirection[2]) = dz;
+  REF_FLT(RayTCurrent) = tmax; // RayTCurrent starts out at tmax
+  REF_FLT(RayTMin) = tmin;
+
+  int oldOffset = REF(PayloadOffset); // handed back so the caller can pass it to fb_Fallback_TraceRayEnd
+  REF(PayloadOffset) = newPayloadOffset;
+  return oldOffset;
+}
+
+void fb_Fallback_TraceRayEnd(RuntimeDataType runtimeData, int oldPayloadOffset)
+{
+  REF(PayloadOffset) = oldPayloadOffset;
+}
+
+void fb_Fallback_SetPendingTriVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID, float t, unsigned hitKind)
+{
+  REF(PendingShaderRecordOffset) = shaderRecordOffset;
+  REF(PendingPrimitiveIndex) = primitiveIndex;
+  REF(PendingInstanceIndex) = instanceIndex;
+  REF(PendingInstanceID) = instanceID;
+  REF_FLT(PendingRayTCurrent) = t;
+  REF(PendingHitKind) = hitKind;
+}
+
+void fb_Fallback_SetPendingCustomVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID)
+{
+  REF(PendingShaderRecordOffset) = shaderRecordOffset;
+  REF(PendingPrimitiveIndex) = primitiveIndex;
+  REF(PendingInstanceIndex) = instanceIndex;
+  REF(PendingInstanceID) = instanceID;
+}
+
+void fb_Fallback_CommitHit(RuntimeDataType runtimeData) // Copies the pending hit values into the committed fields and swaps the committed and pending attribute offsets.
+{
+  REF_FLT(RayTCurrent)    = REF_FLT(PendingRayTCurrent);
+  REF(ShaderRecordOffset) = REF(PendingShaderRecordOffset);
+  REF(PrimitiveIndex)     = REF(PendingPrimitiveIndex);
+  REF(InstanceIndex)      = REF(PendingInstanceIndex);
+  REF(InstanceID)         = REF(PendingInstanceID);
+  REF(HitKind)            = REF(PendingHitKind);  
+
+  int PendingAttrOffset = REF(PendingAttrOffset); // temporary for swapping the two offsets; the attribute data itself is not copied
+  REF(PendingAttrOffset) = REF(CommittedAttrOffset);
+  REF(CommittedAttrOffset) = PendingAttrOffset;
+}
+
+
+int fb_Fallback_RuntimeDataLoadInt(RuntimeDataType runtimeData, int offset)
+{
+  return (*runtimeData->Stack)[offset];
+}
+
+void fb_Fallback_RuntimeDataStoreInt(RuntimeDataType runtimeData, int offset, int val)
+{
+  (*runtimeData->Stack)[offset] = val;
+}
+
+unsigned fb_dxop_dispatchRaysIndex(RuntimeDataType runtimeData, byte i)
+{  
+  return REF(DispatchRaysIndex[i]);
+}
+
+unsigned fb_dxop_dispatchRaysDimensions(RuntimeDataType runtimeData, byte i)
+{  
+  return REF(DispatchRaysDimensions[i]);
+}
+
+float fb_dxop_rayTMin(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTMin);
+}
+
+float fb_Fallback_RayTMin(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTMin);
+}
+
+void fb_Fallback_SetRayTMin(RuntimeDataType runtimeData, float t)
+{
+  REF_FLT(RayTMin) = t;
+}
+
+float fb_dxop_rayTCurrent(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTCurrent);
+}
+
+float fb_Fallback_RayTCurrent(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTCurrent);
+}
+
+void fb_Fallback_SetRayTCurrent(RuntimeDataType runtimeData, float t)
+{
+  REF_FLT(RayTCurrent) = t;
+}
+
+unsigned fb_dxop_rayFlags(RuntimeDataType runtimeData)
+{
+  return REF(RayFlags);
+}
+
+unsigned fb_Fallback_RayFlags(RuntimeDataType runtimeData)
+{
+  return REF(RayFlags);
+}
+
+void fb_Fallback_SetRayFlags(RuntimeDataType runtimeData, unsigned flags)
+{
+  REF(RayFlags) = flags;
+}
+
+float fb_dxop_worldRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(WorldRayOrigin[i]);
+}
+
+float fb_Fallback_WorldRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(WorldRayOrigin[i]);
+}
+
+void fb_Fallback_SetWorldRayOrigin(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(WorldRayOrigin[0]) = x;
+  REF_FLT(WorldRayOrigin[1]) = y;
+  REF_FLT(WorldRayOrigin[2]) = z;
+}
+
+float fb_dxop_worldRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(WorldRayDirection[i]);
+}
+
+float fb_Fallback_WorldRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(WorldRayDirection[i]);
+}
+
+void fb_Fallback_SetWorldRayDirection(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(WorldRayDirection[0]) = x;
+  REF_FLT(WorldRayDirection[1]) = y;
+  REF_FLT(WorldRayDirection[2]) = z;
+}
+
+float fb_dxop_objectRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(ObjectRayOrigin[i]);
+}
+
+float fb_Fallback_ObjectRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(ObjectRayOrigin[i]);
+}
+
+void fb_Fallback_SetObjectRayOrigin(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(ObjectRayOrigin[0]) = x;
+  REF_FLT(ObjectRayOrigin[1]) = y;
+  REF_FLT(ObjectRayOrigin[2]) = z;
+}
+
+float fb_dxop_objectRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(ObjectRayDirection[i]);
+}
+
+float fb_Fallback_ObjectRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(ObjectRayDirection[i]);
+}
+
+void fb_Fallback_SetObjectRayDirection(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(ObjectRayDirection[0]) = x;
+  REF_FLT(ObjectRayDirection[1]) = y;
+  REF_FLT(ObjectRayDirection[2]) = z;
+}
+
+float fb_dxop_objectToWorld(RuntimeDataType runtimeData, int r, byte c) // Returns element (r, c) of the stored ObjectToWorld matrix.
+{
+  int i = r * 4 + c; // flat index into the 12-float M3x4 array: 4 floats per row
+  return REF_FLT_OFS(ObjectToWorld, i);
+}
+
+void fb_Fallback_SetObjectToWorld(RuntimeDataType runtimeData, float12 M)
+{
+  REF_FLT_OFS(ObjectToWorld, 0)  = M[0]; 
+  REF_FLT_OFS(ObjectToWorld, 1)  = M[1]; 
+  REF_FLT_OFS(ObjectToWorld, 2)  = M[2]; 
+  REF_FLT_OFS(ObjectToWorld, 3)  = M[3]; 
+  REF_FLT_OFS(ObjectToWorld, 4)  = M[4]; 
+  REF_FLT_OFS(ObjectToWorld, 5)  = M[5]; 
+  REF_FLT_OFS(ObjectToWorld, 6)  = M[6]; 
+  REF_FLT_OFS(ObjectToWorld, 7)  = M[7]; 
+  REF_FLT_OFS(ObjectToWorld, 8)  = M[8]; 
+  REF_FLT_OFS(ObjectToWorld, 9)  = M[9]; 
+  REF_FLT_OFS(ObjectToWorld, 10) = M[10];
+  REF_FLT_OFS(ObjectToWorld, 11) = M[11];
+}
+
+float fb_dxop_worldToObject(RuntimeDataType runtimeData, int r, byte c) // Returns element (r, c) of the stored WorldToObject matrix.
+{
+  int i = r * 4 + c; // flat index into the 12-float M3x4 array: 4 floats per row
+  return REF_FLT_OFS(WorldToObject, i);
+}
+
+void fb_Fallback_SetWorldToObject(RuntimeDataType runtimeData, float12 M)
+{
+  REF_FLT_OFS(WorldToObject, 0)  = M[0]; 
+  REF_FLT_OFS(WorldToObject, 1)  = M[1]; 
+  REF_FLT_OFS(WorldToObject, 2)  = M[2]; 
+  REF_FLT_OFS(WorldToObject, 3)  = M[3]; 
+  REF_FLT_OFS(WorldToObject, 4)  = M[4]; 
+  REF_FLT_OFS(WorldToObject, 5)  = M[5]; 
+  REF_FLT_OFS(WorldToObject, 6)  = M[6]; 
+  REF_FLT_OFS(WorldToObject, 7)  = M[7]; 
+  REF_FLT_OFS(WorldToObject, 8)  = M[8]; 
+  REF_FLT_OFS(WorldToObject, 9)  = M[9]; 
+  REF_FLT_OFS(WorldToObject, 10) = M[10];
+  REF_FLT_OFS(WorldToObject, 11) = M[11];
+}
+
+unsigned fb_dxop_primitiveIndex(RuntimeDataType runtimeData)
+{
+  return REF(PrimitiveIndex);
+}
+
+unsigned fb_Fallback_PrimitiveIndex(RuntimeDataType runtimeData)
+{
+  return REF(PrimitiveIndex);
+}
+
+void fb_Fallback_SetPrimitiveIndex(RuntimeDataType runtimeData, unsigned i)
+{
+  REF(PrimitiveIndex) = i;
+}
+
+unsigned fb_Fallback_ShaderRecordOffset(RuntimeDataType runtimeData)
+{
+  return REF(ShaderRecordOffset);
+}
+
+void fb_Fallback_SetShaderRecordOffset(RuntimeDataType runtimeData, unsigned shaderRecordOffset)
+{
+  REF(ShaderRecordOffset) = shaderRecordOffset;
+}
+
+unsigned fb_dxop_instanceIndex(RuntimeDataType runtimeData)
+{
+  return REF(InstanceIndex);
+}
+
+unsigned fb_Fallback_InstanceIndex(RuntimeDataType runtimeData)
+{
+  return REF(InstanceIndex);
+}
+
+void fb_Fallback_SetInstanceIndex(RuntimeDataType runtimeData, unsigned i)
+{
+  REF(InstanceIndex) = i;
+}
+
+unsigned fb_dxop_instanceID(RuntimeDataType runtimeData)
+{
+  return REF(InstanceID);
+}
+
+unsigned fb_Fallback_InstanceID(RuntimeDataType runtimeData)
+{
+  return REF(InstanceID);
+}
+
+void fb_Fallback_SetInstanceID(RuntimeDataType runtimeData, unsigned i)
+{
+  REF(InstanceID) = i;
+}
+
+unsigned fb_dxop_hitKind(RuntimeDataType runtimeData)
+{
+  return REF(HitKind);
+}
+
+unsigned fb_Fallback_HitKind(RuntimeDataType runtimeData)
+{
+  return REF(HitKind);
+}
+
+void fb_Fallback_SetHitKind(RuntimeDataType runtimeData, unsigned i)
+{
+  REF(HitKind) = i;
+}
+
+float fb_dxop_pending_rayTCurrent(RuntimeDataType runtimeData)
+{
+  return REF_FLT(PendingRayTCurrent);
+}
+
+void fb_Fallback_SetPendingRayTCurrent(RuntimeDataType runtimeData, float t)
+{
+  REF_FLT(PendingRayTCurrent) = t;
+}
+
+unsigned fb_dxop_pending_primitiveID(RuntimeDataType runtimeData)
+//unsigned fb_dxop_pending_primitiveIndex(RuntimeDataType runtimeData)
+{
+  return REF(PendingPrimitiveIndex);
+}
+
+unsigned fb_Fallback_PendingShaderRecordOffset(RuntimeDataType runtimeData)
+{
+  return REF(PendingShaderRecordOffset);
+}
+
+unsigned fb_dxop_pending_instanceIndex(RuntimeDataType runtimeData)
+{
+  return REF(PendingInstanceIndex);
+}
+
+unsigned fb_dxop_pending_instanceID(RuntimeDataType runtimeData)
+{
+  return REF(PendingInstanceID);
+}
+
+unsigned fb_dxop_pending_hitKind(RuntimeDataType runtimeData)
+{
+  return REF(PendingHitKind);
+}
+
+void fb_Fallback_SetPendingHitKind(RuntimeDataType runtimeData, unsigned i)
+{
+  REF(PendingHitKind) = i;
+}
+
+unsigned fb_Fallback_GroupIndex(RuntimeDataType runtimeData)
+{ 
+  return REF(GroupIndex);
+}
+
+int fb_Fallback_AnyHitResult(RuntimeDataType runtimeData)
+{
+  return REF(AnyHitResult);
+}
+
+void fb_Fallback_SetAnyHitResult(RuntimeDataType runtimeData, int result)
+{
+  REF(AnyHitResult) = result;
+}
+
+int fb_Fallback_AnyHitStateId(RuntimeDataType runtimeData)
+{
+  return REF(AnyHitStateId);
+}
+
+void fb_Fallback_SetAnyHitStateId(RuntimeDataType runtimeData, int id)
+{
+  REF(AnyHitStateId) = id;
+}
+
+#endif
+
+static const char* runtimeString[] = { R"AAA(
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f:64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+
+
+%struct.RuntimeDataStruct = type { [2 x i32], [2 x i32], float, float, i32, [3 x float], [3 x float], [3 x float], [3 x float], [12 x float], [12 x float], i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [256 x i32]* }
+%struct.TraceRaySpills_ClosestHit = type { float, float, i32, [3 x float], [3 x float], [3 x float], [3 x float], i32, i32, i32, i32, i32 }
+%struct.TraceRaySpills_Miss = type { float, float, i32, [3 x float], [3 x float], i32 }
+
+; Function Attrs: nounwind
+define void @stackInit(%struct.RuntimeDataStruct* %runtimeData, [256 x i32]* %theStack, i32 %stackSize) #0 {
+entry:
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  store [256 x i32]* %theStack, [256 x i32]** %Stack, align 4
+  %div = udiv i32 %stackSize, 4
+  %sub = sub i32 %div, 1
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  store i32 %sub, i32* %StackOffset, align 4
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  store i32 1111, i32* %PayloadOffset, align 4
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 2222, i32* %CommittedAttrOffset, align 4
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 3333, i32* %PendingAttrOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @stackFramePush(%struct.RuntimeDataStruct* %runtimeData, i32 %size) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  %sub = sub nsw i32 %0, %size
+  store i32 %sub, i32* %StackOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @stackFramePop(%struct.RuntimeDataStruct* %runtimeData, i32 %size) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  %add = add nsw i32 %0, %size
+  store i32 %add, i32* %StackOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @stackFrameOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @payloadOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  %0 = load i32, i32* %PayloadOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @committedAttrOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  %0 = load i32, i32* %CommittedAttrOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @pendingAttrOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  %0 = load i32, i32* %PendingAttrOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32* @stackIntPtr(%struct.RuntimeDataStruct* %runtimeData, i32 %baseOffset, i32 %offset) #0 {
+entry:
+  %add = add nsw i32 %baseOffset, %offset
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %0 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %0, i32 0, i32 %add
+  ret i32* %arrayidx
+}
+
+; Function Attrs: nounwind
+define void @traceFramePush(%struct.RuntimeDataStruct* %runtimeData, i32 %attrSize) #0 {
+entry:
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  %0 = load i32, i32* %CommittedAttrOffset, align 4
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %1 = load i32, i32* %StackOffset, align 4
+  %add = add nsw i32 %1, -1
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %2 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %2, i32 0, i32 %add
+  store i32 %0, i32* %arrayidx, align 4
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  %3 = load i32, i32* %PendingAttrOffset, align 4
+  %StackOffset1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %4 = load i32, i32* %StackOffset1, align 4
+  %add2 = add nsw i32 %4, -2
+  %Stack3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %5 = load [256 x i32]*, [256 x i32]** %Stack3, align 4
+  %arrayidx4 = getelementptr inbounds [256 x i32], [256 x i32]* %5, i32 0, i32 %add2
+  store i32 %3, i32* %arrayidx4, align 4
+  %StackOffset5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %6 = load i32, i32* %StackOffset5, align 4
+  %sub = sub nsw i32 %6, 2
+  %sub6 = sub nsw i32 %sub, %attrSize
+  %CommittedAttrOffset7 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 %sub6, i32* %CommittedAttrOffset7, align 4
+  %StackOffset8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %7 = load i32, i32* %StackOffset8, align 4
+  %sub9 = sub nsw i32 %7, 2
+  %mul = mul nsw i32 2, %attrSize
+  %sub10 = sub nsw i32 %sub9, %mul
+  %PendingAttrOffset11 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 %sub10, i32* %PendingAttrOffset11, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceFramePop(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  %add = add nsw i32 %0, -1
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %1 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %1, i32 0, i32 %add
+  %2 = load i32, i32* %arrayidx, align 4
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 %2, i32* %CommittedAttrOffset, align 4
+  %StackOffset1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %3 = load i32, i32* %StackOffset1, align 4
+  %add2 = add nsw i32 %3, -2
+  %Stack3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %4 = load [256 x i32]*, [256 x i32]** %Stack3, align 4
+  %arrayidx4 = getelementptr inbounds [256 x i32], [256 x i32]* %4, i32 0, i32 %add2
+  %5 = load i32, i32* %arrayidx4, align 4
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 %5, i32* %PendingAttrOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceRaySave_ClosestHit(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_ClosestHit* %spills) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 2
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 1
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 0
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+)AAA",
+R"AAA(
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ObjectRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx25 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 0
+  %9 = load float, float* %arrayidx25, align 4
+  %ObjectRayOrigin26 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx27 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin26, i32 0, i32 0
+  store float %9, float* %arrayidx27, align 4
+  %ObjectRayOrigin28 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx29 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin28, i32 0, i32 1
+  %10 = load float, float* %arrayidx29, align 4
+  %ObjectRayOrigin30 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx31 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin30, i32 0, i32 1
+  store float %10, float* %arrayidx31, align 4
+  %ObjectRayOrigin32 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx33 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin32, i32 0, i32 2
+  %11 = load float, float* %arrayidx33, align 4
+  %ObjectRayOrigin34 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx35 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin34, i32 0, i32 2
+  store float %11, float* %arrayidx35, align 4
+  %ObjectRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx36 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 0
+  %12 = load float, float* %arrayidx36, align 4
+  %ObjectRayDirection37 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx38 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection37, i32 0, i32 0
+  store float %12, float* %arrayidx38, align 4
+  %ObjectRayDirection39 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx40 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection39, i32 0, i32 1
+  %13 = load float, float* %arrayidx40, align 4
+  %ObjectRayDirection41 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx42 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection41, i32 0, i32 1
+  store float %13, float* %arrayidx42, align 4
+  %ObjectRayDirection43 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx44 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection43, i32 0, i32 2
+  %14 = load float, float* %arrayidx44, align 4
+  %ObjectRayDirection45 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx46 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection45, i32 0, i32 2
+  store float %14, float* %arrayidx46, align 4
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  %15 = load i32, i32* %PrimitiveIndex, align 4
+  %PrimitiveIndex47 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 7
+  store i32 %15, i32* %PrimitiveIndex47, align 4
+  %InstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  %16 = load i32, i32* %InstanceIndex, align 4
+  %InstanceIndex48 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 8
+  store i32 %16, i32* %InstanceIndex48, align 4
+  %InstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  %17 = load i32, i32* %InstanceID, align 4
+  %InstanceID49 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 9
+  store i32 %17, i32* %InstanceID49, align 4
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  %18 = load i32, i32* %HitKind, align 4
+  %HitKind50 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 10
+  store i32 %18, i32* %HitKind50, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  %19 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset51 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 11
+  store i32 %19, i32* %ShaderRecordOffset51, align 4
+  ret void
+}
+
+; Function Attrs: nounwind -- traceRayRestore_ClosestHit: copies every field held in %spills back into the matching field of %runtimeData; returns nothing.
+define void @traceRayRestore_ClosestHit(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_ClosestHit* %spills) #0 {  ; %spills is only read, %runtimeData is only written
+entry:
+  %RayFlags = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 2  ; RayFlags: spills field 2 -> runtimeData field 4
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 1  ; RayTCurrent: spills field 1 -> runtimeData field 3
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 0  ; RayTMin: spills field 0 -> runtimeData field 2
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3  ; WorldRayOrigin: spills field 3 -> runtimeData field 5, copied one [3 x float] element at a time
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4  ; WorldRayDirection: spills field 4 -> runtimeData field 6
+)AAA",
+R"AAA(
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ObjectRayOrigin = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5  ; ObjectRayOrigin: spills field 5 -> runtimeData field 7
+  %arrayidx25 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 0
+  %9 = load float, float* %arrayidx25, align 4
+  %ObjectRayOrigin26 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx27 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin26, i32 0, i32 0
+  store float %9, float* %arrayidx27, align 4
+  %ObjectRayOrigin28 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx29 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin28, i32 0, i32 1
+  %10 = load float, float* %arrayidx29, align 4
+  %ObjectRayOrigin30 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx31 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin30, i32 0, i32 1
+  store float %10, float* %arrayidx31, align 4
+  %ObjectRayOrigin32 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx33 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin32, i32 0, i32 2
+  %11 = load float, float* %arrayidx33, align 4
+  %ObjectRayOrigin34 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx35 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin34, i32 0, i32 2
+  store float %11, float* %arrayidx35, align 4
+  %ObjectRayDirection = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6  ; ObjectRayDirection: spills field 6 -> runtimeData field 8
+  %arrayidx36 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 0
+  %12 = load float, float* %arrayidx36, align 4
+  %ObjectRayDirection37 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx38 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection37, i32 0, i32 0
+  store float %12, float* %arrayidx38, align 4
+  %ObjectRayDirection39 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx40 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection39, i32 0, i32 1
+  %13 = load float, float* %arrayidx40, align 4
+  %ObjectRayDirection41 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx42 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection41, i32 0, i32 1
+  store float %13, float* %arrayidx42, align 4
+  %ObjectRayDirection43 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx44 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection43, i32 0, i32 2
+  %14 = load float, float* %arrayidx44, align 4
+  %ObjectRayDirection45 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx46 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection45, i32 0, i32 2
+  store float %14, float* %arrayidx46, align 4
+  %PrimitiveIndex = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 7  ; PrimitiveIndex: spills field 7 -> runtimeData field 11
+  %15 = load i32, i32* %PrimitiveIndex, align 4
+  %PrimitiveIndex47 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  store i32 %15, i32* %PrimitiveIndex47, align 4
+  %InstanceIndex = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 8  ; InstanceIndex: spills field 8 -> runtimeData field 12
+  %16 = load i32, i32* %InstanceIndex, align 4
+  %InstanceIndex48 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  store i32 %16, i32* %InstanceIndex48, align 4
+  %InstanceID = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 9  ; InstanceID: spills field 9 -> runtimeData field 13
+  %17 = load i32, i32* %InstanceID, align 4
+  %InstanceID49 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  store i32 %17, i32* %InstanceID49, align 4
+  %HitKind = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 10  ; HitKind: spills field 10 -> runtimeData field 14
+  %18 = load i32, i32* %HitKind, align 4
+  %HitKind50 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  store i32 %18, i32* %HitKind50, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 11  ; ShaderRecordOffset: spills field 11 -> runtimeData field 15
+  %19 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset51 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %19, i32* %ShaderRecordOffset51, align 4
+  ret void
+}
+
+; Function Attrs: nounwind -- traceRaySave_Miss: copies six fields of %runtimeData into %spills (a TraceRaySpills_Miss); returns nothing.
+define void @traceRaySave_Miss(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_Miss* %spills) #0 {  ; %runtimeData is only read, %spills is only written
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4  ; RayFlags: runtimeData field 4 -> spills field 2
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 2
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3  ; RayTCurrent: runtimeData field 3 -> spills field 1
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 1
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2  ; RayTMin: runtimeData field 2 -> spills field 0
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 0
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5  ; WorldRayOrigin: runtimeData field 5 -> spills field 3, copied one [3 x float] element at a time
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6  ; WorldRayDirection: runtimeData field 6 -> spills field 4
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+)AAA",
+R"AAA(
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15  ; ShaderRecordOffset: runtimeData field 15 -> spills field 5
+  %9 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset25 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 5
+  store i32 %9, i32* %ShaderRecordOffset25, align 4
+  ret void
+}
+
+; Function Attrs: nounwind -- traceRayRestore_Miss: copies the six fields of %spills (a TraceRaySpills_Miss) back into %runtimeData, using the same field pairing as traceRaySave_Miss in the opposite direction.
+define void @traceRayRestore_Miss(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_Miss* %spills) #0 {  ; %spills is only read, %runtimeData is only written
+entry:
+  %RayFlags = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 2  ; RayFlags: spills field 2 -> runtimeData field 4
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 1  ; RayTCurrent: spills field 1 -> runtimeData field 3
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 0  ; RayTMin: spills field 0 -> runtimeData field 2
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3  ; WorldRayOrigin: spills field 3 -> runtimeData field 5, copied one [3 x float] element at a time
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4  ; WorldRayDirection: spills field 4 -> runtimeData field 6
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 5  ; ShaderRecordOffset: spills field 5 -> runtimeData field 15
+  %9 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset25 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %9, i32* %ShaderRecordOffset25, align 4
+  ret void
+}
+
+; Function Attrs: nounwind -- fb_Fallback_Scheduler: sets up a local RuntimeDataStruct and stack, then calls rewrite_dispatch repeatedly, feeding each returned state id back in, until the id is negative.
+define void @fb_Fallback_Scheduler(i32 %initialStateId, i32 %dimx, i32 %dimy) #0 {  ; %initialStateId seeds the dispatch loop; %dimx/%dimy are forwarded to rewrite_setLaunchParams
+entry:
+  %theRuntimeData = alloca %struct.RuntimeDataStruct, align 4  ; local RuntimeDataStruct passed by pointer to every call below
+  %call = call [256 x i32]* @rewrite_createStack()  ; stack storage, handed to stackInit in if.end
+  %call1 = call i8* @rewrite_setLaunchParams(%struct.RuntimeDataStruct* %theRuntimeData, i32 %dimx, i32 %dimy)  ; returned i8* is not used in this function
+  %DispatchRaysIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex, i32 0, i32 0
+  %0 = load i32, i32* %arrayidx, align 4
+  %DispatchRaysDimensions = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 1
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions, i32 0, i32 0
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp = icmp sge i32 %0, %1  ; signed test: DispatchRaysIndex[0] >= DispatchRaysDimensions[0]
+  br i1 %cmp, label %if.then, label %lor.lhs.false  ; index 0 out of range -> leave without dispatching
+
+lor.lhs.false:                                    ; preds = %entry
+  %DispatchRaysIndex3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 0
+  %arrayidx4 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex3, i32 0, i32 1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %DispatchRaysDimensions5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 1
+  %arrayidx6 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions5, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx6, align 4
+  %cmp7 = icmp sge i32 %2, %3  ; same signed test for element 1
+  br i1 %cmp7, label %if.then, label %if.end
+
+if.then:                                          ; preds = %lor.lhs.false, %entry
+  br label %while.end
+
+if.end:                                           ; preds = %lor.lhs.false
+  %call8 = call i32 @rewrite_getStackSize()
+  call void @stackInit(%struct.RuntimeDataStruct* %theRuntimeData, [256 x i32]* %call, i32 %call8)  ; binds the created stack and its size to the runtime data
+  %call9 = call i32 @stackFrameOffset(%struct.RuntimeDataStruct* %theRuntimeData)
+  %call10 = call i32* @stackIntPtr(%struct.RuntimeDataStruct* %theRuntimeData, i32 %call9, i32 0)
+  store i32 -1, i32* %call10, align 4  ; writes -1 through the pointer stackIntPtr returned; presumably the state id that ends the loop below -- confirm against runtime.c
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %if.end
+  %stateId.0 = phi i32 [ %initialStateId, %if.end ], [ %call12, %while.body ]
+  %cmp11 = icmp sge i32 %stateId.0, 0  ; keep looping while the state id is non-negative
+  br i1 %cmp11, label %while.body, label %while.end
+
+while.body:                                       ; preds = %while.cond
+  %call12 = call i32 @rewrite_dispatch(%struct.RuntimeDataStruct* %theRuntimeData, i32 %stateId.0)  ; result becomes the next %stateId.0 through the phi in while.cond
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond, %if.then
+  ret void
+}
+
+declare [256 x i32]* @rewrite_createStack() #1  ; declaration only; fb_Fallback_Scheduler passes the returned pointer to stackInit
+
+declare i8* @rewrite_setLaunchParams(%struct.RuntimeDataStruct*, i32, i32) #1  ; declaration only; called by fb_Fallback_Scheduler with its runtime data, %dimx and %dimy
+
+declare i32 @rewrite_getStackSize() #1  ; declaration only; result is the size argument fb_Fallback_Scheduler gives to stackInit
+
+declare i32 @rewrite_dispatch(%struct.RuntimeDataStruct*, i32) #1  ; declaration only; takes a state id and returns the next one for fb_Fallback_Scheduler's loop
+
+; Function Attrs: nounwind -- fb_Fallback_SetLaunchParams: stores the five integer arguments into %runtimeData (fields 0, 1 and 22); returns nothing.
+define void @fb_Fallback_SetLaunchParams(%struct.RuntimeDataStruct* %runtimeData, i32 %DTidx, i32 %DTidy, i32 %dimx, i32 %dimy, i32 %groupIndex) #0 {
+entry:
+  %DispatchRaysIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex, i32 0, i32 0
+  store i32 %DTidx, i32* %arrayidx, align 4  ; DispatchRaysIndex[0] = %DTidx
+  %DispatchRaysIndex1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 0
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex1, i32 0, i32 1
+  store i32 %DTidy, i32* %arrayidx2, align 4  ; DispatchRaysIndex[1] = %DTidy
+  %DispatchRaysDimensions = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 1
+  %arrayidx3 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions, i32 0, i32 0
+  store i32 %dimx, i32* %arrayidx3, align 4  ; DispatchRaysDimensions[0] = %dimx
+  %DispatchRaysDimensions4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 1
+  %arrayidx5 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions4, i32 0, i32 1
+)AAA",
+R"AAA(
+  store i32 %dimy, i32* %arrayidx5, align 4  ; DispatchRaysDimensions[1] = %dimy
+  %GroupIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 22
+  store i32 %groupIndex, i32* %GroupIndex, align 4  ; field 22 = %groupIndex
+  ret void
+}
+
+; Function Attrs: nounwind -- fb_Fallback_TraceRayBegin: writes the ray arguments into %runtimeData, replaces field 25 (PayloadOffset) with %newPayloadOffset, and returns the value field 25 held before.
+define i32 @fb_Fallback_TraceRayBegin(%struct.RuntimeDataStruct* %runtimeData, i32 %rayFlags, float %ox, float %oy, float %oz, float %tmin, float %dx, float %dy, float %dz, float %tmax, i32 %newPayloadOffset) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %rayFlags, i32* %RayFlags, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5  ; field 5 receives (%ox, %oy, %oz)
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  store float %ox, float* %arrayidx, align 4
+  %WorldRayOrigin1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin1, i32 0, i32 1
+  store float %oy, float* %arrayidx2, align 4
+  %WorldRayOrigin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin3, i32 0, i32 2
+  store float %oz, float* %arrayidx4, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6  ; field 6 receives (%dx, %dy, %dz)
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  store float %dx, float* %arrayidx5, align 4
+  %WorldRayDirection6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection6, i32 0, i32 1
+  store float %dy, float* %arrayidx7, align 4
+  %WorldRayDirection8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection8, i32 0, i32 2
+  store float %dz, float* %arrayidx9, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %tmax, float* %RayTCurrent, align 4  ; note: %tmax is stored into the RayTCurrent field (field 3)
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %tmin, float* %RayTMin, align 4
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  %0 = load i32, i32* %PayloadOffset, align 4  ; read the previous offset before it is overwritten
+  %PayloadOffset10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  store i32 %newPayloadOffset, i32* %PayloadOffset10, align 4
+  ret i32 %0  ; previous PayloadOffset; fb_Fallback_TraceRayEnd takes such a value as %oldPayloadOffset
+}
+
+; Function Attrs: nounwind
+; Writes oldPayloadOffset into RuntimeDataStruct field 25 (PayloadOffset).
+define void @fb_Fallback_TraceRayEnd(%struct.RuntimeDataStruct* %runtimeData, i32 %oldPayloadOffset) #0 {
+entry:
+  %payload.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  store i32 %oldPayloadOffset, i32* %payload.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Fills the pending-hit fields: 21 (PendingShaderRecordOffset),
+; 17 (PendingPrimitiveIndex), 18 (PendingInstanceIndex), 19 (PendingInstanceID),
+; 16 (PendingRayTCurrent) <- t, and 20 (PendingHitKind).
+define void @fb_Fallback_SetPendingTriVals(%struct.RuntimeDataStruct* %runtimeData, i32 %shaderRecordOffset, i32 %primitiveIndex, i32 %instanceIndex, i32 %instanceID, float %t, i32 %hitKind) #0 {
+entry:
+  %pendRecord.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  store i32 %shaderRecordOffset, i32* %pendRecord.addr, align 4
+  %pendPrim.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  store i32 %primitiveIndex, i32* %pendPrim.addr, align 4
+  %pendInstIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  store i32 %instanceIndex, i32* %pendInstIdx.addr, align 4
+  %pendInstID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  store i32 %instanceID, i32* %pendInstID.addr, align 4
+  %pendT.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  store float %t, float* %pendT.addr, align 4
+  %pendKind.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  store i32 %hitKind, i32* %pendKind.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Fills pending-hit fields 21 (PendingShaderRecordOffset), 17
+; (PendingPrimitiveIndex), 18 (PendingInstanceIndex) and 19 (PendingInstanceID).
+; Fields 16 (PendingRayTCurrent) and 20 (PendingHitKind) are not written here.
+define void @fb_Fallback_SetPendingCustomVals(%struct.RuntimeDataStruct* %runtimeData, i32 %shaderRecordOffset, i32 %primitiveIndex, i32 %instanceIndex, i32 %instanceID) #0 {
+entry:
+  %pendRecord.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  store i32 %shaderRecordOffset, i32* %pendRecord.addr, align 4
+  %pendPrim.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  store i32 %primitiveIndex, i32* %pendPrim.addr, align 4
+  %pendInstIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  store i32 %instanceIndex, i32* %pendInstIdx.addr, align 4
+  %pendInstID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  store i32 %instanceID, i32* %pendInstID.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Copies each pending-hit field onto its committed counterpart
+; (16->3, 21->15, 17->11, 18->12, 19->13, 20->14), then swaps the values held
+; in field 27 (PendingAttrOffset) and field 26 (CommittedAttrOffset).
+define void @fb_Fallback_CommitHit(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %pendT.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  %pendT = load float, float* %pendT.addr, align 4
+  %tCurrent.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %pendT, float* %tCurrent.addr, align 4
+  %pendRecord.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  %pendRecord = load i32, i32* %pendRecord.addr, align 4
+  %record.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %pendRecord, i32* %record.addr, align 4
+  %pendPrim.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  %pendPrim = load i32, i32* %pendPrim.addr, align 4
+  %prim.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  store i32 %pendPrim, i32* %prim.addr, align 4
+  %pendInstIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  %pendInstIdx = load i32, i32* %pendInstIdx.addr, align 4
+  %instIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  store i32 %pendInstIdx, i32* %instIdx.addr, align 4
+  %pendInstID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  %pendInstID = load i32, i32* %pendInstID.addr, align 4
+  %instID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  store i32 %pendInstID, i32* %instID.addr, align 4
+  %pendKind.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  %pendKind = load i32, i32* %pendKind.addr, align 4
+  %kind.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  store i32 %pendKind, i32* %kind.addr, align 4
+  ; Both attribute offsets are read before either is written, so this is a swap.
+  %pendAttr.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  %pendAttr = load i32, i32* %pendAttr.addr, align 4
+  %commAttr.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  %commAttr = load i32, i32* %commAttr.addr, align 4
+  store i32 %commAttr, i32* %pendAttr.addr, align 4
+  store i32 %pendAttr, i32* %commAttr.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Loads the [256 x i32]* held in field 29 (Stack) and returns element %offset
+; of the array it points to. %offset is not range-checked here.
+define i32 @fb_Fallback_RuntimeDataLoadInt(%struct.RuntimeDataStruct* %runtimeData, i32 %offset) #0 {
+entry:
+  %stack.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %stack = load [256 x i32]*, [256 x i32]** %stack.addr, align 4
+  %slot = getelementptr inbounds [256 x i32], [256 x i32]* %stack, i32 0, i32 %offset
+  %value = load i32, i32* %slot, align 4
+  ret i32 %value
+}
+
+; Function Attrs: nounwind
+; Loads the [256 x i32]* held in field 29 (Stack) and stores %val into element
+; %offset of the array it points to. %offset is not range-checked here.
+define void @fb_Fallback_RuntimeDataStoreInt(%struct.RuntimeDataStruct* %runtimeData, i32 %offset, i32 %val) #0 {
+entry:
+  %stack.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %stack = load [256 x i32]*, [256 x i32]** %stack.addr, align 4
+  %slot = getelementptr inbounds [256 x i32], [256 x i32]* %stack, i32 0, i32 %offset
+  store i32 %val, i32* %slot, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns element %i (zero-extended) of the [2 x i32] at field 0
+; (DispatchRaysIndex).
+define i32 @fb_dxop_dispatchRaysIndex(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 0, i32 %lane
+  %elem = load i32, i32* %elem.addr, align 4
+  ret i32 %elem
+}
+
+; Function Attrs: nounwind
+; Returns element %i (zero-extended) of the [2 x i32] at field 1
+; (DispatchRaysDimensions).
+; NOTE(review): the two lines after the signature appear to close one C++
+; raw-string chunk and open the next in the middle of this function - confirm
+; against the generator before moving them.
+define i32 @fb_dxop_dispatchRaysDimensions(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+)AAA",
+R"AAA(
+entry:
+  %idxprom = zext i8 %i to i32
+  %DispatchRaysDimensions = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 1
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions, i32 0, i32 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+; Returns the float stored in field 2 (RayTMin).
+define float @fb_dxop_rayTMin(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %tMin.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %tMin = load float, float* %tMin.addr, align 4
+  ret float %tMin
+}
+
+; Function Attrs: nounwind
+; Returns the float stored in field 2 (RayTMin).
+define float @fb_Fallback_RayTMin(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %tMin.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %tMin = load float, float* %tMin.addr, align 4
+  ret float %tMin
+}
+
+; Function Attrs: nounwind
+; Stores %t into field 2 (RayTMin).
+define void @fb_Fallback_SetRayTMin(%struct.RuntimeDataStruct* %runtimeData, float %t) #0 {
+entry:
+  %tMin.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %t, float* %tMin.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the float stored in field 3 (RayTCurrent).
+define float @fb_dxop_rayTCurrent(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %tCurrent.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %tCurrent = load float, float* %tCurrent.addr, align 4
+  ret float %tCurrent
+}
+
+; Function Attrs: nounwind
+; Returns the float stored in field 3 (RayTCurrent).
+define float @fb_Fallback_RayTCurrent(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %tCurrent.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %tCurrent = load float, float* %tCurrent.addr, align 4
+  ret float %tCurrent
+}
+
+; Function Attrs: nounwind
+; Stores %t into field 3 (RayTCurrent).
+define void @fb_Fallback_SetRayTCurrent(%struct.RuntimeDataStruct* %runtimeData, float %t) #0 {
+entry:
+  %tCurrent.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %t, float* %tCurrent.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 4 (RayFlags).
+define i32 @fb_dxop_rayFlags(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %rayFlags.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %rayFlags = load i32, i32* %rayFlags.addr, align 4
+  ret i32 %rayFlags
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 4 (RayFlags).
+define i32 @fb_Fallback_RayFlags(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %rayFlags.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %rayFlags = load i32, i32* %rayFlags.addr, align 4
+  ret i32 %rayFlags
+}
+
+; Function Attrs: nounwind
+; Stores %flags into field 4 (RayFlags).
+define void @fb_Fallback_SetRayFlags(%struct.RuntimeDataStruct* %runtimeData, i32 %flags) #0 {
+entry:
+  %rayFlags.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %flags, i32* %rayFlags.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 5
+; (WorldRayOrigin).
+define float @fb_dxop_worldRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 5
+; (WorldRayOrigin).
+define float @fb_Fallback_WorldRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Stores (%x, %y, %z) into elements 0..2 of the [3 x float] at field 5
+; (WorldRayOrigin).
+define void @fb_Fallback_SetWorldRayOrigin(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %x.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5, i32 0
+  store float %x, float* %x.addr, align 4
+  %y.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5, i32 1
+  store float %y, float* %y.addr, align 4
+  %z.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5, i32 2
+  store float %z, float* %z.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 6
+; (WorldRayDirection).
+define float @fb_dxop_worldRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 6
+; (WorldRayDirection).
+define float @fb_Fallback_WorldRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Stores (%x, %y, %z) into elements 0..2 of the [3 x float] at field 6
+; (WorldRayDirection).
+define void @fb_Fallback_SetWorldRayDirection(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %x.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6, i32 0
+  store float %x, float* %x.addr, align 4
+  %y.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6, i32 1
+  store float %y, float* %y.addr, align 4
+  %z.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6, i32 2
+  store float %z, float* %z.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 7
+; (ObjectRayOrigin).
+define float @fb_dxop_objectRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 7
+; (ObjectRayOrigin).
+define float @fb_Fallback_ObjectRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Stores (%x, %y, %z) into elements 0..2 of the [3 x float] at field 7
+; (ObjectRayOrigin).
+define void @fb_Fallback_SetObjectRayOrigin(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %x.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7, i32 0
+  store float %x, float* %x.addr, align 4
+  %y.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7, i32 1
+  store float %y, float* %y.addr, align 4
+  %z.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7, i32 2
+  store float %z, float* %z.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 8
+; (ObjectRayDirection).
+define float @fb_dxop_objectRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Returns component %i (zero-extended) of the [3 x float] at field 8
+; (ObjectRayDirection).
+define float @fb_Fallback_ObjectRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %lane = zext i8 %i to i32
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8, i32 %lane
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Stores (%x, %y, %z) into elements 0..2 of the [3 x float] at field 8
+; (ObjectRayDirection).
+; NOTE(review): a C++ raw-string chunk boundary appears to fall between the
+; address computation and the store of %y below - confirm against the
+; generator before moving it.
+define void @fb_Fallback_SetObjectRayDirection(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %ObjectRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 0
+  store float %x, float* %arrayidx, align 4
+  %ObjectRayDirection1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection1, i32 0, i32 1
+)AAA",
+R"AAA(
+  store float %y, float* %arrayidx2, align 4
+  %ObjectRayDirection3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection3, i32 0, i32 2
+  store float %z, float* %arrayidx4, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns element (%r * 4 + %c) of the [12 x float] at field 9 (ObjectToWorld).
+; %c is zero-extended before the add; the index is not range-checked here.
+define float @fb_dxop_objectToWorld(%struct.RuntimeDataStruct* %runtimeData, i32 %r, i8 zeroext %c) #0 {
+entry:
+  %col = zext i8 %c to i32
+  %rowBase = mul nsw i32 %r, 4
+  %flatIdx = add nsw i32 %rowBase, %col
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 %flatIdx
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Copies lanes 0..11 of the vector %M, in order, into elements 0..11 of the
+; [12 x float] at field 9 (ObjectToWorld).
+define void @fb_Fallback_SetObjectToWorld(%struct.RuntimeDataStruct* %runtimeData, <12 x float> %M) #0 {
+entry:
+  %m0 = extractelement <12 x float> %M, i32 0
+  %dst0 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 0
+  store float %m0, float* %dst0, align 4
+  %m1 = extractelement <12 x float> %M, i32 1
+  %dst1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 1
+  store float %m1, float* %dst1, align 4
+  %m2 = extractelement <12 x float> %M, i32 2
+  %dst2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 2
+  store float %m2, float* %dst2, align 4
+  %m3 = extractelement <12 x float> %M, i32 3
+  %dst3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 3
+  store float %m3, float* %dst3, align 4
+  %m4 = extractelement <12 x float> %M, i32 4
+  %dst4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 4
+  store float %m4, float* %dst4, align 4
+  %m5 = extractelement <12 x float> %M, i32 5
+  %dst5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 5
+  store float %m5, float* %dst5, align 4
+  %m6 = extractelement <12 x float> %M, i32 6
+  %dst6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 6
+  store float %m6, float* %dst6, align 4
+  %m7 = extractelement <12 x float> %M, i32 7
+  %dst7 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 7
+  store float %m7, float* %dst7, align 4
+  %m8 = extractelement <12 x float> %M, i32 8
+  %dst8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 8
+  store float %m8, float* %dst8, align 4
+  %m9 = extractelement <12 x float> %M, i32 9
+  %dst9 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 9
+  store float %m9, float* %dst9, align 4
+  %m10 = extractelement <12 x float> %M, i32 10
+  %dst10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 10
+  store float %m10, float* %dst10, align 4
+  %m11 = extractelement <12 x float> %M, i32 11
+  %dst11 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9, i32 11
+  store float %m11, float* %dst11, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns element (%r * 4 + %c) of the [12 x float] at field 10 (WorldToObject).
+; %c is zero-extended before the add; the index is not range-checked here.
+define float @fb_dxop_worldToObject(%struct.RuntimeDataStruct* %runtimeData, i32 %r, i8 zeroext %c) #0 {
+entry:
+  %col = zext i8 %c to i32
+  %rowBase = mul nsw i32 %r, 4
+  %flatIdx = add nsw i32 %rowBase, %col
+  %elem.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 %flatIdx
+  %elem = load float, float* %elem.addr, align 4
+  ret float %elem
+}
+
+; Function Attrs: nounwind
+; Copies lanes 0..11 of the vector %M, in order, into elements 0..11 of the
+; [12 x float] at field 10 (WorldToObject).
+define void @fb_Fallback_SetWorldToObject(%struct.RuntimeDataStruct* %runtimeData, <12 x float> %M) #0 {
+entry:
+  %m0 = extractelement <12 x float> %M, i32 0
+  %dst0 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 0
+  store float %m0, float* %dst0, align 4
+  %m1 = extractelement <12 x float> %M, i32 1
+  %dst1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 1
+  store float %m1, float* %dst1, align 4
+  %m2 = extractelement <12 x float> %M, i32 2
+  %dst2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 2
+  store float %m2, float* %dst2, align 4
+  %m3 = extractelement <12 x float> %M, i32 3
+  %dst3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 3
+  store float %m3, float* %dst3, align 4
+  %m4 = extractelement <12 x float> %M, i32 4
+  %dst4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 4
+  store float %m4, float* %dst4, align 4
+  %m5 = extractelement <12 x float> %M, i32 5
+  %dst5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 5
+  store float %m5, float* %dst5, align 4
+  %m6 = extractelement <12 x float> %M, i32 6
+  %dst6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 6
+  store float %m6, float* %dst6, align 4
+  %m7 = extractelement <12 x float> %M, i32 7
+  %dst7 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 7
+  store float %m7, float* %dst7, align 4
+  %m8 = extractelement <12 x float> %M, i32 8
+  %dst8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 8
+  store float %m8, float* %dst8, align 4
+  %m9 = extractelement <12 x float> %M, i32 9
+  %dst9 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 9
+  store float %m9, float* %dst9, align 4
+  %m10 = extractelement <12 x float> %M, i32 10
+  %dst10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 10
+  store float %m10, float* %dst10, align 4
+  %m11 = extractelement <12 x float> %M, i32 11
+  %dst11 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10, i32 11
+  store float %m11, float* %dst11, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 11 (PrimitiveIndex).
+define i32 @fb_dxop_primitiveIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %prim.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  %prim = load i32, i32* %prim.addr, align 4
+  ret i32 %prim
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 11 (PrimitiveIndex).
+define i32 @fb_Fallback_PrimitiveIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %prim.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  %prim = load i32, i32* %prim.addr, align 4
+  ret i32 %prim
+}
+
+; Function Attrs: nounwind
+; Stores %i into field 11 (PrimitiveIndex).
+; NOTE(review): the two lines after the signature appear to close one C++
+; raw-string chunk and open the next in the middle of this function - confirm
+; against the generator before moving them.
+define void @fb_Fallback_SetPrimitiveIndex(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+)AAA",
+R"AAA(
+entry:
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  store i32 %i, i32* %PrimitiveIndex, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 15 (ShaderRecordOffset).
+define i32 @fb_Fallback_ShaderRecordOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %record.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  %record = load i32, i32* %record.addr, align 4
+  ret i32 %record
+}
+
+; Function Attrs: nounwind
+; Stores %shaderRecordOffset into field 15 (ShaderRecordOffset).
+define void @fb_Fallback_SetShaderRecordOffset(%struct.RuntimeDataStruct* %runtimeData, i32 %shaderRecordOffset) #0 {
+entry:
+  %record.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %shaderRecordOffset, i32* %record.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 12 (InstanceIndex).
+define i32 @fb_dxop_instanceIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %instIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  %instIdx = load i32, i32* %instIdx.addr, align 4
+  ret i32 %instIdx
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 12 (InstanceIndex).
+define i32 @fb_Fallback_InstanceIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %instIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  %instIdx = load i32, i32* %instIdx.addr, align 4
+  ret i32 %instIdx
+}
+
+; Function Attrs: nounwind
+; Stores %i into field 12 (InstanceIndex).
+define void @fb_Fallback_SetInstanceIndex(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %instIdx.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  store i32 %i, i32* %instIdx.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 13 (InstanceID).
+define i32 @fb_dxop_instanceID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %instID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  %instID = load i32, i32* %instID.addr, align 4
+  ret i32 %instID
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 13 (InstanceID).
+define i32 @fb_Fallback_InstanceID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %instID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  %instID = load i32, i32* %instID.addr, align 4
+  ret i32 %instID
+}
+
+; Function Attrs: nounwind
+; Stores %i into field 13 (InstanceID).
+define void @fb_Fallback_SetInstanceID(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %instID.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  store i32 %i, i32* %instID.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+; Returns the i32 stored in field 14 (HitKind).
+define i32 @fb_dxop_hitKind(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %kind.addr = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  %kind = load i32, i32* %kind.addr, align 4
+  ret i32 %kind
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_HitKind(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  %0 = load i32, i32* %HitKind, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetHitKind(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  store i32 %i, i32* %HitKind, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_pending_rayTCurrent(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingRayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  %0 = load float, float* %PendingRayTCurrent, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPendingRayTCurrent(%struct.RuntimeDataStruct* %runtimeData, float %t) #0 {
+entry:
+  %PendingRayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  store float %t, float* %PendingRayTCurrent, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_primitiveID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingPrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  %0 = load i32, i32* %PendingPrimitiveIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_PendingShaderRecordOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  %0 = load i32, i32* %PendingShaderRecordOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_instanceIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingInstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  %0 = load i32, i32* %PendingInstanceIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_instanceID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingInstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  %0 = load i32, i32* %PendingInstanceID, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_hitKind(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingHitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  %0 = load i32, i32* %PendingHitKind, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPendingHitKind(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %PendingHitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  store i32 %i, i32* %PendingHitKind, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_GroupIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %GroupIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 22
+  %0 = load i32, i32* %GroupIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_AnyHitResult(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %AnyHitResult = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 23
+  %0 = load i32, i32* %AnyHitResult, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetAnyHitResult(%struct.RuntimeDataStruct* %runtimeData, i32 %result) #0 {
+entry:
+  %AnyHitResult = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 23
+  store i32 %result, i32* %AnyHitResult, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_AnyHitStateId(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %AnyHitStateId = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 24
+  %0 = load i32, i32* %AnyHitStateId, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetAnyHitStateId(%struct.RuntimeDataStruct* %runtimeData, i32 %id) #0 {
+entry:
+  %AnyHitStateId = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 24
+  store i32 %id, i32* %AnyHitStateId, align 4
+  ret void
+}
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }
+)AAA"
+};
+
+#include <sstream>
+// Concatenates every entry of runtimeString, in array order, into a single
+// std::string and returns it.
+static std::string getRuntimeString()
+{
+  std::ostringstream joined;
+  for (const char* chunk : runtimeString)
+    joined << chunk;
+
+  return joined.str();
+}

+ 62 - 0
lib/DxrFallback/runtime/rewriteRuntime.py

@@ -0,0 +1,62 @@
+import re
+
+inputFilename = 'runtime.opt.ll'
+sourceFilename = r'C:/Users/chwallis/Desktop/DXILShaderPatch/runtime.c'
+outputFilename = 'C:/Users/chwallis/Desktop/DXILShaderPatch/runtime.h'
+
+source = open(sourceFilename).read()
+
+input = open(inputFilename).read()
+m = re.search(r'"nvptx"(.*?)attributes #', input, re.DOTALL)
+dxil = m.group(1)
+
+# split the string up to avoid  error C2026: string too big, trailing characters truncated
+lines = dxil.splitlines()
+dxil = []
+count = 0
+for line in lines:
+    count += len(line)
+    dxil.append(line)
+    if count > 10000:
+        dxil.append(')AAA",')
+        dxil.append('R"AAA(')
+        count = 0
+dxil = '\n'.join(dxil)
+
+template = """
+// This file generated by compiling the following source (runtime.c) as follows:
+//    clang -S -emit-llvm -target nvptr runtime.c
+//    opt -S -mem2reg runtime.ll -o runtime.opt.ll
+// The resulting LLVM-IR is stripped of its datalayout and replaced with one
+// compatible with DXIL.
+
+// runtime.c
+#if 0 
+%SOURCE%
+#endif
+
+static const char* runtimeString[] = { R"AAA(
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f:64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%DXIL%
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }
+)AAA"
+};
+
+#include <sstream>
+static std::string getRuntimeString()
+{
+  std::ostringstream out;
+  for( size_t i=0; i < _countof(runtimeString); ++i)
+    out << runtimeString[i];
+
+  return out.str();
+}
+"""
+
+output = re.sub(r'%SOURCE%', source, template)
+output = re.sub(r'%DXIL%', dxil, output)
+open(outputFilename, 'w').write(output)

+ 658 - 0
lib/DxrFallback/runtime/runtime.c

@@ -0,0 +1,658 @@
+#include <stddef.h>
+
+// Size in bytes used to dimension StackType below.
+// NOTE(review): a 'static const int' is not an integer constant expression in
+// standard C, so the StackType typedef below presumably relies on the compiler
+// folding it - confirm before building with another compiler.
+static const int STACK_SIZE_IN_BYTES = 1024;
+
+// Vector types of 3, 4 and 12 floats (vector_size attribute extension).
+typedef float float3 __attribute__((vector_size(3*sizeof(float))));
+typedef float float4 __attribute__((vector_size(4*sizeof(float))));
+typedef float float12 __attribute__((vector_size(12*sizeof(float))));
+// Matrix stored as a flat array of 12 floats; the accessors in this file index
+// it as r * 4 + c.
+typedef float (M3x4)[12];
+// The stack is an array of ints: STACK_SIZE_IN_BYTES / sizeof(int) elements.
+typedef int   (StackType)[STACK_SIZE_IN_BYTES/sizeof(int)];
+typedef unsigned char byte;
+
+
+// Per-invocation state shared by all runtime functions in this file.
+// The order of the members matters: the LLVM-IR generated from this file
+// addresses them by element index (for example InstanceIndex is element 12 and
+// AnyHitStateId is element 24), so members must not be reordered or inserted
+// without regenerating that IR.
+typedef struct RuntimeDataStruct
+{
+  // Launch parameters, written by fb_Fallback_SetLaunchParams().
+  int DispatchRaysIndex[2];
+  int DispatchRaysDimensions[2];
+
+  // Current ray state; the world-space ray values are written by fb_Fallback_TraceRayBegin().
+  float RayTMin;
+  float RayTCurrent;
+  unsigned RayFlags;
+  float WorldRayOrigin[3];
+  float WorldRayDirection[3];
+  float ObjectRayOrigin[3];
+  float ObjectRayDirection[3];
+  M3x4 ObjectToWorld;
+  M3x4 WorldToObject;
+
+  // Committed hit values; fb_Fallback_CommitHit() copies the pending values
+  // below into these members.
+  unsigned PrimitiveIndex;
+  unsigned InstanceIndex;
+  unsigned InstanceID;
+  unsigned HitKind;
+  unsigned ShaderRecordOffset;
+
+
+  // Pending hit values - accessed in anyHit and intersection shaders before a hit has been committed
+  float PendingRayTCurrent;
+  unsigned PendingPrimitiveIndex;
+  unsigned PendingInstanceIndex;
+  unsigned PendingInstanceID;
+  unsigned PendingHitKind;
+  unsigned PendingShaderRecordOffset; 
+
+  int GroupIndex; 
+  int AnyHitResult;
+  int AnyHitStateId;  // Originally temporary. We needed to avoid resource usage
+                      // in ReportHit() because of linking issues so we set the value here first.
+                      // May be worth retaining to cache the value when fetching the intersection 
+                      // stateId (fetch them both at once). 
+
+  // Offsets into the stack (in ints). stackInit() fills these three with
+  // recognizable bogus values until real offsets are set.
+  int PayloadOffset;            
+  int CommittedAttrOffset;      
+  int PendingAttrOffset;        
+  
+  int StackOffset; // offset from the start of the stack
+  StackType* Stack;
+} RuntimeData;
+
+// Pointer type taken as the first parameter by the runtime functions in this file.
+typedef RuntimeData* RuntimeDataType;
+
+// Snapshot of the RuntimeData members that traceRaySave_ClosestHit() copies
+// out and traceRayRestore_ClosestHit() copies back: the ray description
+// (world and object space) plus the committed hit values.
+typedef struct TraceRaySpills_ClosestHit
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+  float ObjectRayOrigin[3];      
+  float ObjectRayDirection[3];   
+
+  unsigned PrimitiveIndex;       
+  unsigned InstanceIndex;        
+  unsigned InstanceID;           
+  unsigned HitKind;              
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_ClosestHit;
+
+// Snapshot of the RuntimeData members that traceRaySave_Miss() copies out and
+// traceRayRestore_Miss() copies back: the world-space ray description and the
+// shader record offset only (no object-space or hit values).
+typedef struct TraceRaySpills_Miss
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+            
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_Miss;
+
+
+// Accessors for the runtime data. All of them expect a variable named
+// 'runtimeData' (a RuntimeDataType) to be in scope at the point of use.
+//   REF / REF_FLT - member access; x may carry an index, e.g. REF(WorldRayOrigin[0])
+//   REF_STACK     - stack element at StackOffset + offset
+//   REF_FLT_OFS   - element 'offset' of the array member x
+#define REF(x) (runtimeData->x)
+#define REF_FLT(x) (runtimeData->x)
+// 'offset' is parenthesized so that an expression argument (for example a
+// conditional) is added to StackOffset as a whole.
+#define REF_STACK(offset) ((*runtimeData->Stack)[runtimeData->StackOffset + (offset)])
+#define REF_FLT_OFS(x, offset) (runtimeData->x[offset])
+
+// Functions declared here but not defined in this file.
+// NOTE(review): presumably supplied when the runtime is linked with the
+// shaders by the fallback compiler - confirm.
+
+// Return next stateID
+int rewrite_dispatch(RuntimeDataType runtimeData, int stateID);
+// Called by fb_Fallback_Scheduler() before it checks the dispatch bounds; the
+// return value is ignored there.
+void* rewrite_setLaunchParams(RuntimeDataType runtimeData, unsigned dimx, unsigned dimy);
+// Stack size passed on to stackInit(), which divides it by sizeof(int).
+unsigned rewrite_getStackSize(void);
+// Returns the stack storage handed to stackInit().
+StackType* rewrite_createStack(void);
+
+// Attaches theStack to runtimeData and positions StackOffset on the last int
+// slot of a stack that is stackSize bytes large. The payload and attribute
+// offsets receive recognizable placeholder values.
+void stackInit(RuntimeDataType runtimeData, StackType* theStack, unsigned stackSize)
+{
+  runtimeData->Stack       = theStack;
+  runtimeData->StackOffset = stackSize / sizeof(int) - 1;
+
+  // Placeholders that are easy to spot if they are used before being set.
+  runtimeData->PayloadOffset       = 1111;
+  runtimeData->CommittedAttrOffset = 2222;
+  runtimeData->PendingAttrOffset   = 3333;
+}
+
+// Reserves 'size' stack slots by moving StackOffset down.
+void stackFramePush(RuntimeDataType runtimeData, int size)
+{
+  runtimeData->StackOffset -= size;
+}
+
+// Releases 'size' stack slots by moving StackOffset back up.
+void stackFramePop(RuntimeDataType runtimeData, int size)
+{
+  runtimeData->StackOffset += size;
+}
+
+// Returns the current StackOffset.
+int stackFrameOffset(RuntimeDataType runtimeData)
+{
+  return runtimeData->StackOffset;
+}
+
+// Returns the current PayloadOffset.
+int payloadOffset(RuntimeDataType runtimeData)
+{
+  return runtimeData->PayloadOffset;
+}
+
+// Returns the current CommittedAttrOffset.
+int committedAttrOffset(RuntimeDataType runtimeData)
+{
+  return runtimeData->CommittedAttrOffset;
+}
+
+// Returns the current PendingAttrOffset.
+int pendingAttrOffset(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingAttrOffset;
+}
+
+// Returns the address of stack element baseOffset + offset. The index is not
+// range-checked.
+int* stackIntPtr(RuntimeDataType runtimeData, int baseOffset, int offset)
+{
+  StackType* const stack = runtimeData->Stack;
+  return &(*stack)[baseOffset + offset];
+}
+
+
+// Saves the current committed/pending attribute offsets in the two stack slots
+// directly below StackOffset, then points both offsets at new regions of
+// attrSize slots each further down the stack. Reverted by traceFramePop().
+void traceFramePush(RuntimeDataType runtimeData, int attrSize)
+{
+  StackType* const stack = runtimeData->Stack;
+  const int frameBase = runtimeData->StackOffset;
+
+  // Spill the attribute offsets that were active before this trace.
+  (*stack)[frameBase - 1] = runtimeData->CommittedAttrOffset;
+  (*stack)[frameBase - 2] = runtimeData->PendingAttrOffset;
+
+  // Committed attributes go right below the two saved slots, pending
+  // attributes below the committed ones.
+  runtimeData->CommittedAttrOffset = frameBase - 2 - attrSize;
+  runtimeData->PendingAttrOffset   = frameBase - 2 - 2 * attrSize;
+}
+
+// Reloads the attribute offsets that traceFramePush() stored in the two stack
+// slots directly below StackOffset.
+void traceFramePop(RuntimeDataType runtimeData)
+{
+  StackType* const stack = runtimeData->Stack;
+  const int frameBase = runtimeData->StackOffset;
+
+  runtimeData->CommittedAttrOffset = (*stack)[frameBase - 1];
+  runtimeData->PendingAttrOffset   = (*stack)[frameBase - 2];
+}
+
+// Copies the ray description (world and object space) and the committed hit
+// values from runtimeData into *spills.
+void traceRaySave_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  // Ray description
+  spills->RayFlags    = runtimeData->RayFlags;
+  spills->RayTCurrent = runtimeData->RayTCurrent;
+  spills->RayTMin     = runtimeData->RayTMin;
+
+  spills->WorldRayOrigin[0] = runtimeData->WorldRayOrigin[0];
+  spills->WorldRayOrigin[1] = runtimeData->WorldRayOrigin[1];
+  spills->WorldRayOrigin[2] = runtimeData->WorldRayOrigin[2];
+
+  spills->WorldRayDirection[0] = runtimeData->WorldRayDirection[0];
+  spills->WorldRayDirection[1] = runtimeData->WorldRayDirection[1];
+  spills->WorldRayDirection[2] = runtimeData->WorldRayDirection[2];
+
+  spills->ObjectRayOrigin[0] = runtimeData->ObjectRayOrigin[0];
+  spills->ObjectRayOrigin[1] = runtimeData->ObjectRayOrigin[1];
+  spills->ObjectRayOrigin[2] = runtimeData->ObjectRayOrigin[2];
+
+  spills->ObjectRayDirection[0] = runtimeData->ObjectRayDirection[0];
+  spills->ObjectRayDirection[1] = runtimeData->ObjectRayDirection[1];
+  spills->ObjectRayDirection[2] = runtimeData->ObjectRayDirection[2];
+
+  // Committed hit values
+  spills->PrimitiveIndex     = runtimeData->PrimitiveIndex;
+  spills->InstanceIndex      = runtimeData->InstanceIndex;
+  spills->InstanceID         = runtimeData->InstanceID;
+  spills->HitKind            = runtimeData->HitKind;
+  spills->ShaderRecordOffset = runtimeData->ShaderRecordOffset;
+}
+
+// Copies the ray description (world and object space) and the committed hit
+// values from *spills back into runtimeData.
+void traceRayRestore_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  // Ray description
+  runtimeData->RayFlags    = spills->RayFlags;
+  runtimeData->RayTCurrent = spills->RayTCurrent;
+  runtimeData->RayTMin     = spills->RayTMin;
+
+  runtimeData->WorldRayOrigin[0] = spills->WorldRayOrigin[0];
+  runtimeData->WorldRayOrigin[1] = spills->WorldRayOrigin[1];
+  runtimeData->WorldRayOrigin[2] = spills->WorldRayOrigin[2];
+
+  runtimeData->WorldRayDirection[0] = spills->WorldRayDirection[0];
+  runtimeData->WorldRayDirection[1] = spills->WorldRayDirection[1];
+  runtimeData->WorldRayDirection[2] = spills->WorldRayDirection[2];
+
+  runtimeData->ObjectRayOrigin[0] = spills->ObjectRayOrigin[0];
+  runtimeData->ObjectRayOrigin[1] = spills->ObjectRayOrigin[1];
+  runtimeData->ObjectRayOrigin[2] = spills->ObjectRayOrigin[2];
+
+  runtimeData->ObjectRayDirection[0] = spills->ObjectRayDirection[0];
+  runtimeData->ObjectRayDirection[1] = spills->ObjectRayDirection[1];
+  runtimeData->ObjectRayDirection[2] = spills->ObjectRayDirection[2];
+
+  // Committed hit values
+  runtimeData->PrimitiveIndex     = spills->PrimitiveIndex;
+  runtimeData->InstanceIndex      = spills->InstanceIndex;
+  runtimeData->InstanceID         = spills->InstanceID;
+  runtimeData->HitKind            = spills->HitKind;
+  runtimeData->ShaderRecordOffset = spills->ShaderRecordOffset;
+}
+
+// Copies the world-space ray description and the shader record offset from
+// runtimeData into *spills.
+void traceRaySave_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  spills->RayFlags    = runtimeData->RayFlags;
+  spills->RayTCurrent = runtimeData->RayTCurrent;
+  spills->RayTMin     = runtimeData->RayTMin;
+
+  spills->WorldRayOrigin[0] = runtimeData->WorldRayOrigin[0];
+  spills->WorldRayOrigin[1] = runtimeData->WorldRayOrigin[1];
+  spills->WorldRayOrigin[2] = runtimeData->WorldRayOrigin[2];
+
+  spills->WorldRayDirection[0] = runtimeData->WorldRayDirection[0];
+  spills->WorldRayDirection[1] = runtimeData->WorldRayDirection[1];
+  spills->WorldRayDirection[2] = runtimeData->WorldRayDirection[2];
+
+  spills->ShaderRecordOffset = runtimeData->ShaderRecordOffset;
+}
+
+// Copies the world-space ray description and the shader record offset from
+// *spills back into runtimeData.
+void traceRayRestore_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  runtimeData->RayFlags    = spills->RayFlags;
+  runtimeData->RayTCurrent = spills->RayTCurrent;
+  runtimeData->RayTMin     = spills->RayTMin;
+
+  runtimeData->WorldRayOrigin[0] = spills->WorldRayOrigin[0];
+  runtimeData->WorldRayOrigin[1] = spills->WorldRayOrigin[1];
+  runtimeData->WorldRayOrigin[2] = spills->WorldRayOrigin[2];
+
+  runtimeData->WorldRayDirection[0] = spills->WorldRayDirection[0];
+  runtimeData->WorldRayDirection[1] = spills->WorldRayDirection[1];
+  runtimeData->WorldRayDirection[2] = spills->WorldRayDirection[2];
+
+  runtimeData->ShaderRecordOffset = spills->ShaderRecordOffset;
+}
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Intrinsics for the fallback layer
+//
+//////////////////////////////////////////////////////////////////////////
+
+// Runs the state machine: creates the stack, sets the launch parameters and
+// then calls rewrite_dispatch() with the current state ID until a negative
+// state ID comes back.
+//   initialStateId - state ID passed to the first rewrite_dispatch() call
+//   dimx, dimy     - forwarded to rewrite_setLaunchParams()
+void fb_Fallback_Scheduler(int initialStateId, unsigned dimx, unsigned dimy)
+{
+  StackType* theStack = rewrite_createStack();
+  RuntimeData theRuntimeData;
+  RuntimeDataType runtimeData = &theRuntimeData;
+
+  rewrite_setLaunchParams(runtimeData, dimx, dimy);
+  // Nothing to do when the index lies outside the dispatch dimensions.
+  if(REF(DispatchRaysIndex[0]) >= REF(DispatchRaysDimensions[0]) ||
+     REF(DispatchRaysIndex[1]) >= REF(DispatchRaysDimensions[1]))
+  { 
+    return;
+  }
+
+
+  // Set final return stateID into reserved area at stack top.
+  // NOTE(review): -1 is presumably what rewrite_dispatch() hands back once the
+  // outermost state returns, which ends the loop below - confirm.
+  unsigned stackSize = rewrite_getStackSize();
+  stackInit(runtimeData, theStack, stackSize);
+  int stackFrameOffs = stackFrameOffset(runtimeData);
+  *stackIntPtr(runtimeData, stackFrameOffs, 0) = -1;
+
+  // Each dispatch returns the next state to run; a negative ID stops the loop.
+  int stateId = initialStateId;
+  while( stateId >= 0 )
+  {
+    stateId = rewrite_dispatch(runtimeData, stateId);
+  }
+}
+
+// Stores the dispatch index (DTidx, DTidy), the dispatch dimensions
+// (dimx, dimy) and the group index in runtimeData.
+void fb_Fallback_SetLaunchParams(RuntimeDataType runtimeData, unsigned DTidx, unsigned DTidy, unsigned dimx, unsigned dimy, unsigned groupIndex)
+{
+  runtimeData->DispatchRaysIndex[0] = DTidx;
+  runtimeData->DispatchRaysIndex[1] = DTidy;
+
+  runtimeData->DispatchRaysDimensions[0] = dimx;
+  runtimeData->DispatchRaysDimensions[1] = dimy;
+
+  runtimeData->GroupIndex = groupIndex;
+}
+
+// Loads a new ray into runtimeData: flags, world-space origin (ox, oy, oz) and
+// direction (dx, dy, dz), RayTMin = tmin and RayTCurrent = tmax. Also switches
+// PayloadOffset to newPayloadOffset.
+// Returns the PayloadOffset that was active before the call, so it can be
+// handed to fb_Fallback_TraceRayEnd().
+int fb_Fallback_TraceRayBegin(RuntimeDataType runtimeData, unsigned rayFlags, float ox, float oy, float oz, float tmin, float dx, float dy, float dz, float tmax, int newPayloadOffset)
+{
+  runtimeData->RayFlags = rayFlags;
+
+  runtimeData->WorldRayOrigin[0] = ox;
+  runtimeData->WorldRayOrigin[1] = oy;
+  runtimeData->WorldRayOrigin[2] = oz;
+
+  runtimeData->WorldRayDirection[0] = dx;
+  runtimeData->WorldRayDirection[1] = dy;
+  runtimeData->WorldRayDirection[2] = dz;
+
+  // The current T starts out at the far end of the ray interval.
+  runtimeData->RayTCurrent = tmax;
+  runtimeData->RayTMin     = tmin;
+
+  const int previousPayloadOffset = runtimeData->PayloadOffset;
+  runtimeData->PayloadOffset = newPayloadOffset;
+  return previousPayloadOffset;
+}
+
+// Sets PayloadOffset back to oldPayloadOffset (the value returned by
+// fb_Fallback_TraceRayBegin()).
+void fb_Fallback_TraceRayEnd(RuntimeDataType runtimeData, int oldPayloadOffset)
+{
+  runtimeData->PayloadOffset = oldPayloadOffset;
+}
+
+// Fills in all pending hit values: shader record offset, primitive index,
+// instance index, instance ID, ray T and hit kind.
+void fb_Fallback_SetPendingTriVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID, float t, unsigned hitKind)
+{
+  runtimeData->PendingShaderRecordOffset = shaderRecordOffset;
+  runtimeData->PendingPrimitiveIndex     = primitiveIndex;
+  runtimeData->PendingInstanceIndex      = instanceIndex;
+  runtimeData->PendingInstanceID         = instanceID;
+  runtimeData->PendingRayTCurrent        = t;
+  runtimeData->PendingHitKind            = hitKind;
+}
+
+// Fills in the pending shader record offset, primitive index, instance index
+// and instance ID. Unlike fb_Fallback_SetPendingTriVals(), the pending ray T
+// and hit kind are left untouched.
+void fb_Fallback_SetPendingCustomVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID)
+{
+  runtimeData->PendingShaderRecordOffset = shaderRecordOffset;
+  runtimeData->PendingPrimitiveIndex     = primitiveIndex;
+  runtimeData->PendingInstanceIndex      = instanceIndex;
+  runtimeData->PendingInstanceID         = instanceID;
+}
+
+// Makes the pending hit the committed hit: copies every pending hit value into
+// its committed counterpart and exchanges the committed and pending attribute
+// offsets.
+void fb_Fallback_CommitHit(RuntimeDataType runtimeData)
+{
+  runtimeData->RayTCurrent        = runtimeData->PendingRayTCurrent;
+  runtimeData->ShaderRecordOffset = runtimeData->PendingShaderRecordOffset;
+  runtimeData->PrimitiveIndex     = runtimeData->PendingPrimitiveIndex;
+  runtimeData->InstanceIndex      = runtimeData->PendingInstanceIndex;
+  runtimeData->InstanceID         = runtimeData->PendingInstanceID;
+  runtimeData->HitKind            = runtimeData->PendingHitKind;
+
+  // Swap the two attribute offsets instead of copying attribute data.
+  const int newlyCommitted = runtimeData->PendingAttrOffset;
+  runtimeData->PendingAttrOffset   = runtimeData->CommittedAttrOffset;
+  runtimeData->CommittedAttrOffset = newlyCommitted;
+}
+
+
+// Returns the stack element at absolute index 'offset' (not relative to
+// StackOffset). The index is not range-checked.
+int fb_Fallback_RuntimeDataLoadInt(RuntimeDataType runtimeData, int offset)
+{
+  StackType* const stack = runtimeData->Stack;
+  return (*stack)[offset];
+}
+
+// Writes 'val' to the stack element at absolute index 'offset' (not relative
+// to StackOffset). The index is not range-checked.
+void fb_Fallback_RuntimeDataStoreInt(RuntimeDataType runtimeData, int offset, int val)
+{
+  StackType* const stack = runtimeData->Stack;
+  (*stack)[offset] = val;
+}
+
+// Returns component i of DispatchRaysIndex (a two-element array; i is not
+// range-checked).
+unsigned fb_dxop_dispatchRaysIndex(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->DispatchRaysIndex[i];
+}
+
+// Returns component i of DispatchRaysDimensions (a two-element array; i is not
+// range-checked).
+unsigned fb_dxop_dispatchRaysDimensions(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->DispatchRaysDimensions[i];
+}
+
+// Returns RayTMin (same value as fb_Fallback_RayTMin()).
+float fb_dxop_rayTMin(RuntimeDataType runtimeData)
+{
+  return runtimeData->RayTMin;
+}
+
+// Returns RayTMin.
+float fb_Fallback_RayTMin(RuntimeDataType runtimeData)
+{
+  return runtimeData->RayTMin;
+}
+
+// Sets RayTMin to t.
+void fb_Fallback_SetRayTMin(RuntimeDataType runtimeData, float t)
+{
+  runtimeData->RayTMin = t;
+}
+
+// Returns RayTCurrent (same value as fb_Fallback_RayTCurrent()).
+float fb_dxop_rayTCurrent(RuntimeDataType runtimeData)
+{
+  return runtimeData->RayTCurrent;
+}
+
+// Returns RayTCurrent.
+float fb_Fallback_RayTCurrent(RuntimeDataType runtimeData)
+{
+  return runtimeData->RayTCurrent;
+}
+
+// Sets RayTCurrent to t.
+void fb_Fallback_SetRayTCurrent(RuntimeDataType runtimeData, float t)
+{
+  runtimeData->RayTCurrent = t;
+}
+
+// Returns RayFlags (same value as fb_Fallback_RayFlags()).
+unsigned fb_dxop_rayFlags(RuntimeDataType runtimeData)
+{
+  return runtimeData->RayFlags;
+}
+
+// Returns RayFlags.
+unsigned fb_Fallback_RayFlags(RuntimeDataType runtimeData)
+{
+  return runtimeData->RayFlags;
+}
+
+// Sets RayFlags to flags.
+void fb_Fallback_SetRayFlags(RuntimeDataType runtimeData, unsigned flags)
+{
+  runtimeData->RayFlags = flags;
+}
+
+// Returns component i of WorldRayOrigin (a three-element array; i is not
+// range-checked).
+float fb_dxop_worldRayOrigin(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->WorldRayOrigin[i];
+}
+
+// Returns component i of WorldRayOrigin (i is not range-checked).
+float fb_Fallback_WorldRayOrigin(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->WorldRayOrigin[i];
+}
+
+// Sets WorldRayOrigin to (x, y, z).
+void fb_Fallback_SetWorldRayOrigin(RuntimeDataType runtimeData, float x, float y, float z)
+{
+  runtimeData->WorldRayOrigin[0] = x;
+  runtimeData->WorldRayOrigin[1] = y;
+  runtimeData->WorldRayOrigin[2] = z;
+}
+
+// Returns component i of WorldRayDirection (a three-element array; i is not
+// range-checked).
+float fb_dxop_worldRayDirection(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->WorldRayDirection[i];
+}
+
+// Returns component i of WorldRayDirection (i is not range-checked).
+float fb_Fallback_WorldRayDirection(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->WorldRayDirection[i];
+}
+
+// Sets WorldRayDirection to (x, y, z).
+void fb_Fallback_SetWorldRayDirection(RuntimeDataType runtimeData, float x, float y, float z)
+{
+  runtimeData->WorldRayDirection[0] = x;
+  runtimeData->WorldRayDirection[1] = y;
+  runtimeData->WorldRayDirection[2] = z;
+}
+
+// Returns component i of ObjectRayOrigin (a three-element array; i is not
+// range-checked).
+float fb_dxop_objectRayOrigin(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->ObjectRayOrigin[i];
+}
+
+// Returns component i of ObjectRayOrigin (i is not range-checked).
+float fb_Fallback_ObjectRayOrigin(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->ObjectRayOrigin[i];
+}
+
+// Sets ObjectRayOrigin to (x, y, z).
+void fb_Fallback_SetObjectRayOrigin(RuntimeDataType runtimeData, float x, float y, float z)
+{
+  runtimeData->ObjectRayOrigin[0] = x;
+  runtimeData->ObjectRayOrigin[1] = y;
+  runtimeData->ObjectRayOrigin[2] = z;
+}
+
+// Returns component i of ObjectRayDirection (a three-element array; i is not
+// range-checked).
+float fb_dxop_objectRayDirection(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->ObjectRayDirection[i];
+}
+
+// Returns component i of ObjectRayDirection (i is not range-checked).
+float fb_Fallback_ObjectRayDirection(RuntimeDataType runtimeData, byte i)
+{
+  return runtimeData->ObjectRayDirection[i];
+}
+
+// Sets ObjectRayDirection to (x, y, z).
+void fb_Fallback_SetObjectRayDirection(RuntimeDataType runtimeData, float x, float y, float z)
+{
+  runtimeData->ObjectRayDirection[0] = x;
+  runtimeData->ObjectRayDirection[1] = y;
+  runtimeData->ObjectRayDirection[2] = z;
+}
+
+// Returns element (r, c) of ObjectToWorld, which is stored as a flat array of
+// 12 floats indexed by r * 4 + c. Neither r nor c is range-checked.
+float fb_dxop_objectToWorld(RuntimeDataType runtimeData, int r, byte c)
+{
+  return runtimeData->ObjectToWorld[r * 4 + c];
+}
+
+// Copies the 12 components of M into ObjectToWorld, element by element.
+void fb_Fallback_SetObjectToWorld(RuntimeDataType runtimeData, float12 M)
+{
+  runtimeData->ObjectToWorld[0]  = M[0];
+  runtimeData->ObjectToWorld[1]  = M[1];
+  runtimeData->ObjectToWorld[2]  = M[2];
+  runtimeData->ObjectToWorld[3]  = M[3];
+
+  runtimeData->ObjectToWorld[4]  = M[4];
+  runtimeData->ObjectToWorld[5]  = M[5];
+  runtimeData->ObjectToWorld[6]  = M[6];
+  runtimeData->ObjectToWorld[7]  = M[7];
+
+  runtimeData->ObjectToWorld[8]  = M[8];
+  runtimeData->ObjectToWorld[9]  = M[9];
+  runtimeData->ObjectToWorld[10] = M[10];
+  runtimeData->ObjectToWorld[11] = M[11];
+}
+
+// Returns element (r, c) of WorldToObject, which is stored as a flat array of
+// 12 floats indexed by r * 4 + c. Neither r nor c is range-checked.
+float fb_dxop_worldToObject(RuntimeDataType runtimeData, int r, byte c)
+{
+  return runtimeData->WorldToObject[r * 4 + c];
+}
+
+// Copies the 12 components of M into WorldToObject, element by element.
+void fb_Fallback_SetWorldToObject(RuntimeDataType runtimeData, float12 M)
+{
+  runtimeData->WorldToObject[0]  = M[0];
+  runtimeData->WorldToObject[1]  = M[1];
+  runtimeData->WorldToObject[2]  = M[2];
+  runtimeData->WorldToObject[3]  = M[3];
+
+  runtimeData->WorldToObject[4]  = M[4];
+  runtimeData->WorldToObject[5]  = M[5];
+  runtimeData->WorldToObject[6]  = M[6];
+  runtimeData->WorldToObject[7]  = M[7];
+
+  runtimeData->WorldToObject[8]  = M[8];
+  runtimeData->WorldToObject[9]  = M[9];
+  runtimeData->WorldToObject[10] = M[10];
+  runtimeData->WorldToObject[11] = M[11];
+}
+
+// Returns PrimitiveIndex. The function is named "primitiveID" although the
+// member it reads is called PrimitiveIndex.
+unsigned fb_dxop_primitiveID(RuntimeDataType runtimeData)
+{
+  return runtimeData->PrimitiveIndex;
+}
+
+// Returns PrimitiveIndex.
+unsigned fb_Fallback_PrimitiveIndex(RuntimeDataType runtimeData)
+{
+  return runtimeData->PrimitiveIndex;
+}
+
+// Sets PrimitiveIndex to i.
+void fb_Fallback_SetPrimitiveIndex(RuntimeDataType runtimeData, unsigned i)
+{
+  runtimeData->PrimitiveIndex = i;
+}
+
+// Returns ShaderRecordOffset.
+unsigned fb_Fallback_ShaderRecordOffset(RuntimeDataType runtimeData)
+{
+  return runtimeData->ShaderRecordOffset;
+}
+
+// Sets ShaderRecordOffset to shaderRecordOffset.
+void fb_Fallback_SetShaderRecordOffset(RuntimeDataType runtimeData, unsigned shaderRecordOffset)
+{
+  runtimeData->ShaderRecordOffset = shaderRecordOffset;
+}
+
+// Returns InstanceIndex (same value as fb_Fallback_InstanceIndex()).
+unsigned fb_dxop_instanceIndex(RuntimeDataType runtimeData)
+{
+  return runtimeData->InstanceIndex;
+}
+
+// Returns InstanceIndex.
+unsigned fb_Fallback_InstanceIndex(RuntimeDataType runtimeData)
+{
+  return runtimeData->InstanceIndex;
+}
+
+// Sets InstanceIndex to i.
+void fb_Fallback_SetInstanceIndex(RuntimeDataType runtimeData, unsigned i)
+{
+  runtimeData->InstanceIndex = i;
+}
+
+// Returns InstanceID (same value as fb_Fallback_InstanceID()).
+unsigned fb_dxop_instanceID(RuntimeDataType runtimeData)
+{
+  return runtimeData->InstanceID;
+}
+
+// Returns InstanceID.
+unsigned fb_Fallback_InstanceID(RuntimeDataType runtimeData)
+{
+  return runtimeData->InstanceID;
+}
+
+// Sets InstanceID to i.
+void fb_Fallback_SetInstanceID(RuntimeDataType runtimeData, unsigned i)
+{
+  runtimeData->InstanceID = i;
+}
+
+// Returns HitKind (same value as fb_Fallback_HitKind()).
+unsigned fb_dxop_hitKind(RuntimeDataType runtimeData)
+{
+  return runtimeData->HitKind;
+}
+
+// Returns HitKind.
+unsigned fb_Fallback_HitKind(RuntimeDataType runtimeData)
+{
+  return runtimeData->HitKind;
+}
+
+// Sets HitKind to i.
+void fb_Fallback_SetHitKind(RuntimeDataType runtimeData, unsigned i)
+{
+  runtimeData->HitKind = i;
+}
+
+// Returns PendingRayTCurrent, the ray T of the hit that is not yet committed.
+float fb_dxop_pending_rayTCurrent(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingRayTCurrent;
+}
+
+// Sets PendingRayTCurrent to t.
+void fb_Fallback_SetPendingRayTCurrent(RuntimeDataType runtimeData, float t)
+{
+  runtimeData->PendingRayTCurrent = t;
+}
+
+// Returns PendingPrimitiveIndex. The function is named "primitiveID" although
+// the member it reads is called PendingPrimitiveIndex.
+unsigned fb_dxop_pending_primitiveID(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingPrimitiveIndex;
+}
+
+// Returns PendingShaderRecordOffset.
+unsigned fb_Fallback_PendingShaderRecordOffset(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingShaderRecordOffset;
+}
+
+// Returns PendingInstanceIndex.
+unsigned fb_dxop_pending_instanceIndex(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingInstanceIndex;
+}
+
+// Returns PendingInstanceID.
+unsigned fb_dxop_pending_instanceID(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingInstanceID;
+}
+
+// Returns PendingHitKind.
+unsigned fb_dxop_pending_hitKind(RuntimeDataType runtimeData)
+{
+  return runtimeData->PendingHitKind;
+}
+
+// Sets PendingHitKind to i.
+void fb_Fallback_SetPendingHitKind(RuntimeDataType runtimeData, unsigned i)
+{
+  runtimeData->PendingHitKind = i;
+}
+
+// Returns GroupIndex; the member is an int and is converted to unsigned on
+// return.
+unsigned fb_Fallback_GroupIndex(RuntimeDataType runtimeData)
+{
+  return runtimeData->GroupIndex;
+}
+
+// Returns AnyHitResult.
+int fb_Fallback_AnyHitResult(RuntimeDataType runtimeData)
+{
+  return runtimeData->AnyHitResult;
+}
+
+// Sets AnyHitResult to result.
+void fb_Fallback_SetAnyHitResult(RuntimeDataType runtimeData, int result)
+{
+  runtimeData->AnyHitResult = result;
+}
+
+// Returns AnyHitStateId.
+int fb_Fallback_AnyHitStateId(RuntimeDataType runtimeData)
+{
+  return runtimeData->AnyHitStateId;
+}
+
+// Sets AnyHitStateId to id.
+void fb_Fallback_SetAnyHitStateId(RuntimeDataType runtimeData, int id)
+{
+  runtimeData->AnyHitStateId = id;
+}

+ 9 - 0
lib/DxrFallback/runtime/script.cmd

@@ -0,0 +1,9 @@
+@rem Compiles runtime.c to LLVM-IR with clang, runs opt's mem2reg pass over it
+@rem and then runs rewriteRuntime.py, which reads runtime.opt.ll.
+@setlocal
+@set BINPATH=C:\Program Files\LLVM3.7\bin
+@set CLANG="%BINPATH%\clang"
+@set OPT="%BINPATH%\opt"
+
+@rem Stop at the first failing step so that a stale runtime.ll or
+@rem runtime.opt.ll from an earlier run is never fed into the next stage.
+%CLANG% -S -emit-llvm -target nvptx runtime.c 
+@if errorlevel 1 exit /b 1
+%OPT% -S -mem2reg  runtime.ll -o runtime.opt.ll
+@if errorlevel 1 exit /b 1
+python rewriteRuntime.py

+ 1 - 0
lib/HLSL/CMakeLists.txt

@@ -26,6 +26,7 @@ add_llvm_library(LLVMHLSL
   DxilPreparePasses.cpp
   DxilRemoveDiscards.cpp
   DxilReduceMSAAToSingleSample.cpp
+  DxilPatchShaderRecordBindings.cpp
   DxilPreserveAllOutputs.cpp
   DxilResource.cpp
   DxilResourceBase.cpp

+ 1155 - 0
lib/HLSL/DxilPatchShaderRecordBindings.cpp

@@ -0,0 +1,1155 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilPatchShaderRecordBindings.cpp                                        //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides a pass used by the RayTracing Fallback Layer to modify           //
+// bindings to pull local root signature parameters from a global            //
+// "shader table" buffer instead                                             //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilGenerationPass.h"
+#include "dxc/HLSL/DxilFallbackLayerPass.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilSignatureElement.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/Support/Global.h"
+
+#include "dxc/Support/Unicode.h"
+#include "dxc/HLSL/DxilTypeSystem.h"
+#include "dxc/HLSL/DxilConstants.h"
+#include "dxc/HLSL/DxilInstructions.h"
+#include "dxc/HLSL/DxilSpanAllocator.h"
+#include "dxc/HLSL/DxilRootSignature.h"
+#include "dxc/HLSL/DxilUtil.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
+#include <memory>
+#include <unordered_set>
+#include <functional>
+#include <unordered_map>
+#include <array>
+
+struct D3D12_VERSIONED_ROOT_SIGNATURE_DESC;
+#include "DxilPatchShaderRecordBindingsShared.h"
+
+
+using namespace llvm;
+using namespace hlsl;
+
+
+// Two ViewKeys compare equal when their object representations are
+// byte-for-byte identical.
+bool operator==(const ViewKey &a, const ViewKey &b) {
+  const int difference = memcmp(&a, &b, sizeof(ViewKey));
+  return difference == 0;
+}
+
+// Byte sizes used for two kinds of shader record entries; both are defined as
+// the size of a 64-bit integer. Judging by the names they correspond to a D3D12
+// GPU virtual address and a D3D12 GPU descriptor handle.
+const size_t SizeofD3D12GpuVA = sizeof(uint64_t);
+const size_t SizeofD3D12GpuDescriptorHandle = sizeof(uint64_t);
+
+// Creates a new externally-linked function named Name in llvmModule with the
+// same function type as Orig, clones Orig's body into it and copies Orig's
+// function annotation onto the clone. Returns the clone.
+Function *CloneFunction(Function *Orig,
+    const llvm::Twine &Name,
+    llvm::Module *llvmModule) {
+
+    Function *Clone = Function::Create(Orig->getFunctionType(),
+        GlobalValue::LinkageTypes::ExternalLinkage,
+        Name, llvmModule);
+
+    SmallVector<ReturnInst *, 2> ClonedReturns;
+    ValueToValueMapTy ArgumentMap;
+    // Pair each parameter of Orig with the clone's parameter at the same position.
+    auto CloneArgIt = Clone->arg_begin();
+    for (Argument &OrigArg : Orig->args()) {
+        ArgumentMap[&OrigArg] = &*CloneArgIt;
+        ++CloneArgIt;
+    }
+
+    DxilModule &DM = llvmModule->GetOrCreateDxilModule();
+
+    llvm::CloneFunctionInto(Clone, Orig, ArgumentMap, /*ModuleLevelChanges*/ false, ClonedReturns);
+    auto &TypeSystem = DM.GetTypeSystem();
+    TypeSystem.CopyFunctionAnnotation(Clone, Orig, TypeSystem);
+
+    // NOTE(review): this tests the freshly created clone rather than Orig, so
+    // the entry props may never be cloned - confirm whether Orig was intended.
+    const bool CloneHasProps = DM.HasDxilFunctionProps(Clone);
+    if (CloneHasProps) {
+        DM.CloneDxilEntryProps(Orig, Clone);
+    }
+    return Clone;
+}
+
+
+// Describes where one root signature parameter lives inside a shader record.
+struct ShaderRecordEntry {
+  DxilRootParameterType ParameterType;
+  unsigned int RecordOffsetInBytes;
+  unsigned int OffsetInDescriptors; // Only valid for descriptor tables
+
+  // Sentinel entry: ParameterType and RecordOffsetInBytes are set to -1 and
+  // OffsetInDescriptors to 0 (previously zeroed implicitly by aggregate init).
+  static ShaderRecordEntry InvalidEntry() { return { (DxilRootParameterType)-1, (unsigned int)-1, 0u }; }
+  // True when ParameterType holds the sentinel written by InvalidEntry().
+  // Const so it can also be called on const entries.
+  bool IsInvalid() const { return (unsigned int)ParameterType == (unsigned int)-1; }
+};
+
+struct D3D12_VERSIONED_ROOT_SIGNATURE_DESC;
+// Module pass that patches the resource bindings of one ray tracing shader.
+// The shader to patch and its root signature are supplied through the
+// "root-signature" pass option (see applyOptions).
+class DxilPatchShaderRecordBindings : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilPatchShaderRecordBindings() : ModulePass(ID) {}
+  const char *getPassName() const override { return "DXIL Patch Shader Record Binding"; }
+  void applyOptions(PassOptions O) override;
+  bool runOnModule(Module &M) override;
+
+private:
+  // Throws when the pass options did not supply shader info with a root signature.
+  void ValidateParameters();
+  void AddInputBinding(Module &M);
+  void PatchShaderBindings(Module &M);
+  void InitializeViewTable();
+
+  // Helpers that register new resources with the DxilModule; each returns the
+  // ID of the resource that was added.
+  unsigned int AddSRVRawBuffer(Module &M, unsigned int registerIndex, unsigned int registerSpace, const std::string &bufferName);
+  unsigned int AddHandle(Module &M, unsigned int baseRegisterIndex, unsigned int rangeSize, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type = nullptr, unsigned int constantBufferSize = 0);
+  unsigned int AddAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type);
+  unsigned int AddCBufferAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, const std::string &bufferName);
+
+  // Helpers that emit IR through the supplied builder.
+  llvm::Value *CreateOffsetToShaderRecord(Module &M, IRBuilder<> &Builder, unsigned int RecordOffsetInBytes, llvm::Value *CbufferOffsetInBytes);
+  llvm::Value *CreateShaderRecordBufferLoad(Module &M, IRBuilder<> &Builder, llvm::Value *ShaderRecordOffsetInBytes, llvm::Type* type);
+  llvm::Value *CreateCBufferLoadOffsetInBytes(Module &M, IRBuilder<> &Builder, llvm::Instruction *instruction);
+  llvm::Value *CreateCBufferLoadLegacy(Module &M, IRBuilder<> &Builder, llvm::Value *ResourceHandle, unsigned int RowToLoad = 0);
+
+  llvm::Value *LoadShaderRecordData(Module &M, IRBuilder<> &Builder,
+                                    llvm::Value *offsetToShaderRecord,
+                                    unsigned int dataOffsetInShaderRecord);
+
+  void PatchCreateHandleToUseDescriptorIndex(
+      _In_ Module &M,
+      _In_ IRBuilder<> &Builder,
+      _In_ DXIL::ResourceKind &resourceKind,
+      _In_ DXIL::ResourceClass &resourceClass,
+      _In_ llvm::Type *resourceType,
+      _In_ llvm::Value *descriptorIndex,
+      _Inout_ DxilInst_CreateHandleForLib &createHandleInstr);
+
+
+  // Looks up the resource behind a CreateHandleForLib call; returns false
+  // (leaving the outputs untouched) when no resource matches.
+  bool GetHandleInfo(
+    Module &M, 
+    DxilInst_CreateHandleForLib &createHandleStructForLib, 
+    _Out_ unsigned int &shaderRegister, 
+    _Out_ unsigned int &registerSpace, 
+    _Out_ DXIL::ResourceKind &kind, 
+    _Out_ DXIL::ResourceClass &resClass,
+    _Out_ llvm::Type *&resType);
+
+  llvm::Value * GetAliasedDescriptorHeapHandle(Module &M, llvm::Type *, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind);
+
+  unsigned int GetConstantBufferOffsetToShaderRecord();
+
+  bool IsCBufferLoad(llvm::Instruction *instruction);
+
+  // Unlike the LLVM version of this function, this does not require the InstructionToReplace and the ValueToReplaceWith to be the same instruction type
+  static void ReplaceUsesOfWith(llvm::Instruction *InstructionToReplace, llvm::Value *ValueToReplaceWith);
+
+  static ShaderRecordEntry FindRootSignatureDescriptor(const DxilVersionedRootSignatureDesc &rootSignatureDescriptor, unsigned int ShaderRecordIdentifierSizeInBytes, DXIL::ResourceClass resourceClass, unsigned int baseRegisterIndex, unsigned int registerSpace);
+
+  // TODO: I would like to see these prefixed with m_
+  llvm::Value *ShaderTableHandle = nullptr;
+  llvm::Value *DispatchRaysConstantsHandle = nullptr;
+  llvm::Value *BaseShaderRecordOffset = nullptr;
+
+  // Size of TypeToAliasedDescriptorHeap, which is indexed by the numeric value
+  // of DXIL::ResourceClass.
+  static const unsigned int NumViewTypes = 4;
+  // Hash for ViewKey that combines the ViewType and StructuredStride fields.
+  struct ViewKeyHasher
+  {
+  public:
+      std::size_t operator()(const ViewKey &x) const {
+        return std::hash<unsigned int>()((unsigned int)x.ViewType) ^ 
+            std::hash<unsigned int>()((unsigned int)x.StructuredStride);
+      }
+  };
+
+
+  // Per resource class cache of the aliased descriptor heap symbol created for
+  // each view key.
+  std::unordered_map<ViewKey, llvm::Value *, ViewKeyHasher>
+      TypeToAliasedDescriptorHeap[NumViewTypes];
+
+  // The members below are set by applyOptions / runOnModule. They start out
+  // null / Invalid so that ValidateParameters can detect missing options
+  // instead of reading indeterminate values.
+  llvm::Function *EntryPointFunction = nullptr;
+
+  ShaderInfo *pInputShaderInfo = nullptr;
+  DxilVersionedRootSignatureDesc *pRootSignatureDesc = nullptr;
+  DXIL::ShaderKind ShaderKind = DXIL::ShaderKind::Invalid;
+};
+
+// Definition of the pass's static ID member; its address is what the
+// ModulePass base constructor receives.
+char DxilPatchShaderRecordBindings::ID = 0;
+
+// Single failure hook for this file: always throws a std::exception.
+// TODO: Find the right thing to do on failure
+void ThrowFailure() {
+  std::exception failure;
+  throw failure;
+}
+
+// TODO: Stolen from Brandon's code, merge
+// Strips the mangling decoration from a name: when the name starts with
+// "\x1?" and contains "@@", returns the text between the two-character prefix
+// and the first "@@". Any other name is returned unchanged.
+static inline std::string GetUnmangledName(StringRef name) {
+  const bool hasMangledPrefix = name.startswith("\x1?");
+  const size_t suffixStart = hasMangledPrefix ? name.find("@@") : StringRef::npos;
+  if (suffixStart == StringRef::npos) {
+    return name.str();
+  }
+
+  const StringRef bareName = name.substr(2, suffixStart - 2);
+  return bareName.str();
+}
+
+// Returns the first function in M whose unmangled name, converted to UTF-16,
+// equals exportName, or nullptr when there is none.
+static Function* getFunctionFromName(Module &M, const std::wstring& exportName) {
+  for (Function &candidate : M) {
+    const std::string unmangledName = GetUnmangledName(candidate.getName());
+    const std::wstring wideName = Unicode::UTF8ToUTF16StringOrThrow(unmangledName.c_str());
+    if (wideName == exportName) {
+      return &candidate;
+    }
+  }
+  return nullptr;
+}
+
+// Factory for the pass; returns a newly heap-allocated instance.
+ModulePass *llvm::createDxilPatchShaderRecordBindingsPass() {
+  ModulePass *newPass = new DxilPatchShaderRecordBindings();
+  return newPass;
+}
+
+INITIALIZE_PASS(DxilPatchShaderRecordBindings, "hlsl-dxil-patch-shader-record-bindings", "Patch shader record bindings to instead pull from the fallback provided bindings", false, false)
+
+// Reads the "root-signature" pass option. Its value is the address of a
+// ShaderInfo structure written as hexadecimal text; the address is stored in
+// pInputShaderInfo and the root signature it refers to in pRootSignatureDesc.
+// A value that parses to a null address leaves pRootSignatureDesc null, so
+// that ValidateParameters reports the failure instead of this function
+// dereferencing a null pointer.
+void DxilPatchShaderRecordBindings::applyOptions(PassOptions O) {
+  for (const auto & option : O) {
+    if (0 != option.first.compare("root-signature")) {
+      continue;
+    }
+
+    const unsigned int cHexRadix = 16;
+    // A StringRef is not guaranteed to be NUL-terminated, which strtoull
+    // requires, so parse from an owned copy of the text.
+    const std::string addressText = option.second.str();
+    pInputShaderInfo = (ShaderInfo*)strtoull(addressText.c_str(), nullptr, cHexRadix);
+    pRootSignatureDesc = pInputShaderInfo
+        ? (DxilVersionedRootSignatureDesc*)pInputShaderInfo->pRootSignatureDesc
+        : nullptr;
+  }
+}
+
+// Adds a struct annotation for StructTy unless one already exists. The new
+// annotation describes numFields consecutive 32-bit integer fields named
+// FieldName followed by the field index, and a cbuffer size of 4 * numFields.
+void AddAnnoationsIfNeeded(DxilModule &DM, llvm::StructType *StructTy, const std::string &FieldName, unsigned int numFields = 1)
+{
+    auto &typeSystem = DM.GetTypeSystem();
+    if (typeSystem.GetStructAnnotation(StructTy) != nullptr)
+    {
+        // Already annotated, nothing to add
+        return;
+    }
+
+    auto pNewAnnotation = typeSystem.AddStructAnnotation(StructTy);
+    pNewAnnotation->SetCBufferSize(sizeof(uint32_t) * numFields);
+    for (unsigned int fieldIndex = 0; fieldIndex < numFields; fieldIndex++)
+    {
+        auto &fieldAnnotation = pNewAnnotation->GetFieldAnnotation(fieldIndex);
+        fieldAnnotation.SetCBufferOffset(sizeof(uint32_t) * fieldIndex);
+        fieldAnnotation.SetCompType(hlsl::DXIL::ComponentType::I32);
+        fieldAnnotation.SetFieldName(FieldName + std::to_string(fieldIndex));
+    }
+}
+
+// Registers a new resource of class resClass (SRV, UAV, CBuffer or Sampler)
+// with the module's DxilModule and returns its ID.
+//   baseRegisterIndex / rangeSize / registerSpace - binding of the resource
+//   resKind            - resource kind recorded on the resource
+//   bufferName         - global name; an existing global with this name is
+//                        reused as the symbol, otherwise one is created
+//   type               - type of the global; when null, the
+//                        "struct.ByteAddressBuffer" type is used (and created
+//                        with an annotation if the module does not have it)
+//   constantBufferSize - size recorded on the resource, CBuffer class only
+// Any other resource class ends in ThrowFailure().
+unsigned int DxilPatchShaderRecordBindings::AddHandle(Module &M, unsigned int baseRegisterIndex, unsigned int rangeSize, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type, unsigned int constantBufferSize) {
+  LLVMContext & Ctx = M.getContext();
+  DxilModule &DM = M.GetOrCreateDxilModule();
+
+  // Create the resource object for the requested class. The expected ID is the
+  // current size of that class's resource list.
+  unsigned int resourceHandle = 0;
+  std::unique_ptr<DxilResource> pHandle;
+  std::unique_ptr<DxilCBuffer> pCBuf;
+  std::unique_ptr<DxilSampler> pSampler;
+  DxilResourceBase *pBaseHandle = nullptr;
+  switch (resClass) {
+  case DXIL::ResourceClass::SRV:
+    resourceHandle = static_cast<unsigned int>(DM.GetSRVs().size());
+    pHandle = llvm::make_unique<DxilResource>();
+    pHandle->SetRW(false);
+    pBaseHandle = pHandle.get();
+    break;
+  case DXIL::ResourceClass::UAV:
+    resourceHandle = static_cast<unsigned int>(DM.GetUAVs().size());
+    pHandle = llvm::make_unique<DxilResource>();
+    pHandle->SetRW(true);
+    pBaseHandle = pHandle.get();
+    break;
+  case DXIL::ResourceClass::CBuffer:
+    resourceHandle = static_cast<unsigned int>(DM.GetCBuffers().size());
+    pCBuf = llvm::make_unique<DxilCBuffer>();
+    pCBuf->SetSize(constantBufferSize);
+    pBaseHandle = pCBuf.get();
+    break;
+  case DXIL::ResourceClass::Sampler:
+    resourceHandle = static_cast<unsigned int>(DM.GetSamplers().size());
+    pSampler = llvm::make_unique<DxilSampler>();
+    // TODO: Is this okay? What if one of the samplers in the table is a comparison sampler?
+    pSampler->SetSamplerKind(DxilSampler::SamplerKind::Default);
+    pBaseHandle = pSampler.get();
+    break;
+  default:
+    // No resource object exists for any other class; without this the code
+    // below would write through an unset pBaseHandle.
+    ThrowFailure();
+    break;
+  }
+
+  if (!type) {
+    SmallVector<llvm::Type*, 1> Elements{ Type::getInt32Ty(Ctx) };
+    std::string ByteAddressBufferName = "struct.ByteAddressBuffer";
+    type = M.getTypeByName(ByteAddressBufferName);
+    if (!type)
+    {
+        StructType *StructTy;
+        type = StructTy = StructType::create(Elements, ByteAddressBufferName);
+  
+        AddAnnoationsIfNeeded(DM, StructTy, ByteAddressBufferName);
+    }
+  }
+
+  // Reuse a global of the same name when the module already declares one
+  GlobalVariable *GV = M.getGlobalVariable(bufferName);
+  if (!GV) {
+    GV = cast<GlobalVariable>(M.getOrInsertGlobal(bufferName, type));
+  }
+
+  pBaseHandle->SetGlobalName(bufferName.c_str());
+  pBaseHandle->SetGlobalSymbol(GV);
+  pBaseHandle->SetID(resourceHandle);
+  pBaseHandle->SetSpaceID(registerSpace);
+  pBaseHandle->SetLowerBound(baseRegisterIndex);
+  pBaseHandle->SetRangeSize(rangeSize);
+  pBaseHandle->SetKind(resKind);
+
+  // Only SRVs and UAVs carry these extra properties
+  if (pHandle) {
+    pHandle->SetGloballyCoherent(false);
+    pHandle->SetHasCounter(false);
+    pHandle->SetCompType(CompType::getF32()); // TODO: Need to handle all types
+  }
+
+  // Hand ownership of the resource to the DxilModule
+  unsigned int ID = 0;
+  switch (resClass) {
+  case DXIL::ResourceClass::SRV:
+    ID = DM.AddSRV(std::move(pHandle));
+    break;
+  case DXIL::ResourceClass::UAV:
+    ID = DM.AddUAV(std::move(pHandle));
+    break;
+  case DXIL::ResourceClass::CBuffer:
+    ID = DM.AddCBuffer(std::move(pCBuf));
+    break;
+  case DXIL::ResourceClass::Sampler:
+    ID = DM.AddSampler(std::move(pSampler));
+    break;
+  default:
+    ThrowFailure();
+    break;
+  }
+
+  assert(ID == resourceHandle);
+  return ID;
+}
+
+// Returns the byte offset, within DispatchRaysConstants, of the shader record
+// stride field that applies to the current shader kind:
+// HitGroupShaderRecordStride for closest hit / any hit / intersection shaders
+// and MissShaderRecordStride for miss shaders. Other kinds call ThrowFailure().
+unsigned int DxilPatchShaderRecordBindings::GetConstantBufferOffsetToShaderRecord()
+{
+    const bool isHitGroupShader =
+        ShaderKind == DXIL::ShaderKind::ClosestHit ||
+        ShaderKind == DXIL::ShaderKind::AnyHit ||
+        ShaderKind == DXIL::ShaderKind::Intersection;
+    if (isHitGroupShader)
+    {
+        return offsetof(DispatchRaysConstants, HitGroupShaderRecordStride);
+    }
+    if (ShaderKind == DXIL::ShaderKind::Miss)
+    {
+        return offsetof(DispatchRaysConstants, MissShaderRecordStride);
+    }
+
+    ThrowFailure();
+    return -1;
+}
+
+
+// Adds a single-register raw buffer SRV named bufferName at the given register
+// and space, and returns its resource ID.
+unsigned int DxilPatchShaderRecordBindings::AddSRVRawBuffer(Module &M, unsigned int registerIndex, unsigned int registerSpace, const std::string &bufferName) {
+  const unsigned int singleRegisterRange = 1;
+  return AddHandle(
+      M,
+      registerIndex,
+      singleRegisterRange,
+      registerSpace,
+      DXIL::ResourceClass::SRV,
+      DXIL::ResourceKind::RawBuffer,
+      bufferName);
+}
+
+// Creates a struct type named bufferName holding one 32-bit integer and returns
+// an undef value whose type is a pointer to an array of that struct. The array
+// length is given as -1.
+llvm::Constant *GetArraySymbol(Module &M, const std::string &bufferName) {
+  LLVMContext &Context = M.getContext();
+
+  SmallVector<llvm::Type*, 1> Fields{ Type::getInt32Ty(Context) };
+  llvm::StructType *ElementTy = llvm::StructType::create(Fields, bufferName);
+  llvm::ArrayType *TableTy = ArrayType::get(ElementTy, -1);
+  llvm::Type *TablePtrTy = TableTy->getPointerTo();
+
+  return UndefValue::get(TablePtrTy);
+}
+
+// Adds a constant buffer resource with a range size of UINT_MAX starting at
+// baseRegisterIndex, sized at 4096 * 16 bytes, and returns its resource ID.
+unsigned int DxilPatchShaderRecordBindings::AddCBufferAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, const std::string &bufferName) {
+  const unsigned int maxConstantBufferSize = 4096 * 16;
+  llvm::Type *tableType = GetArraySymbol(M, bufferName)->getType();
+  return AddHandle(
+      M,
+      baseRegisterIndex,
+      UINT_MAX,
+      registerSpace,
+      DXIL::ResourceClass::CBuffer,
+      DXIL::ResourceKind::CBuffer,
+      bufferName,
+      tableType,
+      maxConstantBufferSize);
+}
+
+// Adds a resource of the given class and kind with a range size of UINT_MAX
+// starting at baseRegisterIndex, and returns its resource ID.
+unsigned int DxilPatchShaderRecordBindings::AddAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type) {
+  const unsigned int unboundedRangeSize = UINT_MAX;
+  return AddHandle(
+      M,
+      baseRegisterIndex,
+      unboundedRangeSize,
+      registerSpace,
+      resClass,
+      resKind,
+      bufferName,
+      type);
+}
+
+// TODO: Stolen from Brandon's code
+// Classifies F: a function carrying the "exp-shader" attribute is reported as
+// RayGeneration; otherwise the shader kind from its DXIL function props is
+// returned when those props exist and describe a ray shader; otherwise Invalid.
+DXIL::ShaderKind GetRayShaderKindCopy(Function* F)
+{
+    if (F->hasFnAttribute("exp-shader"))
+    {
+        return DXIL::ShaderKind::RayGeneration;
+    }
+
+    DxilModule& DM = F->getParent()->GetDxilModule();
+    const bool isRayShader = DM.HasDxilFunctionProps(F) && DM.GetDxilFunctionProps(F).IsRay();
+    if (!isRayShader)
+    {
+        return DXIL::ShaderKind::Invalid;
+    }
+    return DM.GetDxilFunctionProps(F).shaderKind;
+}
+
+// Converts a wide string to a narrow one by converting each wide character to
+// char individually; no encoding conversion is performed.
+static std::string ws2s(const std::wstring& wide)
+{
+    std::string narrow;
+    narrow.reserve(wide.size());
+    for (const wchar_t wideChar : wide)
+    {
+        narrow.push_back(static_cast<char>(wideChar));
+    }
+    return narrow;
+}
+
+// Pass entry point. Resolves the function to patch - the export named in the
+// shader info when one is given, otherwise the module's entry function -
+// determines its shader kind, patches its bindings and re-emits the DXIL
+// resources. Always returns true. Throws when the pass options are missing or
+// when the function to patch cannot be found.
+bool DxilPatchShaderRecordBindings::runOnModule(Module &M) {
+  // Must run first: everything below dereferences pInputShaderInfo
+  ValidateParameters();
+
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  EntryPointFunction = pInputShaderInfo->ExportName ? getFunctionFromName(M, pInputShaderInfo->ExportName) : DM.GetEntryFunction();
+  if (!EntryPointFunction) {
+    // No function matched; the calls below would dereference a null pointer
+    ThrowFailure();
+  }
+  ShaderKind = GetRayShaderKindCopy(EntryPointFunction);
+
+  InitializeViewTable();
+
+  PatchShaderBindings(M);
+  DM.ReEmitDxilResources();
+  return true;
+}
+
+// Throws std::exception unless shader info with a root signature description
+// has been supplied to the pass.
+void DxilPatchShaderRecordBindings::ValidateParameters() {
+  const bool hasShaderInfo = pInputShaderInfo != nullptr;
+  if (hasShaderInfo && pInputShaderInfo->pRootSignatureDesc) {
+    return;
+  }
+  throw std::exception();
+}
+
+// Returns the resource with the given id from the DxilModule list that matches
+// resClass. Any class other than CBuffer, SRV, UAV or Sampler calls
+// ThrowFailure().
+DxilResourceBase &GetResourceFromID(DxilModule &DM, DXIL::ResourceClass resClass, unsigned int id)
+{
+    switch (resClass)
+    {
+    case DXIL::ResourceClass::CBuffer:
+        return DM.GetCBuffer(id);
+    case DXIL::ResourceClass::SRV:
+        return DM.GetSRV(id);
+    case DXIL::ResourceClass::UAV:
+        return DM.GetUAV(id);
+    case DXIL::ResourceClass::Sampler:
+        return DM.GetSampler(id);
+    default:
+        ThrowFailure();
+        // Not reached because ThrowFailure throws; only present so that every
+        // path has a return statement.
+        return *(DxilResourceBase*)nullptr;
+    }
+}
+
+// Returns the index of key within the first numViews entries of pViewList.
+// When key is not present it is appended, numViews is incremented and the new
+// index is returned; ThrowFailure() is called if the list already holds
+// maxViews entries.
+unsigned int FindOrInsertViewIntoList(const ViewKey &key, ViewKey *pViewList, unsigned int &numViews, unsigned int maxViews)
+{
+    for (unsigned int existingIndex = 0; existingIndex < numViews; existingIndex++)
+    {
+        if (pViewList[existingIndex] == key)
+        {
+            return existingIndex;
+        }
+    }
+
+    // Not found: append at the end of the used range
+    const unsigned int newIndex = numViews;
+    if (newIndex >= maxViews) {
+        ThrowFailure();
+    }
+
+    pViewList[newIndex] = key;
+    numViews++;
+    return newIndex;
+}
+
+// Returns the global symbol of the aliased descriptor heap resource to use for
+// a view of the given type, class and kind, creating and caching the resource
+// on first use.
+// The cache key is built from the resource kind plus, for structured buffers,
+// the primitive size in bits of `type`, or, for kinds other than raw buffers,
+// the component type of the first contained type of `type`.
+// For SRVs and UAVs the key is also recorded in the shader info's register
+// space arrays, and its index there selects the register space offset and the
+// name of the new resource.
+llvm::Value *DxilPatchShaderRecordBindings::GetAliasedDescriptorHeapHandle(Module &M, llvm::Type *type, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind)
+{
+    DxilModule &DM = M.GetOrCreateDxilModule();
+    unsigned int resClassIndex = (unsigned int)resClass;
+    if (resClassIndex >= NumViewTypes)
+    {
+        // The cache array only has NumViewTypes slots
+        ThrowFailure();
+    }
+    
+    ViewKey key = {};
+    key.ViewType = (unsigned int)resKind;
+    if (resKind == DXIL::ResourceKind::StructuredBuffer)
+    {
+      key.StructuredStride = type->getPrimitiveSizeInBits();
+    } else if (resKind != DXIL::ResourceKind::RawBuffer)
+    {
+      auto containedType = type->getContainedType(0);
+      // If it's a vector, get the type of just a single element
+      if (containedType->getNumContainedTypes() > 0)
+      {
+        assert(containedType->getNumContainedTypes() <= 4);
+        containedType = containedType->getContainedType(0);
+      }
+      key.SRVComponentType = (unsigned int)CompType::GetCompType(containedType).GetKind();
+    }
+
+    auto &aliasedHeapsForClass = TypeToAliasedDescriptorHeap[resClassIndex];
+    auto aliasedDescriptorHeapHandle = aliasedHeapsForClass.find(key);
+    if (aliasedDescriptorHeapHandle != aliasedHeapsForClass.end())
+    {
+        // Already created for this key
+        return aliasedDescriptorHeapHandle->second;
+    }
+
+    unsigned int registerSpaceOffset = 0;
+    std::string HandleName;
+
+    if (resClass == DXIL::ResourceClass::SRV)
+    {
+      registerSpaceOffset = FindOrInsertViewIntoList(
+          key, 
+          pInputShaderInfo->pSRVRegisterSpaceArray, 
+          *pInputShaderInfo->pNumSRVSpaces, 
+          FallbackLayerNumDescriptorHeapSpacesPerView);
+
+      HandleName = std::string("SRVDescriptorHeapTable") +
+                   std::to_string(registerSpaceOffset);
+    }
+    else if (resClass == DXIL::ResourceClass::UAV)
+    {
+      registerSpaceOffset = FindOrInsertViewIntoList(
+          key,
+          pInputShaderInfo->pUAVRegisterSpaceArray,
+          *pInputShaderInfo->pNumUAVSpaces,
+          FallbackLayerNumDescriptorHeapSpacesPerView);
+
+      if (registerSpaceOffset == 0)
+      {
+          // Using the descriptor heap declared by the fallback for handling emulated pointers,
+          // make sure the name is an exact match
+          assert(key.ViewType == (unsigned int)hlsl::DXIL::ResourceKind::RawBuffer);
+          HandleName = "\01?DescriptorHeapBufferTable@@3PAURWByteAddressBuffer@@A";
+      }
+      else
+      {
+          HandleName = std::string("UAVDescriptorHeapTable") +
+              std::to_string(registerSpaceOffset);
+      }
+    }
+    else if (resClass == DXIL::ResourceClass::CBuffer)
+    {
+      HandleName = std::string("CBVDescriptorHeapTable");
+
+    } else {
+      HandleName = std::string("SamplerDescriptorHeapTable");
+    }
+
+
+    llvm::ArrayType *descriptorHeapType = ArrayType::get(type, 0);
+    unsigned int id = AddAliasedHandle(M, FallbackLayerDescriptorHeapTable, FallbackLayerRegisterSpace + FallbackLayerDescriptorHeapSpaceOffset + registerSpaceOffset, resClass, resKind, HandleName, descriptorHeapType);
+
+    llvm::Value *descriptorHeapSymbol = GetResourceFromID(DM, resClass, id).GetGlobalSymbol();
+    aliasedHeapsForClass[key] = descriptorHeapSymbol;
+    return descriptorHeapSymbol;
+}
+
+// Declares the inputs that the patched shader reads its shader record through
+// and emits, at the start of the entry function's entry block, the IR that
+// produces them:
+//   ShaderTableHandle           - handle to the raw buffer SRV holding the
+//                                 shader table for the current shader kind
+//   DispatchRaysConstantsHandle - handle to the "Constants" cbuffer, which is
+//                                 laid out as sizeof(DispatchRaysConstants)/4
+//                                 32-bit integers
+//   BaseShaderRecordOffset      - result of calling Fallback_ShaderRecordOffset,
+//                                 or the constant 0 for ray generation shaders
+// Shader kinds without a shader table end in ThrowFailure().
+void DxilPatchShaderRecordBindings::AddInputBinding(Module &M) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  auto & EntryBlock = EntryPointFunction->getEntryBlock();
+  auto & Instructions = EntryBlock.getInstList();
+
+  std::string bufferName;
+  unsigned int bufferRegister = 0;
+
+  switch (ShaderKind) {
+  case DXIL::ShaderKind::AnyHit:
+  case DXIL::ShaderKind::ClosestHit:
+  case DXIL::ShaderKind::Intersection:
+    bufferRegister = FallbackLayerHitGroupRecordByteAddressBufferRegister;
+    bufferName = "\01?HitGroupShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  case DXIL::ShaderKind::Miss:
+    bufferRegister = FallbackLayerMissShaderRecordByteAddressBufferRegister;
+    bufferName = "\01?MissShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  case DXIL::ShaderKind::RayGeneration:
+    bufferRegister = FallbackLayerRayGenShaderRecordByteAddressBufferRegister;
+    bufferName = "\01?RayGenShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  case DXIL::ShaderKind::Callable:
+    bufferRegister = FallbackLayerCallableShaderRecordByteAddressBufferRegister;
+    bufferName = "\01?CallableShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  default:
+    // No shader table exists for other shader kinds; without this an unset
+    // register and an empty name would be used below.
+    ThrowFailure();
+    break;
+  }
+  unsigned int ShaderRecordID = AddSRVRawBuffer(M, bufferRegister, FallbackLayerRegisterSpace, bufferName);
+
+  auto It = Instructions.begin();
+  OP *HlslOP = DM.GetOP();
+  LLVMContext & Ctx = M.getContext();
+
+  // Everything below is inserted before the first instruction of the entry block
+  IRBuilder<> Builder(It);
+  {
+    auto ShaderTableName = "ShaderTableHandle";
+    llvm::Value *Symbol = DM.GetSRV(ShaderRecordID).GetGlobalSymbol();
+    llvm::Value *Load = Builder.CreateLoad(Symbol, "LoadShaderTableHandle");
+
+    Function *CreateHandleForLib = HlslOP->GetOpFunc(DXIL::OpCode::CreateHandleForLib, Load->getType());
+    Constant *CreateHandleOpcodeArg = HlslOP->GetU32Const((unsigned)DXIL::OpCode::CreateHandleForLib);
+    ShaderTableHandle = Builder.CreateCall(CreateHandleForLib, { CreateHandleOpcodeArg, Load }, ShaderTableName);
+  }
+
+  {
+    auto CbufferName = "Constants";
+    const unsigned int sizeOfConstantsInBytes = sizeof(DispatchRaysConstants);
+    llvm::StructType *StructTy= M.getTypeByName(CbufferName);
+    if (!StructTy)
+    {
+        // Describe the constants as a flat list of 32-bit integers
+        const unsigned int numUintsInConstants = sizeOfConstantsInBytes / sizeof(unsigned int);
+        SmallVector<llvm::Type*, numUintsInConstants> Elements(numUintsInConstants);
+        for (unsigned int i = 0; i < numUintsInConstants; i++)
+        {
+            Elements[i] = Type::getInt32Ty(Ctx);
+        }
+        StructTy = llvm::StructType::create(Elements, CbufferName);
+        AddAnnoationsIfNeeded(DM, StructTy, std::string(CbufferName), numUintsInConstants);
+    }
+
+    unsigned int handle = AddHandle(M, FallbackLayerDispatchConstantsRegister, 1, FallbackLayerRegisterSpace, DXIL::ResourceClass::CBuffer, DXIL::ResourceKind::CBuffer, CbufferName, StructTy, sizeOfConstantsInBytes);
+
+    llvm::Value *Symbol = DM.GetCBuffer(handle).GetGlobalSymbol();
+    llvm::Value *Load = Builder.CreateLoad(Symbol, "DispatchRaysConstants");
+
+    Function *CreateHandleForLib = HlslOP->GetOpFunc(DXIL::OpCode::CreateHandleForLib, Load->getType());
+    Constant *CreateHandleOpcodeArg = HlslOP->GetU32Const((unsigned)DXIL::OpCode::CreateHandleForLib);
+    DispatchRaysConstantsHandle = Builder.CreateCall(CreateHandleForLib, { CreateHandleOpcodeArg, Load }, CbufferName);
+  }
+  
+  // Raygen always reads from the start so no offset calculations needed
+  if (ShaderKind != DXIL::ShaderKind::RayGeneration)
+  {
+      std::string ShaderRecordOffsetFuncName = "\x1?Fallback_ShaderRecordOffset@@YAIXZ";
+      Function *ShaderRecordOffsetFunc = M.getFunction(ShaderRecordOffsetFuncName);
+      if (!ShaderRecordOffsetFunc)
+      {
+          // Declare the function when the module does not have it yet
+          FunctionType *ShaderRecordOffsetFuncType = FunctionType::get(llvm::Type::getInt32Ty(Ctx), {}, false);
+          ShaderRecordOffsetFunc = Function::Create(ShaderRecordOffsetFuncType, GlobalValue::LinkageTypes::ExternalLinkage, ShaderRecordOffsetFuncName, &M);
+      }
+      BaseShaderRecordOffset = Builder.CreateCall(ShaderRecordOffsetFunc, {}, "shaderRecordOffset");
+  }
+  else
+  {
+      BaseShaderRecordOffset = HlslOP->GetU32Const(0);
+  }
+}
+
+// Emits an add of CbufferOffsetInBytes and the constant RecordOffsetInBytes and
+// returns the resulting value.
+llvm::Value *DxilPatchShaderRecordBindings::CreateOffsetToShaderRecord(Module &M, IRBuilder<> &Builder, unsigned int RecordOffsetInBytes, llvm::Value *CbufferOffsetInBytes) {
+  OP *HlslOP = M.GetOrCreateDxilModule().GetOP();
+
+  // Offset of constants in shader record buffer
+  Constant *RecordOffsetConst = HlslOP->GetU32Const(RecordOffsetInBytes);
+  return Builder.CreateAdd(CbufferOffsetInBytes, RecordOffsetConst, "ShaderRecordOffsetInBytes");
+}
+
+// Emits a CBufferLoadLegacy call, overloaded on i32, that reads row RowToLoad
+// through ResourceHandle, and returns the call.
+llvm::Value *DxilPatchShaderRecordBindings::CreateCBufferLoadLegacy(Module &M, IRBuilder<> &Builder, llvm::Value *ResourceHandle, unsigned int RowToLoad) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  OP *HlslOP = DM.GetOP();
+  LLVMContext &Context = M.getContext();
+
+  const DXIL::OpCode LoadOpcode = DXIL::OpCode::CBufferLoadLegacy;
+  Function *LoadFunc = HlslOP->GetOpFunc(LoadOpcode, Type::getInt32Ty(Context));
+  Constant *OpcodeArg = HlslOP->GetU32Const((unsigned)LoadOpcode);
+  Constant *RowArg = HlslOP->GetU32Const(RowToLoad);
+  return Builder.CreateCall(LoadFunc, { OpcodeArg, ResourceHandle, RowArg }, "ConstantBuffer");
+}
+
+// Emits a BufferLoad call that reads from ShaderTableHandle at
+// ShaderRecordOffsetInBytes and returns the call. The overload is chosen from
+// `type`; when `type` contains more than one type its first contained type is
+// used instead. The second coordinate argument is passed as undef.
+llvm::Value *DxilPatchShaderRecordBindings::CreateShaderRecordBufferLoad(Module &M, IRBuilder<> &Builder, llvm::Value *ShaderRecordOffsetInBytes, llvm::Type* type) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  OP *HlslOP = DM.GetOP();
+  LLVMContext &Context = M.getContext();
+
+  llvm::Type *OverloadType = type;
+  if (OverloadType->getNumContainedTypes() > 1)
+  {
+      // TODO: Buffer loads aren't legal with container types, check if this is the right way to handle this
+      OverloadType = OverloadType->getContainedType(0);
+  }
+
+  // TODO Do I need to check the result? Hopefully not
+  const DXIL::OpCode LoadOpcode = DXIL::OpCode::BufferLoad;
+  Function *LoadFunc = HlslOP->GetOpFunc(LoadOpcode, OverloadType);
+  Constant *OpcodeArg = HlslOP->GetU32Const((unsigned)LoadOpcode);
+  Constant *UnusedCoord = UndefValue::get(llvm::Type::getInt32Ty(Context));
+  return Builder.CreateCall(LoadFunc, { OpcodeArg, ShaderTableHandle, ShaderRecordOffsetInBytes, UnusedCoord }, "ShaderRecordBuffer");
+}
+
+// Rewrites every user of InstructionToReplace to use ValueToReplaceWith
+// instead, then erases InstructionToReplace from its parent.
+void DxilPatchShaderRecordBindings::ReplaceUsesOfWith(llvm::Instruction *InstructionToReplace, llvm::Value *ValueToReplaceWith) {
+  // Snapshot the users before rewriting anything. Rewriting a user removes all
+  // of its uses from the use list at once, so an iterator that was advanced to
+  // "the next use" is left dangling whenever the same user references the
+  // instruction through more than one operand.
+  SmallVector<llvm::User *, 8> Users(InstructionToReplace->user_begin(), InstructionToReplace->user_end());
+  for (llvm::User *user : Users) {
+    // A user listed more than once is simply revisited with nothing left to replace
+    user->replaceUsesOfWith(InstructionToReplace, ValueToReplaceWith);
+  }
+  InstructionToReplace->eraseFromParent();
+}
+
+// Returns the byte offset read by a cbuffer load instruction: the byte offset
+// operand of a CBufferLoad, or the register index of a CBufferLoadLegacy
+// multiplied by 16 (the multiply is emitted through Builder). Any other
+// instruction calls ThrowFailure().
+llvm::Value *DxilPatchShaderRecordBindings::CreateCBufferLoadOffsetInBytes(Module &M, IRBuilder<> &Builder, llvm::Instruction *instruction) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  OP *HlslOP = DM.GetOP();
+
+  DxilInst_CBufferLoad asCBufferLoad(instruction);
+  if (asCBufferLoad) {
+    return asCBufferLoad.get_byteOffset();
+  }
+
+  DxilInst_CBufferLoadLegacy asLegacyLoad(instruction);
+  if (asLegacyLoad) {
+    Constant *BytesPerRegister = HlslOP->GetU32Const(16);
+    return Builder.CreateMul(asLegacyLoad.get_regIndex(), BytesPerRegister);
+  }
+
+  ThrowFailure();
+  return nullptr;
+}
+
+// True when the instruction is either a CBufferLoad or a CBufferLoadLegacy.
+bool DxilPatchShaderRecordBindings::IsCBufferLoad(llvm::Instruction *instruction) {
+  if (DxilInst_CBufferLoad(instruction)) {
+    return true;
+  }
+  if (DxilInst_CBufferLoadLegacy(instruction)) {
+    return true;
+  }
+  return false;
+}
+
+// Returns the zero-extended value of rangeIdVal when it is a constant integer.
+// Any other value trips an assert and yields 0. resClass is not used.
+const unsigned int GetResolvedRangeID(DXIL::ResourceClass resClass, Value *rangeIdVal)
+{
+  auto constantRangeId = dyn_cast<ConstantInt>(rangeIdVal);
+  if (constantRangeId == nullptr)
+  {
+    assert(false);
+    return 0;
+  }
+  return constantRangeId->getZExtValue();
+}
+
+// TODO: This code is quite inefficient
+// Finds the DxilModule resource whose global symbol is the pointer loaded for
+// the given CreateHandleForLib call. The cbuffer, SRV, UAV and sampler lists
+// are scanned in that order and a match in a later list replaces a match in an
+// earlier one. On success the register, space, kind, class and pointee type of
+// the resource are written to the output parameters and true is returned;
+// otherwise the outputs are left untouched and false is returned.
+bool DxilPatchShaderRecordBindings::GetHandleInfo(
+  Module &M,
+  DxilInst_CreateHandleForLib &createHandleStructForLib,
+  _Out_ unsigned int &shaderRegister,
+  _Out_ unsigned int &registerSpace,
+  _Out_ DXIL::ResourceKind &kind,
+  _Out_ DXIL::ResourceClass &resClass,
+  _Out_ llvm::Type *&resType)
+{
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  // The handle's resource operand is expected to be a load of the resource symbol
+  LoadInst *resourceLoad = cast<LoadInst>(createHandleStructForLib.get_Resource());
+  Value *loadedSymbol = resourceLoad->getPointerOperand();
+
+  hlsl::DxilResourceBase *matchedResource = nullptr;
+
+  for (auto &cbuffer : DM.GetCBuffers())
+  {
+    if (cbuffer->GetGlobalSymbol() == loadedSymbol)
+    {
+      matchedResource = cbuffer.get();
+      break;
+    }
+  }
+
+  for (auto &srv : DM.GetSRVs())
+  {
+    if (srv->GetGlobalSymbol() == loadedSymbol)
+    {
+      matchedResource = srv.get();
+      break;
+    }
+  }
+
+  for (auto &uav : DM.GetUAVs())
+  {
+    if (uav->GetGlobalSymbol() == loadedSymbol)
+    {
+      matchedResource = uav.get();
+      break;
+    }
+  }
+
+  for (auto &sampler : DM.GetSamplers())
+  {
+    if (sampler->GetGlobalSymbol() == loadedSymbol)
+    {
+      matchedResource = sampler.get();
+      break;
+    }
+  }
+
+  if (matchedResource == nullptr)
+  {
+    return false;
+  }
+
+  registerSpace = matchedResource->GetSpaceID();
+  shaderRegister = matchedResource->GetLowerBound();
+  kind = matchedResource->GetKind();
+  resClass = matchedResource->GetClass();
+  resType = cast<GlobalVariable>(matchedResource->GetGlobalSymbol())->getType()->getPointerElementType();
+  return true;
+}
+
+// Emits a 32-bit load from the shader table at
+// offsetToShaderRecord + dataOffsetInShaderRecord and returns the load.
+llvm::Value *DxilPatchShaderRecordBindings::LoadShaderRecordData(
+    Module &M, 
+    IRBuilder<> &Builder,
+    llvm::Value *offsetToShaderRecord,
+    unsigned int dataOffsetInShaderRecord)
+{
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  LLVMContext &Context = M.getContext();
+  OP *HlslOP = DM.GetOP();
+
+  Constant *offsetWithinRecord = HlslOP->GetU32Const(dataOffsetInShaderRecord);
+  Value *offsetWithinTable = Builder.CreateAdd(offsetWithinRecord, offsetToShaderRecord);
+  llvm::Type *loadedType = llvm::Type::getInt32Ty(Context);
+  return CreateShaderRecordBufferLoad(M, Builder, offsetWithinTable, loadedType);
+}
+
+// Redirects a CreateHandleForLib call so that its resource comes from the
+// aliased descriptor heap table: element descriptorIndex of the table is
+// addressed (the GEP is marked non-uniform) and loaded, the call is retargeted
+// to the CreateHandleForLib overload for the loaded type, and the loaded value
+// becomes its resource operand.
+void DxilPatchShaderRecordBindings::PatchCreateHandleToUseDescriptorIndex(
+    _In_ Module &M,
+    _In_ IRBuilder<> &Builder,
+    _In_ DXIL::ResourceKind &resourceKind,
+    _In_ DXIL::ResourceClass &resourceClass,
+    _In_ llvm::Type *resourceType,
+    _In_ llvm::Value *descriptorIndex,
+    _Inout_ DxilInst_CreateHandleForLib &createHandleInstr)
+{
+    DxilModule &DM = M.GetOrCreateDxilModule();
+    OP *HlslOP = DM.GetOP();
+
+    llvm::Value *heapTable = GetAliasedDescriptorHeapHandle(M, resourceType, resourceClass, resourceKind);
+    llvm::Value *tableEntry = Builder.CreateGEP(heapTable, { HlslOP->GetU32Const(0), descriptorIndex }, "IndexIntoDH");
+    DxilMDHelper::MarkNonUniform(cast<Instruction>(tableEntry));
+    llvm::Value *loadedResource = Builder.CreateLoad(tableEntry);
+
+    Function *createHandleOverload = HlslOP->GetOpFunc(
+        DXIL::OpCode::CreateHandleForLib,
+        loadedResource->getType());
+    cast<CallInst>(createHandleInstr.Instr)->setCalledFunction(createHandleOverload);
+    createHandleInstr.set_Resource(loadedResource);
+}
+
+// Seeds the caller-provided UAV register-space list (pInputShaderInfo) with
+// the Fallback Layer's own raw-buffer view when the list is still empty, so
+// that view occupies slot 0.
+void DxilPatchShaderRecordBindings::InitializeViewTable() {
+    // The Fallback Layer declares a bindless raw buffer that spans the entire descriptor heap,
+    // manually add it to the list of UAV register spaces used
+    if (*pInputShaderInfo->pNumUAVSpaces == 0)
+    {
+        ViewKey key = { (unsigned int)hlsl::DXIL::ResourceKind::RawBuffer, 0 };
+        unsigned int index = FindOrInsertViewIntoList(
+          key,
+          pInputShaderInfo->pUAVRegisterSpaceArray,
+          *pInputShaderInfo->pNumUAVSpaces,
+          FallbackLayerNumDescriptorHeapSpacesPerView);
+        // `index` is only read by the assert below; discard it explicitly so
+        // builds that compile assert away do not warn about an unused local.
+        // (A cast to void, not to void*, is the idiom for this.)
+        (void)index;
+        assert(index == 0);
+    }
+}
+
+
+// Walks every instruction of EntryPointFunction and, for each
+// CreateHandleForLib call whose resource is found in the root signature
+// (*pRootSignatureDesc), rewrites the binding so it is fetched from the shader
+// record. Handles that GetHandleInfo cannot resolve, or that
+// FindRootSignatureDescriptor does not find, are left untouched.
+// Unsupported root parameter types and unexpected users of a root-constant
+// handle go through ThrowFailure().
+void DxilPatchShaderRecordBindings::PatchShaderBindings(Module &M) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  OP *HlslOP = DM.GetOP();
+
+  // Don't erase instructions until the very end because it throws off the iterator
+  std::vector<llvm::Instruction *> instructionsToRemove;
+  for (BasicBlock &block : EntryPointFunction->getBasicBlockList()) {
+    auto & Instructions = block.getInstList();
+    // NOTE(review): `It` is not referenced anywhere below in this function.
+    auto It = Instructions.begin();
+
+    for (auto &instr : Instructions) {
+      // The wrapper converts to false when `instr` is not a CreateHandleForLib call.
+      DxilInst_CreateHandleForLib createHandleForLib(&instr);
+      if (createHandleForLib) {
+        DXIL::ResourceClass resourceClass;
+        unsigned int registerSpace;
+        unsigned int registerIndex;
+        DXIL::ResourceKind kind;
+        llvm::Type *resType;
+        bool resourceIsResolved = true;
+        resourceIsResolved = GetHandleInfo(M, createHandleForLib, registerIndex, registerSpace, kind, resourceClass, resType);
+
+        if (!resourceIsResolved) continue; // TODO: This shouldn't actually be happening?
+
+        // Look the binding up in the root signature; an invalid entry means
+        // this handle is not patched.
+        ShaderRecordEntry shaderRecord = FindRootSignatureDescriptor(
+          *pRootSignatureDesc,
+          pInputShaderInfo->ShaderRecordIdentifierSizeInBytes,
+          resourceClass,
+          registerIndex,
+          registerSpace);
+
+        const bool IsBindingSpecifiedInLocalRootSignature = !shaderRecord.IsInvalid();
+        if (IsBindingSpecifiedInLocalRootSignature) {
+          // Add the input binding once, the first time a patch is needed.
+          if (!DispatchRaysConstantsHandle) {
+            AddInputBinding(M);
+          }
+
+          switch (shaderRecord.ParameterType) {
+          case DxilRootParameterType::Constants32Bit:
+          {
+            // Root constants: every user of the handle must be a call that
+            // IsCBufferLoad accepts (cast<> asserts on non-calls, anything
+            // else throws). Each such load is replaced by a load from the
+            // shader record buffer at the equivalent offset.
+            for (User *U : instr.users()) {
+              llvm::Instruction *instruction = cast<CallInst>(U);
+              if (IsCBufferLoad(instruction)) {
+                llvm::Instruction *cbufferLoadInstr = instruction;
+                IRBuilder<> Builder(cbufferLoadInstr);
+
+                llvm::Value * cbufferOffsetInBytes = CreateCBufferLoadOffsetInBytes(M, Builder, cbufferLoadInstr);
+                llvm::Value *LocalOffsetToRootConstant = CreateOffsetToShaderRecord(M, Builder, shaderRecord.RecordOffsetInBytes, cbufferOffsetInBytes);
+                llvm::Value *GlobalOffsetToRootConstant = Builder.CreateAdd(LocalOffsetToRootConstant, BaseShaderRecordOffset);
+                llvm::Value *srvBufferLoad = CreateShaderRecordBufferLoad(M, Builder, GlobalOffsetToRootConstant, cbufferLoadInstr->getType());
+                ReplaceUsesOfWith(cbufferLoadInstr, srvBufferLoad);
+              } else {
+                ThrowFailure();
+              }
+            }
+            // The handle creation itself is queued for removal after the walk.
+            instructionsToRemove.push_back(&instr);
+            break;
+          }
+          case DxilRootParameterType::DescriptorTable:
+          {
+            // Descriptor table: turn the table handle stored in the shader
+            // record into an index into the descriptor heap.
+            IRBuilder<> Builder(&instr);
+            llvm::Value *srvBufferLoad = LoadShaderRecordData(
+             M, 
+             Builder, 
+             BaseShaderRecordOffset,
+             shaderRecord.RecordOffsetInBytes);
+
+            // Only element 0 of the loaded data (the low word) is used.
+            llvm::Value *DescriptorTableEntryLo = Builder.CreateExtractValue(srvBufferLoad, 0, "DescriptorTableHandleLo");
+
+            // Locate SrvCbvUavDescriptorHeapStart inside DispatchRaysConstants,
+            // addressed as rows of four 32-bit values, and read its first word.
+            unsigned int offsetToLoadInUints = offsetof(DispatchRaysConstants, SrvCbvUavDescriptorHeapStart) / sizeof(uint32_t);
+            unsigned int uintsPerRow = 4;
+            unsigned int rowToLoad = offsetToLoadInUints / uintsPerRow;
+            unsigned int extractValueOffset = offsetToLoadInUints % uintsPerRow;
+            llvm::Value *DescHeapConstants = CreateCBufferLoadLegacy(M, Builder, DispatchRaysConstantsHandle, rowToLoad);
+            llvm::Value *DescriptorHeapStartAddressLo = Builder.CreateExtractValue(DescHeapConstants, extractValueOffset, "DescriptorHeapStartHandleLo");
+
+            // TODO: The hi bits can only be ignored if the difference is guaranteed to fit in 32 bits. This is an unsafe assumption, particularly given 
+            // large descriptor sizes
+            llvm::Value *DescriptorTableOffsetInBytes = Builder.CreateSub(DescriptorTableEntryLo, DescriptorHeapStartAddressLo, "TableOffsetInBytes");
+
+            // Byte offset -> descriptor index. The division is emitted as an
+            // exact udiv, i.e. the offset is taken to be a whole multiple of
+            // the descriptor size.
+            Constant *DescriptorSizeInBytes = HlslOP->GetU32Const(pInputShaderInfo->SrvCbvUavDescriptorSizeInBytes);
+            llvm::Value * DescriptorTableStartIndex = Builder.CreateExactUDiv(DescriptorTableOffsetInBytes, DescriptorSizeInBytes, "TableStartIndex");
+
+            // Add the resource's position within the table, as computed by
+            // FindRootSignatureDescriptor.
+            Constant *RecordOffset = HlslOP->GetU32Const(shaderRecord.OffsetInDescriptors);
+            llvm::Value * BaseDescriptorIndex = Builder.CreateAdd(DescriptorTableStartIndex, RecordOffset, "BaseDescriptorIndex");
+
+            // TODO: Not supporting dynamic indexing yet, should be pulled from CreateHandleForLib
+            // If dynamic indexing is being used, add the apps index on top of the calculated index
+            llvm::Value * DynamicIndex = HlslOP->GetU32Const(0);
+
+            llvm::Value * DescriptorIndex = Builder.CreateAdd(BaseDescriptorIndex, DynamicIndex, "DescriptorIndex");
+            PatchCreateHandleToUseDescriptorIndex(
+                M, 
+                Builder, 
+                kind, 
+                resourceClass, 
+                resType, 
+                DescriptorIndex, 
+                createHandleForLib);
+            break;
+          }
+          case DxilRootParameterType::CBV:
+          case DxilRootParameterType::SRV:
+          case DxilRootParameterType::UAV: {
+            // Root descriptor: element 1 of the loaded record data is used
+            // directly as the descriptor heap index; element 0 is not read
+            // (see the TODO below).
+            IRBuilder<> Builder(&instr);
+            llvm::Value *srvBufferLoad = LoadShaderRecordData(
+             M, 
+             Builder, 
+             BaseShaderRecordOffset,
+             shaderRecord.RecordOffsetInBytes);
+
+            llvm::Value *DescriptorIndex = Builder.CreateExtractValue(
+                srvBufferLoad, 1, "DescriptorHeapIndex");
+
+            // TODO: Handle offset in bytes
+            // llvm::Value *OffsetInBytes = Builder.CreateExtractValue(
+            //     srvBufferLoad, 0, "OffsetInBytes");
+
+            PatchCreateHandleToUseDescriptorIndex(
+                M,
+                Builder,
+                kind,
+                resourceClass,
+                resType,
+                DescriptorIndex,
+                createHandleForLib);
+
+            break;
+          }
+          default:
+            ThrowFailure();
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Deferred removal of the handle calls queued by the root-constant case.
+  for (auto instruction : instructionsToRemove) {
+    instruction->eraseFromParent();
+  }
+
+}
+
+// Reports whether a root parameter of the given type can bind a resource of
+// the given class. Descriptor tables accept every class; root constants and
+// root CBVs accept only CBuffer; root SRVs/UAVs accept only their own class.
+// Any other parameter type goes through ThrowFailure().
+bool IsParameterTypeCompatibleWithResourceClass(
+  DXIL::ResourceClass resourceClass,
+  DxilRootParameterType parameterType) {
+  if (parameterType == DxilRootParameterType::DescriptorTable)
+    return true;
+
+  if (parameterType == DxilRootParameterType::Constants32Bit ||
+      parameterType == DxilRootParameterType::CBV)
+    return resourceClass == DXIL::ResourceClass::CBuffer;
+
+  if (parameterType == DxilRootParameterType::SRV)
+    return resourceClass == DXIL::ResourceClass::SRV;
+
+  if (parameterType == DxilRootParameterType::UAV)
+    return resourceClass == DXIL::ResourceClass::UAV;
+
+  ThrowFailure();
+  return false;
+}
+
+// Maps a root parameter type onto DxilRootParameterType. Source and target are
+// the same enum here, so each recognized value maps to itself; an
+// unrecognized value asserts and yields (DxilRootParameterType)-1.
+DxilRootParameterType ConvertD3D12ParameterTypeToDxil(DxilRootParameterType parameter) {
+  switch (parameter) {
+  case DxilRootParameterType::Constants32Bit:
+  case DxilRootParameterType::DescriptorTable:
+  case DxilRootParameterType::CBV:
+  case DxilRootParameterType::SRV:
+  case DxilRootParameterType::UAV:
+    return parameter;
+  default:
+    break;
+  }
+
+  assert(false);
+  return (DxilRootParameterType)-1;
+}
+
+// Translates a descriptor range type into the matching DXIL resource class
+// (SRV->SRV, UAV->UAV, CBV->CBuffer, Sampler->Sampler). An unrecognized range
+// type asserts and yields (DXIL::ResourceClass)-1.
+DXIL::ResourceClass ConvertD3D12RangeTypeToDxil(DxilDescriptorRangeType rangeType) {
+  if (rangeType == DxilDescriptorRangeType::SRV)
+    return DXIL::ResourceClass::SRV;
+  if (rangeType == DxilDescriptorRangeType::UAV)
+    return DXIL::ResourceClass::UAV;
+  if (rangeType == DxilDescriptorRangeType::CBV)
+    return DXIL::ResourceClass::CBuffer;
+  if (rangeType == DxilDescriptorRangeType::Sampler)
+    return DXIL::ResourceClass::Sampler;
+
+  assert(false);
+  return (DXIL::ResourceClass) - 1;
+}
+
+// Returns the alignment applied to a root parameter's data before it is
+// placed in the shader record: SizeofD3D12GpuDescriptorHandle for descriptor
+// tables, sizeof(uint32_t) for root constants, SizeofD3D12GpuVA for root
+// CBV/SRV/UAV descriptors, and UINT_MAX for any other type.
+unsigned int GetParameterTypeAlignment(DxilRootParameterType parameterType) {
+  if (parameterType == DxilRootParameterType::DescriptorTable)
+    return SizeofD3D12GpuDescriptorHandle;
+
+  if (parameterType == DxilRootParameterType::Constants32Bit)
+    return sizeof(uint32_t);
+
+  const bool isRootDescriptor =
+      parameterType == DxilRootParameterType::CBV ||
+      parameterType == DxilRootParameterType::SRV ||
+      parameterType == DxilRootParameterType::UAV;
+  if (isRootDescriptor)
+    return SizeofD3D12GpuVA;
+
+  return UINT_MAX;
+}
+
+// Searches a root signature description for the parameter that binds the
+// resource (resourceClass, baseRegisterIndex, registerSpace) and reports where
+// that parameter's data sits in the shader record.
+//
+// The record offset starts at ShaderRecordIdentifierSizeInBytes. For each
+// root parameter, in order, the running offset is first rounded up to the
+// parameter's alignment (GetParameterTypeAlignment) and, if the parameter
+// does not match, advanced by the parameter's size:
+//   - root constants:   Num32BitValues * sizeof(uint32_t)
+//   - descriptor table: SizeofD3D12GpuDescriptorHandle
+//   - root CBV/SRV/UAV: SizeofD3D12GpuVA
+//
+// Returns {type, recordOffset} for a matching root constant / root
+// descriptor, {type, recordOffset, offsetInDescriptors} for a match inside a
+// descriptor table, and ShaderRecordEntry::InvalidEntry() when nothing
+// matches or when registerSpace is FallbackLayerRegisterSpace.
+template <typename TD3D12_ROOT_SIGNATURE_DESC>
+ShaderRecordEntry FindRootSignatureDescriptorHelper(
+    const TD3D12_ROOT_SIGNATURE_DESC &rootSignatureDescriptor,
+    unsigned int ShaderRecordIdentifierSizeInBytes,
+    DXIL::ResourceClass resourceClass, unsigned int baseRegisterIndex,
+    unsigned int registerSpace) {
+  // Rounds num up to the next multiple of alignment. A local lambda replaces
+  // the former function-local ALIGN macro, which leaked into the rest of the
+  // translation unit and evaluated its alignment argument three times.
+  auto alignUp = [](unsigned int alignment, unsigned int num) -> unsigned int {
+    return ((num + alignment - 1) / alignment) * alignment;
+  };
+
+  // Automatically fail if it's looking for a fallback binding as these never
+  // need to be patched
+  if (registerSpace != FallbackLayerRegisterSpace) {
+    unsigned int recordOffset = ShaderRecordIdentifierSizeInBytes;
+    for (unsigned int rootParamIndex = 0;
+         rootParamIndex < rootSignatureDescriptor.NumParameters;
+         rootParamIndex++) {
+      auto &rootParam = rootSignatureDescriptor.pParameters[rootParamIndex];
+      auto dxilParamType =
+          ConvertD3D12ParameterTypeToDxil(rootParam.ParameterType);
+
+      recordOffset = alignUp(GetParameterTypeAlignment(rootParam.ParameterType),
+                             recordOffset);
+
+      switch (rootParam.ParameterType) {
+      case DxilRootParameterType::Constants32Bit:
+        if (IsParameterTypeCompatibleWithResourceClass(resourceClass,
+                                                       dxilParamType) &&
+            baseRegisterIndex == rootParam.Constants.ShaderRegister &&
+            registerSpace == rootParam.Constants.RegisterSpace) {
+          return {dxilParamType, recordOffset};
+        }
+        recordOffset += rootParam.Constants.Num32BitValues * sizeof(uint32_t);
+        break;
+      case DxilRootParameterType::DescriptorTable: {
+        auto &descriptorTable = rootParam.DescriptorTable;
+
+        // Position, in descriptors from the table start, of the range being
+        // examined. A range whose OffsetInDescriptorsFromTableStart is -1 is
+        // placed directly after the previous range; any other value resets
+        // the position explicitly.
+        unsigned int rangeOffsetInDescriptors = 0;
+        for (unsigned int rangeIndex = 0;
+             rangeIndex < descriptorTable.NumDescriptorRanges; rangeIndex++) {
+          auto &range = descriptorTable.pDescriptorRanges[rangeIndex];
+          if (range.OffsetInDescriptorsFromTableStart != -1) {
+            rangeOffsetInDescriptors = range.OffsetInDescriptorsFromTableStart;
+          }
+
+          // Match when the class and space agree and the register falls in
+          // [BaseShaderRegister, BaseShaderRegister + NumDescriptors).
+          if (ConvertD3D12RangeTypeToDxil(range.RangeType) == resourceClass &&
+              range.RegisterSpace == registerSpace &&
+              range.BaseShaderRegister <= baseRegisterIndex &&
+              range.BaseShaderRegister + range.NumDescriptors >
+                  baseRegisterIndex) {
+            rangeOffsetInDescriptors +=
+                baseRegisterIndex - range.BaseShaderRegister;
+            return {dxilParamType, recordOffset, rangeOffsetInDescriptors};
+          }
+
+          rangeOffsetInDescriptors += range.NumDescriptors;
+        }
+
+        recordOffset += SizeofD3D12GpuDescriptorHandle;
+        break;
+      }
+      case DxilRootParameterType::CBV:
+      case DxilRootParameterType::SRV:
+      case DxilRootParameterType::UAV:
+        if (IsParameterTypeCompatibleWithResourceClass(resourceClass,
+                                                       dxilParamType) &&
+            baseRegisterIndex == rootParam.Descriptor.ShaderRegister &&
+            registerSpace == rootParam.Descriptor.RegisterSpace) {
+          return {dxilParamType, recordOffset};
+        }
+
+        recordOffset += SizeofD3D12GpuVA;
+        break;
+      }
+    }
+  }
+  return ShaderRecordEntry::InvalidEntry();
+}
+
+// Version dispatcher for FindRootSignatureDescriptorHelper: selects Desc_1_0
+// or Desc_1_1 from the versioned root signature and forwards the remaining
+// arguments unchanged. Any other version goes through ThrowFailure().
+// TODO: Consider pre-calculating this into a map
+ShaderRecordEntry DxilPatchShaderRecordBindings::FindRootSignatureDescriptor(
+  const DxilVersionedRootSignatureDesc &rootSignatureDescriptor,
+  unsigned int ShaderRecordIdentifierSizeInBytes,
+  DXIL::ResourceClass resourceClass,
+  unsigned int baseRegisterIndex,
+  unsigned int registerSpace) {
+  const auto version = rootSignatureDescriptor.Version;
+
+  if (version == DxilRootSignatureVersion::Version_1_0) {
+    return FindRootSignatureDescriptorHelper(
+        rootSignatureDescriptor.Desc_1_0, ShaderRecordIdentifierSizeInBytes,
+        resourceClass, baseRegisterIndex, registerSpace);
+  }
+
+  if (version == DxilRootSignatureVersion::Version_1_1) {
+    return FindRootSignatureDescriptorHelper(
+        rootSignatureDescriptor.Desc_1_1, ShaderRecordIdentifierSizeInBytes,
+        resourceClass, baseRegisterIndex, registerSpace);
+  }
+
+  ThrowFailure();
+  return ShaderRecordEntry::InvalidEntry();
+}
+
+
+
+

+ 75 - 0
lib/HLSL/DxilPatchShaderRecordBindingsShared.h

@@ -0,0 +1,75 @@
+#pragma once
+
+// Register space used by the Fallback Layer's own bindings. Resources in this
+// space are never patched by the shader-record binding lookup.
+// NOTE(review): the value is 214743647, which is not INT_MAX (2147483647) -
+// confirm whether the difference is intentional before changing it, since
+// every consumer of this header must agree on the value.
+#define FallbackLayerRegisterSpace 214743647
+
+// SRVs: shader registers of the per-shader-type record buffers.
+#define FallbackLayerHitGroupRecordByteAddressBufferRegister 0
+#define FallbackLayerMissShaderRecordByteAddressBufferRegister 1
+#define FallbackLayerRayGenShaderRecordByteAddressBufferRegister 2
+#define FallbackLayerCallableShaderRecordByteAddressBufferRegister 3
+
+// SRV & UAV
+#define FallbackLayerDescriptorHeapTable 0
+
+// Some hardware has a driver issue with starting a bindless table on any
+// register other than 0, so make sure each bindless table has its own
+// register space.
+#define FallbackLayerDescriptorHeapSpaceOffset 1
+#define FallbackLayerNumDescriptorHeapSpacesPerView 10
+
+// CBVs
+#define FallbackLayerDispatchConstantsRegister 0
+#define FallbackLayerAccelerationStructureList 1
+
+#ifndef HLSL
+// Identifies one distinct view flavour in a register-space list. ViewType
+// holds a resource kind cast to unsigned int (InitializeViewTable builds one
+// from hlsl::DXIL::ResourceKind::RawBuffer); the union carries the extra
+// discriminator for that kind, as described per member below.
+struct ViewKey {
+  unsigned int ViewType;
+  union 
+  {
+    unsigned int StructuredStride; // When ViewType == StructuredBuffer
+    unsigned int SRVComponentType; // When ViewType != StructuredBuffer &&  ViewType != RawBuffer
+  };
+};
+
+// Per-shader input/output bundle for the shader-record binding patch.
+struct ShaderInfo {
+  const wchar_t *ExportName;
+  unsigned int SamplerDescriptorSizeInBytes;
+  // Divisor used to turn a byte offset into the CBV/SRV/UAV descriptor heap
+  // into a descriptor index.
+  unsigned int SrvCbvUavDescriptorSizeInBytes;
+  // Initial offset of root-parameter data within a shader record.
+  unsigned int ShaderRecordIdentifierSizeInBytes;
+  // Untyped pointer to the root signature description.
+  // NOTE(review): presumably a DxilVersionedRootSignatureDesc - confirm
+  // against the code that casts it.
+  const void *pRootSignatureDesc;
+
+  // Caller-owned list of SRV view keys and its element count.
+  ViewKey *pSRVRegisterSpaceArray;
+  unsigned int *pNumSRVSpaces;
+
+  // Caller-owned list of UAV view keys and its element count; the patch pass
+  // reads the count and inserts into the list (see InitializeViewTable).
+  ViewKey *pUAVRegisterSpaceArray;
+  unsigned int *pNumUAVSpaces;
+};
+
+// Constants read by patched shaders through a constant buffer. The patch
+// pass locates SrvCbvUavDescriptorHeapStart with offsetof() and addresses the
+// struct as rows of four 32-bit values, so member order and sizes must not
+// change without updating that code.
+struct DispatchRaysConstants {
+  unsigned __int32 RayDispatchDimensionsWidth;
+  unsigned __int32 RayDispatchDimensionsHeight;
+  unsigned __int32 HitGroupShaderRecordStride;
+  unsigned __int32 MissShaderRecordStride;
+
+  // 64-bit values
+  unsigned __int64 SamplerDescriptorHeapStart;
+  unsigned __int64 SrvCbvUavDescriptorHeapStart;
+};
+
+// Descriptor range kinds, numbered from 0; NumRangeTypes is the count of the
+// entries that precede it.
+enum DescriptorRangeTypes { SRV = 0, CBV, UAV, Sampler, NumRangeTypes };
+
+// Sequential slot numbers, starting at 0, for the Fallback Layer's root
+// signature parameters (presumably the root parameter indices - confirm
+// against the code that builds the root signature). DebugUAVLog only exists
+// when ENABLE_UAV_LOG is set, which shifts NumParameters by one;
+// NumParameters is the total count.
+enum RootSignatureParameterOffset {
+  HitGroupRecord = 0,
+  MissShaderRecord,
+  RayGenShaderRecord,
+  CallableShaderRecord,
+  DispatchConstants,
+  CbvSrvUavDescriptorHeapAliasedTables,
+  SamplerDescriptorHeapAliasedTables,
+  AccelerationStructuresList,
+#if ENABLE_UAV_LOG
+  DebugUAVLog,
+#endif
+  NumParameters
+};
+#endif

+ 2 - 1
lib/LLVMBuild.txt

@@ -38,8 +38,9 @@ subdirectories =
  Target
  Transforms
  HLSL
+ DxrFallback
 
-; HLSL Change: remove LibDriver, LineEditor, add HLSL
+; HLSL Change: remove LibDriver, LineEditor, add HLSL, add DxrFallback
 
 [component_0]
 type = Group

+ 1 - 0
tools/clang/tools/CMakeLists.txt

@@ -25,6 +25,7 @@ add_llvm_external_project(clang-tools-extra extra)
 # HLSL Change Starts
 add_subdirectory(d3dcomp)
 add_subdirectory(dxcompiler)
+add_subdirectory(dxrfallbackcompiler)
 add_subdirectory(dxa)
 add_subdirectory(dxc)
 add_subdirectory(dxopt)

+ 68 - 0
tools/clang/tools/dxrfallbackcompiler/CMakeLists.txt

@@ -0,0 +1,68 @@
+# Copyright (C) Microsoft Corporation. All rights reserved.
+# This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
+#
+# Builds the dxrfallbackcompiler shared library.
+find_package(DiaSDK REQUIRED) # Used for constants and declarations.
+
+# LLVM components to link against; commented-out entries are deliberately
+# excluded, for the reason given next to each.
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  analysis
+  asmparser
+#  asmprinter # no support for LLVM codegen
+  bitreader
+  bitwriter
+#  codegen # no support for LLVM codegen
+  core
+#  debuginfodwarf # no support for DWARF files (IR debug info is OK)
+#  debuginfopdb # no support for PDB files
+  dxcsupport
+  dxrfallback
+  hlsl
+  instcombine
+  ipa
+  ipo
+  irreader
+#  libdriver
+#  lineeditor
+  linker
+  lto
+#  mirparser # no support for LLVM codegen
+  mssupport
+#  object # no support for object files (coff, elf)
+  option
+#  passes
+  profiledata
+  scalaropts
+#  selectiondag # no support for LLVM codegen
+  support
+  target
+  transformutils
+  vectorize
+  )
+
+# Sources compiled into the DLL, including its .rc resource script and .def
+# export list.
+set(SOURCES
+  dxcapi.cpp
+  DXCompiler.cpp
+  DXCompiler.rc
+  DXCompiler.def
+  dxillib.cpp
+  dxcutil.cpp
+  dxcdxrfallbackcompiler.cpp
+  dxcvalidator.cpp
+  )
+
+add_clang_library(dxrfallbackcompiler SHARED ${SOURCES})
+target_link_libraries(dxrfallbackcompiler PRIVATE ${LIBRARIES} ${DIASDK_LIBRARIES})
+# SPIRV change starts
+if (ENABLE_SPIRV_CODEGEN)
+  target_link_libraries(dxrfallbackcompiler PRIVATE clangSPIRV)
+endif (ENABLE_SPIRV_CODEGEN)
+# SPIRV change ends
+# The DxcEtw target must be built before this one.
+add_dependencies(dxrfallbackcompiler DxcEtw)
+include_directories(AFTER ${LLVM_INCLUDE_DIR}/dxc/Tracing ${DIASDK_INCLUDE_DIRS})
+
+set_target_properties(dxrfallbackcompiler
+  PROPERTIES
+  OUTPUT_NAME "dxrfallbackcompiler"
+  VERSION ${LIBCLANG_LIBRARY_VERSION}
+  DEFINE_SYMBOL _CINDEX_LIB_)
+
+hlsl_update_product_ver("dxrfallbackcompiler")

+ 104 - 0
tools/clang/tools/dxrfallbackcompiler/DXCompiler.cpp

@@ -0,0 +1,104 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DXCompiler.cpp                                                            //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Implements the entry point for the dxrfallbackcompiler DLL.               //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/FileSystem.h"
+#include "dxc/Support/Global.h"
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/Support/HLSLOptions.h"
+#include "dxcetw.h"
+#include "dxillib.h"
+
+namespace hlsl { HRESULT SetupRegistryPassForHLSL(); }
+
+// C++ exception specification ignored except to indicate a function is not __declspec(nothrow)
+#pragma warning( disable : 4290 )
+
+// Replacement global allocation operators: all four forms forward to the
+// allocator returned by DxcGetThreadMallocNoRef().
+void *  __CRTDECL operator new(std::size_t size) throw(std::bad_alloc) {
+  // Throwing form: a null result from Alloc is reported as std::bad_alloc.
+  if (void *block = DxcGetThreadMallocNoRef()->Alloc(size))
+    return block;
+  throw std::bad_alloc();
+}
+void * __CRTDECL operator new(std::size_t size,
+  const std::nothrow_t &nothrow_value) throw() {
+  // Nothrow form: whatever Alloc returns is handed straight back.
+  void *block = DxcGetThreadMallocNoRef()->Alloc(size);
+  return block;
+}
+void  __CRTDECL operator delete (void* ptr) throw() {
+  auto *allocator = DxcGetThreadMallocNoRef();
+  allocator->Free(ptr);
+}
+void  __CRTDECL operator delete (void* ptr, const std::nothrow_t& nothrow_constant) throw() {
+  auto *allocator = DxcGetThreadMallocNoRef();
+  allocator->Free(ptr);
+}
+
+// Performs the DLL's process-attach initialization and returns the resulting
+// HRESULT. Steps run in order: thread malloc, per-thread file system, HLSL
+// pass registry, DxilLib, HLSL option table. The IFC(...) steps are expected
+// to store their result in `hr` and jump to Cleanup on failure (macro defined
+// elsewhere - confirm).
+// On failure, only the pieces recorded as set up (fsSetup / memSetup) are
+// torn down; on success the thread malloc is cleared again before returning.
+static HRESULT InitMaybeFail() throw() {
+  HRESULT hr;
+  bool fsSetup = false, memSetup = false;
+  IFC(DxcInitThreadMalloc());
+  DxcSetThreadMallocOrDefault(nullptr);
+  memSetup = true;
+  // These two setup calls report failure with a non-zero result, which is
+  // mapped to E_FAIL here and for initHlslOptTable below.
+  if (::llvm::sys::fs::SetupPerThreadFileSystem()) {
+    hr = E_FAIL;
+    goto Cleanup;
+  }
+  fsSetup = true;
+  IFC(hlsl::SetupRegistryPassForHLSL());
+  IFC(DxilLibInitialize());
+  if (hlsl::options::initHlslOptTable()) {
+    hr = E_FAIL;
+    goto Cleanup;
+  }
+Cleanup:
+  if (FAILED(hr)) {
+    // Undo in reverse order of setup.
+    if (fsSetup) {
+      ::llvm::sys::fs::CleanupPerThreadFileSystem();
+    }
+    if (memSetup) {
+      DxcClearThreadMalloc();
+      DxcCleanupThreadMalloc();
+    }
+  }
+  else {
+    DxcClearThreadMalloc();
+  }
+  return hr;
+}
+
+// DLL entry point.
+//   DLL_PROCESS_ATTACH: registers the ETW provider, disables per-thread
+//     attach/detach notifications, and runs InitMaybeFail(); returns FALSE if
+//     that initialization failed.
+//   DLL_PROCESS_DETACH: tears down the option table, per-thread file system,
+//     LLVM, DxilLib and the thread malloc, then unregisters the ETW provider.
+//   Any other reason is ignored and TRUE is returned.
+BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD Reason, LPVOID reserved) {
+  BOOL result = TRUE;
+  if (Reason == DLL_PROCESS_ATTACH) {
+    EventRegisterMicrosoft_Windows_DXCompiler_API();
+    DxcEtw_DXCompilerInitialization_Start();
+    DisableThreadLibraryCalls(hinstDLL);
+    HRESULT hr = InitMaybeFail();
+    DxcEtw_DXCompilerInitialization_Stop(hr);
+    result = SUCCEEDED(hr) ? TRUE : FALSE;
+  } else if (Reason == DLL_PROCESS_DETACH) {
+    DxcEtw_DXCompilerShutdown_Start();
+    // Install a thread allocator before running the cleanup calls below.
+    DxcSetThreadMallocOrDefault(nullptr);
+    ::hlsl::options::cleanupHlslOptTable();
+    ::llvm::sys::fs::CleanupPerThreadFileSystem();
+    ::llvm::llvm_shutdown();
+    if (reserved == NULL) { // FreeLibrary has been called or the DLL load failed
+      DxilLibCleanup(DxilLibCleanUpType::UnloadLibrary);
+    }
+    else { // Process termination. We should not call FreeLibrary()
+      DxilLibCleanup(DxilLibCleanUpType::ProcessTermination);
+    }
+    DxcClearThreadMalloc();
+    DxcCleanupThreadMalloc();
+    DxcEtw_DXCompilerShutdown_Stop(S_OK);
+    EventUnregisterMicrosoft_Windows_DXCompiler_API();
+  }
+
+  return result;
+}

+ 4 - 0
tools/clang/tools/dxrfallbackcompiler/DXCompiler.def

@@ -0,0 +1,4 @@
+LIBRARY dxrfallbackcompiler
+
+EXPORTS
+    DxcCreateDxrFallbackCompiler

+ 14 - 0
tools/clang/tools/dxrfallbackcompiler/DXCompiler.rc

@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+
+#include <windows.h>
+#include <ntverp.h>
+
+#define VER_FILETYPE                  VFT_DLL
+#define VER_FILESUBTYPE               VFT_UNKNOWN
+#define VER_FILEDESCRIPTION_STR       "DXR Fallback Compiler DLL"
+#define VER_INTERNALNAME_STR          "DX Fallback Compiler DLL"
+#define VER_ORIGINALFILENAME_STR      "DxrFallbackCompiler.dll"
+
+// #include <common.ver>
+#include "dxcetw.rc"

+ 55 - 0
tools/clang/tools/dxrfallbackcompiler/dxcapi.cpp

@@ -0,0 +1,55 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcapi.cpp                                                                //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Implements the DxcCreateDxrFallbackCompiler entry point.                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/Support/WinIncludes.h"
+
+#define DXC_API_IMPORT __declspec(dllexport)
+
+#include "dxc/dxctools.h"
+#include "dxc/Support/Global.h"
+#include "dxcetw.h"
+#include "dxc/dxcdxrfallbackcompiler.h"
+#include <memory>
+
+HRESULT CreateDxcDxrFallbackCompiler(_In_ REFIID riid, _Out_ LPVOID *ppv);
+
+// Creates the object identified by rclsid and returns the requested interface
+// through ppv. *ppv is cleared first. Only CLSID_DxcDxrFallbackCompiler is
+// recognized; any other class id yields REGDB_E_CLASSNOTREG.
+static HRESULT ThreadMallocDxcCreateInstance(
+  _In_ REFCLSID   rclsid,
+                  _In_ REFIID     riid,
+                  _Out_ LPVOID   *ppv) {
+  *ppv = nullptr;
+
+  if (!IsEqualCLSID(rclsid, CLSID_DxcDxrFallbackCompiler)) {
+    return REGDB_E_CLASSNOTREG;
+  }
+
+  return CreateDxcDxrFallbackCompiler(riid, ppv);
+}
+
+// Exported factory function of the DLL. Rejects a null ppv with E_POINTER;
+// otherwise brackets the creation with the CreateInstance ETW start/stop
+// events, keeps a DxcThreadMalloc scope object alive for the duration of the
+// call, and returns the HRESULT of ThreadMallocDxcCreateInstance.
+DXC_API_IMPORT HRESULT __stdcall
+DxcCreateDxrFallbackCompiler(
+  _In_ REFCLSID   rclsid,
+  _In_ REFIID     riid,
+  _Out_ LPVOID   *ppv) {
+  if (!ppv) {
+    return E_POINTER;
+  }
+
+  DxcEtw_DXCompilerCreateInstance_Start();
+  DxcThreadMalloc TM(nullptr);
+  HRESULT createResult = ThreadMallocDxcCreateInstance(rclsid, riid, ppv);
+  DxcEtw_DXCompilerCreateInstance_Stop(createResult);
+  return createResult;
+}

+ 778 - 0
tools/clang/tools/dxrfallbackcompiler/dxcdxrfallbackcompiler.cpp

@@ -0,0 +1,778 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcdxrfallbackcompiler.cpp                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Implements the DirectX Raytracing Fallback Compiler object.               //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/Support/Global.h"
+#include "dxc/Support/Unicode.h"
+#include "dxc/Support/microcom.h"
+#include "dxc/dxcdxrfallbackcompiler.h"
+#include "dxc/DxrFallback/DxrFallbackCompiler.h"
+#include "dxc/HLSL/DxilContainer.h"
+#include "dxc/HLSL/DxilLinker.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/DxilUtil.h"
+#include "dxc/HLSL/DxilValidation.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/Support/dxcapi.impl.h"
+#include "dxcutil.h"
+
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MSFileSystem.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/LegacyPassManager.h"
+
+#include "dxc/HLSL/DxilFallbackLayerPass.h"
+
+using namespace llvm;
+using namespace hlsl;
+
+
+// Narrows a wide string by converting each wchar_t to char individually. No
+// encoding conversion is performed.
+static std::string ws2s(const std::wstring& wide)
+{
+  std::string narrow;
+  narrow.reserve(wide.size());
+  for (wchar_t wc : wide)
+  {
+    narrow.push_back(static_cast<char>(wc));
+  }
+  return narrow;
+}
+
+// Locates the part with the given FourCC inside the DXIL container held by
+// pBlob and returns a pointer to its program header through ppProgram. The
+// pointer refers into the blob's own buffer.
+// Returns DXC_E_CONTAINER_INVALID when the blob is not a valid container or
+// the program header fails validation, DXC_E_CONTAINER_MISSING_DXIL when no
+// part has the requested FourCC, and S_OK otherwise.
+static HRESULT FindDxilProgram(IDxcBlob* pBlob,
+  _In_ DxilFourCC FourCC,
+  _In_ const DxilProgramHeader **ppProgram)
+{
+  void* blobBytes = pBlob->GetBufferPointer();
+  SIZE_T blobSize = pBlob->GetBufferSize();
+
+  // Container must both look like and validate as a DXIL container.
+  const DxilContainerHeader *header = IsDxilContainerLike(blobBytes, blobSize);
+  if (header == nullptr || !IsValidDxilContainer(header, blobSize))
+  {
+    IFR(DXC_E_CONTAINER_INVALID);
+  }
+
+  DxilPartIterator part = std::find_if(begin(header), end(header),
+    DxilPartIsType(FourCC));
+  if (part == end(header))
+  {
+    IFR(DXC_E_CONTAINER_MISSING_DXIL);
+  }
+
+  const DxilProgramHeader *program =
+    reinterpret_cast<const DxilProgramHeader *>(GetDxilPartData(*part));
+  if (!IsValidDxilProgramHeader(program, (*part)->PartSize))
+  {
+    IFR(DXC_E_CONTAINER_INVALID);
+  }
+
+  *ppProgram = program;
+  return S_OK;
+}
+
+
+// Loads the DXIL bitcode from a container blob into `context` and returns the
+// DxilModule attached to the loaded llvm::Module, or nullptr when the bitcode
+// could not be loaded. Throws (via IFT) when the container has no valid DXIL
+// program.
+// The llvm::Module is released from its unique_ptr and is not deleted here;
+// the returned DxilModule keeps referring to it.
+static DxilModule* ExtractDxil(LLVMContext& context, IDxcBlob* pContainer)
+{
+  const DxilProgramHeader *program = nullptr;
+  IFT(FindDxilProgram(pContainer, DFCC_DXIL, &program));
+
+  const char *bitcode = nullptr;
+  uint32_t bitcodeSize = 0;
+  GetDxilProgramBitcode(program, &bitcode, &bitcodeSize);
+
+  std::string loadDiagnostics;
+  std::unique_ptr<Module> loaded = dxilutil::LoadModuleFromBitcode(
+    llvm::StringRef(bitcode, bitcodeSize), context, loadDiagnostics);
+  if (!loaded)
+    return nullptr;
+
+  DxilModule* dxil = &loaded->GetOrCreateDxilModule();
+  loaded.release();
+  return dxil;
+}
+
+
+// Writes the textual IR of `module` to `filename`. On any failure (the file
+// cannot be opened, or the stream reports an error after writing) an error
+// message is printed to errs() and the process exits with status 1.
+static void saveModuleToAsmFile(const llvm::Module* module, const std::string& filename)
+{
+  std::error_code EC;
+  raw_fd_ostream out(filename, EC, sys::fs::F_Text);
+  // Only write when the open succeeded; EC is set by the constructor when it
+  // did not, in which case there is no valid stream to print to.
+  if (!EC && !out.has_error())
+  {
+    module->print(out, nullptr);
+    out.close();
+  }
+  if (EC || out.has_error())
+  {
+    // Report the reason rather than repeating the filename: the open error
+    // when there is one, otherwise a generic write failure.
+    errs() << "Error saving to " << filename << ": "
+           << (EC ? EC.message() : std::string("write failed")) << "\n";
+    exit(1);
+  }
+}
+
+// COM-style implementation of IDxcDxrFallbackCompiler. Reference counting,
+// the allocator field and the constructor come from the DXC_MICROCOM_TM_*
+// macros; the larger methods are declared here and defined out of line.
+class DxcDxrFallbackCompiler : public IDxcDxrFallbackCompiler
+{
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+    // Set through SetFindCalledShaders().
+    bool m_findCalledShaders = false;
+  // Set through SetDebugOutput().
+  int m_debugOutput = 0;
+
+  // Only used for test purposes when exports aren't explicitly listed
+  std::unique_ptr<DxrFallbackCompiler::IntToFuncNameMap> m_pCachedMap;
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+    DXC_MICROCOM_TM_CTOR(DxcDxrFallbackCompiler)
+
+    // Only IDxcDxrFallbackCompiler is offered to DoBasicQueryInterface.
+    HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject)
+  {
+    return DoBasicQueryInterface<IDxcDxrFallbackCompiler>(this, iid, ppvObject);
+  }
+
+  // Stores the flag; always returns S_OK.
+  __override HRESULT STDMETHODCALLTYPE SetFindCalledShaders(bool val)
+  {
+    m_findCalledShaders = val;
+    return S_OK;
+  }
+
+  // Stores the debug output level; always returns S_OK.
+  __override HRESULT STDMETHODCALLTYPE SetDebugOutput(int val)
+  {
+    m_debugOutput = val;
+    return S_OK;
+  }
+
+  __override HRESULT STDMETHODCALLTYPE PatchShaderBindingTables(
+      _In_ const LPCWSTR pEntryName,
+      _In_ DxcShaderBytecode *pShaderBytecode,
+      void *pShaderInfo,
+      _COM_Outptr_ IDxcOperationResult **ppResult
+  );
+
+  __override HRESULT STDMETHODCALLTYPE RenameAndLink(
+      _In_count_(libCount) DxcShaderBytecode *pLibs,
+      UINT32 libCount,
+      _In_count_(ExportCount) DxcExportDesc *pExports,
+      UINT32 ExportCount,
+      _COM_Outptr_ IDxcOperationResult **ppResult
+  );
+
+  __override HRESULT STDMETHODCALLTYPE Compile(
+    _In_count_(libCount) DxcShaderBytecode *pLibs,
+    UINT32 libCount,
+    _In_count_(shaderCount) const LPCWSTR *pShaderNames,
+    _Out_writes_(shaderCount) DxcShaderInfo *pShaderInfo,
+    UINT32 shaderCount,
+    UINT32 maxAttributeSize,
+    _COM_Outptr_ IDxcOperationResult **ppResult
+  );
+
+  __override HRESULT STDMETHODCALLTYPE Link(
+      _In_ const LPCWSTR pEntryName,                      
+      _In_count_(libCount) IDxcBlob **pLibs,               
+      UINT32 libCount,                                    
+      _In_count_(shaderCount) const LPCWSTR *pShaderNames,
+      _In_count_(shaderCount) DxcShaderInfo *pShaderInfo, 
+      UINT32 shaderCount,                                 
+      UINT32 maxAttributeSize,
+      UINT32 stackSizeInBytes,                            
+      _COM_Outptr_ IDxcOperationResult **ppResult         
+  );
+};
+
+// TODO: Stolen from Brandon's code, merge
+// Strips the decoration from a mangled name: when `name` starts with "\x1?"
+// and contains "@@", returns the text between that two-character prefix and
+// the first "@@". Any other name is returned unchanged.
+static inline std::string GetUnmangledName(StringRef name) {
+    if (name.startswith("\x1?")) {
+        size_t terminatorPos = name.find("@@");
+        if (terminatorPos != name.npos) {
+            return name.substr(2, terminatorPos - 2);
+        }
+    }
+
+    return name;
+}
+
+// Returns the first function in M whose unmangled name, converted to UTF-16,
+// equals exportName, or nullptr when no function matches.
+static Function* getFunctionFromName(Module &M, const std::wstring& exportName) {
+    for (Function &candidate : M) {
+        const std::string unmangled = GetUnmangledName(candidate.getName());
+        if (Unicode::UTF8ToUTF16StringOrThrow(unmangled.c_str()) == exportName) {
+            return &candidate;
+        }
+    }
+    return nullptr;
+}
+
+// Forward declarations; both functions are defined outside this part of the
+// file.
+// getRayShaderKind: used by Compile() to classify each named shader.
+DXIL::ShaderKind getRayShaderKind(Function* F);
+// CloneFunction: used by RenameAndLink() with the function to rename, the new
+// export name and the module (presumably creates a copy of Orig called Name in
+// llvmModule - confirm against the definition).
+Function *CloneFunction(Function *Orig,
+    const llvm::Twine &Name,
+    llvm::Module *llvmModule);
+
+// Links the given DXIL libraries into a single module, clones every export
+// that is requested under a new name, and re-links the result as a "lib_6_1"
+// library restricted to the names listed in pExports.
+//
+//   pLibs / libCount       - DXIL containers to link together.
+//   pExports / ExportCount - exports to keep; when ExportToRename is set, that
+//                            function is cloned under ExportName first.
+//   ppResult               - receives the operation result (container blob
+//                            plus the collected diagnostics).
+//
+// Returns E_POINTER / E_INVALIDARG for bad arguments,
+// DXC_E_CONTAINER_MISSING_DXIL when a library holds no DXIL, and otherwise the
+// HRESULT produced by CATCH_CPP_ASSIGN_HRESULT (S_OK when nothing was thrown).
+HRESULT STDMETHODCALLTYPE DxcDxrFallbackCompiler::RenameAndLink(
+    _In_count_(libCount) DxcShaderBytecode *pLibs,
+    UINT32 libCount,
+    _In_count_(ExportCount) DxcExportDesc *pExports,
+    UINT32 ExportCount,
+    _COM_Outptr_ IDxcOperationResult **ppResult
+)
+{
+    // ppResult is written unconditionally below, so it must be validated too.
+    if (pLibs == nullptr || pExports == nullptr || ppResult == nullptr)
+        return E_POINTER;
+
+    if (libCount == 0 || ExportCount == 0)
+        return E_INVALIDARG;
+
+    *ppResult = nullptr;
+    HRESULT hr = S_OK;
+    DxcThreadMalloc TM(m_pMalloc);
+    LLVMContext context;
+    try
+    {
+        // Init file system because we are currently loading the runtime from disk
+        ::llvm::sys::fs::MSFileSystem *msfPtr;
+        IFT(CreateMSFileSystemForDisk(&msfPtr));
+        std::unique_ptr<::llvm::sys::fs::MSFileSystem> msf(msfPtr);
+        ::llvm::sys::fs::AutoPerThreadSystem pts(msf.get());
+        IFTLLVM(pts.error_code());
+
+        // Create a diagnostic printer
+        CComPtr<AbstractMemoryStream> pDiagStream;
+        IFT(CreateMemoryStream(TM.p, &pDiagStream));
+        raw_stream_ostream DiagStream(pDiagStream);
+        DiagnosticPrinterRawOStream DiagPrinter(DiagStream);
+        PrintDiagnosticContext DiagContext(DiagPrinter);
+        context.setDiagnosticHandler(PrintDiagnosticContext::PrintDiagnosticHandler,
+            &DiagContext, true);
+
+        std::vector<CComPtr<IDxcBlobEncoding>> pShaderLibs(libCount);
+        for (UINT i = 0; i < libCount; i++)
+        {
+            // A failed wrap would otherwise hand a null blob to ExtractDxil.
+            IFT(hlsl::DxcCreateBlobWithEncodingFromPinned(pLibs[i].pData, pLibs[i].Size, CP_ACP, &pShaderLibs[i]));
+        }
+
+        int valMajor = 1, valMinor = 2; // TODO: Where to get these values?
+
+        // Both linkers are owned here so they are released on every exit path.
+        // They are declared before M and therefore outlive every module they
+        // produced.
+        std::unique_ptr<DxilLinker> pCombineLinker;
+        std::unique_ptr<DxilLinker> pExportLinker;
+
+        // Link all the modules together into a single library
+        std::unique_ptr<Module> M;
+        {
+            pCombineLinker.reset(DxilLinker::CreateLinker(context, valMajor, valMinor));
+            for (UINT32 i = 0; i < libCount; ++i)
+            {
+                DxilModule* dxil = ExtractDxil(context, pShaderLibs[i]);
+                if (dxil == nullptr)
+                {
+                    return DXC_E_CONTAINER_MISSING_DXIL;
+                }
+                pCombineLinker->RegisterLib(std::to_string(i), std::unique_ptr<Module>(dxil->GetModule()), nullptr);
+                pCombineLinker->AttachLib(std::to_string(i));
+            }
+
+            dxilutil::ExportMap exportMap;
+            M = pCombineLinker->Link("", "lib_6_1", exportMap);
+            if (M && m_debugOutput)
+            {
+                saveModuleToAsmFile(M.get(), "combined.ll");
+            }
+        }
+
+        // A null module means the first link failed; skip the rename/re-link
+        // stage and report the failure through the operation result instead
+        // of dereferencing the null module.
+        const bool combined = (M != nullptr);
+
+        dxilutil::ExportMap exportMap;
+        if (combined)
+        {
+            for (UINT i = 0; i < ExportCount; i++)
+            {
+                auto &exportDesc = pExports[i];
+                auto exportName = ws2s(exportDesc.ExportName);
+                if (exportDesc.ExportToRename)
+                {
+                    auto exportToRename = ws2s(exportDesc.ExportToRename);
+                    Function *pOriginal = M->getFunction(exportToRename);
+                    if (pOriginal == nullptr)
+                    {
+                        // The export to rename does not exist in the linked module.
+                        IFT(E_INVALIDARG);
+                    }
+                    CloneFunction(
+                        pOriginal,
+                        exportName,
+                        M.get());
+                }
+                exportMap.Add(GetUnmangledName(exportName));
+            }
+
+            // Re-link the combined module so that only the requested exports remain
+            pExportLinker.reset(DxilLinker::CreateLinker(context, valMajor, valMinor));
+            pExportLinker->RegisterLib("M", std::move(M), nullptr);
+            pExportLinker->AttachLib("M");
+            auto profile = "lib_6_1";
+            M = pExportLinker->Link(StringRef(), profile, exportMap);
+        }
+        bool hasErrors = DiagContext.HasErrors() || !combined;
+
+        CComPtr<IDxcBlob> pResultBlob;
+        if (M)
+        {
+            CComPtr<AbstractMemoryStream> pOutputStream;
+            IFT(CreateMemoryStream(TM.p, &pOutputStream));
+            raw_stream_ostream outStream(pOutputStream.p);
+            WriteBitcodeToFile(M.get(), outStream);
+            outStream.flush();
+
+            // Wrap the bitcode in a DXIL container (no validation is run here).
+            dxcutil::AssembleToContainer(
+                std::move(M), pResultBlob, TM.p, SerializeDxilFlags::None,
+                pOutputStream);
+        }
+
+        DiagStream.flush();
+        CComPtr<IStream> pStream = pDiagStream;
+        std::string warnings;
+        dxcutil::CreateOperationResultFromOutputs(pResultBlob, pStream, warnings, hasErrors, ppResult);
+    }
+    CATCH_CPP_ASSIGN_HRESULT();
+
+    return hr;
+}
+
+// Runs the DxilPatchShaderRecordBindings pass over a single DXIL container and
+// returns the re-assembled container.
+//
+//   pEntryName      - not used by this implementation.
+//   pShaderBytecode - DXIL container to patch.
+//   pShaderInfo     - opaque pointer; its address is formatted with "%p" and
+//                     handed to the pass as the "root-signature" option.
+//   ppResult        - receives the operation result.
+//
+// Returns E_POINTER for null arguments, DXC_E_CONTAINER_MISSING_DXIL when the
+// container holds no DXIL, and otherwise the HRESULT produced by
+// CATCH_CPP_ASSIGN_HRESULT (S_OK when nothing was thrown).
+HRESULT STDMETHODCALLTYPE DxcDxrFallbackCompiler::PatchShaderBindingTables(
+    _In_ const LPCWSTR pEntryName,
+    _In_ DxcShaderBytecode *pShaderBytecode,
+    void *pShaderInfo,
+    _COM_Outptr_ IDxcOperationResult **ppResult
+)
+{
+    // ppResult is written unconditionally below, so it must be validated too.
+    if (pShaderBytecode == nullptr || pShaderInfo == nullptr || ppResult == nullptr)
+        return E_POINTER;
+
+    *ppResult = nullptr;
+    HRESULT hr = S_OK;
+    DxcThreadMalloc TM(m_pMalloc);
+    LLVMContext context;
+    try
+    {
+        CComPtr<IDxcBlobEncoding> pShaderBlob;
+        // A failed wrap would otherwise hand a null blob to ExtractDxil.
+        IFT(hlsl::DxcCreateBlobWithEncodingFromPinned(pShaderBytecode->pData, pShaderBytecode->Size, CP_ACP, &pShaderBlob));
+
+        // Init file system because we are currently loading the runtime from disk
+        ::llvm::sys::fs::MSFileSystem *msfPtr;
+        IFT(CreateMSFileSystemForDisk(&msfPtr));
+        std::unique_ptr<::llvm::sys::fs::MSFileSystem> msf(msfPtr);
+        ::llvm::sys::fs::AutoPerThreadSystem pts(msf.get());
+        IFTLLVM(pts.error_code());
+
+        // Create a diagnostic printer
+        CComPtr<AbstractMemoryStream> pDiagStream;
+        IFT(CreateMemoryStream(TM.p, &pDiagStream));
+        raw_stream_ostream DiagStream(pDiagStream);
+        DiagnosticPrinterRawOStream DiagPrinter(DiagStream);
+        PrintDiagnosticContext DiagContext(DiagPrinter);
+        context.setDiagnosticHandler(PrintDiagnosticContext::PrintDiagnosticHandler,
+            &DiagContext, true);
+
+        // The null check has to happen before dxil is dereferenced.
+        DxilModule* dxil = ExtractDxil(context, pShaderBlob);
+        if (dxil == nullptr)
+        {
+            return DXC_E_CONTAINER_MISSING_DXIL;
+        }
+
+        // TODO: Lifetime management?
+        std::unique_ptr<Module> M(dxil->GetModule());
+
+        ModulePass *patchShaderRecordBindingsPass = createDxilPatchShaderRecordBindingsPass();
+
+        // The pass receives the pShaderInfo pointer as text through its
+        // "root-signature" option.
+        char dxilPatchShaderRecordString[32];
+        StringCchPrintf(dxilPatchShaderRecordString, _countof(dxilPatchShaderRecordString),
+            "%p", pShaderInfo);
+        auto passOption = PassOption("root-signature", dxilPatchShaderRecordString);
+        PassOptions options(passOption);
+        patchShaderRecordBindingsPass->applyOptions(options);
+
+        legacy::PassManager FPM;
+        FPM.add(patchShaderRecordBindingsPass);
+        FPM.run(*M);
+
+        CComPtr<IDxcBlob> pResultBlob;
+        if (M)
+        {
+            CComPtr<AbstractMemoryStream> pOutputStream;
+            IFT(CreateMemoryStream(TM.p, &pOutputStream));
+            raw_stream_ostream outStream(pOutputStream.p);
+            WriteBitcodeToFile(M.get(), outStream);
+            outStream.flush();
+            dxcutil::AssembleToContainer(
+                std::move(M),
+                pResultBlob,
+                TM.p,
+                SerializeDxilFlags::None,
+                pOutputStream);
+        }
+
+        DiagStream.flush();
+        CComPtr<IStream> pStream = pDiagStream;
+        std::string warnings;
+        dxcutil::CreateOperationResultFromOutputs(pResultBlob, pStream, warnings, false, ppResult);
+    }
+    CATCH_CPP_ASSIGN_HRESULT();
+
+    return hr;
+}
+
+// Links previously compiled libraries into a validated "cs_6_0" compute shader.
+//
+//   pEntryName       - entry point passed to the final link; may be null.
+//   pLibs / libCount - DXIL containers to link together.
+//   pShaderNames     - names of the shaders handed to DxrFallbackCompiler.
+//   pShaderInfo      - per-shader Identifier and StackSize values that are fed
+//                      into DxrFallbackCompiler::link.
+//   shaderCount      - number of entries in pShaderNames and pShaderInfo.
+//   maxAttributeSize - forwarded to DxrFallbackCompiler.
+//   stackSizeInBytes - forwarded to DxrFallbackCompiler; when non-zero the
+//                      entry function's stack is also resized after linking.
+//   ppResult         - receives the operation result.
+//
+// Returns E_POINTER / E_INVALIDARG for bad arguments,
+// DXC_E_CONTAINER_MISSING_DXIL when a library holds no DXIL, and otherwise the
+// HRESULT produced by CATCH_CPP_ASSIGN_HRESULT (S_OK when nothing was thrown).
+HRESULT STDMETHODCALLTYPE DxcDxrFallbackCompiler::Link(
+    _In_ const LPCWSTR pEntryName,
+    _In_count_(libCount) IDxcBlob **pLibs,
+    UINT32 libCount,
+    _In_count_(shaderCount) const LPCWSTR *pShaderNames,
+    _In_count_(shaderCount) DxcShaderInfo *pShaderInfo,
+    UINT32 shaderCount,
+    UINT32 maxAttributeSize,
+    UINT32 stackSizeInBytes,
+    _COM_Outptr_ IDxcOperationResult **ppResult
+)
+{
+    // pShaderInfo is read for every shader below, so it must be validated too.
+    if (pLibs == nullptr || pShaderNames == nullptr || pShaderInfo == nullptr || ppResult == nullptr)
+        return E_POINTER;
+
+    if (libCount == 0 || shaderCount == 0)
+        return E_INVALIDARG;
+
+    *ppResult = nullptr;
+    HRESULT hr = S_OK;
+    DxcThreadMalloc TM(m_pMalloc);
+    LLVMContext context;
+    try
+    {
+        // Init file system because we are currently loading the runtime from disk
+        ::llvm::sys::fs::MSFileSystem *msfPtr;
+        IFT(CreateMSFileSystemForDisk(&msfPtr));
+        std::unique_ptr<::llvm::sys::fs::MSFileSystem> msf(msfPtr);
+        ::llvm::sys::fs::AutoPerThreadSystem pts(msf.get());
+        IFTLLVM(pts.error_code());
+
+        // Create a diagnostic printer
+        CComPtr<AbstractMemoryStream> pDiagStream;
+        IFT(CreateMemoryStream(TM.p, &pDiagStream));
+        raw_stream_ostream DiagStream(pDiagStream);
+        DiagnosticPrinterRawOStream DiagPrinter(DiagStream);
+        PrintDiagnosticContext DiagContext(DiagPrinter);
+        context.setDiagnosticHandler(PrintDiagnosticContext::PrintDiagnosticHandler,
+            &DiagContext, true);
+
+        std::vector<std::string> shaderNames(shaderCount);
+        for (UINT32 i = 0; i < shaderCount; ++i)
+            shaderNames[i] = ws2s(pShaderNames[i]);
+
+        // Converted once; stays empty when no entry name was supplied.
+        const std::string entryName = pEntryName ? ws2s(pEntryName) : std::string();
+
+        int valMajor = 1, valMinor = 2; // TODO: Where to get these values?
+
+        // Both linkers are owned here so they are released on every exit path.
+        // They are declared before M and therefore outlive every module they
+        // produced.
+        std::unique_ptr<DxilLinker> pCombineLinker;
+        std::unique_ptr<DxilLinker> pShaderLinker;
+
+        // Link all the modules together into a single library
+        std::unique_ptr<Module> M;
+        {
+            pCombineLinker.reset(DxilLinker::CreateLinker(context, valMajor, valMinor));
+            for (UINT32 i = 0; i < libCount; ++i)
+            {
+                DxilModule* dxil = ExtractDxil(context, pLibs[i]);
+                if (dxil == nullptr)
+                {
+                    return DXC_E_CONTAINER_MISSING_DXIL;
+                }
+                pCombineLinker->RegisterLib(std::to_string(i), std::unique_ptr<Module>(dxil->GetModule()), nullptr);
+                pCombineLinker->AttachLib(std::to_string(i));
+            }
+
+            dxilutil::ExportMap exportMap;
+            M = pCombineLinker->Link("", "lib_6_1", exportMap);
+            if (M && m_debugOutput)
+            {
+                saveModuleToAsmFile(M.get(), "combined.ll");
+            }
+        }
+
+        // A null module means the first link failed; skip the remaining
+        // stages and report the failure through the operation result instead
+        // of dereferencing the null module.
+        const bool combined = (M != nullptr);
+
+        if (combined)
+        {
+            std::vector<int> shaderEntryStateIds(shaderCount);
+            std::vector<unsigned int> shaderStackSizes(shaderCount);
+            for (UINT i = 0; i < shaderCount; i++)
+            {
+                shaderEntryStateIds[i] = pShaderInfo[i].Identifier;
+                shaderStackSizes[i] = pShaderInfo[i].StackSize;
+            }
+
+            DxrFallbackCompiler compiler(M.get(), shaderNames, maxAttributeSize, stackSizeInBytes, m_findCalledShaders);
+            compiler.setDebugOutputLevel(m_debugOutput);
+            compiler.link(shaderEntryStateIds, shaderStackSizes, m_pCachedMap.get());
+            if (m_debugOutput)
+            {
+                saveModuleToAsmFile(M.get(), "compiled.ll");
+            }
+
+            // Create the compute shader
+            dxilutil::ExportMap exportMap;
+            pShaderLinker.reset(DxilLinker::CreateLinker(context, valMajor, valMinor));
+            pShaderLinker->RegisterLib("M", std::move(M), nullptr);
+            pShaderLinker->AttachLib("M");
+            auto profile = "cs_6_0";
+            M = pShaderLinker->Link(pEntryName ? StringRef(entryName) : StringRef(), profile, exportMap);
+        }
+        bool hasErrors = DiagContext.HasErrors() || !combined;
+
+        CComPtr<IDxcBlob> pResultBlob;
+        if (M)
+        {
+            // Without an entry name there is no function to look up.
+            if (!hasErrors && stackSizeInBytes && pEntryName)
+                DxrFallbackCompiler::resizeStack(M->getFunction(entryName), stackSizeInBytes);
+
+            llvm::NamedMDNode *IdentMetadata = M->getOrInsertNamedMetadata("llvm.ident");
+            llvm::LLVMContext &Ctx = M->getContext();
+            llvm::Metadata *IdentNode[] = { llvm::MDString::get(Ctx, "FallbackLayer") };
+            IdentMetadata->addOperand(llvm::MDNode::get(Ctx, IdentNode));
+
+            // Re-emit the DXIL metadata so it reflects the validator version
+            DxilModule& DM = M->GetDxilModule();
+            DM.SetValidatorVersion(valMajor, valMinor);
+            DxilModule::ClearDxilMetadata(*M);
+            DM.EmitDxilMetadata();
+
+            if (m_debugOutput)
+                saveModuleToAsmFile(M.get(), "linked.ll");
+
+            // Diag has to live in the same scope as the
+            // ValidateAndAssembleToContainer call that consumes it.
+    #if !DISABLE_GET_CUSTOM_DIAG_ID
+            const IntrusiveRefCntPtr<clang::DiagnosticIDs> Diags(
+                new clang::DiagnosticIDs);
+            IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts =
+                new clang::DiagnosticOptions();
+            // Construct our diagnostic client.
+            clang::TextDiagnosticPrinter *DiagClient =
+                new clang::TextDiagnosticPrinter(DiagStream, &*DiagOpts);
+            clang::DiagnosticsEngine Diag(Diags, &*DiagOpts, DiagClient);
+    #endif
+
+            CComPtr<AbstractMemoryStream> pOutputStream;
+            IFT(CreateMemoryStream(TM.p, &pOutputStream));
+            raw_stream_ostream outStream(pOutputStream.p);
+            WriteBitcodeToFile(M.get(), outStream);
+            outStream.flush();
+
+            // Validation.
+            HRESULT valHR = dxcutil::ValidateAndAssembleToContainer(
+                std::move(M), pResultBlob, TM.p, SerializeDxilFlags::None,
+                pOutputStream,
+                /*bDebugInfo*/ false
+#if !DISABLE_GET_CUSTOM_DIAG_ID
+                , Diag
+#endif
+            );
+
+            if (FAILED(valHR))
+                hasErrors = true;
+        }
+
+        DiagStream.flush();
+        CComPtr<IStream> pStream = pDiagStream;
+        std::string warnings;
+        dxcutil::CreateOperationResultFromOutputs(pResultBlob, pStream, warnings, hasErrors, ppResult);
+    }
+    CATCH_CPP_ASSIGN_HRESULT();
+
+    return hr;
+}
+
+// Links the given DXIL libraries, runs DxrFallbackCompiler::compile over the
+// named shaders and returns the result as a "lib_6_1" container.
+//
+//   pShaderLibs / libCount - DXIL containers to link together.
+//   pShaderNames           - names of the shaders to compile.
+//   pShaderInfo            - receives Identifier, StackSize and Type for each
+//                            shader (only the first entry when "find called
+//                            shaders" is enabled).
+//   shaderCount            - number of entries in pShaderNames / pShaderInfo.
+//   maxAttributeSize       - forwarded to DxrFallbackCompiler.
+//   ppResult               - receives the operation result.
+//
+// Returns E_POINTER / E_INVALIDARG for bad arguments,
+// DXC_E_CONTAINER_MISSING_DXIL when a library holds no DXIL, and otherwise the
+// HRESULT produced by CATCH_CPP_ASSIGN_HRESULT (S_OK when nothing was thrown).
+HRESULT STDMETHODCALLTYPE DxcDxrFallbackCompiler::Compile(
+  _In_count_(libCount) DxcShaderBytecode *pShaderLibs,
+  UINT32 libCount,
+  _In_count_(shaderCount) const LPCWSTR *pShaderNames,
+  _Out_writes_(shaderCount) DxcShaderInfo *pShaderInfo,
+  UINT32 shaderCount,
+  UINT32 maxAttributeSize,
+  _COM_Outptr_ IDxcOperationResult **ppResult
+)
+{
+  // pShaderInfo is written below, so it must be validated too.
+  if (pShaderLibs == nullptr || pShaderNames == nullptr || pShaderInfo == nullptr || ppResult == nullptr)
+    return E_POINTER;
+
+  if (libCount == 0 || shaderCount == 0)
+    return E_INVALIDARG;
+
+  *ppResult = nullptr;
+  HRESULT hr = S_OK;
+  DxcThreadMalloc TM(m_pMalloc);
+  LLVMContext context;
+  try
+  {
+    std::vector<CComPtr<IDxcBlobEncoding>> pLibs(libCount);
+    for (UINT i = 0; i < libCount; i++)
+    {
+      auto &shaderBytecode = pShaderLibs[i];
+      // A failed wrap would otherwise hand a null blob to ExtractDxil.
+      IFT(hlsl::DxcCreateBlobWithEncodingFromPinned(shaderBytecode.pData, shaderBytecode.Size, CP_ACP, &pLibs[i]));
+    }
+
+    // Init file system because we are currently loading the runtime from disk
+    ::llvm::sys::fs::MSFileSystem *msfPtr;
+    IFT(CreateMSFileSystemForDisk(&msfPtr));
+    std::unique_ptr<::llvm::sys::fs::MSFileSystem> msf(msfPtr);
+    ::llvm::sys::fs::AutoPerThreadSystem pts(msf.get());
+    IFTLLVM(pts.error_code());
+
+    // Create a diagnostic printer
+    CComPtr<AbstractMemoryStream> pDiagStream;
+    IFT(CreateMemoryStream(TM.p, &pDiagStream));
+    raw_stream_ostream DiagStream(pDiagStream);
+    DiagnosticPrinterRawOStream DiagPrinter(DiagStream);
+    PrintDiagnosticContext DiagContext(DiagPrinter);
+    context.setDiagnosticHandler(PrintDiagnosticContext::PrintDiagnosticHandler,
+        &DiagContext, true);
+
+    std::vector<std::string> shaderNames(shaderCount);
+    for (UINT32 i = 0; i < shaderCount; ++i)
+      shaderNames[i] = ws2s(pShaderNames[i]);
+
+    int valMajor = 1, valMinor = 2; // TODO: Where to get these values?
+
+    // Both linkers are owned here so they are released on every exit path.
+    // They are declared before M and therefore outlive every module they
+    // produced.
+    std::unique_ptr<DxilLinker> pCombineLinker;
+    std::unique_ptr<DxilLinker> pLibLinker;
+
+    // Link all the modules together into a single library
+    std::unique_ptr<Module> M;
+    {
+      pCombineLinker.reset(DxilLinker::CreateLinker(context, valMajor, valMinor));
+      for (UINT32 i = 0; i < libCount; ++i)
+      {
+        DxilModule* dxil = ExtractDxil(context, pLibs[i]);
+        if (dxil == nullptr)
+        {
+          return DXC_E_CONTAINER_MISSING_DXIL;
+        }
+        pCombineLinker->RegisterLib(std::to_string(i), std::unique_ptr<Module>(dxil->GetModule()), nullptr);
+        pCombineLinker->AttachLib(std::to_string(i));
+      }
+
+      dxilutil::ExportMap exportMap;
+      M = pCombineLinker->Link("", "lib_6_1", exportMap);
+      if (M && m_debugOutput)
+      {
+        saveModuleToAsmFile(M.get(), "combined.ll");
+      }
+    }
+
+    // A null module means the first link failed; skip the remaining stages
+    // and report the failure through the operation result instead of
+    // dereferencing the null module.
+    const bool combined = (M != nullptr);
+
+    std::vector<ShaderType> shaderTypes;
+    std::vector<int> shaderEntryStateIds;
+    std::vector<unsigned int> shaderStackSizes;
+    if (combined)
+    {
+      // Translate each shader's DXIL kind into the fallback layer's ShaderType.
+      for (UINT32 i = 0; i < shaderCount; ++i)
+      {
+        switch (getRayShaderKind(getFunctionFromName(*M, pShaderNames[i])))
+        {
+        case DXIL::ShaderKind::RayGeneration:
+          shaderTypes.push_back(ShaderType::Raygen);
+          break;
+        case DXIL::ShaderKind::AnyHit:
+          shaderTypes.push_back(ShaderType::AnyHit);
+          break;
+        case DXIL::ShaderKind::ClosestHit:
+          shaderTypes.push_back(ShaderType::ClosestHit);
+          break;
+        case DXIL::ShaderKind::Intersection:
+          shaderTypes.push_back(ShaderType::Intersection);
+          break;
+        case DXIL::ShaderKind::Miss:
+          shaderTypes.push_back(ShaderType::Miss);
+          break;
+        case DXIL::ShaderKind::Callable:
+          shaderTypes.push_back(ShaderType::Callable);
+          break;
+        default:
+          shaderTypes.push_back(ShaderType::Lib);
+          break;
+        }
+      }
+
+      if (m_findCalledShaders)
+      {
+        m_pCachedMap.reset(new DxrFallbackCompiler::IntToFuncNameMap);
+      }
+
+      DxrFallbackCompiler compiler(M.get(), shaderNames, maxAttributeSize, 0, m_findCalledShaders);
+      compiler.setDebugOutputLevel(m_debugOutput);
+      compiler.compile(shaderEntryStateIds, shaderStackSizes, m_pCachedMap.get());
+      if (m_debugOutput)
+      {
+        saveModuleToAsmFile(M.get(), "compiled.ll");
+      }
+
+      // Re-link the compiled module into the output library
+      dxilutil::ExportMap exportMap;
+      pLibLinker.reset(DxilLinker::CreateLinker(context, valMajor, valMinor));
+      pLibLinker->RegisterLib("M", std::move(M), nullptr);
+      pLibLinker->AttachLib("M");
+      auto profile = "lib_6_1";
+      M = pLibLinker->Link(StringRef(), profile, exportMap);
+    }
+    bool hasErrors = DiagContext.HasErrors() || !combined;
+
+    CComPtr<IDxcBlob> pResultBlob;
+    if (M)
+    {
+      CComPtr<AbstractMemoryStream> pOutputStream;
+      IFT(CreateMemoryStream(TM.p, &pOutputStream));
+      raw_stream_ostream outStream(pOutputStream.p);
+      WriteBitcodeToFile(M.get(), outStream);
+      outStream.flush();
+      dxcutil::AssembleToContainer(
+          std::move(M),
+          pResultBlob,
+          TM.p,
+          SerializeDxilFlags::None,
+          pOutputStream);
+    }
+
+    DiagStream.flush();
+    CComPtr<IStream> pStream = pDiagStream;
+    std::string warnings;
+    dxcutil::CreateOperationResultFromOutputs(pResultBlob, pStream, warnings, hasErrors, ppResult);
+
+    // Write out shader identifiers (nothing was computed if the first link failed)
+    if (combined)
+    {
+      size_t copyCount = (m_findCalledShaders) ? 1 : shaderCount;
+      for (unsigned int i = 0; i < copyCount; i++)
+      {
+        pShaderInfo[i].Identifier = shaderEntryStateIds[i];
+        pShaderInfo[i].StackSize = shaderStackSizes[i];
+        pShaderInfo[i].Type = shaderTypes[i];
+      }
+    }
+  }
+  CATCH_CPP_ASSIGN_HRESULT();
+
+  return hr;
+}
+
+
+// Factory for DxcDxrFallbackCompiler: allocates an instance with the current
+// thread allocator and returns the interface identified by riid through ppv.
+// On allocation failure *ppv is cleared and E_OUTOFMEMORY is returned.
+HRESULT CreateDxcDxrFallbackCompiler(_In_ REFIID riid, _Out_ LPVOID *ppv)
+{
+  CComPtr<DxcDxrFallbackCompiler> pCompiler = DxcDxrFallbackCompiler::Alloc(DxcGetThreadMallocNoRef());
+  if (pCompiler != nullptr)
+  {
+    return pCompiler.p->QueryInterface(riid, ppv);
+  }
+
+  *ppv = nullptr;
+  return E_OUTOFMEMORY;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ 266 - 0
tools/clang/tools/dxrfallbackcompiler/dxcutil.cpp

@@ -0,0 +1,266 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcutil.cpp                                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides helper code for dxcompiler.                                      //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/HLSL/DxilContainer.h"
+#include "dxc/Support/Global.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/dxcapi.h"
+#include "dxcutil.h"
+#include "dxillib.h"
+#include "clang/Basic/Diagnostic.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "dxc/Support/dxcapi.impl.h"
+#include "dxc/Support/HLSLOptions.h"
+
+#include "llvm/Support/Path.h"
+
+
+using namespace llvm;
+using namespace hlsl;
+
+// This declaration is used for the locally-linked validator.
+HRESULT CreateDxcValidator(_In_ REFIID riid, _Out_ LPVOID *ppv);
+// This internal call allows the validator to avoid having to re-deserialize
+// the module. It trusts that the caller didn't make any changes and is
+// kept internal because the layout of the module class may change based
+// on changes across modules, or picking a different compiler version or CRT.
+HRESULT RunInternalValidator(_In_ IDxcValidator *pValidator,
+                             _In_ llvm::Module *pModule,
+                             _In_ llvm::Module *pDebugModule,
+                             _In_ IDxcBlob *pShader, UINT32 Flags,
+                             _In_ IDxcOperationResult **ppResult);
+
+namespace {
+// AssembleToContainer helper functions.
+
+// Obtains a validator in pValidator. When the DXIL library is enabled it is
+// asked first; if that leaves pValidator empty, the locally linked validator
+// is created instead (throwing on failure).
+// Returns true when the locally linked (internal) validator was used.
+bool CreateValidator(CComPtr<IDxcValidator> &pValidator) {
+  if (DxilLibIsEnabled()) {
+    // The result is deliberately not checked; a null pValidator triggers the
+    // fallback below.
+    DxilLibCreateInstance(CLSID_DxcValidator, &pValidator);
+  }
+  if (pValidator != nullptr) {
+    return false;
+  }
+
+  IFT(CreateDxcValidator(IID_PPV_ARGS(&pValidator)));
+  return true;
+}
+
+// Owns an llvm module (and, optionally, a clone of it that keeps its debug
+// info) and offers the helpers needed to turn the module into a DXIL
+// container.
+class DxilCompilerLLVMModuleOutput {
+public:
+  // Takes ownership of the given module.
+  DxilCompilerLLVMModuleOutput(std::unique_ptr<llvm::Module> module)
+      : m_llvmModule(std::move(module)) {}
+
+  // Stores a clone of the owned module; see getWithDebugInfo().
+  void CloneForDebugInfo() {
+    llvm::Module *pClone = llvm::CloneModule(m_llvmModule.get());
+    m_llvmModuleWithDebugInfo.reset(pClone);
+  }
+
+  // Serializes the owned module's DXIL module together with pModuleBitcode
+  // into a new memory stream and returns that stream as a blob through
+  // pDxilContainerBlob (any blob previously held there is released).
+  void WrapModuleInDxilContainer(IMalloc *pMalloc,
+                                 AbstractMemoryStream *pModuleBitcode,
+                                 CComPtr<IDxcBlob> &pDxilContainerBlob,
+                                 SerializeDxilFlags Flags) {
+    CComPtr<AbstractMemoryStream> pSerialized;
+    IFT(CreateMemoryStream(pMalloc, &pSerialized));
+
+    auto &dxilModule = m_llvmModule->GetOrCreateDxilModule();
+    SerializeDxilContainerForModule(&dxilModule, pModuleBitcode, pSerialized,
+                                    Flags);
+
+    pDxilContainerBlob.Release();
+    IFT(pSerialized.QueryInterface(&pDxilContainerBlob));
+  }
+
+  // The owned module.
+  llvm::Module *get() { return m_llvmModule.get(); }
+  // The clone made by CloneForDebugInfo(), or null if it was never called.
+  llvm::Module *getWithDebugInfo() { return m_llvmModuleWithDebugInfo.get(); }
+
+private:
+  std::unique_ptr<llvm::Module> m_llvmModule;
+  std::unique_ptr<llvm::Module> m_llvmModuleWithDebugInfo;
+};
+
+} // namespace
+
+namespace dxcutil {
+// Reports the version of the validator that CreateValidator() selects.
+// Does nothing when either output pointer is null. If the validator does not
+// expose IDxcVersionInfo, version 1.0 is reported.
+void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor) {
+  const bool haveOutputs = pMajor != nullptr && pMinor != nullptr;
+  if (!haveOutputs)
+    return;
+
+  CComPtr<IDxcValidator> pValidator;
+  CreateValidator(pValidator);
+
+  CComPtr<IDxcVersionInfo> pVersionInfo;
+  if (FAILED(pValidator.QueryInterface(&pVersionInfo))) {
+    // Default to 1.0
+    *pMajor = 1;
+    *pMinor = 0;
+    return;
+  }
+
+  IFT(pVersionInfo->GetVersion(pMajor, pMinor));
+}
+
+// Wraps the module and its already serialized bitcode (pOutputStream) into a
+// DXIL container returned through pOutputBlob. No validation is performed.
+// The module is owned by this call and released when it returns.
+void AssembleToContainer(std::unique_ptr<llvm::Module> pM,
+                         CComPtr<IDxcBlob> &pOutputBlob,
+                         IMalloc *pMalloc,
+                         SerializeDxilFlags SerializeFlags,
+                         CComPtr<AbstractMemoryStream> &pOutputStream) {
+  // The wrapper takes over the module handed in by the caller.
+  DxilCompilerLLVMModuleOutput moduleOutput(std::move(pM));
+  moduleOutput.WrapModuleInDxilContainer(pMalloc, pOutputStream, pOutputBlob,
+                                         SerializeFlags);
+}
+
+// Parses mainArgs into opts, writing any parser output to pOutputStream.
+// On success `finished` is set to false and ppResult is left untouched. When
+// parsing fails, the stream contents are returned as a UTF-8 error blob in an
+// E_INVALIDARG operation result and `finished` is set to true.
+void ReadOptsAndValidate(hlsl::options::MainArgs &mainArgs,
+                         hlsl::options::DxcOpts &opts,
+                         AbstractMemoryStream *pOutputStream,
+                         _COM_Outptr_ IDxcOperationResult **ppResult,
+                         bool &finished) {
+  const llvm::opt::OptTable *table = ::options::getHlslOptTable();
+  raw_stream_ostream outStream(pOutputStream);
+  const auto parseStatus = hlsl::options::ReadDxcOpts(
+      table, hlsl::options::CompilerFlags, mainArgs, opts, outStream);
+
+  if (parseStatus == 0) {
+    DXASSERT(opts.HLSLVersion > 2015,
+             "else ReadDxcOpts didn't fail for non-isense");
+    finished = false;
+    return;
+  }
+
+  // Parsing failed: package whatever was written to the stream as the error.
+  CComPtr<IDxcBlob> pErrorBlob;
+  IFT(pOutputStream->QueryInterface(&pErrorBlob));
+  CComPtr<IDxcBlobEncoding> pErrorBlobWithEncoding;
+  outStream.flush();
+  IFT(DxcCreateBlobWithEncodingSet(pErrorBlob.p, CP_UTF8,
+                                   &pErrorBlobWithEncoding));
+  IFT(DxcOperationResult::CreateFromResultErrorStatus(
+      nullptr, pErrorBlobWithEncoding.p, E_INVALIDARG, ppResult));
+  finished = true;
+}
+
+// Wraps the module and its serialized bitcode (pOutputStream) into a DXIL
+// container, runs the validator over it and returns the container through
+// pOutputBlob (replaced by the validator's result blob when it provides one).
+//
+//   pM             - module to assemble; owned by this call.
+//   pOutputBlob    - receives the container.
+//   pMalloc        - allocator for the container stream.
+//   SerializeFlags - forwarded to the container serialization.
+//   pOutputStream  - bitcode of the module.
+//   bDebugInfo     - with the internal validator, clone the module first so a
+//                    copy with debug info is available for validation.
+//   Diag           - only present when DISABLE_GET_CUSTOM_DIAG_ID is 0;
+//                    receives the "DXIL.dll not found" warning and any
+//                    validation errors.
+//
+// Returns the validation status reported by the validator; failures of the
+// individual steps are thrown via IFT.
+HRESULT ValidateAndAssembleToContainer(
+    std::unique_ptr<llvm::Module> pM, CComPtr<IDxcBlob> &pOutputBlob,
+    IMalloc *pMalloc, SerializeDxilFlags SerializeFlags,
+    CComPtr<AbstractMemoryStream> &pOutputStream, bool bDebugInfo
+#if  !DISABLE_GET_CUSTOM_DIAG_ID
+    , clang::DiagnosticsEngine &Diag
+#endif
+) {
+  HRESULT valHR = S_OK;
+
+  // Take ownership of the module from the action.
+  DxilCompilerLLVMModuleOutput llvmModule(std::move(pM));
+
+  CComPtr<IDxcValidator> pValidator;
+  bool bInternalValidator = CreateValidator(pValidator);
+  // Warning on internal Validator
+
+  if (bInternalValidator) {
+    // TODO how to make this work without clang?
+#if !DISABLE_GET_CUSTOM_DIAG_ID
+    unsigned diagID =
+        Diag.getCustomDiagID(clang::DiagnosticsEngine::Level::Warning,
+                             "DXIL.dll not found.  Resulting DXIL will not be "
+                             "signed for use in release environments.\r\n");
+    Diag.Report(diagID);
+#endif
+    // If using the internal validator, we'll use the modules directly.
+    // In this case, we'll want to make a clone to avoid
+    // SerializeDxilContainerForModule stripping all the debug info. The debug
+    // info will be stripped from the original module, but preserved in the cloned
+    // module.
+    if (bDebugInfo) {
+      llvmModule.CloneForDebugInfo();
+    }
+  }
+
+  llvmModule.WrapModuleInDxilContainer(pMalloc, pOutputStream, pOutputBlob,
+                                       SerializeFlags);
+
+  CComPtr<IDxcOperationResult> pValResult;
+  // Important: in-place edit is required so the blob is reused and thus
+  // dxil.dll can be released.
+  if (bInternalValidator) {
+    IFT(RunInternalValidator(pValidator, llvmModule.get(),
+                             llvmModule.getWithDebugInfo(), pOutputBlob,
+                             DxcValidatorFlags_InPlaceEdit, &pValResult));
+  } else {
+    IFT(pValidator->Validate(pOutputBlob, DxcValidatorFlags_InPlaceEdit,
+                             &pValResult));
+  }
+  IFT(pValResult->GetStatus(&valHR));
+  if (FAILED(valHR)) {
+    // Fetch the validator's error text as UTF-8; it is only reported when
+    // the clang diagnostics engine is available.
+    CComPtr<IDxcBlobEncoding> pErrors;
+    CComPtr<IDxcBlobEncoding> pErrorsUtf8;
+    IFT(pValResult->GetErrorBuffer(&pErrors));
+    IFT(hlsl::DxcGetBlobAsUtf8(pErrors, &pErrorsUtf8));
+    StringRef errRef((const char *)pErrorsUtf8->GetBufferPointer(),
+                     pErrorsUtf8->GetBufferSize());
+
+#if !DISABLE_GET_CUSTOM_DIAG_ID
+    unsigned DiagID = Diag.getCustomDiagID(clang::DiagnosticsEngine::Error,
+                                           "validation errors\r\n%0");
+    Diag.Report(DiagID) << errRef;
+#endif
+  }
+  // Prefer the blob returned by the validator when there is one.
+  CComPtr<IDxcBlob> pValidatedBlob;
+  IFT(pValResult->GetResult(&pValidatedBlob));
+  if (pValidatedBlob != nullptr) {
+    std::swap(pOutputBlob, pValidatedBlob);
+  }
+  pValidator.Release();
+
+  return valHR;
+}
+
+// Builds the IDxcOperationResult for a finished operation.
+// The error blob is taken from pErrorStream (marked as UTF-8); when that is
+// missing or empty, a heap copy of `warnings` is used instead. The result
+// status is E_FAIL when hasErrorOccurred is set and S_OK otherwise.
+void CreateOperationResultFromOutputs(
+    IDxcBlob *pResultBlob, CComPtr<IStream> &pErrorStream,
+    const std::string &warnings, bool hasErrorOccurred,
+    _COM_Outptr_ IDxcOperationResult **ppResult) {
+  CComPtr<IDxcBlobEncoding> pDiagnostics;
+
+  if (pErrorStream != nullptr) {
+    CComPtr<IDxcBlob> pStreamAsBlob;
+    IFT(pErrorStream.QueryInterface(&pStreamAsBlob));
+    IFT(DxcCreateBlobWithEncodingSet(pStreamAsBlob, CP_UTF8, &pDiagnostics));
+  }
+
+  // Nothing usable in the stream: fall back to the warnings text.
+  if (IsBlobNullOrEmpty(pDiagnostics)) {
+    pDiagnostics.Release();
+    IFT(DxcCreateBlobWithEncodingOnHeapCopy(warnings.c_str(), warnings.size(),
+                                            CP_UTF8, &pDiagnostics));
+  }
+
+  HRESULT status = S_OK;
+  if (hasErrorOccurred) {
+    status = E_FAIL;
+  }
+  IFT(DxcOperationResult::CreateFromResultErrorStatus(pResultBlob, pDiagnostics,
+                                                      status, ppResult));
+}
+
+// Returns true when T is an absolute path, or when it begins with "./" or
+// ".\". Only twines that are a single StringRef are supported for the second
+// check; any other twine kind asserts and yields false.
+bool IsAbsoluteOrCurDirRelative(const Twine &T) {
+  if (llvm::sys::path::is_absolute(T)) {
+    return true;
+  }
+  if (!T.isSingleStringRef()) {
+    DXASSERT(false, "twine kind not supported");
+    return false;
+  }
+
+  const StringRef path = T.getSingleStringRef();
+  if (path.size() < 2) return false;
+
+  const bool startsWithDot = path[0] == '.';
+  const bool separatorFollows = path[1] == '\\' || path[1] == '/';
+  return startsWithDot && separatorFollows;
+}
+
+} // namespace dxcutil

+ 70 - 0
tools/clang/tools/dxrfallbackcompiler/dxcutil.h

@@ -0,0 +1,70 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcutil.h                                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides helper code for dxcompiler.                                      //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "dxc/dxcapi.h"
+#include "dxc/Support/microcom.h"
+#include <memory>
+
+#define DISABLE_GET_CUSTOM_DIAG_ID 1
+
+namespace clang {
+class DiagnosticsEngine;
+}
+
+namespace llvm {
+class LLVMContext;
+class MemoryBuffer;
+class Module;
+class raw_string_ostream;
+class StringRef;
+class Twine;
+} // namespace llvm
+
+namespace hlsl {
+enum class SerializeDxilFlags : uint32_t;
+class AbstractMemoryStream;
+namespace options {
+class MainArgs;
+class DxcOpts;
+} // namespace options
+} // namespace hlsl
+
+// Helper entry points shared by the compiler front end. Only declarations
+// live here; see dxcutil.cpp for the implementations.
+namespace dxcutil {
+// NOTE(review): presumably validates pM and serializes it into
+// pOutputContainerBlob, returning the validation HRESULT - confirm against
+// dxcutil.cpp. The trailing clang::DiagnosticsEngine parameter is part of the
+// signature only when DISABLE_GET_CUSTOM_DIAG_ID is 0 (it is defined to 1
+// above, so the parameter is currently compiled out).
+HRESULT ValidateAndAssembleToContainer(
+    std::unique_ptr<llvm::Module> pM, CComPtr<IDxcBlob> &pOutputContainerBlob,
+    IMalloc *pMalloc, hlsl::SerializeDxilFlags SerializeFlags,
+    CComPtr<hlsl::AbstractMemoryStream> &pModuleBitcode, bool bDebugInfo
+#if  !DISABLE_GET_CUSTOM_DIAG_ID
+  , clang::DiagnosticsEngine &Diag
+#endif
+  );
+// Writes a validator version through pMajor / pMinor.
+void GetValidatorVersion(unsigned *pMajor, unsigned *pMinor);
+// NOTE(review): same parameters as ValidateAndAssembleToContainer minus the
+// debug-info flag and diagnostics; presumably skips validation - confirm.
+void AssembleToContainer(std::unique_ptr<llvm::Module> pM,
+                         CComPtr<IDxcBlob> &pOutputContainerBlob,
+                         IMalloc *pMalloc,
+                         hlsl::SerializeDxilFlags SerializeFlags,
+                         CComPtr<hlsl::AbstractMemoryStream> &pModuleBitcode);
+// Disassembles pProgram, writing text to Stream.
+HRESULT Disassemble(IDxcBlob *pProgram, llvm::raw_string_ostream &Stream);
+// NOTE(review): presumably parses mainArgs into opts and sets `finished` when
+// a result has already been produced in *ppResult - confirm in dxcutil.cpp.
+void ReadOptsAndValidate(hlsl::options::MainArgs &mainArgs,
+                         hlsl::options::DxcOpts &opts,
+                         hlsl::AbstractMemoryStream *pOutputStream,
+                         _COM_Outptr_ IDxcOperationResult **ppResult,
+                         bool &finished);
+// Builds an IDxcOperationResult from a result blob plus diagnostics. The error
+// stream is used when it yields a non-empty blob, otherwise `warnings` is
+// used; the stored status is E_FAIL when hasErrorOccurred is true, else S_OK.
+void CreateOperationResultFromOutputs(
+    IDxcBlob *pResultBlob, CComPtr<IStream> &pErrorStream,
+    const std::string &warnings, bool hasErrorOccurred,
+    _COM_Outptr_ IDxcOperationResult **ppResult);
+
+// True when T is an absolute path or starts with '.' followed by a path
+// separator ('/' or backslash).
+bool IsAbsoluteOrCurDirRelative(const llvm::Twine &T);
+
+} // namespace dxcutil

+ 269 - 0
tools/clang/tools/dxrfallbackcompiler/dxcvalidator.cpp

@@ -0,0 +1,269 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcvalidator.cpp                                                          //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Implements the DirectX Validator object.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/HLSL/DxilContainer.h"
+#include "dxc/HLSL/DxilValidation.h"
+
+#include "dxc/Support/Global.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MSFileSystem.h"
+#include "dxc/Support/microcom.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/Support/dxcapi.impl.h"
+#include "dxc/HLSL/DxilRootSignature.h"
+#include "dxcetw.h"
+
+using namespace llvm;
+using namespace hlsl;
+
+// Scope guard for an LLVMContext's diagnostic hook. Construction remembers
+// the context's current handler/context pair and installs
+// PrintDiagnosticContext::PrintDiagnosticHandler together with the supplied
+// DiagContext, so errors/warnings can be captured; destruction puts the
+// remembered pair back.
+struct DiagRestore {
+  LLVMContext &Ctx;
+  void *OrigDiagContext;
+  LLVMContext::DiagnosticHandlerTy OrigHandler;
+
+  DiagRestore(llvm::LLVMContext &Ctx, void *DiagContext)
+      : Ctx(Ctx), OrigDiagContext(Ctx.getDiagnosticContext()),
+        OrigHandler(Ctx.getDiagnosticHandler()) {
+    Ctx.setDiagnosticHandler(PrintDiagnosticContext::PrintDiagnosticHandler,
+                             DiagContext);
+  }
+
+  ~DiagRestore() { Ctx.setDiagnosticHandler(OrigHandler, OrigDiagContext); }
+};
+
+// COM object implementing IDxcValidator and IDxcVersionInfo. Reference
+// counting, the m_pMalloc allocator field and the allocating constructor come
+// from the DXC_MICROCOM_TM_* macros.
+class DxcValidator : public IDxcValidator, public IDxcVersionInfo {
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+
+  // Validates a shader blob (container or raw bitcode, depending on Flags),
+  // using the pre-parsed modules when they are supplied. Diagnostics are
+  // written to pDiagStream; the returned HRESULT is the validation verdict.
+  HRESULT RunValidation(
+    _In_ IDxcBlob *pShader,                       // Shader to validate.
+    _In_ UINT32 Flags,                            // Validation flags.
+    _In_ llvm::Module *pModule,                   // Module to validate, if available.
+    _In_ llvm::Module *pDebugModule,              // Debug module to validate, if available
+    _In_ AbstractMemoryStream *pDiagStream);
+
+  // Validates only the root signature part of a container against its
+  // pipeline state validation part. Diagnostics go to pDiagStream.
+  HRESULT RunRootSignatureValidation(
+    _In_ IDxcBlob *pShader,                       // Shader to validate.
+    _In_ AbstractMemoryStream *pDiagStream);
+
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+  DXC_MICROCOM_TM_CTOR(DxcValidator)
+
+  // Answers queries for IDxcValidator and IDxcVersionInfo.
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<IDxcValidator, IDxcVersionInfo>(this, iid, ppvObject);
+  }
+
+  // For internal use only.
+  // Same as Validate, but lets the caller pass already-parsed modules; unlike
+  // Validate it does not check its arguments (it writes *ppResult right away).
+  HRESULT ValidateWithOptModules(
+    _In_ IDxcBlob *pShader,                       // Shader to validate.
+    _In_ UINT32 Flags,                            // Validation flags.
+    _In_ llvm::Module *pModule,                   // Module to validate, if available.
+    _In_ llvm::Module *pDebugModule,              // Debug module to validate, if available
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Validation output status, buffer, and errors
+  );
+
+  // IDxcValidator
+  __override HRESULT STDMETHODCALLTYPE Validate(
+    _In_ IDxcBlob *pShader,                       // Shader to validate.
+    _In_ UINT32 Flags,                            // Validation flags.
+    _COM_Outptr_ IDxcOperationResult **ppResult   // Validation output status, buffer, and errors
+    );
+
+  // IDxcVersionInfo
+  __override HRESULT STDMETHODCALLTYPE GetVersion(_Out_ UINT32 *pMajor, _Out_ UINT32 *pMinor);
+  __override HRESULT STDMETHODCALLTYPE GetFlags(_Out_ UINT32 *pFlags);
+};
+
+// IDxcValidator::Validate - checks the arguments and forwards to
+// ValidateWithOptModules with no pre-parsed modules.
+// Returns E_INVALIDARG when pShader or ppResult is null, when Flags carries
+// bits outside DxcValidatorFlags_ValidMask, or when
+// DxcValidatorFlags_ModuleOnly is combined with DxcValidatorFlags_InPlaceEdit
+// or DxcValidatorFlags_RootSignatureOnly.
+HRESULT STDMETHODCALLTYPE DxcValidator::Validate(
+  _In_ IDxcBlob *pShader,                       // Shader to validate.
+  _In_ UINT32 Flags,                            // Validation flags.
+  _COM_Outptr_ IDxcOperationResult **ppResult   // Validation output status, buffer, and errors
+) {
+  DxcThreadMalloc TM(m_pMalloc);
+
+  const bool missingArgument = (pShader == nullptr) || (ppResult == nullptr);
+  if (missingArgument || (Flags & ~DxcValidatorFlags_ValidMask) != 0) {
+    return E_INVALIDARG;
+  }
+
+  // Module-only validation cannot be mixed with these two modes.
+  const UINT32 moduleOnlyConflicts =
+      DxcValidatorFlags_InPlaceEdit | DxcValidatorFlags_RootSignatureOnly;
+  const bool moduleOnly = (Flags & DxcValidatorFlags_ModuleOnly) != 0;
+  if (moduleOnly && (Flags & moduleOnlyConflicts) != 0) {
+    return E_INVALIDARG;
+  }
+
+  return ValidateWithOptModules(pShader, Flags, nullptr, nullptr, ppResult);
+}
+
+// Runs validation and wraps the outcome in an IDxcOperationResult.
+//
+// Two HRESULTs are tracked separately:
+//   validationStatus - the verdict of the validation itself; it is stored
+//                      inside *ppResult together with the diagnostics text.
+//   hr               - whether this call managed to run at all; this is the
+//                      function's return value.
+// ppResult is dereferenced without a null check, so callers must pass a valid
+// pointer (Validate checks this before forwarding here).
+HRESULT DxcValidator::ValidateWithOptModules(
+  _In_ IDxcBlob *pShader,                       // Shader to validate.
+  _In_ UINT32 Flags,                            // Validation flags.
+  _In_ llvm::Module *pModule,                   // Module to validate, if available.
+  _In_ llvm::Module *pDebugModule,              // Debug module to validate, if available
+  _COM_Outptr_ IDxcOperationResult **ppResult   // Validation output status, buffer, and errors
+) {
+  *ppResult = nullptr;
+  HRESULT hr = S_OK;
+  HRESULT validationStatus = S_OK;
+  DxcEtw_DxcValidation_Start();
+  DxcThreadMalloc TM(m_pMalloc);
+  try {
+    // In-memory stream that collects all diagnostic text.
+    CComPtr<AbstractMemoryStream> pDiagStream;
+    IFT(CreateMemoryStream(m_pMalloc, &pDiagStream));
+
+    // Run validation may throw, but that indicates an inability to validate,
+    // not that the validation failed (eg out of memory).
+    if (Flags & DxcValidatorFlags_RootSignatureOnly) {
+      validationStatus = RunRootSignatureValidation(pShader, pDiagStream);
+    } else {
+      validationStatus = RunValidation(pShader, Flags, pModule, pDebugModule, pDiagStream);
+    }
+    if (FAILED(validationStatus)) {
+      // Append a trailer to the diagnostics; the Write result is not checked.
+      std::string msg("Validation failed.\n");
+      ULONG cbWritten;
+      pDiagStream->Write(msg.c_str(), msg.size(), &cbWritten);
+    }
+    // Assemble the result object.
+    CComPtr<IDxcBlob> pDiagBlob;
+    CComPtr<IDxcBlobEncoding> pDiagBlobEnconding;
+    hr = pDiagStream.QueryInterface(&pDiagBlob);
+    DXASSERT_NOMSG(SUCCEEDED(hr));
+    IFT(DxcCreateBlobWithEncodingSet(pDiagBlob, CP_UTF8, &pDiagBlobEnconding));
+    // No result blob is attached: only the diagnostics and the verdict.
+    IFT(DxcOperationResult::CreateFromResultErrorStatus(nullptr, pDiagBlobEnconding, validationStatus, ppResult));
+  }
+  // Exceptions thrown above are translated into `hr` by this macro.
+  CATCH_CPP_ASSIGN_HRESULT();
+
+  // Report the verdict to ETW when the call itself succeeded, else the call's
+  // own failure code.
+  DxcEtw_DxcValidation_Stop(SUCCEEDED(hr) ? validationStatus : hr);
+  return hr;
+}
+
+// IDxcVersionInfo::GetVersion - fills *pMajor / *pMinor via
+// GetValidationVersion. Returns E_INVALIDARG if either pointer is null,
+// S_OK otherwise.
+HRESULT STDMETHODCALLTYPE DxcValidator::GetVersion(_Out_ UINT32 *pMajor, _Out_ UINT32 *pMinor) {
+  if (!pMajor || !pMinor) {
+    return E_INVALIDARG;
+  }
+
+  GetValidationVersion(pMajor, pMinor);
+  return S_OK;
+}
+
+// IDxcVersionInfo::GetFlags - reports the build flavour of this validator.
+// DxcVersionInfoFlags_Internal is always set; DxcVersionInfoFlags_Debug is
+// added in _DEBUG builds. Returns E_INVALIDARG when pFlags is null.
+HRESULT STDMETHODCALLTYPE DxcValidator::GetFlags(_Out_ UINT32 *pFlags) {
+  if (!pFlags) {
+    return E_INVALIDARG;
+  }
+
+  UINT32 flags = DxcVersionInfoFlags_None;
+#ifdef _DEBUG
+  flags |= DxcVersionInfoFlags_Debug;
+#endif
+  flags |= DxcVersionInfoFlags_Internal;
+
+  *pFlags = flags;
+  return S_OK;
+}
+
+// Validates pShader and writes diagnostics to pDiagStream.
+//
+// With DxcValidatorFlags_ModuleOnly the blob must NOT look like a DXIL
+// container (it is treated as raw bitcode); without it the blob must look
+// like one. When no pre-parsed module is supplied, validation runs straight
+// from the blob's bytes. Otherwise the supplied modules are validated and,
+// for containers, the container parts are checked against them.
+// NOTE(review): IFR / IFRBOOL appear to return early from this function with
+// the failing HRESULT - confirm against their definitions.
+HRESULT DxcValidator::RunValidation(
+  _In_ IDxcBlob *pShader,
+  _In_ UINT32 Flags,                            // Validation flags.
+  _In_ llvm::Module *pModule,                   // Module to validate, if available.
+  _In_ llvm::Module *pDebugModule,              // Debug module to validate, if available
+  _In_ AbstractMemoryStream *pDiagStream) {
+
+  // Run validation may throw, but that indicates an inability to validate,
+  // not that the validation failed (eg out of memory). That is indicated
+  // by a failing HRESULT, and possibly error messages in the diagnostics stream.
+
+  raw_stream_ostream DiagStream(pDiagStream);
+
+  // Check that the blob's shape matches the requested validation mode.
+  if (Flags & DxcValidatorFlags_ModuleOnly) {
+    IFRBOOL(!IsDxilContainerLike(pShader->GetBufferPointer(), pShader->GetBufferSize()), E_INVALIDARG);
+  } else {
+    IFRBOOL(IsDxilContainerLike(pShader->GetBufferPointer(), pShader->GetBufferSize()), DXC_E_CONTAINER_INVALID);
+  }
+
+  // No pre-parsed module: validate directly from the blob contents.
+  if (!pModule) {
+    DXASSERT_NOMSG(pDebugModule == nullptr);
+    if (Flags & DxcValidatorFlags_ModuleOnly) {
+      return ValidateDxilBitcode((const char*)pShader->GetBufferPointer(), (uint32_t)pShader->GetBufferSize(), DiagStream);
+    } else {
+      return ValidateDxilContainer(pShader->GetBufferPointer(), pShader->GetBufferSize(), DiagStream);
+    }
+  }
+
+  // Route the module context's diagnostics into DiagStream for the rest of
+  // this function; DiagRestore reinstates the previous handler on exit.
+  llvm::DiagnosticPrinterRawOStream DiagPrinter(DiagStream);
+  PrintDiagnosticContext DiagContext(DiagPrinter);
+  DiagRestore DR(pModule->getContext(), &DiagContext);
+
+  IFR(hlsl::ValidateDxilModule(pModule, pDebugModule));
+  if (!(Flags & DxcValidatorFlags_ModuleOnly)) {
+    IFR(ValidateDxilContainerParts(pModule, pDebugModule,
+                      IsDxilContainerLike(pShader->GetBufferPointer(), pShader->GetBufferSize()),
+                      (uint32_t)pShader->GetBufferSize()));
+  }
+
+  // Warnings are treated like errors here: either one fails verification.
+  if (DiagContext.HasErrors() || DiagContext.HasWarnings()) {
+    return DXC_E_IR_VERIFICATION_FAILED;
+  }
+
+  return S_OK;
+}
+
+// Validates the root signature part of a DXIL container against the
+// container's pipeline state validation (PSV) part. Diagnostics produced by
+// the verifier are written to pDiagStream.
+//
+// Returns:
+//   S_OK                           - the root signature matches the PSV data.
+//   DXC_E_IR_VERIFICATION_FAILED   - pShader is not container-like, or an
+//                                    exception escaped while loading or
+//                                    verifying the root signature.
+//   DXC_E_MISSING_PART             - the DXIL program, PSV or root signature
+//                                    part is absent from the container.
+//   DXC_E_INCORRECT_ROOT_SIGNATURE - VerifyRootSignatureWithShaderPSV
+//                                    rejected the root signature.
+HRESULT DxcValidator::RunRootSignatureValidation(
+  _In_ IDxcBlob *pShader,
+  _In_ AbstractMemoryStream *pDiagStream) {
+
+  const DxilContainerHeader *pDxilContainer = IsDxilContainerLike(
+    pShader->GetBufferPointer(), pShader->GetBufferSize());
+  if (!pDxilContainer) {
+    return DXC_E_IR_VERIFICATION_FAILED;
+  }
+
+  const DxilProgramHeader *pProgramHeader = GetDxilProgramHeader(pDxilContainer, DFCC_DXIL);
+  const DxilPartHeader *pPSVPart = GetDxilPartByType(pDxilContainer, DFCC_PipelineStateValidation);
+  const DxilPartHeader *pRSPart = GetDxilPartByType(pDxilContainer, DFCC_RootSignature);
+  // All three parts are dereferenced below. The program header must be
+  // checked too: a container without a DXIL part would otherwise cause a
+  // null dereference, which the catch(...) below is not guaranteed to see.
+  IFRBOOL(pProgramHeader && pPSVPart && pRSPart, DXC_E_MISSING_PART);
+  try {
+    RootSignatureHandle RSH;
+    RSH.LoadSerialized((const uint8_t*)GetDxilPartData(pRSPart), pRSPart->PartSize);
+    RSH.Deserialize();
+    raw_stream_ostream DiagStream(pDiagStream);
+    IFRBOOL(VerifyRootSignatureWithShaderPSV(RSH.GetDesc(),
+                                             GetVersionShaderType(pProgramHeader->ProgramVersion),
+                                             GetDxilPartData(pPSVPart),
+                                             pPSVPart->PartSize,
+                                             DiagStream),
+      DXC_E_INCORRECT_ROOT_SIGNATURE);
+  } catch(...) {
+    // Any failure to load or deserialize counts as a verification failure.
+    return DXC_E_IR_VERIFICATION_FAILED;
+  }
+
+  return S_OK;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Runs the in-process validator on modules the caller has already parsed.
+// pValidator is downcast to DxcValidator without any runtime check, so it
+// must be an instance created by CreateDxcValidator. pValidator, pModule,
+// pShader and ppResult are asserted non-null; pDebugModule is not asserted.
+HRESULT RunInternalValidator(_In_ IDxcValidator *pValidator,
+                             _In_ llvm::Module *pModule,
+                             _In_ llvm::Module *pDebugModule,
+                             _In_ IDxcBlob *pShader, UINT32 Flags,
+                             _COM_Outptr_ IDxcOperationResult **ppResult) {
+  DXASSERT_NOMSG(pValidator != nullptr);
+  DXASSERT_NOMSG(pModule != nullptr);
+  DXASSERT_NOMSG(pShader != nullptr);
+  DXASSERT_NOMSG(ppResult != nullptr);
+
+  DxcValidator *const pImpl = static_cast<DxcValidator *>(pValidator);
+  return pImpl->ValidateWithOptModules(pShader, Flags, pModule, pDebugModule,
+                                       ppResult);
+}
+
+// Allocates a DxcValidator with the current thread's allocator and returns
+// the interface identified by riid through ppv. Allocation failure is
+// reported via IFROOM; C++ exceptions are converted to an HRESULT by
+// CATCH_CPP_RETURN_HRESULT.
+HRESULT CreateDxcValidator(_In_ REFIID riid, _Out_ LPVOID* ppv) {
+  try {
+    CComPtr<DxcValidator> pValidator(
+        DxcValidator::Alloc(DxcGetThreadMallocNoRef()));
+    IFROOM(pValidator.p);
+    return pValidator.p->QueryInterface(riid, ppv);
+  }
+  CATCH_CPP_RETURN_HRESULT();
+}

+ 68 - 0
tools/clang/tools/dxrfallbackcompiler/dxillib.cpp

@@ -0,0 +1,68 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxillib.cpp                                                               //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides access to dxil.dll                                               //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxillib.h"
+#include "dxc/Support/Global.h" // For DXASSERT
+#include "dxc/Support/dxcapi.use.h"
+
+using namespace dxc;
+
+static DxcDllSupport g_DllSupport;
+static HRESULT g_DllLibResult = S_OK;
+static CRITICAL_SECTION cs;
+
+// Initializes the critical section `cs` that guards the dxil.dll loading
+// state in this file. Always returns S_OK. DxilLibCleanup deletes the
+// critical section again.
+HRESULT DxilLibInitialize() {
+  InitializeCriticalSection(&cs);
+  return S_OK;
+}
+
+// Tears down the dxil.dll support state.
+//   ProcessTermination - the DLL handle is detached.
+//   UnloadLibrary      - full Cleanup() of the DLL support object.
+//   anything else      - E_INVALIDARG.
+// The critical section is deleted in every case, including the error case.
+HRESULT DxilLibCleanup(DxilLibCleanUpType type) {
+  HRESULT hr = S_OK;
+  switch (type) {
+  case DxilLibCleanUpType::ProcessTermination:
+    g_DllSupport.Detach();
+    break;
+  case DxilLibCleanUpType::UnloadLibrary:
+    g_DllSupport.Cleanup();
+    break;
+  default:
+    hr = E_INVALIDARG;
+    break;
+  }
+  DeleteCriticalSection(&cs);
+  return hr;
+}
+
+// Reports whether dxil.dll can be used, loading it on first use.
+// g_DllLibResult starts out as S_OK. If InitializeForDll fails, its failing
+// HRESULT is stored there, and later calls skip the load attempt so dxil.dll
+// is not probed repeatedly.
+bool DxilLibIsEnabled() {
+  EnterCriticalSection(&cs);
+  const bool shouldAttemptLoad =
+      SUCCEEDED(g_DllLibResult) && !g_DllSupport.IsEnabled();
+  if (shouldAttemptLoad) {
+    g_DllLibResult = g_DllSupport.InitializeForDll(L"dxil.dll", "DxcCreateInstance");
+  }
+  LeaveCriticalSection(&cs);
+  return SUCCEEDED(g_DllLibResult);
+}
+
+
+// Creates an object from dxil.dll. Returns E_FAIL when dxil.dll is not
+// available; otherwise returns the HRESULT of the DLL's CreateInstance call,
+// which is made while holding the critical section.
+HRESULT DxilLibCreateInstance(_In_ REFCLSID rclsid, _In_ REFIID riid, _In_ IUnknown **ppInterface) {
+  DXASSERT_NOMSG(ppInterface != nullptr);
+  if (!DxilLibIsEnabled()) {
+    return E_FAIL;
+  }
+
+  EnterCriticalSection(&cs);
+  const HRESULT hr = g_DllSupport.CreateInstance(rclsid, riid, ppInterface);
+  LeaveCriticalSection(&cs);
+  return hr;
+}

+ 42 - 0
tools/clang/tools/dxrfallbackcompiler/dxillib.h

@@ -0,0 +1,42 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxillib.h                                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides wrappers to handle calls to dxil.dll                             //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+#ifndef __DXC_DXILLIB__
+#define __DXC_DXILLIB__
+
+#include "dxc/Support/WinIncludes.h"
+
+
+// Initialize Dxil library. 
+HRESULT DxilLibInitialize();
+
+// When dxcompiler is detached from process, 
+// we should not call FreeLibrary on process termination. 
+// So the caller has to specify if cleaning is from FreeLibrary or process termination
+enum class DxilLibCleanUpType {
+  UnloadLibrary,
+  ProcessTermination
+};
+
+HRESULT DxilLibCleanup(DxilLibCleanUpType type);
+
+// Check if can access dxil.dll
+bool DxilLibIsEnabled();
+
+HRESULT DxilLibCreateInstance(_In_ REFCLSID rclsid, _In_ REFIID riid, _In_ IUnknown **ppInterface);
+
+// Typed convenience wrapper: derives the IID from TInterface via __uuidof and
+// forwards to the untyped DxilLibCreateInstance overload.
+template <class TInterface>
+HRESULT DxilLibCreateInstance(_In_ REFCLSID rclsid, _In_ TInterface **ppInterface) {
+  IUnknown **ppUnknown = reinterpret_cast<IUnknown **>(ppInterface);
+  return DxilLibCreateInstance(rclsid, __uuidof(TInterface), ppUnknown);
+}
+
+#endif // __DXC_DXILLIB__

+ 1 - 0
tools/clang/unittests/CMakeLists.txt

@@ -41,6 +41,7 @@ if (HLSL_INCLUDE_TESTS)
     add_subdirectory(HLSLHost)
   endif (WIN32)
   add_subdirectory(dxc_batch)
+  add_subdirectory(DxrFallback)
 endif (HLSL_INCLUDE_TESTS)
 
 # HLSL Change Ends

+ 73 - 0
tools/clang/unittests/DxrFallback/CMakeLists.txt

@@ -0,0 +1,73 @@
+set(LLVM_LINK_COMPONENTS
+  analysis
+  asmparser
+  bitreader
+  bitwriter
+  core
+  dxcsupport
+  dxrfallback
+  hlsl
+  instcombine
+  ipa
+  ipo
+  irreader
+  linker
+  lto
+  mssupport
+  option
+  profiledata
+  scalaropts
+  support
+  target
+  transformutils
+  vectorize
+  )
+
+set(TEST_FILES 
+  testFiles/testShader1.hlsl
+  testFiles/testShader2.hlsl
+  testFiles/testShader3.hlsl
+  testFiles/testShader4.hlsl
+  testFiles/testTraversal.h
+  testFiles/testTraversal.hlsl
+  testFiles/testTraversal2.hlsl
+  testFiles/testLib.h
+  testFiles/testLib.hlsl
+  testFiles/HLSLRayTracingInternalPrototypes.h
+  )
+
+# put test files in a folder in Visual Studio
+source_group( "Test Files" FILES ${TEST_FILES} )
+
+# don't compile test files
+set_source_files_properties( ${TEST_FILES} PROPERTIES HEADER_FILE_ONLY ON)
+
+# create a file to include to provide a default path for test files
+set(DEFAULT_TEST_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/testFiles/")
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/defaultTestFilePath.h.in ${CMAKE_CURRENT_BINARY_DIR}/defaultTestFilePath.h)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+# Stand-alone test executable. The test shader files are listed as sources so
+# they show up in the IDE; they are marked HEADER_FILE_ONLY above and are not
+# compiled.
+add_clang_executable(test_DxrFallback
+  test_DxrFallback.cpp
+
+  d3dx12.h
+  DXSampleHelper.h
+  ShaderTester.h
+  ShaderTesterImpl.cpp
+  ShaderTesterImpl.h
+  
+  ${TEST_FILES}
+  )
+
+# Link against dxcompiler plus the D3D12 and DXGI libraries.
+target_link_libraries(test_DxrFallback
+  dxcompiler
+  d3d12
+  dxgi
+  )
+
+# Make sure dxcompiler is built before the test executable.
+add_dependencies(test_DxrFallback dxcompiler)
+
+# Install the test binary into bin.
+install(TARGETS test_DxrFallback
+  RUNTIME DESTINATION bin)
+
+

+ 110 - 0
tools/clang/unittests/DxrFallback/DXSampleHelper.h

@@ -0,0 +1,110 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#pragma once
+
+// Converts a failing HRESULT into a C++ exception (a plain std::exception
+// that carries no error code). Successful HRESULTs are a no-op.
+inline void ThrowIfFailed(HRESULT hr)
+{
+	if (SUCCEEDED(hr))
+	{
+		return;
+	}
+	throw std::exception();
+}
+
+// Writes the directory of the running executable, including the trailing
+// backslash, into `path` (capacity `pathSize` characters).
+// Throws std::exception when `path` is null, when GetModuleFileName fails, or
+// when the module path was truncated. If the module path contains no
+// backslash it is left unchanged.
+inline void GetAssetsPath(_Out_writes_(pathSize) WCHAR* path, UINT pathSize)
+{
+	if (!path)
+	{
+		throw std::exception();
+	}
+
+	const DWORD written = GetModuleFileName(nullptr, path, pathSize);
+	const bool callFailed = (written == 0);
+	const bool truncated = (written == pathSize);
+	if (callFailed || truncated)
+	{
+		// Method failed or path was truncated.
+		throw std::exception();
+	}
+
+	// Cut the string right after the last separator, dropping the file name.
+	if (WCHAR* lastSlash = wcsrchr(path, L'\\'))
+	{
+		lastSlash[1] = L'\0';
+	}
+}
+
+// Reads an entire file into a malloc()-allocated buffer.
+//
+//   filename - path of the file to open for sequential reading.
+//   data     - on success receives the buffer; the caller releases it with
+//              free().
+//   size     - on success receives the number of bytes in the buffer.
+//
+// Returns S_OK on success. Every failure (open, size query, file of 4 GiB or
+// more, allocation, read error, short read) is reported by throwing
+// std::exception; in that case *data and *size are left untouched and nothing
+// is leaked.
+inline HRESULT ReadDataFromFile(LPCWSTR filename, byte** data, UINT* size)
+{
+	using namespace Microsoft::WRL;
+
+	CREATEFILE2_EXTENDED_PARAMETERS extendedParams = {};
+	extendedParams.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS);
+	extendedParams.dwFileAttributes = FILE_ATTRIBUTE_NORMAL;
+	extendedParams.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN;
+	extendedParams.dwSecurityQosFlags = SECURITY_ANONYMOUS;
+	extendedParams.lpSecurityAttributes = nullptr;
+	extendedParams.hTemplateFile = nullptr;
+
+	Wrappers::FileHandle file(CreateFile2(filename, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, &extendedParams));
+	if (file.Get() == INVALID_HANDLE_VALUE)
+	{
+		throw std::exception();
+	}
+
+	FILE_STANDARD_INFO fileInfo = {};
+	if (!GetFileInformationByHandleEx(file.Get(), FileStandardInfo, &fileInfo, sizeof(fileInfo)))
+	{
+		throw std::exception();
+	}
+
+	// Only files whose size fits in 32 bits are supported.
+	if (fileInfo.EndOfFile.HighPart != 0)
+	{
+		throw std::exception();
+	}
+
+	const DWORD fileSize = fileInfo.EndOfFile.LowPart;
+	byte* buffer = reinterpret_cast<byte*>(malloc(fileSize));
+	// malloc(0) may legitimately return null, so only a non-empty request
+	// counts as an allocation failure.
+	if (buffer == nullptr && fileSize != 0)
+	{
+		throw std::exception();
+	}
+
+	// ReadFile requires a valid lpNumberOfBytesRead when lpOverlapped is null.
+	DWORD bytesRead = 0;
+	if (!ReadFile(file.Get(), buffer, fileSize, &bytesRead, nullptr) || bytesRead != fileSize)
+	{
+		free(buffer);
+		throw std::exception();
+	}
+
+	// Publish the outputs only once the whole file has been read.
+	*data = buffer;
+	*size = fileSize;
+
+	return S_OK;
+}
+
+// Debug-name helpers for D3D12 objects. In _DEBUG builds they forward to
+// ID3D12Object::SetName; in other builds they compile to empty functions.
+#if defined(_DEBUG)
+// Gives pObject the debug name `name`.
+inline void SetName(ID3D12Object* pObject, LPCWSTR name)
+{
+	pObject->SetName(name);
+}
+// Gives pObject the debug name "name[index]". The formatted name has to fit
+// in a 50-character buffer; if formatting produces nothing, no name is set.
+inline void SetNameIndexed(ID3D12Object* pObject, LPCWSTR name, UINT index)
+{
+	WCHAR fullName[50];
+	const int charsWritten = swprintf_s(fullName, L"%s[%u]", name, index);
+	if (charsWritten > 0)
+	{
+		pObject->SetName(fullName);
+	}
+}
+#else
+// No-op outside _DEBUG builds.
+inline void SetName(ID3D12Object*, LPCWSTR)
+{
+}
+// No-op outside _DEBUG builds.
+inline void SetNameIndexed(ID3D12Object*, LPCWSTR, UINT)
+{
+}
+#endif
+
+// Naming helper for ComPtr<T>.
+// Assigns the name of the variable as the name of the object.
+// The indexed variant will include the index in the name of the object.
+#define NAME_D3D12_OBJECT(x) SetName(x.Get(), L#x)
+#define NAME_D3D12_OBJECT_INDEXED(x, n) SetNameIndexed(x[n].Get(), L#x, n)

+ 15 - 0
tools/clang/unittests/DxrFallback/ShaderTester.h

@@ -0,0 +1,15 @@
+#pragma once
+#include <string>
+#include <vector>
+
+// Abstract interface for running a shader on a device and reading back
+// integer results. Concrete instances come from the New() factories, which
+// return a raw pointer to a heap-allocated object.
+class ShaderTester
+{
+public:
+  virtual ~ShaderTester() {}
+
+  // Creates a tester for the shader stored in `file`.
+  static ShaderTester* New(const std::wstring& file);
+  // Creates a tester from an in-memory shader blob; the implementation
+  // interprets `blob` as an ID3DBlob*.
+  static ShaderTester* New(void* blob);
+
+  // NOTE(review): presumably selects the adapter whose description contains
+  // namePrefix (with "WARP" choosing the WARP adapter) - confirm in
+  // ShaderTesterImpl.
+  virtual void setDevice(const std::wstring& namePrefix) = 0;
+  // Runs the shader starting at initialShaderId with `input`, filling
+  // `output` with the results.
+  virtual void runShader(int initialShaderId, const std::vector<int>& input, std::vector<int>& output) = 0;
+};

+ 542 - 0
tools/clang/unittests/DxrFallback/ShaderTesterImpl.cpp

@@ -0,0 +1,542 @@
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN             // Exclude rarely-used stuff from Windows headers.
+#endif
+
+#define UNICODE
+
+#include <windows.h>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <D3Dcompiler.h>
+#include <DirectXMath.h>
+#include "d3dx12.h"
+
+#include <string>
+#include <wrl.h>
+#include <shellapi.h>
+
+#include "DXSampleHelper.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <map>
+
+using namespace DirectX;
+using Microsoft::WRL::ComPtr;
+
+#include "ShaderTesterImpl.h"
+
+#include <dxc/dxcapi.h>
+#include <dxc/Support/dxcapi.use.h>
+#include <atlcomcli.h>
+
+
+
+static dxc::DxcDllSupport g_DxcDllHelper;
+
+#define VERIFY_SUCCEEDED(expr) { HRESULT Result = expr; if (FAILED(Result)) { assert(0 && #expr " failed: Result=%08x"); } }
+
+#ifndef DXIL_FOURCC
+#define DXIL_FOURCC(ch0, ch1, ch2, ch3) (                            \
+  (uint32_t)(uint8_t)(ch0)        | (uint32_t)(uint8_t)(ch1) << 8  | \
+  (uint32_t)(uint8_t)(ch2) << 16  | (uint32_t)(uint8_t)(ch3) << 24   \
+  )
+#endif
+
+// Compiles the HLSL file at pShaderTextFilePath with the DXC compiler.
+//
+//   pEntryPoint / pTargetProfile - entry point name and target profile.
+//   pDefines / defineCount       - preprocessor defines passed to Compile.
+//   ppBlob      - receives the compiled output, only when compilation
+//                 succeeded.
+//   ppErrorBlob - receives the compiler's error buffer in every case.
+//
+// Returns the compilation status reported by the operation result. Failures
+// of the set-up calls are only caught by VERIFY_SUCCEEDED, which is
+// assert-based: in builds where assert is disabled they go unnoticed.
+HRESULT D3DCompileToDxilFromFile(LPCWSTR pShaderTextFilePath, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, const DxcDefine *pDefines, UINT32 defineCount, ID3DBlob **ppBlob, IDxcBlobEncoding** ppErrorBlob)
+{
+  VERIFY_SUCCEEDED(g_DxcDllHelper.Initialize());
+  CComPtr<IDxcCompiler> pCompiler;
+  CComPtr<IDxcLibrary> pLibrary;
+  CComPtr<IDxcBlobEncoding> pTextBlob(nullptr);
+  CComPtr<IDxcOperationResult> pResult;
+  CComPtr<IDxcIncludeHandler> dxcIncludeHandler;
+  VERIFY_SUCCEEDED(g_DxcDllHelper.CreateInstance(CLSID_DxcCompiler, &pCompiler));
+  VERIFY_SUCCEEDED(g_DxcDllHelper.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+  VERIFY_SUCCEEDED(pLibrary->CreateIncludeHandler(&dxcIncludeHandler));
+  UINT32 codePage(0);
+  VERIFY_SUCCEEDED(pLibrary->CreateBlobFromFile(pShaderTextFilePath, &codePage, &pTextBlob));
+  // No extra command-line arguments are passed (nullptr, 0).
+  VERIFY_SUCCEEDED(pCompiler->Compile(pTextBlob, pShaderTextFilePath, pEntryPoint, pTargetProfile, nullptr, 0, pDefines, defineCount, dxcIncludeHandler, &pResult));
+  HRESULT resultCode;
+  VERIFY_SUCCEEDED(pResult->GetStatus(&resultCode));
+  VERIFY_SUCCEEDED(pResult->GetErrorBuffer(ppErrorBlob));
+  //VERIFY_SUCCEEDED(resultCode);
+  if (SUCCEEDED(resultCode))
+  {
+    // The ID3DBlob** is reinterpreted as IDxcBlob** by this cast.
+    VERIFY_SUCCEEDED(pResult->GetResult((IDxcBlob **)ppBlob));
+  }
+
+  return resultCode;
+}
+
+// A more recent Windows SDK than currently required is needed for these.
+typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
+  UINT                                    NumFeatures,
+  __in_ecount(NumFeatures) const IID*     pIIDs,
+  __in_ecount_opt(NumFeatures) void*      pConfigurationStructs,
+  __in_ecount_opt(NumFeatures) UINT*      pConfigurationStructSizes);
+
+static const GUID D3D12ExperimentalShaderModelsID = { /* 76f5573e-f13a-40f5-b297-81ce9e18933f */
+  0x76f5573e,
+  0xf13a,
+  0x40f5,
+  { 0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f }
+};
+
+// Turns on D3D12's experimental shader model feature by looking up
+// D3D12EnableExperimentalFeatures in d3d12.dll at run time, since the entry
+// point needs a newer Windows SDK than the build otherwise requires.
+//
+// Returns the HRESULT of D3D12EnableExperimentalFeatures, or a failure
+// HRESULT when d3d12.dll cannot be loaded or lacks the entry point. It never
+// reports success unless the feature call itself succeeded.
+static HRESULT EnableExperimentalShaderModels()
+{
+#if 1
+  HMODULE hRuntime = LoadLibraryW(L"d3d12.dll");
+  if (hRuntime == NULL)
+  {
+    return HRESULT_FROM_WIN32(GetLastError());
+  }
+
+  D3D12EnableExperimentalFeaturesFn pD3D12EnableExperimentalFeatures =
+    (D3D12EnableExperimentalFeaturesFn)GetProcAddress(hRuntime, "D3D12EnableExperimentalFeatures");
+  if (pD3D12EnableExperimentalFeatures == nullptr)
+  {
+    // Read the error code first: the stream write and FreeLibrary below may
+    // overwrite the thread's last-error value.
+    const DWORD lookupError = GetLastError();
+    std::cerr << "Unable to enable experimental shader models\n";
+    FreeLibrary(hRuntime);
+    const HRESULT hr = HRESULT_FROM_WIN32(lookupError);
+    // Never turn a missing entry point into a success code.
+    return FAILED(hr) ? hr : E_FAIL;
+  }
+
+  HRESULT hr = pD3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModelsID, nullptr, nullptr);
+  // The library is left loaded here (the original FreeLibrary call is disabled).
+  //FreeLibrary(hRuntime);
+  return hr;
+#else
+  return D3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModelsID, nullptr, nullptr);
+#endif
+}
+
+// Picks the first non-software adapter that supports feature level 12_0 and
+// whose description contains namePrefix, returning it through *ppAdapter
+// (ownership is transferred to the caller via Detach).
+// NOTE(review): when no adapter matches, *ppAdapter receives whatever the
+// final failed EnumAdapters1 call left in `adapter` - presumably null;
+// confirm.
+void GetHardwareAdapter(IDXGIFactory2* pFactory, IDXGIAdapter1** ppAdapter, const std::wstring& namePrefix)
+{
+  ComPtr<IDXGIAdapter1> adapter;
+  *ppAdapter = nullptr;
+
+  // Walk the adapters in enumeration order until DXGI reports no more.
+  for (UINT adapterIndex = 0; DXGI_ERROR_NOT_FOUND != pFactory->EnumAdapters1(adapterIndex, &adapter); ++adapterIndex)
+  {
+    DXGI_ADAPTER_DESC1 desc;
+    adapter->GetDesc1(&desc);
+
+    if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE)
+    {
+      // Don't select the Basic Render Driver adapter.
+      // The WARP software adapter is selected separately in initDevice when
+      // the name prefix is "WARP".
+      continue;
+    }
+
+    // Check to see if the adapter supports Direct3D 12, but don't create the
+    // actual device yet.
+    if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_12_0, _uuidof(ID3D12Device), nullptr)) &&
+      std::wstring(desc.Description).find(namePrefix) != std::wstring::npos)
+    {
+      break;
+    }
+  }
+
+  *ppAdapter = adapter.Detach();
+}
+
+// Loads the whole file at `path` into `buffer` as raw bytes; the buffer ends
+// up exactly as large as the file.
+// Any failure (the file cannot be opened, its size cannot be determined, or
+// fewer bytes than expected are read) prints a message to std::wcerr and
+// terminates the process with exit code 1.
+void ReadFileToBuffer(const std::wstring& path, std::vector<char>& buffer)
+{
+  std::fstream fs(path, std::ios::binary | std::ios::in);
+  if (fs.fail())
+  {
+    std::wcerr << L"Could not open file " << path << L"\n";
+    exit(1);
+  }
+
+  // Measure the file by seeking to its end, then rewind.
+  fs.seekg(0, std::ios::end);
+  const std::streamoff size = fs.tellg();
+  fs.seekg(0, std::ios::beg);
+  // tellg() reports -1 on failure; that must not reach resize().
+  if (size < 0 || fs.fail())
+  {
+    std::wcerr << L"Could not determine size of file " << path << L"\n";
+    exit(1);
+  }
+
+  buffer.resize(static_cast<size_t>(size), 0);
+  if (size > 0)
+  {
+    fs.read(buffer.data(), static_cast<std::streamsize>(size));
+    // A short read would leave zero padding that looks like file content.
+    if (fs.gcount() != static_cast<std::streamsize>(size))
+    {
+      std::wcerr << L"Could not read file " << path << L"\n";
+      exit(1);
+    }
+  }
+}
+
+
+// Factory: creates a tester for the shader file `filename`. The returned
+// object is allocated with new.
+ShaderTester* ShaderTester::New(const std::wstring& filename)
+{
+  ShaderTester* tester = new ShaderTesterImpl(filename);
+  return tester;
+}
+
+// Factory: creates a tester from an in-memory shader blob; `blob` is treated
+// as an ID3DBlob*. The returned object is allocated with new.
+ShaderTester* ShaderTester::New(void* blob)
+{
+  ID3DBlob* shaderBlob = static_cast<ID3DBlob*>(blob);
+  return new ShaderTesterImpl(shaderBlob);
+}
+
+// Remembers the shader file name; nothing else happens at construction.
+ShaderTesterImpl::ShaderTesterImpl(const std::wstring& filename)
+  : m_filename(filename)
+{
+}
+
+// Remembers the shader blob; nothing else happens at construction.
+ShaderTesterImpl::ShaderTesterImpl(ID3DBlob* blob)
+  : m_blob(blob)
+{
+}
+
+// Closes the fence event handle.
+// NOTE(review): assumes m_fenceEvent holds a valid handle (or a safe default)
+// even when init() never ran - confirm against ShaderTesterImpl.h.
+ShaderTesterImpl::~ShaderTesterImpl()
+{
+  CloseHandle(m_fenceEvent);
+}
+
+// Runs the four set-up stages in order: device, resources, pipeline,
+// execution.
+void ShaderTesterImpl::init()
+{
+  initDevice();     // D3D12 device
+  initResources();  // buffers and descriptor heap
+  initPipeline();
+  initExecution();
+}
+
+// Creates the D3D12 device (m_device) at feature level 12_0.
+// Steps: enable experimental shader models, enable the debug layer in _DEBUG
+// builds, then create the device on either the WARP adapter (when
+// m_namePrefix is exactly "WARP") or the hardware adapter selected by
+// GetHardwareAdapter. Failures are raised as exceptions by ThrowIfFailed.
+void ShaderTesterImpl::initDevice()
+{
+  ThrowIfFailed(EnableExperimentalShaderModels());
+
+#if defined(_DEBUG)
+  // Enable the D3D12 debug layer.
+  {
+    ComPtr<ID3D12Debug> debugController;
+    if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController))))
+    {
+      debugController->EnableDebugLayer();
+    }
+  }
+#endif
+
+  ComPtr<IDXGIFactory4> factory;
+  ThrowIfFailed(CreateDXGIFactory1(IID_PPV_ARGS(&factory)));
+
+  if (m_namePrefix == L"WARP")
+  {
+    // Software rasterizer path.
+    ComPtr<IDXGIAdapter> warpAdapter;
+    ThrowIfFailed(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter)));
+    ThrowIfFailed(D3D12CreateDevice(
+      warpAdapter.Get(),
+      D3D_FEATURE_LEVEL_12_0,
+      IID_PPV_ARGS(&m_device)
+    ));
+  }
+  else
+  {
+    // Hardware path: adapter chosen by matching m_namePrefix.
+    ComPtr<IDXGIAdapter1> hardwareAdapter;
+    GetHardwareAdapter(factory.Get(), &hardwareAdapter, m_namePrefix);
+    ThrowIfFailed(D3D12CreateDevice(
+      hardwareAdapter.Get(),
+      D3D_FEATURE_LEVEL_12_0,
+      IID_PPV_ARGS(&m_device)
+    ));
+  }
+
+  {
+    // Missing shader model 6.0 support is only reported on stderr;
+    // initialization continues regardless.
+    D3D12_FEATURE_DATA_SHADER_MODEL shaderModel = { D3D_SHADER_MODEL_6_0 };
+    if (FAILED(m_device->CheckFeatureSupport(D3D12_FEATURE_SHADER_MODEL, &shaderModel, sizeof(shaderModel))) || shaderModel.HighestShaderModel < D3D_SHADER_MODEL_6_0)
+    {
+      std::cerr << "SM6_0 not supported.\n";
+    }
+  }
+}
+
+void ShaderTesterImpl::initResources()
+{
+  // Size of the full-length buffers (m_bufferSize ints each).
+  const UINT64 fullSizeInBytes = m_bufferSize * sizeof(int);
+
+  // Creates one committed buffer on the requested heap type, in the requested
+  // initial state, and stores it in 'target'. Throws through ThrowIfFailed.
+  auto createBuffer = [this](D3D12_HEAP_TYPE heapType,
+                             const CD3DX12_RESOURCE_DESC& bufferDesc,
+                             D3D12_RESOURCE_STATES initialState,
+                             ComPtr<ID3D12Resource>& target)
+  {
+    const CD3DX12_HEAP_PROPERTIES heapProps(heapType);
+    ThrowIfFailed(m_device->CreateCommittedResource(
+      &heapProps,
+      D3D12_HEAP_FLAG_NONE,
+      &bufferDesc,
+      initialState,
+      nullptr,
+      IID_PPV_ARGS(&target)));
+  };
+
+  // GPU-side buffers bound as UAVs: input, output, and the single-int "one" buffer.
+  createBuffer(D3D12_HEAP_TYPE_DEFAULT,
+               CD3DX12_RESOURCE_DESC::Buffer(fullSizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS),
+               D3D12_RESOURCE_STATE_UNORDERED_ACCESS, m_input);
+  NAME_D3D12_OBJECT(m_input);
+
+  createBuffer(D3D12_HEAP_TYPE_DEFAULT,
+               CD3DX12_RESOURCE_DESC::Buffer(fullSizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS),
+               D3D12_RESOURCE_STATE_UNORDERED_ACCESS, m_output);
+  NAME_D3D12_OBJECT(m_output);
+
+  createBuffer(D3D12_HEAP_TYPE_DEFAULT,
+               CD3DX12_RESOURCE_DESC::Buffer(sizeof(int), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS),
+               D3D12_RESOURCE_STATE_UNORDERED_ACCESS, m_one);
+  NAME_D3D12_OBJECT(m_one);
+
+  // CPU-writable staging buffers that runShader copies into m_input / m_one.
+  createBuffer(D3D12_HEAP_TYPE_UPLOAD,
+               CD3DX12_RESOURCE_DESC::Buffer(fullSizeInBytes),
+               D3D12_RESOURCE_STATE_GENERIC_READ, m_uploadInput);
+  NAME_D3D12_OBJECT(m_uploadInput);
+
+  createBuffer(D3D12_HEAP_TYPE_UPLOAD,
+               CD3DX12_RESOURCE_DESC::Buffer(sizeof(int)),
+               D3D12_RESOURCE_STATE_GENERIC_READ, m_uploadOne);
+  NAME_D3D12_OBJECT(m_uploadOne);
+
+  // CPU-readable buffer that runShader copies m_output into.
+  createBuffer(D3D12_HEAP_TYPE_READBACK,
+               CD3DX12_RESOURCE_DESC::Buffer(fullSizeInBytes),
+               D3D12_RESOURCE_STATE_COPY_DEST, m_readback);
+  NAME_D3D12_OBJECT(m_readback);
+
+  // Shader-visible descriptor heap with one slot per UAV (input, output, one).
+  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
+  heapDesc.NumDescriptors = 3;
+  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+  ThrowIfFailed(m_device->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&m_uavHeap)));
+  m_uavDescriptorSize = m_device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+
+  // View description shared by the two full-length buffers: a buffer view
+  // with an int-sized element stride.
+  D3D12_UNORDERED_ACCESS_VIEW_DESC fullViewDesc = {};
+  fullViewDesc.Format = DXGI_FORMAT_UNKNOWN;
+  fullViewDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+  fullViewDesc.Buffer.FirstElement = 0;
+  fullViewDesc.Buffer.NumElements = m_bufferSize;
+  fullViewDesc.Buffer.StructureByteStride = sizeof(int);
+  fullViewDesc.Buffer.CounterOffsetInBytes = 0;
+  fullViewDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
+
+  // Same view, but covering only the single element of m_one.
+  D3D12_UNORDERED_ACCESS_VIEW_DESC oneViewDesc = fullViewDesc;
+  oneViewDesc.Buffer.NumElements = 1;
+
+  // Heap slots 0, 1 and 2 hold the views for input, output and one.
+  const D3D12_CPU_DESCRIPTOR_HANDLE heapStart = m_uavHeap->GetCPUDescriptorHandleForHeapStart();
+  const CD3DX12_CPU_DESCRIPTOR_HANDLE inputSlot(heapStart, 0, m_uavDescriptorSize);
+  const CD3DX12_CPU_DESCRIPTOR_HANDLE outputSlot(heapStart, 1, m_uavDescriptorSize);
+  const CD3DX12_CPU_DESCRIPTOR_HANDLE oneSlot(heapStart, 2, m_uavDescriptorSize);
+
+  m_device->CreateUnorderedAccessView(m_input.Get(), nullptr, &fullViewDesc, inputSlot);
+  m_device->CreateUnorderedAccessView(m_output.Get(), nullptr, &fullViewDesc, outputSlot);
+  m_device->CreateUnorderedAccessView(m_one.Get(), nullptr, &oneViewDesc, oneSlot);
+}
+
+// Builds the compute root signature and the compute pipeline state object.
+// The shader bytecode comes from, in order of preference: the blob supplied at
+// construction (m_blob), a precompiled file whose name contains ".cso", or a
+// source file compiled through D3DCompileToDxilFromFile with entry point
+// "CSMain" and target "cs_6_0". Any failing HRESULT is raised via ThrowIfFailed.
+void ShaderTesterImpl::initPipeline()
+{
+  // Compute root signature.
+  {
+    CD3DX12_ROOT_PARAMETER1 rootParameters[2];
+    CD3DX12_DESCRIPTOR_RANGE1 uavs(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 3, 0);
+    rootParameters[0].InitAsDescriptorTable(1, &uavs);  // register u0 : input
+                                                        // register u1 : output
+                                                        // register u2 : one
+    rootParameters[1].InitAsConstants(1, 0);            // register b0 : initialStateId
+
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC computeRootSignatureDesc;
+    computeRootSignatureDesc.Init_1_1(_countof(rootParameters), rootParameters);
+
+    ComPtr<ID3DBlob> signature;
+    ComPtr<ID3DBlob> error;
+    const HRESULT serializeResult = D3DX12SerializeVersionedRootSignature(&computeRootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error);
+    // Surface the serializer's diagnostic text (when one was produced) before throwing.
+    if (FAILED(serializeResult) && error.Get() != nullptr)
+    {
+      OutputDebugStringA(static_cast<LPCSTR>(error->GetBufferPointer()));
+    }
+    ThrowIfFailed(serializeResult);
+    ThrowIfFailed(m_device->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(), IID_PPV_ARGS(&m_computeRootSignature)));
+    NAME_D3D12_OBJECT(m_computeRootSignature);
+  }
+
+  // Create compute pipeline
+  {
+    // computeShaderBuffer / computeShaderBlob own the bytecode that 'bytecode'
+    // points at, so they must stay alive until the PSO has been created.
+    CD3DX12_SHADER_BYTECODE bytecode;
+    std::vector<char> computeShaderBuffer;
+    ComPtr<ID3DBlob> computeShaderBlob;
+    if (m_blob)
+    {
+      bytecode = CD3DX12_SHADER_BYTECODE(m_blob.Get());
+    }
+    else if (m_filename.find(L".cso") != std::wstring::npos)
+    {
+      ReadFileToBuffer(m_filename, computeShaderBuffer);
+      bytecode = CD3DX12_SHADER_BYTECODE(computeShaderBuffer.data(), computeShaderBuffer.size());
+    }
+    else
+    {
+      ComPtr<IDxcBlobEncoding> errors;
+      const HRESULT dxilResult = D3DCompileToDxilFromFile(m_filename.c_str(), L"CSMain", L"cs_6_0", nullptr, 0, &computeShaderBlob, &errors);
+      // The compile can fail without producing an error blob, so only
+      // dereference it when it was actually returned.
+      if (FAILED(dxilResult) && errors.Get() != nullptr)
+      {
+        OutputDebugStringA(static_cast<LPCSTR>(errors->GetBufferPointer()));
+      }
+      ThrowIfFailed(dxilResult);
+      bytecode = CD3DX12_SHADER_BYTECODE(computeShaderBlob.Get());
+    }
+
+    // Describe and create the compute pipeline state object (PSO).
+    D3D12_COMPUTE_PIPELINE_STATE_DESC computePsoDesc = {};
+    computePsoDesc.pRootSignature = m_computeRootSignature.Get();
+    computePsoDesc.CS = bytecode;
+    ThrowIfFailed(m_device->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(&m_computeState)));
+    NAME_D3D12_OBJECT(m_computeState);
+  }
+}
+
+void ShaderTesterImpl::initExecution() // Creates the command queue, allocator, command list, fence and fence event that runShader uses.
+{
+  // Describe and create the command queue.
+  D3D12_COMMAND_QUEUE_DESC queueDesc = {};
+  queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+  queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+  ThrowIfFailed(m_device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&m_commandQueue)));
+  ThrowIfFailed(m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_commandAllocator))); // Same list type as the queue above.
+
+  // Create the command list.
+  ThrowIfFailed(m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocator.Get(), nullptr, IID_PPV_ARGS(&m_commandList)));
+
+  // Command lists are created in the recording state, but there is nothing
+  // to record yet. runShader() calls Reset() on the list before recording, so close it now.
+  ThrowIfFailed(m_commandList->Close());
+
+  // Create synchronization objects
+  {
+    ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_fence)));
+    m_fenceValue = 1; // First value runShader() will signal on the queue.
+
+    // Create the event handle that runShader() waits on until the fence reaches the signaled value.
+    m_fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+    if (m_fenceEvent == nullptr)
+    {
+      ThrowIfFailed(HRESULT_FROM_WIN32(GetLastError())); // Convert the Win32 error into an HRESULT so it is reported like the D3D failures.
+    }
+  }
+}
+
+void ShaderTesterImpl::setDevice(const std::wstring& namePrefix) // Stores the adapter name prefix; the device-creation code in this file compares it against L"WARP" and otherwise passes it to GetHardwareAdapter.
+{
+  m_namePrefix = namePrefix; // Only recorded here; no device is created by this call.
+}
+
+
+// Runs the compute shader once and returns the contents of the output buffer.
+//
+// initialShaderId - value passed to the shader as root constant 0 of root parameter 1.
+// input           - ints uploaded to the input buffer starting at element 1
+//                   (element 0 is written as 0). Must hold at most
+//                   m_bufferSize - 1 values; otherwise E_INVALIDARG is raised
+//                   through ThrowIfFailed.
+// output          - replaced with all m_bufferSize ints read back from the output buffer.
+//
+// Lazily calls init() when no device exists yet. Blocks until the GPU has
+// finished. Every failing HRESULT is raised through ThrowIfFailed.
+void ShaderTesterImpl::runShader(int initialShaderId, const std::vector<int>& input, std::vector<int>& output)
+{
+  // The upload buffer holds m_bufferSize ints and element 0 is reserved, so
+  // reject anything that would be copied past its end.
+  if (input.size() + 1 > static_cast<size_t>(m_bufferSize))
+    ThrowIfFailed(E_INVALIDARG);
+
+  if (!m_device)
+    init();
+
+  //////////////////////////////////////////////////////////////////////////
+  // Dispatch compute shader
+  //////////////////////////////////////////////////////////////////////////
+
+  // Command list allocators can only be reset when the associated
+  // command lists have finished execution on the GPU; apps should use
+  // fences to determine GPU execution progress.
+  ThrowIfFailed(m_commandAllocator->Reset());
+
+  // However, once ExecuteCommandLists() has been called on a particular
+  // command list, that list can be reset at any time, and it must be reset
+  // before re-recording.
+  ThrowIfFailed(m_commandList->Reset(m_commandAllocator.Get(), nullptr));
+
+  m_commandList->SetPipelineState(m_computeState.Get());
+  m_commandList->SetComputeRootSignature(m_computeRootSignature.Get());
+
+  ID3D12DescriptorHeap* ppHeaps[] = { m_uavHeap.Get() };
+  m_commandList->SetDescriptorHeaps(_countof(ppHeaps), ppHeaps);
+
+  CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandle(m_uavHeap->GetGPUDescriptorHandleForHeapStart(), 0, m_uavDescriptorSize);
+  m_commandList->SetComputeRootDescriptorTable(0, uavHandle);
+  m_commandList->SetComputeRoot32BitConstant(1, initialShaderId, 0);
+
+  // Upload some data. The empty read range tells the runtime the CPU will
+  // not read from the mapped upload memory.
+  const CD3DX12_RANGE noRead(0, 0);
+  int* pUpload = nullptr;
+  ThrowIfFailed(m_uploadInput->Map(0, &noRead, reinterpret_cast<void**>(&pUpload)));
+  pUpload[0] = 0;
+  if (!input.empty())  // data() of an empty vector may be null; do not hand that to memcpy
+    memcpy(pUpload + 1, input.data(), input.size() * sizeof(int));
+  m_uploadInput->Unmap(0, nullptr);
+
+  ThrowIfFailed(m_uploadOne->Map(0, &noRead, reinterpret_cast<void**>(&pUpload)));
+  pUpload[0] = 1;
+  m_uploadOne->Unmap(0, nullptr);
+
+
+  // Copy the staged data into the GPU buffers, returning each to the UAV state afterwards.
+  CD3DX12_RESOURCE_BARRIER bar;
+  m_commandList->ResourceBarrier(1, &(bar = CD3DX12_RESOURCE_BARRIER::Transition(m_input.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST)));
+  m_commandList->CopyResource(m_input.Get(), m_uploadInput.Get());
+  m_commandList->ResourceBarrier(1, &(bar = CD3DX12_RESOURCE_BARRIER::Transition(m_input.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)));
+  m_commandList->ResourceBarrier(1, &(bar = CD3DX12_RESOURCE_BARRIER::Transition(m_one.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST)));
+  m_commandList->CopyResource(m_one.Get(), m_uploadOne.Get());
+  m_commandList->ResourceBarrier(1, &(bar = CD3DX12_RESOURCE_BARRIER::Transition(m_one.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)));
+
+  m_commandList->Dispatch(1, 1, 1);
+
+  // Copy the shader's output into the readback buffer.
+  m_commandList->ResourceBarrier(1, &(bar = CD3DX12_RESOURCE_BARRIER::Transition(m_output.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE)));
+  m_commandList->CopyResource(m_readback.Get(), m_output.Get());
+  m_commandList->ResourceBarrier(1, &(bar = CD3DX12_RESOURCE_BARRIER::Transition(m_output.Get(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)));
+
+  ThrowIfFailed(m_commandList->Close());
+
+  // Execute the command list.
+  ID3D12CommandList* ppCommandLists[] = { m_commandList.Get() };
+  m_commandQueue->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists);
+
+
+  //////////////////////////////////////////////////////////////////////////
+  // Synchronize
+  //////////////////////////////////////////////////////////////////////////
+
+  // Signal the current fence value on the queue, then advance it for the next run.
+  const UINT64 oldFenceValue = m_fenceValue;
+  ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), oldFenceValue));
+  m_fenceValue++;
+
+  // Block until the GPU has reached the value signaled above.
+  if (m_fence->GetCompletedValue() < oldFenceValue)
+  {
+    ThrowIfFailed(m_fence->SetEventOnCompletion(oldFenceValue, m_fenceEvent));
+    WaitForSingleObject(m_fenceEvent, INFINITE);
+  }
+
+
+  //////////////////////////////////////////////////////////////////////////
+  // Readback
+  //////////////////////////////////////////////////////////////////////////
+  {
+    int* pReadback = nullptr;
+    // D3D12_RANGE offsets are in bytes, so cover all m_bufferSize ints.
+    const CD3DX12_RANGE readRange(0, static_cast<SIZE_T>(m_bufferSize) * sizeof(int));
+    ThrowIfFailed(m_readback->Map(0, &readRange, reinterpret_cast<void**>(&pReadback)));
+    output.assign(pReadback, pReadback + m_bufferSize);
+    // Empty written range: the CPU did not modify the readback memory.
+    const CD3DX12_RANGE writeRange(0, 0);
+    m_readback->Unmap(0, &writeRange);
+  }
+}
+
+// Writes a log to std::cout as "<count>: <v0> <v1> ... \n", where log[0] is
+// the entry count and the entries follow it.
+void ShaderTesterImpl::printLog(int* log)
+{
+  const int entryCount = log[0];
+  std::cout << entryCount << ": ";
+
+  // Walk the entries that follow the count; nothing is printed when the count is <= 0.
+  const int* entry = log + 1;
+  for (int remaining = entryCount; remaining > 0; --remaining, ++entry)
+    std::cout << *entry << " ";
+
+  std::cout << "\n";
+}

+ 55 - 0
tools/clang/unittests/DxrFallback/ShaderTesterImpl.h

@@ -0,0 +1,55 @@
+#pragma once
+#include "ShaderTester.h"
+
+using namespace DirectX;
+using Microsoft::WRL::ComPtr;
+
+// ShaderTester implementation that runs a compute shader on a D3D12 device.
+// The shader is supplied either as a file name or as an already-built blob;
+// runShader() uploads an int buffer, dispatches the shader and reads an int
+// buffer back.
+class ShaderTesterImpl : public ShaderTester
+{
+public:
+  ShaderTesterImpl(const std::wstring& filename);
+  ShaderTesterImpl(ID3DBlob* blob);
+  virtual ~ShaderTesterImpl();
+
+  // Records the adapter name prefix used when the device is created.
+  virtual void setDevice(const std::wstring& namePrefix);
+  // Runs the shader once; 'output' receives the full output buffer.
+  virtual void runShader(int initialShaderId, const std::vector<int>& input, std::vector<int>& output) override;
+
+private:
+  void init();
+  void initDevice();
+  void initResources();
+  void initPipeline();
+  void initExecution();
+
+  void printLog(int* log);
+
+  // Shader source: exactly one of m_filename / m_blob is expected to be set
+  // by the constructors -- NOTE(review): constructors are not visible here, confirm.
+  std::wstring m_filename;
+  ComPtr<ID3DBlob> m_blob;
+  std::wstring m_namePrefix;
+
+  ComPtr<ID3D12Device> m_device;
+
+  ComPtr<ID3D12PipelineState> m_computeState;
+  ComPtr<ID3D12RootSignature> m_computeRootSignature;
+
+  // Resources
+  int m_bufferSize = 64 * 1024;  // element count (ints) of the input/output/readback buffers
+  ComPtr<ID3D12Resource> m_input;
+  ComPtr<ID3D12Resource> m_output;
+  ComPtr<ID3D12Resource> m_one;
+  ComPtr<ID3D12Resource> m_uploadInput;
+  ComPtr<ID3D12Resource> m_uploadOne;
+  ComPtr<ID3D12Resource> m_readback;
+  ComPtr<ID3D12DescriptorHeap> m_uavHeap;
+  UINT m_uavDescriptorSize = 0;  // set in initResources(); initialized so it is never read as garbage
+
+  // Execution
+  ComPtr<ID3D12CommandAllocator> m_commandAllocator;
+  ComPtr<ID3D12CommandQueue> m_commandQueue;
+  ComPtr<ID3D12GraphicsCommandList> m_commandList;
+
+  // Synchronization objects.
+  ComPtr<ID3D12Fence> m_fence;
+  HANDLE m_fenceEvent = nullptr;  // created in initExecution(); null until then so cleanup code can test it
+  UINT64 m_fenceValue = 0;        // set to 1 in initExecution()
+};

+ 1946 - 0
tools/clang/unittests/DxrFallback/d3dx12.h

@@ -0,0 +1,1946 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#ifndef __D3DX12_H__
+#define __D3DX12_H__
+
+#include "d3d12.h"
+
+#if defined( __cplusplus )
+
+struct CD3DX12_DEFAULT {}; // Empty tag type: the CD3DX12_* constructors that take it fill the struct with default values.
+extern const DECLSPEC_SELECTANY CD3DX12_DEFAULT D3D12_DEFAULT; // Tag object of that type, for passing to those constructors.
+
+//------------------------------------------------------------------------------------------------
+// Two viewports are equal when all six fields match; comparison stops at the
+// first field that differs.
+inline bool operator==( const D3D12_VIEWPORT& l, const D3D12_VIEWPORT& r )
+{
+    if ( !( l.TopLeftX == r.TopLeftX ) ) return false;
+    if ( !( l.TopLeftY == r.TopLeftY ) ) return false;
+    if ( !( l.Width == r.Width ) ) return false;
+    if ( !( l.Height == r.Height ) ) return false;
+    if ( !( l.MinDepth == r.MinDepth ) ) return false;
+    return l.MaxDepth == r.MaxDepth;
+}
+
+//------------------------------------------------------------------------------------------------
+inline bool operator!=( const D3D12_VIEWPORT& l, const D3D12_VIEWPORT& r ) // Exact negation of operator== for D3D12_VIEWPORT.
+{ return !( l == r ); }
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RECT : public D3D12_RECT // Adds constructors to D3D12_RECT; declares no data members of its own.
+{
+    CD3DX12_RECT() // Assigns no members.
+    {}
+    explicit CD3DX12_RECT( const D3D12_RECT& o ) : // Copies an existing D3D12_RECT.
+        D3D12_RECT( o )
+    {}
+    explicit CD3DX12_RECT( // Builds the rect from its four edges.
+        LONG Left,
+        LONG Top,
+        LONG Right,
+        LONG Bottom )
+    {
+        left = Left;
+        top = Top;
+        right = Right;
+        bottom = Bottom;
+    }
+    ~CD3DX12_RECT() {}
+    operator const D3D12_RECT&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_BOX : public D3D12_BOX // Adds constructors to D3D12_BOX for 1-, 2- and 3-axis extents.
+{
+    CD3DX12_BOX() // Assigns no members.
+    {}
+    explicit CD3DX12_BOX( const D3D12_BOX& o ) : // Copies an existing D3D12_BOX.
+        D3D12_BOX( o )
+    {}
+    explicit CD3DX12_BOX( // Left/right only: the other two axes are fixed to the range [0, 1).
+        LONG Left,
+        LONG Right )
+    {
+        left = Left;
+        top = 0;
+        front = 0;
+        right = Right;
+        bottom = 1;
+        back = 1;
+    }
+    explicit CD3DX12_BOX( // Left/top/right/bottom: front is 0 and back is 1.
+        LONG Left,
+        LONG Top,
+        LONG Right,
+        LONG Bottom )
+    {
+        left = Left;
+        top = Top;
+        front = 0;
+        right = Right;
+        bottom = Bottom;
+        back = 1;
+    }
+    explicit CD3DX12_BOX( // All six faces given explicitly.
+        LONG Left,
+        LONG Top,
+        LONG Front,
+        LONG Right,
+        LONG Bottom,
+        LONG Back )
+    {
+        left = Left;
+        top = Top;
+        front = Front;
+        right = Right;
+        bottom = Bottom;
+        back = Back;
+    }
+    ~CD3DX12_BOX() {}
+    operator const D3D12_BOX&() const { return *this; } // Exposes the object as its base struct.
+};
+// Two boxes are equal when all six faces match; comparison stops at the first
+// face that differs.
+inline bool operator==( const D3D12_BOX& l, const D3D12_BOX& r )
+{
+    if ( !( l.left == r.left ) ) return false;
+    if ( !( l.top == r.top ) ) return false;
+    if ( !( l.front == r.front ) ) return false;
+    if ( !( l.right == r.right ) ) return false;
+    if ( !( l.bottom == r.bottom ) ) return false;
+    return l.back == r.back;
+}
+// Exact negation of the equality operator above.
+inline bool operator!=( const D3D12_BOX& l, const D3D12_BOX& r )
+{
+    const bool same = ( l == r );
+    return !same;
+}
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_DEPTH_STENCIL_DESC : public D3D12_DEPTH_STENCIL_DESC // Adds constructors to D3D12_DEPTH_STENCIL_DESC.
+{
+    CD3DX12_DEPTH_STENCIL_DESC() // Assigns no members.
+    {}
+    explicit CD3DX12_DEPTH_STENCIL_DESC( const D3D12_DEPTH_STENCIL_DESC& o ) : // Copies an existing description.
+        D3D12_DEPTH_STENCIL_DESC( o )
+    {}
+    explicit CD3DX12_DEPTH_STENCIL_DESC( CD3DX12_DEFAULT ) // Defaults: depth test on (LESS, write all), stencil off.
+    {
+        DepthEnable = TRUE;
+        DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL;
+        DepthFunc = D3D12_COMPARISON_FUNC_LESS;
+        StencilEnable = FALSE;
+        StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK;
+        StencilWriteMask = D3D12_DEFAULT_STENCIL_WRITE_MASK;
+        const D3D12_DEPTH_STENCILOP_DESC defaultStencilOp = // KEEP for all three ops, comparison ALWAYS.
+        { D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS };
+        FrontFace = defaultStencilOp; // Both faces share the same default stencil op.
+        BackFace = defaultStencilOp;
+    }
+    explicit CD3DX12_DEPTH_STENCIL_DESC( // Sets every field explicitly, with front- and back-face stencil ops given separately.
+        BOOL depthEnable,
+        D3D12_DEPTH_WRITE_MASK depthWriteMask,
+        D3D12_COMPARISON_FUNC depthFunc,
+        BOOL stencilEnable,
+        UINT8 stencilReadMask,
+        UINT8 stencilWriteMask,
+        D3D12_STENCIL_OP frontStencilFailOp,
+        D3D12_STENCIL_OP frontStencilDepthFailOp,
+        D3D12_STENCIL_OP frontStencilPassOp,
+        D3D12_COMPARISON_FUNC frontStencilFunc,
+        D3D12_STENCIL_OP backStencilFailOp,
+        D3D12_STENCIL_OP backStencilDepthFailOp,
+        D3D12_STENCIL_OP backStencilPassOp,
+        D3D12_COMPARISON_FUNC backStencilFunc )
+    {
+        DepthEnable = depthEnable;
+        DepthWriteMask = depthWriteMask;
+        DepthFunc = depthFunc;
+        StencilEnable = stencilEnable;
+        StencilReadMask = stencilReadMask;
+        StencilWriteMask = stencilWriteMask;
+        FrontFace.StencilFailOp = frontStencilFailOp;
+        FrontFace.StencilDepthFailOp = frontStencilDepthFailOp;
+        FrontFace.StencilPassOp = frontStencilPassOp;
+        FrontFace.StencilFunc = frontStencilFunc;
+        BackFace.StencilFailOp = backStencilFailOp;
+        BackFace.StencilDepthFailOp = backStencilDepthFailOp;
+        BackFace.StencilPassOp = backStencilPassOp;
+        BackFace.StencilFunc = backStencilFunc;
+    }
+    ~CD3DX12_DEPTH_STENCIL_DESC() {}
+    operator const D3D12_DEPTH_STENCIL_DESC&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_BLEND_DESC : public D3D12_BLEND_DESC // Adds constructors to D3D12_BLEND_DESC.
+{
+    CD3DX12_BLEND_DESC() // Assigns no members.
+    {}
+    explicit CD3DX12_BLEND_DESC( const D3D12_BLEND_DESC& o ) : // Copies an existing description.
+        D3D12_BLEND_DESC( o )
+    {}
+    explicit CD3DX12_BLEND_DESC( CD3DX12_DEFAULT ) // Defaults: alpha-to-coverage and independent blend off, one shared entry for every render target.
+    {
+        AlphaToCoverageEnable = FALSE;
+        IndependentBlendEnable = FALSE;
+        const D3D12_RENDER_TARGET_BLEND_DESC defaultRenderTargetBlendDesc = // Positional initializer; order follows the D3D12_RENDER_TARGET_BLEND_DESC declaration.
+        {
+            FALSE,FALSE,
+            D3D12_BLEND_ONE, D3D12_BLEND_ZERO, D3D12_BLEND_OP_ADD,
+            D3D12_BLEND_ONE, D3D12_BLEND_ZERO, D3D12_BLEND_OP_ADD,
+            D3D12_LOGIC_OP_NOOP,
+            D3D12_COLOR_WRITE_ENABLE_ALL,
+        };
+        for (UINT i = 0; i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT; ++i) // Copy the same default entry into every RenderTarget slot.
+            RenderTarget[ i ] = defaultRenderTargetBlendDesc;
+    }
+    ~CD3DX12_BLEND_DESC() {}
+    operator const D3D12_BLEND_DESC&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RASTERIZER_DESC : public D3D12_RASTERIZER_DESC // Adds constructors to D3D12_RASTERIZER_DESC.
+{
+    CD3DX12_RASTERIZER_DESC() // Assigns no members.
+    {}
+    explicit CD3DX12_RASTERIZER_DESC( const D3D12_RASTERIZER_DESC& o ) : // Copies an existing description.
+        D3D12_RASTERIZER_DESC( o )
+    {}
+    explicit CD3DX12_RASTERIZER_DESC( CD3DX12_DEFAULT ) // Defaults: solid fill, back-face culling, depth clip on, everything else off or at the D3D12_DEFAULT_* constants.
+    {
+        FillMode = D3D12_FILL_MODE_SOLID;
+        CullMode = D3D12_CULL_MODE_BACK;
+        FrontCounterClockwise = FALSE;
+        DepthBias = D3D12_DEFAULT_DEPTH_BIAS;
+        DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP;
+        SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS;
+        DepthClipEnable = TRUE;
+        MultisampleEnable = FALSE;
+        AntialiasedLineEnable = FALSE;
+        ForcedSampleCount = 0;
+        ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF;
+    }
+    explicit CD3DX12_RASTERIZER_DESC( // Sets every field explicitly; parameters map one-to-one onto the members.
+        D3D12_FILL_MODE fillMode,
+        D3D12_CULL_MODE cullMode,
+        BOOL frontCounterClockwise,
+        INT depthBias,
+        FLOAT depthBiasClamp,
+        FLOAT slopeScaledDepthBias,
+        BOOL depthClipEnable,
+        BOOL multisampleEnable,
+        BOOL antialiasedLineEnable, 
+        UINT forcedSampleCount, 
+        D3D12_CONSERVATIVE_RASTERIZATION_MODE conservativeRaster)
+    {
+        FillMode = fillMode;
+        CullMode = cullMode;
+        FrontCounterClockwise = frontCounterClockwise;
+        DepthBias = depthBias;
+        DepthBiasClamp = depthBiasClamp;
+        SlopeScaledDepthBias = slopeScaledDepthBias;
+        DepthClipEnable = depthClipEnable;
+        MultisampleEnable = multisampleEnable;
+        AntialiasedLineEnable = antialiasedLineEnable;
+        ForcedSampleCount = forcedSampleCount;
+        ConservativeRaster = conservativeRaster;
+    }
+    ~CD3DX12_RASTERIZER_DESC() {}
+    operator const D3D12_RASTERIZER_DESC&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RESOURCE_ALLOCATION_INFO : public D3D12_RESOURCE_ALLOCATION_INFO // Adds constructors to D3D12_RESOURCE_ALLOCATION_INFO.
+{
+    CD3DX12_RESOURCE_ALLOCATION_INFO() // Assigns no members.
+    {}
+    explicit CD3DX12_RESOURCE_ALLOCATION_INFO( const D3D12_RESOURCE_ALLOCATION_INFO& o ) : // Copies an existing info struct.
+        D3D12_RESOURCE_ALLOCATION_INFO( o )
+    {}
+    CD3DX12_RESOURCE_ALLOCATION_INFO( // Sets SizeInBytes and Alignment directly.
+        UINT64 size,
+        UINT64 alignment )
+    {
+        SizeInBytes = size;
+        Alignment = alignment;
+    }
+    operator const D3D12_RESOURCE_ALLOCATION_INFO&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_HEAP_PROPERTIES : public D3D12_HEAP_PROPERTIES // Adds constructors and a CPU-accessibility query to D3D12_HEAP_PROPERTIES.
+{
+    CD3DX12_HEAP_PROPERTIES() // Assigns no members.
+    {}
+    explicit CD3DX12_HEAP_PROPERTIES(const D3D12_HEAP_PROPERTIES &o) : // Copies existing heap properties.
+        D3D12_HEAP_PROPERTIES(o)
+    {}
+    CD3DX12_HEAP_PROPERTIES( // Custom heap: Type is forced to D3D12_HEAP_TYPE_CUSTOM and the page property / pool come from the caller.
+        D3D12_CPU_PAGE_PROPERTY cpuPageProperty, 
+        D3D12_MEMORY_POOL memoryPoolPreference,
+        UINT creationNodeMask = 1, 
+        UINT nodeMask = 1 )
+    {
+        Type = D3D12_HEAP_TYPE_CUSTOM;
+        CPUPageProperty = cpuPageProperty;
+        MemoryPoolPreference = memoryPoolPreference;
+        CreationNodeMask = creationNodeMask;
+        VisibleNodeMask = nodeMask;
+    }
+    explicit CD3DX12_HEAP_PROPERTIES( // Typed heap: page property and memory pool are set to their UNKNOWN values.
+        D3D12_HEAP_TYPE type, 
+        UINT creationNodeMask = 1, 
+        UINT nodeMask = 1 )
+    {
+        Type = type;
+        CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
+        MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
+        CreationNodeMask = creationNodeMask;
+        VisibleNodeMask = nodeMask;
+    }
+    operator const D3D12_HEAP_PROPERTIES&() const { return *this; } // Exposes the object as its base struct.
+    bool IsCPUAccessible() const // True for UPLOAD and READBACK heaps, and for CUSTOM heaps whose page property is WRITE_COMBINE or WRITE_BACK.
+    {
+        return Type == D3D12_HEAP_TYPE_UPLOAD || Type == D3D12_HEAP_TYPE_READBACK || (Type == D3D12_HEAP_TYPE_CUSTOM &&
+            (CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE || CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_BACK));
+    }
+};
+// Heap properties are equal when all five fields match; comparison stops at
+// the first field that differs.
+inline bool operator==( const D3D12_HEAP_PROPERTIES& l, const D3D12_HEAP_PROPERTIES& r )
+{
+    if ( !( l.Type == r.Type ) ) return false;
+    if ( !( l.CPUPageProperty == r.CPUPageProperty ) ) return false;
+    if ( !( l.MemoryPoolPreference == r.MemoryPoolPreference ) ) return false;
+    if ( !( l.CreationNodeMask == r.CreationNodeMask ) ) return false;
+    return l.VisibleNodeMask == r.VisibleNodeMask;
+}
+// Exact negation of the equality operator above.
+inline bool operator!=( const D3D12_HEAP_PROPERTIES& l, const D3D12_HEAP_PROPERTIES& r )
+{
+    const bool same = ( l == r );
+    return !same;
+}
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_HEAP_DESC : public D3D12_HEAP_DESC // Adds constructors to D3D12_HEAP_DESC: size/alignment given directly or taken from a D3D12_RESOURCE_ALLOCATION_INFO, combined with three ways of naming the heap properties.
+{
+    CD3DX12_HEAP_DESC() // Assigns no members.
+    {}
+    explicit CD3DX12_HEAP_DESC(const D3D12_HEAP_DESC &o) : // Copies an existing description.
+        D3D12_HEAP_DESC(o)
+    {}
+    CD3DX12_HEAP_DESC( // Explicit size plus ready-made heap properties.
+        UINT64 size, 
+        D3D12_HEAP_PROPERTIES properties, 
+        UINT64 alignment = 0, 
+        D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE )
+    {
+        SizeInBytes = size;
+        Properties = properties;
+        Alignment = alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC( // Explicit size plus a heap type; properties are built by CD3DX12_HEAP_PROPERTIES( type ).
+        UINT64 size, 
+        D3D12_HEAP_TYPE type, 
+        UINT64 alignment = 0, 
+        D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE )
+    {
+        SizeInBytes = size;
+        Properties = CD3DX12_HEAP_PROPERTIES( type );
+        Alignment = alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC( // Explicit size plus page property / memory pool; properties are built by the two-argument CD3DX12_HEAP_PROPERTIES constructor.
+        UINT64 size, 
+        D3D12_CPU_PAGE_PROPERTY cpuPageProperty, 
+        D3D12_MEMORY_POOL memoryPoolPreference, 
+        UINT64 alignment = 0, 
+        D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE )
+    {
+        SizeInBytes = size;
+        Properties = CD3DX12_HEAP_PROPERTIES( cpuPageProperty, memoryPoolPreference );
+        Alignment = alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC( // Size and alignment from resAllocInfo, plus ready-made heap properties.
+        const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo,
+        D3D12_HEAP_PROPERTIES properties, 
+        D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE )
+    {
+        SizeInBytes = resAllocInfo.SizeInBytes;
+        Properties = properties;
+        Alignment = resAllocInfo.Alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC( // Size and alignment from resAllocInfo, plus a heap type.
+        const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo,
+        D3D12_HEAP_TYPE type, 
+        D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE )
+    {
+        SizeInBytes = resAllocInfo.SizeInBytes;
+        Properties = CD3DX12_HEAP_PROPERTIES( type );
+        Alignment = resAllocInfo.Alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC( // Size and alignment from resAllocInfo, plus page property / memory pool.
+        const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo,
+        D3D12_CPU_PAGE_PROPERTY cpuPageProperty, 
+        D3D12_MEMORY_POOL memoryPoolPreference, 
+        D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE )
+    {
+        SizeInBytes = resAllocInfo.SizeInBytes;
+        Properties = CD3DX12_HEAP_PROPERTIES( cpuPageProperty, memoryPoolPreference );
+        Alignment = resAllocInfo.Alignment;
+        Flags = flags;
+    }
+    operator const D3D12_HEAP_DESC&() const { return *this; } // Exposes the object as its base struct.
+    bool IsCPUAccessible() const // Forwards to CD3DX12_HEAP_PROPERTIES::IsCPUAccessible by viewing the Properties member through the wrapper type.
+    { return static_cast< const CD3DX12_HEAP_PROPERTIES* >( &Properties )->IsCPUAccessible(); }
+};
+// Heap descriptions are equal when size, properties, alignment and flags all
+// match; comparison stops at the first field that differs.
+inline bool operator==( const D3D12_HEAP_DESC& l, const D3D12_HEAP_DESC& r )
+{
+    if ( !( l.SizeInBytes == r.SizeInBytes ) ) return false;
+    if ( !( l.Properties == r.Properties ) ) return false;
+    if ( !( l.Alignment == r.Alignment ) ) return false;
+    return l.Flags == r.Flags;
+}
+// Exact negation of the equality operator above.
+inline bool operator!=( const D3D12_HEAP_DESC& l, const D3D12_HEAP_DESC& r )
+{
+    const bool same = ( l == r );
+    return !same;
+}
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_CLEAR_VALUE : public D3D12_CLEAR_VALUE // Adds constructors to D3D12_CLEAR_VALUE for a color value or a depth/stencil value.
+{
+    CD3DX12_CLEAR_VALUE() // Assigns no members.
+    {}
+    explicit CD3DX12_CLEAR_VALUE(const D3D12_CLEAR_VALUE &o) : // Copies an existing clear value.
+        D3D12_CLEAR_VALUE(o)
+    {}
+    CD3DX12_CLEAR_VALUE( // Color form: sets Format and copies sizeof( Color ) bytes from 'color'.
+        DXGI_FORMAT format, 
+        const FLOAT color[4] )
+    {
+        Format = format;
+        memcpy( Color, color, sizeof( Color ) );
+    }
+    CD3DX12_CLEAR_VALUE( // Depth/stencil form: sets Format, DepthStencil.Depth and DepthStencil.Stencil.
+        DXGI_FORMAT format, 
+        FLOAT depth,
+        UINT8 stencil )
+    {
+        Format = format;
+        /* Use memcpy to preserve NAN values */
+        memcpy( &DepthStencil.Depth, &depth, sizeof( depth ) );
+        DepthStencil.Stencil = stencil;
+    }
+    operator const D3D12_CLEAR_VALUE&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RANGE : public D3D12_RANGE // Adds constructors to D3D12_RANGE.
+{
+    CD3DX12_RANGE() // Assigns no members.
+    {}
+    explicit CD3DX12_RANGE(const D3D12_RANGE &o) : // Copies an existing range.
+        D3D12_RANGE(o)
+    {}
+    CD3DX12_RANGE( // Sets Begin and End directly.
+        SIZE_T begin, 
+        SIZE_T end )
+    {
+        Begin = begin;
+        End = end;
+    }
+    operator const D3D12_RANGE&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+// Adds constructors to D3D12_SHADER_BYTECODE. The struct only stores a pointer
+// and a length; it never owns or copies the bytecode, so the blob or buffer it
+// was built from must outlive every use of the struct.
+struct CD3DX12_SHADER_BYTECODE : public D3D12_SHADER_BYTECODE
+{
+    // Assigns no members.
+    CD3DX12_SHADER_BYTECODE()
+    {}
+    // Copies an existing D3D12_SHADER_BYTECODE.
+    explicit CD3DX12_SHADER_BYTECODE(const D3D12_SHADER_BYTECODE &o) :
+        D3D12_SHADER_BYTECODE(o)
+    {}
+    // Points at the contents of pShaderBlob, which must not be null.
+    CD3DX12_SHADER_BYTECODE(
+        ID3DBlob* pShaderBlob )
+    {
+        pShaderBytecode = pShaderBlob->GetBufferPointer();
+        BytecodeLength = pShaderBlob->GetBufferSize();
+    }
+    // Points at caller-owned bytecode. The pointer is taken as const void*
+    // because the bytecode is only read; both const and non-const buffers
+    // are accepted.
+    CD3DX12_SHADER_BYTECODE(
+        const void* _pShaderBytecode,
+        SIZE_T bytecodeLength )
+    {
+        pShaderBytecode = _pShaderBytecode;
+        BytecodeLength = bytecodeLength;
+    }
+    // Exposes the object as its base struct.
+    operator const D3D12_SHADER_BYTECODE&() const { return *this; }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TILED_RESOURCE_COORDINATE : public D3D12_TILED_RESOURCE_COORDINATE // Adds constructors to D3D12_TILED_RESOURCE_COORDINATE.
+{
+    CD3DX12_TILED_RESOURCE_COORDINATE() // Assigns no members.
+    {}
+    explicit CD3DX12_TILED_RESOURCE_COORDINATE(const D3D12_TILED_RESOURCE_COORDINATE &o) : // Copies an existing coordinate.
+        D3D12_TILED_RESOURCE_COORDINATE(o)
+    {}
+    CD3DX12_TILED_RESOURCE_COORDINATE( // Sets X, Y, Z and Subresource directly.
+        UINT x, 
+        UINT y, 
+        UINT z, 
+        UINT subresource ) 
+    {
+        X = x;
+        Y = y;
+        Z = z;
+        Subresource = subresource;
+    }
+    operator const D3D12_TILED_RESOURCE_COORDINATE&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TILE_REGION_SIZE : public D3D12_TILE_REGION_SIZE // Adds constructors to D3D12_TILE_REGION_SIZE.
+{
+    CD3DX12_TILE_REGION_SIZE() // Assigns no members.
+    {}
+    explicit CD3DX12_TILE_REGION_SIZE(const D3D12_TILE_REGION_SIZE &o) : // Copies an existing region size.
+        D3D12_TILE_REGION_SIZE(o)
+    {}
+    CD3DX12_TILE_REGION_SIZE( // Sets NumTiles, UseBox, Width, Height and Depth directly.
+        UINT numTiles, 
+        BOOL useBox, 
+        UINT width, 
+        UINT16 height, 
+        UINT16 depth ) 
+    {
+        NumTiles = numTiles;
+        UseBox = useBox;
+        Width = width;
+        Height = height;
+        Depth = depth;
+    }
+    operator const D3D12_TILE_REGION_SIZE&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_SUBRESOURCE_TILING : public D3D12_SUBRESOURCE_TILING // Adds constructors to D3D12_SUBRESOURCE_TILING.
+{
+    CD3DX12_SUBRESOURCE_TILING() // Assigns no members.
+    {}
+    explicit CD3DX12_SUBRESOURCE_TILING(const D3D12_SUBRESOURCE_TILING &o) : // Copies an existing tiling description.
+        D3D12_SUBRESOURCE_TILING(o)
+    {}
+    CD3DX12_SUBRESOURCE_TILING( // Sets the three tile extents and the start tile index directly.
+        UINT widthInTiles, 
+        UINT16 heightInTiles, 
+        UINT16 depthInTiles, 
+        UINT startTileIndexInOverallResource ) 
+    {
+        WidthInTiles = widthInTiles;
+        HeightInTiles = heightInTiles;
+        DepthInTiles = depthInTiles;
+        StartTileIndexInOverallResource = startTileIndexInOverallResource;
+    }
+    operator const D3D12_SUBRESOURCE_TILING&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TILE_SHAPE : public D3D12_TILE_SHAPE // Adds constructors to D3D12_TILE_SHAPE.
+{
+    CD3DX12_TILE_SHAPE() // Assigns no members.
+    {}
+    explicit CD3DX12_TILE_SHAPE(const D3D12_TILE_SHAPE &o) : // Copies an existing tile shape.
+        D3D12_TILE_SHAPE(o)
+    {}
+    CD3DX12_TILE_SHAPE( // Sets the width, height and depth in texels directly.
+        UINT widthInTexels, 
+        UINT heightInTexels, 
+        UINT depthInTexels ) 
+    {
+        WidthInTexels = widthInTexels;
+        HeightInTexels = heightInTexels;
+        DepthInTexels = depthInTexels;
+    }
+    operator const D3D12_TILE_SHAPE&() const { return *this; } // Exposes the object as its base struct.
+};
+
+//------------------------------------------------------------------------------------------------
+// Factory helpers for D3D12_RESOURCE_BARRIER; adds no data members.
+//
+// The static factories are named Transition, Aliasing and UAV, which hides the
+// identically named payload members of the base struct inside this type. Each
+// factory therefore writes the payload through a D3D12_RESOURCE_BARRIER
+// reference bound to the object being built.
+struct CD3DX12_RESOURCE_BARRIER : public D3D12_RESOURCE_BARRIER
+{
+    // Performs no initialization of its own.
+    CD3DX12_RESOURCE_BARRIER()
+    {
+    }
+
+    // Copies an existing D3D12_RESOURCE_BARRIER.
+    explicit CD3DX12_RESOURCE_BARRIER(const D3D12_RESOURCE_BARRIER &o)
+        : D3D12_RESOURCE_BARRIER(o)
+    {
+    }
+
+    // Builds a zero-filled barrier of type TRANSITION for pResource going from
+    // stateBefore to stateAfter on the given subresource, with the given flags.
+    static inline CD3DX12_RESOURCE_BARRIER Transition(
+        _In_ ID3D12Resource* pResource,
+        D3D12_RESOURCE_STATES stateBefore,
+        D3D12_RESOURCE_STATES stateAfter,
+        UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES,
+        D3D12_RESOURCE_BARRIER_FLAGS flags = D3D12_RESOURCE_BARRIER_FLAG_NONE)
+    {
+        CD3DX12_RESOURCE_BARRIER result;
+        ZeroMemory(&result, sizeof(CD3DX12_RESOURCE_BARRIER));
+        D3D12_RESOURCE_BARRIER &base = result;
+        base.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+        base.Flags = flags;
+        base.Transition.Subresource = subresource;
+        base.Transition.StateAfter = stateAfter;
+        base.Transition.StateBefore = stateBefore;
+        base.Transition.pResource = pResource;
+        return result;
+    }
+
+    // Builds a zero-filled barrier of type ALIASING between the two resources.
+    static inline CD3DX12_RESOURCE_BARRIER Aliasing(
+        _In_ ID3D12Resource* pResourceBefore,
+        _In_ ID3D12Resource* pResourceAfter)
+    {
+        CD3DX12_RESOURCE_BARRIER result;
+        ZeroMemory(&result, sizeof(CD3DX12_RESOURCE_BARRIER));
+        D3D12_RESOURCE_BARRIER &base = result;
+        base.Type = D3D12_RESOURCE_BARRIER_TYPE_ALIASING;
+        base.Aliasing.pResourceAfter = pResourceAfter;
+        base.Aliasing.pResourceBefore = pResourceBefore;
+        return result;
+    }
+
+    // Builds a zero-filled barrier of type UAV for pResource.
+    static inline CD3DX12_RESOURCE_BARRIER UAV(
+        _In_ ID3D12Resource* pResource)
+    {
+        CD3DX12_RESOURCE_BARRIER result;
+        ZeroMemory(&result, sizeof(CD3DX12_RESOURCE_BARRIER));
+        D3D12_RESOURCE_BARRIER &base = result;
+        base.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
+        base.UAV.pResource = pResource;
+        return result;
+    }
+
+    // Yields this object viewed as its D3D12_RESOURCE_BARRIER base.
+    operator const D3D12_RESOURCE_BARRIER&() const
+    {
+        return *this;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor helpers for D3D12_PACKED_MIP_INFO; adds no data members.
+struct CD3DX12_PACKED_MIP_INFO : public D3D12_PACKED_MIP_INFO
+{
+    // Performs no initialization of its own.
+    CD3DX12_PACKED_MIP_INFO()
+    {
+    }
+
+    // Copies an existing D3D12_PACKED_MIP_INFO.
+    explicit CD3DX12_PACKED_MIP_INFO(const D3D12_PACKED_MIP_INFO &o)
+        : D3D12_PACKED_MIP_INFO(o)
+    {
+    }
+
+    // Sets both mip counts, the packed-mip tile count and the starting tile index.
+    CD3DX12_PACKED_MIP_INFO(
+        UINT8 numStandardMips, UINT8 numPackedMips,
+        UINT numTilesForPackedMips, UINT startTileIndexInOverallResource)
+    {
+        this->NumStandardMips = numStandardMips;
+        this->NumPackedMips = numPackedMips;
+        this->NumTilesForPackedMips = numTilesForPackedMips;
+        this->StartTileIndexInOverallResource = startTileIndexInOverallResource;
+    }
+
+    // Yields this object viewed as its D3D12_PACKED_MIP_INFO base.
+    operator const D3D12_PACKED_MIP_INFO&() const
+    {
+        return *this;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor helpers for D3D12_SUBRESOURCE_FOOTPRINT; adds no data members.
+struct CD3DX12_SUBRESOURCE_FOOTPRINT : public D3D12_SUBRESOURCE_FOOTPRINT
+{
+    // Performs no initialization of its own.
+    CD3DX12_SUBRESOURCE_FOOTPRINT()
+    {
+    }
+
+    // Copies an existing D3D12_SUBRESOURCE_FOOTPRINT.
+    explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_SUBRESOURCE_FOOTPRINT &o)
+        : D3D12_SUBRESOURCE_FOOTPRINT(o)
+    {
+    }
+
+    // Sets Format, Width, Height, Depth and RowPitch from the arguments.
+    CD3DX12_SUBRESOURCE_FOOTPRINT(
+        DXGI_FORMAT format, UINT width, UINT height, UINT depth, UINT rowPitch)
+    {
+        this->Format = format;
+        this->Width = width;
+        this->Height = height;
+        this->Depth = depth;
+        this->RowPitch = rowPitch;
+    }
+
+    // Derives the footprint from a resource description plus a row pitch.
+    // Width is converted to UINT; Depth is DepthOrArraySize only for 3D
+    // textures and 1 for every other resource dimension.
+    explicit CD3DX12_SUBRESOURCE_FOOTPRINT(
+        const D3D12_RESOURCE_DESC& resDesc, UINT rowPitch)
+    {
+        this->Format = resDesc.Format;
+        this->Width = UINT( resDesc.Width );
+        this->Height = resDesc.Height;
+        if (resDesc.Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D)
+        {
+            this->Depth = resDesc.DepthOrArraySize;
+        }
+        else
+        {
+            this->Depth = 1;
+        }
+        this->RowPitch = rowPitch;
+    }
+
+    // Yields this object viewed as its D3D12_SUBRESOURCE_FOOTPRINT base.
+    operator const D3D12_SUBRESOURCE_FOOTPRINT&() const
+    {
+        return *this;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor helpers for D3D12_TEXTURE_COPY_LOCATION; adds no data members.
+struct CD3DX12_TEXTURE_COPY_LOCATION : public D3D12_TEXTURE_COPY_LOCATION
+{ 
+    // Performs no initialization of its own.
+    CD3DX12_TEXTURE_COPY_LOCATION()
+    {}
+
+    // Copies an existing D3D12_TEXTURE_COPY_LOCATION.
+    explicit CD3DX12_TEXTURE_COPY_LOCATION(const D3D12_TEXTURE_COPY_LOCATION &o) :
+        D3D12_TEXTURE_COPY_LOCATION(o)
+    {}
+
+    // Resource-only form. Besides storing the resource, this gives Type and the
+    // location payload defined values so that an object built this way never
+    // carries indeterminate fields into a copy call. The result addresses the
+    // resource by subresource index.
+    // NOTE(review): the D3D12 headers document PlacedFootprint and
+    // SubresourceIndex as sharing storage, so clearing PlacedFootprint is
+    // expected to leave SubresourceIndex at 0 as well - confirm against d3d12.h.
+    CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource* pRes)
+    {
+        pResource = pRes;
+        Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+        ZeroMemory(&PlacedFootprint, sizeof(PlacedFootprint));
+    }
+
+    // Describes a location inside pRes by placed footprint.
+    CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource* pRes, D3D12_PLACED_SUBRESOURCE_FOOTPRINT const& Footprint)
+    {
+        pResource = pRes;
+        Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+        PlacedFootprint = Footprint;
+    }
+
+    // Describes a location inside pRes by subresource index.
+    CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource* pRes, UINT Sub)
+    {
+        pResource = pRes;
+        Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+        SubresourceIndex = Sub;
+    }
+}; 
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_DESCRIPTOR_RANGE; adds no data members.
+// All field writes live in the static Init; the constructor and the member
+// Init apply it to this object.
+struct CD3DX12_DESCRIPTOR_RANGE : public D3D12_DESCRIPTOR_RANGE
+{
+    // Performs no initialization of its own.
+    CD3DX12_DESCRIPTOR_RANGE()
+    {
+    }
+
+    // Copies an existing D3D12_DESCRIPTOR_RANGE.
+    explicit CD3DX12_DESCRIPTOR_RANGE(const D3D12_DESCRIPTOR_RANGE &o)
+        : D3D12_DESCRIPTOR_RANGE(o)
+    {
+    }
+
+    // Constructs a fully populated range; see the static Init for the fields set.
+    CD3DX12_DESCRIPTOR_RANGE(
+        D3D12_DESCRIPTOR_RANGE_TYPE rangeType,
+        UINT numDescriptors,
+        UINT baseShaderRegister,
+        UINT registerSpace = 0,
+        UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    {
+        CD3DX12_DESCRIPTOR_RANGE::Init(
+            *this, rangeType, numDescriptors, baseShaderRegister,
+            registerSpace, offsetInDescriptorsFromTableStart);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        D3D12_DESCRIPTOR_RANGE_TYPE rangeType,
+        UINT numDescriptors,
+        UINT baseShaderRegister,
+        UINT registerSpace = 0,
+        UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    {
+        CD3DX12_DESCRIPTOR_RANGE::Init(
+            *this, rangeType, numDescriptors, baseShaderRegister,
+            registerSpace, offsetInDescriptorsFromTableStart);
+    }
+
+    // Writes the range type, descriptor count, base register, register space
+    // and table offset into the supplied range.
+    static inline void Init(
+        _Out_ D3D12_DESCRIPTOR_RANGE &range,
+        D3D12_DESCRIPTOR_RANGE_TYPE rangeType,
+        UINT numDescriptors,
+        UINT baseShaderRegister,
+        UINT registerSpace = 0,
+        UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    {
+        range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart;
+        range.RegisterSpace = registerSpace;
+        range.BaseShaderRegister = baseShaderRegister;
+        range.NumDescriptors = numDescriptors;
+        range.RangeType = rangeType;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_ROOT_DESCRIPTOR_TABLE; adds no data members.
+struct CD3DX12_ROOT_DESCRIPTOR_TABLE : public D3D12_ROOT_DESCRIPTOR_TABLE
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_DESCRIPTOR_TABLE()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_DESCRIPTOR_TABLE.
+    explicit CD3DX12_ROOT_DESCRIPTOR_TABLE(const D3D12_ROOT_DESCRIPTOR_TABLE &o)
+        : D3D12_ROOT_DESCRIPTOR_TABLE(o)
+    {
+    }
+
+    // Constructs a table referring to the given range array. Only the pointer
+    // is stored; the ranges themselves are not copied.
+    CD3DX12_ROOT_DESCRIPTOR_TABLE(
+        UINT numDescriptorRanges,
+        _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* _pDescriptorRanges)
+    {
+        CD3DX12_ROOT_DESCRIPTOR_TABLE::Init(*this, numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        UINT numDescriptorRanges,
+        _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* _pDescriptorRanges)
+    {
+        CD3DX12_ROOT_DESCRIPTOR_TABLE::Init(*this, numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    // Stores the range count and the range pointer in the supplied table.
+    static inline void Init(
+        _Out_ D3D12_ROOT_DESCRIPTOR_TABLE &rootDescriptorTable,
+        UINT numDescriptorRanges,
+        _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* _pDescriptorRanges)
+    {
+        rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges;
+        rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_ROOT_CONSTANTS; adds no data members.
+struct CD3DX12_ROOT_CONSTANTS : public D3D12_ROOT_CONSTANTS
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_CONSTANTS()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_CONSTANTS.
+    explicit CD3DX12_ROOT_CONSTANTS(const D3D12_ROOT_CONSTANTS &o)
+        : D3D12_ROOT_CONSTANTS(o)
+    {
+    }
+
+    // Constructs a fully populated value; see the static Init for the fields set.
+    CD3DX12_ROOT_CONSTANTS(
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0)
+    {
+        CD3DX12_ROOT_CONSTANTS::Init(*this, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0)
+    {
+        CD3DX12_ROOT_CONSTANTS::Init(*this, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    // Stores the 32-bit value count, shader register and register space in
+    // the supplied structure.
+    static inline void Init(
+        _Out_ D3D12_ROOT_CONSTANTS &rootConstants,
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0)
+    {
+        rootConstants.RegisterSpace = registerSpace;
+        rootConstants.ShaderRegister = shaderRegister;
+        rootConstants.Num32BitValues = num32BitValues;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_ROOT_DESCRIPTOR; adds no data members.
+struct CD3DX12_ROOT_DESCRIPTOR : public D3D12_ROOT_DESCRIPTOR
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_DESCRIPTOR()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_DESCRIPTOR.
+    explicit CD3DX12_ROOT_DESCRIPTOR(const D3D12_ROOT_DESCRIPTOR &o)
+        : D3D12_ROOT_DESCRIPTOR(o)
+    {
+    }
+
+    // Constructs a descriptor bound to the given register and register space.
+    CD3DX12_ROOT_DESCRIPTOR(
+        UINT shaderRegister,
+        UINT registerSpace = 0)
+    {
+        CD3DX12_ROOT_DESCRIPTOR::Init(*this, shaderRegister, registerSpace);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        UINT shaderRegister,
+        UINT registerSpace = 0)
+    {
+        CD3DX12_ROOT_DESCRIPTOR::Init(*this, shaderRegister, registerSpace);
+    }
+
+    // Stores the shader register and register space in the supplied descriptor.
+    static inline void Init(
+        _Out_ D3D12_ROOT_DESCRIPTOR &table,
+        UINT shaderRegister,
+        UINT registerSpace = 0)
+    {
+        table.RegisterSpace = registerSpace;
+        table.ShaderRegister = shaderRegister;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Init helpers for D3D12_ROOT_PARAMETER; adds no data members.
+//
+// Each InitAs* routine comes in two forms: a static one that fills in a
+// caller-supplied D3D12_ROOT_PARAMETER, and a member one that applies the
+// same setup to this object.
+struct CD3DX12_ROOT_PARAMETER : public D3D12_ROOT_PARAMETER
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_PARAMETER()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_PARAMETER.
+    explicit CD3DX12_ROOT_PARAMETER(const D3D12_ROOT_PARAMETER &o)
+        : D3D12_ROOT_PARAMETER(o)
+    {
+    }
+
+    // Marks rootParam as a descriptor table and fills its DescriptorTable
+    // payload with the range count and range pointer.
+    static inline void InitAsDescriptorTable(
+        _Out_ D3D12_ROOT_PARAMETER &rootParam,
+        UINT numDescriptorRanges,
+        _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* pDescriptorRanges,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+        CD3DX12_ROOT_DESCRIPTOR_TABLE::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges);
+    }
+
+    // Marks rootParam as 32-bit constants and fills its Constants payload.
+    static inline void InitAsConstants(
+        _Out_ D3D12_ROOT_PARAMETER &rootParam,
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
+        CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    // Marks rootParam as a CBV and fills its Descriptor payload.
+    static inline void InitAsConstantBufferView(
+        _Out_ D3D12_ROOT_PARAMETER &rootParam,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
+        CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace);
+    }
+
+    // Marks rootParam as an SRV and fills its Descriptor payload.
+    static inline void InitAsShaderResourceView(
+        _Out_ D3D12_ROOT_PARAMETER &rootParam,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV;
+        CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace);
+    }
+
+    // Marks rootParam as a UAV and fills its Descriptor payload.
+    static inline void InitAsUnorderedAccessView(
+        _Out_ D3D12_ROOT_PARAMETER &rootParam,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV;
+        CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace);
+    }
+
+    // Member form: sets this object up as a descriptor table.
+    inline void InitAsDescriptorTable(
+        UINT numDescriptorRanges,
+        _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* pDescriptorRanges,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER::InitAsDescriptorTable(
+            *this, numDescriptorRanges, pDescriptorRanges, visibility);
+    }
+
+    // Member form: sets this object up as 32-bit constants.
+    inline void InitAsConstants(
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER::InitAsConstants(
+            *this, num32BitValues, shaderRegister, registerSpace, visibility);
+    }
+
+    // Member form: sets this object up as a CBV.
+    inline void InitAsConstantBufferView(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER::InitAsConstantBufferView(
+            *this, shaderRegister, registerSpace, visibility);
+    }
+
+    // Member form: sets this object up as an SRV.
+    inline void InitAsShaderResourceView(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER::InitAsShaderResourceView(
+            *this, shaderRegister, registerSpace, visibility);
+    }
+
+    // Member form: sets this object up as a UAV.
+    inline void InitAsUnorderedAccessView(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER::InitAsUnorderedAccessView(
+            *this, shaderRegister, registerSpace, visibility);
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_STATIC_SAMPLER_DESC; adds no data members.
+// Only shaderRegister is mandatory; every other argument has a default, and
+// the same defaults are repeated on the constructor and both Init overloads.
+struct CD3DX12_STATIC_SAMPLER_DESC : public D3D12_STATIC_SAMPLER_DESC
+{
+    // Performs no initialization of its own.
+    CD3DX12_STATIC_SAMPLER_DESC()
+    {
+    }
+
+    // Copies an existing D3D12_STATIC_SAMPLER_DESC.
+    explicit CD3DX12_STATIC_SAMPLER_DESC(const D3D12_STATIC_SAMPLER_DESC &o)
+        : D3D12_STATIC_SAMPLER_DESC(o)
+    {
+    }
+
+    // Constructs a fully populated sampler description.
+    CD3DX12_STATIC_SAMPLER_DESC(
+         UINT shaderRegister,
+         D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC,
+         D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         FLOAT mipLODBias = 0,
+         UINT maxAnisotropy = 16,
+         D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL,
+         D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE,
+         FLOAT minLOD = 0.f,
+         FLOAT maxLOD = D3D12_FLOAT32_MAX,
+         D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, 
+         UINT registerSpace = 0)
+    {
+        CD3DX12_STATIC_SAMPLER_DESC::Init(
+            *this,
+            shaderRegister, filter,
+            addressU, addressV, addressW,
+            mipLODBias, maxAnisotropy, comparisonFunc, borderColor,
+            minLOD, maxLOD,
+            shaderVisibility, registerSpace);
+    }
+
+    // Writes every argument into the matching member of samplerDesc.
+    static inline void Init(
+        _Out_ D3D12_STATIC_SAMPLER_DESC &samplerDesc,
+         UINT shaderRegister,
+         D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC,
+         D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         FLOAT mipLODBias = 0,
+         UINT maxAnisotropy = 16,
+         D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL,
+         D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE,
+         FLOAT minLOD = 0.f,
+         FLOAT maxLOD = D3D12_FLOAT32_MAX,
+         D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, 
+         UINT registerSpace = 0)
+    {
+        // Binding location and visibility.
+        samplerDesc.ShaderRegister = shaderRegister;
+        samplerDesc.RegisterSpace = registerSpace;
+        samplerDesc.ShaderVisibility = shaderVisibility;
+
+        // Filtering and comparison.
+        samplerDesc.Filter = filter;
+        samplerDesc.MaxAnisotropy = maxAnisotropy;
+        samplerDesc.ComparisonFunc = comparisonFunc;
+
+        // Addressing and border colour.
+        samplerDesc.AddressU = addressU;
+        samplerDesc.AddressV = addressV;
+        samplerDesc.AddressW = addressW;
+        samplerDesc.BorderColor = borderColor;
+
+        // Level-of-detail controls.
+        samplerDesc.MipLODBias = mipLODBias;
+        samplerDesc.MinLOD = minLOD;
+        samplerDesc.MaxLOD = maxLOD;
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+         UINT shaderRegister,
+         D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC,
+         D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+         FLOAT mipLODBias = 0,
+         UINT maxAnisotropy = 16,
+         D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL,
+         D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE,
+         FLOAT minLOD = 0.f,
+         FLOAT maxLOD = D3D12_FLOAT32_MAX,
+         D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, 
+         UINT registerSpace = 0)
+    {
+        CD3DX12_STATIC_SAMPLER_DESC::Init(
+            *this,
+            shaderRegister, filter,
+            addressU, addressV, addressW,
+            mipLODBias, maxAnisotropy, comparisonFunc, borderColor,
+            minLOD, maxLOD,
+            shaderVisibility, registerSpace);
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_ROOT_SIGNATURE_DESC; adds no data members.
+// Parameter and sampler arrays are referenced by pointer only; nothing is copied.
+struct CD3DX12_ROOT_SIGNATURE_DESC : public D3D12_ROOT_SIGNATURE_DESC
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_SIGNATURE_DESC()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_SIGNATURE_DESC.
+    explicit CD3DX12_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o)
+        : D3D12_ROOT_SIGNATURE_DESC(o)
+    {
+    }
+
+    // Constructs a description from the given parameters, samplers and flags.
+    CD3DX12_ROOT_SIGNATURE_DESC(
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        CD3DX12_ROOT_SIGNATURE_DESC::Init(
+            *this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    // Tag-dispatched form: no parameters, no static samplers, no flags.
+    CD3DX12_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT)
+    {
+        CD3DX12_ROOT_SIGNATURE_DESC::Init(
+            *this, 0, NULL, 0, NULL, D3D12_ROOT_SIGNATURE_FLAG_NONE);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        CD3DX12_ROOT_SIGNATURE_DESC::Init(
+            *this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    // Stores the counts, array pointers and flags in the supplied description.
+    static inline void Init(
+        _Out_ D3D12_ROOT_SIGNATURE_DESC &desc,
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        desc.Flags = flags;
+        desc.pStaticSamplers = _pStaticSamplers;
+        desc.NumStaticSamplers = numStaticSamplers;
+        desc.pParameters = _pParameters;
+        desc.NumParameters = numParameters;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_DESCRIPTOR_RANGE1; adds no data members.
+// Compared with CD3DX12_DESCRIPTOR_RANGE, the argument lists carry an extra
+// flags value placed before the table offset.
+struct CD3DX12_DESCRIPTOR_RANGE1 : public D3D12_DESCRIPTOR_RANGE1
+{
+    // Performs no initialization of its own.
+    CD3DX12_DESCRIPTOR_RANGE1()
+    {
+    }
+
+    // Copies an existing D3D12_DESCRIPTOR_RANGE1.
+    explicit CD3DX12_DESCRIPTOR_RANGE1(const D3D12_DESCRIPTOR_RANGE1 &o)
+        : D3D12_DESCRIPTOR_RANGE1(o)
+    {
+    }
+
+    // Constructs a fully populated range; see the static Init for the fields set.
+    CD3DX12_DESCRIPTOR_RANGE1(
+        D3D12_DESCRIPTOR_RANGE_TYPE rangeType,
+        UINT numDescriptors,
+        UINT baseShaderRegister,
+        UINT registerSpace = 0,
+        D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE,
+        UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    {
+        CD3DX12_DESCRIPTOR_RANGE1::Init(
+            *this, rangeType, numDescriptors, baseShaderRegister,
+            registerSpace, flags, offsetInDescriptorsFromTableStart);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        D3D12_DESCRIPTOR_RANGE_TYPE rangeType,
+        UINT numDescriptors,
+        UINT baseShaderRegister,
+        UINT registerSpace = 0,
+        D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE,
+        UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    {
+        CD3DX12_DESCRIPTOR_RANGE1::Init(
+            *this, rangeType, numDescriptors, baseShaderRegister,
+            registerSpace, flags, offsetInDescriptorsFromTableStart);
+    }
+
+    // Writes the range type, descriptor count, base register, register space,
+    // flags and table offset into the supplied range.
+    static inline void Init(
+        _Out_ D3D12_DESCRIPTOR_RANGE1 &range,
+        D3D12_DESCRIPTOR_RANGE_TYPE rangeType,
+        UINT numDescriptors,
+        UINT baseShaderRegister,
+        UINT registerSpace = 0,
+        D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE,
+        UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    {
+        range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart;
+        range.Flags = flags;
+        range.RegisterSpace = registerSpace;
+        range.BaseShaderRegister = baseShaderRegister;
+        range.NumDescriptors = numDescriptors;
+        range.RangeType = rangeType;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_ROOT_DESCRIPTOR_TABLE1; adds no data members.
+struct CD3DX12_ROOT_DESCRIPTOR_TABLE1 : public D3D12_ROOT_DESCRIPTOR_TABLE1
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_DESCRIPTOR_TABLE1()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_DESCRIPTOR_TABLE1.
+    explicit CD3DX12_ROOT_DESCRIPTOR_TABLE1(const D3D12_ROOT_DESCRIPTOR_TABLE1 &o)
+        : D3D12_ROOT_DESCRIPTOR_TABLE1(o)
+    {
+    }
+
+    // Constructs a table referring to the given range array. Only the pointer
+    // is stored; the ranges themselves are not copied.
+    CD3DX12_ROOT_DESCRIPTOR_TABLE1(
+        UINT numDescriptorRanges,
+        _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1* _pDescriptorRanges)
+    {
+        CD3DX12_ROOT_DESCRIPTOR_TABLE1::Init(*this, numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        UINT numDescriptorRanges,
+        _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1* _pDescriptorRanges)
+    {
+        CD3DX12_ROOT_DESCRIPTOR_TABLE1::Init(*this, numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    // Stores the range count and the range pointer in the supplied table.
+    static inline void Init(
+        _Out_ D3D12_ROOT_DESCRIPTOR_TABLE1 &rootDescriptorTable,
+        UINT numDescriptorRanges,
+        _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1* _pDescriptorRanges)
+    {
+        rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges;
+        rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Constructor/Init helpers for D3D12_ROOT_DESCRIPTOR1; adds no data members.
+struct CD3DX12_ROOT_DESCRIPTOR1 : public D3D12_ROOT_DESCRIPTOR1
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_DESCRIPTOR1()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_DESCRIPTOR1.
+    explicit CD3DX12_ROOT_DESCRIPTOR1(const D3D12_ROOT_DESCRIPTOR1 &o)
+        : D3D12_ROOT_DESCRIPTOR1(o)
+    {
+    }
+
+    // Constructs a descriptor with the given register, register space and flags.
+    CD3DX12_ROOT_DESCRIPTOR1(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE)
+    {
+        CD3DX12_ROOT_DESCRIPTOR1::Init(*this, shaderRegister, registerSpace, flags);
+    }
+
+    // Populates this object through the static Init.
+    inline void Init(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE)
+    {
+        CD3DX12_ROOT_DESCRIPTOR1::Init(*this, shaderRegister, registerSpace, flags);
+    }
+
+    // Stores the shader register, register space and flags in the supplied
+    // descriptor.
+    static inline void Init(
+        _Out_ D3D12_ROOT_DESCRIPTOR1 &table, 
+        UINT shaderRegister, 
+        UINT registerSpace = 0, 
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE)
+    {
+        table.Flags = flags;
+        table.RegisterSpace = registerSpace;
+        table.ShaderRegister = shaderRegister;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Init helpers for D3D12_ROOT_PARAMETER1; adds no data members.
+//
+// Each InitAs* routine comes in two forms: a static one that fills in a
+// caller-supplied D3D12_ROOT_PARAMETER1, and a member one that applies the
+// same setup to this object. The CBV/SRV/UAV forms take a descriptor-flags
+// argument that is placed before the visibility argument.
+struct CD3DX12_ROOT_PARAMETER1 : public D3D12_ROOT_PARAMETER1
+{
+    // Performs no initialization of its own.
+    CD3DX12_ROOT_PARAMETER1()
+    {
+    }
+
+    // Copies an existing D3D12_ROOT_PARAMETER1.
+    explicit CD3DX12_ROOT_PARAMETER1(const D3D12_ROOT_PARAMETER1 &o)
+        : D3D12_ROOT_PARAMETER1(o)
+    {
+    }
+
+    // Marks rootParam as a descriptor table and fills its DescriptorTable
+    // payload with the range count and range pointer.
+    static inline void InitAsDescriptorTable(
+        _Out_ D3D12_ROOT_PARAMETER1 &rootParam,
+        UINT numDescriptorRanges,
+        _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1* pDescriptorRanges,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+        CD3DX12_ROOT_DESCRIPTOR_TABLE1::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges);
+    }
+
+    // Marks rootParam as 32-bit constants and fills its Constants payload.
+    static inline void InitAsConstants(
+        _Out_ D3D12_ROOT_PARAMETER1 &rootParam,
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
+        CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    // Marks rootParam as a CBV and fills its Descriptor payload, flags included.
+    static inline void InitAsConstantBufferView(
+        _Out_ D3D12_ROOT_PARAMETER1 &rootParam,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
+        CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags);
+    }
+
+    // Marks rootParam as an SRV and fills its Descriptor payload, flags included.
+    static inline void InitAsShaderResourceView(
+        _Out_ D3D12_ROOT_PARAMETER1 &rootParam,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV;
+        CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags);
+    }
+
+    // Marks rootParam as a UAV and fills its Descriptor payload, flags included.
+    static inline void InitAsUnorderedAccessView(
+        _Out_ D3D12_ROOT_PARAMETER1 &rootParam,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        rootParam.ShaderVisibility = visibility;
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV;
+        CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags);
+    }
+
+    // Member form: sets this object up as a descriptor table.
+    inline void InitAsDescriptorTable(
+        UINT numDescriptorRanges,
+        _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1* pDescriptorRanges,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER1::InitAsDescriptorTable(
+            *this, numDescriptorRanges, pDescriptorRanges, visibility);
+    }
+
+    // Member form: sets this object up as 32-bit constants.
+    inline void InitAsConstants(
+        UINT num32BitValues,
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER1::InitAsConstants(
+            *this, num32BitValues, shaderRegister, registerSpace, visibility);
+    }
+
+    // Member form: sets this object up as a CBV.
+    inline void InitAsConstantBufferView(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER1::InitAsConstantBufferView(
+            *this, shaderRegister, registerSpace, flags, visibility);
+    }
+
+    // Member form: sets this object up as an SRV.
+    inline void InitAsShaderResourceView(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER1::InitAsShaderResourceView(
+            *this, shaderRegister, registerSpace, flags, visibility);
+    }
+
+    // Member form: sets this object up as a UAV.
+    inline void InitAsUnorderedAccessView(
+        UINT shaderRegister,
+        UINT registerSpace = 0,
+        D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+        D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL)
+    {
+        CD3DX12_ROOT_PARAMETER1::InitAsUnorderedAccessView(
+            *this, shaderRegister, registerSpace, flags, visibility);
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Convenience wrapper over D3D12_VERSIONED_ROOT_SIGNATURE_DESC. Every
+// constructor and Init_* helper that fills the description sets Version
+// together with the matching Desc_1_0 / Desc_1_1 member.
+struct CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC : public D3D12_VERSIONED_ROOT_SIGNATURE_DESC
+{
+    // Performs no initialization of its own.
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC() {}
+
+    // Copies an already versioned description.
+    explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_VERSIONED_ROOT_SIGNATURE_DESC &o) :
+        D3D12_VERSIONED_ROOT_SIGNATURE_DESC(o)
+    {}
+
+    // Wraps a 1.0 description and tags it as version 1.0.
+    explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o)
+    {
+        this->Version = D3D_ROOT_SIGNATURE_VERSION_1_0;
+        this->Desc_1_0 = o;
+    }
+
+    // Wraps a 1.1 description and tags it as version 1.1.
+    explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC1 &o)
+    {
+        this->Version = D3D_ROOT_SIGNATURE_VERSION_1_1;
+        this->Desc_1_1 = o;
+    }
+
+    // Builds a 1.0 description from its individual fields.
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        Init_1_0(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    // Builds a 1.1 description from its individual fields.
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        Init_1_1(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    // Default-tag construction: an empty 1.1 description with no flags.
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT)
+    {
+        Init_1_1(*this, 0, NULL, 0, NULL, D3D12_ROOT_SIGNATURE_FLAG_NONE);
+    }
+
+    // Re-initializes this object as a 1.0 description.
+    inline void Init_1_0(
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        D3D12_VERSIONED_ROOT_SIGNATURE_DESC &self = *this;
+        Init_1_0(self, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    // Fills an arbitrary versioned description as version 1.0.
+    // The pointers are stored as given; nothing is copied.
+    static inline void Init_1_0(
+        _Out_ D3D12_VERSIONED_ROOT_SIGNATURE_DESC &desc,
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_0;
+
+        D3D12_ROOT_SIGNATURE_DESC &rootSig = desc.Desc_1_0;
+        rootSig.NumParameters = numParameters;
+        rootSig.pParameters = _pParameters;
+        rootSig.NumStaticSamplers = numStaticSamplers;
+        rootSig.pStaticSamplers = _pStaticSamplers;
+        rootSig.Flags = flags;
+    }
+
+    // Re-initializes this object as a 1.1 description.
+    inline void Init_1_1(
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        D3D12_VERSIONED_ROOT_SIGNATURE_DESC &self = *this;
+        Init_1_1(self, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    // Fills an arbitrary versioned description as version 1.1.
+    // The pointers are stored as given; nothing is copied.
+    static inline void Init_1_1(
+        _Out_ D3D12_VERSIONED_ROOT_SIGNATURE_DESC &desc,
+        UINT numParameters,
+        _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1* _pParameters,
+        UINT numStaticSamplers = 0,
+        _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL,
+        D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    {
+        desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1;
+
+        D3D12_ROOT_SIGNATURE_DESC1 &rootSig = desc.Desc_1_1;
+        rootSig.NumParameters = numParameters;
+        rootSig.pParameters = _pParameters;
+        rootSig.NumStaticSamplers = numStaticSamplers;
+        rootSig.pStaticSamplers = _pStaticSamplers;
+        rootSig.Flags = flags;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Convenience wrapper over D3D12_CPU_DESCRIPTOR_HANDLE that adds construction,
+// comparison and offsetting helpers. Offsets may be negative.
+struct CD3DX12_CPU_DESCRIPTOR_HANDLE : public D3D12_CPU_DESCRIPTOR_HANDLE
+{
+    // Performs no initialization of its own.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE() {}
+    explicit CD3DX12_CPU_DESCRIPTOR_HANDLE(const D3D12_CPU_DESCRIPTOR_HANDLE &o) :
+        D3D12_CPU_DESCRIPTOR_HANDLE(o)
+    {}
+    // Default-tag construction yields a zero handle.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) { ptr = 0; }
+    // other + offsetScaledByIncrementSize (offset already multiplied by the increment size).
+    CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize)
+    {
+        InitOffsetted(other, offsetScaledByIncrementSize);
+    }
+    // other + offsetInDescriptors * descriptorIncrementSize.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize);
+    }
+    // Moves this handle by offsetInDescriptors descriptors of the given size.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE& Offset(INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        // The product is formed in 64-bit signed arithmetic. A plain INT * UINT
+        // product is evaluated as 32-bit unsigned, which turns a negative offset
+        // into a huge positive one once widened to a 64-bit ptr, and wraps
+        // products that do not fit in 32 bits.
+        ptr += static_cast<SIZE_T>(static_cast<INT64>(offsetInDescriptors) * static_cast<INT64>(descriptorIncrementSize));
+        return *this;
+    }
+    // Moves this handle by a byte offset that is already scaled.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE& Offset(INT offsetScaledByIncrementSize)
+    {
+        ptr += offsetScaledByIncrementSize;
+        return *this;
+    }
+    bool operator==(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE& other) const
+    {
+        return (ptr == other.ptr);
+    }
+    bool operator!=(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE& other) const
+    {
+        return (ptr != other.ptr);
+    }
+    CD3DX12_CPU_DESCRIPTOR_HANDLE &operator=(const D3D12_CPU_DESCRIPTOR_HANDLE &other)
+    {
+        ptr = other.ptr;
+        return *this;
+    }
+
+    // Sets this handle to base + offsetScaledByIncrementSize.
+    inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize)
+    {
+        InitOffsetted(*this, base, offsetScaledByIncrementSize);
+    }
+
+    // Sets this handle to base + offsetInDescriptors * descriptorIncrementSize.
+    inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize);
+    }
+
+    // Writes base + offsetScaledByIncrementSize into handle.
+    static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize)
+    {
+        handle.ptr = base.ptr + offsetScaledByIncrementSize;
+    }
+
+    // Writes base + offsetInDescriptors * descriptorIncrementSize into handle.
+    static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        // 64-bit signed product: see Offset() for why INT * UINT is not used directly.
+        handle.ptr = base.ptr + static_cast<SIZE_T>(static_cast<INT64>(offsetInDescriptors) * static_cast<INT64>(descriptorIncrementSize));
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Convenience wrapper over D3D12_GPU_DESCRIPTOR_HANDLE that adds construction,
+// comparison and offsetting helpers. Offsets may be negative.
+struct CD3DX12_GPU_DESCRIPTOR_HANDLE : public D3D12_GPU_DESCRIPTOR_HANDLE
+{
+    // Performs no initialization of its own.
+    CD3DX12_GPU_DESCRIPTOR_HANDLE() {}
+    explicit CD3DX12_GPU_DESCRIPTOR_HANDLE(const D3D12_GPU_DESCRIPTOR_HANDLE &o) :
+        D3D12_GPU_DESCRIPTOR_HANDLE(o)
+    {}
+    // Default-tag construction yields a zero handle.
+    CD3DX12_GPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) { ptr = 0; }
+    // other + offsetScaledByIncrementSize (offset already multiplied by the increment size).
+    CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize)
+    {
+        InitOffsetted(other, offsetScaledByIncrementSize);
+    }
+    // other + offsetInDescriptors * descriptorIncrementSize.
+    CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize);
+    }
+    // Moves this handle by offsetInDescriptors descriptors of the given size.
+    CD3DX12_GPU_DESCRIPTOR_HANDLE& Offset(INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        // The product is formed in 64-bit signed arithmetic. A plain INT * UINT
+        // product is evaluated as 32-bit unsigned, which turns a negative offset
+        // into a huge positive one once widened to the 64-bit ptr, and wraps
+        // products that do not fit in 32 bits.
+        ptr += static_cast<UINT64>(static_cast<INT64>(offsetInDescriptors) * static_cast<INT64>(descriptorIncrementSize));
+        return *this;
+    }
+    // Moves this handle by a byte offset that is already scaled.
+    CD3DX12_GPU_DESCRIPTOR_HANDLE& Offset(INT offsetScaledByIncrementSize)
+    {
+        ptr += offsetScaledByIncrementSize;
+        return *this;
+    }
+    inline bool operator==(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE& other) const
+    {
+        return (ptr == other.ptr);
+    }
+    inline bool operator!=(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE& other) const
+    {
+        return (ptr != other.ptr);
+    }
+    CD3DX12_GPU_DESCRIPTOR_HANDLE &operator=(const D3D12_GPU_DESCRIPTOR_HANDLE &other)
+    {
+        ptr = other.ptr;
+        return *this;
+    }
+
+    // Sets this handle to base + offsetScaledByIncrementSize.
+    inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize)
+    {
+        InitOffsetted(*this, base, offsetScaledByIncrementSize);
+    }
+
+    // Sets this handle to base + offsetInDescriptors * descriptorIncrementSize.
+    inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize);
+    }
+
+    // Writes base + offsetScaledByIncrementSize into handle.
+    static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize)
+    {
+        handle.ptr = base.ptr + offsetScaledByIncrementSize;
+    }
+
+    // Writes base + offsetInDescriptors * descriptorIncrementSize into handle.
+    static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize)
+    {
+        // 64-bit signed product: see Offset() for why INT * UINT is not used directly.
+        handle.ptr = base.ptr + static_cast<UINT64>(static_cast<INT64>(offsetInDescriptors) * static_cast<INT64>(descriptorIncrementSize));
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Flattens (mip, array slice, plane) into one subresource index: mips vary
+// fastest, then array slices, then planes.
+inline UINT D3D12CalcSubresource( UINT MipSlice, UINT ArraySlice, UINT PlaneSlice, UINT MipLevels, UINT ArraySize )
+{
+    const UINT indexWithinPlane = MipSlice + ArraySlice * MipLevels;
+    const UINT planeStart = PlaneSlice * MipLevels * ArraySize;
+    return indexWithinPlane + planeStart;
+}
+
+//------------------------------------------------------------------------------------------------
+// Splits a flat subresource index back into its mip, array-slice and plane
+// components; each result is cast to the caller's output type.
+// MipLevels and ArraySize are used as divisors and are not checked here.
+template <typename T, typename U, typename V>
+inline void D3D12DecomposeSubresource( UINT Subresource, UINT MipLevels, UINT ArraySize, _Out_ T& MipSlice, _Out_ U& ArraySlice, _Out_ V& PlaneSlice )
+{
+    const UINT mipIndex = Subresource % MipLevels;
+    MipSlice = static_cast<T>(mipIndex);
+
+    const UINT mipChainIndex = Subresource / MipLevels;
+    ArraySlice = static_cast<U>(mipChainIndex % ArraySize);
+
+    const UINT subresourcesPerPlane = MipLevels * ArraySize;
+    PlaneSlice = static_cast<V>(Subresource / subresourcesPerPlane);
+}
+
+//------------------------------------------------------------------------------------------------
+// Queries the device for the plane count of Format via
+// D3D12_FEATURE_FORMAT_INFO. Returns 0 when the query fails.
+inline UINT8 D3D12GetFormatPlaneCount(
+    _In_ ID3D12Device* pDevice,
+    DXGI_FORMAT Format
+    )
+{
+    D3D12_FEATURE_DATA_FORMAT_INFO query = {Format};
+    const HRESULT queryResult = pDevice->CheckFeatureSupport(D3D12_FEATURE_FORMAT_INFO, &query, sizeof(query));
+    if (!FAILED(queryResult))
+    {
+        return query.PlaneCount;
+    }
+    return 0;
+}
+
+//------------------------------------------------------------------------------------------------
+// Convenience wrapper over D3D12_RESOURCE_DESC: an all-fields constructor,
+// factory functions for buffers and 1D/2D/3D textures, and read-only helpers
+// for depth, array size, plane count and subresource indexing.
+struct CD3DX12_RESOURCE_DESC : public D3D12_RESOURCE_DESC
+{
+    // Performs no initialization of its own.
+    CD3DX12_RESOURCE_DESC()
+    {}
+    explicit CD3DX12_RESOURCE_DESC( const D3D12_RESOURCE_DESC& o ) :
+        D3D12_RESOURCE_DESC( o )
+    {}
+    // Sets every field of the description explicitly.
+    CD3DX12_RESOURCE_DESC( 
+        D3D12_RESOURCE_DIMENSION dimension,
+        UINT64 alignment,
+        UINT64 width,
+        UINT height,
+        UINT16 depthOrArraySize,
+        UINT16 mipLevels,
+        DXGI_FORMAT format,
+        UINT sampleCount,
+        UINT sampleQuality,
+        D3D12_TEXTURE_LAYOUT layout,
+        D3D12_RESOURCE_FLAGS flags )
+    {
+        Dimension = dimension;
+        Alignment = alignment;
+        Width = width;
+        Height = height;
+        DepthOrArraySize = depthOrArraySize;
+        MipLevels = mipLevels;
+        Format = format;
+        SampleDesc.Count = sampleCount;
+        SampleDesc.Quality = sampleQuality;
+        Layout = layout;
+        Flags = flags;
+    }
+    // Buffer sized and aligned from an allocation-info record.
+    static inline CD3DX12_RESOURCE_DESC Buffer( 
+        const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo,
+        D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE )
+    {
+        return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_BUFFER, resAllocInfo.Alignment, resAllocInfo.SizeInBytes, 
+            1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags );
+    }
+    // Buffer of `width` bytes: height/depth/mips 1, unknown format, row-major layout.
+    static inline CD3DX12_RESOURCE_DESC Buffer( 
+        UINT64 width,
+        D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+        UINT64 alignment = 0 )
+    {
+        return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_BUFFER, alignment, width, 1, 1, 1, 
+            DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags );
+    }
+    // 1D texture (or 1D texture array); height 1, sample count 1.
+    static inline CD3DX12_RESOURCE_DESC Tex1D( 
+        DXGI_FORMAT format,
+        UINT64 width,
+        UINT16 arraySize = 1,
+        UINT16 mipLevels = 0,
+        D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+        D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+        UINT64 alignment = 0 )
+    {
+        return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, 
+            mipLevels, format, 1, 0, layout, flags );
+    }
+    // 2D texture (or 2D texture array), optionally multisampled.
+    static inline CD3DX12_RESOURCE_DESC Tex2D( 
+        DXGI_FORMAT format,
+        UINT64 width,
+        UINT height,
+        UINT16 arraySize = 1,
+        UINT16 mipLevels = 0,
+        UINT sampleCount = 1,
+        UINT sampleQuality = 0,
+        D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+        D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+        UINT64 alignment = 0 )
+    {
+        return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_TEXTURE2D, alignment, width, height, arraySize, 
+            mipLevels, format, sampleCount, sampleQuality, layout, flags );
+    }
+    // 3D texture; `depth` is stored in DepthOrArraySize, sample count 1.
+    static inline CD3DX12_RESOURCE_DESC Tex3D( 
+        DXGI_FORMAT format,
+        UINT64 width,
+        UINT height,
+        UINT16 depth,
+        UINT16 mipLevels = 0,
+        D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+        D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+        UINT64 alignment = 0 )
+    {
+        return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_TEXTURE3D, alignment, width, height, depth, 
+            mipLevels, format, 1, 0, layout, flags );
+    }
+    // DepthOrArraySize interpreted as depth: only 3D textures have depth, everything else reports 1.
+    inline UINT16 Depth() const
+    { return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); }
+    // DepthOrArraySize interpreted as array size: 3D textures report 1.
+    inline UINT16 ArraySize() const
+    { return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); }
+    // Plane count of Format as reported by the device (0 if the query fails).
+    inline UINT8 PlaneCount(_In_ ID3D12Device* pDevice) const
+    { return D3D12GetFormatPlaneCount(pDevice, Format); }
+    // Total subresource count: mips * array slices * planes.
+    inline UINT Subresources(_In_ ID3D12Device* pDevice) const
+    { return MipLevels * ArraySize() * PlaneCount(pDevice); }
+    // Flat subresource index for this description. Declared const because it
+    // only reads members, so it can be called on const descriptors like the
+    // accessors above.
+    inline UINT CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice) const
+    { return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize()); }
+    operator const D3D12_RESOURCE_DESC&() const { return *this; }
+};
+// Field-by-field equality of two resource descriptions; stops at the first
+// field that differs.
+inline bool operator==( const D3D12_RESOURCE_DESC& l, const D3D12_RESOURCE_DESC& r )
+{
+    if (l.Dimension != r.Dimension) { return false; }
+    if (l.Alignment != r.Alignment) { return false; }
+    if (l.Width != r.Width) { return false; }
+    if (l.Height != r.Height) { return false; }
+    if (l.DepthOrArraySize != r.DepthOrArraySize) { return false; }
+    if (l.MipLevels != r.MipLevels) { return false; }
+    if (l.Format != r.Format) { return false; }
+    if (l.SampleDesc.Count != r.SampleDesc.Count) { return false; }
+    if (l.SampleDesc.Quality != r.SampleDesc.Quality) { return false; }
+    if (l.Layout != r.Layout) { return false; }
+    return l.Flags == r.Flags;
+}
+// Negation of operator== for resource descriptions.
+inline bool operator!=( const D3D12_RESOURCE_DESC& l, const D3D12_RESOURCE_DESC& r )
+{
+    const bool identical = ( l == r );
+    return !identical;
+}
+
+//------------------------------------------------------------------------------------------------
+// Row-by-row memcpy: for each of NumSlices slices, copies NumRows rows of
+// RowSizeInBytes bytes. Source and destination are stepped by their own row
+// and slice pitches.
+inline void MemcpySubresource(
+    _In_ const D3D12_MEMCPY_DEST* pDest,
+    _In_ const D3D12_SUBRESOURCE_DATA* pSrc,
+    SIZE_T RowSizeInBytes,
+    UINT NumRows,
+    UINT NumSlices)
+{
+    UINT slice = 0;
+    while (slice < NumSlices)
+    {
+        BYTE* const dstSliceStart = reinterpret_cast<BYTE*>(pDest->pData) + pDest->SlicePitch * slice;
+        const BYTE* const srcSliceStart = reinterpret_cast<const BYTE*>(pSrc->pData) + pSrc->SlicePitch * slice;
+
+        UINT row = 0;
+        while (row < NumRows)
+        {
+            BYTE* const dstRow = dstSliceStart + pDest->RowPitch * row;
+            const BYTE* const srcRow = srcSliceStart + pSrc->RowPitch * row;
+            memcpy(dstRow, srcRow, RowSizeInBytes);
+            ++row;
+        }
+        ++slice;
+    }
+}
+
+//------------------------------------------------------------------------------------------------
+// Returns required size of a buffer to be used for data upload, as reported by
+// GetCopyableFootprints for the given subresource range of the destination.
+// Returns 0 when the owning device cannot be obtained.
+inline UINT64 GetRequiredIntermediateSize(
+    _In_ ID3D12Resource* pDestinationResource,
+    _In_range_(0,D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+    _In_range_(0,D3D12_REQ_SUBRESOURCES-FirstSubresource) UINT NumSubresources)
+{
+    D3D12_RESOURCE_DESC Desc = pDestinationResource->GetDesc();
+    UINT64 RequiredSize = 0;
+
+    // Initialised and checked so a failed GetDevice never leads to a call
+    // through an indeterminate pointer.
+    ID3D12Device* pDevice = nullptr;
+    if (FAILED(pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void**>(&pDevice))) || pDevice == nullptr)
+    {
+        return 0;
+    }
+    pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, 0, nullptr, nullptr, nullptr, &RequiredSize);
+    pDevice->Release();
+
+    return RequiredSize;
+}
+
+//------------------------------------------------------------------------------------------------
+// Copies the source data into the mapped intermediate buffer at the offsets
+// given by pLayouts, then records the copies from the intermediate buffer into
+// the destination resource on pCmdList.
+// All arrays must be populated (e.g. by calling GetCopyableFootprints)
+// Returns RequiredSize on success, or 0 when validation fails, Map fails, or a
+// row size does not fit in SIZE_T.
+inline UINT64 UpdateSubresources(
+    _In_ ID3D12GraphicsCommandList* pCmdList,
+    _In_ ID3D12Resource* pDestinationResource,
+    _In_ ID3D12Resource* pIntermediate,
+    _In_range_(0,D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+    _In_range_(0,D3D12_REQ_SUBRESOURCES-FirstSubresource) UINT NumSubresources,
+    UINT64 RequiredSize,
+    _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT* pLayouts,
+    _In_reads_(NumSubresources) const UINT* pNumRows,
+    _In_reads_(NumSubresources) const UINT64* pRowSizesInBytes,
+    _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA* pSrcData)
+{
+    // Minor validation: the intermediate must be a large enough buffer, and a
+    // buffer destination only has subresource 0.
+    D3D12_RESOURCE_DESC IntermediateDesc = pIntermediate->GetDesc();
+    D3D12_RESOURCE_DESC DestinationDesc = pDestinationResource->GetDesc();
+    if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || 
+        IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || 
+        RequiredSize > static_cast<SIZE_T>(-1) || 
+        (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER && 
+            (FirstSubresource != 0 || NumSubresources != 1)))
+    {
+        return 0;
+    }
+
+    BYTE* pData;
+    HRESULT hr = pIntermediate->Map(0, NULL, reinterpret_cast<void**>(&pData));
+    if (FAILED(hr))
+    {
+        return 0;
+    }
+
+    for (UINT i = 0; i < NumSubresources; ++i)
+    {
+        if (pRowSizesInBytes[i] > static_cast<SIZE_T>(-1))
+        {
+            // Balance the Map above before bailing out.
+            pIntermediate->Unmap(0, NULL);
+            return 0;
+        }
+        // The slice pitch is widened before multiplying so that
+        // RowPitch * rows cannot wrap in 32 bits.
+        D3D12_MEMCPY_DEST DestData = { pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch, static_cast<SIZE_T>(pLayouts[i].Footprint.RowPitch) * static_cast<SIZE_T>(pNumRows[i]) };
+        MemcpySubresource(&DestData, &pSrcData[i], static_cast<SIZE_T>(pRowSizesInBytes[i]), pNumRows[i], pLayouts[i].Footprint.Depth);
+    }
+    pIntermediate->Unmap(0, NULL);
+
+    if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER)
+    {
+        // Buffers are copied as a plain byte range starting at destination offset 0.
+        pCmdList->CopyBufferRegion(
+            pDestinationResource, 0, pIntermediate, pLayouts[0].Offset, pLayouts[0].Footprint.Width);
+    }
+    else
+    {
+        // Textures are copied one subresource at a time using the placed footprints.
+        for (UINT i = 0; i < NumSubresources; ++i)
+        {
+            CD3DX12_TEXTURE_COPY_LOCATION Dst(pDestinationResource, i + FirstSubresource);
+            CD3DX12_TEXTURE_COPY_LOCATION Src(pIntermediate, pLayouts[i]);
+            pCmdList->CopyTextureRegion(&Dst, 0, 0, 0, &Src, nullptr);
+        }
+    }
+    return RequiredSize;
+}
+
+//------------------------------------------------------------------------------------------------
+// Heap-allocating UpdateSubresources implementation.
+// Allocates one scratch block for the layout, row-size and row-count arrays,
+// fills them with GetCopyableFootprints, and forwards to the array-taking
+// overload. Returns 0 when the allocation fails or the owning device cannot be
+// obtained; otherwise returns what the forwarded call returns.
+inline UINT64 UpdateSubresources( 
+    _In_ ID3D12GraphicsCommandList* pCmdList,
+    _In_ ID3D12Resource* pDestinationResource,
+    _In_ ID3D12Resource* pIntermediate,
+    UINT64 IntermediateOffset,
+    _In_range_(0,D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+    _In_range_(0,D3D12_REQ_SUBRESOURCES-FirstSubresource) UINT NumSubresources,
+    _In_reads_(NumSubresources) D3D12_SUBRESOURCE_DATA* pSrcData)
+{
+    UINT64 RequiredSize = 0;
+    UINT64 MemToAlloc = static_cast<UINT64>(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) * NumSubresources;
+    if (MemToAlloc > SIZE_MAX)
+    {
+       return 0;
+    }
+    void* pMem = HeapAlloc(GetProcessHeap(), 0, static_cast<SIZE_T>(MemToAlloc));
+    if (pMem == NULL)
+    {
+       return 0;
+    }
+    // The single block is carved into three consecutive arrays:
+    // layouts, then row sizes, then row counts.
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT* pLayouts = reinterpret_cast<D3D12_PLACED_SUBRESOURCE_FOOTPRINT*>(pMem);
+    UINT64* pRowSizesInBytes = reinterpret_cast<UINT64*>(pLayouts + NumSubresources);
+    UINT* pNumRows = reinterpret_cast<UINT*>(pRowSizesInBytes + NumSubresources);
+
+    D3D12_RESOURCE_DESC Desc = pDestinationResource->GetDesc();
+    UINT64 Result = 0;
+
+    // Initialised and checked so a failed GetDevice never leads to a call
+    // through an indeterminate pointer. The scratch block is freed on every path.
+    ID3D12Device* pDevice = nullptr;
+    if (!FAILED(pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void**>(&pDevice))) && pDevice != nullptr)
+    {
+        pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, pLayouts, pNumRows, pRowSizesInBytes, &RequiredSize);
+        pDevice->Release();
+
+        Result = UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, RequiredSize, pLayouts, pNumRows, pRowSizesInBytes, pSrcData);
+    }
+    HeapFree(GetProcessHeap(), 0, pMem);
+    return Result;
+}
+
+//------------------------------------------------------------------------------------------------
+// Stack-allocating UpdateSubresources implementation.
+// Uses fixed arrays of MaxSubresources entries for the layout, row-count and
+// row-size data, fills them with GetCopyableFootprints, and forwards to the
+// array-taking overload. Returns 0 when NumSubresources exceeds
+// MaxSubresources or the owning device cannot be obtained.
+template <UINT MaxSubresources>
+inline UINT64 UpdateSubresources( 
+    _In_ ID3D12GraphicsCommandList* pCmdList,
+    _In_ ID3D12Resource* pDestinationResource,
+    _In_ ID3D12Resource* pIntermediate,
+    UINT64 IntermediateOffset,
+    _In_range_(0, MaxSubresources) UINT FirstSubresource,
+    _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources,
+    _In_reads_(NumSubresources) D3D12_SUBRESOURCE_DATA* pSrcData)
+{
+    // The arrays below hold MaxSubresources entries; a larger request would
+    // make GetCopyableFootprints write past them.
+    if (NumSubresources > MaxSubresources)
+    {
+        return 0;
+    }
+
+    UINT64 RequiredSize = 0;
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT Layouts[MaxSubresources];
+    UINT NumRows[MaxSubresources];
+    UINT64 RowSizesInBytes[MaxSubresources];
+
+    D3D12_RESOURCE_DESC Desc = pDestinationResource->GetDesc();
+
+    // Initialised and checked so a failed GetDevice never leads to a call
+    // through an indeterminate pointer.
+    ID3D12Device* pDevice = nullptr;
+    if (FAILED(pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void**>(&pDevice))) || pDevice == nullptr)
+    {
+        return 0;
+    }
+    pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, Layouts, NumRows, RowSizesInBytes, &RequiredSize);
+    pDevice->Release();
+
+    return UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, RequiredSize, Layouts, NumRows, RowSizesInBytes, pSrcData);
+}
+
+//------------------------------------------------------------------------------------------------
+// True for the two layouts this helper treats as opaque:
+// UNKNOWN and 64KB_UNDEFINED_SWIZZLE.
+inline bool D3D12IsLayoutOpaque( D3D12_TEXTURE_LAYOUT Layout )
+{
+    switch (Layout)
+    {
+    case D3D12_TEXTURE_LAYOUT_UNKNOWN:
+    case D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE:
+        return true;
+    default:
+        return false;
+    }
+}
+
+//------------------------------------------------------------------------------------------------
+// Reinterprets an array of graphics command list pointers as an array of base
+// command list pointers.
+inline ID3D12CommandList * const * CommandListCast(ID3D12GraphicsCommandList * const * pp)
+{
+    // Handy for handing strongly typed command list pointers to
+    // ExecuteCommandLists.
+    // The cast is valid provided const-ness is respected, and the D3D12 APIs
+    // do respect the const-ness of their arguments.
+    ID3D12CommandList * const * const asBaseLists = reinterpret_cast<ID3D12CommandList * const *>(pp);
+    return asBaseLists;
+}
+
+//------------------------------------------------------------------------------------------------
+// D3D12 exports a new method for serializing root signatures in the Windows 10 Anniversary Update.
+// To help enable root signature 1.1 features when they are available and not require maintaining
+// two code paths for building root signatures, this helper method reconstructs a 1.0 signature when
+// 1.1 is not supported.
+//
+// MaxVersion 1.1: the description is forwarded to D3D12SerializeVersionedRootSignature unchanged.
+// MaxVersion 1.0: a 1.0 description is serialized directly; a 1.1 description is first converted
+// into temporary heap-allocated 1.0 parameter/range arrays, which are freed before returning.
+// Returns E_OUTOFMEMORY if a temporary allocation fails (both output blobs are then set to NULL),
+// E_INVALIDARG for an unrecognised version combination, otherwise the serializer's HRESULT.
+inline HRESULT D3DX12SerializeVersionedRootSignature(
+    _In_ const D3D12_VERSIONED_ROOT_SIGNATURE_DESC* pRootSignatureDesc,
+    D3D_ROOT_SIGNATURE_VERSION MaxVersion,
+    _Outptr_ ID3DBlob** ppBlob,
+    _Always_(_Outptr_opt_result_maybenull_) ID3DBlob** ppErrorBlob)
+{
+    switch (MaxVersion)
+    {
+        case D3D_ROOT_SIGNATURE_VERSION_1_0:
+            switch (pRootSignatureDesc->Version)
+            {
+                case D3D_ROOT_SIGNATURE_VERSION_1_0:
+                    return D3D12SerializeRootSignature(&pRootSignatureDesc->Desc_1_0, D3D_ROOT_SIGNATURE_VERSION_1, ppBlob, ppErrorBlob);
+
+                case D3D_ROOT_SIGNATURE_VERSION_1_1:
+                {
+                    HRESULT hr = S_OK;
+                    const D3D12_ROOT_SIGNATURE_DESC1& desc_1_1 = pRootSignatureDesc->Desc_1_1;
+
+                    SIZE_T ParametersSize = sizeof(D3D12_ROOT_PARAMETER) * desc_1_1.NumParameters;
+                    void* pParameters = (ParametersSize > 0) ? HeapAlloc(GetProcessHeap(), 0, ParametersSize) : NULL;
+                    if (ParametersSize > 0 && pParameters == NULL)
+                    {
+                        hr = E_OUTOFMEMORY;
+                    }
+                    D3D12_ROOT_PARAMETER* pParameters_1_0 = reinterpret_cast<D3D12_ROOT_PARAMETER*>(pParameters);
+
+                    // Number of leading 1.0 parameters that are fully converted. Only those may be
+                    // inspected during cleanup, because HeapAlloc does not zero the block.
+                    UINT numConverted = 0;
+
+                    for (UINT n = 0; SUCCEEDED(hr) && n < desc_1_1.NumParameters; n++)
+                    {
+                        pParameters_1_0[n].ParameterType = desc_1_1.pParameters[n].ParameterType;
+                        pParameters_1_0[n].ShaderVisibility = desc_1_1.pParameters[n].ShaderVisibility;
+
+                        switch (desc_1_1.pParameters[n].ParameterType)
+                        {
+                        case D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS:
+                            pParameters_1_0[n].Constants.Num32BitValues = desc_1_1.pParameters[n].Constants.Num32BitValues;
+                            pParameters_1_0[n].Constants.RegisterSpace = desc_1_1.pParameters[n].Constants.RegisterSpace;
+                            pParameters_1_0[n].Constants.ShaderRegister = desc_1_1.pParameters[n].Constants.ShaderRegister;
+                            break;
+
+                        case D3D12_ROOT_PARAMETER_TYPE_CBV:
+                        case D3D12_ROOT_PARAMETER_TYPE_SRV:
+                        case D3D12_ROOT_PARAMETER_TYPE_UAV:
+                            // The 1.1-only descriptor flags have no 1.0 counterpart and are dropped.
+                            pParameters_1_0[n].Descriptor.RegisterSpace = desc_1_1.pParameters[n].Descriptor.RegisterSpace;
+                            pParameters_1_0[n].Descriptor.ShaderRegister = desc_1_1.pParameters[n].Descriptor.ShaderRegister;
+                            break;
+
+                        case D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE:
+                        {
+                            const D3D12_ROOT_DESCRIPTOR_TABLE1& table_1_1 = desc_1_1.pParameters[n].DescriptorTable;
+
+                            SIZE_T DescriptorRangesSize = sizeof(D3D12_DESCRIPTOR_RANGE) * table_1_1.NumDescriptorRanges;
+                            void* pDescriptorRanges = (DescriptorRangesSize > 0) ? HeapAlloc(GetProcessHeap(), 0, DescriptorRangesSize) : NULL;
+                            if (DescriptorRangesSize > 0 && pDescriptorRanges == NULL)
+                            {
+                                hr = E_OUTOFMEMORY;
+                                break; // leaves the switch; the loop stops on the failed hr
+                            }
+                            D3D12_DESCRIPTOR_RANGE* pDescriptorRanges_1_0 = reinterpret_cast<D3D12_DESCRIPTOR_RANGE*>(pDescriptorRanges);
+
+                            for (UINT x = 0; x < table_1_1.NumDescriptorRanges; x++)
+                            {
+                                pDescriptorRanges_1_0[x].BaseShaderRegister = table_1_1.pDescriptorRanges[x].BaseShaderRegister;
+                                pDescriptorRanges_1_0[x].NumDescriptors = table_1_1.pDescriptorRanges[x].NumDescriptors;
+                                pDescriptorRanges_1_0[x].OffsetInDescriptorsFromTableStart = table_1_1.pDescriptorRanges[x].OffsetInDescriptorsFromTableStart;
+                                pDescriptorRanges_1_0[x].RangeType = table_1_1.pDescriptorRanges[x].RangeType;
+                                pDescriptorRanges_1_0[x].RegisterSpace = table_1_1.pDescriptorRanges[x].RegisterSpace;
+                            }
+
+                            D3D12_ROOT_DESCRIPTOR_TABLE& table_1_0 = pParameters_1_0[n].DescriptorTable;
+                            table_1_0.NumDescriptorRanges = table_1_1.NumDescriptorRanges;
+                            table_1_0.pDescriptorRanges = pDescriptorRanges_1_0;
+                            break;
+                        }
+                        }
+
+                        if (SUCCEEDED(hr))
+                        {
+                            numConverted = n + 1;
+                        }
+                    }
+
+                    if (SUCCEEDED(hr))
+                    {
+                        CD3DX12_ROOT_SIGNATURE_DESC desc_1_0(desc_1_1.NumParameters, pParameters_1_0, desc_1_1.NumStaticSamplers, desc_1_1.pStaticSamplers, desc_1_1.Flags);
+                        hr = D3D12SerializeRootSignature(&desc_1_0, D3D_ROOT_SIGNATURE_VERSION_1, ppBlob, ppErrorBlob);
+                    }
+                    else
+                    {
+                        // The serializer was never called, so give the outputs defined values.
+                        *ppBlob = NULL;
+                        if (ppErrorBlob != NULL)
+                        {
+                            *ppErrorBlob = NULL;
+                        }
+                    }
+
+                    // Release the temporary 1.0 arrays: first each converted table's range array,
+                    // then the parameter array itself.
+                    for (UINT n = 0; n < numConverted; n++)
+                    {
+                        if (pParameters_1_0[n].ParameterType == D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE)
+                        {
+                            HeapFree(GetProcessHeap(), 0, reinterpret_cast<void*>(const_cast<D3D12_DESCRIPTOR_RANGE*>(pParameters_1_0[n].DescriptorTable.pDescriptorRanges)));
+                        }
+                    }
+                    if (pParameters != NULL)
+                    {
+                        HeapFree(GetProcessHeap(), 0, pParameters);
+                    }
+                    return hr;
+                }
+            }
+            break;
+
+        case D3D_ROOT_SIGNATURE_VERSION_1_1:
+            return D3D12SerializeVersionedRootSignature(pRootSignatureDesc, ppBlob, ppErrorBlob);
+    }
+
+    return E_INVALIDARG;
+}
+
+#endif // defined( __cplusplus )
+
+#endif //__D3DX12_H__
+
+
+

+ 1 - 0
tools/clang/unittests/DxrFallback/defaultTestFilePath.h.in

@@ -0,0 +1 @@
+#define DEFAULT_TEST_FILE_PATH "@DEFAULT_TEST_FILE_PATH@"

+ 71 - 0
tools/clang/unittests/DxrFallback/testFiles/HLSLRayTracingInternalPrototypes.h

@@ -0,0 +1,71 @@
+#ifndef HLSL_RAYTRACING_INTERNAL_PROTOTYPES // Include guard; this header holds only macros and bodiless prototypes
+#define HLSL_RAYTRACING_INTERNAL_PROTOTYPES
+
+#define INSTANCE_FLAG_NONE                              0x0 // The non-zero INSTANCE_FLAG_* values below are distinct single bits
+#define INSTANCE_FLAG_TRIANGLE_CULL_DISABLE             0x1
+#define INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE   0x2
+#define INSTANCE_FLAG_FORCE_OPAQUE                      0x4
+#define INSTANCE_FLAG_FORCE_NON_OPAQUE                  0x8
+
+#define SHADER_test [experimental("shader", "test")][noinline] // Attribute prefix applied by Declare_TraceRayTest below
+#define SHADER_internal [experimental("shader", "internal")][noinline] // NOTE(review): not referenced anywhere else in this header
+
+
+// Declare Fallback_SetPendingAttr overload for given attribute structure
+#define Declare_Fallback_SetPendingAttr(attr_t) \
+    void Fallback_SetPendingAttr(attr_t);
+
+// Declare TraceRayTest overload (tagged with SHADER_test) for given attribute structure
+#define Declare_TraceRayTest(attr_t) \
+    SHADER_test\
+    void TraceRayTest(int param, attr_t);
+
+
+float3 Fallback_WorldRayOrigin(); // Getter block: each of these twelve has a matching Fallback_Set* declaration further down
+float3 Fallback_WorldRayDirection();
+float  Fallback_RayTMin();
+float  Fallback_RayTCurrent();
+uint   Fallback_RayFlags();
+float3 Fallback_ObjectRayOrigin();
+float3 Fallback_ObjectRayDirection();
+uint   Fallback_PrimitiveIndex();
+uint   Fallback_InstanceIndex();
+uint   Fallback_InstanceID();
+uint   Fallback_HitKind();
+uint   Fallback_ShaderRecordOffset();
+
+void Fallback_SetWorldRayOrigin(float3 val);
+void Fallback_SetWorldRayDirection(float3 val);
+void Fallback_SetRayTMin(float val);
+void Fallback_SetRayTCurrent(float val);
+void Fallback_SetRayFlags(uint rayFlags);
+void Fallback_SetObjectRayOrigin(float3 val);
+void Fallback_SetObjectRayDirection(float3 val);
+void Fallback_SetPrimitiveIndex(uint val);
+void Fallback_SetInstanceIndex(uint val);
+void Fallback_SetInstanceID(uint val);
+void Fallback_SetHitKind(uint val);
+void Fallback_SetShaderRecordOffset(uint offset);
+void Fallback_SetObjectToWorld(row_major float3x4 val); // Setter only: no matching getter is declared in this header
+void Fallback_SetWorldToObject(row_major float3x4 val); // Setter only: no matching getter is declared in this header
+void Fallback_SetPendingRayTCurrent(float t);
+void Fallback_SetPendingHitKind(uint hitKind);
+void Fallback_SetPendingTriVals(uint hitGroupRecordOffset, uint primitiveIndex, uint instanceIndex, uint instanceID, float t, uint hitKind);
+void Fallback_SetPendingCustomVals(uint hitGroupRecordOffset, uint primitiveIndex, uint instanceIndex, uint instanceID);
+
+// Returns the old payload offset, which the caller passes to Fallback_TraceRayEnd() to restore it.
+uint Fallback_TraceRayBegin(uint rayFlags, float3 origin, float tmin, float3 dir, float tmax, uint newPayloadOffset);
+void Fallback_TraceRayEnd(int oldPayloadOffset); // NOTE(review): takes int although Fallback_TraceRayBegin returns uint - confirm this is intended
+uint Fallback_GroupIndex();
+int  Fallback_AnyHitResult();
+void Fallback_SetAnyHitResult(int result);
+int  Fallback_AnyHitStateId();
+void Fallback_SetAnyHitStateId(int stateId);
+void Fallback_CommitHit();
+void Fallback_CallIndirect(int stateId);
+void Fallback_Scheduler(int initialStateId, uint dimx, uint dimy);
+
+int Fallback_RuntimeDataLoadInt(int offset);
+void Fallback_RuntimeDataStoreInt(int offset, int val);
+
+#endif // HLSL_RAYTRACING_INTERNAL_PROTOTYPES

+ 83 - 0
tools/clang/unittests/DxrFallback/testFiles/HLSLRayTracingPrototypes.h

@@ -0,0 +1,83 @@
+//=================================================================================================================================
+// Ray tracing prototype header  (not bothering to separate into a different header for now)
+//=================================================================================================================================
+
+#ifndef HLSL_RAYTRACING_PROTOTYPES
+#define HLSL_RAYTRACING_PROTOTYPES
+
+#define HIT_KIND_TRIANGLE_FRONT_FACE    0xFE
+#define HIT_KIND_TRIANGLE_BACK_FACE     0xFF
+
+typedef uint RAY_FLAG;
+#define RAY_FLAG_NONE                         0x00
+#define RAY_FLAG_FORCE_OPAQUE                 0x01
+#define RAY_FLAG_FORCE_NON_OPAQUE             0x02
+#define RAY_FLAG_TERMINATE_ON_FIRST_HIT       0x04
+#define RAY_FLAG_SKIP_CLOSEST_HIT_SHADER      0x08
+#define RAY_FLAG_CULL_BACK_FACING_TRIANGLES   0x10
+#define RAY_FLAG_CULL_FRONT_FACING_TRIANGLES  0x20
+#define RAY_FLAG_CULL_OPAQUE                  0x40
+#define RAY_FLAG_CULL_NON_OPAQUE              0x80
+
+#define SV_RayPayload RT_RayPayload
+#define SV_IntersectionAttributes RT_IntersectionAttributes
+
+struct RayDesc // Describes a ray: an origin, a direction, and TMin/TMax scalars.
+{
+    float3 Origin;
+    float TMin;
+    float3 Direction;
+    float TMax;
+};
+
+struct BuiltInTriangleIntersectionAttributes // Attribute struct carrying a float2 of barycentrics.
+{
+    float2 barycentrics;
+};
+
+typedef ByteAddressBuffer RayTracingAccelerationStructure;
+
+// Declare TraceRay overload for given payload structure
+//#define Declare_TraceRay(payload_t) \
+//    void TraceRay(RayTracingAccelerationStructure, uint RayFlags, uint InstanceCullMask, uint RayContributionToHitGroupIndex, uint MultiplierForGeometryContributionToHitGroupIndex, uint MissShaderIndex, RayDesc, inout payload_t);
+#define Declare_TraceRay(payload_t) \
+    void TraceRay(int param, inout payload_t);
+
+// Declare ReportHit overload for given attribute structure
+#define Declare_ReportHit(attr_t) \
+    bool ReportHit(float HitT, uint HitKind, attr_t);
+
+// Declare CallShader overload for given param structure
+#define Declare_CallShader(param_t) \
+    void CallShader(uint ShaderIndex, inout param_t);
+
+void IgnoreHit();
+void AcceptHitAndEndSearch();
+
+// System Value retrieval functions
+uint2 DispatchRaysIndex();
+uint2 DispatchRaysDimensions();
+float3 WorldRayOrigin();
+float3 WorldRayDirection();
+float RayTMin();
+float CurrentRayT();
+uint RayFlags();
+uint PrimitiveIndex();
+uint InstanceIndex();
+uint InstanceID();
+float3 ObjectRayOrigin();
+float3 ObjectRayDirection();
+row_major float3x4 ObjectToWorld();
+row_major float3x4 WorldToObject();
+uint HitKind();
+
+// Place SHADER_* before appropriate entry function
+#define SHADER_raygeneration [experimental("shader", "raygeneration")]
+#define SHADER_intersection [experimental("shader", "intersection")]
+#define SHADER_anyhit [experimental("shader", "anyhit")]
+#define SHADER_closesthit [experimental("shader", "closesthit")]
+#define SHADER_miss [experimental("shader", "miss")]
+#define SHADER_callable [experimental("shader", "callable")]
+
+
+#endif // HLSL_RAYTRACING_PROTOTYPES

+ 25 - 0
tools/clang/unittests/DxrFallback/testFiles/testLib.h

@@ -0,0 +1,25 @@
+#include "HLSLRayTracingInternalPrototypes.h"
+
+RWStructuredBuffer<int> input	    : register(u0);
+RWStructuredBuffer<int> output  	: register(u1);
+RWStructuredBuffer<int> one     	: register(u2);
+
+cbuffer TestConstants : register(b0) // Constant buffer bound to b0.
+{
+  int initialStateId; // Passed to Fallback_Scheduler() by the CSMain entry points of the test shaders.
+}
+
+// Read one integer from the input buffer
+int consume();
+
+int peekInput();
+
+// Write val to the output buffer
+void append(int val);
+
+// Returns idx by reading the value of 1 from the one buffer. This is to avoid the 
+// compiler optimizing stuff away. 
+int load(int idx);
+
+// Write both val and the expected value to the output
+void verify(int val, int expected);

+ 59 - 0
tools/clang/unittests/DxrFallback/testFiles/testLib.hlsl

@@ -0,0 +1,59 @@
+#include "testLib.h"
+
+#define INLINING [noinline] // Hide cruft to make debugging easier.
+//#define INLINING
+
+INLINING
+void logAppend(int val) // Appends one int to the output buffer.
+{
+  int slot;
+  InterlockedAdd(output[0], 1, slot); // output[0] is the element counter; slot receives its value before the add.
+  slot += 1; // to account for the slot counter being at position 0
+  output[slot] = val;
+}
+
+INLINING
+void logAppend2(int key, int val) // Appends two consecutive ints (key, then val) to the output buffer.
+{
+  int slot;
+  InterlockedAdd(output[0], 2, slot); // Reserve two slots with a single atomic add so the pair stays adjacent.
+  slot += 1; // to account for the slot counter being at position 0
+  output[slot + 0] = key;
+  output[slot + 1] = val;
+}
+
+INLINING
+int load(int idx) // Returns one[0] * idx; testLib.h documents the buffer as holding 1 so the result is idx but not a compile-time constant.
+{
+  return one[0] * idx;
+}
+
+INLINING
+void verify(int val, int expected) // Logs val followed by expected as an adjacent pair in the output buffer.
+{
+  logAppend2(val, expected);
+}
+
+INLINING
+void append(int val) // Logs a single value to the output buffer.
+{
+  logAppend(val);
+}
+
+INLINING
+int consume() // Reads the next int from the input buffer and advances the read counter.
+{
+  int slot;
+  InterlockedAdd(input[0], 1, slot); // input[0] is the read counter; slot receives its value before the add.
+  slot += 1; // Skip the counter stored at position 0.
+  return input[slot];
+}
+
+INLINING
+int peekInput() // Returns the int that consume() would return next, without advancing the read counter.
+{
+  int slot;
+  InterlockedAdd(input[0], 0, slot); // Adding 0 fetches the counter without changing it.
+  slot += 1; // Skip the counter stored at position 0.
+  return input[slot];
+}

+ 380 - 0
tools/clang/unittests/DxrFallback/testFiles/testShader1.hlsl

@@ -0,0 +1,380 @@
+#include "testLib.h"
+
+SHADER_test
+void continuation();
+
+SHADER_test
+void continuation_in(int val);
+
+SHADER_test
+void continuation_out64(out int val);
+
+SHADER_test
+void continuation_inout64(inout int val);
+
+SHADER_test
+void types() // Defines values of several scalar/vector types, calls continuation(), then logs each value with its expected result.
+{
+  bool bVal = (load(1) > 0);
+  int  ival = load(2);
+  uint uval = load(3);
+  half hval = load(4);
+  float fval = load(5);
+  double dval = load(6) + 1e-5; // The small offset is dropped again by the (int) cast below.
+  int2 ival2 = int2(load(7), load(8));
+
+  continuation(); // Every value above is defined before this call and used after it.
+
+  verify(bVal ? 1 : 0, 1);
+  verify(ival, 2);
+  verify(uval, 3);
+  verify(hval, 4);
+  verify(fval, 5);
+  verify((int)dval, 6);
+  verify(ival2.x, 7);
+  verify(ival2.y, 8);
+}
+
+SHADER_test
+void no_call() // Loads and verifies a value without calling continuation().
+{
+  int val = load(1);
+  verify(val, 1);
+}
+
+SHADER_test
+void no_live_values() // Calls continuation() with no local variables defined before and used after the call.
+{
+  verify(1, 1);
+  continuation();
+  verify(2, 2);
+}
+
+SHADER_test
+void single_call() // One call to continuation() with a single value defined before and checked after it.
+{
+  int val = load(1);
+  continuation();
+  verify(val, 1);
+}
+
+SHADER_test
+void multiple_calls() // Two calls to continuation() with values that are used after the first and/or the second call.
+{
+  int val1 = load(1);
+  int val2 = load(2);
+  continuation();
+  verify(val1, 1);
+
+  val1 += load(3); // creates a live alloca
+  continuation();  // val2 is live here, but should not be reloaded/saved
+  verify(val1, 4); // 1 + 3
+  verify(val2, 2);
+}
+
+SHADER_test
+void branch() // Calls continuation_out64() inside a branch whose condition load(1) is non-zero, then expects val to be 64.
+{
+  int val = load(10);
+  if (load(1))
+    continuation_out64(val);
+  verify(val, 64);
+}
+
+SHADER_test
+void no_branch() // Same shape as branch(), but the condition !load(1) is false, so val is expected to keep its initial 10.
+{
+  int val = load(10);
+  if (!load(1))
+    continuation_out64(val);
+  verify(val, 10);
+}
+
+SHADER_test
+void loop() // Calls continuation() once before a loop and once per loop iteration, tracking val across all calls.
+{
+  int val = load(1);
+  continuation();
+  verify(val, 1);
+
+  for (int i = 0, n = load(4); i < n; i++) // n is read through load(), so the trip count (4) is not a literal.
+  {
+    continuation();
+    val += 1;
+  }
+  verify(val, 5); // 1 + 4 iterations
+}
+
+SHADER_test
+void recursive_rec(int val) // Logs val (paired with itself) and recurses with val - 1 while val > 0.
+{
+  verify(val, val);
+  if (val > 0)
+    recursive_rec(val - 1);
+}
+
+SHADER_test
+void recursive() // Starts the recursion at load(5) and checks that a local defined before the call is intact afterwards.
+{
+  int val = load(1);
+  recursive_rec(load(5));
+  verify(val, 1);
+}
+
+struct MyStruct // Two-int aggregate used as an inout argument by the aggregate tests below.
+{
+  int v1;
+  int v2;
+};
+
+
+SHADER_test
+void continuation_aggregates(inout MyStruct S, inout uint3 V, inout int A[4]) // Logs a -99 marker and one element of each aggregate argument; does not modify them.
+{
+  append(-99);
+  append(S.v1);
+  append(V.x);
+  append(A[0]);
+}
+
+SHADER_test
+void call_with_aggregates() // Passes a struct, a vector and an array as inout arguments and logs one element of each after the call.
+{
+  MyStruct S;
+  S.v1 = load(1);
+  S.v2 = load(2);
+
+  uint3 V = uint3(load(3), 0, 0);
+
+  int A[4];
+  A[0] = 0;
+  A[1] = 1;
+  A[2] = load(2);
+  A[3] = 3;
+
+  continuation_aggregates(S, V, A);
+
+  append(S.v1);
+  append(V.x);
+  append(A[2]);
+}
+
+
+
+SHADER_test
+void func_with_args(int arg1, int arg2) // Checks that by-value arguments are still correct after one and after two calls to continuation().
+{
+  verify(arg1, 1);
+  continuation();
+  verify(arg1, 1);
+
+  continuation();
+  verify(arg2, 2); // arg2 is first used only after both calls.
+}
+
+SHADER_test
+void multiple_calls_with_args() // Calls func_with_args(1, 2) and checks a local defined before the call afterwards.
+{
+  int val = load(3);
+  func_with_args(load(1), load(2));
+  verify(val, 3);
+}
+
+SHADER_test
+void single_call_in() // Passes load(10) as a by-value argument to continuation_in().
+{
+  continuation_in(load(10));
+}
+
+SHADER_test
+void single_call_out() // Passes an uninitialized local as an out argument and checks the value written by the callee.
+{
+  int val;
+  continuation_out64(val);
+  verify(val, 64); // continuation_out64 writes 64 (same expectation as branch()); comparing val with itself could never fail.
+}
+
+
+SHADER_test
+void single_call_inout() // Passes an initialized local as an inout argument and expects the callee to have replaced it with 64.
+{
+  int val = load(10);
+  continuation_inout64(val);
+  verify(val, 64);
+}
+
+
+
+SHADER_test
+void continuation_inout_passthru64(inout int val) // Logs a -98 marker and forwards its inout argument to continuation_inout64().
+{
+  append(-98);
+  continuation_inout64(val);
+}
+
+SHADER_test
+void single_call_inout_passthru() // Like single_call_inout(), but the inout argument goes through one intermediate function.
+{
+  int val = load(10);
+  continuation_inout_passthru64(val);
+  verify(val, 64);
+}
+
+
+SHADER_test
+void use_buffer() // Compares a value loaded before continuation() with the same load repeated after it.
+{
+  int val = load(10);
+  continuation();
+  verify(val, load(10));
+}
+
+
+SHADER_test
+void lower_intrinsics() // Writes distinct values through the Fallback_Set* calls, calls continuation(), then reads them back through the ray tracing intrinsics and logs the mismatch count.
+{
+  float3 exp_WorldRayOrigin = float3(0, 1, 2);
+  Fallback_SetWorldRayOrigin(exp_WorldRayOrigin);
+
+  float3 exp_WorldRayDirection = float3(3, 4, 5);
+  Fallback_SetWorldRayDirection(exp_WorldRayDirection);
+
+  float exp_RayTMin = 6;
+  Fallback_SetRayTMin(exp_RayTMin);
+
+  float exp_RayTCurrent = 7;
+  Fallback_SetRayTCurrent(exp_RayTCurrent);
+
+  uint exp_PrimitiveIndex = 8;
+  Fallback_SetPrimitiveIndex(exp_PrimitiveIndex);
+
+  uint exp_InstanceID = 9;
+  Fallback_SetInstanceID(exp_InstanceID);
+
+  uint exp_InstanceIndex = 10;
+  Fallback_SetInstanceIndex(exp_InstanceIndex);
+
+  float3 exp_ObjectRayOrigin = float3(11, 12, 13);
+  Fallback_SetObjectRayOrigin(exp_ObjectRayOrigin);
+
+  float3 exp_ObjectRayDirection = float3(14, 15, 16);
+  Fallback_SetObjectRayDirection(exp_ObjectRayDirection);
+
+  row_major float3x4 exp_ObjectToWorld = {
+    {17,18,19,20},
+    {21,22,23,24},
+    {25,26,27,28},
+  };
+  Fallback_SetObjectToWorld(exp_ObjectToWorld);
+
+  row_major float3x4 exp_WorldToObject = {
+    {29,30,31,32},
+    {33,34,35,36},
+    {37,38,39,40},
+  };
+  Fallback_SetWorldToObject(exp_WorldToObject);
+
+  uint exp_HitKind = 41;
+  Fallback_SetHitKind(exp_HitKind);
+
+  continuation(); // All exp_* values are defined before this call and compared after it.
+
+  int mismatches = 0; // Incremented by 1 for every value that does not read back as written.
+
+  float3 worldRayOrigin = WorldRayOrigin();
+  mismatches += any(worldRayOrigin != exp_WorldRayOrigin);
+
+  float3 worldRayDirection = WorldRayDirection();
+  mismatches += any(worldRayDirection != exp_WorldRayDirection);
+
+  float rayTMin = RayTMin();
+  mismatches += (rayTMin != exp_RayTMin);
+
+  float rayTCurrent = RayTCurrent();
+  mismatches += (rayTCurrent != exp_RayTCurrent);
+
+  uint primitiveIndex = PrimitiveIndex();
+  mismatches += (primitiveIndex != exp_PrimitiveIndex);
+
+  uint instanceID = InstanceID();
+  mismatches += (instanceID != exp_InstanceID);
+
+  uint instanceIndex = InstanceIndex();
+  mismatches += (instanceIndex != exp_InstanceIndex);
+
+  float3 objectRayOrigin = ObjectRayOrigin();
+  mismatches += any(objectRayOrigin != exp_ObjectRayOrigin);
+
+  float3 objectRayDirection = ObjectRayDirection();
+  mismatches += any(objectRayDirection != exp_ObjectRayDirection);
+
+  row_major float3x4 objectToWorld = ObjectToWorld();
+  mismatches += any(objectToWorld != exp_ObjectToWorld);
+
+  row_major float3x4 worldToObject = WorldToObject();
+  mismatches += any(worldToObject != exp_WorldToObject);
+
+  uint hitKind = HitKind();
+  mismatches += (hitKind != exp_HitKind);
+
+  verify(mismatches, 0);
+}
+
+SHADER_test
+void local_array() // Fills a local array, calls continuation(), then reads an element at an index obtained through load().
+{
+  int vals[10];
+  for (int i = 0; i < 10; ++i)
+    vals[i] = i;
+
+  vals[5] = load(5); // Same value as the loop wrote, but obtained through load().
+  continuation();
+
+  verify(vals[load(4)], 4);
+}
+
+[noinline]
+void func_with_array_param(inout int val[5]) // Writes 2 into val[2], with the index obtained through load().
+{
+  val[load(2)] = 2;
+}
+
+SHADER_test
+void array_param() // Declares an array before continuation() and passes it as an inout argument after the call.
+{
+  int val[5]; // Left uninitialized; only val[2] is written (by the callee) and read.
+  continuation();
+  func_with_array_param(val);
+  append(val[load(2)]);
+}
+
+SHADER_test
+void array_param2() // Defines a matrix before continuation() and passes it to Fallback_SetObjectToWorld() after the call.
+{
+  row_major float3x4 exp_ObjectToWorld = {
+    {17,18,19,20},
+    {21,22,23,24},
+    {25,26,27,28},
+  };
+  continuation();
+  Fallback_SetObjectToWorld(exp_ObjectToWorld);
+}
+
+SHADER_test
+void dispatch_idx_and_dims() // Logs the x/y components of DispatchRaysIndex() and then of DispatchRaysDimensions().
+{
+  uint2 dispatchRaysIndex = DispatchRaysIndex();
+  append(dispatchRaysIndex.x);
+  append(dispatchRaysIndex.y);
+
+  uint2 dispatchRaysDimensions = DispatchRaysDimensions();
+  append(dispatchRaysDimensions.x);
+  append(dispatchRaysDimensions.y);
+}
+
+
+[numthreads(1, 1, 1)]
+void CSMain() // Compute entry point: starts the scheduler at initialStateId (from the TestConstants cbuffer) with dimensions 1 x 1.
+{
+  Fallback_Scheduler(initialStateId, 1, 1);
+}

+ 228 - 0
tools/clang/unittests/DxrFallback/testFiles/testShader2.hlsl

@@ -0,0 +1,228 @@
+#include "testLib.h"
+
+SHADER_test
+void indirect() // Reads a state id from the input buffer and calls it indirectly.
+{
+  Fallback_CallIndirect(consume());
+}
+
+SHADER_test
+void indirect_callee() // Logs the marker -99.
+{
+  append(-99);
+}
+
+
+
+
+SHADER_test
+void continuation() // Callee with no parameters; its only effect is logging -99.
+{
+  append(-99); // mark that we got here
+}
+
+SHADER_test
+void continuation_in(int val) // Callee with one by-value parameter; logs the value it received.
+{
+  append(val); // mark that we got here
+}
+
+SHADER_test
+void continuation_out64(out int val) // Callee with one out parameter; writes 64 to it and logs -99.
+{
+  val = 64;
+  append(-99); // mark that we got here
+}
+
+SHADER_test
+void continuation_inout64(inout int val) // Callee with one inout parameter; logs the incoming value, then overwrites it with 64.
+{
+  append(val); // mark that we got here
+  val = 64;
+}
+
+void append2(int val) // Appends the same value twice to the output buffer.
+{
+  int slot;
+  InterlockedAdd(output[0], 2, slot); // Reserve two slots; slot receives the counter value before the add.
+  slot += 1; // to account for the slot counter being at position 0
+  output[slot + 0] = val;
+  output[slot + 1] = val;
+}
+
+
+
+
+void StackDump(int begin, int end) // Debug helper: logs the runtime-data ints at offsets begin..end (inclusive), bracketed by 88888888 markers.
+{
+  append(88888888);
+  for (int i = begin; i <= end; i++)
+    append(Fallback_RuntimeDataLoadInt(i));
+  append(88888888);
+}
+
+struct MyPayload // Two-int ray payload used by raygen_tri, raygen_custom, chTri, chCustom1 and miss.
+{
+  int val;
+  int depth;
+};
+
+struct MyPayload2 // Three-int ray payload used by raygen_custom and chCustom2.
+{
+  int val;
+  int val2;
+  int depth;
+};
+
+
+struct MyAttributes // Custom hit attributes reported by intersection() and read by chCustom1/chCustom2.
+{
+  int attr0;
+  int attr1;
+};
+
+//SHADER_test
+void TraceRayTest(int which, inout MyPayload);
+void TraceRayTest(int which, inout MyPayload2);
+
+
+[shader("raygeneration")]
+void raygen_tri() // Traces with which == 0 (the triangle path in Fallback_TraceRay below) and logs the resulting payload value.
+{
+  MyPayload payload;
+  payload.val = 1000;
+  payload.depth = 0;
+
+  TraceRayTest(0, payload);
+
+  append(payload.val);
+}
+
+
+[shader("raygeneration")]
+void raygen_custom() // Traces twice through the intersection-shader paths (which == 1, then which == 2 with a different payload type), logging val after each.
+{
+  MyPayload payload;
+  payload.val = 1000;
+  payload.depth = 0;
+  TraceRayTest(1, payload);
+  append(payload.val);
+
+  MyPayload2 payload2;
+  payload2.val = payload.val; // Carries the result of the first trace into the second payload.
+  payload2.val2 = 2001;
+  payload2.depth = 0;
+  TraceRayTest(2, payload2);
+  append(payload2.val);
+}
+
+[shader("closesthit")]
+void chTri(inout MyPayload payload : SV_RayPayload, in BuiltInTriangleIntersectionAttributes attr : SV_IntersectionAttributes) // Logs -97 and the barycentrics, calls continuation(), then adds 10 to payload.val.
+{
+  append(-97);
+  append(attr.barycentrics.x);
+  append(attr.barycentrics.y);
+  continuation();
+  payload.val += 10; // Payload is modified after the nested call.
+}
+
+
+[shader("closesthit")]
+void chCustom1(inout MyPayload payload : SV_RayPayload, in MyAttributes attr : SV_IntersectionAttributes) // Logs -96 and both custom attributes, calls continuation(), then adds 10 to payload.val.
+{
+  append(-96);
+  append(attr.attr0);
+  append(attr.attr1);
+  continuation();
+  payload.val += 10;
+}
+
+[shader("closesthit")]
+void chCustom2(inout MyPayload2 payload2 : SV_RayPayload, in MyAttributes attr : SV_IntersectionAttributes) // Same as chCustom1 but for MyPayload2; adds 100 to payload2.val.
+{
+  append(-96);
+  append(attr.attr0);
+  append(attr.attr1);
+  continuation();
+  payload2.val += 100;
+}
+
+[shader("miss")]
+void miss(inout MyPayload payload : SV_RayPayload) // Logs -95 and adds 1 to payload.val.
+{
+  append(-95);
+  payload.val += 1;
+}
+
+Declare_Fallback_SetPendingAttr(MyAttributes);
+
+[shader("intersection")][noinline]
+void intersection() // Logs -95 and the current system values, then reports a hit with fixed attributes and logs 500 or 600 depending on the result.
+{
+  append(-95);
+  append(RayTCurrent());
+  append(Fallback_ShaderRecordOffset());
+  append(PrimitiveIndex());
+  append(InstanceIndex());
+  append(InstanceID());
+  MyAttributes attr;
+  attr.attr0 = 333;
+  attr.attr1 = 444;
+  if (ReportHit(12, 77, attr)) // HitT = 12, HitKind = 77
+    append(500);
+  else
+    append(600);
+}
+
+
+// Return < 0 for terminate, 0 for ignore, > 0 for accept
+int Fallback_ReportHit(float t, uint hitKind) // Test stub: ignores t and hitKind, logs -100, calls continuation(), commits the hit and returns load(1).
+{
+  append(-100);
+  continuation();
+  Fallback_CommitHit();
+  return load(1); // load(1) yields 1, i.e. "accept" per the convention above.
+}
+
+Declare_Fallback_SetPendingAttr(BuiltInTriangleIntersectionAttributes);
+
+SHADER_test
+void Fallback_TraceRay(int which, uint payloadOffset) // Test stand-in for ray traversal: 'which' selects a canned hit scenario, then a state id read from the input buffer is called indirectly.
+{
+  uint oldPayloadOffset = Fallback_TraceRayBegin(0, float3(0, 0, 0), 0, float3(0, 0, 1), 1e34, payloadOffset);
+  append(-98);
+  if (which == 0) // Triangle hit with fixed barycentrics, committed directly.
+  {
+    BuiltInTriangleIntersectionAttributes attr;
+    attr.barycentrics = float2(555, 666);
+    Fallback_SetPendingAttr(attr);
+    Fallback_CommitHit();
+  }
+  else if (which == 1) // Commit one set of values, then set a second pending set and run intersection().
+  {
+    // test that we get ending values in intersection, but not for rayTCurrent
+    Fallback_SetPendingRayTCurrent(19);
+    Fallback_SetPendingCustomVals(20, 21, 22, 23);
+    Fallback_CommitHit();
+    Fallback_SetPendingRayTCurrent(9);
+    Fallback_SetPendingCustomVals(10, 11, 12, 13);
+    intersection();
+  }
+  else if (which == 2) // Same as which == 1 with different values, plus an explicit Fallback_SetInstanceID().
+  {
+    // test that we get ending values in intersection, but not for rayTCurrent
+    Fallback_SetPendingRayTCurrent(59);
+    Fallback_SetPendingCustomVals(60, 61, 62, 63);
+    Fallback_CommitHit();
+    Fallback_SetPendingRayTCurrent(49);
+    Fallback_SetPendingCustomVals(50, 51, 52, 53);
+
+    Fallback_SetInstanceID(63);
+    intersection();
+  }
+
+  Fallback_CallIndirect(consume()); // The host supplies the state id to call via the input buffer.
+
+  Fallback_TraceRayEnd(oldPayloadOffset); // Restores the payload offset returned by Fallback_TraceRayBegin().
+}
+

+ 47 - 0
tools/clang/unittests/DxrFallback/testFiles/testShader3.hlsl

@@ -0,0 +1,47 @@
+#include "testLib.h"
+
+struct SomePayload // Single-int ray payload used by pass_struct().
+{
+  int val;
+};
+
+void UndefinedFunction();
+void TraceRayTest(RayDesc, uint rayFlags, inout SomePayload);
+
+SHADER_test
+void Fallback_TraceRay(RayDesc rd, uint rayFlags, uint payloadOffset) // Test stand-in for ray traversal: logs -99, every RayDesc field and the ray flags it received.
+{
+  uint oldPayloadOffset = Fallback_TraceRayBegin(rayFlags, rd.Origin, rd.TMin, rd.Direction, rd.TMax, payloadOffset);
+
+  //UndefinedFunction();
+  append(-99);
+  append(rd.Origin.x);
+  append(rd.Origin.y);
+  append(rd.Origin.z);
+  append(rd.TMin);
+  append(rd.Direction.x);
+  append(rd.Direction.y);
+  append(rd.Direction.z);
+  append(rd.TMax);
+  append(rayFlags);
+
+  Fallback_TraceRayEnd(oldPayloadOffset); // Restores the payload offset returned by Fallback_TraceRayBegin().
+}
+
+SHADER_test
+void pass_struct() // Fills a RayDesc with the values 1..8 and passes it, ray flags 11 and a payload to TraceRayTest().
+{
+  RayDesc rd;
+  rd.Origin = float3(1, 2, 3);
+  rd.TMin = 4;
+  rd.Direction = float3(5, 6, 7);
+  rd.TMax = 8;
+  SomePayload payload = { 10 };
+  TraceRayTest(rd, 11, payload);
+}
+
+[numthreads(1, 1, 1)]
+void CSMain() // Compute entry point: starts the scheduler at initialStateId (from the TestConstants cbuffer) with dimensions 1 x 1.
+{
+  Fallback_Scheduler(initialStateId, 1, 1);
+}

+ 69 - 0
tools/clang/unittests/DxrFallback/testFiles/testShader4.hlsl

@@ -0,0 +1,69 @@
+#include "testLib.h"
+
+RaytracingAccelerationStructure accel : register(t0);
+
+struct SomePayload // Single-int ray payload used by full_trace_ray().
+{
+  int val;
+};
+
+SHADER_test
+void Fallback_TraceRay( // Test stand-in for ray traversal taking the ray as individual scalars; logs every argument except payloadOffset.
+  uint rayFlags,
+  uint instanceInclusionMask,
+  uint rayContributionToHitGroupIndex,
+  uint multiplierForGeometryContributionToHitGroupIndex,
+  uint missShaderIndex,
+  float originX,
+  float originY,
+  float originZ,
+  float tMin,
+  float directionX,
+  float directionY,
+  float directionZ,
+  float tMax,
+  uint payloadOffset)
+{
+  uint oldPayloadOffset = Fallback_TraceRayBegin(rayFlags, float3(originX, originY, originZ), tMin, float3(directionX, directionY, directionZ), tMax, payloadOffset);
+
+  append(originX); // Ray fields are logged first, then the flags and indices.
+  append(originY);
+  append(originZ);
+  append(tMin);
+  append(directionX);
+  append(directionY);
+  append(directionZ);
+  append(tMax);
+  append(rayFlags);
+  append(instanceInclusionMask);
+  append(rayContributionToHitGroupIndex);
+  append(multiplierForGeometryContributionToHitGroupIndex);
+  append(missShaderIndex);
+
+  Fallback_TraceRayEnd(oldPayloadOffset); // Restores the payload offset returned by Fallback_TraceRayBegin().
+}
+
+SHADER_test
+void full_trace_ray() // Calls TraceRay() with the full argument list, using the distinct values 1..15 so each argument can be identified in the log.
+{
+  RayDesc ray;
+  ray.Origin = float3(1, 2, 3);
+  ray.TMin = 4;
+  ray.Direction = float3(5, 6, 7);
+  ray.TMax = 8;
+  SomePayload payload = { 10 };
+  uint rayFlags = 11;
+  uint instanceInclusionMask = 12;
+  uint rayContributionToHitGroupIndex = 13;
+  uint multiplierForGeometryContributionToHitGroupIndex = 14;
+  uint missShaderIndex = 15;
+
+  TraceRay(accel, rayFlags, instanceInclusionMask, rayContributionToHitGroupIndex,
+    multiplierForGeometryContributionToHitGroupIndex, missShaderIndex, ray, payload);
+}
+
+[numthreads(1, 1, 1)]
+void CSMain() // Compute entry point: starts the scheduler at initialStateId (from the TestConstants cbuffer) with dimensions 1 x 1.
+{
+  Fallback_Scheduler(initialStateId, 1, 1);
+}

+ 84 - 0
tools/clang/unittests/DxrFallback/testFiles/testShader5.hlsl

@@ -0,0 +1,84 @@
+#include "testLib.h"
+
+struct MyPayload // Single-int ray payload; each hit/miss shader below adds a distinct power of ten to val.
+{
+  int val;
+};
+
+void TraceRayTest(int w, inout MyPayload);
+Declare_Fallback_SetPendingAttr(BuiltInTriangleIntersectionAttributes);
+
+SHADER_test
+void Fallback_TraceRay(int w, uint payloadOffset) // Test stand-in for ray traversal: uses w for every ray and hit value, commits a triangle hit, then calls a state id read from the input buffer.
+{
+  uint oldPayloadOffset = Fallback_TraceRayBegin(w, float3(w,w,w), w, float3(w,w,w), w, payloadOffset);
+
+  append(-99);
+  BuiltInTriangleIntersectionAttributes attr;
+  attr.barycentrics = float2(555, 666);
+  Fallback_SetPendingAttr(attr);
+  Fallback_SetPendingTriVals(w, w, w, w, w, w); // t is set to w, so Fallback_RayTCurrent() identifies which trace is current.
+  Fallback_CommitHit();
+
+  Fallback_CallIndirect(consume());
+
+  Fallback_TraceRayEnd(oldPayloadOffset); // Restores the payload offset returned by Fallback_TraceRayBegin().
+}
+
+[shader("raygeneration")]
+void raygen() // Starts the outermost trace (w == 0) with payload.val == 1 and logs the final payload value.
+{
+  MyPayload payload = { 1 };
+  TraceRayTest(0, payload);
+  append(payload.val);
+}
+
+[shader("closesthit")]
+void ch1(inout MyPayload payload : SV_RayPayload, in BuiltInTriangleIntersectionAttributes attr : SV_IntersectionAttributes) // Logs 100, issues two nested traces (w == 1, then w == 4) logging Fallback_RayTCurrent() before and after each, then adds 10 to payload.val.
+{
+  append(100);
+  append(Fallback_RayTCurrent());
+
+  TraceRayTest(1, payload);
+  append(Fallback_RayTCurrent());
+
+  TraceRayTest(4, payload);
+  append(Fallback_RayTCurrent());
+
+  payload.val += 10;
+}
+
+[shader("closesthit")]
+void ch2(inout MyPayload payload : SV_RayPayload, in BuiltInTriangleIntersectionAttributes attr : SV_IntersectionAttributes) // Logs 101, issues one nested trace (w == 2) logging Fallback_RayTCurrent() before and after, then adds 100 to payload.val.
+{
+  append(101);
+  append(Fallback_RayTCurrent());
+  TraceRayTest(2, payload);
+  append(Fallback_RayTCurrent());
+  payload.val += 100;
+}
+
+[shader("miss")]
+void miss1(inout MyPayload payload : SV_RayPayload) // Logs 102, issues one nested trace (w == 3) logging Fallback_RayTCurrent() before and after, then adds 1000 to payload.val.
+{
+  append(102);
+  append(Fallback_RayTCurrent());
+  TraceRayTest(3, payload);
+  append(Fallback_RayTCurrent());
+  payload.val += 1000;
+}
+
+[shader("miss")]
+void miss2(inout MyPayload payload : SV_RayPayload) // Leaf shader: logs 103 and Fallback_RayTCurrent(), then adds 10000 to payload.val without tracing further.
+{
+  append(103);
+  append(Fallback_RayTCurrent());
+  payload.val += 10000;
+}
+
+
+[numthreads(1, 1, 1)]
+void CSMain() // Compute entry point: starts the scheduler at initialStateId (from the TestConstants cbuffer) with dimensions 1 x 1.
+{
+  Fallback_Scheduler(initialStateId, 1, 1);
+}

+ 97 - 0
tools/clang/unittests/DxrFallback/testFiles/testTraversal.h

@@ -0,0 +1,97 @@
+#define ENABLE_PRINT 0
+
+#define TERMINATE -40
+#define IGNORE      0
+#define ACCEPT     40
+#define OPAQUE     41
+
+#define LEAF_DONE     90
+#define LEAF_INST     91
+#define LEAF_TRIS     92
+#define LEAF_CUSTOM   93
+
+#define RAYGEN     190
+#define INTERSECT  290
+#define ANYHIT     390
+#define CLOSESTHIT 490
+#define MISS       590
+#define TRACERAY   999
+
+
+#ifndef HLSL
+typedef unsigned uint;
+
+#define HIT_KIND_TRIANGLE_FRONT_FACE    0xFE
+#define HIT_KIND_TRIANGLE_BACK_FACE     0xFF
+
+#define RAY_FLAG_NONE                         0x00
+#define RAY_FLAG_FORCE_OPAQUE                 0x01
+#define RAY_FLAG_FORCE_NON_OPAQUE             0x02
+#define RAY_FLAG_TERMINATE_ON_FIRST_HIT       0x04
+#define RAY_FLAG_SKIP_CLOSEST_HIT_SHADER      0x08
+#define RAY_FLAG_CULL_BACK_FACING_TRIANGLES   0x10
+#define RAY_FLAG_CULL_FRONT_FACING_TRIANGLES  0x20
+#define RAY_FLAG_CULL_OPAQUE                  0x40
+#define RAY_FLAG_CULL_NON_OPAQUE              0x80
+
+#define INSTANCE_FLAG_NONE                              0x0
+#define INSTANCE_FLAG_TRIANGLE_CULL_DISABLE             0x1
+#define INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE   0x2
+#define INSTANCE_FLAG_FORCE_OPAQUE                      0x4
+#define INSTANCE_FLAG_FORCE_NON_OPAQUE                  0x8
+
+#endif
+
+static int pack(int primIdx, int geomIdx, bool opaque) // Packs the three values into one int: opaque in bit 31, geomIdx from bit 16 up, primIdx in the low bits.
+{
+  return
+      (opaque ? 0x80000000 : 0)
+    | (geomIdx << 16) // NOTE(review): inputs are not masked; unpack() reads 16 bits per index, so larger values would overlap.
+    | (primIdx);
+}
+
+#ifdef HLSL
+static void unpack(int val, out uint primIdx, out uint geomIdx, out bool opaque) // Inverse of pack(): splits val into the opaque bit and two 16-bit indices.
+{
+  opaque = val & 0x80000000; // Bit 31.
+  geomIdx = (val >> 16) & 0xFFFF; // The mask also drops the opaque bit carried along by the shift.
+  primIdx = val & 0xFFFF;
+}
+#endif
+
+static bool isOpaque( bool geomOpaque, uint instanceFlags, uint rayFlags ) // Resolves the effective opacity: instance flags override the geometry flag, and ray flags override both.
+{
+  bool opaque = geomOpaque;
+
+  if( instanceFlags & INSTANCE_FLAG_FORCE_OPAQUE ) // FORCE_OPAQUE wins if both instance flags are set.
+    opaque = true;
+  else if( instanceFlags & INSTANCE_FLAG_FORCE_NON_OPAQUE ) 
+    opaque = false;
+
+  if( rayFlags & RAY_FLAG_FORCE_OPAQUE ) // Applied last, so the ray flags have the final say.
+    opaque = true;
+  else if( rayFlags & RAY_FLAG_FORCE_NON_OPAQUE )
+    opaque = false;
+
+  return opaque;
+}
+
+static float computeCullFaceDir(uint instanceFlags, uint rayFlags) // Returns 1 when front faces are culled, -1 when back faces are culled, and 0 when nothing is culled.
+{
+  float cullFaceDir = 0;
+  if( rayFlags & RAY_FLAG_CULL_FRONT_FACING_TRIANGLES ) // Front-face culling wins if both ray flags are set.
+    cullFaceDir = 1;
+  else if( rayFlags & RAY_FLAG_CULL_BACK_FACING_TRIANGLES )
+    cullFaceDir = -1;
+  if( instanceFlags & INSTANCE_FLAG_TRIANGLE_CULL_DISABLE ) // The instance can turn culling off regardless of the ray flags.
+    cullFaceDir = 0;
+
+  return cullFaceDir;
+}
+
+static bool cull(bool opaque, uint rayFlags) // True when the ray flags cull geometry of the given opacity (CULL_OPAQUE for opaque, CULL_NON_OPAQUE otherwise).
+{
+  return (opaque && (rayFlags & RAY_FLAG_CULL_OPAQUE)) || (!opaque && (rayFlags & RAY_FLAG_CULL_NON_OPAQUE));
+}
+
+

+ 281 - 0
tools/clang/unittests/DxrFallback/testFiles/testTraversal.hlsl

@@ -0,0 +1,281 @@
+#include "testLib.h"
+#define HLSL
+#include "testTraversal.h"
+
+
+void Input_GetLeafRange(out int leafType, out int leafBegin, out int leafEnd) // Reads the next leaf type from the input buffer and returns a range of one element, or an empty range for LEAF_DONE.
+{
+  leafType = consume();
+  leafBegin = 0;
+  leafEnd = (leafType == LEAF_DONE) ? 0 : 1;
+}
+
+void Input_GetInst(out uint instIdx, out uint instId, out uint instFlags) // Reads three ints from the input buffer, in this order: instance index, instance id, instance flags.
+{
+  instIdx = consume();
+  instId = consume();
+  instFlags = consume();
+}
+
+void Input_GetPrimInfo(out uint primIdx, out uint geomIdx, out bool geomOpaque) // Reads one packed int from the input buffer and splits it with unpack().
+{
+  int val = consume();
+  unpack(val, primIdx, geomIdx, geomOpaque);
+}
+
+void Input_IntersectTri(out float t, out float u, out float v, out float d) // Reads four ints from the input buffer and reinterprets their bits as floats (t, u, v, d in that order).
+{
+  t = asfloat(consume());
+  u = asfloat(consume());
+  v = asfloat(consume());
+  d = asfloat(consume());
+}
+
+int Input_LoadAnyHit() // Reads the next int from the input buffer; callers use it as the any-hit state id.
+{
+  return consume();
+}
+
+int Input_LoadIntersection() // Reads the next int from the input buffer; callers use it as the intersection state id.
+{
+  return consume();
+}
+
+int Input_LoadClosestHitOrMiss(bool hit) // Always reads two ints (closest-hit id, then miss id) and returns the one selected by 'hit'.
+{
+  int closestHitStateId = consume();
+  int missStateId = consume(); // Read unconditionally so the input position does not depend on 'hit'.
+  return hit ? closestHitStateId : missStateId;
+}
+
+void StackDump(int begin, int end) // Debug helper: logs the runtime-data ints at offsets begin..end (inclusive), bracketed by 88888888 markers.
+{
+  append(88888888);
+  for (int i = begin; i <= end; i++)
+    append(Fallback_RuntimeDataLoadInt(i));
+  append(88888888);
+}
+
+int InvokeAnyHit(int stateId) // Calls the given state with the any-hit result preset to ACCEPT and returns the result afterwards.
+{
+  Fallback_SetAnyHitResult(ACCEPT); // Default outcome unless the callee changes it (see Fallback_IgnoreHit / Fallback_AcceptHitAndEndSearch).
+  Fallback_CallIndirect(stateId);
+  return Fallback_AnyHitResult();
+}
+
+int InvokeIntersection(int stateId) // Same sequence as InvokeAnyHit(): preset the result to ACCEPT, call the state, return the result.
+{
+  Fallback_SetAnyHitResult(ACCEPT);
+  Fallback_CallIndirect(stateId);
+  return Fallback_AnyHitResult();
+}
+
+// Return < 0 for terminate, 0 for ignore, > 0 for accept
+SHADER_internal
+int Fallback_ReportHit(float tHit, uint hitKind) // Records a candidate hit at tHit, runs the any-hit shader if one is set, and commits the hit unless it was ignored.
+{
+  int print = ENABLE_PRINT;
+  if (print) { append(4000); append(tHit); append(RayTMin()); append(Fallback_RayTCurrent()); }
+  if (tHit < RayTMin() || Fallback_RayTCurrent() <= tHit) // Reject hits outside [RayTMin, RayTCurrent).
+    return 0;
+
+  Fallback_SetPendingRayTCurrent(tHit);
+  Fallback_SetPendingHitKind(hitKind); // Was passing tHit, which stored the hit distance as the hit kind.
+  int stateId = Fallback_AnyHitStateId();
+  if (print) { append(4001); append(stateId); }
+  int ret = ACCEPT; // Without an any-hit shader the hit is accepted.
+  if (stateId > 0)
+    ret = InvokeAnyHit(stateId);
+  if (ret != IGNORE)
+  {
+    if (print) append(4002);
+    Fallback_CommitHit();
+    if (RayFlags() & RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH) // A committed hit ends the search when the ray asks for the first hit only.
+      ret = TERMINATE;
+  }
+  return ret;
+}
+
+Declare_Fallback_SetPendingAttr(BuiltInTriangleIntersectionAttributes);
+
+// Assumptions
+// 1 leaf
+//
+bool Traversal() // Mock acceleration-structure traversal driven by the input buffer; returns true when a hit was committed.
+{
+  uint hgtStartOfs = 0;
+  uint hgtStride = 1;
+  uint rayOfs = 0;
+  uint rayFlags = RayFlags();
+  uint instOfs = 0;
+  uint instIdx = -1; // -1 means traversal is at the top level (no instance entered).
+  uint instId = 0;
+  uint instFlags = 0;
+  uint geomStride = 0;
+
+  bool done = false;
+  int NO_HIT_SENTINEL = ~0;
+  Fallback_SetInstanceIndex(NO_HIT_SENTINEL); // Checked at the end to tell whether the instance index was replaced.
+  int count = 0;
+  bool print = ENABLE_PRINT;
+  if (print) { append(3000); append(Fallback_InstanceIndex()); }
+  while (!done)
+  {
+    // traversal = ray + AS ==> leaf range, done, or continue
+
+    int leafType, leafBegin, leafEnd;
+    Input_GetLeafRange(leafType, leafBegin, leafEnd);
+    if (print) { append(3010); append(leafType); append(instIdx); }
+    if (count++ > 10) // Guard against a runaway loop: log a marker and stop.
+    {
+      append(9999999);
+      break;
+    }
+
+    if (leafBegin < leafEnd) // leaf
+    {
+      if (print) append(3020);
+      // Inputs:
+      // Primitive: primIdx
+      // Geometry:  geomIdx, opaqueFlag 
+      // Instance:  instOfs, instFlags(TRI_CULL_DISABLE, TRI_FRONT_CCW, FORCE_OPAQUE, FORCE_NONOPAQUE)
+      // TraceRay:  rayOfs, geomStride, rayFlags(FORCE_OPAQUE, FORCE_NONOPAQUE, TERMINATE_ON_FIRST_HIT, SKIP_CLOSEST_HIT, CULL_BACK_FACING_TRIS, CULL_FRONT_FACING_TRIS, CULL_OPAQUE, CULL_NONOPAQUE)
+      // Dispatch:  hgtStartOfs, hgtStride
+      //
+      // Hit group shader offset:
+      //   hgtStartOfs + hgtStride * (rayOfs + instOfs + geomStride * geomIdx)
+      //
+      // Hit info:
+      //   hitGroupRecordOffset, primIdx, instIdx, hitGroupAddr, t, hitKind, t, attributes
+      if (leafType == LEAF_INST)
+      {
+        // Transition to bottom level
+        if (print) append(3021);
+        Input_GetInst(instIdx, instId, instFlags);
+        // object ray = world ray * inverse
+      }
+      else
+      {
+        for (int i = leafBegin; i < leafEnd; ++i)
+        {
+          uint primIdx, geomIdx;
+          bool geomOpaque;
+          Input_GetPrimInfo(primIdx, geomIdx, geomOpaque);
+
+          bool opaque = isOpaque(geomOpaque, instFlags, rayFlags);
+          if (cull(opaque, rayFlags))
+            continue;
+
+          uint hitGroupRecordOffset = hgtStartOfs + hgtStride * (rayOfs + instOfs + geomStride * geomIdx);
+          if (leafType == LEAF_TRIS) // loop invariant
+          {
+            float t, u, v, d;
+            Input_IntersectTri(t, u, v, d);
+            if (instFlags & INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE)
+              d = -d; // Flips which side counts as the front face.
+
+            float cullFaceDir = computeCullFaceDir(instFlags, rayFlags);
+            if (print) { append(3025); append(t); append(RayTMin()); append(Fallback_RayTCurrent()); append(d); append(cullFaceDir); }
+            if (t < RayTMin() || t > Fallback_RayTCurrent() || -d * cullFaceDir < 0) // Out of range, or facing the culled direction.
+              continue;
+
+            int hitKind = (d > 0) ? HIT_KIND_TRIANGLE_FRONT_FACE : HIT_KIND_TRIANGLE_BACK_FACE;
+            Fallback_SetPendingTriVals(hitGroupRecordOffset, primIdx, instIdx, instId, t, hitKind);
+
+            BuiltInTriangleIntersectionAttributes attr;
+            attr.barycentrics = float2(u, v);
+            Fallback_SetPendingAttr(attr);
+            if (opaque) // Opaque triangles are committed without running an any-hit shader.
+            {
+              if (print) append(3030);
+              Fallback_CommitHit();
+              done = rayFlags & RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH;
+            }
+            else
+            {
+              if (print) append(3040);
+              int ahStateId = Input_LoadAnyHit();
+              int ret = InvokeAnyHit(ahStateId);
+              if (ret != IGNORE)
+                Fallback_CommitHit();
+              done = (ret == TERMINATE) || (rayFlags & RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH);
+            }
+          }
+          else // (leafType == LEAF_CUSTOM)
+          {
+            if (print) append(3050);
+            Fallback_SetPendingCustomVals(hitGroupRecordOffset, primIdx, instIdx, instId);
+            int ahStateId = Input_LoadAnyHit();
+            Fallback_SetAnyHitStateId(opaque ? -1 : ahStateId); // -1 fails the stateId > 0 test in Fallback_ReportHit, so no any-hit shader runs.
+            int stateId = Input_LoadIntersection();
+            if (print) { append(3051); append(stateId); append(Fallback_AnyHitStateId()); }
+            int ret = InvokeIntersection(stateId);
+            if (print) { append(3052); append(ret); append(Fallback_InstanceIndex()); }
+            done = (ret == TERMINATE) || (rayFlags & RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH);
+          }
+          if (done)
+            break;
+        } // for
+      } // LEAF_BOTTOM
+    }
+    else if (leafType == LEAF_DONE)
+    {
+      if (print) append(3060);
+      if (instIdx == -1) // LEAF_DONE at the top level ends the traversal.
+        done = true;
+      else
+      {
+        // transition to top 
+        instIdx = -1;
+        // object ray = world ray (why?) 
+        // done = stack.empty();
+      }
+    }
+  }
+  if (print) { append(3070); append(Fallback_InstanceIndex()); }
+
+  return Fallback_InstanceIndex() != NO_HIT_SENTINEL;
+}
+
+
+void Fallback_IgnoreHit() // Sets the any-hit result to IGNORE.
+{
+  Fallback_SetAnyHitResult(IGNORE);
+}
+
+void Fallback_AcceptHitAndEndSearch() // Sets the any-hit result to TERMINATE.
+{
+  Fallback_SetAnyHitResult(TERMINATE);
+}
+
+SHADER_internal
+void Fallback_TraceRay(int param, uint payloadOffset) // Logs TRACERAY, runs the mock Traversal() and then calls the closest-hit or miss state read from the input buffer.
+{
+  append(TRACERAY);
+
+  int rayFlags = consume(); // Ray flags come from the input buffer; 'param' is not used.
+  uint oldPayloadOffset = Fallback_TraceRayBegin(rayFlags, float3(0, 0, 0), 0, float3(0, 0, 1), 1e34, payloadOffset);
+
+  bool hit = Traversal();
+
+  int stateId = 0;
+  if (hit && (RayFlags() & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER)) // In this case no state ids are read from the input buffer.
+    stateId = 0;
+  else
+    stateId = Input_LoadClosestHitOrMiss(hit);
+
+  if (stateId) // A state id of 0 means no shader is called.
+    Fallback_CallIndirect(stateId);
+
+  Fallback_TraceRayEnd(oldPayloadOffset); // Restores the payload offset returned by Fallback_TraceRayBegin().
+}
+
+
+[numthreads(1, 1, 1)]
+void CSMain() // Compute entry point: starts the scheduler at initialStateId (from the TestConstants cbuffer) with dimensions 1 x 1.
+{
+  Fallback_Scheduler(initialStateId, 1, 1);
+}
+
+
+

+ 165 - 0
tools/clang/unittests/DxrFallback/testFiles/testTraversal2.hlsl

@@ -0,0 +1,165 @@
+#include "testLib.h"
+#define HLSL
+#include "testTraversal.h"
+
+// Ray payload shared by the shaders in this file. raygen() seeds it with
+// val = 1000 and primIdx = -1; the any-hit shaders add 100 to val, the
+// closest-hit shaders add 10 and store PrimitiveIndex(), and miss() adds 1.
+struct MyPayload
+{
+  int val;
+  int primIdx;
+};
+
+// Hit attributes for the custom-geometry path: filled by Input_LoadHit,
+// reported by intersection(), and logged by ahCustom() / chCustom().
+struct MyAttributes
+{
+  int attr0;
+  int attr1;
+};
+
+// Reads the next custom hit through consume(). A first value of -1 marks the
+// end of the hit list and makes the function return false. Otherwise that
+// value is reinterpreted as the float hit distance `t`, and the next three
+// consumed values become hitKind, attr.attr0 and attr.attr1.
+// NOTE(review): on the `return false` path none of the out parameters is
+// assigned.
+bool Input_LoadHit(out float t, out uint hitKind, out MyAttributes attr)
+{
+  int val = consume();
+  if (val == -1)
+    return false;
+
+  t = asfloat(val);
+  hitKind = consume();
+  attr.attr0 = consume();
+  attr.attr1 = consume();
+  return true;
+}
+
+// Returns the next value from consume(); the any-hit shaders in this file
+// compare it against IGNORE and TERMINATE.
+int Input_LoadAnyHitRet()
+{
+  return consume();
+}
+
+// Forward declaration only; not called from the shaders in this file.
+void StackDump(int begin, int end);
+
+// Acceleration structure bound at t5; passed to TraceRay() in raygen_array().
+RaytracingAccelerationStructure accel : register(t5);
+
+// NOTE(review): presumably expands to the TraceRayTest() overload for
+// MyPayload that raygen() calls - confirm against the included headers.
+Declare_TraceRayTest(MyPayload);
+
+// Ray generation shader under test: logs RAYGEN, calls TraceRayTest with a
+// payload seeded to {val = 1000, primIdx = -1}, then logs both payload fields.
+[shader("raygeneration")]
+void raygen()
+{
+  append(RAYGEN);
+
+  // NOTE(review): `ray` is declared but neither initialized nor used by the
+  // active TraceRayTest call below.
+  RayDesc ray;
+  MyPayload payload;
+  payload.val = 1000;
+  payload.primIdx = -1;
+
+  //TraceRay(accel,0,0,0,0,0,ray,payload);
+  TraceRayTest(0, payload);
+
+  append(payload.val);
+  append(payload.primIdx);
+}
+
+// Any-hit shader for triangles: logs ANYHIT + 0 and the two barycentrics, adds
+// 100 to payload.val, then ignores or terminates depending on the value read
+// by Input_LoadAnyHitRet(). Any other value leaves the hit as is.
+[shader("anyhit")]
+void ahTri(inout MyPayload payload : SV_RayPayload, in BuiltInTriangleIntersectionAttributes attr : SV_IntersectionAttributes)
+{
+  append(ANYHIT + 0);
+  append(attr.barycentrics.x);
+  append(attr.barycentrics.y);
+  payload.val += 100;
+
+  int anyHitRet = Input_LoadAnyHitRet();
+  if (anyHitRet == IGNORE)
+    IgnoreHit();
+
+  if (anyHitRet == TERMINATE)
+    AcceptHitAndEndSearch();
+}
+
+// Closest-hit shader for triangles: logs CLOSESTHIT + 0 and the two
+// barycentrics, adds 10 to payload.val and stores PrimitiveIndex() in the
+// payload.
+[shader("closesthit")]
+void chTri(inout MyPayload payload : SV_RayPayload, in BuiltInTriangleIntersectionAttributes attr : SV_IntersectionAttributes)
+{
+  append(CLOSESTHIT + 0);
+  // The barycentrics are copied to a local before being logged; the direct
+  // form is kept commented out below.
+  float2 barycentrics = attr.barycentrics;
+  append(barycentrics.x);
+  append(barycentrics.y);
+  //append(attr.barycentrics.x);
+  //append(attr.barycentrics.y);
+
+  payload.val += 10;
+  payload.primIdx = PrimitiveIndex();
+}
+
+// Intersection shader for custom geometry: logs INTERSECT + 1, then reports
+// every hit returned by Input_LoadHit() through ReportHit() until the hit list
+// ends. Extra trace values are logged only when ENABLE_PRINT is non-zero.
+[shader("intersection")]
+void intersection()
+{
+  int print = ENABLE_PRINT;
+  append(INTERSECT + 1);
+
+  float t;
+  // NOTE(review): declared int here while Input_LoadHit takes `out uint`.
+  int hitKind;
+  MyAttributes attr;
+  //int count = 0;
+  while (Input_LoadHit(t, hitKind, attr))
+  {
+    if (print) { append(5000); append(hitKind); append(attr.attr0); append(attr.attr1); append(Fallback_AnyHitStateId()); }
+    bool ret = ReportHit(t, hitKind, attr);
+    if (print) { append(5001); append(ret); }
+    //if(count++ > 5) {append(9999998); return;}
+  }
+}
+
+// Any-hit shader for custom geometry: logs ANYHIT + 1 and both attributes,
+// adds 100 to payload.val, then ignores or terminates depending on the value
+// read by Input_LoadAnyHitRet(). That value is also logged when ENABLE_PRINT
+// is non-zero.
+[shader("anyhit")]
+void ahCustom(inout MyPayload payload : SV_RayPayload, in MyAttributes attr : SV_IntersectionAttributes)
+{
+  int print = ENABLE_PRINT;
+  append(ANYHIT + 1);
+  append(attr.attr0);
+  append(attr.attr1);
+  payload.val += 100;
+
+  int anyHitRet = Input_LoadAnyHitRet();
+  if (print) append(anyHitRet);
+  if (anyHitRet == IGNORE)
+    IgnoreHit();
+
+  if (anyHitRet == TERMINATE)
+    AcceptHitAndEndSearch();
+}
+
+// Closest-hit shader for custom geometry: logs CLOSESTHIT + 1 and both
+// attributes, adds 10 to payload.val and stores PrimitiveIndex() in the
+// payload.
+[shader("closesthit")]
+void chCustom(inout MyPayload payload : SV_RayPayload, in MyAttributes attr : SV_IntersectionAttributes)
+{
+  append(CLOSESTHIT + 1);
+  append(attr.attr0);
+  append(attr.attr1);
+
+  payload.val += 10;
+  payload.primIdx = PrimitiveIndex();
+}
+
+// Miss shader: logs MISS and adds 1 to payload.val.
+[shader("miss")]
+void miss(inout MyPayload payload : SV_RayPayload)
+{
+  append(MISS);
+
+  payload.val += 1;
+}
+
+
+// Payload containing a fixed-size array; used by raygen_array().
+struct PayloadWithArray
+{
+  int size;
+  int vals[10];
+};
+
+
+// Ray generation shader that passes an array-carrying payload to TraceRay():
+// writes 4 into the array at an index obtained from load(3), traces, then logs
+// payload.vals[2].
+[shader("raygeneration")]
+void raygen_array()
+{
+  // NOTE(review): `ray` is passed to TraceRay without being initialized here.
+  RayDesc ray;
+  PayloadWithArray payload;
+  payload.size = 10;
+  // The index is produced at run time by load(3) rather than being a literal.
+  payload.vals[load(3)] = 4;
+
+  //TraceRay(0, payload);
+  TraceRay(accel, 0, 0, 0, 0, 0, ray, payload);
+
+  append(payload.vals[2]);
+}
+

+ 974 - 0
tools/clang/unittests/DxrFallback/test_DxrFallback.cpp

@@ -0,0 +1,974 @@
+#include "dxc/Support/Global.h"
+#include "dxc/Support/Unicode.h"
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/dxcapi.h"
+#include "dxc/Support/dxcapi.use.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/Support/dxcapi.impl.h"
+#include "dxc/dxcdxrfallbackcompiler.h"
+#include "dxc/support/dxcapi.use.h"
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MSFileSystem.h"
+
+#include "defaultTestFilePath.h"
+#include "ShaderTester.h"
+#undef IGNORE
+#undef OPAQUE
+#include "testFiles/testTraversal.h"
+
+#include <algorithm>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+using namespace dxc;
+using namespace llvm;
+using namespace hlsl;
+
+// Debug output level that DxrCompile hands to
+// IDxcDxrFallbackCompiler::SetDebugOutput.
+const int DEBUG_OUTPUT_LEVEL = 1;
+
+// Narrows a wide string by converting each wchar_t to char element by element.
+// No code-page or UTF conversion is performed.
+std::string ws2s(const std::wstring& wide)
+{
+  std::string narrow;
+  narrow.assign(wide.begin(), wide.end());
+  return narrow;
+}
+
+// Widens a narrow string by converting each char to wchar_t element by
+// element. No code-page or UTF conversion is performed.
+std::wstring s2ws(const std::string& str)
+{
+  std::wstring widened;
+  widened.assign(str.begin(), str.end());
+  return widened;
+}
+
+void printErrors(CComPtr<IDxcOperationResult> pResult)
+{
+  CComPtr<IDxcBlobEncoding> pErrorBuffer;
+  IFT(pResult->GetErrorBuffer(&pErrorBuffer));
+  const char *pStart = (const char *)pErrorBuffer->GetBufferPointer();
+  std::string msg(pStart);
+  std::cerr << msg;
+
+  HRESULT status;
+  pResult->GetStatus(&status);
+  //IFTMSG(status, msg);
+}
+
+// Compiles the HLSL file at pShaderTextFilePath with the DXC compiler obtained
+// from dxcSupport. On success the compiled blob is returned through ppBlob. On
+// a failed compile the diagnostics are printed with printErrors() and ppBlob is
+// left untouched. Failing API calls throw through IFT.
+void CompileToDxilFromFile(DxcDllSupport& dxcSupport, LPCWSTR pShaderTextFilePath, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, LPCWSTR* pArgs, UINT32 argCount, const DxcDefine *pDefines, UINT32 defineCount, IDxcBlob **ppBlob)
+{
+  CComPtr<IDxcLibrary> pLib;
+  IFT(dxcSupport.CreateInstance(CLSID_DxcLibrary, &pLib));
+
+  CComPtr<IDxcIncludeHandler> pIncludes;
+  IFT(pLib->CreateIncludeHandler(&pIncludes));
+
+  // Load the shader source from disk.
+  UINT32 sourceCodePage(0);
+  CComPtr<IDxcBlobEncoding> pSource(nullptr);
+  IFT(pLib->CreateBlobFromFile(pShaderTextFilePath, &sourceCodePage, &pSource));
+
+  CComPtr<IDxcCompiler> pDxc;
+  IFT(dxcSupport.CreateInstance(CLSID_DxcCompiler, &pDxc));
+
+  CComPtr<IDxcOperationResult> pOpResult;
+  IFT(pDxc->Compile(pSource, pShaderTextFilePath, pEntryPoint, pTargetProfile, pArgs, argCount, pDefines, defineCount, pIncludes, &pOpResult));
+
+  HRESULT hrCompile;
+  CComPtr<IDxcBlobEncoding> pErrors;
+  IFT(pOpResult->GetStatus(&hrCompile));
+  IFT(pOpResult->GetErrorBuffer(&pErrors));
+
+  // Guard clause: report diagnostics and leave without producing a blob.
+  if (FAILED(hrCompile))
+  {
+    printErrors(pOpResult);
+    return;
+  }
+
+  IFT(pOpResult->GetResult((IDxcBlob **)ppBlob));
+}
+
+// Runs the two fallback-compiler stages (Compile, then Link) over the given
+// DXIL libraries.
+//
+//   dxrFallbackSupport  loaded DxrFallbackCompiler DLL
+//   entryName           entry name passed to Link
+//   libs                compiled DXIL library blobs (must be non-null)
+//   shaderNames         exported shaders to include
+//   shaderIds           resized to shaderNames.size() and filled by the compiler
+//   findCalledShaders   forwarded to SetFindCalledShaders
+//   ppResultBlob        receives the linked shader on success
+//
+// Returns true on success. When either stage reports a failed status, the
+// diagnostics are printed to std::cerr and false is returned. Failing API
+// calls throw through IFT.
+bool DxrCompile(
+  DxcDllSupport& dxrFallbackSupport,
+  const std::string& entryName,
+  std::vector<IDxcBlob*>& libs,
+  const std::vector<std::string>& shaderNames,
+  std::vector<DxcShaderInfo>& shaderIds,
+  bool findCalledShaders,
+  IDxcBlob** ppResultBlob)
+{
+  CComPtr<IDxcDxrFallbackCompiler> pCompiler;
+  IFT(dxrFallbackSupport.CreateInstance(CLSID_DxcDxrFallbackCompiler, &pCompiler));
+
+  // shaderNamesW owns the wide strings; shaderNamePtrs points into it and must
+  // not outlive it.
+  std::vector<std::wstring> shaderNamesW(shaderNames.size());
+  std::vector<LPCWSTR> shaderNamePtrs(shaderNames.size());
+  for (size_t i = 0; i < shaderNames.size(); ++i)
+  {
+    shaderNamesW[i] = s2ws(shaderNames[i]);
+    shaderNamePtrs[i] = shaderNamesW[i].c_str();
+  }
+
+  const UINT maxAttributeSize = 32;
+  shaderIds.resize(shaderNames.size());
+  CComPtr<IDxcOperationResult> pCompileResult;
+  CComPtr<IDxcBlob> pCompiledCollection;
+  std::vector<DxcShaderBytecode> bytecode(libs.size());
+  for (UINT i = 0; i < libs.size(); i++)
+  {
+      bytecode[i] = { (LPBYTE)libs[i]->GetBufferPointer(), (UINT32)libs[i]->GetBufferSize() };
+  }
+
+  IFT(pCompiler->SetFindCalledShaders(findCalledShaders));
+  IFT(pCompiler->SetDebugOutput(DEBUG_OUTPUT_LEVEL));
+  IFT(pCompiler->Compile(
+    bytecode.data(), libs.size(),
+    shaderNamePtrs.data(), shaderIds.data(), shaderNamePtrs.size(), maxAttributeSize,
+    &pCompileResult));
+
+  // Check the Compile stage before linking, so a failed compile is reported
+  // instead of handing a null collection to Link.
+  HRESULT compileStatus;
+  IFT(pCompileResult->GetStatus(&compileStatus));
+  if (FAILED(compileStatus))
+  {
+    std::cerr << "Compile errors\n";
+    printErrors(pCompileResult);
+    return false;
+  }
+  IFT(pCompileResult->GetResult(&pCompiledCollection));
+
+  IDxcBlob *compiledCollections[] = { pCompiledCollection };
+  CComPtr<IDxcOperationResult> pResult;
+  IFT(pCompiler->Link(
+      s2ws(entryName).c_str(),
+      compiledCollections, ARRAYSIZE(compiledCollections),
+      shaderNamePtrs.data(), shaderIds.data(), shaderNamePtrs.size(),
+      maxAttributeSize,
+      1024,
+      &pResult));
+
+  // Look at the status first; the result blob is only fetched on success.
+  HRESULT status;
+  IFT(pResult->GetStatus(&status));
+  if (FAILED(status))
+  {
+    std::cerr << "Compile errors\n";
+    printErrors(pResult);
+    return false;
+  }
+
+  IFT(pResult->GetResult(ppResultBlob));
+  return true;
+}
+
+
+// Shared harness for the fallback-layer tests. It loads the DXC and
+// DxrFallbackCompiler DLLs, compiles HLSL test files to DXIL libraries, runs a
+// linked shader through ShaderTester and compares the produced output with
+// the expected values.
+class Tester
+{
+public:
+  // deviceName: device to run on (forwarded to ShaderTester::setDevice).
+  // path:       prefix prepended to every test file name.
+  Tester(const std::string& deviceName, const std::string& path)
+    : m_deviceName(s2ws(deviceName))
+    , m_path(path)
+  {
+    dxc::EnsureEnabled(m_dxcSupport);
+    m_dxrFallbackSupport.InitializeForDll(L"DxrFallbackCompiler.dll", "DxcCreateDxrFallbackCompiler");
+  }
+
+  // Compiles `files` plus the shared test library as lib_6_1 with -O3 and
+  // replaces the current set of input blobs. A file that fails to compile is
+  // reported and left out, so no null blob is handed to the fallback
+  // compiler later.
+  void setFiles(const std::vector<std::string>& files)
+  {
+    std::vector<std::string> filesWithLib(files);
+    filesWithLib.push_back(m_testLibFilename);
+    m_inputBlobs.clear();
+    m_inputBlobPtrs.clear();
+    for (const auto& filename : filesWithLib)
+    {
+      CComPtr<IDxcBlob> pInput;
+      LPCWSTR args[] = { L"-O3" };
+      CompileToDxilFromFile(m_dxcSupport, s2ws(m_path + filename).c_str(), L"", L"lib_6_1", args, _countof(args), nullptr, 0, &pInput);
+      if (!pInput)
+      {
+        // CompileToDxilFromFile has already printed the compiler diagnostics.
+        std::cerr << "Failed to compile " << filename << "\n";
+        continue;
+      }
+      m_inputBlobs.push_back(pInput);
+      m_inputBlobPtrs.push_back(pInput);
+    }
+  }
+
+protected:
+  DxcDllSupport m_dxcSupport;
+  DxcDllSupport m_dxrFallbackSupport;
+  std::wstring m_deviceName;
+  // m_inputBlobs owns the compiled libraries; m_inputBlobPtrs mirrors it with
+  // raw pointers in the form DxrCompile expects.
+  std::vector<CComPtr<IDxcBlob>> m_inputBlobs;
+  std::vector<IDxcBlob*> m_inputBlobPtrs;
+  std::string m_path;
+  std::string m_testLibFilename = "testLib.hlsl";
+  std::string m_entryName = "CSMain";
+
+  // Runs pShader starting at initialShaderId with `input` and compares the
+  // output against expectedOutput. Returns the number of failures (0 or 1);
+  // the input is echoed when the comparison fails.
+  int runTest(CComPtr<IDxcBlob> pShader, int initialShaderId, const std::vector<int>& input, const std::vector<int>& expectedOutput)
+  {
+    std::vector<int> output;
+    std::unique_ptr<ShaderTester> tester(ShaderTester::New(pShader));
+    tester->setDevice(m_deviceName);
+    tester->runShader(initialShaderId, input, output);
+    int numFailed = checkResult(output, expectedOutput) ? 0 : 1;
+    if (numFailed)
+    {
+      std::cout << "input:";
+      for (size_t i = 0; i < input.size(); ++i)
+        std::cout << " " << input[i];
+      std::cout << "\n";
+    }
+    std::cout << "\n";
+
+    return numFailed;
+  }
+
+  // Compares shader output with the expectation and prints both when they
+  // differ. output[0] holds the number of values the shader reported and the
+  // values follow from output[1]. A count that is negative or larger than the
+  // buffer is a failure and never causes an out-of-range read.
+  bool checkResult(const std::vector<int>& output, const std::vector<int>& expectedOutput)
+  {
+    const int count = output.empty() ? 0 : output[0];
+    const size_t available = output.empty() ? 0 : output.size() - 1;
+    const size_t reported = count > 0 ? static_cast<size_t>(count) : 0;
+    std::cout << count << ": ";
+
+    // print result (only as many values as the buffer really holds)
+    const size_t printable = std::min(reported, available);
+    for (size_t i = 0; i < printable; ++i)
+      std::cout << output[i + 1] << " ";
+    std::cout << "\n";
+
+    bool passed = count >= 0 && reported == expectedOutput.size() && reported <= available;
+    if (passed)
+      passed = std::equal(expectedOutput.begin(), expectedOutput.end(), output.begin() + 1);
+
+    if (!passed)
+    {
+      std::cout << expectedOutput.size() << ": ";
+      for (size_t i = 0; i < expectedOutput.size(); ++i)
+        std::cout << expectedOutput[i] << " ";
+      std::cout << "\n";
+    }
+
+    std::cout << (passed ? "PASSED" : "FAILED") << "\n";
+
+    return passed;
+  }
+};
+
+// Tests of the fallback compiler itself: each test links one or more shaders
+// from the current input files, runs the result, and checks the logged output.
+class RtCompilerTester : public Tester
+{
+public:
+  // One test case: the shader to use as entry and the output it must produce.
+  struct TestWithEntryPoint
+  {
+    std::string entryPoint;
+    std::vector<int> expectedOutput;
+  };
+
+  RtCompilerTester(const std::string& deviceName, const std::string& path)
+    : Tester(deviceName, path)
+  {}
+
+  // Compiles and runs every test with empty input, letting the compiler find
+  // the called shaders. A test whose shader fails to compile counts as a
+  // failure.
+  //
+  // Returns the number of failures
+  int runTestsWithEntryPoints(const std::vector<TestWithEntryPoint>& tests)
+  {
+    int numFailed = 0;
+    for (auto& test : tests)
+    {
+      std::cout << test.entryPoint << "\n";
+
+      std::vector<std::string> shaderNames = { test.entryPoint };
+      std::vector<int> input;
+      std::vector<DxcShaderInfo> shaderIds;
+      CComPtr<IDxcBlob> pComputeShader;
+      if (!DxrCompile(m_dxrFallbackSupport, m_entryName, m_inputBlobPtrs, shaderNames, shaderIds, true, &pComputeShader))
+      {
+        // Same accounting as runSingleTest: a compile error is a failed test.
+        ++numFailed;
+        continue;
+      }
+      numFailed += runTest(pComputeShader, shaderIds[0].Identifier, input, test.expectedOutput);
+    }
+
+    return numFailed;
+  }
+
+  // Compiles the listed shaders without searching for called shaders and runs
+  // the first one as the entry shader with the given input. The identifier
+  // assigned to every shader is printed before the run.
+  //
+  // Returns the number of failures.
+  int runSingleTest(const std::vector<std::string>& shaderNames, const std::vector<int>& input, const std::vector<int>& expectedOutput)
+  {
+    std::vector<DxcShaderInfo> shaderIds(shaderNames.size());
+    CComPtr<IDxcBlob> pComputeShader;
+    if (!DxrCompile(m_dxrFallbackSupport, m_entryName, m_inputBlobPtrs, shaderNames, shaderIds, false, &pComputeShader))
+      return 1;
+
+    for (size_t i = 0; i < shaderNames.size(); ++i)
+      std::cout << shaderNames[i] << ":" << shaderIds[i].Identifier << " ";
+    std::cout << "\n";
+
+    return runTest(pComputeShader, shaderIds[0].Identifier, input, expectedOutput);
+  }
+
+  // Compile-only check: reports whether the shaders link with the given entry
+  // name and prints the identifier recorded for each shader.
+  void compileTest(const std::vector<std::string>& shaderNames, const std::string& entryName)
+  {
+    std::vector<DxcShaderInfo> shaderIds(shaderNames.size());
+    CComPtr<IDxcBlob> pOutput;
+    if (DxrCompile(m_dxrFallbackSupport, entryName, m_inputBlobPtrs, shaderNames, shaderIds, false, &pOutput))
+      std::cout << "Compile succeeded\n";
+    else
+      std::cout << "Compile failed\n";
+
+    for (size_t i = 0; i < shaderNames.size(); ++i)
+      std::cout << shaderNames[i] << ":" << shaderIds[i].Identifier << " ";
+    std::cout << "\n";
+  }
+};
+
+// Returns the bit pattern of `v` as an int (the host-side counterpart of the
+// HLSL asint intrinsic). The bits are copied with memcpy rather than read
+// through a casted pointer, which would violate the strict-aliasing rules.
+int asint(float v)
+{
+  static_assert(sizeof(int) == sizeof(float), "asint requires int and float of equal size");
+  int bits;
+  std::memcpy(&bits, &v, sizeof(bits));
+  return bits;
+}
+
+// Returns the float whose bit pattern equals `v` (the host-side counterpart of
+// the HLSL asfloat intrinsic). The bits are copied with memcpy rather than
+// read through a casted pointer, which would violate the strict-aliasing
+// rules.
+float asfloat(int v)
+{
+  static_assert(sizeof(int) == sizeof(float), "asfloat requires int and float of equal size");
+  float value;
+  std::memcpy(&value, &v, sizeof(value));
+  return value;
+}
+
+
+// Common base of all simulated acceleration-structure leaves. leafType is the
+// tag (a LEAF_* value) used to decide which derived type a Leaf* really is;
+// the type has no virtual functions.
+struct Leaf
+{
+  int leafType;
+};
+
+// Instance leaf: carries the instance index, instance id and instance flags
+// that the simulation writes into the test input.
+struct Instance : Leaf
+{
+  int instIdx;
+  int instId;
+  int instFlags;
+};
+
+// Base of the geometry leaves: primitive index, geometry index and the
+// geometry's opaque flag.
+struct Primitive : Leaf
+{
+  int primIdx;
+  int geomIdx;
+  int geomOpaque;
+};
+
+// Triangle leaf. t, u, v and d are the four floats written to the test input
+// for a triangle; anyHitRet is the value fed to the any-hit shader.
+struct Triangle : Primitive
+{
+  float t;
+  float u;
+  float v;
+  float d;
+  int anyHitRet;
+};
+
+// Custom-geometry leaf: a list of hits that the intersection shader reports
+// one after another.
+struct Custom : Primitive
+{
+  // One reported hit. The member order matters because Hit objects are
+  // brace-initialized positionally.
+  struct Hit
+  {
+    float t;
+    int hitKind;
+    int attr0;
+    int attr1;
+    int anyHitRet;
+  };
+
+  std::vector<Hit> hits;
+};
+
+
+
+// Allocates an instance leaf tagged LEAF_INST with the given flags, index and
+// id. The caller receives a raw pointer that nothing in this file frees.
+Instance* inst(int instFlags = 0, int instIdx = 0, int instId = 0)
+{
+  Instance* node = new Instance; // TODO: make this not leak
+  node->leafType = LEAF_INST;
+  node->instIdx = instIdx;
+  node->instId = instId;
+  node->instFlags = instFlags;
+  return node;
+}
+
+// Allocates a triangle leaf tagged LEAF_TRIS. The geometry counts as opaque
+// exactly when anyHitRet is OPAQUE. The caller receives a raw pointer that
+// nothing in this file frees.
+Triangle* tri(float t, float u, float v, int anyHitRet = OPAQUE, float d = 1, int primIdx = 0, int geomIdx = 0)
+{
+  Triangle* node = new Triangle; // TODO: make this not leak
+
+  // Leaf / Primitive part
+  node->leafType = LEAF_TRIS;
+  node->primIdx = primIdx;
+  node->geomIdx = geomIdx;
+  node->geomOpaque = (anyHitRet == OPAQUE);
+
+  // Triangle part
+  node->t = t;
+  node->u = u;
+  node->v = v;
+  node->d = d;
+  node->anyHitRet = anyHitRet;
+  return node;
+}
+
+// Allocates a custom-geometry leaf tagged LEAF_CUSTOM holding a copy of
+// `hits`. The caller receives a raw pointer that nothing in this file frees.
+Custom* custom(const std::vector<Custom::Hit>& hits, int geomOpaque = 0, int primIdx = 0, int geomIdx = 0)
+{
+  Custom* node = new Custom;
+  node->leafType = LEAF_CUSTOM;
+  node->primIdx = primIdx;
+  node->geomIdx = geomIdx;
+  node->geomOpaque = geomOpaque;
+  node->hits = hits;
+  return node;
+}
+
+
+
+// Host-side mirror of the payload the test shaders produce. It stays an
+// aggregate so tests can write {val, primIdx}; `t` is then zero-initialized.
+struct Payload
+{
+  int val;
+  int primIdx;
+  float t;
+
+  // True when any field differs. Const-qualified so it can also be applied to
+  // const payloads and temporaries.
+  bool operator!=(const Payload& other) const
+  {
+    return this->val != other.val || this->primIdx != other.primIdx || this->t != other.t;
+  }
+};
+
+// Streams a payload as "{val,primIdx}"; the t field is not printed.
+std::ostream& operator<<(std::ostream& out, Payload payload)
+{
+  return out << "{" << payload.val << "," << payload.primIdx << "}";
+}
+
+
+
+// One traversal test case. simulate() walks a list of leaves on the host and
+// records, in lock step, the `input` values the shaders will consume and the
+// `expected` values they should log. Shader identifiers are not known until
+// the shaders are compiled, so their positions in `input` are remembered in
+// shaderIdSlots and patched later by setShaderIds().
+class TestData
+{
+public:
+  std::string name;
+  std::vector<int> input;     // values fed to the shader run
+  std::vector<int> expected;  // values the shader run must log
+  std::vector<std::string> shaders;  // shaders to compile; set by simulate()
+  // shader name -> indices in `input` that must receive that shader's id
+  std::map<std::string, std::vector<int> > shaderIdSlots;
+  static int count;  // incremented once per constructed TestData
+
+  TestData(const std::string& name) : name(name) { count++; }
+
+  // Writes the compiled identifier of shaders[i] into every input slot that
+  // was reserved for that shader name. shaderIds is indexed like `shaders`.
+  void setShaderIds(std::vector<DxcShaderInfo>& shaderIds)
+  {
+    for (size_t i = 0; i < shaders.size(); ++i)
+    {
+      for (auto& slot : shaderIdSlots[shaders[i]])
+        input[slot] = shaderIds[i].Identifier;
+    }
+  }
+
+  // The hit currently committed by the simulation: the primitive, its
+  // distance, and (for custom geometry only) the hit record.
+  struct CommittedPrim
+  {
+    const Primitive* prim;
+    float t;
+    const Custom::Hit* hit;
+  };
+
+  // Simulates one trace over `leaves` with the given ray flags and fills
+  // `shaders`, `input` and `expected`. expectedPayload is only used as a
+  // cross-check: a mismatch with the simulated payload is printed, not
+  // treated as an error.
+  void simulate(Payload expectedPayload, const std::vector<Leaf*>& leaves, int rayFlags = 0)
+  {
+    shaders = { "raygen", "chTri", "ahTri", "intersection", "ahCustom", "chCustom", "miss", "Fallback_TraceRay" };
+
+    expect(RAYGEN);
+    Payload payload = { 1000, -1 };
+
+    traceRay(rayFlags);
+    expect(TRACERAY);
+
+    bool terminate = false;
+    CommittedPrim committed = { nullptr, -1, nullptr };
+    int instIdx = -1, instId = 0, instFlags = 0;
+    for (Leaf* leaf : leaves)
+    {
+      if (leaf->leafType == LEAF_INST)
+      {
+        // Downcast chosen by the leafType tag. dynamic_cast cannot be used
+        // because Leaf declares no virtual functions.
+        const Instance* i = (Instance*)(leaf);
+        instIdx = i->instIdx;
+        instId = i->instId;
+        instFlags = i->instFlags;
+        leafInst(instIdx, instId, instFlags);
+      }
+      else
+      {
+        const Primitive* prim = (Primitive*)leaf;
+        leafPrim(prim);
+        // isOpaque / cull are defined outside this class; a culled primitive
+        // contributes nothing beyond the leaf header written by leafPrim.
+        bool opaque = isOpaque(prim->geomOpaque, instFlags, rayFlags);
+        if (cull(opaque, rayFlags))
+          continue;
+
+        if (leaf->leafType == LEAF_TRIS)
+        {
+          const Triangle* tri = (Triangle*)leaf;
+          triangle(tri);
+          // The facing sign is negated when the instance declares
+          // counter-clockwise front faces.
+          float d = (instFlags & INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE) ? -tri->d : tri->d;
+          // Skip when a committed hit is at least as close, or when the
+          // facing test rejects the triangle.
+          // NOTE(review): relies on && binding tighter than ||, i.e.
+          // (committed && farther) || faceCulled.
+          if (committed.prim && (tri->t >= committed.t) || -d * computeCullFaceDir(instFlags, rayFlags) < 0)
+            continue;
+
+          if (opaque)
+          {
+            committed = { tri, tri->t, nullptr };
+          }
+          else
+          {
+            shader("ahTri");
+            expect({ ANYHIT, (int)tri->u, (int)tri->v });
+            payload.val += 100;
+            anyHitRet(tri->anyHitRet);
+            if (tri->anyHitRet == TERMINATE)
+            {
+              committed = { tri, tri->t, nullptr };
+              terminate = true;
+              break;
+            }
+            else if (tri->anyHitRet == IGNORE)
+            {
+              // do nothing
+            }
+            else // ACCEPT)
+            {
+              committed = { tri, tri->t, nullptr };
+            }
+          }
+        }
+        else if (leaf->leafType == LEAF_CUSTOM)
+        {
+          const Custom* c = (Custom*)leaf;
+          // Two shader-id slots are reserved, any-hit first and intersection
+          // second.
+          shader("ahCustom");
+          shader("intersection");
+          expect(INTERSECT + 1);
+          for (auto& hit : c->hits)
+          {
+            customHit(hit);
+            if (committed.prim && hit.t >= committed.t)
+              continue;
+            if (!opaque)
+            {
+              expect({ ANYHIT + 1, hit.attr0, hit.attr1 });
+              payload.val += 100;
+              anyHitRet(hit.anyHitRet);
+              if (hit.anyHitRet == TERMINATE)
+              {
+                committed = { c, hit.t, &hit };
+                terminate = true;
+                // NOTE(review): this break leaves only the hit loop; the
+                // leaf loop keeps going unless the flag check below fires.
+                break;
+              }
+              else if (hit.anyHitRet == IGNORE)
+              {
+                // do nothing
+                continue;
+              }
+              // ACCEPT - fall through
+            }
+            committed = { c, hit.t, &hit };
+            if (rayFlags & RAY_FLAG_TERMINATE_ON_FIRST_HIT)
+            {
+              terminate = true;
+              break;
+            }
+          }
+          // The end-of-hits marker is only written when the hit list was
+          // not cut short.
+          if (!terminate)
+            endHits();
+        }
+      }
+
+      if ((rayFlags & RAY_FLAG_TERMINATE_ON_FIRST_HIT) && committed.prim)
+      {
+        terminate = true;
+        break;
+      }
+    }
+    // Without early termination the traversal is closed with one LEAF_DONE,
+    // preceded by a second one if an instance was entered.
+    if (!terminate)
+    {
+      if (instIdx != -1)
+        endAccel();
+      endAccel();
+    }
+
+
+    // Shading: nothing is recorded when there is a hit and the closest-hit
+    // shader is to be skipped.
+    if (!(committed.prim && (rayFlags & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER)))
+    {
+      const char* ch = "chTri";
+      if (committed.prim && committed.prim->leafType == LEAF_CUSTOM)
+        ch = "chCustom";
+      shade(ch, "miss");
+      if (committed.prim)
+      {
+        if (committed.prim->leafType == LEAF_TRIS)
+        {
+          Triangle* tri = (Triangle*)committed.prim;
+          expect({ CLOSESTHIT, (int)tri->u, (int)tri->v });
+        }
+        else
+        {
+          expect({ CLOSESTHIT + 1, committed.hit->attr0, committed.hit->attr1 });
+        }
+        payload.val += 10;
+        payload.primIdx = committed.prim->primIdx;
+      }
+      else
+      {
+        expect(MISS);
+        payload.val += 1;
+      }
+    }
+
+    expect(payload.val);
+    expect(payload.primIdx);
+    if (payload != expectedPayload)
+      std::cout << count << ": simulated payload " << payload << " does not match expected " << expectedPayload << "\n";
+  }
+
+
+
+
+  // ---- input recording helpers: each appends values to `input` ----
+
+  // Ray flags of the trace.
+  void traceRay(unsigned rayFlags)
+  {
+    input.push_back(rayFlags);
+  }
+
+  // Reserves two shader-id slots: closest hit first, then miss.
+  void shade(const std::string& closestHit, const std::string& miss)
+  {
+    shader(closestHit);
+    shader(miss);
+  }
+
+  // Leaf header of a primitive: its type followed by the packed
+  // (primIdx, geomIdx, geomOpaque) word produced by pack().
+  void leafPrim(const Primitive* prim)
+  {
+    input.push_back(prim->leafType);
+    input.push_back(pack(prim->primIdx, prim->geomIdx, prim->geomOpaque));
+  }
+
+  // Triangle data: t, u, v and d as raw float bit patterns.
+  void triangle(const Triangle* tr)
+  {
+    input.push_back(asint(tr->t));
+    input.push_back(asint(tr->u));
+    input.push_back(asint(tr->v));
+    input.push_back(asint(tr->d));
+  }
+
+  // One custom hit: t as raw float bits, then hitKind, attr0, attr1.
+  void customHit(const Custom::Hit& hit)
+  {
+    input.push_back(asint(hit.t));
+    input.push_back(hit.hitKind);
+    input.push_back(hit.attr0);
+    input.push_back(hit.attr1);
+  }
+
+  // Instance leaf: LEAF_INST followed by index, id and flags.
+  void leafInst(int instIdx, int instId, int instFlags)
+  {
+    input.push_back(LEAF_INST);
+
+    input.push_back(instIdx);
+    input.push_back(instId);
+    input.push_back(instFlags);
+  }
+
+  // Reserves one input slot for the identifier of shaderName; the placeholder
+  // -1 is replaced by setShaderIds().
+  void shader(const std::string& shaderName)
+  {
+    shaderIdSlots[shaderName].push_back(input.size()); // fix up later
+    input.push_back(-1);
+  }
+
+  // Value the any-hit shader reads to decide what to do with the hit.
+  void anyHitRet(int val)
+  {
+    input.push_back(val);
+  }
+
+  // End-of-hit-list marker for a custom primitive.
+  void endHits()
+  {
+    input.push_back(-1);
+  }
+
+  // Marks the end of the current acceleration-structure level.
+  void endAccel()
+  {
+    input.push_back(LEAF_DONE);
+  }
+
+  // ---- expectation helpers: each appends values to `expected` ----
+
+  void expect(int val)
+  {
+    expected.push_back(val);
+  }
+
+  void expect(const std::vector<int>& vals)
+  {
+    expected.insert(expected.end(), vals.begin(), vals.end());
+  }
+
+  // Floats are expected as their raw bit pattern.
+  void expect(float val)
+  {
+    expected.push_back(asint(val));
+  }
+};
+
+// Starts at -1 so that the first constructed TestData raises it to 0.
+int TestData::count = -1;
+
+// Runs simulated traversal tests (TestData) against the shaders in
+// testTraversal.hlsl and testTraversal2.hlsl.
+class TraversalTester : public Tester
+{
+public:
+  TraversalTester(const std::string& deviceName, const std::string& path)
+    : Tester(deviceName, path)
+  {
+    setFiles({ "testTraversal.hlsl", "testTraversal2.hlsl" });
+  }
+
+  // Compiles and runs every test and returns the number of failed tests; a
+  // test whose shaders do not compile counts as failed. Takes ownership of
+  // the TestData pointers: each one is deleted once its test has been
+  // handled, including when compilation fails.
+  int run(const std::vector<TestData*>& tests)
+  {
+    int failedTests = 0;
+    int testIndex = 0;
+    for (auto td : tests)
+    {
+      // Own the test data from here on, so that it is released on every path
+      // out of this iteration.
+      std::unique_ptr<TestData> owned(td);
+      std::cout << testIndex++ << " " << owned->name << std::endl;
+
+      std::vector<DxcShaderInfo> shaderIds(owned->shaders.size());
+      CComPtr<IDxcBlob> pComputeShader;
+      if (!DxrCompile(m_dxrFallbackSupport, m_entryName, m_inputBlobPtrs, owned->shaders, shaderIds, false, &pComputeShader))
+      {
+        failedTests++;
+        continue;
+      }
+
+      // Patch the now-known shader identifiers into the recorded input.
+      owned->setShaderIds(shaderIds);
+      for (size_t i = 0; i < owned->shaders.size(); ++i)
+        std::cout << owned->shaders[i] << ":" << shaderIds[i].Identifier << " ";
+      std::cout << "\n";
+
+      failedTests += runTest(pComputeShader, shaderIds[0].Identifier, owned->input, owned->expected);
+    }
+    return failedTests;
+  }
+};
+
+// Builds the "nohit" case: a trace over an empty leaf list. The caller owns
+// the returned object.
+TestData* test_nohit(Payload expectedPayload)
+{
+  TestData* data = new TestData("nohit");
+  const std::vector<Leaf*> noLeaves;
+  data->simulate(expectedPayload, noLeaves);
+  return data;
+}
+
+// Builds the "instance_nohit" case: a single default instance with no
+// primitives. The caller owns the returned object.
+TestData* test_instance_nohit(Payload expectedPayload)
+{
+  TestData* data = new TestData("instance_nohit");
+  const std::vector<Leaf*> leaves = { inst() };
+  data->simulate(expectedPayload, leaves);
+  return data;
+}
+
+// Builds the "trihit" case: one instance with the given flags containing one
+// triangle at t = 1 with u = 55, v = 66, the given any-hit behavior and facing
+// value d. The caller owns the returned object.
+TestData* test_tri(Payload expectedPayload, int anyHitRet, int instFlags = 0, int rayFlags = 0, float d = 1)
+{
+  TestData* data = new TestData("trihit");
+  const std::vector<Leaf*> leaves = {
+    inst(instFlags),
+    tri(1, 55, 66, anyHitRet, d),
+  };
+  data->simulate(expectedPayload, leaves, rayFlags);
+  return data;
+}
+
+// Description of one triangle for test_2tri. Callers brace-initialize it
+// positionally as {anyHitRet, t}, so the member order must not change.
+struct TriHit
+{
+  int anyHitRet;
+  float t;
+};
+
+// Builds the "trihit2" case: one default instance with two triangles
+// (primitive indices 0 and 1). The triangle named by expectedPayload.primIdx
+// gets u = 5555 instead of its usual 55 / 56. The caller owns the returned
+// object.
+TestData* test_2tri(Payload expectedPayload, const TriHit& tri0, const TriHit& tri1, int rayFlags = 0)
+{
+  TestData* data = new TestData("trihit2");
+  const std::vector<Leaf*> leaves = {
+    inst(),
+    tri(tri0.t, (expectedPayload.primIdx == 0) ? 5555 : 55, 66, tri0.anyHitRet, 1, 0),
+    tri(tri1.t, (expectedPayload.primIdx == 1) ? 5555 : 56, 67, tri1.anyHitRet, 1, 1),
+  };
+  data->simulate(expectedPayload, leaves, rayFlags);
+  return data;
+}
+
+// Description of one custom hit for test_custom. Callers brace-initialize it
+// positionally as {anyHitRet, t}, so the member order must not change.
+struct CustomHit2
+{
+  int anyHitRet;
+  float t;
+};
+
+// Builds the "custom" case: one instance with the given flags containing one
+// custom primitive. Hit i gets hitKind 33 and the attributes (55 + i, 66 + i);
+// its distance and any-hit behavior come from hits[i]. The hit list is taken
+// by const reference, so braced lists at the call sites still bind and no copy
+// is made. The caller owns the returned object.
+TestData* test_custom(Payload expectedPayload, int geomOpaque, const std::vector<CustomHit2>& hits, int instFlags = 0, int rayFlags = 0)
+{
+  TestData* td = new TestData("custom");
+  std::vector<Custom::Hit> customHits;
+  customHits.reserve(hits.size());
+  for (size_t i = 0; i < hits.size(); ++i)
+  {
+    const CustomHit2& h = hits[i];
+    customHits.push_back({ h.t, 33, int(55 + i), int(66 + i), h.anyHitRet });
+  }
+  td->simulate(
+    expectedPayload,
+    {
+      inst(instFlags),
+      custom(customHits, geomOpaque),
+    },
+    rayFlags
+    );
+  return td;
+}
+
+
+// Prints the command-line help to std::cerr and terminates the process with
+// exit code 1.
+void printUsageAndExit()
+{
+  static const char* const usageText =
+    "Options:\n"
+    "  -h | --help                     Print this message\n"
+    "  -d | --device <name>            Name of device to use. Can be a prefix, e.g. WARP, AMD, etc.\n"
+    "  -p | --path <directory>         Base path for test input files.\n";
+
+  std::cerr << usageText << std::endl;
+
+  exit(1);
+}
+
+
+// Test driver. Parses -d/--device and -p/--path, runs the compiler tests and
+// the traversal tests, and prints a summary.
+//
+// Exit code: 0 when every test passed, 1 when at least one test failed, an
+// exception escaped, or the command line was invalid.
+int main(int argc, const char* argv[])
+{
+  std::string deviceName = "";
+  std::string basePath = DEFAULT_TEST_FILE_PATH;
+
+  // Parse arguments
+  std::vector<std::string> args;
+  for (int i = 1; i < argc; ++i)
+    args.push_back(argv[i]);
+  for (size_t i = 0; i < args.size(); ++i)
+  {
+    const bool wantsDevice = (args[i] == "-d" || args[i] == "--device");
+    const bool wantsPath = (args[i] == "-p" || args[i] == "--path");
+    if (args[i] == "-h" || args[i] == "--help")
+    {
+      printUsageAndExit();
+    }
+    else if (wantsDevice || wantsPath)
+    {
+      // Both options take a value; refuse to read past the end of the list.
+      if (i + 1 >= args.size())
+      {
+        std::cerr << "Missing value for " << args[i] << std::endl;
+        printUsageAndExit();
+      }
+      if (wantsDevice)
+        deviceName = args[++i];
+      else
+        basePath = args[++i];
+    }
+    else
+    {
+      std::cerr << "Bad arg:" << args[i] << std::endl;
+      printUsageAndExit();
+    }
+  }
+
+  try
+  {
+    if (!deviceName.empty())
+      std::cout << "Testing on device " << deviceName << std::endl;
+
+    int numFailed = 0;
+    if (1)
+    {
+      RtCompilerTester tester(deviceName, basePath);
+      tester.setFiles({ "testShader1.hlsl", "testShader2.hlsl" });
+      numFailed += tester.runTestsWithEntryPoints({
+        {"no_call", {1, 1}},
+        {"no_live_values", {1, 1, -99, 2, 2}},
+        {"single_call", {-99, 1, 1}},
+        {"single_call_in", {10}},
+        {"single_call_out", {-99, 64, 64}},
+        {"single_call_inout", {10, 64, 64}},
+        {"single_call_inout_passthru", {-98, 10, 64, 64}},
+        {"types", {-99, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8}},
+        {"multiple_calls", {-99, 1, 1, -99, 4, 4, 2, 2}},
+        {"multiple_calls_with_args", {1, 1, -99, 1, 1, -99, 2, 2, 3, 3}},
+        {"branch", {-99, 64, 64}},
+        {"no_branch", {10, 10}},
+        {"loop", {-99, 1, 1, -99, -99, -99, -99, 5, 5}},
+        {"recursive", {5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1}},
+        {"use_buffer", {-99, 10, 10}},
+        {"lower_intrinsics", {-99, 0, 0}},
+        {"local_array", {-99, 4, 4}},
+        {"dispatch_idx_and_dims", {0, 0, 1, 1}},
+      });
+      numFailed += tester.runSingleTest({ "indirect", "indirect_callee" }, { 1002 }, { -99 });
+      numFailed += tester.runSingleTest({ "raygen_tri", "chTri", "intersection", "continuation", "Fallback_TraceRay" }, { 1002 }, { -98, -97, 555, 666, -99, 1010 });
+      numFailed += tester.runSingleTest({ "raygen_custom", "chCustom1", "chCustom2", "intersection", "continuation", "Fallback_TraceRay" }, { 1003, 1005 }, { -98, -95, 19, 10, 11, 12, 13, -100, -99, 500, -96, 333, 444, -99, 1010, -98, -95, 59, 50, 51, 52, 53, -100, -99, 500, -96, 333, 444, -99, 1110 });
+
+      tester.setFiles({ "testShader3.hlsl" });
+      numFailed += tester.runSingleTest({ "pass_struct", "Fallback_TraceRay" }, {}, { -99, 1, 2, 3, 4, 5, 6, 7, 8, 11 });
+
+      tester.setFiles({ "testShader4.hlsl" });
+      numFailed += tester.runSingleTest({ "full_trace_ray", "Fallback_TraceRay" }, {}, { 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15 });
+
+      tester.setFiles({ "testShader5.hlsl" });
+      numFailed += tester.runSingleTest({ "raygen", "ch1", "ch2", "miss1", "miss2", "Fallback_TraceRay" }, {1002, 1005, 1007, 1009, 1009}, {-99, 100,0, -99,101,1, -99,102,2, -99,103,3, 2, 1, 0, -99,103,4, 0, 21111});
+    }
+
+    if (1)
+    {
+      // Expected payload is the number of invocations in the following shader types:
+      //   RG AH CH MS
+      // These counts are stored in each digit.
+      TraversalTester tester(deviceName, basePath);
+      numFailed += tester.run({
+        test_nohit({1001,-1}),
+        test_instance_nohit({1001,-1}),
+        test_tri({1010, 0}, OPAQUE),
+        test_tri({1110, 0}, ACCEPT),
+        test_tri({1101,-1}, IGNORE),
+        test_tri({1110, 0}, TERMINATE),
+
+        test_tri({1001,-1}, OPAQUE, 0, RAY_FLAG_CULL_OPAQUE), // culling
+        test_tri({1010, 0}, OPAQUE, 0, 0, -1), // flipping direction doesn't matter without culling flags
+        test_tri({1001,-1}, OPAQUE, 0, RAY_FLAG_CULL_FRONT_FACING_TRIANGLES, 1), // triangle culling
+        test_tri({1001,-1}, OPAQUE, 0, RAY_FLAG_CULL_BACK_FACING_TRIANGLES, -1), // triangle culling
+        test_tri({1010, 0}, OPAQUE, INSTANCE_FLAG_TRIANGLE_CULL_DISABLE, RAY_FLAG_CULL_BACK_FACING_TRIANGLES, -1), // disable triangle culling
+        test_tri({1010, 0}, OPAQUE, INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE, 1), // flip winding
+        test_tri({1001,-1}, OPAQUE, INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE, RAY_FLAG_CULL_BACK_FACING_TRIANGLES, 1),
+        test_tri({1010, 0}, ACCEPT, INSTANCE_FLAG_FORCE_OPAQUE),
+        test_tri({1010, 0}, ACCEPT, 0, RAY_FLAG_FORCE_OPAQUE),
+        test_tri({1110, 0}, OPAQUE, INSTANCE_FLAG_FORCE_NON_OPAQUE),
+        test_tri({1110, 0}, OPAQUE, 0, RAY_FLAG_FORCE_NON_OPAQUE),
+        test_tri({1010, 0}, ACCEPT, INSTANCE_FLAG_FORCE_NON_OPAQUE, RAY_FLAG_FORCE_OPAQUE), // ray flags opaque overrides instance
+        test_tri({1110, 0}, OPAQUE, INSTANCE_FLAG_FORCE_OPAQUE, RAY_FLAG_FORCE_NON_OPAQUE), // ray flags opaque overrides instance
+        test_tri({1010, 0}, OPAQUE, INSTANCE_FLAG_TRIANGLE_CULL_DISABLE, 0, -1), // disable cull
+        test_tri({1000,-1}, OPAQUE, 0, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER),
+
+        test_2tri({1010, 0}, {OPAQUE,    1},  {OPAQUE, 2}), // pick closest (first)
+        test_2tri({1010, 1}, {OPAQUE,    2},  {OPAQUE, 1}), // pick closest (second)
+        test_2tri({1010, 0}, {OPAQUE,    2},  {OPAQUE, 1},  RAY_FLAG_TERMINATE_ON_FIRST_HIT),
+        test_2tri({1110, 0}, {ACCEPT,    1},  {ACCEPT, 2}), // pick closest (first)
+        test_2tri({1210, 1}, {ACCEPT,    2},  {ACCEPT, 1}), // pick closest (second)
+        test_2tri({1110, 0}, {ACCEPT,    2},  {ACCEPT, 1},  RAY_FLAG_TERMINATE_ON_FIRST_HIT),
+        test_2tri({1210, 1}, {IGNORE,    1},  {ACCEPT, 2}), // ignore first (even though closer)
+        test_2tri({1210, 0}, {ACCEPT,    2},  {IGNORE, 1}), // ignore second (even though closer)
+        test_2tri({1110, 0}, {TERMINATE, 2},  {ACCEPT, 1}),
+
+        test_custom({1010, 0}, 1, {{ACCEPT, 1}}),
+        test_custom({1110, 0}, 0, {{ACCEPT, 1}}),
+        test_custom({1101,-1}, 0, {{IGNORE, 1}}),
+        test_custom({1110, 0}, 0, {{TERMINATE, 1}}),
+        test_custom({1110, 0}, 0, {{ACCEPT, 1},    {ACCEPT, 2}}), // closest first - no anyhit for second
+        test_custom({1210, 0}, 0, {{ACCEPT, 2},    {ACCEPT, 1}}), // closest second
+        test_custom({1210, 0}, 0, {{IGNORE, 1},    {ACCEPT, 2}}), // ignore closer hit
+        test_custom({1201,-1}, 0, {{IGNORE, 2},    {IGNORE, 1}}), // ignore both
+        test_custom({1201,-1}, 0, {{IGNORE, 1},    {IGNORE, 2}}), // ignore both - anyhit for both
+        test_custom({1110, 0}, 0, {{TERMINATE, 2}, {ACCEPT, 1}}), // terminate ==> don't handle second
+
+        test_custom({1001,-1}, 1, {{ACCEPT, 1}}, 0, RAY_FLAG_CULL_OPAQUE),
+        test_custom({1110, 0}, 0, {{ACCEPT, 1}}, 0, RAY_FLAG_CULL_OPAQUE), // no effect on non-opaque
+        test_custom({1010, 0}, 1, {{ACCEPT, 1}}, 0, RAY_FLAG_CULL_NON_OPAQUE), // no effect on non-opaque
+        test_custom({1001,-1}, 0, {{ACCEPT, 1}}, 0, RAY_FLAG_CULL_NON_OPAQUE),
+        test_custom({1010, 0}, 0, {{IGNORE, 1}}, INSTANCE_FLAG_FORCE_OPAQUE), //no anyhit
+        test_custom({1010, 0}, 0, {{IGNORE, 1}}, 0, RAY_FLAG_FORCE_OPAQUE),
+        test_custom({1101,-1}, 1, {{IGNORE, 1}}, INSTANCE_FLAG_FORCE_NON_OPAQUE), // anyhit drops the hit
+        test_custom({1101,-1}, 1, {{IGNORE, 1}}, 0, RAY_FLAG_FORCE_NON_OPAQUE),
+        test_custom({1010, 0}, 0, {{IGNORE, 1}}, INSTANCE_FLAG_FORCE_NON_OPAQUE, RAY_FLAG_FORCE_OPAQUE), // ray flags opaque overrides instance
+        test_custom({1101,-1}, 1, {{IGNORE, 1}}, INSTANCE_FLAG_FORCE_OPAQUE, RAY_FLAG_FORCE_NON_OPAQUE), // ray flags opaque overrides instance
+        test_custom({1001,-1}, 0, {{ACCEPT, 1}}, INSTANCE_FLAG_FORCE_OPAQUE, RAY_FLAG_CULL_OPAQUE),
+        test_custom({1100,-1}, 0, {{ACCEPT, 1}}, 0, RAY_FLAG_SKIP_CLOSEST_HIT_SHADER),
+        test_custom({1210, 0}, 0, {{IGNORE, 3}, {ACCEPT,2}, {ACCEPT, 1}}, 0, RAY_FLAG_TERMINATE_ON_FIRST_HIT),
+      });
+    }
+
+    std::cout << "===============================================\n";
+    if (numFailed == 0)
+      std::cout << "PASSED\n";
+    else
+    {
+      std::cout << "FAILED\n";
+      std::cout << numFailed << " tests failed\n";
+    }
+
+    // Let scripts and CI see the outcome through the exit code.
+    return numFailed == 0 ? 0 : 1;
+  }
+  catch (...)
+  {
+    printf("Failed - unknown error.\n");
+    return 1;
+  }
+}
+
+