瀏覽代碼

Merge pull request #1547 from tex3d/dxr-master

Merge dxr-master to master
Tex Riddell 7 年之前
父節點
當前提交
9e101a036d
共有 100 個文件被更改,包括 18360 次插入和 3724 次删除
  1. 4 4
      cmake/modules/AddLLVM.cmake
  2. 236 199
      docs/DXIL.rst
  3. 99 0
      include/dxc/DxrFallback/DxrFallbackCompiler.h
  4. 138 4
      include/dxc/HLSL/DxilConstants.h
  5. 2 0
      include/dxc/HLSL/DxilContainer.h
  6. 107 0
      include/dxc/HLSL/DxilExportMap.h
  7. 20 0
      include/dxc/HLSL/DxilFallbackLayerPass.h
  8. 22 8
      include/dxc/HLSL/DxilFunctionProps.h
  9. 14 4
      include/dxc/HLSL/DxilGenerationPass.h
  10. 519 0
      include/dxc/HLSL/DxilInstructions.h
  11. 4 2
      include/dxc/HLSL/DxilLinker.h
  12. 23 18
      include/dxc/HLSL/DxilMetadataHelper.h
  13. 60 183
      include/dxc/HLSL/DxilModule.h
  14. 12 5
      include/dxc/HLSL/DxilOperations.h
  15. 45 15
      include/dxc/HLSL/DxilPipelineStateValidation.h
  16. 473 0
      include/dxc/HLSL/DxilRuntimeReflection.h
  17. 425 0
      include/dxc/HLSL/DxilRuntimeReflection.inl
  18. 147 0
      include/dxc/HLSL/DxilShaderFlags.h
  19. 15 7
      include/dxc/HLSL/DxilShaderModel.h
  20. 69 52
      include/dxc/HLSL/DxilSigPoint.inl
  21. 1 1
      include/dxc/HLSL/DxilSignature.h
  22. 49 9
      include/dxc/HLSL/DxilUtil.h
  23. 22 1
      include/dxc/HLSL/DxilValidation.h
  24. 5 0
      include/dxc/HLSL/HLMatrixLowerHelper.h
  25. 25 3
      include/dxc/HLSL/HLModule.h
  26. 7 0
      include/dxc/HLSL/HLOperations.h
  27. 24 0
      include/dxc/HlslIntrinsicOp.h
  28. 4 0
      include/dxc/Support/HLSLOptions.h
  29. 9 1
      include/dxc/Support/HLSLOptions.td
  30. 1 0
      include/dxc/Support/WinAdapter.h
  31. 4 0
      include/dxc/dxcapi.h
  32. 5 1
      include/dxc/dxcapi.internal.h
  33. 113 0
      include/dxc/dxcdxrfallbackcompiler.h
  34. 3 0
      include/llvm/Support/FileSystem.h
  35. 1 1
      include/llvm/Support/raw_ostream.h
  36. 1 0
      lib/CMakeLists.txt
  37. 72 11
      lib/DxcSupport/HLSLOptions.cpp
  38. 14 0
      lib/DxrFallback/CMakeLists.txt
  39. 864 0
      lib/DxrFallback/DxrFallbackCompiler.cpp
  40. 148 0
      lib/DxrFallback/FunctionBuilder.h
  41. 16 0
      lib/DxrFallback/LLVMBuild.txt
  42. 122 0
      lib/DxrFallback/LLVMUtils.cpp
  43. 34 0
      lib/DxrFallback/LLVMUtils.h
  44. 337 0
      lib/DxrFallback/LiveValues.cpp
  45. 81 0
      lib/DxrFallback/LiveValues.h
  46. 356 0
      lib/DxrFallback/Reducibility.cpp
  47. 10 0
      lib/DxrFallback/Reducibility.h
  48. 1797 0
      lib/DxrFallback/StateFunctionTransform.cpp
  49. 295 0
      lib/DxrFallback/StateFunctionTransform.h
  50. 26 0
      lib/DxrFallback/readme.md
  51. 1974 0
      lib/DxrFallback/runtime.h
  52. 62 0
      lib/DxrFallback/runtime/rewriteRuntime.py
  53. 658 0
      lib/DxrFallback/runtime/runtime.c
  54. 9 0
      lib/DxrFallback/runtime/script.cmd
  55. 3 0
      lib/HLSL/CMakeLists.txt
  56. 7 2
      lib/HLSL/DxcOptimizer.cpp
  57. 2 1
      lib/HLSL/DxilAddPixelHitInstrumentation.cpp
  58. 1851 271
      lib/HLSL/DxilCondenseResources.cpp
  59. 583 35
      lib/HLSL/DxilContainerAssembler.cpp
  60. 448 58
      lib/HLSL/DxilContainerReflection.cpp
  61. 2 1
      lib/HLSL/DxilDebugInstrumentation.cpp
  62. 8 6
      lib/HLSL/DxilEliminateOutputDynamicIndexing.cpp
  63. 27 0
      lib/HLSL/DxilEntryProps.h
  64. 221 0
      lib/HLSL/DxilExportMap.cpp
  65. 169 616
      lib/HLSL/DxilGenerationPass.cpp
  66. 3 2
      lib/HLSL/DxilLegalizeSampleOffsetPass.cpp
  67. 560 157
      lib/HLSL/DxilLinker.cpp
  68. 283 57
      lib/HLSL/DxilMetadataHelper.cpp
  69. 389 460
      lib/HLSL/DxilModule.cpp
  70. 525 242
      lib/HLSL/DxilOperations.cpp
  71. 1155 0
      lib/HLSL/DxilPatchShaderRecordBindings.cpp
  72. 75 0
      lib/HLSL/DxilPatchShaderRecordBindingsShared.h
  73. 97 24
      lib/HLSL/DxilPreparePasses.cpp
  74. 5 5
      lib/HLSL/DxilPreserveAllOutputs.cpp
  75. 4 0
      lib/HLSL/DxilResource.cpp
  76. 1 1
      lib/HLSL/DxilResourceBase.cpp
  77. 380 0
      lib/HLSL/DxilShaderFlags.cpp
  78. 27 4
      lib/HLSL/DxilShaderModel.cpp
  79. 8 4
      lib/HLSL/DxilSignature.cpp
  80. 3 3
      lib/HLSL/DxilTypeSystem.cpp
  81. 245 26
      lib/HLSL/DxilUtil.cpp
  82. 473 219
      lib/HLSL/DxilValidation.cpp
  83. 332 211
      lib/HLSL/HLMatrixLowerPass.cpp
  84. 57 3
      lib/HLSL/HLModule.cpp
  85. 407 217
      lib/HLSL/HLOperationLower.cpp
  86. 2 0
      lib/HLSL/HLOperations.cpp
  87. 3 1
      lib/HLSL/HLSignatureLower.cpp
  88. 2 1
      lib/LLVMBuild.txt
  89. 65 18
      lib/Support/Windows/MSFileSystem.inc.cpp
  90. 1 1
      lib/Support/raw_ostream.cpp
  91. 16 6
      lib/Transforms/IPO/PassManagerBuilder.cpp
  92. 2 2
      lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
  93. 12 0
      lib/Transforms/Scalar/GVN.cpp
  94. 7 7
      lib/Transforms/Scalar/Reg2MemHLSL.cpp
  95. 2 0
      lib/Transforms/Scalar/SROA.cpp
  96. 298 532
      lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
  97. 1 1
      lib/Transforms/Utils/InlineFunction.cpp
  98. 6 0
      lib/Transforms/Utils/Local.cpp
  99. 8 0
      tools/clang/include/clang/AST/HlslTypes.h
  100. 2 1
      tools/clang/include/clang/AST/PrettyPrinter.h

+ 4 - 4
cmake/modules/AddLLVM.cmake

@@ -959,15 +959,15 @@ function(hlsl_update_product_ver RC_INTERNAL_NAME)
                  PROPERTY COMPILE_DEFINITIONS
                  "RC_COMPANY_NAME=\"Microsoft(r) Corporation\""
                  "RC_VERSION_FIELD_1=0"
-                 "RC_VERSION_FIELD_2=2017"
-                 "RC_VERSION_FIELD_3=11"
+                 "RC_VERSION_FIELD_2=2018"
+                 "RC_VERSION_FIELD_3=08"
                  "RC_VERSION_FIELD_4=0"
-                 "RC_FILE_VERSION=\"0.2017.11.0\""
+                 "RC_FILE_VERSION=\"0.2018.08.0\""
                  "RC_FILE_DESCRIPTION=\"DirectX Compiler - Out Of Band\""
                  "RC_INTERNAL_NAME=\"${RC_INTERNAL_NAME}\""
                  "RC_COPYRIGHT=\"(c) Microsoft Corporation. All rights reserved.\""
                  "RC_PRODUCT_NAME=\"Microsoft(r) DirectX for Windows(r) - Out Of Band\""
-                 "RC_PRODUCT_VERSION=\"0.2017.11.0\"")
+                 "RC_PRODUCT_VERSION=\"0.2018.08.0\"")
   endif (HLSL_ENABLE_FIXED_VER)
 endfunction(hlsl_update_product_ver)
 # HLSL Change Ends

+ 236 - 199
docs/DXIL.rst

@@ -2083,9 +2083,9 @@ Opcodes are defined on a dense range and will be provided as enum in a header fi
 .. <py::lines('OPCODES-RST')>hctdb_instrhelp.get_opcodes_rst()</py>
 .. OPCODES-RST:BEGIN
 
-=== ============================= =================================================================================================================
+=== ============================= =======================================================================================================================================================================================================================
 ID  Name                          Description
-=== ============================= =================================================================================================================
+=== ============================= =======================================================================================================================================================================================================================
 0   TempRegLoad_                  Helper load operation
 1   TempRegStore_                 Helper store operation
 2   MinPrecXRegLoad_              Helper load operation for minprecision
@@ -2227,7 +2227,28 @@ ID  Name                          Description
 138 ViewID                        returns the view index
 139 RawBufferLoad                 reads from a raw buffer and structured buffer
 140 RawBufferStore                writes to a RWByteAddressBuffer or RWStructuredBuffer
-=== ============================= =================================================================================================================
+141 InstanceID                    The user-provided InstanceID on the bottom-level acceleration structure instance within the top-level structure
+142 InstanceIndex                 The autogenerated index of the current instance in the top-level structure
+143 HitKind                       Returns the value passed as HitKind in ReportIntersection().  If intersection was reported by fixed-function triangle intersection, HitKind will be one of HIT_KIND_TRIANGLE_FRONT_FACE or HIT_KIND_TRIANGLE_BACK_FACE.
+144 RayFlags                      uint containing the current ray flags.
+145 DispatchRaysIndex             The current x and y location within the Width and Height
+146 DispatchRaysDimensions        The Width and Height values from the D3D12_DISPATCH_RAYS_DESC structure provided to the originating DispatchRays() call.
+147 WorldRayOrigin                The world-space origin for the current ray.
+148 WorldRayDirection             The world-space direction for the current ray.
+149 ObjectRayOrigin               Object-space origin for the current ray.
+150 ObjectRayDirection            Object-space direction for the current ray.
+151 ObjectToWorld                 Matrix for transforming from object-space to world-space.
+152 WorldToObject                 Matrix for transforming from world-space to object-space.
+153 RayTMin                       float representing the parametric starting point for the ray.
+154 RayTCurrent                   float representing the current parametric ending point for the ray
+155 IgnoreHit                     Used in an any hit shader to reject an intersection and terminate the shader
+156 AcceptHitAndEndSearch         Used in an any hit shader to abort the ray query and the intersection shader (if any). The current hit is committed and execution passes to the closest hit shader with the closest hit recorded so far
+157 TraceRay                      initiates raytrace
+158 ReportHit                     returns true if hit was accepted
+159 CallShader                    Call a shader in the callable shader table supplied through the DispatchRays() API
+160 CreateHandleForLib            create resource handle from resource struct for library
+161 PrimitiveIndex                PrimitiveIndex for raytracing shaders
+=== ============================= =======================================================================================================================================================================================================================
 
 
 Acos
@@ -2873,202 +2894,218 @@ The set of validation rules that are known to hold for a DXIL program is identif
 .. <py::lines('VALRULES-RST')>hctdb_instrhelp.get_valrules_rst()</py>
 .. VALRULES-RST:BEGIN
 
-====================================== =======================================================================================================================================================================================================================================================================================================
-Rule Code                              Description
-====================================== =======================================================================================================================================================================================================================================================================================================
-BITCODE.VALID                          TODO - Module must be bitcode-valid
-CONTAINER.PARTINVALID                  DXIL Container must not contain unknown parts
-CONTAINER.PARTMATCHES                  DXIL Container Parts must match Module
-CONTAINER.PARTMISSING                  DXIL Container requires certain parts, corresponding to module
-CONTAINER.PARTREPEATED                 DXIL Container must have only one of each part type
-CONTAINER.ROOTSIGNATUREINCOMPATIBLE    Root Signature in DXIL Container must be compatible with shader
-DECL.DXILFNEXTERN                      External function must be a DXIL function
-DECL.DXILNSRESERVED                    The DXIL reserved prefixes must only be used by built-in functions and types
-DECL.FNATTRIBUTE                       Functions should only contain known function attributes
-DECL.FNFLATTENPARAM                    Function parameters must not use struct types
-DECL.FNISCALLED                        Functions can only be used by call instructions
-DECL.NOTUSEDEXTERNAL                   External declaration should not be used
-DECL.USEDEXTERNALFUNCTION              External function must be used
-DECL.USEDINTERNAL                      Internal declaration must be used
-FLOW.DEADLOOP                          Loop must have break
-FLOW.FUNCTIONCALL                      Function with parameter is not permitted
-FLOW.NORECUSION                        Recursion is not permitted
-FLOW.REDUCIBLE                         Execution flow must be reducible
-INSTR.ALLOWED                          Instructions must be of an allowed type
-INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function.
-INSTR.BARRIERMODEFORNONCS              sync in a non-Compute Shader must only sync UAV (sync_uglobal)
-INSTR.BARRIERMODENOMEMORY              sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional.
-INSTR.BARRIERMODEUSELESSUGROUP         sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
-INSTR.BUFFERUPDATECOUNTERONUAV         BufferUpdateCounter valid only on UAV
-INSTR.CALLOLOAD                        Call to DXIL intrinsic must match overload signature
-INSTR.CANNOTPULLPOSITION               pull-model evaluation of position disallowed
-INSTR.CBUFFERCLASSFORCBUFFERHANDLE     Expect Cbuffer for CBufferLoad handle
-INSTR.CBUFFEROUTOFBOUND                Cbuffer access out of bound
-INSTR.CHECKACCESSFULLYMAPPED           CheckAccessFullyMapped should only used on resource status
-INSTR.COORDINATECOUNTFORRAWTYPEDBUF    raw/typed buffer don't need 2 coordinates
-INSTR.COORDINATECOUNTFORSTRUCTBUF      structured buffer require 2 coordinates
-INSTR.CREATEHANDLEIMMRANGEID           Local resource must map to global resource.
-INSTR.DXILSTRUCTUSER                   Dxil struct types should only used by ExtractValue
-INSTR.DXILSTRUCTUSEROUTOFBOUND         Index out of bound when extract value from dxil struct types
-INSTR.EVALINTERPOLATIONMODE            Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample
-INSTR.EXTRACTVALUE                     ExtractValue should only be used on dxil struct types and cmpxchg
-INSTR.FAILTORESLOVETGSMPOINTER         TGSM pointers must originate from an unambiguous TGSM global variable.
-INSTR.HANDLENOTFROMCREATEHANDLE        Resource handle should returned by createHandle
-INSTR.IMMBIASFORSAMPLEB                bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate
-INSTR.INBOUNDSACCESS                   Access to out-of-bounds memory is disallowed
-INSTR.MINPRECISIONNOTPRECISE           Instructions marked precise may not refer to minprecision values
-INSTR.MINPRECISONBITCAST               Bitcast on minprecison types is not allowed
-INSTR.MIPLEVELFORGETDIMENSION          Use mip level on buffer when GetDimensions
-INSTR.MIPONUAVLOAD                     uav load don't support mipLevel/sampleIndex
-INSTR.NOGENERICPTRADDRSPACECAST        Address space cast between pointer types must have one part to be generic address space
-INSTR.NOIDIVBYZERO                     No signed integer division by zero
-INSTR.NOINDEFINITEACOS                 No indefinite arccosine
-INSTR.NOINDEFINITEASIN                 No indefinite arcsine
-INSTR.NOINDEFINITEDSXY                 No indefinite derivative calculation
-INSTR.NOINDEFINITELOG                  No indefinite logarithm
-INSTR.NOREADINGUNINITIALIZED           Instructions should not read uninitialized value
-INSTR.NOUDIVBYZERO                     No unsigned integer division by zero
-INSTR.OFFSETONUAVLOAD                  uav load don't support offset
-INSTR.OLOAD                            DXIL intrinsic overload must be valid
-INSTR.ONLYONEALLOCCONSUME              RWStructuredBuffers may increment or decrement their counters, but not both.
-INSTR.OPCODERESERVED                   Instructions must not reference reserved opcodes
-INSTR.OPCONST                          DXIL intrinsic requires an immediate constant operand
-INSTR.OPCONSTRANGE                     Constant values must be in-range for operation
-INSTR.OPERANDRANGE                     DXIL intrinsic operand must be within defined range
-INSTR.PTRBITCAST                       Pointer type bitcast must be have same size
-INSTR.RESOURCECLASSFORLOAD             load can only run on UAV/SRV resource
-INSTR.RESOURCECLASSFORSAMPLERGATHER    sample, lod and gather should on srv resource.
-INSTR.RESOURCECLASSFORUAVSTORE         store should on uav resource.
-INSTR.RESOURCECOORDINATEMISS           coord uninitialized
-INSTR.RESOURCECOORDINATETOOMANY        out of bound coord must be undef
-INSTR.RESOURCEKINDFORBUFFERLOADSTORE   buffer load/store only works on Raw/Typed/StructuredBuffer
-INSTR.RESOURCEKINDFORCALCLOD           lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray
-INSTR.RESOURCEKINDFORGATHER            gather requires resource declared as texture/2D/Cube/2DArray/CubeArray
-INSTR.RESOURCEKINDFORGETDIM            Invalid resource kind on GetDimensions
-INSTR.RESOURCEKINDFORSAMPLE            sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray
-INSTR.RESOURCEKINDFORSAMPLEC           samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray
-INSTR.RESOURCEKINDFORTEXTURELOAD       texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray
-INSTR.RESOURCEKINDFORTEXTURESTORE      texture store only works on Texture1D/1DArray/2D/2DArray/3D
-INSTR.RESOURCEOFFSETMISS               offset uninitialized
-INSTR.RESOURCEOFFSETTOOMANY            out of bound offset must be undef
-INSTR.SAMPLECOMPTYPE                   sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
-INSTR.SAMPLEINDEXFORLOAD2DMS           load on Texture2DMS/2DMSArray require sampleIndex
-INSTR.SAMPLERMODEFORLOD                lod instruction requires sampler declared in default mode
-INSTR.SAMPLERMODEFORSAMPLE             sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode
-INSTR.SAMPLERMODEFORSAMPLEC            sample_c_*/gather_c instructions require sampler declared in comparison mode
-INSTR.STATUS                           Resource status should only used by CheckAccessFullyMapped
-INSTR.STRUCTBITCAST                    Bitcast on struct types is not allowed
-INSTR.TEXTUREOFFSET                    offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7
-INSTR.TGSMRACECOND                     Race condition writing to shared memory detected, consider making this write conditional
-INSTR.UNDEFRESULTFORGETDIMENSION       GetDimensions used undef dimension %0 on %1
-INSTR.WRITEMASKFORTYPEDUAVSTORE        store on typed uav must write to all four components of the UAV
-INSTR.WRITEMASKMATCHVALUEFORUAVSTORE   uav store write mask must match store value mask, write mask is %0 and store value mask is %1
-META.BARYCENTRICSFLOAT3                only 'float3' type is allowed for SV_Barycentrics.
-META.BARYCENTRICSINTERPOLATION         SV_Barycentrics cannot be used with 'nointerpolation' type
-META.BARYCENTRICSTWOPERSPECTIVES       There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode.
-META.BRANCHFLATTEN                     Can't use branch and flatten attributes together
-META.CLIPCULLMAXCOMPONENTS             Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
-META.CLIPCULLMAXROWS                   Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
-META.CONTROLFLOWHINTNOTONCONTROLFLOW   Control flow hint only works on control flow inst
-META.DENSERESIDS                       Resource identifiers must be zero-based and dense
-META.DUPLICATESYSVALUE                 System value may only appear once in signature
-META.ENTRYFUNCTION                     entrypoint not found
-META.FLAGSUSAGE                        Flags must match usage
-META.FORCECASEONSWITCH                 Attribute forcecase only works for switch
-META.FUNCTIONANNOTATION                Cannot find function annotation for %0
-META.GLCNOTONAPPENDCONSUME             globallycoherent cannot be used with append/consume buffers
-META.INTEGERINTERPMODE                 Interpolation mode on integer must be Constant
-META.INTERPMODEINONEROW                Interpolation mode must be identical for all elements packed into the same row.
-META.INTERPMODEVALID                   Interpolation mode must be valid
-META.INVALIDCONTROLFLOWHINT            Invalid control flow hint
-META.KNOWN                             Named metadata should be known
-META.MAXTESSFACTOR                     Hull Shader MaxTessFactor must be [%0..%1].  %2 specified
-META.NOSEMANTICOVERLAP                 Semantics must not overlap
-META.REQUIRED                          TODO - Required metadata missing
-META.SEMAKINDMATCHESNAME               Semantic name must match system value, when defined.
-META.SEMAKINDVALID                     Semantic kind must be valid
-META.SEMANTICCOMPTYPE                  %0 must be %1
-META.SEMANTICINDEXMAX                  System value semantics have a maximum valid semantic index
-META.SEMANTICLEN                       Semantic length must be at least 1 and at most 64
-META.SEMANTICSHOULDBEALLOCATED         Semantic should have a valid packing location
-META.SEMANTICSHOULDNOTBEALLOCATED      Semantic should have a packing location of -1
-META.SIGNATURECOMPTYPE                 signature %0 specifies unrecognized or invalid component type
-META.SIGNATUREDATAWIDTH                Data width must be identical for all elements packed into the same row.
-META.SIGNATUREILLEGALCOMPONENTORDER    Component ordering for packed elements must be: arbitrary < system value < system generated value
-META.SIGNATUREINDEXCONFLICT            Only elements with compatible indexing rules may be packed together
-META.SIGNATUREOUTOFRANGE               Signature elements must fit within maximum signature size
-META.SIGNATUREOVERLAP                  Signature elements may not overlap in packing location.
-META.STRUCTBUFALIGNMENT                StructuredBuffer stride not aligned
-META.STRUCTBUFALIGNMENTOUTOFBOUND      StructuredBuffer stride out of bounds
-META.SYSTEMVALUEROWS                   System value may only have 1 row
-META.TARGET                            Target triple must be 'dxil-ms-dx'
-META.TESSELLATOROUTPUTPRIMITIVE        Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW.
-META.TESSELLATORPARTITION              Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even.
-META.TEXTURETYPE                       elements of typed buffers and textures must fit in four 32-bit quantities
-META.USED                              All metadata must be used by dxil
-META.VALIDSAMPLERMODE                  Invalid sampler mode on sampler
-META.VALUERANGE                        Metadata value must be within range
-META.WELLFORMED                        TODO - Metadata must be well-formed in operand count and types
-SM.APPENDANDCONSUMEONSAMEUAV           BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
-SM.CBUFFERELEMENTOVERFLOW              CBuffer elements must not overflow
-SM.CBUFFEROFFSETOVERLAP                CBuffer offsets must not overlap
-SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT     D3D12 constant/texture buffer template element can only be a struct
-SM.COMPLETEPOSITION                    Not all elements of SV_Position were written
-SM.COUNTERONLYONSTRUCTBUF              BufferUpdateCounter valid only on structured buffers
-SM.CSNORETURN                          Compute shaders can't return values, outputs must be written in writable resources (UAVs).
-SM.DOMAINLOCATIONIDXOOB                DomainLocation component index out of bounds for the domain.
-SM.DSINPUTCONTROLPOINTCOUNTRANGE       DS input control point count must be [0..%0].  %1 specified
-SM.DXILVERSION                         Target shader model requires specific Dxil Version
-SM.GSINSTANCECOUNTRANGE                GS instance count must be [1..%0].  %1 specified
-SM.GSOUTPUTVERTEXCOUNTRANGE            GS output vertex count must be [0..%0].  %1 specified
-SM.GSTOTALOUTPUTVERTEXDATARANGE        Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3
-SM.GSVALIDINPUTPRIMITIVE               GS input primitive unrecognized
-SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY      GS output primitive topology unrecognized
-SM.HSINPUTCONTROLPOINTCOUNTRANGE       HS input control point count must be [0..%0].  %1 specified
-SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH  For pass thru hull shader, input control point count must match output control point count
-SM.INSIDETESSFACTORSIZEMATCHDOMAIN     InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
-SM.INVALIDRESOURCECOMPTYPE             Invalid resource return type
-SM.INVALIDRESOURCEKIND                 Invalid resources kind
-SM.INVALIDTEXTUREKINDONUAV             Texture2DMS[Array] or TextureCube[Array] resources are not supported with UAVs
-SM.ISOLINEOUTPUTPRIMITIVEMISMATCH      Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
-SM.MAXTGSMSIZE                         Total Thread Group Shared Memory storage is %0, exceeded %1
-SM.MAXTHEADGROUP                       Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1
-SM.MULTISTREAMMUSTBEPOINT              When multiple GS output streams are used they must be pointlists
-SM.NAME                                Target shader model name must be known
-SM.NOINTERPMODE                        Interpolation mode must be undefined for VS input/PS output/patch constant.
-SM.NOPSOUTPUTIDX                       Pixel shader output registers are not indexable.
-SM.OPCODE                              Opcode must be defined in target shader model
-SM.OPCODEININVALIDFUNCTION             Invalid DXIL opcode usage like StorePatchConstant in patch constant function
-SM.OPERAND                             Operand must be defined in target shader model
-SM.OUTPUTCONTROLPOINTCOUNTRANGE        output control point count must be [0..%0].  %1 specified
-SM.OUTPUTCONTROLPOINTSTOTALSCALARS     Total number of scalars across all HS output control points must not exceed
-SM.PATCHCONSTANTONLYFORHSDS            patch constant signature only valid in HS and DS
-SM.PSCONSISTENTINTERP                  Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample)
-SM.PSCOVERAGEANDINNERCOVERAGE          InnerCoverage and Coverage are mutually exclusive.
-SM.PSMULTIPLEDEPTHSEMANTIC             Pixel Shader only allows one type of depth semantic to be declared
-SM.PSOUTPUTSEMANTIC                    Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found
-SM.PSTARGETCOL0                        SV_Target packed location must start at column 0
-SM.PSTARGETINDEXMATCHESROW             SV_Target semantic index must match packed row location
-SM.RESOURCERANGEOVERLAP                Resource ranges must not overlap
-SM.ROVONLYINPS                         RasterizerOrdered objects are only allowed in 5.0+ pixel shaders
-SM.SAMPLECOUNTONLYON2DMS               Only Texture2DMS/2DMSArray could has sample count
-SM.SEMANTIC                            Semantic must be defined in target shader model
-SM.STREAMINDEXRANGE                    Stream index (%0) must between 0 and %1
-SM.TESSFACTORFORDOMAIN                 Required TessFactor for domain not found declared anywhere in Patch Constant data
-SM.TESSFACTORSIZEMATCHDOMAIN           TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
-SM.THREADGROUPCHANNELRANGE             Declared Thread Group %0 size %1 outside valid range [%2..%3]
-SM.TRIOUTPUTPRIMITIVEMISMATCH          Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain
-SM.UNDEFINEDOUTPUT                     Not all elements of output %0 were written
-SM.VALIDDOMAIN                         Invalid Tessellator Domain specified. Must be isoline, tri or quad
-SM.VIEWIDNEEDSSLOT                     ViewID requires compatible space in pixel shader input signature
-SM.ZEROHSINPUTCONTROLPOINTWITHINPUT    When HS input control point count is 0, no input signature should exist
-TYPES.DEFINED                          Type must be defined based on DXIL primitives
-TYPES.I8                               I8 can only used as immediate value for intrinsic
-TYPES.INTWIDTH                         Int type must be of valid width
-TYPES.NOMULTIDIM                       Only one dimension allowed for array type
-TYPES.NOVECTOR                         Vector types must not be present
-UNI.NOWAVESENSITIVEGRADIENT            Gradient operations are not affected by wave-sensitive data or control flow.
-====================================== =======================================================================================================================================================================================================================================================================================================
+======================================== =======================================================================================================================================================================================================================================================================================================
+Rule Code                                Description
+======================================== =======================================================================================================================================================================================================================================================================================================
+BITCODE.VALID                            TODO - Module must be bitcode-valid
+CONTAINER.PARTINVALID                    DXIL Container must not contain unknown parts
+CONTAINER.PARTMATCHES                    DXIL Container Parts must match Module
+CONTAINER.PARTMISSING                    DXIL Container requires certain parts, corresponding to module
+CONTAINER.PARTREPEATED                   DXIL Container must have only one of each part type
+CONTAINER.ROOTSIGNATUREINCOMPATIBLE      Root Signature in DXIL Container must be compatible with shader
+DECL.ATTRSTRUCT                          Attributes parameter must be struct type
+DECL.DXILFNEXTERN                        External function must be a DXIL function
+DECL.DXILNSRESERVED                      The DXIL reserved prefixes must only be used by built-in functions and types
+DECL.EXTRAARGS                           Extra arguments not allowed for shader functions
+DECL.FNATTRIBUTE                         Functions should only contain known function attributes
+DECL.FNFLATTENPARAM                      Function parameters must not use struct types
+DECL.FNISCALLED                          Functions can only be used by call instructions
+DECL.NOTUSEDEXTERNAL                     External declaration should not be used
+DECL.PARAMSTRUCT                         Callable function parameter must be struct type
+DECL.PAYLOADSTRUCT                       Payload parameter must be struct type
+DECL.RESOURCEINFNSIG                     Resources not allowed in function signatures
+DECL.SHADERMISSINGARG                    payload/params/attributes parameter is required for certain shader types
+DECL.SHADERRETURNVOID                    Shader functions must return void
+DECL.USEDEXTERNALFUNCTION                External function must be used
+DECL.USEDINTERNAL                        Internal declaration must be used
+FLOW.DEADLOOP                            Loop must have break
+FLOW.FUNCTIONCALL                        Function with parameter is not permitted
+FLOW.NORECUSION                          Recursion is not permitted
+FLOW.REDUCIBLE                           Execution flow must be reducible
+INSTR.ALLOWED                            Instructions must be of an allowed type
+INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION   Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function.
+INSTR.BARRIERMODEFORNONCS                sync in a non-Compute Shader must only sync UAV (sync_uglobal)
+INSTR.BARRIERMODENOMEMORY                sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional.
+INSTR.BARRIERMODEUSELESSUGROUP           sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
+INSTR.BUFFERUPDATECOUNTERONRESHASCOUNTER BufferUpdateCounter valid only when HasCounter is true
+INSTR.BUFFERUPDATECOUNTERONUAV           BufferUpdateCounter valid only on UAV
+INSTR.CALLOLOAD                          Call to DXIL intrinsic must match overload signature
+INSTR.CANNOTPULLPOSITION                 pull-model evaluation of position disallowed
+INSTR.CBUFFERCLASSFORCBUFFERHANDLE       Expect Cbuffer for CBufferLoad handle
+INSTR.CBUFFEROUTOFBOUND                  Cbuffer access out of bound
+INSTR.CHECKACCESSFULLYMAPPED             CheckAccessFullyMapped should only used on resource status
+INSTR.COORDINATECOUNTFORRAWTYPEDBUF      raw/typed buffer don't need 2 coordinates
+INSTR.COORDINATECOUNTFORSTRUCTBUF        structured buffer require 2 coordinates
+INSTR.CREATEHANDLEIMMRANGEID             Local resource must map to global resource.
+INSTR.DXILSTRUCTUSER                     Dxil struct types should only used by ExtractValue
+INSTR.DXILSTRUCTUSEROUTOFBOUND           Index out of bound when extract value from dxil struct types
+INSTR.EVALINTERPOLATIONMODE              Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample
+INSTR.EXTRACTVALUE                       ExtractValue should only be used on dxil struct types and cmpxchg
+INSTR.FAILTORESLOVETGSMPOINTER           TGSM pointers must originate from an unambiguous TGSM global variable.
+INSTR.HANDLENOTFROMCREATEHANDLE          Resource handle should returned by createHandle
+INSTR.IMMBIASFORSAMPLEB                  bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate
+INSTR.INBOUNDSACCESS                     Access to out-of-bounds memory is disallowed
+INSTR.MINPRECISIONNOTPRECISE             Instructions marked precise may not refer to minprecision values
+INSTR.MINPRECISONBITCAST                 Bitcast on minprecison types is not allowed
+INSTR.MIPLEVELFORGETDIMENSION            Use mip level on buffer when GetDimensions
+INSTR.MIPONUAVLOAD                       uav load don't support mipLevel/sampleIndex
+INSTR.NOGENERICPTRADDRSPACECAST          Address space cast between pointer types must have one part to be generic address space
+INSTR.NOIDIVBYZERO                       No signed integer division by zero
+INSTR.NOINDEFINITEACOS                   No indefinite arccosine
+INSTR.NOINDEFINITEASIN                   No indefinite arcsine
+INSTR.NOINDEFINITEDSXY                   No indefinite derivative calculation
+INSTR.NOINDEFINITELOG                    No indefinite logarithm
+INSTR.NOREADINGUNINITIALIZED             Instructions should not read uninitialized value
+INSTR.NOUDIVBYZERO                       No unsigned integer division by zero
+INSTR.OFFSETONUAVLOAD                    uav load don't support offset
+INSTR.OLOAD                              DXIL intrinsic overload must be valid
+INSTR.ONLYONEALLOCCONSUME                RWStructuredBuffers may increment or decrement their counters, but not both.
+INSTR.OPCODERESERVED                     Instructions must not reference reserved opcodes
+INSTR.OPCONST                            DXIL intrinsic requires an immediate constant operand
+INSTR.OPCONSTRANGE                       Constant values must be in-range for operation
+INSTR.OPERANDRANGE                       DXIL intrinsic operand must be within defined range
+INSTR.PTRBITCAST                         Pointer type bitcast must be have same size
+INSTR.RESOURCECLASSFORLOAD               load can only run on UAV/SRV resource
+INSTR.RESOURCECLASSFORSAMPLERGATHER      sample, lod and gather should on srv resource.
+INSTR.RESOURCECLASSFORUAVSTORE           store should on uav resource.
+INSTR.RESOURCECOORDINATEMISS             coord uninitialized
+INSTR.RESOURCECOORDINATETOOMANY          out of bound coord must be undef
+INSTR.RESOURCEKINDFORBUFFERLOADSTORE     buffer load/store only works on Raw/Typed/StructuredBuffer
+INSTR.RESOURCEKINDFORCALCLOD             lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray
+INSTR.RESOURCEKINDFORGATHER              gather requires resource declared as texture/2D/Cube/2DArray/CubeArray
+INSTR.RESOURCEKINDFORGETDIM              Invalid resource kind on GetDimensions
+INSTR.RESOURCEKINDFORSAMPLE              sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray
+INSTR.RESOURCEKINDFORSAMPLEC             samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray
+INSTR.RESOURCEKINDFORTEXTURELOAD         texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray
+INSTR.RESOURCEKINDFORTEXTURESTORE        texture store only works on Texture1D/1DArray/2D/2DArray/3D
+INSTR.RESOURCEKINDFORTRACERAY            TraceRay should only use RTAccelerationStructure
+INSTR.RESOURCEMAPTOSINGLEENTRY           Fail to map resource to resource table
+INSTR.RESOURCEOFFSETMISS                 offset uninitialized
+INSTR.RESOURCEOFFSETTOOMANY              out of bound offset must be undef
+INSTR.RESOURCEUSER                       Resource should only used by Load/GEP/Call
+INSTR.SAMPLECOMPTYPE                     sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
+INSTR.SAMPLEINDEXFORLOAD2DMS             load on Texture2DMS/2DMSArray require sampleIndex
+INSTR.SAMPLERMODEFORLOD                  lod instruction requires sampler declared in default mode
+INSTR.SAMPLERMODEFORSAMPLE               sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode
+INSTR.SAMPLERMODEFORSAMPLEC              sample_c_*/gather_c instructions require sampler declared in comparison mode
+INSTR.SIGNATUREOPERATIONNOTINENTRY       Dxil operation for input output signature must be in entryPoints.
+INSTR.STATUS                             Resource status should only used by CheckAccessFullyMapped
+INSTR.STRUCTBITCAST                      Bitcast on struct types is not allowed
+INSTR.TEXTUREOFFSET                      offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7
+INSTR.TGSMRACECOND                       Race condition writing to shared memory detected, consider making this write conditional
+INSTR.UNDEFRESULTFORGETDIMENSION         GetDimensions used undef dimension %0 on %1
+INSTR.WRITEMASKFORTYPEDUAVSTORE          store on typed uav must write to all four components of the UAV
+INSTR.WRITEMASKMATCHVALUEFORUAVSTORE     uav store write mask must match store value mask, write mask is %0 and store value mask is %1
+META.BARYCENTRICSFLOAT3                  only 'float3' type is allowed for SV_Barycentrics.
+META.BARYCENTRICSINTERPOLATION           SV_Barycentrics cannot be used with 'nointerpolation' type
+META.BARYCENTRICSTWOPERSPECTIVES         There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode.
+META.BRANCHFLATTEN                       Can't use branch and flatten attributes together
+META.CLIPCULLMAXCOMPONENTS               Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
+META.CLIPCULLMAXROWS                     Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
+META.CONTROLFLOWHINTNOTONCONTROLFLOW     Control flow hint only works on control flow inst
+META.DENSERESIDS                         Resource identifiers must be zero-based and dense
+META.DUPLICATESYSVALUE                   System value may only appear once in signature
+META.ENTRYFUNCTION                       entrypoint not found
+META.FLAGSUSAGE                          Flags must match usage
+META.FORCECASEONSWITCH                   Attribute forcecase only works for switch
+META.FUNCTIONANNOTATION                  Cannot find function annotation for %0
+META.GLCNOTONAPPENDCONSUME               globallycoherent cannot be used with append/consume buffers
+META.INTEGERINTERPMODE                   Interpolation mode on integer must be Constant
+META.INTERPMODEINONEROW                  Interpolation mode must be identical for all elements packed into the same row.
+META.INTERPMODEVALID                     Interpolation mode must be valid
+META.INVALIDCONTROLFLOWHINT              Invalid control flow hint
+META.KNOWN                               Named metadata should be known
+META.MAXTESSFACTOR                       Hull Shader MaxTessFactor must be [%0..%1].  %2 specified
+META.NOENTRYPROPSFORENTRY                EntryPoints must have entry properties.
+META.NOSEMANTICOVERLAP                   Semantics must not overlap
+META.REQUIRED                            TODO - Required metadata missing
+META.SEMAKINDMATCHESNAME                 Semantic name must match system value, when defined.
+META.SEMAKINDVALID                       Semantic kind must be valid
+META.SEMANTICCOMPTYPE                    %0 must be %1
+META.SEMANTICINDEXMAX                    System value semantics have a maximum valid semantic index
+META.SEMANTICLEN                         Semantic length must be at least 1 and at most 64
+META.SEMANTICSHOULDBEALLOCATED           Semantic should have a valid packing location
+META.SEMANTICSHOULDNOTBEALLOCATED        Semantic should have a packing location of -1
+META.SIGNATURECOMPTYPE                   signature %0 specifies unrecognized or invalid component type
+META.SIGNATUREDATAWIDTH                  Data width must be identical for all elements packed into the same row.
+META.SIGNATUREILLEGALCOMPONENTORDER      Component ordering for packed elements must be: arbitrary < system value < system generated value
+META.SIGNATUREINDEXCONFLICT              Only elements with compatible indexing rules may be packed together
+META.SIGNATUREOUTOFRANGE                 Signature elements must fit within maximum signature size
+META.SIGNATUREOVERLAP                    Signature elements may not overlap in packing location.
+META.STRUCTBUFALIGNMENT                  StructuredBuffer stride not aligned
+META.STRUCTBUFALIGNMENTOUTOFBOUND        StructuredBuffer stride out of bounds
+META.SYSTEMVALUEROWS                     System value may only have 1 row
+META.TARGET                              Target triple must be 'dxil-ms-dx'
+META.TESSELLATOROUTPUTPRIMITIVE          Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW.
+META.TESSELLATORPARTITION                Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even.
+META.TEXTURETYPE                         elements of typed buffers and textures must fit in four 32-bit quantities
+META.USED                                All metadata must be used by dxil
+META.VALIDSAMPLERMODE                    Invalid sampler mode on sampler
+META.VALUERANGE                          Metadata value must be within range
+META.WELLFORMED                          TODO - Metadata must be well-formed in operand count and types
+SM.64BITRAWBUFFERLOADSTORE               i64/f64 rawBufferLoad/Store overloads are allowed after SM 6.3
+SM.APPENDANDCONSUMEONSAMEUAV             BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
+SM.CBUFFERELEMENTOVERFLOW                CBuffer elements must not overflow
+SM.CBUFFEROFFSETOVERLAP                  CBuffer offsets must not overlap
+SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT       D3D12 constant/texture buffer template element can only be a struct
+SM.COMPLETEPOSITION                      Not all elements of SV_Position were written
+SM.COUNTERONLYONSTRUCTBUF                BufferUpdateCounter valid only on structured buffers
+SM.CSNOSIGNATURES                        Compute shaders must not have shader signatures.
+SM.DOMAINLOCATIONIDXOOB                  DomainLocation component index out of bounds for the domain.
+SM.DSINPUTCONTROLPOINTCOUNTRANGE         DS input control point count must be [0..%0].  %1 specified
+SM.DXILVERSION                           Target shader model requires specific Dxil Version
+SM.GSINSTANCECOUNTRANGE                  GS instance count must be [1..%0].  %1 specified
+SM.GSOUTPUTVERTEXCOUNTRANGE              GS output vertex count must be [0..%0].  %1 specified
+SM.GSTOTALOUTPUTVERTEXDATARANGE          Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3
+SM.GSVALIDINPUTPRIMITIVE                 GS input primitive unrecognized
+SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY        GS output primitive topology unrecognized
+SM.HSINPUTCONTROLPOINTCOUNTRANGE         HS input control point count must be [0..%0].  %1 specified
+SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH    For pass thru hull shader, input control point count must match output control point count
+SM.INSIDETESSFACTORSIZEMATCHDOMAIN       InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
+SM.INVALIDRESOURCECOMPTYPE               Invalid resource return type
+SM.INVALIDRESOURCEKIND                   Invalid resources kind
+SM.INVALIDTEXTUREKINDONUAV               Texture2DMS[Array] or TextureCube[Array] resources are not supported with UAVs
+SM.ISOLINEOUTPUTPRIMITIVEMISMATCH        Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
+SM.MAXTGSMSIZE                           Total Thread Group Shared Memory storage is %0, exceeded %1
+SM.MAXTHEADGROUP                         Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1
+SM.MULTISTREAMMUSTBEPOINT                When multiple GS output streams are used they must be pointlists
+SM.NAME                                  Target shader model name must be known
+SM.NOINTERPMODE                          Interpolation mode must be undefined for VS input/PS output/patch constant.
+SM.NOPSOUTPUTIDX                         Pixel shader output registers are not indexable.
+SM.OPCODE                                Opcode must be defined in target shader model
+SM.OPCODEININVALIDFUNCTION               Invalid DXIL opcode usage like StorePatchConstant in patch constant function
+SM.OPERAND                               Operand must be defined in target shader model
+SM.OUTPUTCONTROLPOINTCOUNTRANGE          output control point count must be [0..%0].  %1 specified
+SM.OUTPUTCONTROLPOINTSTOTALSCALARS       Total number of scalars across all HS output control points must not exceed
+SM.PATCHCONSTANTONLYFORHSDS              patch constant signature only valid in HS and DS
+SM.PSCONSISTENTINTERP                    Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample)
+SM.PSCOVERAGEANDINNERCOVERAGE            InnerCoverage and Coverage are mutually exclusive.
+SM.PSMULTIPLEDEPTHSEMANTIC               Pixel Shader only allows one type of depth semantic to be declared
+SM.PSOUTPUTSEMANTIC                      Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found
+SM.PSTARGETCOL0                          SV_Target packed location must start at column 0
+SM.PSTARGETINDEXMATCHESROW               SV_Target semantic index must match packed row location
+SM.RAYSHADERPAYLOADSIZE                  For shader '%0', %1 size is smaller than argument's allocation size
+SM.RAYSHADERSIGNATURES                   Ray tracing shader '%0' should not have any shader signatures
+SM.RESOURCERANGEOVERLAP                  Resource ranges must not overlap
+SM.ROVONLYINPS                           RasterizerOrdered objects are only allowed in 5.0+ pixel shaders
+SM.SAMPLECOUNTONLYON2DMS                 Only Texture2DMS/2DMSArray could has sample count
+SM.SEMANTIC                              Semantic must be defined in target shader model
+SM.STREAMINDEXRANGE                      Stream index (%0) must between 0 and %1
+SM.TESSFACTORFORDOMAIN                   Required TessFactor for domain not found declared anywhere in Patch Constant data
+SM.TESSFACTORSIZEMATCHDOMAIN             TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
+SM.THREADGROUPCHANNELRANGE               Declared Thread Group %0 size %1 outside valid range [%2..%3]
+SM.TRIOUTPUTPRIMITIVEMISMATCH            Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain
+SM.UNDEFINEDOUTPUT                       Not all elements of output %0 were written
+SM.VALIDDOMAIN                           Invalid Tessellator Domain specified. Must be isoline, tri or quad
+SM.VIEWIDNEEDSSLOT                       ViewID requires compatible space in pixel shader input signature
+SM.ZEROHSINPUTCONTROLPOINTWITHINPUT      When HS input control point count is 0, no input signature should exist
+TYPES.DEFINED                            Type must be defined based on DXIL primitives
+TYPES.I8                                 I8 can only used as immediate value for intrinsic
+TYPES.INTWIDTH                           Int type must be of valid width
+TYPES.NOMULTIDIM                         Only one dimension allowed for array type
+TYPES.NOVECTOR                           Vector types must not be present
+UNI.NOWAVESENSITIVEGRADIENT              Gradient operations are not affected by wave-sensitive data or control flow.
+======================================== =======================================================================================================================================================================================================================================================================================================
 
 .. VALRULES-RST:END
 

+ 99 - 0
include/dxc/DxrFallback/DxrFallbackCompiler.h

@@ -0,0 +1,99 @@
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+  class CallInst;
+  class Function;
+  class Module;
+  class Type;
+}
+
+// Combines DXIL raytracing shaders together into a compute shader.
+//
+// The incoming module should contain the following functions whenever the
+// corresponding intrinsics are called by the specified shaders:
+//
+//    Fallback_TraceRay() 
+//    Fallback_Ignore()
+//    Fallback_AcceptHitAndEndSearch()
+//    Fallback_ReportHit()
+//
+// Fallback_TraceRay() will be called with the original arguments, substituting
+// the offset of the payload on the stack for the actual payload. 
+// Fallback_TraceRay() will also be used to replace calls to TraceRayTest().
+//
+// ReportHit() returns a boolean. But to handle the abort of the intersection
+// shader when AcceptHitAndEndSearch() is called we need a third return value.
+// Fallback_ReportHit() should return an integer < 0 for end search, 0 for ignore, 
+// and > 0 for accept.
+//
+// The module should also contain a single call to Fallback_Scheduler() in the
+// entry shader for the raytracing compute shader.
+//
+// resizeStack() needs to be called after inlining everything in the compute 
+// shader.
+//
+// Currently the main scheduling loop and the implementation for intrinsic 
+// functions come from an internal runtime module.
+class DxrFallbackCompiler
+{
+public:
+  typedef std::map<int, std::string> IntToFuncNameMap;
+
+  // If findCalledShaders is true, then the list of shaderNames is expanded to 
+  // include shader functions (functions with attribute "exp-shader") that are 
+  // called by functions in shaderNames. Shader entry state IDs are still
+  // returned only for those originally in shaderNames. findCalledShaders is
+  // used for testing.
+  DxrFallbackCompiler(llvm::Module* module, const std::vector<std::string>& shaderNames, unsigned maxAttributeSize, unsigned stackSizeInBytes, bool findCalledShaders = false);
+
+  // 0 - no debug output
+  // 1 - dump initial combined module, compiled module, and final linked module
+  // 2 - dump intermediate stages of SFT to console
+  // 3 - dump intermediate stages of SFT to file
+  void setDebugOutputLevel(int val);
+
+  // Returns the entry state id for each of shaderNames. The transformations 
+  // are performed in place on the module.
+  void compile(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap);
+  void link(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap);
+  // TODO: Ideally we would run this after inlining everything at the end of compile.
+  // Until we figure out how to do this, we will call the function after the final link.
+  static void resizeStack(llvm::Function* F, unsigned stackSizeInBytes);
+private:
+  typedef std::map<int, llvm::Function*> IntToFuncMap;
+  typedef std::map<std::string, llvm::Function*> StringToFuncMap;
+
+  llvm::Module* m_module = nullptr;
+  const std::vector<std::string>& m_entryShaderNames;
+  unsigned m_stackSizeInBytes = 0;
+  unsigned m_maxAttributeSize = 0;
+  bool m_findCalledShaders = false;
+  int m_debugOutputLevel = 0;
+
+  StringToFuncMap m_shaderMap;
+
+  void initShaderMap(std::vector<std::string>& shaderNames);
+  void linkRuntime();
+  void lowerAnyHitControlFlowFuncs();
+  void lowerReportHit();
+  void lowerTraceRay(llvm::Type* runtimeDataArgTy);
+  void createStateFunctions(IntToFuncMap& stateFunctionMap, std::vector<int>& shaderEntryStateIds, std::vector<unsigned int>& shaderStackSizes, int baseStateId, const std::vector<std::string>& shaderNames, llvm::Type* runtimeDataArgTy);
+  void createLaunchParams(llvm::Function* func);
+  void createStack(llvm::Function* func);
+  void createStateDispatch(llvm::Function* func, const IntToFuncMap& stateFunctionMap, llvm::Type* runtimeDataArgTy);
+  void lowerIntrinsics();
+
+  llvm::Type* getRuntimeDataArgType();
+  llvm::Function* createDispatchFunction(const IntToFuncMap &stateFunctionMap, llvm::Type* runtimeDataArgTy);
+
+  // These functions return calls only in shaders in m_shaderMap.
+  std::vector<llvm::CallInst*> getCallsInShadersToFunction(const std::string& funcName);
+  std::vector<llvm::CallInst*> getCallsInShadersToFunctionWithPrefix(const std::string& funcNamePrefix);
+
+};

+ 138 - 4
include/dxc/HLSL/DxilConstants.h

@@ -126,6 +126,12 @@ namespace DXIL {
     Domain,
     Compute,
     Library,
+    RayGeneration,
+    Intersection,
+    AnyHit,
+    ClosestHit,
+    Miss,
+    Callable,
     Invalid,
   };
 
@@ -242,6 +248,12 @@ namespace DXIL {
     Invalid,
   };
 
+  enum class DefaultLinkage : unsigned {
+    Default = 0,
+    Internal = 1,
+    External = 2,
+  };
+
   enum class SamplerKind : unsigned {
     Default = 0,
     Comparison,
@@ -274,6 +286,7 @@ namespace DXIL {
     CBuffer,
     Sampler,
     TBuffer,
+    RTAccelerationStructure,
     NumEntries,
   };
 
@@ -282,6 +295,10 @@ namespace DXIL {
   // OPCODE-ENUM:BEGIN
   // Enumeration for operations specified by DXIL
   enum class OpCode : unsigned {
+    // AnyHit Terminals
+    AcceptHitAndEndSearch = 156, // Used in an any hit shader to abort the ray query and the intersection shader (if any). The current hit is committed and execution passes to the closest hit shader with the closest hit recorded so far
+    IgnoreHit = 155, // Used in an any hit shader to reject an intersection and terminate the shader
+  
     // Binary float
     FMax = 35, // returns a if a >= b, else b
     FMin = 36, // returns a if a < b, else b
@@ -349,13 +366,23 @@ namespace DXIL {
   
     // Hull shader
     OutputControlPointID = 107, // OutputControlPointID
-    PrimitiveID = 108, // PrimitiveID
     StorePatchConstant = 106, // StorePatchConstant
   
+    // Hull, Domain and Geometry shaders
+    PrimitiveID = 108, // PrimitiveID
+  
+    // Indirect Shader Invocation
+    CallShader = 159, // Call a shader in the callable shader table supplied through the DispatchRays() API
+    ReportHit = 158, // returns true if hit was accepted
+    TraceRay = 157, // initiates raytrace
+  
     // Legacy floating-point
     LegacyF16ToF32 = 131, // legacy function to convert half (f16) to float (f32) (this is not related to min-precision)
     LegacyF32ToF16 = 130, // legacy function to convert float (f32) to half (f16) (this is not related to min-precision)
   
+    // Library create handle from resource struct (like HL intrinsic)
+    CreateHandleForLib = 160, // create resource handle from resource struct for library
+  
     // Other
     CycleCounterLegacy = 109, // CycleCounterLegacy
   
@@ -377,6 +404,37 @@ namespace DXIL {
     // Quaternary
     Bfi = 53, // Given a bit range from the LSB of a number, places that number of bits in another number at any offset
   
+    // Ray Dispatch Arguments
+    DispatchRaysDimensions = 146, // The Width and Height values from the D3D12_DISPATCH_RAYS_DESC structure provided to the originating DispatchRays() call.
+    DispatchRaysIndex = 145, // The current x and y location within the Width and Height
+  
+    // Ray Transforms
+    ObjectToWorld = 151, // Matrix for transforming from object-space to world-space.
+    WorldToObject = 152, // Matrix for transforming from world-space to object-space.
+  
+    // Ray Vectors
+    WorldRayDirection = 148, // The world-space direction for the current ray.
+    WorldRayOrigin = 147, // The world-space origin for the current ray.
+  
+    // Ray object space Vectors
+    ObjectRayDirection = 150, // Object-space direction for the current ray.
+    ObjectRayOrigin = 149, // Object-space origin for the current ray.
+  
+    // RayT
+    RayTCurrent = 154, // float representing the current parametric ending point for the ray
+    RayTMin = 153, // float representing the parametric starting point for the ray.
+  
+    // Raytracing hit uint System Values
+    HitKind = 143, // Returns the value passed as HitKind in ReportIntersection().  If intersection was reported by fixed-function triangle intersection, HitKind will be one of HIT_KIND_TRIANGLE_FRONT_FACE or HIT_KIND_TRIANGLE_BACK_FACE.
+  
+    // Raytracing object space uint System Values
+    InstanceID = 141, // The user-provided InstanceID on the bottom-level acceleration structure instance within the top-level structure
+    InstanceIndex = 142, // The autogenerated index of the current instance in the top-level structure
+    PrimitiveIndex = 161, // PrimitiveIndex for raytracing shaders
+  
+    // Raytracing uint System Values
+    RayFlags = 144, // uint containing the current ray flags.
+  
     // Resources - gather
     TextureGather = 73, // gathers the four texels that would be used in a bi-linear filtering operation
     TextureGatherCmp = 74, // same as TextureGather, except this instruction performs comparison on texels, similar to SampleCmp
@@ -490,8 +548,9 @@ namespace DXIL {
     NumOpCodes_Dxil_1_0 = 137,
     NumOpCodes_Dxil_1_1 = 139,
     NumOpCodes_Dxil_1_2 = 141,
+    NumOpCodes_Dxil_1_3 = 162,
   
-    NumOpCodes = 141 // exclusive last value of enumeration
+    NumOpCodes = 162 // exclusive last value of enumeration
   };
   // OPCODE-ENUM:END
 
@@ -499,6 +558,10 @@ namespace DXIL {
   // OPCODECLASS-ENUM:BEGIN
   // Groups for DXIL operations with equivalent function templates
   enum class OpCodeClass : unsigned {
+    // AnyHit Terminals
+    AcceptHitAndEndSearch,
+    IgnoreHit,
+  
     // Binary uint with carry or borrow
     BinaryWithCarryOrBorrow,
   
@@ -552,9 +615,16 @@ namespace DXIL {
   
     // Hull shader
     OutputControlPointID,
-    PrimitiveID,
     StorePatchConstant,
   
+    // Hull, Domain and Geometry shaders
+    PrimitiveID,
+  
+    // Indirect Shader Invocation
+    CallShader,
+    ReportHit,
+    TraceRay,
+  
     // LLVM Instructions
     LlvmInst,
   
@@ -562,6 +632,9 @@ namespace DXIL {
     LegacyF16ToF32,
     LegacyF32ToF16,
   
+    // Library create handle from resource struct (like HL intrinsic)
+    CreateHandleForLib,
+  
     // Other
     CycleCounterLegacy,
   
@@ -580,6 +653,37 @@ namespace DXIL {
     // Quaternary
     Quaternary,
   
+    // Ray Dispatch Arguments
+    DispatchRaysDimensions,
+    DispatchRaysIndex,
+  
+    // Ray Transforms
+    ObjectToWorld,
+    WorldToObject,
+  
+    // Ray Vectors
+    WorldRayDirection,
+    WorldRayOrigin,
+  
+    // Ray object space Vectors
+    ObjectRayDirection,
+    ObjectRayOrigin,
+  
+    // RayT
+    RayTCurrent,
+    RayTMin,
+  
+    // Raytracing hit uint System Values
+    HitKind,
+  
+    // Raytracing object space uint System Values
+    InstanceID,
+    InstanceIndex,
+    PrimitiveIndex,
+  
+    // Raytracing uint System Values
+    RayFlags,
+  
     // Resources - gather
     TextureGather,
     TextureGatherCmp,
@@ -651,8 +755,9 @@ namespace DXIL {
     NumOpClasses_Dxil_1_0 = 93,
     NumOpClasses_Dxil_1_1 = 95,
     NumOpClasses_Dxil_1_2 = 97,
+    NumOpClasses_Dxil_1_3 = 118,
   
-    NumOpClasses = 97 // exclusive last value of enumeration
+    NumOpClasses = 118 // exclusive last value of enumeration
   };
   // OPCODECLASS-ENUM:END
 
@@ -774,6 +879,15 @@ namespace DXIL {
     const unsigned kCreateHandleResIndexOpIdx = 3;
     const unsigned kCreateHandleIsUniformOpIdx = 4;
 
+    // CreateHandleForLib
+    const unsigned kCreateHandleForLibResOpIdx = 1;
+
+    // TraceRay
+    const unsigned kTraceRayRayDescOpIdx = 7;
+    const unsigned kTraceRayPayloadOpIdx = 15;
+    const unsigned kTraceRayNumOp = 16;
+
+
     // Emit/Cut
     const unsigned kStreamEmitCutIDOpIdx = 1;
     // TODO: add operand index for all the OpCodeClass.
@@ -964,6 +1078,26 @@ namespace DXIL {
     UseNativeLowPrecision
   };
 
+  // Corresponds to RAY_FLAG_* in HLSL; every non-zero value is a distinct single-bit mask.
+  enum class RayFlag : uint8_t {
+    None = 0x00,
+    ForceOpaque = 0x01,
+    ForceNonOpaque = 0x02,
+    AcceptFirstHitAndEndSearch = 0x04,
+    SkipClosestHitShader = 0x08,
+    CullBackFacingTriangles = 0x10,
+    CullFrontFacingTriangles = 0x20,
+    CullOpaque = 0x40,
+    CullNonOpaque = 0x80,
+  };
+
+  // Corresponds to HIT_KIND_* in HLSL; the two triangle kinds use the top values of the uint8_t range.
+  enum class HitKind : uint8_t {
+    None = 0x00,
+    TriangleFrontFace = 0xFE,
+    TriangleBackFace = 0xFF,
+  };
+
 
   extern const char* kLegacyLayoutString;
   extern const char* kNewLayoutString;

+ 2 - 0
include/dxc/HLSL/DxilContainer.h

@@ -83,6 +83,7 @@ enum DxilFourCC {
   DFCC_RootSignature            = DXIL_FOURCC('R', 'T', 'S', '0'),
   DFCC_DXIL                     = DXIL_FOURCC('D', 'X', 'I', 'L'),
   DFCC_PipelineStateValidation  = DXIL_FOURCC('P', 'S', 'V', '0'),
+  DFCC_RuntimeData              = DXIL_FOURCC('R', 'D', 'A', 'T'),
 };
 
 #undef DXIL_FOURCC
@@ -428,6 +429,7 @@ DxilPartWriter *NewProgramSignatureWriter(const DxilModule &M, DXIL::SignatureKi
 DxilPartWriter *NewRootSignatureWriter(const RootSignatureHandle &S);
 DxilPartWriter *NewFeatureInfoWriter(const DxilModule &M);
 DxilPartWriter *NewPSVWriter(const DxilModule &M, uint32_t PSVVersion = 0);
+DxilPartWriter *NewRDATWriter(const DxilModule &M, uint32_t InfoVersion = 0);
 
 class DxilContainerWriter : public DxilPartWriter  {
 public:

+ 107 - 0
include/dxc/HLSL/DxilExportMap.h

@@ -0,0 +1,107 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilExportMap.h                                                           //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// dxilutil::ExportMap for handling -exports option.                         //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: Refactor to separate the export-name verification part from the
+// llvm::Function part, so the first part can be shared by code without llvm
+
+#pragma once
+#include <vector>
+#include <set>
+#include <unordered_set>
+#include <string>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+class Function;
+class raw_ostream;
+}
+
+namespace hlsl {
+namespace dxilutil {
+  class ExportMap { // Export-name bookkeeping for the -exports option
+  public:
+    typedef std::unordered_set<std::string> StringStore;
+    typedef std::set<llvm::StringRef> NameSet;
+    typedef llvm::DenseMap< llvm::Function*, NameSet > RenameMap;
+    typedef llvm::StringMap< llvm::StringSet<> > ExportMapByString;
+    typedef ExportMapByString::iterator iterator;
+    typedef ExportMapByString::const_iterator const_iterator;
+
+    ExportMap() {}
+    void clear();
+    bool empty() const;
+
+    // Iterate export map by string name
+    iterator begin() { return m_ExportMap.begin(); }
+    const_iterator begin() const { return m_ExportMap.begin(); }
+    iterator end() { return m_ExportMap.end(); }
+    const_iterator end() const { return m_ExportMap.end(); }
+
+    // Initialize export map from option strings
+    bool ParseExports(const std::vector<std::string> &exportOpts, llvm::raw_ostream &errors);
+    // Add one export to the export map
+    void Add(llvm::StringRef exportName, llvm::StringRef internalName = llvm::StringRef());
+    // Return true if export is present, or m_ExportMap is empty
+    bool IsExported(llvm::StringRef original) const;
+
+    // Retrieve export entry by name.  If Name is mangled, it will fall back to
+    // searching for the unmangled version if the exact match fails.
+    // If result == end(), no matching export was found.
+    ExportMapByString::const_iterator GetExportsByName(llvm::StringRef Name) const;
+
+    // Call before processing functions for renaming and cloning validation
+    void BeginProcessing();
+
+    // Called for each function to be processed
+    // In order to avoid intermediate name collisions during renaming,
+    //  if collisionAvoidanceRenaming is true:
+    //    non-exported functions will be renamed internal.<name>
+    //    functions exported with a different name will be renamed temp.<name>
+    // returns true if function is exported
+    bool ProcessFunction(llvm::Function *F, bool collisionAvoidanceRenaming);
+
+    // Add function to exports without checking export map or renaming
+    //  (useful for patch constant functions used by exported HS)
+    void RegisterExportedFunction(llvm::Function *F);
+
+    // Called to mark an internal name as used (remove from unused set)
+    void UseExport(llvm::StringRef internalName);
+    // Called to add an exported (full) name (for collision detection)
+    void ExportName(llvm::StringRef exportName);
+
+    // Called after functions are processed.
+    // Returns true if no name collisions or unused exports are present.
+    bool EndProcessing() const;
+    const NameSet& GetNameCollisions() const { return m_NameCollisions; }
+    const NameSet& GetUnusedExports() const { return m_UnusedExports; }
+
+    // GetFunctionRenames gets the map of mangled renames by function pointer
+    const RenameMap &GetFunctionRenames() const { return m_RenameMap; }
+
+  private:
+    // {"internalname": ("export1", "export2", ...), ...}
+    ExportMapByString m_ExportMap;
+    StringStore m_StringStorage; // presumably owns the text behind the StringRefs in the NameSets below — confirm in the .cpp
+    llvm::StringRef StoreString(llvm::StringRef str);
+
+    // Renaming/Validation state
+    RenameMap m_RenameMap;
+    NameSet m_ExportNames;
+    NameSet m_NameCollisions;
+    NameSet m_UnusedExports;
+  };
+}
+
+}

+ 20 - 0
include/dxc/HLSL/DxilFallbackLayerPass.h

@@ -0,0 +1,20 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilFallbackLayerPass.h                                                   //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// This file provides passes used by the Ray Tracing Fallback Layer          //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+namespace llvm { // Factory and registration entry points for the fallback-layer passes
+    ModulePass *createDxilUpdateMetadataPass(); // NOTE(review): ModulePass is not declared in this header — presumably callers include llvm/Pass.h first; confirm
+    ModulePass *createDxilPatchShaderRecordBindingsPass();
+
+    void initializeDxilUpdateMetadataPass(llvm::PassRegistry&); // NOTE(review): PassRegistry is likewise not declared here
+    void initializeDxilPatchShaderRecordBindingsPass(llvm::PassRegistry&);
+}

+ 22 - 8
include/dxc/HLSL/DxilFunctionProps.h

@@ -20,6 +20,9 @@ class Constant;
 
 namespace hlsl {
 struct DxilFunctionProps {
+  DxilFunctionProps() { // Zero-fills the entire object, so every member starts as all-zero bytes
+    memset(this, 0, sizeof(DxilFunctionProps)); // NOTE(review): needs <cstring>/<string.h> in scope — the include is not visible here, confirm
+  }
   union {
     // Compute shader.
     struct {
@@ -56,8 +59,17 @@ struct DxilFunctionProps {
     struct {
       bool EarlyDepthStencil;
     } PS;
+    // Ray Tracing shaders
+    struct {
+      union {
+        unsigned payloadSizeInBytes;
+        unsigned paramSizeInBytes;
+      };
+      unsigned attributeSizeInBytes;
+    } Ray;
   } ShaderProps;
   DXIL::ShaderKind shaderKind;
+  // TODO: Should we have an unmangled name here for ray tracing shaders?
   bool IsPS() const     { return shaderKind == DXIL::ShaderKind::Pixel; }
   bool IsVS() const     { return shaderKind == DXIL::ShaderKind::Vertex; }
   bool IsGS() const     { return shaderKind == DXIL::ShaderKind::Geometry; }
@@ -65,14 +77,16 @@ struct DxilFunctionProps {
   bool IsDS() const     { return shaderKind == DXIL::ShaderKind::Domain; }
   bool IsCS() const     { return shaderKind == DXIL::ShaderKind::Compute; }
   bool IsGraphics() const {
-    switch (shaderKind) {
-    case DXIL::ShaderKind::Compute:
-    case DXIL::ShaderKind::Library:
-    case DXIL::ShaderKind::Invalid:
-      return false;
-    default:
-      return true;
-    }
+    return (shaderKind >= DXIL::ShaderKind::Pixel && shaderKind <= DXIL::ShaderKind::Domain); // range test: assumes the graphics kinds are contiguous from Pixel to Domain in DXIL::ShaderKind — confirm enum order
+  }
+  bool IsRayGeneration() const { return shaderKind == DXIL::ShaderKind::RayGeneration; }
+  bool IsIntersection() const { return shaderKind == DXIL::ShaderKind::Intersection; }
+  bool IsAnyHit() const { return shaderKind == DXIL::ShaderKind::AnyHit; }
+  bool IsClosestHit() const { return shaderKind == DXIL::ShaderKind::ClosestHit; }
+  bool IsMiss() const { return shaderKind == DXIL::ShaderKind::Miss; }
+  bool IsCallable() const { return shaderKind == DXIL::ShaderKind::Callable; }
+  bool IsRay() const {
+    return (shaderKind >= DXIL::ShaderKind::RayGeneration && shaderKind <= DXIL::ShaderKind::Callable); // range test: assumes the ray kinds are contiguous from RayGeneration to Callable in DXIL::ShaderKind — confirm enum order
   }
 };
 

+ 14 - 4
include/dxc/HLSL/DxilGenerationPass.h

@@ -44,6 +44,8 @@ namespace llvm {
 /// \brief Create and return a pass that transforms the module into a DXIL module
 /// Note that this pass is designed for use with the legacy pass manager.
 ModulePass *createDxilCondenseResourcesPass();
+ModulePass *createDxilLowerCreateHandleForLibPass();
+ModulePass *createDxilAllocateResourcesForLibPass();
 ModulePass *createDxilEliminateOutputDynamicIndexingPass();
 ModulePass *createDxilGenerationPass(bool NotOptimized, hlsl::HLSLExtensionsCodegenHelper *extensionsHelper);
 ModulePass *createHLEmitMetadataPass();
@@ -59,17 +61,22 @@ ModulePass *createHLDeadFunctionEliminationPass();
 ModulePass *createHLPreprocessPass();
 ModulePass *createDxilPrecisePropagatePass();
 FunctionPass *createDxilPreserveAllOutputsPass();
-FunctionPass *createDxilLegalizeResourceUsePass();
-ModulePass *createDxilLegalizeStaticResourceUsePass();
+FunctionPass *createDxilPromoteLocalResources();
+ModulePass *createDxilPromoteStaticResources();
+ModulePass *createDxilLegalizeResources();
 ModulePass *createDxilLegalizeEvalOperationsPass();
 FunctionPass *createDxilLegalizeSampleOffsetPass();
+ModulePass *createFailUndefResourcePass();
 FunctionPass *createSimplifyInstPass();
 ModulePass *createDxilTranslateRawBuffer();
 ModulePass *createNoPausePassesPass();
 ModulePass *createPausePassesPass();
 ModulePass *createResumePassesPass();
+FunctionPass *createMatrixBitcastLowerPass();
 
 void initializeDxilCondenseResourcesPass(llvm::PassRegistry&);
+void initializeDxilLowerCreateHandleForLibPass(llvm::PassRegistry&);
+void initializeDxilAllocateResourcesForLibPass(llvm::PassRegistry&);
 void initializeDxilEliminateOutputDynamicIndexingPass(llvm::PassRegistry&);
 void initializeDxilGenerationPassPass(llvm::PassRegistry&);
 void initializeHLEnsureMetadataPass(llvm::PassRegistry&);
@@ -85,15 +92,18 @@ void initializeDxilConvergentMarkPass(llvm::PassRegistry&);
 void initializeDxilConvergentClearPass(llvm::PassRegistry&);
 void initializeDxilPrecisePropagatePassPass(llvm::PassRegistry&);
 void initializeDxilPreserveAllOutputsPass(llvm::PassRegistry&);
-void initializeDxilLegalizeResourceUsePassPass(llvm::PassRegistry&);
-void initializeDxilLegalizeStaticResourceUsePassPass(llvm::PassRegistry&);
+void initializeDxilPromoteLocalResourcesPass(llvm::PassRegistry&);
+void initializeDxilPromoteStaticResourcesPass(llvm::PassRegistry&);
+void initializeDxilLegalizeResourcesPass(llvm::PassRegistry&);
 void initializeDxilLegalizeEvalOperationsPass(llvm::PassRegistry&);
 void initializeDxilLegalizeSampleOffsetPassPass(llvm::PassRegistry&);
+void initializeFailUndefResourcePass(llvm::PassRegistry&);
 void initializeSimplifyInstPass(llvm::PassRegistry&);
 void initializeDxilTranslateRawBufferPass(llvm::PassRegistry&);
 void initializeNoPausePassesPass(llvm::PassRegistry&);
 void initializePausePassesPass(llvm::PassRegistry&);
 void initializeResumePassesPass(llvm::PassRegistry&);
+void initializeMatrixBitcastLowerPassPass(llvm::PassRegistry&);
 
 bool AreDxilResourcesDense(llvm::Module *M, hlsl::DxilResourceBase **ppNonDense);
 

+ 519 - 0
include/dxc/HLSL/DxilInstructions.h

@@ -4822,5 +4822,524 @@ struct DxilInst_RawBufferStore {
   int32_t get_alignment_val() const { return (int32_t)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(9))->getZExtValue()); }
   void set_alignment_val(int32_t val) { Instr->setOperand(9, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 32), llvm::APInt(32, (uint64_t)val))); }
 };
+
+/// This instruction returns the user-provided InstanceID on the bottom-level acceleration structure instance within the top-level structure
+struct DxilInst_InstanceID {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_InstanceID(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::InstanceID);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction returns the autogenerated index of the current instance in the top-level structure
+struct DxilInst_InstanceIndex {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_InstanceIndex(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::InstanceIndex);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction returns the value passed as HitKind in ReportIntersection().  If intersection was reported by fixed-function triangle intersection, HitKind will be one of HIT_KIND_TRIANGLE_FRONT_FACE or HIT_KIND_TRIANGLE_BACK_FACE.
+struct DxilInst_HitKind {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_HitKind(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::HitKind);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction returns a uint containing the current ray flags.
+struct DxilInst_RayFlags {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_RayFlags(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::RayFlags);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction returns the current x and y location within the Width and Height
+struct DxilInst_DispatchRaysIndex {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_DispatchRaysIndex(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::DispatchRaysIndex);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_col = 1,
+  };
+  // Accessors
+  llvm::Value *get_col() const { return Instr->getOperand(1); }
+  void set_col(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction returns the Width and Height values from the D3D12_DISPATCH_RAYS_DESC structure provided to the originating DispatchRays() call.
+struct DxilInst_DispatchRaysDimensions {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_DispatchRaysDimensions(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::DispatchRaysDimensions);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_col = 1,
+  };
+  // Accessors
+  llvm::Value *get_col() const { return Instr->getOperand(1); }
+  void set_col(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction returns the world-space origin for the current ray.
+struct DxilInst_WorldRayOrigin {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_WorldRayOrigin(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::WorldRayOrigin);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_col = 1,
+  };
+  // Accessors
+  llvm::Value *get_col() const { return Instr->getOperand(1); }
+  void set_col(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction returns the world-space direction for the current ray.
+struct DxilInst_WorldRayDirection {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_WorldRayDirection(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::WorldRayDirection);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_col = 1,
+  };
+  // Accessors
+  llvm::Value *get_col() const { return Instr->getOperand(1); }
+  void set_col(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction returns the object-space origin for the current ray.
+struct DxilInst_ObjectRayOrigin {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_ObjectRayOrigin(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::ObjectRayOrigin);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_col = 1,
+  };
+  // Accessors
+  llvm::Value *get_col() const { return Instr->getOperand(1); }
+  void set_col(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction returns the object-space direction for the current ray.
+struct DxilInst_ObjectRayDirection {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_ObjectRayDirection(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::ObjectRayDirection);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_col = 1,
+  };
+  // Accessors
+  llvm::Value *get_col() const { return Instr->getOperand(1); }
+  void set_col(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction reads the matrix for transforming from object-space to world-space (it takes row and col operands).
+struct DxilInst_ObjectToWorld {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_ObjectToWorld(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::ObjectToWorld);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_row = 1,
+    arg_col = 2,
+  };
+  // Accessors
+  llvm::Value *get_row() const { return Instr->getOperand(1); }
+  void set_row(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_col() const { return Instr->getOperand(2); }
+  void set_col(llvm::Value *val) { Instr->setOperand(2, val); }
+};
+
+/// This instruction reads the matrix for transforming from world-space to object-space (it takes row and col operands).
+struct DxilInst_WorldToObject {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_WorldToObject(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::WorldToObject);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_row = 1,
+    arg_col = 2,
+  };
+  // Accessors
+  llvm::Value *get_row() const { return Instr->getOperand(1); }
+  void set_row(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_col() const { return Instr->getOperand(2); }
+  void set_col(llvm::Value *val) { Instr->setOperand(2, val); }
+};
+
+/// This instruction returns a float representing the parametric starting point for the ray.
+struct DxilInst_RayTMin {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_RayTMin(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::RayTMin);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction returns a float representing the current parametric ending point for the ray.
+struct DxilInst_RayTCurrent {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_RayTCurrent(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::RayTCurrent);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction is used in an any hit shader to reject an intersection and terminate the shader
+struct DxilInst_IgnoreHit {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_IgnoreHit(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::IgnoreHit);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction is used in an any hit shader to abort the ray query and the intersection shader (if any). The current hit is committed and execution passes to the closest hit shader with the closest hit recorded so far
+struct DxilInst_AcceptHitAndEndSearch {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_AcceptHitAndEndSearch(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::AcceptHitAndEndSearch);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
+
+/// This instruction wraps the TraceRay op; see OperandIdx for its acceleration structure, ray flag/index, ray description and payload operands
+struct DxilInst_TraceRay {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_TraceRay(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::TraceRay);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (16 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes (operands 7-14 carry origin, TMin, direction and TMax as separate scalars)
+  enum OperandIdx {
+    arg_AccelerationStructure = 1,
+    arg_RayFlags = 2,
+    arg_InstanceInclusionMask = 3,
+    arg_RayContributionToHitGroupIndex = 4,
+    arg_MultiplierForGeometryContributionToShaderIndex = 5,
+    arg_MissShaderIndex = 6,
+    arg_Origin_X = 7,
+    arg_Origin_Y = 8,
+    arg_Origin_Z = 9,
+    arg_TMin = 10,
+    arg_Direction_X = 11,
+    arg_Direction_Y = 12,
+    arg_Direction_Z = 13,
+    arg_TMax = 14,
+    arg_payload = 15,
+  };
+  // Accessors
+  llvm::Value *get_AccelerationStructure() const { return Instr->getOperand(1); }
+  void set_AccelerationStructure(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_RayFlags() const { return Instr->getOperand(2); }
+  void set_RayFlags(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_InstanceInclusionMask() const { return Instr->getOperand(3); }
+  void set_InstanceInclusionMask(llvm::Value *val) { Instr->setOperand(3, val); }
+  llvm::Value *get_RayContributionToHitGroupIndex() const { return Instr->getOperand(4); }
+  void set_RayContributionToHitGroupIndex(llvm::Value *val) { Instr->setOperand(4, val); }
+  llvm::Value *get_MultiplierForGeometryContributionToShaderIndex() const { return Instr->getOperand(5); }
+  void set_MultiplierForGeometryContributionToShaderIndex(llvm::Value *val) { Instr->setOperand(5, val); }
+  llvm::Value *get_MissShaderIndex() const { return Instr->getOperand(6); }
+  void set_MissShaderIndex(llvm::Value *val) { Instr->setOperand(6, val); }
+  llvm::Value *get_Origin_X() const { return Instr->getOperand(7); }
+  void set_Origin_X(llvm::Value *val) { Instr->setOperand(7, val); }
+  llvm::Value *get_Origin_Y() const { return Instr->getOperand(8); }
+  void set_Origin_Y(llvm::Value *val) { Instr->setOperand(8, val); }
+  llvm::Value *get_Origin_Z() const { return Instr->getOperand(9); }
+  void set_Origin_Z(llvm::Value *val) { Instr->setOperand(9, val); }
+  llvm::Value *get_TMin() const { return Instr->getOperand(10); }
+  void set_TMin(llvm::Value *val) { Instr->setOperand(10, val); }
+  llvm::Value *get_Direction_X() const { return Instr->getOperand(11); }
+  void set_Direction_X(llvm::Value *val) { Instr->setOperand(11, val); }
+  llvm::Value *get_Direction_Y() const { return Instr->getOperand(12); }
+  void set_Direction_Y(llvm::Value *val) { Instr->setOperand(12, val); }
+  llvm::Value *get_Direction_Z() const { return Instr->getOperand(13); }
+  void set_Direction_Z(llvm::Value *val) { Instr->setOperand(13, val); }
+  llvm::Value *get_TMax() const { return Instr->getOperand(14); }
+  void set_TMax(llvm::Value *val) { Instr->setOperand(14, val); }
+  llvm::Value *get_payload() const { return Instr->getOperand(15); }
+  void set_payload(llvm::Value *val) { Instr->setOperand(15, val); }
+};
+
+/// This instruction returns true if the hit was accepted
+struct DxilInst_ReportHit {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_ReportHit(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::ReportHit);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (4 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_THit = 1,
+    arg_HitKind = 2,
+    arg_Attributes = 3,
+  };
+  // Accessors
+  llvm::Value *get_THit() const { return Instr->getOperand(1); }
+  void set_THit(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_HitKind() const { return Instr->getOperand(2); }
+  void set_HitKind(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_Attributes() const { return Instr->getOperand(3); }
+  void set_Attributes(llvm::Value *val) { Instr->setOperand(3, val); }
+};
+
+/// This instruction calls a shader in the callable shader table supplied through the DispatchRays() API
+struct DxilInst_CallShader {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_CallShader(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::CallShader);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_ShaderIndex = 1,
+    arg_Parameter = 2,
+  };
+  // Accessors
+  llvm::Value *get_ShaderIndex() const { return Instr->getOperand(1); }
+  void set_ShaderIndex(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_Parameter() const { return Instr->getOperand(2); }
+  void set_Parameter(llvm::Value *val) { Instr->setOperand(2, val); }
+};
+
+/// This instruction creates a resource handle from a resource struct for a library
+struct DxilInst_CreateHandleForLib {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_CreateHandleForLib(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::CreateHandleForLib);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_Resource = 1,
+  };
+  // Accessors
+  llvm::Value *get_Resource() const { return Instr->getOperand(1); }
+  void set_Resource(llvm::Value *val) { Instr->setOperand(1, val); }
+};
+
+/// This instruction returns the PrimitiveIndex for raytracing shaders
+struct DxilInst_PrimitiveIndex {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_PrimitiveIndex(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::PrimitiveIndex);
+  }
+  // Validation support; isArgumentListValid requires Instr to be a CallInst (unchecked dyn_cast)
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+};
 // INSTR-HELPER:END
 } // namespace hlsl

+ 4 - 2
include/dxc/HLSL/DxilLinker.h

@@ -14,8 +14,10 @@
 #include <unordered_map>
 #include <unordered_set>
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringMap.h"
 #include <memory>
 #include "llvm/Support/ErrorOr.h"
+#include "dxc/HLSL/DxilExportMap.h"
 
 namespace llvm {
 class Function;
@@ -43,8 +45,8 @@ public:
   virtual bool DetachLib(llvm::StringRef name) = 0;
   virtual void DetachAll() = 0;
 
-  virtual std::unique_ptr<llvm::Module> Link(llvm::StringRef entry,
-                                             llvm::StringRef profile) = 0;
+  virtual std::unique_ptr<llvm::Module>
+  Link(llvm::StringRef entry, llvm::StringRef profile, dxilutil::ExportMap &exportMap) = 0;
 
 protected:
   DxilLinker(llvm::LLVMContext &Ctx, unsigned valMajor, unsigned valMinor) : m_ctx(Ctx), m_valMajor(valMajor), m_valMinor(valMinor) {}

+ 23 - 18
include/dxc/HLSL/DxilMetadataHelper.h

@@ -87,10 +87,6 @@ public:
   static const char kDxilSourceMainFileNameMDName[];
   static const char kDxilSourceArgsMDName[];
 
-  // Function props.
-  static const char kDxilFunctionPropertiesMDName[];
-  static const char kDxilEntrySignaturesMDName[];
-
   static const unsigned kDxilEntryPointNumFields  = 5;
   static const unsigned kDxilEntryPointFunction   = 0;  // Entry point function symbol.
   static const unsigned kDxilEntryPointName       = 1;  // Entry point unmangled name.
@@ -125,7 +121,6 @@ public:
 
   // Resources.
   static const char kDxilResourcesMDName[];
-  static const char kDxilResourcesLinkInfoMDName[];
   static const unsigned kDxilNumResourceFields              = 4;
   static const unsigned kDxilResourceSRVs                   = 0;
   static const unsigned kDxilResourceUAVs                   = 1;
@@ -199,6 +194,9 @@ public:
   // Precise attribute.
   static const char kDxilPreciseAttributeMDName[];
 
+  // NonUniform attribute.
+  static const char kDxilNonUniformAttributeMDName[];
+
   // Validator version.
   static const char kDxilValidatorVersionMDName[];
   // Validator version uses the same constants for fields as kDxilVersion*
@@ -209,6 +207,10 @@ public:
   static const unsigned kDxilDSStateTag         = 2;
   static const unsigned kDxilHSStateTag         = 3;
   static const unsigned kDxilNumThreadsTag      = 4;
+  static const unsigned kDxilAutoBindingSpaceTag    = 5;
+  static const unsigned kDxilRayPayloadSizeTag  = 6;
+  static const unsigned kDxilRayAttribSizeTag   = 7;
+  static const unsigned kDxilShaderKindTag      = 8;
 
   // GSState.
   static const unsigned kDxilGSStateNumFields               = 5;
@@ -307,13 +309,6 @@ public:
   void UpdateDxilResources(llvm::MDTuple *pDxilResourceTuple);
   void GetDxilResources(const llvm::MDOperand &MDO, const llvm::MDTuple *&pSRVs, const llvm::MDTuple *&pUAVs, 
                         const llvm::MDTuple *&pCBuffers, const llvm::MDTuple *&pSamplers);
-  void EmitDxilResourceLinkInfoTuple(llvm::MDTuple *pSRVs, llvm::MDTuple *pUAVs,
-                                 llvm::MDTuple *pCBuffers,
-                                 llvm::MDTuple *pSamplers);
-  void LoadDxilResourceLinkInfoTuple(const llvm::MDTuple *&pSRVs,
-                                 const llvm::MDTuple *&pUAVs,
-                                 const llvm::MDTuple *&pCBuffers,
-                                 const llvm::MDTuple *&pSamplers);
   void EmitDxilResourceBase(const DxilResourceBase &R, llvm::Metadata *ppMDVals[]);
   void LoadDxilResourceBase(const llvm::MDOperand &MDO, DxilResourceBase &R);
   llvm::MDTuple *EmitDxilSRV(const DxilResource &SRV);
@@ -346,19 +341,26 @@ public:
 
   // Function props.
   llvm::MDTuple *EmitDxilFunctionProps(const hlsl::DxilFunctionProps *props,
-                                       llvm::Function *F);
-  llvm::Function *LoadDxilFunctionProps(llvm::MDTuple *pProps,
-                                        hlsl::DxilFunctionProps *props);
+                                       const llvm::Function *F);
+  const llvm::Function *LoadDxilFunctionProps(const llvm::MDTuple *pProps,
+                                              hlsl::DxilFunctionProps *props);
+  llvm::MDTuple *EmitDxilEntryProperties(uint64_t rawShaderFlag,
+                                          const hlsl::DxilFunctionProps &props,
+                                          uint32_t autoBindingSpace);
+  void LoadDxilEntryProperties(const llvm::MDOperand &MDO,
+                                uint64_t &rawShaderFlag,
+                                hlsl::DxilFunctionProps &props,
+                                uint32_t &autoBindingSpace);
 
   // ViewId state.
   void EmitDxilViewIdState(DxilViewIdState &ViewIdState);
   void LoadDxilViewIdState(DxilViewIdState &ViewIdState);
-
   // Control flow hints.
   static llvm::MDNode *EmitControlFlowHints(llvm::LLVMContext &Ctx, std::vector<DXIL::ControlFlowHint> &hints);
 
 
   // Shader specific.
+private:
   llvm::MDTuple *EmitDxilGSState(DXIL::InputPrimitive Primitive, unsigned MaxVertexCount, 
                                  unsigned ActiveStreamMask, DXIL::PrimitiveTopology StreamPrimitiveTopology,
                                  unsigned GSInstanceCount);
@@ -384,9 +386,10 @@ public:
                        DXIL::TessellatorPartitioning &TessPartitioning,
                        DXIL::TessellatorOutputPrimitive &TessOutputPrimitive,
                        float &MaxTessFactor);
-
+public:
   // Utility functions.
-  static bool IsKnownNamedMetaData(llvm::NamedMDNode &Node);
+  static bool IsKnownNamedMetaData(const llvm::NamedMDNode &Node);
+  static void combineDxilMetadata(llvm::Instruction *K, const llvm::Instruction *J);
   static llvm::ConstantAsMetadata *Int32ToConstMD(int32_t v, llvm::LLVMContext &Ctx);
   llvm::ConstantAsMetadata *Int32ToConstMD(int32_t v);
   static llvm::ConstantAsMetadata *Uint32ToConstMD(unsigned v, llvm::LLVMContext &Ctx);
@@ -411,6 +414,8 @@ public:
   void ConstMDTupleToUint32Vector(llvm::MDTuple *pTupleMD, std::vector<unsigned> &Vec);
   static bool IsMarkedPrecise(const llvm::Instruction *inst);
   static void MarkPrecise(llvm::Instruction *inst);
+  static bool IsMarkedNonUniform(const llvm::Instruction *inst);
+  static void MarkNonUniform(llvm::Instruction *inst);
 
 private:
   llvm::LLVMContext &m_Ctx;

+ 60 - 183
include/dxc/HLSL/DxilModule.h

@@ -15,17 +15,17 @@
 #include "dxc/HLSL/DxilCBuffer.h"
 #include "dxc/HLSL/DxilResource.h"
 #include "dxc/HLSL/DxilSampler.h"
+#include "dxc/HLSL/DxilShaderFlags.h"
 #include "dxc/HLSL/DxilSignature.h"
 #include "dxc/HLSL/DxilConstants.h"
 #include "dxc/HLSL/DxilTypeSystem.h"
 #include "dxc/HLSL/ComputeViewIdState.h"
 
-
-
 #include <memory>
 #include <string>
 #include <vector>
 #include <unordered_map>
+#include <unordered_set>
 
 namespace llvm {
 class LLVMContext;
@@ -44,6 +44,11 @@ class OP;
 class RootSignatureHandle;
 struct DxilFunctionProps;
 
+class DxilEntryProps;
+
+using DxilEntryPropsMap =
+    std::unordered_map<const llvm::Function *, std::unique_ptr<DxilEntryProps>>;
+
 /// Use this class to manipulate DXIL of a shader.
 class DxilModule {
 public:
@@ -54,7 +59,7 @@ public:
   llvm::LLVMContext &GetCtx() const;
   llvm::Module *GetModule() const;
   OP *GetOP() const;
-  void SetShaderModel(const ShaderModel *pSM);
+  void SetShaderModel(const ShaderModel *pSM, bool bUseMinPrecision = true);
   const ShaderModel *GetShaderModel() const;
   void GetDxilVersion(unsigned &DxilMajor, unsigned &DxilMinor) const;
   void SetValidatorVersion(unsigned ValMajor, unsigned ValMinor);
@@ -78,8 +83,7 @@ public:
 
   // Flags.
   unsigned GetGlobalFlags() const;
-  // TODO: move out of DxilModule as a util.
-  void CollectShaderFlags();
+  void CollectShaderFlagsForModule();
 
   // Resources.
   unsigned AddCBuffer(std::unique_ptr<DxilCBuffer> pCB);
@@ -102,16 +106,12 @@ public:
   const DxilResource &GetUAV(unsigned idx) const;
   const std::vector<std::unique_ptr<DxilResource> > &GetUAVs() const;
 
-  void CreateResourceLinkInfo();
-  struct ResourceLinkInfo;
-  const ResourceLinkInfo &GetResourceLinkInfo(DXIL::ResourceClass resClass,
-                                        unsigned rangeID) const;
-
   void LoadDxilResourceBaseFromMDNode(llvm::MDNode *MD, DxilResourceBase &R);
   void LoadDxilResourceFromMDNode(llvm::MDNode *MD, DxilResource &R);
   void LoadDxilSamplerFromMDNode(llvm::MDNode *MD, DxilSampler &S);
 
   void RemoveUnusedResources();
+  void RemoveUnusedResourceSymbols();
   void RemoveFunction(llvm::Function *F);
 
   // Signatures.
@@ -122,16 +122,30 @@ public:
   DxilSignature &GetPatchConstantSignature();
   const DxilSignature &GetPatchConstantSignature() const;
   const RootSignatureHandle &GetRootSignature() const;
-  bool HasDxilEntrySignature(llvm::Function *F) const;
-  DxilEntrySignature &GetDxilEntrySignature(llvm::Function *F);
-  // Move DxilEntrySignature of F to NewF.
-  void ReplaceDxilEntrySignature(llvm::Function *F, llvm::Function *NewF);
+
+  bool HasDxilEntrySignature(const llvm::Function *F) const;
+  DxilEntrySignature &GetDxilEntrySignature(const llvm::Function *F);
+  // Move DxilEntryProps of F to NewF.
+  void ReplaceDxilEntryProps(llvm::Function *F, llvm::Function *NewF);
+  // Clone DxilEntryProps of F to NewF.
+  void CloneDxilEntryProps(llvm::Function *F, llvm::Function *NewF);
+  bool HasDxilEntryProps(const llvm::Function *F) const;
+  DxilEntryProps &GetDxilEntryProps(const llvm::Function *F);
 
   // DxilFunctionProps.
-  bool HasDxilFunctionProps(llvm::Function *F) const;
-  DxilFunctionProps &GetDxilFunctionProps(llvm::Function *F);
+  bool HasDxilFunctionProps(const llvm::Function *F) const;
+  DxilFunctionProps &GetDxilFunctionProps(const llvm::Function *F);
+  const DxilFunctionProps &GetDxilFunctionProps(const llvm::Function *F) const;
+
   // Patch constant function registration and per-function shader kind queries.
-  void ReplaceDxilFunctionProps(llvm::Function *F, llvm::Function *NewF);
+  void SetPatchConstantFunctionForHS(llvm::Function *hullShaderFunc, llvm::Function *patchConstantFunc);
+  bool IsGraphicsShader(const llvm::Function *F) const; // vs,hs,ds,gs,ps
+  bool IsPatchConstantShader(const llvm::Function *F) const;
+  bool IsComputeShader(const llvm::Function *F) const;
+
+  // Is an entry function that uses input/output signature conventions?
+  // Includes: vs/hs/ds/gs/ps/cs as well as the patch constant function.
+  bool IsEntryThatUsesSignatures(const llvm::Function *F) const ;
 
   // Remove Root Signature from module metadata
   void StripRootSignatureFromMetadata();
@@ -144,6 +158,7 @@ public:
   /// Emit llvm.used array to make sure that optimizations do not remove unreferenced globals.
   void EmitLLVMUsed();
   std::vector<llvm::GlobalVariable* > &GetLLVMUsed();
+  void ClearLLVMUsed();
 
   // ViewId state.
   DxilViewIdState &GetViewIdState();
@@ -166,12 +181,7 @@ public:
   void ResetRootSignature(RootSignatureHandle *pValue);
   void ResetTypeSystem(DxilTypeSystem *pValue);
   void ResetOP(hlsl::OP *hlslOP);
-  void ResetFunctionPropsMap(
-      std::unordered_map<llvm::Function *, std::unique_ptr<DxilFunctionProps>>
-          &&propsMap);
-  void ResetEntrySignatureMap(
-      std::unordered_map<llvm::Function *, std::unique_ptr<DxilEntrySignature>>
-          &&SigMap);
+  void ResetEntryPropsMap(DxilEntryPropsMap &&PropMap);
 
   void StripDebugRelatedCode();
   llvm::DebugInfoFinder &GetOrCreateDebugInfoFinder();
@@ -201,135 +211,16 @@ public:
   static bool PreservesFastMathFlags(const llvm::Instruction *inst);
 
 public:
-  // Shader properties.
-  class ShaderFlags {
-  public:
-    ShaderFlags();
-
-    unsigned GetGlobalFlags() const;
-    void SetDisableOptimizations(bool flag) { m_bDisableOptimizations = flag; }
-    bool GetDisableOptimizations() const { return m_bDisableOptimizations; }
-
-    void SetDisableMathRefactoring(bool flag) { m_bDisableMathRefactoring = flag; }
-    bool GetDisableMathRefactoring() const { return m_bDisableMathRefactoring; }
-
-    void SetEnableDoublePrecision(bool flag) { m_bEnableDoublePrecision = flag; }
-    bool GetEnableDoublePrecision() const { return m_bEnableDoublePrecision; }
-
-    void SetForceEarlyDepthStencil(bool flag) { m_bForceEarlyDepthStencil = flag; }
-    bool GetForceEarlyDepthStencil() const { return m_bForceEarlyDepthStencil; }
-
-    void SetEnableRawAndStructuredBuffers(bool flag) { m_bEnableRawAndStructuredBuffers = flag; }
-    bool GetEnableRawAndStructuredBuffers() const { return m_bEnableRawAndStructuredBuffers; }
-
-    void SetLowPrecisionPresent(bool flag) { m_bLowPrecisionPresent = flag; }
-    bool GetLowPrecisionPresent() const { return m_bLowPrecisionPresent; }
-
-    void SetEnableDoubleExtensions(bool flag) { m_bEnableDoubleExtensions = flag; }
-    bool GetEnableDoubleExtensions() const { return m_bEnableDoubleExtensions; }
-
-    void SetEnableMSAD(bool flag) { m_bEnableMSAD = flag; }
-    bool GetEnableMSAD() const { return m_bEnableMSAD; }
-
-    void SetAllResourcesBound(bool flag) { m_bAllResourcesBound = flag; }
-    bool GetAllResourcesBound() const { return m_bAllResourcesBound; }
-
-    uint64_t GetFeatureInfo() const;
-    void SetCSRawAndStructuredViaShader4X(bool flag) { m_bCSRawAndStructuredViaShader4X = flag; }
-    bool GetCSRawAndStructuredViaShader4X() const { return m_bCSRawAndStructuredViaShader4X; }
-
-    void SetROVs(bool flag) { m_bROVS = flag; }
-    bool GetROVs() const { return m_bROVS; }
-
-    void SetWaveOps(bool flag) { m_bWaveOps = flag; }
-    bool GetWaveOps() const { return m_bWaveOps; }
-
-    void SetInt64Ops(bool flag) { m_bInt64Ops = flag; }
-    bool GetInt64Ops() const { return m_bInt64Ops; }
-
-    void SetTiledResources(bool flag) { m_bTiledResources = flag; }
-    bool GetTiledResources() const { return m_bTiledResources; }
-
-    void SetStencilRef(bool flag) { m_bStencilRef = flag; }
-    bool GetStencilRef() const { return m_bStencilRef; }
-
-    void SetInnerCoverage(bool flag) { m_bInnerCoverage = flag; }
-    bool GetInnerCoverage() const { return m_bInnerCoverage; }
-
-    void SetViewportAndRTArrayIndex(bool flag) { m_bViewportAndRTArrayIndex = flag; }
-    bool GetViewportAndRTArrayIndex() const { return m_bViewportAndRTArrayIndex; }
-
-    void SetUAVLoadAdditionalFormats(bool flag) { m_bUAVLoadAdditionalFormats = flag; }
-    bool GetUAVLoadAdditionalFormats() const { return m_bUAVLoadAdditionalFormats; }
-
-    void SetLevel9ComparisonFiltering(bool flag) { m_bLevel9ComparisonFiltering = flag; }
-    bool GetLevel9ComparisonFiltering() const { return m_bLevel9ComparisonFiltering; }
-
-    void Set64UAVs(bool flag) { m_b64UAVs = flag; }
-    bool Get64UAVs() const { return m_b64UAVs; }
-
-    void SetUAVsAtEveryStage(bool flag) { m_UAVsAtEveryStage = flag; }
-    bool GetUAVsAtEveryStage() const { return m_UAVsAtEveryStage; }
-
-    void SetViewID(bool flag) { m_bViewID = flag; }
-    bool GetViewID() const { return m_bViewID; }
-
-    void SetBarycentrics(bool flag) { m_bBarycentrics = flag; }
-    bool GetBarycentrics() const { return m_bBarycentrics; }
-
-    void SetUseNativeLowPrecision(bool flag) { m_bUseNativeLowPrecision = flag; }
-    bool GetUseNativeLowPrecision() const { return m_bUseNativeLowPrecision; }
-
-    static uint64_t GetShaderFlagsRawForCollection(); // some flags are collected (eg use 64-bit), some provided (eg allow refactoring)
-    uint64_t GetShaderFlagsRaw() const;
-    void SetShaderFlagsRaw(uint64_t data);
-
-  private:
-    unsigned m_bDisableOptimizations :1;   // D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION
-    unsigned m_bDisableMathRefactoring :1; //~D3D10_SB_GLOBAL_FLAG_REFACTORING_ALLOWED
-    unsigned m_bEnableDoublePrecision :1; // D3D11_SB_GLOBAL_FLAG_ENABLE_DOUBLE_PRECISION_FLOAT_OPS
-    unsigned m_bForceEarlyDepthStencil :1; // D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL
-    unsigned m_bEnableRawAndStructuredBuffers :1; // D3D11_SB_GLOBAL_FLAG_ENABLE_RAW_AND_STRUCTURED_BUFFERS
-    unsigned m_bLowPrecisionPresent :1; // D3D11_1_SB_GLOBAL_FLAG_ENABLE_MINIMUM_PRECISION
-    unsigned m_bEnableDoubleExtensions :1; // D3D11_1_SB_GLOBAL_FLAG_ENABLE_DOUBLE_EXTENSIONS
-    unsigned m_bEnableMSAD :1;        // D3D11_1_SB_GLOBAL_FLAG_ENABLE_SHADER_EXTENSIONS
-    unsigned m_bAllResourcesBound :1; // D3D12_SB_GLOBAL_FLAG_ALL_RESOURCES_BOUND
-
-    unsigned m_bViewportAndRTArrayIndex :1;   // SHADER_FEATURE_VIEWPORT_AND_RT_ARRAY_INDEX_FROM_ANY_SHADER_FEEDING_RASTERIZER
-    unsigned m_bInnerCoverage :1;             // SHADER_FEATURE_INNER_COVERAGE
-    unsigned m_bStencilRef  :1;               // SHADER_FEATURE_STENCIL_REF
-    unsigned m_bTiledResources  :1;           // SHADER_FEATURE_TILED_RESOURCES
-    unsigned m_bUAVLoadAdditionalFormats :1;  // SHADER_FEATURE_TYPED_UAV_LOAD_ADDITIONAL_FORMATS
-    unsigned m_bLevel9ComparisonFiltering :1; // SHADER_FEATURE_LEVEL_9_COMPARISON_FILTERING
-                                              // SHADER_FEATURE_11_1_SHADER_EXTENSIONS shared with EnableMSAD
-    unsigned m_b64UAVs :1;                    // SHADER_FEATURE_64_UAVS
-    unsigned m_UAVsAtEveryStage :1;           // SHADER_FEATURE_UAVS_AT_EVERY_STAGE
-    unsigned m_bCSRawAndStructuredViaShader4X : 1; // SHADER_FEATURE_COMPUTE_SHADERS_PLUS_RAW_AND_STRUCTURED_BUFFERS_VIA_SHADER_4_X
-    
-    // SHADER_FEATURE_COMPUTE_SHADERS_PLUS_RAW_AND_STRUCTURED_BUFFERS_VIA_SHADER_4_X is specifically
-    // about shader model 4.x.
-
-    unsigned m_bROVS :1;              // SHADER_FEATURE_ROVS
-    unsigned m_bWaveOps :1;           // SHADER_FEATURE_WAVE_OPS
-    unsigned m_bInt64Ops :1;          // SHADER_FEATURE_INT64_OPS
-    unsigned m_bViewID : 1;           // SHADER_FEATURE_VIEWID
-    unsigned m_bBarycentrics : 1;     // SHADER_FEATURE_BARYCENTRICS
-
-    unsigned m_bUseNativeLowPrecision : 1;
-
-    unsigned m_align0 : 8;        // align to 32 bit.
-    uint32_t m_align1;            // align to 64 bit.
-  };
-
   ShaderFlags m_ShaderFlags;
-  void CollectShaderFlags(ShaderFlags &Flags);
+  void CollectShaderFlagsForModule(ShaderFlags &Flags);
 
   // Check if DxilModule contains multi component UAV Loads.
   // This function must be called after unused resources are removed from DxilModule
   bool ModuleHasMulticomponentUAVLoads();
 
   // Compute shader.
-  unsigned m_NumThreads[3];
+  void SetNumThreads(unsigned x, unsigned y, unsigned z);
+  unsigned GetNumThreads(unsigned idx) const;
 
   // Geometry shader.
   DXIL::InputPrimitive GetInputPrimitive() const;
@@ -347,6 +238,14 @@ public:
   void SetActiveStreamMask(unsigned Mask);
   unsigned GetActiveStreamMask() const;
 
+  // Language options
+  // UseMinPrecision must be set at SetShaderModel time.
+  bool GetUseMinPrecision() const;
+  void SetDisableOptimization(bool disableOptimization);
+  bool GetDisableOptimization() const;
+  void SetAllResourcesBound(bool resourcesBound);
+  bool GetAllResourcesBound() const;
+
   // Hull and Domain shaders.
   unsigned GetInputControlPointCount() const;
   void SetInputControlPointCount(unsigned NumICPs);
@@ -363,18 +262,15 @@ public:
   float GetMaxTessellationFactor() const;
   void SetMaxTessellationFactor(float MaxTessellationFactor);
 
-  void SetShaderProperties(DxilFunctionProps *props);
+  // AutoBindingSpace also enables automatic binding for libraries if set.
+  // UINT_MAX == unset
+  void SetAutoBindingSpace(uint32_t Space);
+  uint32_t GetAutoBindingSpace() const;
 
-  // Shader resource information only needed before linking.
-  // Use constant as rangeID for resource in a library.
-  // When link the library, replace these constants with real rangeID.
-  struct ResourceLinkInfo {
-    llvm::Constant *ResRangeID;
-  };
+  void SetShaderProperties(DxilFunctionProps *props);
 
 private:
   // Signatures.
-  std::unique_ptr<DxilEntrySignature> m_EntrySignature;
   std::unique_ptr<RootSignatureHandle> m_RootSignature;
 
   // Shader resources.
@@ -383,34 +279,14 @@ private:
   std::vector<std::unique_ptr<DxilCBuffer> > m_CBuffers;
   std::vector<std::unique_ptr<DxilSampler> > m_Samplers;
 
-  // Save resource link for library, when link replace it with real resource ID.
-  std::vector<ResourceLinkInfo> m_SRVsLinkInfo;
-  std::vector<ResourceLinkInfo> m_UAVsLinkInfo;
-  std::vector<ResourceLinkInfo> m_CBuffersLinkInfo;
-  std::vector<ResourceLinkInfo> m_SamplersLinkInfo;
-
   // Geometry shader.
-  DXIL::InputPrimitive m_InputPrimitive;
-  unsigned m_MaxVertexCount;
   DXIL::PrimitiveTopology m_StreamPrimitiveTopology;
   unsigned m_ActiveStreamMask;
-  unsigned m_NumGSInstances;
-
-  // Hull and Domain shaders.
-  unsigned m_InputControlPointCount;
-  DXIL::TessellatorDomain m_TessellatorDomain;
-
-  // Hull shader.
-  unsigned m_OutputControlPointCount;
-  DXIL::TessellatorPartitioning m_TessellatorPartitioning;
-  DXIL::TessellatorOutputPrimitive m_TessellatorOutputPrimitive;
-  float m_MaxTessellationFactor;
 
 private:
   llvm::LLVMContext &m_Ctx;
   llvm::Module *m_pModule;
   llvm::Function *m_pEntryFunc;
-  llvm::Function *m_pPatchConstantFunc;
   std::string m_EntryName;
   std::unique_ptr<DxilMDHelper> m_pMDHelper;
   std::unique_ptr<llvm::DebugInfoFinder> m_pDebugInfoFinder;
@@ -429,12 +305,11 @@ private:
   // Type annotations.
   std::unique_ptr<DxilTypeSystem> m_pTypeSystem;
 
-  // Function properties for shader functions.
-  std::unordered_map<llvm::Function *, std::unique_ptr<DxilFunctionProps>>
-      m_DxilFunctionPropsMap;
-  // EntrySig for shader functions.
-  std::unordered_map<llvm::Function *, std::unique_ptr<DxilEntrySignature>>
-      m_DxilEntrySignatureMap;
+  // EntryProps for shader functions.
+  DxilEntryPropsMap  m_DxilEntryPropsMap;
+
+  // Keeps track of patch constant functions used by hull shaders
+  std::unordered_set<const llvm::Function *>  m_PatchConstantFunctions;
 
   // ViewId state.
   std::unique_ptr<DxilViewIdState> m_pViewIdState;
@@ -442,14 +317,16 @@ private:
   // DXIL metadata serialization/deserialization.
   llvm::MDTuple *EmitDxilResources();
   void LoadDxilResources(const llvm::MDOperand &MDO);
-  void EmitDxilResourcesLinkInfo();
-  void LoadDxilResourcesLinkInfo();
-  llvm::MDTuple *EmitDxilShaderProperties();
-  void LoadDxilShaderProperties(const llvm::MDOperand &MDO);
 
   // Helpers.
   template<typename T> unsigned AddResource(std::vector<std::unique_ptr<T> > &Vec, std::unique_ptr<T> pRes);
   void LoadDxilSignature(const llvm::MDTuple *pSigTuple, DxilSignature &Sig, bool bInput);
+
+  // properties from HLModule
+  bool m_bDisableOptimizations;
+  bool m_bUseMinPrecision;
+  bool m_bAllResourcesBound;
+  uint32_t m_AutoBindingSpace;
 };
 
 } // namespace hlsl

+ 12 - 5
include/dxc/HLSL/DxilOperations.h

@@ -23,6 +23,7 @@ class Instruction;
 }
 #include "llvm/IR/Attributes.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/DenseMap.h"
 
 #include "DxilConstants.h"
 #include <unordered_map>
@@ -42,7 +43,7 @@ public:
   void RefreshCache();
 
   llvm::Function *GetOpFunc(OpCode OpCode, llvm::Type *pOverloadType);
-  llvm::ArrayRef<llvm::Function *> GetOpFuncList(OpCode OpCode) const;
+  const llvm::SmallDenseMap<llvm::Type *, llvm::Function *, 8> &GetOpFuncList(OpCode OpCode) const;
   void RemoveFunction(llvm::Function *F);
   llvm::Type *GetOverloadType(OpCode OpCode, llvm::Function *F);
   llvm::LLVMContext &GetCtx() { return m_Ctx; }
@@ -99,6 +100,9 @@ public:
   static bool IsDupDxilOpType(llvm::StructType *ST);
   static llvm::StructType *GetOriginalDxilOpType(llvm::StructType *ST,
                                                  llvm::Module &M);
+  static void GetMinShaderModelAndMask(OpCode C, bool bWithTranslation,
+                                       unsigned &major, unsigned &minor,
+                                       unsigned &mask);
 
 private:
   // Per-module properties.
@@ -115,17 +119,19 @@ private:
 
   DXIL::LowPrecisionMode m_LowPrecisionMode;
 
-  static const unsigned kNumTypeOverloads = 9;
+  static const unsigned kUserDefineTypeSlot = 9;
+  static const unsigned kObjectTypeSlot = 10;
+  static const unsigned kNumTypeOverloads = 11; // void, h,f,d, i1, i8,i16,i32,i64, udt, obj
 
   llvm::Type *m_pResRetType[kNumTypeOverloads];
   llvm::Type *m_pCBufferRetType[kNumTypeOverloads];
 
   struct OpCodeCacheItem {
-    llvm::Function *pOverloads[kNumTypeOverloads];
+    llvm::SmallDenseMap<llvm::Type *, llvm::Function *, 8> pOverloads;
   };
   OpCodeCacheItem m_OpCodeClassCache[(unsigned)OpCodeClass::NumOpClasses];
   std::unordered_map<const llvm::Function *, OpCodeClass> m_FunctionToOpClass;
-  void UpdateCache(OpCodeClass opClass, unsigned typeSlot, llvm::Function *F);
+  void UpdateCache(OpCodeClass opClass, llvm::Type * Ty, llvm::Function *F);
 private:
   // Static properties.
   struct OpCodeProperty {
@@ -133,7 +139,7 @@ private:
     const char *pOpCodeName;
     OpCodeClass opCodeClass;
     const char *pOpCodeClassName;
-    bool bAllowOverload[kNumTypeOverloads];   // void, h,f,d, i1, i8,i16,i32,i64
+    bool bAllowOverload[kNumTypeOverloads];   // void, h,f,d, i1, i8,i16,i32,i64, udt
     llvm::Attribute::AttrKind FuncAttr;
   };
   static const OpCodeProperty m_OpCodeProps[(unsigned)OpCode::NumOpCodes];
@@ -144,6 +150,7 @@ private:
   static const char *m_MatrixTypePrefix;
   static unsigned GetTypeSlot(llvm::Type *pType);
   static const char *GetOverloadTypeName(unsigned TypeSlot);
+  static llvm::StringRef GetTypeName(llvm::Type *Ty, std::string &str);
 };
 
 } // namespace hlsl

+ 45 - 15
include/dxc/HLSL/DxilPipelineStateValidation.h

@@ -14,7 +14,9 @@
 
 #include <stdint.h>
 #include <string.h>
-
+#ifndef UINT_MAX
+#define UINT_MAX 0xffffffff
+#endif
 // How many dwords are required for mask with one bit per component, 4 components per vector
 inline uint32_t PSVComputeMaskDwordsFromVectors(uint32_t Vectors) { return (Vectors + 7) >> 3; }
 inline uint32_t PSVComputeInputOutputTableSize(uint32_t InputVectors, uint32_t OutputVectors) {
@@ -70,6 +72,13 @@ enum class PSVShaderKind : uint8_t    // DXIL::ShaderKind
   Hull,
   Domain,
   Compute,
+  Library,
+  RayGeneration,
+  Intersection,
+  AnyHit,
+  ClosestHit,
+  Miss,
+  Callable,
   Invalid,
 };
 
@@ -105,10 +114,43 @@ enum class PSVResourceType
   UAVRaw,
   UAVStructured,
   UAVStructuredWithCounter,
+  NumEntries
+};
 
+enum class PSVResourceKind // NOTE(review): presumably mirrors DXIL::ResourceKind, as the other PSV*Kind enums mirror their DXIL counterparts — confirm
+{
+  Invalid = 0,
+  Texture1D,
+  Texture2D,
+  Texture2DMS,
+  Texture3D,
+  TextureCube,
+  Texture1DArray,
+  Texture2DArray,
+  Texture2DMSArray,
+  TextureCubeArray,
+  TypedBuffer,
+  RawBuffer,
+  StructuredBuffer,
+  CBuffer,
+  Sampler,
+  TBuffer,
+  RTAccelerationStructure,
   NumEntries // sentinel: last enumerator, so its value equals the number of kinds listed above
 };
 
+// Non-owning view of a table of null-terminated strings; overall size is aligned to a dword boundary and the last byte must be null
+struct PSVStringTable {
+  const char *Table; // start of the string data; stored as given (nullptr when default-constructed)
+  uint32_t Size; // table size as supplied by the caller (0 when default-constructed); presumably in bytes — confirm
+  PSVStringTable() : Table(nullptr), Size(0) {}
+  PSVStringTable(const char *table, uint32_t size) : Table(table), Size(size) {}
+  const char *Get(uint32_t offset) const { // returns a pointer `offset` chars past the start of Table
+    _Analysis_assume_(offset < Size && Table && Table[Size-1] == '\0'); // NOTE(review): appears to be an analyzer hint only; no runtime bounds check is visible here — confirm callers validate offset
+    return Table + offset;
+  }
+};
+
 // Versioning is additive and based on size
 struct PSVResourceBindInfo0
 {
@@ -117,7 +159,6 @@ struct PSVResourceBindInfo0
   uint32_t LowerBound;
   uint32_t UpperBound;
 };
-// PSVResourceBindInfo1 would derive and extend
 
 // Helpers for output dependencies (ViewID and Input-Output tables)
 struct PSVComponentMask {
@@ -171,17 +212,6 @@ struct PSVDependencyTable {
   bool IsValid() { return Table != nullptr; }
 };
 
-// Table of null-terminated strings, overall size aligned to dword boundary, last byte must be null
-struct PSVStringTable {
-  const char *Table;
-  uint32_t Size;
-  PSVStringTable() : Table(nullptr), Size(0) {}
-  PSVStringTable(const char *table, uint32_t size) : Table(table), Size(size) {}
-  const char *Get(uint32_t offset) const {
-    _Analysis_assume_(offset < Size && Table && Table[Size-1] == '\0');
-    return Table + offset;
-  }
-};
 struct PSVString {
   uint32_t Offset;
   PSVString() : Offset(0) {}
@@ -243,7 +273,7 @@ enum class PSVSemanticKind : uint8_t    // DXIL::SemanticKind
 
 struct PSVSignatureElement0
 {
-  uint32_t SemanticName;          // Offset into PSVStringTable
+  uint32_t SemanticName;          // Offset into StringTable
   uint32_t SemanticIndexes;       // Offset into PSVSemanticIndexTable, count == Rows
   uint8_t Rows;                   // Number of rows this element occupies
   uint8_t StartRow;               // Starting row of packing location if allocated
@@ -329,7 +359,7 @@ class DxilPipelineStateValidation
   uint32_t* m_pPCInputToOutputTable;
 
 public:
-  DxilPipelineStateValidation() : 
+  DxilPipelineStateValidation() :
     m_uPSVRuntimeInfoSize(0),
     m_pPSVRuntimeInfo0(nullptr),
     m_pPSVRuntimeInfo1(nullptr),

+ 473 - 0
include/dxc/HLSL/DxilRuntimeReflection.h

@@ -0,0 +1,473 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilRuntimeReflection.h                                                   //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Defines shader reflection for runtime usage.                              //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+#include "DxilConstants.h"
+
+namespace hlsl {
+namespace RDAT {
+
+// Data Layout:
+// -start:
+//  RuntimeDataHeader header;
+//  uint32_t offsets[header.PartCount];
+//  - for each i in header.PartCount:
+//    - at &header + offsets[i]:
+//      RuntimeDataPartHeader part;
+//    - if part.Type is a Table (Function or Resource):
+//      RuntimeDataTableHeader table;
+//      byte TableData[table.RecordCount][table.RecordStride];
+//    - else if part.Type is String:
+//      byte UTF8Data[part.Size];
+//    - else if part.Type is Index:
+//      uint32_t IndexData[part.Size / 4];
+
+// Identifies the payload kind of one RDAT part. The value is read straight
+// from the serialized RuntimeDataPartHeader::Type field, so each enumerator's
+// numeric value is part of the binary format.
+enum class RuntimeDataPartType : uint32_t { // TODO: Rename: PartType
+  Invalid = 0,
+  StringBuffer = 1,   // raw string bytes, consumed by StringTableReader
+  IndexArrays = 2,    // uint32_t rows, consumed by IndexTableReader
+  ResourceTable = 3,  // RuntimeDataTableHeader + resource records
+  FunctionTable = 4,  // RuntimeDataTableHeader + function records
+};
+
+// Version tag stored in RuntimeDataHeader::Version. The prerelease layout
+// starts with a part count in the same position, so the reader treats any
+// value below RDAT_Version_0 as prerelease data.
+enum RuntimeDataVersion {
+  // Cannot be mistaken for part count from prerelease version
+  RDAT_Version_0 = 0x10,
+};
+
+// Header located at the very beginning of an RDAT blob.
+struct RuntimeDataHeader {
+  uint32_t Version;   // compared against RuntimeDataVersion by the reader
+  uint32_t PartCount; // number of entries in the offsets array that follows
+  // Followed by uint32_t array of offsets to parts
+  // offsets are relative to the beginning of this header
+  // offsets must be 4-byte aligned
+  //  uint32_t offsets[];
+};
+// Header preceding the data of every part referenced by the offsets array.
+struct RuntimeDataPartHeader {
+  RuntimeDataPartType Type; // selects how the part data is interpreted
+  uint32_t Size;  // Not including this header.  Must be 4-byte aligned.
+  // Followed by part data
+  //  byte Data[ALIGN4(Size)];
+};
+
+// For tables of records, such as Function and Resource tables
+// Stride allows for extending records, with forward and backward compatibility
+// This header is the first thing inside the data of a table part.
+struct RuntimeDataTableHeader {
+  uint32_t RecordCount;   // number of records that follow
+  uint32_t RecordStride;  // Must be 4-byte aligned.
+  // Followed by recordCount records of recordStride size
+  // byte TableData[RecordCount * RecordStride];
+};
+
+// Reader over a block of fixed-stride records. Row<T>() reinterprets one
+// record as T and yields nullptr when the index is past the end or when the
+// stored stride is too small to hold a T (i.e. the data predates T's layout).
+class TableReader {
+  const char *m_table;
+  uint32_t m_count;
+  uint32_t m_stride;
+
+public:
+  TableReader() : TableReader(nullptr, 0, 0) {}
+  TableReader(const char *table, uint32_t count, uint32_t stride)
+    : m_table(table), m_count(count), m_stride(stride) {}
+
+  // Re-points the reader at a different table.
+  void Init(const char *table, uint32_t count, uint32_t stride) {
+    m_table = table;
+    m_count = count;
+    m_stride = stride;
+  }
+
+  const char *Data() const { return m_table; }
+  uint32_t Count() const { return m_count; }
+  uint32_t Stride() const { return m_stride; }
+
+  template<typename T>
+  const T *Row(uint32_t index) const {
+    if (index >= m_count)
+      return nullptr;
+    if (m_stride < sizeof(T))
+      return nullptr;
+    const char *pRecord = m_table + (m_stride * index);
+    return reinterpret_cast<const T*>(pRecord);
+  }
+};
+
+
+// Index table is a flat uint32_t array holding a sequence of rows, where each
+// row is a count element followed by that many values. A row is addressed by
+// the offset (in uint32_t units) of its count element.
+class IndexTableReader {
+private:
+  const uint32_t *m_table;
+  uint32_t m_size; // number of uint32_t elements available in m_table
+
+public:
+  // Lightweight view of the values of one row.
+  class IndexRow {
+  private:
+    const uint32_t *m_values;
+    const uint32_t m_count;
+
+  public:
+    IndexRow(const uint32_t *values, uint32_t count)
+        : m_values(values), m_count(count) {}
+    uint32_t Count() const { return m_count; }
+    // Returns the i-th value of the row. An out-of-range i yields 0xFFFFFFFF
+    // (the all-ones value the function records already use for "no row")
+    // instead of reading outside the row.
+    uint32_t At(uint32_t i) const {
+      return (m_values && i < m_count) ? m_values[i] : 0xFFFFFFFFu;
+    }
+  };
+
+  IndexTableReader() : m_table(nullptr), m_size(0) {}
+  IndexTableReader(const uint32_t *table, uint32_t size)
+      : m_table(table), m_size(size) {}
+
+  void SetTable(const uint32_t *table) { m_table = table; }
+
+  // size is the element count of the table; rows are validated against it,
+  // so it must be set for getRow to return anything but empty rows.
+  void SetSize(uint32_t size) { m_size = size; }
+
+  // Returns the row whose count element sits at offset i. The table comes
+  // from serialized data, so both the offset and the declared row length are
+  // checked against the table size; a malformed row is reported as empty.
+  IndexRow getRow(uint32_t i) const {
+    if (!m_table || i >= m_size)
+      return IndexRow(nullptr, 0);
+    const uint32_t count = m_table[i];
+    // i < m_size here, so m_size - i - 1 cannot underflow.
+    if (count > m_size - i - 1)
+      return IndexRow(nullptr, 0);
+    return IndexRow(m_table + i + 1, count);
+  }
+};
+
+// Reader over a buffer of null-terminated strings addressed by byte offset.
+class StringTableReader {
+  const char *m_table;
+  uint32_t m_size; // size of m_table in bytes
+public:
+  StringTableReader() : m_table(nullptr), m_size(0) {}
+  StringTableReader(const char *table, uint32_t size)
+      : m_table(table), m_size(size) {}
+  // Returns the string starting at the given byte offset. Offsets come from
+  // serialized records, so they are validated at runtime: a missing table, an
+  // offset outside the table, or a table whose final byte is not a null
+  // terminator all yield the empty string rather than a pointer that could
+  // be read past the end of the buffer.
+  const char *Get(uint32_t offset) const {
+    if (!m_table || offset >= m_size || m_table[m_size - 1] != '\0')
+      return "";
+    return m_table + offset;
+  }
+};
+
+// Bit flags carried in RuntimeDataResourceInfo::Flags and
+// DxilResourceDesc::Flags; each enumerator occupies a distinct bit.
+enum class DxilResourceFlag : uint32_t {
+  None                      = 0,
+  UAVGloballyCoherent       = 1 << 0,
+  UAVCounter                = 1 << 1,
+  UAVRasterizerOrderedView  = 1 << 2,
+  DynamicIndexing           = 1 << 3,
+};
+
+// Serialized resource record of the ResourceTable part. Records are read in
+// place from the RDAT buffer through TableReader::Row, so field order and
+// sizes are part of the binary format.
+struct RuntimeDataResourceInfo {
+  uint32_t Class; // hlsl::DXIL::ResourceClass
+  uint32_t Kind;  // hlsl::DXIL::ResourceKind
+  uint32_t ID;    // id per class
+  uint32_t Space;
+  uint32_t LowerBound;
+  uint32_t UpperBound;
+  uint32_t Name;  // resource name as an offset for string table
+  uint32_t Flags; // hlsl::RDAT::DxilResourceFlag
+};
+
+// Serialized function record of the FunctionTable part. Records are read in
+// place from the RDAT buffer through TableReader::Row, so field order and
+// sizes are part of the binary format.
+struct RuntimeDataFunctionInfo {
+  uint32_t Name;                 // offset for string table
+  uint32_t UnmangledName;        // offset for string table
+  uint32_t Resources;            // index to an index table; the row holds
+                                 // resource table indices. UINT_MAX = none
+  uint32_t FunctionDependencies; // index to a list of functions that function
+                                 // depends on; the row holds string table
+                                 // offsets. UINT_MAX = none
+  uint32_t ShaderKind;           // cast to hlsl::DXIL::ShaderKind by readers
+  uint32_t PayloadSizeInBytes;   // 1) hit, miss, or closest shader: payload count
+                                 // 2) call shader: parameter size
+  uint32_t AttributeSizeInBytes; // attribute size for closest hit and any hit
+  uint32_t FeatureInfo1;         // first 32 bits of feature flag
+  uint32_t FeatureInfo2;         // second 32 bits of feature flag
+  uint32_t ShaderStageFlag;      // valid shader stage flag.
+  uint32_t MinShaderTarget;      // minimum shader target.
+};
+
+class ResourceTableReader;
+class FunctionTableReader;
+
+// Bundle of non-owning pointers to the four part readers. Record readers use
+// it to resolve string offsets, index rows and resource indices that a record
+// stores only as numbers.
+struct RuntimeDataContext {
+  StringTableReader *pStringTableReader;
+  IndexTableReader *pIndexTableReader;
+  ResourceTableReader *pResourceTableReader;
+  FunctionTableReader *pFunctionTableReader;
+};
+
+// Read-only view of one RuntimeDataResourceInfo record. The record pointer
+// may be null; every accessor then answers with a neutral value (0, Invalid
+// or the empty string) instead of dereferencing it.
+class ResourceReader {
+private:
+  const RuntimeDataResourceInfo *m_ResourceInfo;
+  RuntimeDataContext *m_Context;
+
+public:
+  ResourceReader(const RuntimeDataResourceInfo *resInfo,
+                 RuntimeDataContext *context)
+      : m_ResourceInfo(resInfo), m_Context(context) {}
+
+  hlsl::DXIL::ResourceClass GetResourceClass() const {
+    if (m_ResourceInfo)
+      return (hlsl::DXIL::ResourceClass)m_ResourceInfo->Class;
+    return hlsl::DXIL::ResourceClass::Invalid;
+  }
+  hlsl::DXIL::ResourceKind GetResourceKind() const {
+    if (m_ResourceInfo)
+      return (hlsl::DXIL::ResourceKind)m_ResourceInfo->Kind;
+    return hlsl::DXIL::ResourceKind::Invalid;
+  }
+  uint32_t GetID() const { return m_ResourceInfo ? m_ResourceInfo->ID : 0; }
+  uint32_t GetSpace() const { return m_ResourceInfo ? m_ResourceInfo->Space : 0; }
+  uint32_t GetLowerBound() const { return m_ResourceInfo ? m_ResourceInfo->LowerBound : 0; }
+  uint32_t GetUpperBound() const { return m_ResourceInfo ? m_ResourceInfo->UpperBound : 0; }
+  uint32_t GetFlags() const { return m_ResourceInfo ? m_ResourceInfo->Flags : 0; }
+  // The name is stored as a string table offset and resolved via the context.
+  const char *GetName() const {
+    if (!m_ResourceInfo)
+      return "";
+    return m_Context->pStringTableReader->Get(m_ResourceInfo->Name);
+  }
+};
+
+// Reader over the resource table. Besides plain indexed access it offers
+// per-class accessors that assume the records are sorted in the order
+// CBuffer, Sampler, SRV, UAV; the per-class counts are gathered when the
+// table is set.
+class ResourceTableReader {
+private:
+  TableReader m_Table;
+  RuntimeDataContext *m_Context;
+  uint32_t m_CBufferCount;
+  uint32_t m_SamplerCount;
+  uint32_t m_SRVCount;
+  uint32_t m_UAVCount;
+
+public:
+  ResourceTableReader()
+      : m_Context(nullptr), m_CBufferCount(0),
+        m_SamplerCount(0), m_SRVCount(0), m_UAVCount(0) {}
+
+  // Points the reader at count records of recordStride bytes starting at ptr
+  // and recounts the resources of each class.
+  void SetResourceInfo(const char *ptr, uint32_t count, uint32_t recordStride) {
+    m_Table.Init(ptr, count, recordStride);
+    // Assuming that resources are in order of CBuffer, Sampler, SRV, and UAV,
+    // count the number for each resource class
+    m_CBufferCount = 0;
+    m_SamplerCount = 0;
+    m_SRVCount = 0;
+    m_UAVCount = 0;
+
+    for (uint32_t i = 0; i < count; ++i) {
+      const RuntimeDataResourceInfo *curPtr =
+        m_Table.Row<RuntimeDataResourceInfo>(i);
+      // Row() yields nullptr when the stored stride is smaller than the
+      // record type; no row can be read in that case, so stop counting
+      // instead of dereferencing a null pointer.
+      if (!curPtr)
+        break;
+      const uint32_t resClass = curPtr->Class;
+      if (resClass == (uint32_t)hlsl::DXIL::ResourceClass::CBuffer)
+        m_CBufferCount++;
+      else if (resClass == (uint32_t)hlsl::DXIL::ResourceClass::Sampler)
+        m_SamplerCount++;
+      else if (resClass == (uint32_t)hlsl::DXIL::ResourceClass::SRV)
+        m_SRVCount++;
+      else if (resClass == (uint32_t)hlsl::DXIL::ResourceClass::UAV)
+        m_UAVCount++;
+    }
+  }
+
+  void SetContext(RuntimeDataContext *context) { m_Context = context; }
+
+  // Total of the four per-class counts (records of any other class are not
+  // included).
+  uint32_t GetNumResources() const {
+    return m_CBufferCount + m_SamplerCount + m_SRVCount + m_UAVCount;
+  }
+  // Returns a reader for record i; the reader wraps a null record when i is
+  // out of range.
+  ResourceReader GetItem(uint32_t i) const {
+    _Analysis_assume_(i < GetNumResources());
+    return ResourceReader(m_Table.Row<RuntimeDataResourceInfo>(i), m_Context);
+  }
+
+  uint32_t GetNumCBuffers() const { return m_CBufferCount; }
+  ResourceReader GetCBuffer(uint32_t i) const {
+    _Analysis_assume_(i < m_CBufferCount);
+    return ResourceReader(m_Table.Row<RuntimeDataResourceInfo>(i), m_Context);
+  }
+
+  uint32_t GetNumSamplers() const { return m_SamplerCount; }
+  ResourceReader GetSampler(uint32_t i) const {
+    _Analysis_assume_(i < m_SamplerCount);
+    uint32_t offset = (m_CBufferCount + i);
+    return ResourceReader(m_Table.Row<RuntimeDataResourceInfo>(offset), m_Context);
+  }
+
+  uint32_t GetNumSRVs() const { return m_SRVCount; }
+  ResourceReader GetSRV(uint32_t i) const {
+    _Analysis_assume_(i < m_SRVCount);
+    uint32_t offset = (m_CBufferCount + m_SamplerCount + i);
+    return ResourceReader(m_Table.Row<RuntimeDataResourceInfo>(offset), m_Context);
+  }
+
+  uint32_t GetNumUAVs() const { return m_UAVCount; }
+  ResourceReader GetUAV(uint32_t i) const {
+    _Analysis_assume_(i < m_UAVCount);
+    uint32_t offset = (m_CBufferCount + m_SamplerCount + m_SRVCount + i);
+    return ResourceReader(m_Table.Row<RuntimeDataResourceInfo>(offset), m_Context);
+  }
+};
+
+// Read-only view of one RuntimeDataFunctionInfo record. The record pointer
+// may be null; every accessor then answers with a neutral value (0, Invalid,
+// the empty string or a null ResourceReader).
+class FunctionReader {
+private:
+  const RuntimeDataFunctionInfo *m_RuntimeDataFunctionInfo;
+  RuntimeDataContext *m_Context;
+
+public:
+  FunctionReader() : m_RuntimeDataFunctionInfo(nullptr), m_Context(nullptr) {}
+  FunctionReader(const RuntimeDataFunctionInfo *functionInfo,
+                 RuntimeDataContext *context)
+      : m_RuntimeDataFunctionInfo(functionInfo), m_Context(context) {}
+
+  const char *GetName() const {
+    return !m_RuntimeDataFunctionInfo ? ""
+      : m_Context->pStringTableReader->Get(m_RuntimeDataFunctionInfo->Name);
+  }
+  const char *GetUnmangledName() const {
+    return !m_RuntimeDataFunctionInfo ? ""
+      : m_Context->pStringTableReader->Get(
+          m_RuntimeDataFunctionInfo->UnmangledName);
+  }
+  // 64-bit feature flag: FeatureInfo2 forms the high half, FeatureInfo1 the
+  // low half.
+  uint64_t GetFeatureFlag() const {
+    return (static_cast<uint64_t>(GetFeatureInfo2()) << 32)
+           | static_cast<uint64_t>(GetFeatureInfo1());
+  }
+  uint32_t GetFeatureInfo1() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->FeatureInfo1;
+  }
+  uint32_t GetFeatureInfo2() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->FeatureInfo2;
+  }
+
+  uint32_t GetShaderStageFlag() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->ShaderStageFlag;
+  }
+  uint32_t GetMinShaderTarget() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->MinShaderTarget;
+  }
+  // Number of resources referenced by the function; UINT_MAX in the record
+  // means the function has no resource row.
+  uint32_t GetNumResources() const {
+    if (!m_RuntimeDataFunctionInfo ||
+        m_RuntimeDataFunctionInfo->Resources == UINT_MAX)
+      return 0;
+    return m_Context->pIndexTableReader->getRow(
+      m_RuntimeDataFunctionInfo->Resources).Count();
+  }
+  // Returns the i-th resource used by the function. A missing record, the
+  // "no row" marker, or an i at or beyond GetNumResources() all yield a
+  // reader wrapping a null record, mirroring the checks in GetNumResources().
+  ResourceReader GetResource(uint32_t i) const {
+    if (!m_RuntimeDataFunctionInfo ||
+        m_RuntimeDataFunctionInfo->Resources == UINT_MAX)
+      return ResourceReader(nullptr, m_Context);
+    IndexTableReader::IndexRow row = m_Context->pIndexTableReader->getRow(
+      m_RuntimeDataFunctionInfo->Resources);
+    if (i >= row.Count())
+      return ResourceReader(nullptr, m_Context);
+    return m_Context->pResourceTableReader->GetItem(row.At(i));
+  }
+  // Number of function dependencies; UINT_MAX in the record means the
+  // function has no dependency row.
+  uint32_t GetNumDependencies() const {
+    if (!m_RuntimeDataFunctionInfo ||
+        m_RuntimeDataFunctionInfo->FunctionDependencies == UINT_MAX)
+      return 0;
+    return m_Context->pIndexTableReader->getRow(
+      m_RuntimeDataFunctionInfo->FunctionDependencies).Count();
+  }
+  // Returns the name of the i-th dependency (row values are string table
+  // offsets). A missing record, the "no row" marker, or an i at or beyond
+  // GetNumDependencies() all yield the empty string.
+  const char *GetDependency(uint32_t i) const {
+    if (!m_RuntimeDataFunctionInfo ||
+        m_RuntimeDataFunctionInfo->FunctionDependencies == UINT_MAX)
+      return "";
+    IndexTableReader::IndexRow row = m_Context->pIndexTableReader->getRow(
+      m_RuntimeDataFunctionInfo->FunctionDependencies);
+    if (i >= row.Count())
+      return "";
+    return m_Context->pStringTableReader->Get(row.At(i));
+  }
+
+  uint32_t GetPayloadSizeInBytes() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->PayloadSizeInBytes;
+  }
+  uint32_t GetAttributeSizeInBytes() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->AttributeSizeInBytes;
+  }
+  // payload (hit shaders) and parameters (call shaders) are mutually exclusive
+  // so both are read from the same PayloadSizeInBytes field.
+  uint32_t GetParameterSizeInBytes() const {
+    return !m_RuntimeDataFunctionInfo ? 0
+      : m_RuntimeDataFunctionInfo->PayloadSizeInBytes;
+  }
+  hlsl::DXIL::ShaderKind GetShaderKind() const {
+    return !m_RuntimeDataFunctionInfo ? hlsl::DXIL::ShaderKind::Invalid
+      : (hlsl::DXIL::ShaderKind)m_RuntimeDataFunctionInfo->ShaderKind;
+  }
+};
+
+// Reader over the function table; hands out FunctionReader views that share
+// this table's context.
+class FunctionTableReader {
+private:
+  TableReader m_Table;
+  RuntimeDataContext *m_context;
+
+public:
+  FunctionTableReader() : m_context(nullptr) {}
+
+  void SetContext(RuntimeDataContext *context) { m_context = context; }
+
+  // Points the reader at count records of recordStride bytes starting at ptr.
+  void SetFunctionInfo(const char *ptr, uint32_t count, uint32_t recordStride) {
+    m_Table.Init(ptr, count, recordStride);
+  }
+
+  uint32_t GetNumFunctions() const { return m_Table.Count(); }
+
+  // Returns a reader for record i; it wraps a null record when the row
+  // cannot be read.
+  FunctionReader GetItem(uint32_t i) const {
+    const RuntimeDataFunctionInfo *pInfo =
+      m_Table.Row<RuntimeDataFunctionInfo>(i);
+    return FunctionReader(pInfo, m_context);
+  }
+};
+
+// Parses an RDAT blob and exposes table readers over it. The readers keep
+// pointers into the buffer handed to InitFromRDAT, so that buffer has to stay
+// alive while the readers are used.
+// NOTE(review): m_Context stores the addresses of this object's own reader
+// members, so a copied instance would still point at the source object's
+// readers - confirm no caller copies this type.
+class DxilRuntimeData {
+private:
+  uint32_t m_TableCount;
+  StringTableReader m_StringReader;
+  IndexTableReader m_IndexTableReader;
+  ResourceTableReader m_ResourceTableReader;
+  FunctionTableReader m_FunctionTableReader;
+  RuntimeDataContext m_Context;
+
+public:
+  DxilRuntimeData();
+  // Sets up the context and calls InitFromRDAT(ptr, size); the result of
+  // that call is not reported by the constructor.
+  DxilRuntimeData(const char *ptr, size_t size);
+  // initializing reader from RDAT. return true if no error has occurred.
+  bool InitFromRDAT(const void *pRDAT, size_t size);
+  // read prerelease data:
+  bool InitFromRDAT_Prerelease(const void *pRDAT, size_t size);
+  FunctionTableReader *GetFunctionTableReader();
+  ResourceTableReader *GetResourceTableReader();
+};
+
+//////////////////////////////////
+/// structures for library runtime
+
+// Runtime-facing description of one resource, filled from a
+// RuntimeDataResourceInfo record. Note that UpperBound precedes LowerBound
+// here, unlike in the serialized record.
+typedef struct DxilResourceDesc {
+  uint32_t Class; // hlsl::DXIL::ResourceClass
+  uint32_t Kind;  // hlsl::DXIL::ResourceKind
+  uint32_t ID;    // id per class
+  uint32_t Space;
+  uint32_t UpperBound;
+  uint32_t LowerBound;
+  LPCWSTR Name;   // wide string; storage is owned by the reflection object
+  uint32_t Flags; // hlsl::RDAT::DxilResourceFlag
+} DxilResourceDesc;
+
+// Runtime-facing description of one function, filled from a
+// RuntimeDataFunctionInfo record. All pointers refer to storage owned by the
+// reflection object that produced the description.
+typedef struct DxilFunctionDesc {
+  LPCWSTR Name;
+  LPCWSTR UnmangledName;
+  uint32_t NumResources;                    // entries in Resources
+  const DxilResourceDesc * const*Resources; // null when there are none
+  uint32_t NumFunctionDependencies;         // entries in FunctionDependencies
+  const LPCWSTR *FunctionDependencies;      // null when there are none
+  uint32_t ShaderKind;
+  uint32_t PayloadSizeInBytes;   // 1) hit, miss, or closest shader: payload count
+                                 // 2) call shader: parameter size
+  uint32_t AttributeSizeInBytes; // attribute size for closest hit and any hit
+  uint32_t FeatureInfo1;         // first 32 bits of feature flag
+  uint32_t FeatureInfo2;         // second 32 bits of feature flag
+  uint32_t ShaderStageFlag;      // valid shader stage flag.
+  uint32_t MinShaderTarget;      // minimum shader target.
+} DxilFunctionDesc;
+
+// Placeholder: no fields are defined yet.
+typedef struct DxilSubobjectDesc {
+} DxilSubobjectDesc;
+
+// Top-level reflection result: arrays of function and resource descriptions.
+// The visible GetLibraryReflection implementation leaves the subobject
+// members zero-initialized.
+typedef struct DxilLibraryDesc {
+  uint32_t NumFunctions;
+  DxilFunctionDesc *pFunction;   // array of NumFunctions entries
+  uint32_t NumResources;
+  DxilResourceDesc *pResource;   // array of NumResources entries
+  uint32_t NumSubobjects;
+  DxilSubobjectDesc *pSubobjects;
+} DxilLibraryDesc;
+
+// Abstract interface that turns an RDAT blob into a DxilLibraryDesc.
+// Instances are obtained from CreateDxilRuntimeReflection().
+class DxilRuntimeReflection {
+public:
+  virtual ~DxilRuntimeReflection() {}
+  // This call will allocate memory for GetLibraryReflection call
+  // Returns true when the blob was parsed successfully.
+  virtual bool InitFromRDAT(const void *pRDAT, size_t size) = 0;
+  // DxilRuntimeReflection owns the memory pointed to by DxilLibraryDesc
+  // so the returned pointers must not be used after this object is destroyed.
+  virtual const DxilLibraryDesc GetLibraryReflection() = 0;
+};
+
+DxilRuntimeReflection *CreateDxilRuntimeReflection();
+
+} // namespace RDAT
+} // namespace hlsl

+ 425 - 0
include/dxc/HLSL/DxilRuntimeReflection.inl

@@ -0,0 +1,425 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilRuntimeReflection.inl                                                 //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Defines shader reflection for runtime usage.                              //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/hlsl/DxilRuntimeReflection.h"
+#include <windows.h>
+#include <unordered_map>
+#include <vector>
+#include <memory>
+
+namespace hlsl {
+namespace RDAT {
+
+// Lookup key for a resource: the pair (resource class, id within the class).
+struct ResourceKey {
+  uint32_t Class;
+  uint32_t ID;
+
+  ResourceKey(uint32_t Class, uint32_t ID) : Class(Class), ID(ID) {}
+
+  // Two keys are equal when both the class and the id match.
+  bool operator==(const ResourceKey& rhs) const {
+    if (Class != rhs.Class)
+      return false;
+    return ID == rhs.ID;
+  }
+};
+
+// Size-checked reader over a caller-owned byte buffer. It keeps a cursor
+// (Offset) and verifies every access against the buffer size.
+//  on overrun: throw buffer_overrun{};
+//  on overlap: throw buffer_overlap{};
+class CheckedReader {
+  const char *Ptr;
+  size_t Size;
+  size_t Offset;
+
+public:
+  class exception : public std::exception {};
+  class buffer_overrun : public exception {
+  public:
+    buffer_overrun() noexcept {}
+    virtual const char * what() const noexcept override {
+      return ("buffer_overrun");
+    }
+  };
+  class buffer_overlap : public exception {
+  public:
+    buffer_overlap() noexcept {}
+    virtual const char * what() const noexcept override {
+      return ("buffer_overlap");
+    }
+  };
+
+  CheckedReader(const void *ptr, size_t size) :
+    Ptr(reinterpret_cast<const char*>(ptr)), Size(size), Offset(0) {}
+  // Moves the cursor to an absolute offset, which must lie inside the buffer.
+  void Reset(size_t offset = 0) {
+    if (offset >= Size) throw buffer_overrun{};
+    Offset = offset;
+  }
+  // offset is absolute, ensure offset is >= current offset
+  void Advance(size_t offset = 0) {
+    if (offset < Offset) throw buffer_overlap{};
+    if (offset >= Size) throw buffer_overrun{};
+    Offset = offset;
+  }
+  // Throws buffer_overrun unless size bytes are available at the cursor.
+  void CheckBounds(size_t size) const {
+    assert(Offset <= Size && "otherwise, offset larger than size");
+    if (size > Size - Offset)
+      throw buffer_overrun{};
+  }
+  // Returns the data at the cursor as T without moving the cursor. A size of
+  // 0 means "sizeof(T)".
+  template <typename T>
+  const T *Cast(size_t size = 0) {
+    if (0 == size) size = sizeof(T);
+    CheckBounds(size);
+    return reinterpret_cast<const T*>(Ptr + Offset);
+  }
+  // Reads one T and moves the cursor past it.
+  template <typename T>
+  const T &Read() {
+    const size_t size = sizeof(T);
+    const T* p = Cast<T>(size);
+    Offset += size;
+    return *p;
+  }
+  // Reads count elements of T and moves the cursor past them.
+  template <typename T>
+  const T *ReadArray(size_t count = 1) {
+    // count comes from serialized data: reject element counts whose byte
+    // size would wrap around size_t and slip past the bounds check.
+    if (count > static_cast<size_t>(-1) / sizeof(T))
+      throw buffer_overrun{};
+    const size_t size = sizeof(T) * count;
+    // Check the bounds here rather than through Cast(): Cast() treats a size
+    // of 0 as sizeof(T), which would reject a legitimately empty array that
+    // ends exactly at the end of the buffer.
+    CheckBounds(size);
+    const T* p = reinterpret_cast<const T*>(Ptr + Offset);
+    Offset += size;
+    return p;
+  }
+};
+
+// Default construction delegates to the pointer/size constructor with no data.
+DxilRuntimeData::DxilRuntimeData() : DxilRuntimeData(nullptr, 0) {}
+
+// Wires the shared context to this object's own readers, hands the context
+// to the table readers and then parses the given blob. The parse result is
+// not reported.
+DxilRuntimeData::DxilRuntimeData(const char *ptr, size_t size)
+    : m_TableCount(0), m_StringReader(), m_IndexTableReader(),
+      m_ResourceTableReader(), m_FunctionTableReader(), m_Context() {
+  m_Context.pStringTableReader = &m_StringReader;
+  m_Context.pIndexTableReader = &m_IndexTableReader;
+  m_Context.pResourceTableReader = &m_ResourceTableReader;
+  m_Context.pFunctionTableReader = &m_FunctionTableReader;
+  m_ResourceTableReader.SetContext(&m_Context);
+  m_FunctionTableReader.SetContext(&m_Context);
+  InitFromRDAT(ptr, size);
+}
+
+// initializing reader from RDAT. return true if no error has occurred.
+// A blob whose version is below RDAT_Version_0 is handed to the prerelease
+// parser. Parts of unknown type are skipped; any out-of-bounds or
+// overlapping access reported by CheckedReader makes the call return false.
+// Readers that were already updated before a failure keep their new state.
+bool DxilRuntimeData::InitFromRDAT(const void *pRDAT, size_t size) {
+  if (!pRDAT)
+    return false;
+
+  // Reads the records described by a table header. The byte size is computed
+  // in 64 bits: RecordCount and RecordStride both come from the blob, and a
+  // 32-bit product could wrap to a small value that passes the bounds check
+  // while the table reader later walks far outside the part.
+  auto ReadTableRecords = [](CheckedReader &PR,
+                             const RuntimeDataTableHeader &table) -> const char * {
+    const uint64_t tableSize =
+        static_cast<uint64_t>(table.RecordCount) * table.RecordStride;
+    if (tableSize > static_cast<uint64_t>(static_cast<size_t>(-1)))
+      throw CheckedReader::buffer_overrun{};
+    return PR.ReadArray<char>(static_cast<size_t>(tableSize));
+  };
+
+  try {
+    CheckedReader Reader(pRDAT, size);
+    RuntimeDataHeader RDATHeader = Reader.Read<RuntimeDataHeader>();
+    if (RDATHeader.Version < RDAT_Version_0) {
+      // Prerelease version, fallback to that Init
+      return InitFromRDAT_Prerelease(pRDAT, size);
+    }
+    const uint32_t *offsets = Reader.ReadArray<uint32_t>(RDATHeader.PartCount);
+    for (uint32_t i = 0; i < RDATHeader.PartCount; ++i) {
+      // Advance() only moves forward, so parts must appear in offset order.
+      Reader.Advance(offsets[i]);
+      RuntimeDataPartHeader part = Reader.Read<RuntimeDataPartHeader>();
+      // PR is confined to this part's data so a part cannot read past itself.
+      CheckedReader PR(Reader.ReadArray<char>(part.Size), part.Size);
+      switch (part.Type) {
+      case RuntimeDataPartType::StringBuffer: {
+        m_StringReader = StringTableReader(
+          PR.ReadArray<char>(part.Size), part.Size);
+        break;
+      }
+      case RuntimeDataPartType::IndexArrays: {
+        uint32_t count = part.Size / sizeof(uint32_t);
+        m_IndexTableReader = IndexTableReader(
+          PR.ReadArray<uint32_t>(count), count);
+        break;
+      }
+      case RuntimeDataPartType::ResourceTable: {
+        RuntimeDataTableHeader table = PR.Read<RuntimeDataTableHeader>();
+        m_ResourceTableReader.SetResourceInfo(ReadTableRecords(PR, table),
+          table.RecordCount, table.RecordStride);
+        break;
+      }
+      case RuntimeDataPartType::FunctionTable: {
+        RuntimeDataTableHeader table = PR.Read<RuntimeDataTableHeader>();
+        m_FunctionTableReader.SetFunctionInfo(ReadTableRecords(PR, table),
+          table.RecordCount, table.RecordStride);
+        break;
+      }
+      default:
+        continue; // Skip unrecognized parts
+      }
+    }
+    return true;
+  } catch (const CheckedReader::exception &) {
+    // TODO: error handling
+    //throw hlsl::Exception(DXC_E_MALFORMED_CONTAINER, e.what());
+    return false;
+  }
+}
+
+// Parser for the prerelease layout: a part count, followed by one
+// {type, size, offset} header per part, followed by the part data. Unlike
+// the versioned format, records have no stored stride (sizeof of the record
+// type is used) and an unknown part type is an error.
+bool DxilRuntimeData::InitFromRDAT_Prerelease(const void *pRDAT, size_t size) {
+  enum class RuntimeDataPartType_Prerelease : uint32_t {
+    Invalid = 0,
+    String,
+    Function,
+    Resource,
+    Index
+  };
+  struct RuntimeDataTableHeader_Prerelease {
+    uint32_t tableType; // RuntimeDataPartType
+    uint32_t size;
+    uint32_t offset;
+  };
+
+  if (!pRDAT)
+    return false;
+
+  try {
+    CheckedReader Reader(pRDAT, size);
+    const uint32_t partCount = Reader.Read<uint32_t>();
+    const RuntimeDataTableHeader_Prerelease *tableHeaders =
+      Reader.ReadArray<RuntimeDataTableHeader_Prerelease>(partCount);
+    for (uint32_t partIdx = 0; partIdx < partCount; ++partIdx) {
+      const RuntimeDataTableHeader_Prerelease &header = tableHeaders[partIdx];
+      const uint32_t partSize = header.size;
+      Reader.Advance(header.offset);
+      // Sub-reader restricted to the bytes of this part.
+      CheckedReader PR(Reader.ReadArray<char>(partSize), partSize);
+      const RuntimeDataPartType_Prerelease partType =
+        (RuntimeDataPartType_Prerelease)(header.tableType);
+      switch (partType) {
+      case RuntimeDataPartType_Prerelease::String: {
+        const char *pStrings = PR.ReadArray<char>(partSize);
+        m_StringReader = StringTableReader(pStrings, partSize);
+        break;
+      }
+      case RuntimeDataPartType_Prerelease::Index: {
+        const uint32_t numIndices = partSize / sizeof(uint32_t);
+        const uint32_t *pIndices = PR.ReadArray<uint32_t>(numIndices);
+        m_IndexTableReader = IndexTableReader(pIndices, numIndices);
+        break;
+      }
+      case RuntimeDataPartType_Prerelease::Resource: {
+        const uint32_t numRecords = partSize / sizeof(RuntimeDataResourceInfo);
+        const char *pRecords = PR.ReadArray<char>(partSize);
+        m_ResourceTableReader.SetResourceInfo(pRecords, numRecords,
+          sizeof(RuntimeDataResourceInfo));
+        break;
+      }
+      case RuntimeDataPartType_Prerelease::Function: {
+        const uint32_t numRecords = partSize / sizeof(RuntimeDataFunctionInfo);
+        const char *pRecords = PR.ReadArray<char>(partSize);
+        m_FunctionTableReader.SetFunctionInfo(pRecords, numRecords,
+          sizeof(RuntimeDataFunctionInfo));
+        break;
+      }
+      default:
+        return false; // There should be no unrecognized parts
+      }
+    }
+    return true;
+  } catch (const CheckedReader::exception &) {
+    // TODO: error handling
+    //throw hlsl::Exception(DXC_E_MALFORMED_CONTAINER, e.what());
+    return false;
+  }
+}
+
+// Returns a pointer to the member function table reader; it stays owned by
+// this object.
+FunctionTableReader *DxilRuntimeData::GetFunctionTableReader() {
+  return &m_FunctionTableReader;
+}
+
+// Returns a pointer to the member resource table reader; it stays owned by
+// this object.
+ResourceTableReader *DxilRuntimeData::GetResourceTableReader() {
+  return &m_ResourceTableReader;
+}
+
+}} // hlsl::RDAT
+
+using namespace hlsl;
+using namespace RDAT;
+
+// Hash for ResourceKey so it can serve as an unordered_map key: the class
+// hash is scaled by 16777619 and XOR-ed with the id hash.
+template<>
+struct std::hash<ResourceKey> {
+public:
+  size_t operator()(const ResourceKey& key) const throw() {
+    const std::hash<uint32_t> hasher{};
+    const size_t classHash = hasher(key.Class) * (size_t)16777619U;
+    const size_t idHash = hasher(key.ID);
+    return classHash ^ idHash;
+  }
+};
+
+namespace {
+
+// Concrete DxilRuntimeReflection: parses the RDAT blob with DxilRuntimeData
+// and builds the DxilLibraryDesc arrays, owning every string and array that
+// the descriptions point to.
+class DxilRuntimeReflection_impl : public DxilRuntimeReflection {
+private:
+  // Keyed by the address of the UTF-8 string (pointer identity, not content).
+  typedef std::unordered_map<const char *, std::unique_ptr<wchar_t[]>> StringMap;
+  typedef std::vector<DxilResourceDesc> ResourceList;
+  typedef std::vector<DxilResourceDesc *> ResourceRefList;
+  typedef std::vector<DxilFunctionDesc> FunctionList;
+  typedef std::vector<const wchar_t *> WStringList;
+
+  DxilRuntimeData m_RuntimeData;
+  StringMap m_StringMap;      // converted wide strings
+  ResourceList m_Resources;   // capacity is reserved up front so element
+                              // addresses stay stable while descs are added
+  FunctionList m_Functions;   // same reservation scheme as m_Resources
+  std::unordered_map<ResourceKey, DxilResourceDesc *> m_ResourceMap;
+  // Per-function storage backing DxilFunctionDesc::Resources and
+  // DxilFunctionDesc::FunctionDependencies; keyed by the desc's address.
+  std::unordered_map<DxilFunctionDesc *, ResourceRefList> m_FuncToResMap;
+  std::unordered_map<DxilFunctionDesc *, WStringList> m_FuncToStringMap;
+  bool m_initialized;         // result of the last InitFromRDAT call
+
+  const wchar_t *GetWideString(const char *ptr);
+  void AddString(const char *ptr);
+  void InitializeReflection();
+  const DxilResourceDesc * const*GetResourcesForFunction(DxilFunctionDesc &function,
+                             const FunctionReader &functionReader);
+  const wchar_t **GetDependenciesForFunction(DxilFunctionDesc &function,
+                             const FunctionReader &functionReader);
+  DxilResourceDesc *AddResource(const ResourceReader &resourceReader);
+  DxilFunctionDesc *AddFunction(const FunctionReader &functionReader);
+
+public:
+  // TODO: Implement pipeline state validation with runtime data
+  // TODO: Update BlobContainer.h to recognize 'RDAT' blob
+  DxilRuntimeReflection_impl()
+      : m_RuntimeData(), m_StringMap(), m_Resources(), m_Functions(),
+        m_FuncToResMap(), m_FuncToStringMap(), m_initialized(false) {}
+  virtual ~DxilRuntimeReflection_impl() {}
+  // This call will allocate memory for GetLibraryReflection call
+  bool InitFromRDAT(const void *pRDAT, size_t size) override;
+  const DxilLibraryDesc GetLibraryReflection() override;
+};
+
+// Converts the UTF-8 string at ptr to a wide string and caches it under the
+// pointer value. Nothing is cached when the pointer is already known or when
+// the conversion reports a size of 0.
+void DxilRuntimeReflection_impl::AddString(const char *ptr) {
+  if (m_StringMap.find(ptr) != m_StringMap.end())
+    return;
+
+  // First pass: ask for the required length (including the terminator).
+  const int wideLength = ::MultiByteToWideChar(
+      CP_UTF8, MB_ERR_INVALID_CHARS, ptr, -1, nullptr, 0);
+  if (wideLength == 0)
+    return;
+
+  // Second pass: convert into a buffer of exactly that length.
+  auto wideBuffer = std::make_unique<wchar_t[]>(wideLength);
+  ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, ptr, -1,
+                        wideBuffer.get(), wideLength);
+  m_StringMap[ptr] = std::move(wideBuffer);
+}
+
+// Returns the cached wide-character version of the UTF-8 string at ptr,
+// converting it on first use. AddString() caches nothing when the conversion
+// fails (for example on invalid UTF-8), so the entry may still be missing
+// afterwards; an empty wide string is returned in that case instead of
+// letting unordered_map::at() throw std::out_of_range.
+const wchar_t *DxilRuntimeReflection_impl::GetWideString(const char *ptr) {
+  auto it = m_StringMap.find(ptr);
+  if (it == m_StringMap.end()) {
+    AddString(ptr);
+    it = m_StringMap.find(ptr);
+  }
+  return it != m_StringMap.end() ? it->second.get() : L"";
+}
+
+// Parses the RDAT blob and, on success, builds the reflection arrays.
+// Returns true when parsing succeeded.
+bool DxilRuntimeReflection_impl::InitFromRDAT(const void *pRDAT, size_t size) {
+  // Drop everything built by an earlier call. Without this, a second call
+  // would find m_Resources / m_Functions already filled to their reserved
+  // capacity (so AddResource / AddFunction bail out) and the maps would keep
+  // keys pointing into the previous blob. Descriptions returned by an earlier
+  // GetLibraryReflection() become invalid here.
+  m_FuncToResMap.clear();
+  m_FuncToStringMap.clear();
+  m_ResourceMap.clear();
+  m_Functions.clear();
+  m_Resources.clear();
+  m_StringMap.clear();
+
+  m_initialized = m_RuntimeData.InitFromRDAT(pRDAT, size);
+  if (m_initialized)
+    InitializeReflection();
+  return m_initialized;
+}
+
+// Returns the library description built by InitFromRDAT, or an all-zero
+// description when initialization has not succeeded. The pointers refer to
+// vectors owned by this object.
+const DxilLibraryDesc DxilRuntimeReflection_impl::GetLibraryReflection() {
+  DxilLibraryDesc reflection = {};
+  if (m_initialized) {
+    // Report the sizes of the arrays actually handed out. The table readers'
+    // counts can be larger if AddResource / AddFunction skipped an entry,
+    // and a count beyond the array length would send callers out of bounds.
+    reflection.NumResources = static_cast<uint32_t>(m_Resources.size());
+    reflection.pResource = m_Resources.data();
+    reflection.NumFunctions = static_cast<uint32_t>(m_Functions.size());
+    reflection.pFunction = m_Functions.data();
+  }
+  return reflection;
+}
+
+// Builds the resource descriptions first and the function descriptions
+// second, because functions refer to resources through pointers into
+// m_Resources.
+void DxilRuntimeReflection_impl::InitializeReflection() {
+  // Reserve up front: element addresses are handed out below, so the vector
+  // must not reallocate while it is being filled.
+  const ResourceTableReader &resourceTable =
+      *m_RuntimeData.GetResourceTableReader();
+  const uint32_t numResources = resourceTable.GetNumResources();
+  m_Resources.reserve(numResources);
+  for (uint32_t resIdx = 0; resIdx < numResources; ++resIdx) {
+    ResourceReader resourceReader = resourceTable.GetItem(resIdx);
+    AddString(resourceReader.GetName());
+    DxilResourceDesc *pResource = AddResource(resourceReader);
+    if (!pResource)
+      continue;
+    // Remember each resource by (class, id) for the per-function lookups.
+    m_ResourceMap[ResourceKey(pResource->Class, pResource->ID)] = pResource;
+  }
+
+  const FunctionTableReader &functionTable =
+      *m_RuntimeData.GetFunctionTableReader();
+  const uint32_t numFunctions = functionTable.GetNumFunctions();
+  m_Functions.reserve(numFunctions);
+  for (uint32_t funcIdx = 0; funcIdx < numFunctions; ++funcIdx) {
+    FunctionReader functionReader = functionTable.GetItem(funcIdx);
+    AddString(functionReader.GetName());
+    AddFunction(functionReader);
+  }
+}
+
+// Appends a description for the given resource and returns its address, or
+// nullptr when the reserved capacity is already used up (appending would
+// reallocate and invalidate addresses handed out earlier).
+DxilResourceDesc *
+DxilRuntimeReflection_impl::AddResource(const ResourceReader &resourceReader) {
+  assert(m_Resources.size() < m_Resources.capacity() && "Otherwise, number of resources was incorrect");
+  if (m_Resources.size() >= m_Resources.capacity())
+    return nullptr;
+
+  m_Resources.emplace_back(DxilResourceDesc({0}));
+  DxilResourceDesc *pDesc = &m_Resources.back();
+  pDesc->Class = (uint32_t)resourceReader.GetResourceClass();
+  pDesc->Kind = (uint32_t)resourceReader.GetResourceKind();
+  pDesc->ID = resourceReader.GetID();
+  pDesc->Space = resourceReader.GetSpace();
+  pDesc->UpperBound = resourceReader.GetUpperBound();
+  pDesc->LowerBound = resourceReader.GetLowerBound();
+  pDesc->Name = GetWideString(resourceReader.GetName());
+  pDesc->Flags = resourceReader.GetFlags();
+  return pDesc;
+}
+
+// Builds (once per function) the array of resource description pointers
+// backing DxilFunctionDesc::Resources and returns it, or nullptr when the
+// function uses no resources. The array always holds one entry per resource
+// reported by the reader, so it stays in step with
+// DxilFunctionDesc::NumResources.
+const DxilResourceDesc * const*DxilRuntimeReflection_impl::GetResourcesForFunction(
+    DxilFunctionDesc &function, const FunctionReader &functionReader) {
+  // operator[] creates the empty list on first use.
+  ResourceRefList &resourceList = m_FuncToResMap[&function];
+  if (resourceList.empty()) {
+    const uint32_t numResources = functionReader.GetNumResources();
+    resourceList.reserve(numResources);
+    for (uint32_t i = 0; i < numResources; ++i) {
+      const ResourceReader resourceReader = functionReader.GetResource(i);
+      ResourceKey key((uint32_t)resourceReader.GetResourceClass(),
+                      resourceReader.GetID());
+      auto it = m_ResourceMap.find(key);
+      assert(it != m_ResourceMap.end() && it->second && "Otherwise, resource was not in map, or was null");
+      // The assert vanishes in release builds; never dereference end().
+      // A resource that cannot be resolved is recorded as a null entry so
+      // the array length still matches the reported resource count.
+      resourceList.emplace_back(it != m_ResourceMap.end() ? it->second
+                                                          : nullptr);
+    }
+  }
+  return resourceList.empty() ? nullptr : resourceList.data();
+}
+
+// Builds (once per function) the array of dependency names backing
+// DxilFunctionDesc::FunctionDependencies and returns it, or nullptr when the
+// function has no dependencies.
+const wchar_t **DxilRuntimeReflection_impl::GetDependenciesForFunction(
+    DxilFunctionDesc &function, const FunctionReader &functionReader) {
+  // operator[] creates the empty list on first use.
+  WStringList &wStringList = m_FuncToStringMap[&function];
+  // Fill the list only once, as GetResourcesForFunction does; otherwise a
+  // repeated call for the same function would append every name again.
+  if (wStringList.empty()) {
+    const uint32_t numDependencies = functionReader.GetNumDependencies();
+    wStringList.reserve(numDependencies);
+    for (uint32_t i = 0; i < numDependencies; ++i) {
+      wStringList.emplace_back(GetWideString(functionReader.GetDependency(i)));
+    }
+  }
+  return wStringList.empty() ? nullptr : wStringList.data();
+}
+
+// Appends a description for the given function and returns its address, or
+// nullptr when the reserved capacity is already used up (appending would
+// reallocate and invalidate addresses handed out earlier).
+DxilFunctionDesc *
+DxilRuntimeReflection_impl::AddFunction(const FunctionReader &functionReader) {
+  assert(m_Functions.size() < m_Functions.capacity() && "Otherwise, number of functions was incorrect");
+  if (m_Functions.size() >= m_Functions.capacity())
+    return nullptr;
+
+  m_Functions.emplace_back(DxilFunctionDesc({0}));
+  DxilFunctionDesc *pDesc = &m_Functions.back();
+
+  // Names, converted to wide strings.
+  pDesc->Name = GetWideString(functionReader.GetName());
+  pDesc->UnmangledName = GetWideString(functionReader.GetUnmangledName());
+
+  // Resource and dependency arrays live in per-function lists keyed by the
+  // address of this description.
+  pDesc->NumResources = functionReader.GetNumResources();
+  pDesc->Resources = GetResourcesForFunction(*pDesc, functionReader);
+  pDesc->NumFunctionDependencies = functionReader.GetNumDependencies();
+  pDesc->FunctionDependencies =
+      GetDependenciesForFunction(*pDesc, functionReader);
+
+  // Plain scalar fields copied from the record.
+  pDesc->ShaderKind = (uint32_t)functionReader.GetShaderKind();
+  pDesc->PayloadSizeInBytes = functionReader.GetPayloadSizeInBytes();
+  pDesc->AttributeSizeInBytes = functionReader.GetAttributeSizeInBytes();
+  pDesc->FeatureInfo1 = functionReader.GetFeatureInfo1();
+  pDesc->FeatureInfo2 = functionReader.GetFeatureInfo2();
+  pDesc->ShaderStageFlag = functionReader.GetShaderStageFlag();
+  pDesc->MinShaderTarget = functionReader.GetMinShaderTarget();
+  return pDesc;
+}
+
+} // namespace anon
+
+// Factory for the reflection implementation. The object is allocated with
+// new and returned as a raw pointer, so the caller is responsible for
+// deleting it (the interface has a virtual destructor).
+DxilRuntimeReflection *hlsl::RDAT::CreateDxilRuntimeReflection() {
+  return new DxilRuntimeReflection_impl();
+}

+ 147 - 0
include/dxc/HLSL/DxilShaderFlags.h

@@ -0,0 +1,147 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilShaderFlags.h                                                         //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Shader flags for a dxil shader function.                                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+namespace hlsl {
+  class DxilModule;
+}
+
+namespace llvm {
+  class Function;
+}
+
+namespace hlsl {
+  // Shader properties.
+  class ShaderFlags {
+  public:
+    ShaderFlags();
+
+    static ShaderFlags CollectShaderFlags(const llvm::Function *F, const hlsl::DxilModule *M);
+    unsigned GetGlobalFlags() const;
+    uint64_t GetFeatureInfo() const;
+    static uint64_t GetShaderFlagsRawForCollection(); // some flags are collected (eg use 64-bit), some provided (eg allow refactoring)
+    uint64_t GetShaderFlagsRaw() const;
+    void SetShaderFlagsRaw(uint64_t data);
+    void CombineShaderFlags(const ShaderFlags &other);
+
+    void SetDisableOptimizations(bool flag) { m_bDisableOptimizations = flag; }
+    bool GetDisableOptimizations() const { return m_bDisableOptimizations; }
+
+    void SetDisableMathRefactoring(bool flag) { m_bDisableMathRefactoring = flag; }
+    bool GetDisableMathRefactoring() const { return m_bDisableMathRefactoring; }
+
+    void SetEnableDoublePrecision(bool flag) { m_bEnableDoublePrecision = flag; }
+    bool GetEnableDoublePrecision() const { return m_bEnableDoublePrecision; }
+
+    void SetForceEarlyDepthStencil(bool flag) { m_bForceEarlyDepthStencil = flag; }
+    bool GetForceEarlyDepthStencil() const { return m_bForceEarlyDepthStencil; }
+
+    void SetEnableRawAndStructuredBuffers(bool flag) { m_bEnableRawAndStructuredBuffers = flag; }
+    bool GetEnableRawAndStructuredBuffers() const { return m_bEnableRawAndStructuredBuffers; }
+
+    void SetLowPrecisionPresent(bool flag) { m_bLowPrecisionPresent = flag; }
+    bool GetLowPrecisionPresent() const { return m_bLowPrecisionPresent; }
+
+    void SetEnableDoubleExtensions(bool flag) { m_bEnableDoubleExtensions = flag; }
+    bool GetEnableDoubleExtensions() const { return m_bEnableDoubleExtensions; }
+
+    void SetEnableMSAD(bool flag) { m_bEnableMSAD = flag; }
+    bool GetEnableMSAD() const { return m_bEnableMSAD; }
+
+    void SetAllResourcesBound(bool flag) { m_bAllResourcesBound = flag; }
+    bool GetAllResourcesBound() const { return m_bAllResourcesBound; }
+
+    void SetCSRawAndStructuredViaShader4X(bool flag) { m_bCSRawAndStructuredViaShader4X = flag; }
+    bool GetCSRawAndStructuredViaShader4X() const { return m_bCSRawAndStructuredViaShader4X; }
+
+    void SetROVs(bool flag) { m_bROVS = flag; }
+    bool GetROVs() const { return m_bROVS; }
+
+    void SetWaveOps(bool flag) { m_bWaveOps = flag; }
+    bool GetWaveOps() const { return m_bWaveOps; }
+
+    void SetInt64Ops(bool flag) { m_bInt64Ops = flag; }
+    bool GetInt64Ops() const { return m_bInt64Ops; }
+
+    void SetTiledResources(bool flag) { m_bTiledResources = flag; }
+    bool GetTiledResources() const { return m_bTiledResources; }
+
+    void SetStencilRef(bool flag) { m_bStencilRef = flag; }
+    bool GetStencilRef() const { return m_bStencilRef; }
+
+    void SetInnerCoverage(bool flag) { m_bInnerCoverage = flag; }
+    bool GetInnerCoverage() const { return m_bInnerCoverage; }
+
+    void SetViewportAndRTArrayIndex(bool flag) { m_bViewportAndRTArrayIndex = flag; }
+    bool GetViewportAndRTArrayIndex() const { return m_bViewportAndRTArrayIndex; }
+
+    void SetUAVLoadAdditionalFormats(bool flag) { m_bUAVLoadAdditionalFormats = flag; }
+    bool GetUAVLoadAdditionalFormats() const { return m_bUAVLoadAdditionalFormats; }
+
+    void SetLevel9ComparisonFiltering(bool flag) { m_bLevel9ComparisonFiltering = flag; }
+    bool GetLevel9ComparisonFiltering() const { return m_bLevel9ComparisonFiltering; }
+
+    void Set64UAVs(bool flag) { m_b64UAVs = flag; }
+    bool Get64UAVs() const { return m_b64UAVs; }
+
+    void SetUAVsAtEveryStage(bool flag) { m_UAVsAtEveryStage = flag; }
+    bool GetUAVsAtEveryStage() const { return m_UAVsAtEveryStage; }
+
+    void SetViewID(bool flag) { m_bViewID = flag; }
+    bool GetViewID() const { return m_bViewID; }
+
+    void SetBarycentrics(bool flag) { m_bBarycentrics = flag; }
+    bool GetBarycentrics() const { return m_bBarycentrics; }
+
+    void SetUseNativeLowPrecision(bool flag) { m_bUseNativeLowPrecision = flag; }
+    bool GetUseNativeLowPrecision() const { return m_bUseNativeLowPrecision; }
+
+  private:
+    unsigned m_bDisableOptimizations :1;   // D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION
+    unsigned m_bDisableMathRefactoring :1; //~D3D10_SB_GLOBAL_FLAG_REFACTORING_ALLOWED
+    unsigned m_bEnableDoublePrecision :1; // D3D11_SB_GLOBAL_FLAG_ENABLE_DOUBLE_PRECISION_FLOAT_OPS
+    unsigned m_bForceEarlyDepthStencil :1; // D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL
+    unsigned m_bEnableRawAndStructuredBuffers :1; // D3D11_SB_GLOBAL_FLAG_ENABLE_RAW_AND_STRUCTURED_BUFFERS
+    unsigned m_bLowPrecisionPresent :1; // D3D11_1_SB_GLOBAL_FLAG_ENABLE_MINIMUM_PRECISION
+    unsigned m_bEnableDoubleExtensions :1; // D3D11_1_SB_GLOBAL_FLAG_ENABLE_DOUBLE_EXTENSIONS
+    unsigned m_bEnableMSAD :1;        // D3D11_1_SB_GLOBAL_FLAG_ENABLE_SHADER_EXTENSIONS
+    unsigned m_bAllResourcesBound :1; // D3D12_SB_GLOBAL_FLAG_ALL_RESOURCES_BOUND
+
+    unsigned m_bViewportAndRTArrayIndex :1;   // SHADER_FEATURE_VIEWPORT_AND_RT_ARRAY_INDEX_FROM_ANY_SHADER_FEEDING_RASTERIZER
+    unsigned m_bInnerCoverage :1;             // SHADER_FEATURE_INNER_COVERAGE
+    unsigned m_bStencilRef  :1;               // SHADER_FEATURE_STENCIL_REF
+    unsigned m_bTiledResources  :1;           // SHADER_FEATURE_TILED_RESOURCES
+    unsigned m_bUAVLoadAdditionalFormats :1;  // SHADER_FEATURE_TYPED_UAV_LOAD_ADDITIONAL_FORMATS
+    unsigned m_bLevel9ComparisonFiltering :1; // SHADER_FEATURE_LEVEL_9_COMPARISON_FILTERING
+                                              // SHADER_FEATURE_11_1_SHADER_EXTENSIONS shared with EnableMSAD
+    unsigned m_b64UAVs :1;                    // SHADER_FEATURE_64_UAVS
+    unsigned m_UAVsAtEveryStage :1;           // SHADER_FEATURE_UAVS_AT_EVERY_STAGE
+    unsigned m_bCSRawAndStructuredViaShader4X : 1; // SHADER_FEATURE_COMPUTE_SHADERS_PLUS_RAW_AND_STRUCTURED_BUFFERS_VIA_SHADER_4_X
+    
+    // SHADER_FEATURE_COMPUTE_SHADERS_PLUS_RAW_AND_STRUCTURED_BUFFERS_VIA_SHADER_4_X is specifically
+    // about shader model 4.x.
+
+    unsigned m_bROVS :1;              // SHADER_FEATURE_ROVS
+    unsigned m_bWaveOps :1;           // SHADER_FEATURE_WAVE_OPS
+    unsigned m_bInt64Ops :1;          // SHADER_FEATURE_INT64_OPS
+    unsigned m_bViewID : 1;           // SHADER_FEATURE_VIEWID
+    unsigned m_bBarycentrics : 1;     // SHADER_FEATURE_BARYCENTRICS
+
+    unsigned m_bUseNativeLowPrecision : 1;
+
+    unsigned m_align0 : 8;        // align to 32 bit.
+    uint32_t m_align1;            // align to 64 bit.
+  };
+
+
+
+}

+ 15 - 7
include/dxc/HLSL/DxilShaderModel.h

@@ -30,6 +30,7 @@ public:
   // Major/Minor version of highest shader model
   static const unsigned kHighestMajor = 6;
   static const unsigned kHighestMinor = 3;
+  static const unsigned kOfflineMinor = 0xF;
 
   bool IsPS() const     { return m_Kind == Kind::Pixel; }
   bool IsVS() const     { return m_Kind == Kind::Vertex; }
@@ -38,20 +39,27 @@ public:
   bool IsDS() const     { return m_Kind == Kind::Domain; }
   bool IsCS() const     { return m_Kind == Kind::Compute; }
   bool IsLib() const    { return m_Kind == Kind::Library; }
+  bool IsRay() const    { return m_Kind >= Kind::RayGeneration && m_Kind <= Kind::Callable; }
   bool IsValid() const;
   bool IsValidForDxil() const;
+  bool IsValidForModule() const;
 
   Kind GetKind() const      { return m_Kind; }
   unsigned GetMajor() const { return m_Major; }
   unsigned GetMinor() const { return m_Minor; }
   void GetDxilVersion(unsigned &DxilMajor, unsigned &DxilMinor) const;
   void GetMinValidatorVersion(unsigned &ValMajor, unsigned &ValMinor) const;
-  bool IsSM50Plus() const   { return m_Major >= 5; }
-  bool IsSM51Plus() const   { return m_Major > 5 || (m_Major == 5 && m_Minor >= 1); }
-  bool IsSM60Plus() const   { return m_Major >= 6; }
-  bool IsSM61Plus() const   { return m_Major > 6 || (m_Major == 6 && m_Minor >= 1); }
+  bool IsSMAtLeast(unsigned Major, unsigned Minor) const {
+    return m_Major > Major || (m_Major == Major && m_Minor >= Minor);
+  }
+  bool IsSM50Plus() const   { return IsSMAtLeast(5, 0); }
+  bool IsSM51Plus() const   { return IsSMAtLeast(5, 1); }
+  bool IsSM60Plus() const   { return IsSMAtLeast(6, 0); }
+  bool IsSM61Plus() const   { return IsSMAtLeast(6, 1); }
+  bool IsSM62Plus() const   { return IsSMAtLeast(6, 2); }
+  bool IsSM63Plus() const   { return IsSMAtLeast(6, 3); }
   const char *GetName() const { return m_pszName; }
-  std::string GetKindName() const;
+  const char *GetKindName() const;
   unsigned GetNumTempRegs() const { return DXIL::kMaxTempRegCount; }
   unsigned GetNumInputRegs() const { return m_NumInputRegs; }
   unsigned GetNumOutputRegs() const { return m_NumOutputRegs; }
@@ -65,7 +73,7 @@ public:
   static const ShaderModel *Get(unsigned Idx);
   static const ShaderModel *Get(Kind Kind, unsigned Major, unsigned Minor);
   static const ShaderModel *GetByName(const char *pszName);
-  static std::string GetKindName(Kind kind);
+  static const char *GetKindName(Kind kind);
 
   bool operator==(const ShaderModel &other) const;
   bool operator!=(const ShaderModel &other) const { return !(*this == other); }
@@ -86,7 +94,7 @@ private:
               unsigned m_NumInputRegs, unsigned m_NumOutputRegs,
               bool m_bUAVs, bool m_bTypedUavs, unsigned m_UAVRegsLim);
 
-  static const unsigned kNumShaderModels = 48;
+  static const unsigned kNumShaderModels = 49;
   static const ShaderModel ms_ShaderModels[kNumShaderModels];
 
   static const ShaderModel *GetInvalid();

+ 69 - 52
include/dxc/HLSL/DxilSigPoint.inl

@@ -20,24 +20,24 @@ namespace hlsl {
 // <py::lines('SIGPOINT-TABLE')>hctdb_instrhelp.get_sigpoint_table()</py>
 // SIGPOINT-TABLE:BEGIN
 //   SigPoint, Related, ShaderKind, PackingKind,    SignatureKind
-#define DO_SIGPOINTS(DO) \
-  DO(VSIn,     Invalid, Vertex,     InputAssembler, Input) \
-  DO(VSOut,    Invalid, Vertex,     Vertex,         Output) \
-  DO(PCIn,     HSCPIn,  Hull,       None,           Invalid) \
-  DO(HSIn,     HSCPIn,  Hull,       None,           Invalid) \
-  DO(HSCPIn,   Invalid, Hull,       Vertex,         Input) \
-  DO(HSCPOut,  Invalid, Hull,       Vertex,         Output) \
-  DO(PCOut,    Invalid, Hull,       PatchConstant,  PatchConstant) \
-  DO(DSIn,     Invalid, Domain,     PatchConstant,  PatchConstant) \
-  DO(DSCPIn,   Invalid, Domain,     Vertex,         Input) \
-  DO(DSOut,    Invalid, Domain,     Vertex,         Output) \
-  DO(GSVIn,    Invalid, Geometry,   Vertex,         Input) \
-  DO(GSIn,     GSVIn,   Geometry,   None,           Invalid) \
-  DO(GSOut,    Invalid, Geometry,   Vertex,         Output) \
-  DO(PSIn,     Invalid, Pixel,      Vertex,         Input) \
-  DO(PSOut,    Invalid, Pixel,      Target,         Output) \
-  DO(CSIn,     Invalid, Compute,    None,           Invalid) \
-  DO(Invalid,  Invalid, Invalid,    Invalid,        Invalid)
+#define DO_SIGPOINTS(ROW) \
+  ROW(VSIn,     Invalid, Vertex,     InputAssembler, Input) \
+  ROW(VSOut,    Invalid, Vertex,     Vertex,         Output) \
+  ROW(PCIn,     HSCPIn,  Hull,       None,           Invalid) \
+  ROW(HSIn,     HSCPIn,  Hull,       None,           Invalid) \
+  ROW(HSCPIn,   Invalid, Hull,       Vertex,         Input) \
+  ROW(HSCPOut,  Invalid, Hull,       Vertex,         Output) \
+  ROW(PCOut,    Invalid, Hull,       PatchConstant,  PatchConstant) \
+  ROW(DSIn,     Invalid, Domain,     PatchConstant,  PatchConstant) \
+  ROW(DSCPIn,   Invalid, Domain,     Vertex,         Input) \
+  ROW(DSOut,    Invalid, Domain,     Vertex,         Output) \
+  ROW(GSVIn,    Invalid, Geometry,   Vertex,         Input) \
+  ROW(GSIn,     GSVIn,   Geometry,   None,           Invalid) \
+  ROW(GSOut,    Invalid, Geometry,   Vertex,         Output) \
+  ROW(PSIn,     Invalid, Pixel,      Vertex,         Input) \
+  ROW(PSOut,    Invalid, Pixel,      Target,         Output) \
+  ROW(CSIn,     Invalid, Compute,    None,           Invalid) \
+  ROW(Invalid,  Invalid, Invalid,    Invalid,        Invalid)
 // SIGPOINT-TABLE:END
 
 const SigPoint SigPoint::ms_SigPoints[kNumSigPointRecords] = {
@@ -49,46 +49,63 @@ const SigPoint SigPoint::ms_SigPoints[kNumSigPointRecords] = {
 
 // <py::lines('INTERPRETATION-TABLE')>hctdb_instrhelp.get_interpretation_table()</py>
 // INTERPRETATION-TABLE:BEGIN
-//  Semantic            VSIn,           VSOut,  PCIn,            HSIn,            HSCPIn, HSCPOut, PCOut,         DSIn,            DSCPIn, DSOut,  GSVIn,  GSIn,            GSOut,  PSIn,             PSOut,            CSIn
-#define DO_INTERPRETATION_TABLE(D) \
-  {/*Arbitrary*/        D(Arb),         D(Arb), D(NA),           D(NA),           D(Arb), D(Arb),  D(Arb),        D(Arb),          D(Arb), D(Arb), D(Arb), D(NA),           D(Arb), D(Arb),           D(NA),            D(NA)}, \
-  {/*VertexID*/         D(SV),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NA)}, \
-  {/*InstanceID*/       D(SV),          D(Arb), D(NA),           D(NA),           D(Arb), D(Arb),  D(NA),         D(NA),           D(Arb), D(Arb), D(Arb), D(NA),           D(Arb), D(Arb),           D(NA),            D(NA)}, \
-  {/*Position*/         D(Arb),         D(SV),  D(NA),           D(NA),           D(SV),  D(SV),   D(Arb),        D(Arb),          D(SV),  D(SV),  D(SV),  D(NA),           D(SV),  D(SV),            D(NA),            D(NA)}, \
-  {/*RenderTgArrayIdx*/ D(Arb),         D(SV),  D(NA),           D(NA),           D(SV),  D(SV),   D(Arb),        D(Arb),          D(SV),  D(SV),  D(SV),  D(NA),           D(SV),  D(SV),            D(NA),            D(NA)}, \
-  {/*ViewPortArrayIdx*/ D(Arb),         D(SV),  D(NA),           D(NA),           D(SV),  D(SV),   D(Arb),        D(Arb),          D(SV),  D(SV),  D(SV),  D(NA),           D(SV),  D(SV),            D(NA),            D(NA)}, \
-  {/*ClipDistance*/     D(Arb),         D(SV),  D(NA),           D(NA),           D(SV),  D(SV),   D(Arb),        D(Arb),          D(SV),  D(SV),  D(SV),  D(NA),           D(SV),  D(SV),            D(NA),            D(NA)}, \
-  {/*CullDistance*/     D(Arb),         D(SV),  D(NA),           D(NA),           D(SV),  D(SV),   D(Arb),        D(Arb),          D(SV),  D(SV),  D(SV),  D(NA),           D(SV),  D(SV),            D(NA),            D(NA)}, \
-  {/*OutputControlPtID*/D(NA),          D(NA),  D(NA),           D(NotInSig),     D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NA)}, \
-  {/*DomainLocation*/   D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NotInSig),     D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NA)}, \
-  {/*PrimitiveID*/      D(NA),          D(NA),  D(NotInSig),     D(NotInSig),     D(NA),  D(NA),   D(NA),         D(NotInSig),     D(NA),  D(NA),  D(NA),  D(Shadow),       D(SGV), D(SGV),           D(NA),            D(NA)}, \
-  {/*GSInstanceID*/     D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NotInSig),     D(NA),  D(NA),            D(NA),            D(NA)}, \
-  {/*SampleIndex*/      D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(Shadow _41),    D(NA),            D(NA)}, \
-  {/*IsFrontFace*/      D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(SGV), D(SGV),           D(NA),            D(NA)}, \
-  {/*Coverage*/         D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NotInSig _50),  D(NotPacked _41), D(NA)}, \
-  {/*InnerCoverage*/    D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NotInSig _50),  D(NA),            D(NA)}, \
-  {/*Target*/           D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(Target),        D(NA)}, \
-  {/*Depth*/            D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NotPacked),     D(NA)}, \
-  {/*DepthLessEqual*/   D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NotPacked _50), D(NA)}, \
-  {/*DepthGreaterEqual*/D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NotPacked _50), D(NA)}, \
-  {/*StencilRef*/       D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NotPacked _50), D(NA)}, \
-  {/*DispatchThreadID*/ D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NotInSig)}, \
-  {/*GroupID*/          D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NotInSig)}, \
-  {/*GroupIndex*/       D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NotInSig)}, \
-  {/*GroupThreadID*/    D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NotInSig)}, \
-  {/*TessFactor*/       D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(TessFactor), D(TessFactor),   D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NA)}, \
-  {/*InsideTessFactor*/ D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(TessFactor), D(TessFactor),   D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NA),            D(NA),            D(NA)}, \
-  {/*ViewID*/           D(NotInSig _61),D(NA),  D(NotInSig _61), D(NotInSig _61), D(NA),  D(NA),   D(NA),         D(NotInSig _61), D(NA),  D(NA),  D(NA),  D(NotInSig _61), D(NA),  D(NotInSig _61),  D(NA),            D(NA)}, \
-  {/*Barycentrics*/     D(NA),          D(NA),  D(NA),           D(NA),           D(NA),  D(NA),   D(NA),         D(NA),           D(NA),  D(NA),  D(NA),  D(NA),           D(NA),  D(NotPacked _61), D(NA),            D(NA)}, \
+//   Semantic,               VSIn,         VSOut, PCIn,         HSIn,         HSCPIn, HSCPOut, PCOut,      DSIn,         DSCPIn, DSOut, GSVIn, GSIn,         GSOut, PSIn,          PSOut,         CSIn
+#define DO_INTERPRETATION_TABLE(ROW) \
+  ROW(Arbitrary,              Arb,          Arb,   NA,           NA,           Arb,    Arb,     Arb,        Arb,          Arb,    Arb,   Arb,   NA,           Arb,   Arb,           NA,            NA) \
+  ROW(VertexID,               SV,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NA,            NA) \
+  ROW(InstanceID,             SV,           Arb,   NA,           NA,           Arb,    Arb,     NA,         NA,           Arb,    Arb,   Arb,   NA,           Arb,   Arb,           NA,            NA) \
+  ROW(Position,               Arb,          SV,    NA,           NA,           SV,     SV,      Arb,        Arb,          SV,     SV,    SV,    NA,           SV,    SV,            NA,            NA) \
+  ROW(RenderTargetArrayIndex, Arb,          SV,    NA,           NA,           SV,     SV,      Arb,        Arb,          SV,     SV,    SV,    NA,           SV,    SV,            NA,            NA) \
+  ROW(ViewPortArrayIndex,     Arb,          SV,    NA,           NA,           SV,     SV,      Arb,        Arb,          SV,     SV,    SV,    NA,           SV,    SV,            NA,            NA) \
+  ROW(ClipDistance,           Arb,          SV,    NA,           NA,           SV,     SV,      Arb,        Arb,          SV,     SV,    SV,    NA,           SV,    SV,            NA,            NA) \
+  ROW(CullDistance,           Arb,          SV,    NA,           NA,           SV,     SV,      Arb,        Arb,          SV,     SV,    SV,    NA,           SV,    SV,            NA,            NA) \
+  ROW(OutputControlPointID,   NA,           NA,    NA,           NotInSig,     NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NA,            NA) \
+  ROW(DomainLocation,         NA,           NA,    NA,           NA,           NA,     NA,      NA,         NotInSig,     NA,     NA,    NA,    NA,           NA,    NA,            NA,            NA) \
+  ROW(PrimitiveID,            NA,           NA,    NotInSig,     NotInSig,     NA,     NA,      NA,         NotInSig,     NA,     NA,    NA,    Shadow,       SGV,   SGV,           NA,            NA) \
+  ROW(GSInstanceID,           NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NotInSig,     NA,    NA,            NA,            NA) \
+  ROW(SampleIndex,            NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    Shadow _41,    NA,            NA) \
+  ROW(IsFrontFace,            NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           SGV,   SGV,           NA,            NA) \
+  ROW(Coverage,               NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NotInSig _50,  NotPacked _41, NA) \
+  ROW(InnerCoverage,          NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NotInSig _50,  NA,            NA) \
+  ROW(Target,                 NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            Target,        NA) \
+  ROW(Depth,                  NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NotPacked,     NA) \
+  ROW(DepthLessEqual,         NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NotPacked _50, NA) \
+  ROW(DepthGreaterEqual,      NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NotPacked _50, NA) \
+  ROW(StencilRef,             NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NotPacked _50, NA) \
+  ROW(DispatchThreadID,       NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NA,            NotInSig) \
+  ROW(GroupID,                NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NA,            NotInSig) \
+  ROW(GroupIndex,             NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NA,            NotInSig) \
+  ROW(GroupThreadID,          NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NA,            NA,            NotInSig) \
+  ROW(TessFactor,             NA,           NA,    NA,           NA,           NA,     NA,      TessFactor, TessFactor,   NA,     NA,    NA,    NA,           NA,    NA,            NA,            NA) \
+  ROW(InsideTessFactor,       NA,           NA,    NA,           NA,           NA,     NA,      TessFactor, TessFactor,   NA,     NA,    NA,    NA,           NA,    NA,            NA,            NA) \
+  ROW(ViewID,                 NotInSig _61, NA,    NotInSig _61, NotInSig _61, NA,     NA,      NA,         NotInSig _61, NA,     NA,    NA,    NotInSig _61, NA,    NotInSig _61,  NA,            NA) \
+  ROW(Barycentrics,           NA,           NA,    NA,           NA,           NA,     NA,      NA,         NA,           NA,     NA,    NA,    NA,           NA,    NotPacked _61, NA,            NA)
 // INTERPRETATION-TABLE:END
 
 const VersionedSemanticInterpretation SigPoint::ms_SemanticInterpretationTable[(unsigned)DXIL::SemanticKind::Invalid][(unsigned)SigPoint::Kind::Invalid] = {
 #define _41 ,4,1
 #define _50 ,5,0
 #define _61 ,6,1
-#define DO(k) VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::k)
-  DO_INTERPRETATION_TABLE(DO)
-#undef DO
+#define DO_ROW(SEM, VSIn, VSOut, PCIn, HSIn, HSCPIn, HSCPOut, PCOut, DSIn, DSCPIn, DSOut, GSVIn, GSIn, GSOut, PSIn, PSOut, CSIn) \
+  { VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::VSIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::VSOut), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::PCIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::HSIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::HSCPIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::HSCPOut), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::PCOut), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::DSIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::DSCPIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::DSOut), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::GSVIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::GSIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::GSOut), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::PSIn), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::PSOut), \
+    VersionedSemanticInterpretation(DXIL::SemanticInterpretationKind::CSIn), \
+  },
+  DO_INTERPRETATION_TABLE(DO_ROW)
+#undef DO_ROW
 };
 
 // -----------------------

+ 1 - 1
include/dxc/HLSL/DxilSignature.h

@@ -25,7 +25,7 @@ public:
   using Kind = DXIL::SignatureKind;
 
   DxilSignature(DXIL::ShaderKind shaderKind, DXIL::SignatureKind sigKind, bool useMinPrecision);
-  DxilSignature(DXIL::SigPointKind sigPointKind);
+  DxilSignature(DXIL::SigPointKind sigPointKind, bool useMinPrecision);
   DxilSignature(const DxilSignature &src);
   virtual ~DxilSignature();
 

+ 49 - 9
include/dxc/HLSL/DxilUtil.h

@@ -10,6 +10,10 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 #pragma once
+#include <unordered_set>
+#include <string>
+#include <memory>
+#include "llvm/ADT/StringRef.h"
 
 namespace llvm {
 class Type;
@@ -19,6 +23,10 @@ class Module;
 class MemoryBuffer;
 class LLVMContext;
 class DiagnosticInfo;
+class Value;
+class Instruction;
+class BasicBlock;
+class raw_ostream;
 }
 
 namespace hlsl {
@@ -27,31 +35,63 @@ class DxilFieldAnnotation;
 class DxilTypeSystem;
 
 namespace dxilutil {
+  extern const char ManglingPrefix[];
+  extern const char EntryPrefix[];
+  extern const llvm::StringRef kResourceMapErrorMsg;
+
   unsigned
   GetLegacyCBufferFieldElementSize(DxilFieldAnnotation &fieldAnnotation,
                                    llvm::Type *Ty, DxilTypeSystem &typeSys);
   llvm::Type *GetArrayEltTy(llvm::Type *Ty);
   bool HasDynamicIndexing(llvm::Value *V);
 
-  // Find alloca insertion point, given instruction
-  llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I);
-  llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F);
-  llvm::Instruction *SkipAllocas(llvm::Instruction *I);
-  // Get first non-alloca insertion point, to avoid inserting non-allocas before alloca
-  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I);
-  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB);
+  // Find alloca insertion point, given instruction
+  llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I);
+  llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F);
+  llvm::Instruction *SkipAllocas(llvm::Instruction *I);
+  // Get first non-alloca insertion point, to avoid inserting non-allocas before alloca
+  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I);
+  llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB);
   llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Function* F);
 
   bool IsStaticGlobal(llvm::GlobalVariable *GV);
   bool IsSharedMemoryGlobal(llvm::GlobalVariable *GV);
   bool RemoveUnusedFunctions(llvm::Module &M, llvm::Function *EntryFunc,
                              llvm::Function *PatchConstantFunc, bool IsLib);
-
+  void EmitErrorOnInstruction(llvm::Instruction *I, llvm::StringRef Msg);
+  void EmitResMappingError(llvm::Instruction *Res);
+  // Simple demangler; only supports the "\01?name@" pattern.
+  llvm::StringRef DemangleFunctionName(llvm::StringRef name);
+  // ReplaceFunctionName replaces the undecorated portion of originalName with undecorated newName
+  std::string ReplaceFunctionName(llvm::StringRef originalName, llvm::StringRef newName);
+  void PrintEscapedString(llvm::StringRef Name, llvm::raw_ostream &Out);
+  void PrintUnescapedString(llvm::StringRef Name, llvm::raw_ostream &Out);
+  // Change select/phi on operands into select/phi on operation.
+  // phi0 = phi a0, b0, c0
+  // phi1 = phi a1, b1, c1
+  // Inst = Add(phi0, phi1);
+  // into
+  // A = Add(a0, a1);
+  // B = Add(b0, b1);
+  // C = Add(c0, c1);
+  // NewInst = phi A, B, C
+  // Only supports 1 operand now; other operands should be Constant.
+  llvm::Value * SelectOnOperation(llvm::Instruction *Inst, unsigned operandIdx);
+  // Collect all select operands used by Inst.
+  void CollectSelect(llvm::Instruction *Inst,
+                   std::unordered_set<llvm::Instruction *> &selectSet);
+  // If all operands are the same for a select inst, replace it with the operand.
+  // Returns replacement value if successful
+  llvm::Value *MergeSelectOnSameValue(llvm::Instruction *SelInst,
+                                      unsigned startOpIdx,
+                                      unsigned numOperands);
   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::StringRef BC,
     llvm::LLVMContext &Ctx, std::string &DiagStr);
   std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::MemoryBuffer *MB,
     llvm::LLVMContext &Ctx, std::string &DiagStr);
   void PrintDiagnosticHandler(const llvm::DiagnosticInfo &DI, void *Context);
+  // Returns true if type contains HLSL Object type (resource)
+  bool ContainsHLSLObjectType(llvm::Type *Ty);
 }
 
-}
+}

+ 22 - 1
include/dxc/HLSL/DxilValidation.h

@@ -41,12 +41,19 @@ enum class ValidationRule : unsigned {
   ContainerRootSignatureIncompatible, // Root Signature in DXIL Container must be compatible with shader
 
   // Declaration
+  DeclAttrStruct, // Attributes parameter must be struct type
   DeclDxilFnExtern, // External function must be a DXIL function
   DeclDxilNsReserved, // The DXIL reserved prefixes must only be used by built-in functions and types
+  DeclExtraArgs, // Extra arguments not allowed for shader functions
   DeclFnAttribute, // Functions should only contain known function attributes
   DeclFnFlattenParam, // Function parameters must not use struct types
   DeclFnIsCalled, // Functions can only be used by call instructions
   DeclNotUsedExternal, // External declaration should not be used
+  DeclParamStruct, // Callable function parameter must be struct type
+  DeclPayloadStruct, // Payload parameter must be struct type
+  DeclResourceInFnSig, // Resources not allowed in function signatures
+  DeclShaderMissingArg, // payload/params/attributes parameter is required for certain shader types
+  DeclShaderReturnVoid, // Shader functions must return void
   DeclUsedExternalFunction, // External function must be used
   DeclUsedInternal, // Internal declaration must be used
 
@@ -56,6 +63,7 @@ enum class ValidationRule : unsigned {
   InstrBarrierModeForNonCS, // sync in a non-Compute Shader must only sync UAV (sync_uglobal)
   InstrBarrierModeNoMemory, // sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional. 
   InstrBarrierModeUselessUGroup, // sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
+  InstrBufferUpdateCounterOnResHasCounter, // BufferUpdateCounter valid only when HasCounter is true
   InstrBufferUpdateCounterOnUAV, // BufferUpdateCounter valid only on UAV
   InstrCBufferClassForCBufferHandle, // Expect Cbuffer for CBufferLoad handle
   InstrCBufferOutOfBound, // Cbuffer access out of bound
@@ -106,13 +114,17 @@ enum class ValidationRule : unsigned {
   InstrResourceKindForSampleC, // samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray
   InstrResourceKindForTextureLoad, // texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray
   InstrResourceKindForTextureStore, // texture store only works on Texture1D/1DArray/2D/2DArray/3D
+  InstrResourceKindForTraceRay, // TraceRay should only use RTAccelerationStructure
+  InstrResourceMapToSingleEntry, // Fail to map resource to resource table
   InstrResourceOffsetMiss, // offset uninitialized
   InstrResourceOffsetTooMany, // out of bound offset must be undef
+  InstrResourceUser, // Resource should only used by Load/GEP/Call
   InstrSampleCompType, // sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
   InstrSampleIndexForLoad2DMS, // load on Texture2DMS/2DMSArray require sampleIndex
   InstrSamplerModeForLOD, // lod instruction requires sampler declared in default mode
   InstrSamplerModeForSample, // sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode
   InstrSamplerModeForSampleC, // sample_c_*/gather_c instructions require sampler declared in comparison mode
+  InstrSignatureOperationNotInEntry, // Dxil operation for input output signature must be in entryPoints.
   InstrStatus, // Resource status should only used by CheckAccessFullyMapped
   InstrStructBitCast, // Bitcast on struct types is not allowed
   InstrTGSMRaceCond, // Race condition writing to shared memory detected, consider making this write conditional
@@ -142,6 +154,7 @@ enum class ValidationRule : unsigned {
   MetaInvalidControlFlowHint, // Invalid control flow hint
   MetaKnown, // Named metadata should be known
   MetaMaxTessFactor, // Hull Shader MaxTessFactor must be [%0..%1].  %2 specified
+  MetaNoEntryPropsForEntry, // EntryPoints must have entry properties.
   MetaNoSemanticOverlap, // Semantics must not overlap
   MetaRequired, // TODO - Required metadata missing
   MetaSemaKindMatchesName, // Semantic name must match system value, when defined.
@@ -176,11 +189,12 @@ enum class ValidationRule : unsigned {
   FlowReducible, // Execution flow must be reducible
 
   // Shader model
+  Sm64bitRawBufferLoadStore, // i64/f64 rawBufferLoad/Store overloads are allowed after SM 6.3
   SmAppendAndConsumeOnSameUAV, // BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
   SmCBufferElementOverflow, // CBuffer elements must not overflow
   SmCBufferOffsetOverlap, // CBuffer offsets must not overlap
   SmCBufferTemplateTypeMustBeStruct, // D3D12 constant/texture buffer template element can only be a struct
-  SmCSNoReturn, // Compute shaders can't return values, outputs must be written in writable resources (UAVs).
+  SmCSNoSignatures, // Compute shaders must not have shader signatures.
   SmCompletePosition, // Not all elements of SV_Position were written
   SmCounterOnlyOnStructBuf, // BufferUpdateCounter valid only on structured buffers
   SmDSInputControlPointCountRange, // DS input control point count must be [0..%0].  %1 specified
@@ -217,6 +231,8 @@ enum class ValidationRule : unsigned {
   SmPSTargetIndexMatchesRow, // SV_Target semantic index must match packed row location
   SmPatchConstantOnlyForHSDS, // patch constant signature only valid in HS and DS
   SmROVOnlyInPS, // RasterizerOrdered objects are only allowed in 5.0+ pixel shaders
+  SmRayShaderPayloadSize, // For shader '%0', %1 size is smaller than argument's allocation size
+  SmRayShaderSignatures, // Ray tracing shader '%0' should not have any shader signatures
   SmResourceRangeOverlap, // Resource ranges must not overlap
   SmSampleCountOnlyOn2DMS, // Only Texture2DMS/2DMSArray could has sample count
   SmSemantic, // Semantic must be defined in target shader model
@@ -259,6 +275,11 @@ bool VerifyPSVMatches(_In_ llvm::Module *pModule,
                       _In_reads_bytes_(PSVSize) const void *pPSVData,
                       _In_ uint32_t PSVSize);
 
+// RDAT = runtime data (see DxilRuntimeReflection.h)
+bool VerifyRDATMatches(_In_ llvm::Module *pModule,
+                       _In_reads_bytes_(RDATSize) const void *pRDATData,
+                       _In_ uint32_t RDATSize);
+
 bool VerifyFeatureInfoMatches(_In_ llvm::Module *pModule,
                               _In_reads_bytes_(FeatureInfoSize) const void *pFeatureInfoData,
                               _In_ uint32_t FeatureInfoSize);

+ 5 - 0
include/dxc/HLSL/HLMatrixLowerHelper.h

@@ -22,9 +22,14 @@ namespace llvm {
 
 namespace hlsl {
 
+class DxilFieldAnnotation;
+class DxilTypeSystem;
+
 namespace HLMatrixLower {
 // TODO: use type annotation.
 bool IsMatrixType(llvm::Type *Ty);
+DxilFieldAnnotation *FindAnnotationFromMatUser(llvm::Value *Mat,
+                                               DxilTypeSystem &typeSys);
 // Translate matrix type to vector type.
 llvm::Type *LowerMatrixType(llvm::Type *Ty);
 // TODO: use type annotation.

+ 25 - 3
include/dxc/HLSL/HLModule.h

@@ -24,6 +24,7 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
+#include <unordered_set>
 
 namespace llvm {
 class LLVMContext;
@@ -63,6 +64,8 @@ struct HLOptions {
   unsigned unused                  : 24;
 };
 
+typedef std::unordered_map<const llvm::Function *, std::unique_ptr<DxilFunctionProps>> DxilFunctionPropsMap;
+
 /// Use this class to manipulate HLDXIR of a shader.
 class HLModule {
 public:
@@ -82,6 +85,11 @@ public:
   void SetHLOptions(HLOptions &opts);
   const HLOptions &GetHLOptions() const;
 
+  // AutoBindingSpace also enables automatic binding for libraries if set.
+  // UINT_MAX == unset
+  void SetAutoBindingSpace(uint32_t Space);
+  uint32_t GetAutoBindingSpace() const;
+
   // Entry function.
   llvm::Function *GetEntryFunction() const;
   void SetEntryFunction(llvm::Function *pEntryFunc);
@@ -127,6 +135,14 @@ public:
   bool HasDxilFunctionProps(llvm::Function *F);
   DxilFunctionProps &GetDxilFunctionProps(llvm::Function *F);
   void AddDxilFunctionProps(llvm::Function *F, std::unique_ptr<DxilFunctionProps> &info);
+  void SetPatchConstantFunctionForHS(llvm::Function *hullShaderFunc, llvm::Function *patchConstantFunc);
+  bool IsGraphicsShader(llvm::Function *F); // vs,hs,ds,gs,ps
+  bool IsPatchConstantShader(llvm::Function *F);
+  bool IsComputeShader(llvm::Function *F);
+
+  // Is an entry function that uses input/output signature conventions?
+  // Includes: vs/hs/ds/gs/ps/cs as well as the patch constant function.
+  bool IsEntryThatUsesSignatures(llvm::Function *F);
 
   DxilFunctionAnnotation *GetFunctionAnnotation(llvm::Function *F);
   DxilFunctionAnnotation *AddFunctionAnnotation(llvm::Function *F);
@@ -140,6 +156,10 @@ public:
   void SetFloat32DenormMode(const DXIL::Float32DenormMode mode);
   DXIL::Float32DenormMode GetFloat32DenormMode() const;
 
+  // Default function linkage for libraries
+  DXIL::DefaultLinkage GetDefaultLinkage() const;
+  void SetDefaultLinkage(const DXIL::DefaultLinkage linkage);
+
   // HLDXIR metadata manipulation.
   /// Serialize HLDXIR in-memory form to metadata form.
   void EmitHLMetadata();
@@ -204,8 +224,7 @@ public:
   DxilTypeSystem *ReleaseTypeSystem();
   OP *ReleaseOP();
   RootSignatureHandle *ReleaseRootSignature();
-  std::unordered_map<llvm::Function *, std::unique_ptr<DxilFunctionProps>> &&
-  ReleaseFunctionPropsMap();
+  DxilFunctionPropsMap &&ReleaseFunctionPropsMap();
 
   llvm::DebugInfoFinder &GetOrCreateDebugInfoFinder();
   static llvm::DIGlobalVariable *
@@ -237,7 +256,8 @@ private:
   std::vector<llvm::GlobalVariable*>  m_TGSMVariables;
 
   // High level function info.
-  std::unordered_map<llvm::Function *, std::unique_ptr<DxilFunctionProps>>  m_DxilFunctionPropsMap;
+  std::unordered_map<const llvm::Function *, std::unique_ptr<DxilFunctionProps>>  m_DxilFunctionPropsMap;
+  std::unordered_set<llvm::Function *>  m_PatchConstantFunctions;
 
   // Resource type annotation.
   std::unordered_map<llvm::Type *, std::pair<DXIL::ResourceClass, DXIL::ResourceKind>> m_ResTypeAnnotation;
@@ -258,6 +278,8 @@ private:
   HLOptions m_Options;
   std::unique_ptr<OP> m_pOP;
   size_t m_pUnused;
+  uint32_t m_AutoBindingSpace;
+  DXIL::DefaultLinkage m_DefaultLinkage;
 
   // DXIL metadata serialization/deserialization.
   llvm::MDTuple *EmitHLResources();

+ 7 - 0
include/dxc/HLSL/HLOperations.h

@@ -329,6 +329,13 @@ const unsigned kWaveAllEqualValueOpIdx = 1;
 const unsigned kCreateHandleResourceOpIdx = 1;
 const unsigned kCreateHandleIndexOpIdx = 2; // Only for array of cbuffer.
 
+// TraceRay.
+const unsigned kTraceRayRayDescOpIdx = 7;
+const unsigned kTraceRayPayLoadOpIdx = 8;
+
+// ReportIntersection.
+const unsigned kReportIntersectionAttributeOpIdx = 3;
+
 } // namespace HLOperandIndex
 
 llvm::Function *GetOrCreateHLFunction(llvm::Module &M,

+ 24 - 0
include/dxc/HlslIntrinsicOp.h

@@ -21,13 +21,17 @@ import hctdb_instrhelp
 
 /* <py::lines('HLSL-INTRINSICS')>hctdb_instrhelp.enum_hlsl_intrinsics()</py>*/
 // HLSL-INTRINSICS:BEGIN
+  IOP_AcceptHitAndEndSearch,
   IOP_AddUint64,
   IOP_AllMemoryBarrier,
   IOP_AllMemoryBarrierWithGroupSync,
+  IOP_CallShader,
   IOP_CheckAccessFullyMapped,
   IOP_D3DCOLORtoUBYTE4,
   IOP_DeviceMemoryBarrier,
   IOP_DeviceMemoryBarrierWithGroupSync,
+  IOP_DispatchRaysDimensions,
+  IOP_DispatchRaysIndex,
   IOP_EvaluateAttributeAtSample,
   IOP_EvaluateAttributeCentroid,
   IOP_EvaluateAttributeSnapped,
@@ -36,6 +40,10 @@ import hctdb_instrhelp
   IOP_GetRenderTargetSamplePosition,
   IOP_GroupMemoryBarrier,
   IOP_GroupMemoryBarrierWithGroupSync,
+  IOP_HitKind,
+  IOP_IgnoreHit,
+  IOP_InstanceID,
+  IOP_InstanceIndex,
   IOP_InterlockedAdd,
   IOP_InterlockedAnd,
   IOP_InterlockedCompareExchange,
@@ -46,6 +54,12 @@ import hctdb_instrhelp
   IOP_InterlockedOr,
   IOP_InterlockedXor,
   IOP_NonUniformResourceIndex,
+  IOP_ObjectRayDirection,
+  IOP_ObjectRayOrigin,
+  IOP_ObjectToWorld,
+  IOP_ObjectToWorld3x4,
+  IOP_ObjectToWorld4x3,
+  IOP_PrimitiveIndex,
   IOP_Process2DQuadTessFactorsAvg,
   IOP_Process2DQuadTessFactorsMax,
   IOP_Process2DQuadTessFactorsMin,
@@ -60,6 +74,11 @@ import hctdb_instrhelp
   IOP_QuadReadAcrossX,
   IOP_QuadReadAcrossY,
   IOP_QuadReadLaneAt,
+  IOP_RayFlags,
+  IOP_RayTCurrent,
+  IOP_RayTMin,
+  IOP_ReportHit,
+  IOP_TraceRay,
   IOP_WaveActiveAllEqual,
   IOP_WaveActiveAllTrue,
   IOP_WaveActiveAnyTrue,
@@ -80,6 +99,11 @@ import hctdb_instrhelp
   IOP_WavePrefixSum,
   IOP_WaveReadLaneAt,
   IOP_WaveReadLaneFirst,
+  IOP_WorldRayDirection,
+  IOP_WorldRayOrigin,
+  IOP_WorldToObject,
+  IOP_WorldToObject3x4,
+  IOP_WorldToObject4x3,
   IOP_abort,
   IOP_abs,
   IOP_acos,

+ 4 - 0
include/dxc/Support/HLSLOptions.h

@@ -110,6 +110,8 @@ public:
   llvm::StringRef VerifyRootSignatureSource; //OPT_verifyrootsignature
   llvm::StringRef RootSignatureDefine; // OPT_rootsig_define
   llvm::StringRef FloatDenormalMode; // OPT_denorm
+  std::vector<std::string> Exports; // OPT_exports
+  llvm::StringRef DefaultLinkage; // OPT_default_linkage
 
   bool AllResourcesBound = false; // OPT_all_resources_bound
   bool AstDump = false; // OPT_ast_dump
@@ -156,6 +158,8 @@ public:
   bool DisassembleByteOffset = false; //OPT_No
   bool DisaseembleHex = false; //OPT_Lx
   bool LegacyMacroExpansion = false; // OPT_flegacy_macro_expansion
+  unsigned long AutoBindingSpace = UINT_MAX; // OPT_auto_binding_space
+  bool ExportShadersOnly = false; // OPT_export_shaders_only
 
   bool IsRootSignatureProfile();
   bool IsLibraryProfile();

+ 9 - 1
include/dxc/Support/HLSLOptions.td

@@ -232,6 +232,14 @@ def rootsig_define : Separate<["-", "/"], "rootsig-define">, Group<hlslcomp_Grou
 def enable_16bit_types: Flag<["-", "/"], "enable-16bit-types">, Flags<[CoreOption, DriverOption]>, Group<hlslcomp_Group>,
   HelpText<"Enable 16bit types and disable min precision types. Available in HLSL 2018 and shader model 6.2">;
 def ignore_line_directives : Flag<["-", "/"], "ignore-line-directives">, HelpText<"Ignore line directives">, Flags<[CoreOption]>, Group<hlslcomp_Group>;
+def auto_binding_space : Separate<["-", "/"], "auto-binding-space">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"Set auto binding space - enables auto resource binding in libraries">;
+def exports : Separate<["-", "/"], "exports">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"Specify exports when compiling a library: export1[[,export1_clone,...]=internal_name][;...]">;
+def export_shaders_only : Flag<["-", "/"], "export-shaders-only">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"Only export shaders when compiling a library">;
+def default_linkage : Separate<["-", "/"], "default-linkage">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"Set default linkage for non-shader functions when compiling or linking to a library target (internal, external)">;
 
 // SPIRV Change Starts
 def spirv : Flag<["-"], "spirv">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
@@ -277,7 +285,7 @@ def Oconfig : CommaJoined<["-"], "Oconfig=">, Group<spirv_Group>, Flags<[CoreOpt
 // fxc-based flags that don't match those previously defined.
 
 def target_profile : JoinedOrSeparate<["-", "/"], "T">, Flags<[CoreOption]>, Group<hlslcomp_Group>, MetaVarName<"<profile>">,
-  HelpText<"Set target profile. \n\t<profile>: ps_6_0, ps_6_1, ps_6_2, vs_6_0, vs_6_1, vs_6_2, \n\t\t cs_6_0, cs_6_1, cs_6_2, gs_6_0, gs_6_1, gs_6_2, \n\t\t ds_6_0, ds_6_1, ds_6_2, hs_6_0, hs_6_1, hs_6_2, \n\t\t lib_6_0, lib_6_1, lib_6_2">;
+  HelpText<"Set target profile. \n\t<profile>: ps_6_0, ps_6_1, ps_6_2, ps_6_3, \n\t\t vs_6_0, vs_6_1, vs_6_2, vs_6_3, \n\t\t cs_6_0, cs_6_1, cs_6_2, cs_6_3, \n\t\t gs_6_0, gs_6_1, gs_6_2, gs_6_3, \n\t\t ds_6_0, ds_6_1, ds_6_2, ds_6_3, \n\t\t hs_6_0, hs_6_1, hs_6_2, hs_6_3, \n\t\t lib_6_3">;
 def entrypoint :  JoinedOrSeparate<["-", "/"], "E">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
   HelpText<"Entry point name">;
 // /I <include> - already defined above

+ 1 - 0
include/dxc/Support/WinAdapter.h

@@ -292,6 +292,7 @@
 #define _Ret_opt_
 
 #define _Use_decl_annotations_
+#define __analysis_assume(expr)
 #define _Analysis_assume_(expr)
 #define _Analysis_assume_nullterminated_(x)
 #define _Success_(expr)

+ 4 - 0
include/dxc/dxcapi.h

@@ -293,6 +293,10 @@ IDxcAssembler : public IUnknown {
   DECLARE_CROSS_PLATFORM_UUIDOF(IDxcAssembler)
 };
 
+// D3D_SIT_RTACCELERATIONSTRUCTURE is an additional value for D3D_SHADER_INPUT_TYPE,
+// in order to fit it in to ID3D12LibraryReflection.
+static const UINT32 D3D_SIT_RTACCELERATIONSTRUCTURE = 12; // (D3D_SIT_UAV_RWSTRUCTURED_WITH_COUNTER + 1)
+
 struct __declspec(uuid("d2c21b26-8350-4bdc-976a-331ce6f4c54c"))
 IDxcContainerReflection : public IUnknown {
   virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pContainer) = 0; // Container to load.

+ 5 - 1
include/dxc/dxcapi.internal.h

@@ -82,7 +82,11 @@ enum LEGAL_INTRINSIC_COMPTYPES {
   LICOMPTYPE_UINT16 = 28,
   LICOMPTYPE_NUMERIC16_ONLY = 29,
 
-  LICOMPTYPE_COUNT = 30
+  LICOMPTYPE_RAYDESC = 30,
+  LICOMPTYPE_ACCELERATION_STRUCT = 31,
+  LICOMPTYPE_USER_DEFINED_TYPE = 32,
+
+  LICOMPTYPE_COUNT = 33
 };
 
 static const BYTE IA_SPECIAL_BASE = 0xf0;

+ 113 - 0
include/dxc/dxcdxrfallbackcompiler.h

@@ -0,0 +1,113 @@
+
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// dxcapi.h                                                                  //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides declarations for the DXR fallback compiler API.                  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef __DXC_DXR_FALLBACK_COMPILER_API__
+#define __DXC_DXR_FALLBACK_COMPILER_API__
+#include "dxcapi.h"
+
+// Kind of shader recorded in a DxcShaderInfo entry (see DxcShaderInfo::Type).
+// NOTE(review): declared at global scope under a very generic name - confirm it
+// cannot collide with other ShaderType declarations visible to includers.
+enum class ShaderType : unsigned int
+{
+    Raygen,
+    AnyHit,
+    ClosestHit,
+    Intersection,
+    Miss,
+    Callable,
+    Lib,
+};
+
+// Per-shader record exchanged with IDxcDxrFallbackCompiler: Compile() takes an
+// output array of these and Link() takes an input array, with one entry per
+// name in pShaderNames.
+struct DxcShaderInfo
+{
+    UINT32 Identifier;  // NOTE(review): meaning not visible in this header - confirm
+    UINT32 StackSize;   // NOTE(review): presumably in bytes - confirm
+    ShaderType Type;    // Kind of shader this entry describes
+};
+
+// Pointer/size pair describing a shader bytecode buffer; used for the library
+// inputs of RenameAndLink() and Compile() and for PatchShaderBindingTables().
+struct DxcShaderBytecode
+{
+    LPBYTE pData;  // Start of the buffer
+    UINT32 Size;   // NOTE(review): presumably the buffer size in bytes - confirm
+};
+
+// Pair of export names consumed by IDxcDxrFallbackCompiler::RenameAndLink().
+// NOTE(review): presumably ExportToRename is the existing export and ExportName
+// is the name it should receive - confirm against the implementation.
+struct DxcExportDesc
+{
+    LPCWSTR ExportToRename;
+    LPCWSTR ExportName;
+};
+
+// COM-style interface (derives from IUnknown) for the DXR fallback compiler.
+// Every method returns an HRESULT; the methods taking ppResult hand back an
+// IDxcOperationResult through it.
+struct __declspec(uuid("76bb3c85-006d-4b72-9e10-63cd97df57f0"))
+  IDxcDxrFallbackCompiler : public IUnknown {
+
+  // If set to true then shaders not listed in pShaderNames in Compile() but
+  // called by shaders in pShaderNames are added to the final compute shader.
+  // Otherwise these are considered errors. This is intended for testing purposes.
+  virtual HRESULT STDMETHODCALLTYPE SetFindCalledShaders(bool val) = 0;
+
+  // NOTE(review): the meaning of val (flag vs. verbosity level) is not visible
+  // in this header - confirm against the implementation.
+  virtual HRESULT STDMETHODCALLTYPE SetDebugOutput(int val) = 0;
+
+  // Takes libCount libraries and ExportCount export descriptions.
+  // NOTE(review): presumably links pLibs while renaming each
+  // DxcExportDesc::ExportToRename to DxcExportDesc::ExportName - confirm.
+  virtual HRESULT STDMETHODCALLTYPE RenameAndLink(
+      _In_count_(libCount) DxcShaderBytecode *pLibs,
+      UINT32 libCount,
+      _In_count_(ExportCount) DxcExportDesc *pExports,
+      UINT32 ExportCount,
+      _COM_Outptr_ IDxcOperationResult **ppResult
+  ) = 0;
+
+  // NOTE(review): pShaderInfo is an untyped void*; the expected pointee type is
+  // not visible in this header - confirm against the implementation.
+  virtual HRESULT STDMETHODCALLTYPE PatchShaderBindingTables(
+      _In_ const LPCWSTR pEntryName,
+      _In_ DxcShaderBytecode *pShaderBytecode,
+      _In_ void *pShaderInfo,
+      _COM_Outptr_ IDxcOperationResult **ppResult
+  ) = 0;
+
+  // Compiles libs together to create a raytracing compute shader. One of the libs
+  // should be the fallback implementation lib that defines functions like
+  // Fallback_TraceRay(), Fallback_ReportHit(), etc. Fallback_TraceRay() should
+  // be one of the shader names so that it gets included in the compile.
+  virtual HRESULT STDMETHODCALLTYPE Compile(
+    _In_count_(libCount) DxcShaderBytecode *pLibs,                  // Array of libraries containing shaders
+    UINT32 libCount,                                        // Number of libraries containing shaders
+    _In_count_(shaderCount) const LPCWSTR *pShaderNames,    // Array of shader names to compile
+    _Out_writes_(shaderCount) DxcShaderInfo *pShaderInfo,   // Array of shaderInfo corresponding to pShaderNames
+    UINT32 shaderCount,                                     // Number of shaders to compile
+    UINT32 maxAttributeSize,
+    _COM_Outptr_ IDxcOperationResult **ppResult             // Compiler output status, buffer, and errors
+  ) = 0;
+
+  // Unlike Compile(), takes the libraries as IDxcBlob pointers and reads
+  // pShaderInfo as an input array.
+  // NOTE(review): pEntryName is annotated _In_ although its comment allows
+  // null; _In_opt_ may be what is intended - confirm.
+  virtual HRESULT STDMETHODCALLTYPE Link(
+      _In_ const LPCWSTR pEntryName,                          // Name of entry function, null if compiling a collection
+      _In_count_(libCount) IDxcBlob **pLibs,                  // Array of libraries containing shaders
+      UINT32 libCount,                                        // Number of libraries containing shaders
+      _In_count_(shaderCount) const LPCWSTR *pShaderNames,    // Array of shader names to compile
+      _In_count_(shaderCount) DxcShaderInfo *pShaderInfo,   // Array of shaderInfo corresponding to pShaderNames
+      UINT32 shaderCount,                                     // Number of shaders to compile
+      UINT32 maxAttributeSize,
+      UINT32 stackSizeInBytes,                                // Continuation stack size. Use 0 for default.
+      _COM_Outptr_ IDxcOperationResult **ppResult             // Compiler output status, buffer, and errors
+  ) = 0;
+};
+
+// Class id for the DXR fallback compiler.
+// {76bb3c85-006d-4b72-9e10-63cd97df57f0}
+// NOTE(review): this is the same GUID value as the uuid attached to the
+// IDxcDxrFallbackCompiler interface - confirm that sharing it is intended.
+__declspec(selectany) extern const GUID CLSID_DxcDxrFallbackCompiler = {
+  0x76bb3c85,
+  0x006d,
+  0x4b72,
+{ 0x9e, 0x10, 0x63, 0xcd, 0x97, 0xdf, 0x57, 0xf0 }
+};
+
+// Function-pointer type for a creation entry point taking a class id, an
+// interface id and an out pointer that receives the created object.
+// NOTE(review): presumably matches the exported factory used to obtain
+// IDxcDxrFallbackCompiler instances - confirm against the DLL exports.
+typedef HRESULT(__stdcall *DxcCreateDxrFallbackCompilerProc)(
+  _In_ REFCLSID   rclsid,
+  _In_ REFIID     riid,
+  _Out_ LPVOID*   ppv
+  );
+
+#endif

+ 3 - 0
include/llvm/Support/FileSystem.h

@@ -54,8 +54,11 @@ namespace fs {
 class MSFileSystem;
 typedef _Inout_ MSFileSystem* MSFileSystemRef;
 
+std::error_code GetFileSystemTlsStatus() throw();
+
 std::error_code SetupPerThreadFileSystem() throw();
 void CleanupPerThreadFileSystem() throw();
+// Helper whose destructor calls CleanupPerThreadFileSystem(), so the cleanup
+// runs when an instance goes out of scope.
+struct AutoCleanupPerThreadFileSystem { ~AutoCleanupPerThreadFileSystem() { CleanupPerThreadFileSystem(); } };
 
 /// <summary>Gets a reference to the file system installed for the current thread (possibly NULL).</summary>
 /// <remarks>In practice, consumers of the library should always install a file system.</remarks>

+ 1 - 1
include/llvm/Support/raw_ostream.h

@@ -238,7 +238,7 @@ public:
   raw_ostream &operator<<(const FormattedNumber &);
 
   raw_ostream &
-  operator<<(std::ios_base &(*iomanip)(std::ios_base &)); // HLSL Change
+  operator<<(std::ios_base &(__cdecl*iomanip)(std::ios_base &)); // HLSL Change
 
   /// indent - Insert 'NumSpaces' spaces.
   raw_ostream &indent(unsigned NumSpaces);

+ 1 - 0
lib/CMakeLists.txt

@@ -22,3 +22,4 @@ add_subdirectory(ProfileData)
 # add_subdirectory(LibDriver) # HLSL Change
 add_subdirectory(DxcSupport) # HLSL Change
 add_subdirectory(HLSL) # HLSL Change
+add_subdirectory(DxrFallback) # HLSL Change

+ 72 - 11
lib/DxcSupport/HLSLOptions.cpp

@@ -193,17 +193,29 @@ StringRefUtf16::StringRefUtf16(llvm::StringRef value) {
 }
 
 static bool GetTargetVersionFromString(llvm::StringRef ref, unsigned *major, unsigned *minor) {
-  try {
-    *major = (unsigned)std::stoul(std::string(1, ref[ref.size() - 3]));
-    *minor = (unsigned)std::stoul(std::string(1, ref[ref.size() - 1]));
-    return true;
-  }
-  catch (std::invalid_argument &) {
+  // Parses the trailing "_<major>_<minor>" of a profile string such as
+  // "ps_6_0". Both outputs are first reset to (unsigned)-1; returns false when
+  // the string does not have that shape.
+  *major = *minor = -1;
+  unsigned len = ref.size();
+  if (len < 6 || len > 11) // length: ps_6_0 to rootsig_1_0
     return false;
-  }
-  catch (std::out_of_range &) {
+  // Both version fields are single characters, each preceded by '_'.
+  if (ref[len - 4] != '_' || ref[len - 2] != '_')
     return false;
-  }
+
+  char cMajor = ref[len - 3];
+  char cMinor = ref[len - 1];
+
+  if (cMajor >= '0' && cMajor <= '9')
+    *major = cMajor - '0';
+  else
+    return false;
+
+  // A minor of 'x' is reported as 0xF; ReadDxcOpts treats a library profile
+  // with Minor == 0xF as an offline-link-only target.
+  if (cMinor == 'x')
+    *minor = 0xF;
+  else if (cMinor >= '0' && cMinor <= '9')
+    *minor = cMinor - '0';
+  else
+    return false;
+
+  return true;
 }
 
 // SPIRV Change Starts
@@ -338,6 +350,17 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
       // Set entry point to impossible name.
       opts.EntryPoint = "lib.no::entry";
     }
+  } else {
+    if (Args.getLastArg(OPT_exports)) {
+      errors << "library profile required when using -exports option";
+      return 1;
+    } else if (Args.hasFlag(OPT_export_shaders_only, OPT_INVALID, false)) {
+      errors << "library profile required when using -export-shaders-only option";
+      return 1;
+    } else if (Args.getLastArg(OPT_default_linkage)) {
+      errors << "library profile required when using -default-linkage option";
+      return 1;
+    }
   }
 
   opts.EnableBackCompatMode = Args.hasFlag(OPT_Gec, OPT_INVALID, false);
@@ -425,11 +448,34 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
     }
   }
 
+  llvm::StringRef auto_binding_space = Args.getLastArgValue(OPT_auto_binding_space);
+  if (!auto_binding_space.empty()) {
+    if (auto_binding_space.getAsInteger(10, opts.AutoBindingSpace)) {
+      errors << "Unsupported value '" << auto_binding_space << "' for auto binding space.";
+      return 1;
+    }
+  }
+
+  opts.Exports = Args.getAllArgValues(OPT_exports);
+
+  // -default-linkage accepts only "internal" or "external", compared
+  // case-insensitively; anything else is reported and aborts option parsing.
+  opts.DefaultLinkage = Args.getLastArgValue(OPT_default_linkage);
+  if (!opts.DefaultLinkage.empty()) {
+    if (!(opts.DefaultLinkage.equals_lower("internal") ||
+          opts.DefaultLinkage.equals_lower("external"))) {
+      // Close the quote opened before the value and keep a space before "for",
+      // matching the auto-binding-space message format.
+      errors << "Unsupported value '" << opts.DefaultLinkage
+             << "' for -default-linkage option.";
+      return 1;
+    }
+  }
+
   // Check options only allowed in shader model >= 6.2 (FPDenormalMode)
   unsigned Major = 0;
   unsigned Minor = 0;
   if (!opts.TargetProfile.empty()) {
-    GetTargetVersionFromString(opts.TargetProfile, &Major, &Minor);
+    if (!GetTargetVersionFromString(opts.TargetProfile, &Major, &Minor)) {
+      errors << "unable to parse shader model.";
+      return 1;
+    }
   }
 
   if (opts.TargetProfile.empty() || Major < 6 || (Major == 6 && Minor < 2)) {
@@ -492,6 +538,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
   opts.DisassembleByteOffset = Args.hasFlag(OPT_No, OPT_INVALID, false);
   opts.DisaseembleHex = Args.hasFlag(OPT_Lx, OPT_INVALID, false);
   opts.LegacyMacroExpansion = Args.hasFlag(OPT_flegacy_macro_expansion, OPT_INVALID, false);
+  opts.ExportShadersOnly = Args.hasFlag(OPT_export_shaders_only, OPT_INVALID, false);
 
   if (opts.DefaultColMajor && opts.DefaultRowMajor) {
     errors << "Cannot specify /Zpr and /Zpc together, use /? to get usage information";
@@ -564,7 +611,21 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
     return 1;
   }
 
-  // SPIRV Change Starts
+  if (opts.IsLibraryProfile() && Minor == 0xF) {
+    // Disable validation for offline link only target
+    opts.DisableValidation = true;
+  }
+
+  // Disable lib_6_1 and lib_6_2 if /Vd is not present
+  if (opts.IsLibraryProfile() && (Major < 6 || (Major == 6 && Minor < 3))) {
+    if (!opts.DisableValidation) {
+      errors << "Must disable validation for unsupported lib_6_1 or lib_6_2 "
+                "targets.";
+      return 1;
+    }
+  }
+
+    // SPIRV Change Starts
 #ifdef ENABLE_SPIRV_CODEGEN
   opts.GenSPIRV = Args.hasFlag(OPT_spirv, OPT_INVALID, false);
   opts.SpirvOptions.invertY = Args.hasFlag(OPT_fvk_invert_y, OPT_INVALID, false);

+ 14 - 0
lib/DxrFallback/CMakeLists.txt

@@ -0,0 +1,14 @@
+# Library target for the DXR fallback compiler sources.
+add_llvm_library(LLVMDxrFallback
+  DxrFallbackCompiler.cpp
+  FunctionBuilder.h
+  LiveValues.cpp
+  LiveValues.h
+  LLVMUtils.cpp
+  LLVMUtils.h
+  Reducibility.h
+  Reducibility.cpp
+  StateFunctionTransform.cpp
+  StateFunctionTransform.h
+)
+
+# Depend on LLVM's generated-intrinsics target; the name must be spelled
+# "intrinsics_gen" or no ordering on the real target is established.
+add_dependencies(LLVMDxrFallback intrinsics_gen)

+ 864 - 0
lib/DxrFallback/DxrFallbackCompiler.cpp

@@ -0,0 +1,864 @@
+#include "dxc/DxrFallback/DxrFallbackCompiler.h"
+
+#include "dxc/Support/Global.h"
+#include "dxc/Support/Unicode.h"
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/dxcapi.h"
+#include "dxc/dxcdxrfallbackcompiler.h"
+#include "dxc/Support/dxcapi.use.h"
+#include "dxc/Support/dxcapi.impl.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/DxilLinker.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilInstructions.h"
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "FunctionBuilder.h"
+#include "LLVMUtils.h"
+#include "runtime.h"
+#include "StateFunctionTransform.h"
+
+#include <queue>
+
+using namespace hlsl;
+using namespace llvm;
+
+// Returns every function of `module` whose name begins with `prefix`, in the
+// module's iteration order.
+static std::vector<Function*> getFunctionsWithPrefix(Module* module, const std::string& prefix)
+{
+  std::vector<Function*> matches;
+  for (Function& candidate : module->functions())
+  {
+    if (!candidate.getName().startswith(prefix))
+      continue;
+    matches.push_back(&candidate);
+  }
+  return matches;
+}
+
+
+// Inlines the callee of `call` at the call site. When the callee is only a
+// declaration in the caller's module, the body of `Fimpl` is first cloned into
+// it and the callee is given internal linkage.
+// Returns the result of llvm::InlineFunction().
+static bool inlineFunc(CallInst* call, Function* Fimpl)
+{
+  // Note. LLVM inlining may not be sufficient if the function references DX 
+  // resources because the corresponding metadata is not created if the function
+  // comes from another module.
+
+  // Make sure that we have a definition for the called function in this module
+  // NOTE(review): getCalledFunction() is dereferenced without a null check, so
+  // this presumably expects direct calls only - confirm.
+  Function* F = call->getCalledFunction();
+  Module* dstM = F->getParent();
+  if (F->isDeclaration())
+  {
+    // Map called functions in impl module to functions in this one (because the
+    // cloning step doesn't do this automatically)
+    ValueToValueMapTy VMap;
+    for (auto& I : inst_range(Fimpl))
+    {
+      if (CallInst* c = dyn_cast<CallInst>(&I))
+      {
+        Function* calledFimpl = c->getCalledFunction();
+        // Each distinct callee only needs one mapping.
+        if (VMap.count(calledFimpl))
+          continue;
+
+        Constant* calledF = dstM->getOrInsertFunction(calledFimpl->getName(), calledFimpl->getFunctionType(), calledFimpl->getAttributes());
+        VMap[calledFimpl] = calledF;
+      }
+    }
+
+    // Map arguments
+    for (auto SI = Fimpl->arg_begin(), SE = Fimpl->arg_end(), DI = F->arg_begin(); SI != SE; ++SI, ++DI)
+      VMap[SI] = DI;
+
+    // The collected return instructions are not used afterwards.
+    SmallVector<ReturnInst*, 4> returns;
+    CloneFunctionInto(F, Fimpl, VMap, true, returns);
+    F->setLinkage(GlobalValue::InternalLinkage);
+  }
+
+  InlineFunctionInfo IFI;
+  return InlineFunction(call, IFI, false);
+}
+
+
+// Strips name decoration of the form "\x1?<name>@@<suffix>" down to "<name>".
+// Names without the "\x1?" prefix, or without "@@", are returned unchanged.
+static std::string cleanName(StringRef name)
+{
+  const size_t suffixPos = name.startswith("\x1?") ? name.find("@@") : StringRef::npos;
+  if (suffixPos == StringRef::npos)
+    return name;
+
+  // Skip the two prefix characters and drop everything from "@@" onwards.
+  return name.substr(2, suffixPos - 2);
+}
+
+
+// Looks up (or declares) in `module` a function with the same name and type as
+// `F`. Yields null when the module's entry is not a Function.
+static inline Function* getOrInsertFunction(Module* module, Function* F)
+{
+  Constant* inserted = module->getOrInsertFunction(F->getName(), F->getFunctionType());
+  return dyn_cast<Function>(inserted);
+}
+
+
+// Map lookup that never inserts: returns the value stored under `key`, or
+// `defaultVal` (a null V unless specified) when the key is absent.
+template<typename K, typename V>
+V get(std::map<K, V>& theMap, const K& key, V defaultVal = static_cast<V>(nullptr))
+{
+  const auto found = theMap.find(key);
+  return found != theMap.end() ? found->second : defaultVal;
+}
+
+
+// Stores the module to transform and the compilation settings; no work is
+// done here.
+//   shaderNames       - names of the entry shaders
+//   maxAttributeSize  - forwarded to the Fallback_TraceRay state function transform
+//   stackSizeInBytes  - stack size; 0 makes createStack() use the runtime default
+//   findCalledShaders - when true, initShaderMap() also walks the call graph
+DxrFallbackCompiler::DxrFallbackCompiler(llvm::Module* module, const std::vector<std::string>& shaderNames, unsigned maxAttributeSize, unsigned stackSizeInBytes, bool findCalledShaders /*= false*/)
+  : m_module(module)
+  , m_entryShaderNames(shaderNames)
+  , m_stackSizeInBytes(stackSizeInBytes)
+  , m_maxAttributeSize(maxAttributeSize)
+  , m_findCalledShaders(findCalledShaders)
+{}
+
+// Compile phase: links the runtime, lowers the ray tracing control-flow
+// intrinsics and creates the state functions.
+//   shaderEntryStateIds / shaderStackSizes - outputs filled by createStateFunctions()
+//   pCachedMap - optional; receives stateId -> state function name
+void DxrFallbackCompiler::compile(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap)
+{
+  // initShaderMap() may append to the list, so work on a copy of the entry names.
+  std::vector<std::string> allShaderNames(m_entryShaderNames);
+  initShaderMap(allShaderNames);
+
+  // Bring in runtime so we can get the runtime data type
+  linkRuntime();
+  Type* const runtimeDataTy = getRuntimeDataArgType();
+
+  // Make sure all calls to intrinsics and shaders are at function scope and
+  // fix up control flow.
+  lowerAnyHitControlFlowFuncs();
+  lowerReportHit();
+  lowerTraceRay(runtimeDataTy);
+
+  // Create state functions
+  const int firstStateId = 1000;  // could be anything but this makes stateIds more recognizable
+  IntToFuncMap stateFunctions;    // stateID -> state function
+  createStateFunctions(stateFunctions, shaderEntryStateIds, shaderStackSizes, firstStateId, allShaderNames, runtimeDataTy);
+
+  if (!pCachedMap)
+    return;
+
+  // Record the function names for the caller.
+  for (auto &idAndFunc : stateFunctions)
+    (*pCachedMap)[idAndFunc.first] = idAndFunc.second->getName().str();
+}
+
+// Link phase: rebuilds the stateId -> state function table and patches the
+// scheduler function (launch params, state dispatch, stack), then lowers the
+// remaining intrinsics.
+//   shaderEntryStateIds - entry state id of each entry shader; only read when
+//                         pCachedMap is null
+//   shaderStackSizes    - not used by this function
+//   pCachedMap          - optional stateId -> function name table, e.g. the one
+//                         filled in by compile()
+void DxrFallbackCompiler::link(std::vector<int>& shaderEntryStateIds, std::vector<unsigned int> &shaderStackSizes, IntToFuncNameMap *pCachedMap)
+{
+    IntToFuncMap stateFunctionMap; // stateID -> state function
+    if (pCachedMap)
+    {
+        // Iterate by reference: each entry carries a std::string.
+        for (const auto& entry : *pCachedMap)
+        {
+            stateFunctionMap[entry.first] = m_module->getFunction(entry.second);
+        }
+    }
+    else
+    {
+        // No cache: probe for "<shader>.ss_<n>" for n = 0, 1, ... until a name
+        // is missing from the module.
+        for (size_t i = 0; i < shaderEntryStateIds.size(); i++)
+        {
+            UINT substateIndex = 0;
+            UINT baseStateId = shaderEntryStateIds[i];
+            while (true)
+            {
+                auto substateName = m_entryShaderNames[i] + ".ss_" + std::to_string(substateIndex);
+
+                Function* function = m_module->getFunction(substateName);
+                if (!function) break;
+                stateFunctionMap[baseStateId + substateIndex] = function;
+                substateIndex++;
+            }
+        }
+    }
+    
+    // Fix up scheduler
+    Function* schedulerFunc = m_module->getFunction("fb_Fallback_Scheduler");
+    assert(schedulerFunc && "fb_Fallback_Scheduler not found");
+    createLaunchParams(schedulerFunc);
+
+    Type* runtimeDataArgTy = getRuntimeDataArgType();
+    createStateDispatch(schedulerFunc, stateFunctionMap, runtimeDataArgTy);
+    createStack(schedulerFunc);
+
+    lowerIntrinsics();
+}
+
+
+// Sets the debug output level read by createStateFunctions(): a level >= 2
+// makes the state function transform verbose, a level >= 3 also sets its dump
+// filename.
+void DxrFallbackCompiler::setDebugOutputLevel(int val)
+{
+  m_debugOutputLevel = val;
+}
+
+// True when F carries the "exp-shader" attribute, or when the DXIL module has
+// function properties for F that describe a ray shader.
+static bool isShader(Function* F)
+{
+  if (!F->hasFnAttribute("exp-shader"))
+  {
+    DxilModule& dxilModule = F->getParent()->GetDxilModule();
+    if (!dxilModule.HasDxilFunctionProps(F))
+      return false;
+    return dxilModule.GetDxilFunctionProps(F).IsRay();
+  }
+  return true;
+}
+
+// Returns the ray shader kind of F: RayGeneration for "exp-shader" functions,
+// the kind recorded in the DXIL function properties for ray shaders, and
+// Invalid for everything else.
+DXIL::ShaderKind getRayShaderKind(Function* F)
+{
+  if (F->hasFnAttribute("exp-shader"))
+    return DXIL::ShaderKind::RayGeneration;
+
+  DxilModule& dxilModule = F->getParent()->GetDxilModule();
+  const bool isRayShader = dxilModule.HasDxilFunctionProps(F) && dxilModule.GetDxilFunctionProps(F).IsRay();
+  if (!isRayShader)
+    return DXIL::ShaderKind::Invalid;
+
+  return dxilModule.GetDxilFunctionProps(F).shaderKind;
+}
+
+
+// Decides whether a call to an intrinsic inside F must read the "pending"
+// value instead of the committed one. Anyhit shaders always use pending
+// values; intersection shaders do too, except for rayTCurrent, where the
+// committed value is used. Functions without DXIL function properties never
+// use pending values.
+static bool shouldUsePendingValue(Function* F, StringRef intrinsicName)
+{
+  DxilModule& dxilModule = F->getParent()->GetDxilModule();
+  if (!dxilModule.HasDxilFunctionProps(F))
+    return false;
+
+  const hlsl::DxilFunctionProps& shaderProps = dxilModule.GetDxilFunctionProps(F);
+  if (shaderProps.IsAnyHit())
+    return true;
+  if (!shaderProps.IsIntersection())
+    return false;
+  return intrinsicName != "rayTCurrent";
+}
+
+// Fills m_shaderMap (clean shader name -> Function*) for the requested
+// shaders and removes the NoInline attribute from every function in the
+// module. A requested name with no shader definition maps to null.
+//
+// When m_findCalledShaders is set, the call graph is walked breadth-first
+// from the requested shaders; every shader definition reached that way is
+// appended to `shaderNames` and added to m_shaderMap.
+void DxrFallbackCompiler::initShaderMap(std::vector<std::string>& shaderNames)
+{
+  // Clean names and initialize shaderMap
+  StringToFuncMap allShadersMap;
+  for (Function& F : m_module->functions())
+  {
+    if (isShader(&F))
+    {
+      if (!F.isDeclaration())
+        allShadersMap[cleanName(F.getName())] = &F;
+    }
+
+    F.removeFnAttr(Attribute::NoInline);
+  }
+
+
+  for (auto& name : shaderNames)
+    m_shaderMap[name] = allShadersMap[name];
+
+
+  if (!m_findCalledShaders)
+    return;
+
+
+  // Create a map from shader name to CallGraphNode. It must cover all shaders
+  // in the module, not only the requested ones, because the traversal below
+  // looks up nodes for shaders that are not in m_shaderMap yet.
+  CallGraph callGraph(*m_module);
+  std::map<std::string, CallGraphNode*> allShaderNodes;
+  for (auto& kv : allShadersMap)
+  {
+    const std::string& name = kv.first;
+    Function* func = kv.second;
+    allShaderNodes[name] = callGraph[func];
+  }
+
+  // Start traversing the call graph from given shaderNames
+  std::deque<CallGraphNode*> workList;
+  for (auto& name : shaderNames)
+    workList.push_back(allShaderNodes[name]);
+  while (!workList.empty())
+  {
+    CallGraphNode* cur = workList.front();
+    workList.pop_front();
+    for (size_t i = 0; i < cur->size(); ++i)
+    {
+      Function* nextFunc = (*cur)[i]->getFunction();
+      if (!nextFunc)
+        continue;
+      if (isShader(nextFunc))
+      {
+        const std::string nextName = cleanName(nextFunc->getName());
+        if (m_shaderMap.count(nextName) == 0) // not in the shaderMap yet?
+        {
+          // Shaders that are only declared have no node; skip them instead of
+          // queueing a null node.
+          auto nodeIt = allShaderNodes.find(nextName);
+          if (nodeIt == allShaderNodes.end() || nodeIt->second == nullptr)
+            continue;
+
+          workList.push_back(nodeIt->second);
+          shaderNames.push_back(nextName);
+          m_shaderMap[nextName] = nodeIt->second->getFunction();
+        }
+      }
+    }
+  }
+}
+
+// Parses the runtime from the string returned by getRuntimeString() and links
+// it into m_module. A link failure is only caught by the assert.
+void DxrFallbackCompiler::linkRuntime()
+{
+  Linker linker(m_module);
+  std::unique_ptr<Module> runtimeModule = loadModuleFromAsmString(m_module->getContext(), getRuntimeString());
+  bool linkErr = linker.linkInModule(runtimeModule.get());
+  assert(!linkErr && "Error linking runtime");
+  // Silences the unused-variable warning when asserts are compiled out.
+  UNREFERENCED_PARAMETER(linkErr);
+
+}
+
+// Replaces the terminator of the block containing `call` with "ret void" and
+// then inlines F at the call site.
+static void inlineFuncAndAddRet(CallInst* call, Function*F)
+{
+  // Add a return after the function call.
+  // Should be followed immediately by "unreachable". Turn that into a "ret void".
+  Instruction* ret = ReturnInst::Create(call->getContext());
+  ReplaceInstWithInst(call->getParent()->getTerminator(), ret);
+
+  bool success = inlineFunc(call, F);
+  assert(success);
+  // Silences the unused-variable warning when asserts are compiled out.
+  UNREFERENCED_PARAMETER(success);
+}
+
+// Replaces calls to dx.op.ignoreHit and dx.op.acceptHitAndEndSearch made from
+// the known shaders with the inlined Fallback_* implementations, each followed
+// by a return (see inlineFuncAndAddRet()).
+void DxrFallbackCompiler::lowerAnyHitControlFlowFuncs()
+{
+  std::vector<CallInst*> callsToIgnoreHit = getCallsInShadersToFunction("dx.op.ignoreHit");
+  if (!callsToIgnoreHit.empty())
+  {
+    // The implementation is looked up by its decorated name.
+    Function* ignoreHitFunc = m_module->getFunction("\x1?Fallback_IgnoreHit@@YAXXZ");
+    assert(ignoreHitFunc && "IgnoreHit() implementation not found");
+    for (CallInst* call : callsToIgnoreHit)
+      inlineFuncAndAddRet(call, ignoreHitFunc);
+  }
+
+  std::vector<CallInst*> callsToAcceptHitAndEndSearch = getCallsInShadersToFunction("dx.op.acceptHitAndEndSearch");
+  if (!callsToAcceptHitAndEndSearch.empty())
+  {
+    Function* acceptHitAndEndSearchFunc = m_module->getFunction("\x1?Fallback_AcceptHitAndEndSearch@@YAXXZ");
+    assert(acceptHitAndEndSearchFunc && "AcceptHitAndEndSearch() implementation not found");
+    for (CallInst* call : callsToAcceptHitAndEndSearch)
+      inlineFuncAndAddRet(call, acceptHitAndEndSearchFunc);
+  }
+}
+
+// Lowers every dx.op.reportHit* call made from the known shaders:
+//   1. the attributes are passed to a Fallback_SetPendingAttr() call,
+//   2. Fallback_ReportHit(THit, HitKind) is called and inlined,
+//   3. a negative result returns from the shader immediately,
+//   4. uses of the original call are replaced by "result > 0".
+void DxrFallbackCompiler::lowerReportHit()
+{
+  std::vector<CallInst*> callsToReportHit = getCallsInShadersToFunctionWithPrefix("dx.op.reportHit");
+  if (callsToReportHit.empty())
+    return;
+
+  Function* reportHitFunc = m_module->getFunction("\x1?Fallback_ReportHit@@YAHMI@Z");
+  assert(reportHitFunc && "ReportHit() implementation not found");
+
+  LLVMContext& C = m_module->getContext();
+  for (CallInst* call : callsToReportHit)
+  {
+    // Wrap attribute arguments in Fallback_SetPendingAttr() call
+    Instruction* insertBefore = call;
+    hlsl::DxilInst_ReportHit reportHitCall(call);
+
+    Value* attr = reportHitCall.get_Attributes();
+    Function* setPendingAttrFunc = FunctionBuilder(m_module, "\x1?Fallback_SetPendingAttr@@").voidTy().type(attr->getType(), "attr").build();
+    CallInst::Create(setPendingAttrFunc, { attr }, "", insertBefore);
+
+    // Make call to implementation and load result
+    CallInst* callImpl = CallInst::Create(reportHitFunc, { reportHitCall.get_THit(), reportHitCall.get_HitKind() }, "reportHit.result", insertBefore);
+    Value* result = callImpl;
+
+    // Result < 0 ==> ret
+    Value* zero = makeInt32(0, C);
+    Value* ltz = new ICmpInst(insertBefore, CmpInst::ICMP_SLT, result, zero, "endSearch");
+    // Split twice at the original call: prevBlock ends with the comparison,
+    // retBlock becomes the early-return block and nextBlock starts at the
+    // original call.
+    BasicBlock* prevBlock = call->getParent();
+    BasicBlock* retBlock = prevBlock->splitBasicBlock(call, "endSearch");
+    BasicBlock* nextBlock = retBlock->splitBasicBlock(call, "afterReportHit");
+    ReplaceInstWithInst(prevBlock->getTerminator(), BranchInst::Create(retBlock, nextBlock, ltz));
+    ReplaceInstWithInst(retBlock->getTerminator(), ReturnInst::Create(C));
+
+    // Compare result to zero and store into original result
+    Value* gtz = new ICmpInst(insertBefore, CmpInst::ICMP_SGT, result, zero, "accepted");
+    call->replaceAllUsesWith(gtz);
+
+    bool success = inlineFunc(callImpl, reportHitFunc);
+    assert(success);
+    // Silences the unused-variable warning when asserts are compiled out.
+    (void)success;
+
+    call->eraseFromParent();
+  }
+}
+
+// Lowers every dx.op.traceRay* call made from the known shaders (or, when
+// there are none, every TraceRayTest call) into a call to the single
+// Fallback_TraceRay implementation:
+//   - in closesthit and miss shaders the call is bracketed with
+//     traceRaySave_* / traceRayRestore_* calls that use a per-function alloca,
+//   - the payload (last call operand) is replaced by the i32 returned from a
+//     movePayloadToStack call,
+//   - the remaining operands are forwarded, skipping the first two for dx.op
+//     calls.
+void DxrFallbackCompiler::lowerTraceRay(Type* runtimeDataArgTy)
+{
+  std::vector<CallInst*> callsToTraceRay = getCallsInShadersToFunctionWithPrefix("dx.op.traceRay");
+  if (callsToTraceRay.empty())
+  {
+    // TODO: It might be worth dropping this from the tests eventually
+    callsToTraceRay = getCallsInShadersToFunctionWithPrefix("\x1?TraceRayTest@@");
+    if (callsToTraceRay.empty())
+      return;
+  }
+
+  std::vector<Function*> traceRayImpl = getFunctionsWithPrefix(m_module, "\x1?Fallback_TraceRay@@");
+  assert(traceRayImpl.size() == 1 && "Could not find Fallback_TraceRay() implementation");
+
+  // Indices into the save/restore function tables below.
+  enum { CLOSEST_HIT = 0, MISS = 1 };
+  Function* traceRaySave[] = { m_module->getFunction("traceRaySave_ClosestHit"), m_module->getFunction("traceRaySave_Miss") };
+  Function* traceRayRestore[] = { m_module->getFunction("traceRayRestore_ClosestHit"), m_module->getFunction("traceRayRestore_Miss") };
+  assert(traceRaySave[CLOSEST_HIT] && traceRayRestore[CLOSEST_HIT] && traceRaySave[MISS] && traceRayRestore[MISS] &&
+    "Could not find TraceRay spill functions");
+
+  Function* dummyRuntimeDataArgFunc = StateFunctionTransform::createDummyRuntimeDataArgFunc(m_module, runtimeDataArgTy);
+  assert(dummyRuntimeDataArgFunc && "dummyRuntimeDataArg function could not be created.");
+
+  // Process calls
+  LLVMContext& C = m_module->getContext();
+  Type* int32Ty = Type::getInt32Ty(C);
+  std::map<FunctionType*, Function*> movePayloadToStackFuncs;
+  // One spill alloca per calling function, shared by all of its TraceRay calls.
+  std::map<Function*, AllocaInst*> funcToSpillAlloca;
+  for (CallInst* call : callsToTraceRay)
+  {
+    Instruction* insertBefore = call;
+
+    
+    // Spill runtime data values, if necessary (closesthit and miss shaders)
+    Function* caller = call->getParent()->getParent();
+    DXIL::ShaderKind kind = getRayShaderKind(caller);
+    if (kind == DXIL::ShaderKind::ClosestHit || kind == DXIL::ShaderKind::Miss)
+    {
+      int sh = (kind == DXIL::ShaderKind::ClosestHit) ? CLOSEST_HIT : MISS;
+      AllocaInst* spillAlloca = get(funcToSpillAlloca, caller);
+      if (!spillAlloca)
+      {
+        // The alloca type is the pointee type of the save function's second
+        // argument; the alloca goes at the top of the caller's entry block.
+        Argument* spillAllocaArg = (++traceRaySave[sh]->arg_begin());
+        Type* spillAllocaTy = spillAllocaArg->getType()->getPointerElementType();
+        spillAlloca = new AllocaInst(spillAllocaTy, "spill.alloca", caller->getEntryBlock().begin());
+        funcToSpillAlloca[caller] = spillAlloca;
+      }
+      
+      // Create calls. SFT will inline them.
+      Value* runtimeDataArg = CallInst::Create(dummyRuntimeDataArgFunc, "runtimeData", insertBefore);
+      CallInst::Create(traceRaySave[sh], {runtimeDataArg, spillAlloca}, "", insertBefore);
+      CallInst::Create(traceRayRestore[sh], {runtimeDataArg, spillAlloca}, "", getInstructionAfter(call));    
+    }
+
+    
+    // Get the payload offset to pass to trace implementation
+    //hlsl::DxilInst_TraceRay traceRayCall(call);
+    // TODO: Avoiding the intrinsic to support the test's use of TraceRayTest
+    Value* payload = call->getOperand(call->getNumArgOperands() - 1);
+    FunctionType* funcType = FunctionType::get(int32Ty, { payload->getType() }, false);
+    Function* movePayloadToStackFunc = getOrCreateFunction("movePayloadToStack", m_module, funcType, movePayloadToStackFuncs);
+    Value* newPayloadOffset = CallInst::Create(movePayloadToStackFunc, { payload }, "new.payload.offset", insertBefore);
+
+    // Call implementation
+    unsigned i = 0;
+    if (call->getCalledFunction()->getName().startswith("dx.op"))
+      i += 2; // skip intrinsic number and acceleration structure (for now)
+    std::vector<Value*> args;
+    // Forward everything except the payload, which is replaced by its offset.
+    for (; i < call->getNumArgOperands() - 1; ++i)
+      args.push_back(call->getArgOperand(i));
+    args.push_back(newPayloadOffset);
+    CallInst::Create(traceRayImpl[0], args, "", insertBefore);
+
+    call->eraseFromParent();
+  }
+}
+
+// Builds the per-parameter semantic list handed to
+// StateFunctionTransform::setParameterInfo() for shader F:
+//   - anyhit / closesthit: (payload, attribute)
+//   - miss:                (payload)
+//   - any other kind:      one PST_NONE entry per formal parameter of F
+static std::vector<StateFunctionTransform::ParameterSemanticType> getParameterTypes(Function* F, DXIL::ShaderKind shaderKind)
+{
+  std::vector<StateFunctionTransform::ParameterSemanticType> paramTypes;
+  if (shaderKind == DXIL::ShaderKind::AnyHit || shaderKind == DXIL::ShaderKind::ClosestHit)
+  {
+    paramTypes.push_back(StateFunctionTransform::PST_PAYLOAD);
+    paramTypes.push_back(StateFunctionTransform::PST_ATTRIBUTE);
+  }
+  else if (shaderKind == DXIL::ShaderKind::Miss)
+  {
+    paramTypes.push_back(StateFunctionTransform::PST_PAYLOAD);
+  }
+  else
+  {
+    // Size the list by the function's formal parameters. getNumOperands()
+    // counts the operands of the Function value itself, not its arguments.
+    paramTypes.assign(F->getFunctionType()->getNumParams(), StateFunctionTransform::PST_NONE);
+  }
+  return paramTypes;
+}
+
+// Adds the global symbol of every cbuffer, UAV, SRV and sampler known to the
+// DXIL module to `resources`.
+static void collectResources(DxilModule& DM, std::set<Value*>& resources)
+{
+  for (auto& cbuffer : DM.GetCBuffers())
+  {
+    resources.insert(cbuffer->GetGlobalSymbol());
+  }
+  for (auto& uav : DM.GetUAVs())
+  {
+    resources.insert(uav->GetGlobalSymbol());
+  }
+  for (auto& srv : DM.GetSRVs())
+  {
+    resources.insert(srv->GetGlobalSymbol());
+  }
+  for (auto& sampler : DM.GetSamplers())
+  {
+    resources.insert(sampler->GetGlobalSymbol());
+  }
+}
+
+
+// Runs StateFunctionTransform on every shader in `shaderNames` and numbers the
+// resulting state functions consecutively, starting at `baseStateId`.
+//   stateFunctionMap    - receives stateId -> state function
+//   shaderEntryStateIds - cleared, then receives the first state id of each shader
+//   shaderStackSizes    - cleared, then receives the stack size reported for each shader
+void DxrFallbackCompiler::createStateFunctions(
+  IntToFuncMap& stateFunctionMap,
+  std::vector<int>& shaderEntryStateIds,
+  std::vector<unsigned int>& shaderStackSizes,
+  int baseStateId,
+  const std::vector<std::string>& shaderNames,
+  Type* runtimeDataArgTy
+)
+{
+  // Report shaders that were requested but not found.
+  // NOTE(review): a missing shader is only reported here; the loop below still
+  // uses its null Function* - confirm callers never hit this.
+  for (auto& kv : m_shaderMap)
+  {
+    if (kv.second == nullptr)
+      errs() << "Function not found for shader " << kv.first << "\n";
+  }
+
+  DxilModule& DM = m_module->GetOrCreateDxilModule();
+  std::set<Value*> resources;
+  collectResources(DM, resources);
+
+  shaderEntryStateIds.clear();
+  shaderStackSizes.clear();
+  int stateId = baseStateId;
+  for (auto& shader : shaderNames)
+  {
+    std::vector<Function*> stateFunctions;
+    Function* F = m_shaderMap[shader];
+    StateFunctionTransform sft(F, shaderNames, runtimeDataArgTy);
+    if (m_debugOutputLevel >= 2)
+      sft.setVerbose(true);
+    if (m_debugOutputLevel >= 3)
+      sft.setDumpFilename("dump.ll");
+    // Only the Fallback_TraceRay shader gets the maximum attribute size.
+    if (shader == "Fallback_TraceRay")
+      sft.setAttributeSize(m_maxAttributeSize);
+    DXIL::ShaderKind shaderKind = getRayShaderKind(F);
+    if (shaderKind != DXIL::ShaderKind::Invalid)
+      sft.setParameterInfo(getParameterTypes(F, shaderKind), shaderKind == DXIL::ShaderKind::ClosestHit);
+    sft.setResourceGlobals(resources);
+    UINT shaderStackSize = 0;
+    sft.run(stateFunctions, shaderStackSize);
+
+    // The shader's entry state is the first id assigned in the loop below.
+    shaderEntryStateIds.push_back(stateId);
+    shaderStackSizes.push_back(shaderStackSize);
+    for (Function* stateF : stateFunctions)
+    {
+      stateFunctionMap[stateId++] = stateF;
+      // Carry the shader's DXIL entry properties over to each state function.
+      if (DM.HasDxilFunctionProps(F)) {
+        DM.CloneDxilEntryProps(F, stateF);
+      }
+    }
+  }
+
+  StateFunctionTransform::finalizeStateIds(m_module, shaderEntryStateIds);
+}
+
+// Replaces the call to the rewrite_setLaunchParams placeholder with a call to
+// fb_Fallback_SetLaunchParams(runtimeData, DTid.x, DTid.y, dimx, dimy,
+// groupIndex), then erases the placeholder call and function. The thread ids
+// and the group index are read through dx.op intrinsics inserted before the
+// placeholder call; the other values are taken from its operands.
+void DxrFallbackCompiler::createLaunchParams(Function* func)
+{
+  Module* module = func->getParent();
+  Function* rewrite_setLaunchParams = module->getFunction("rewrite_setLaunchParams");
+  // Only the first user of the placeholder is rewritten.
+  CallInst* call = dyn_cast<CallInst>(*rewrite_setLaunchParams->user_begin());
+
+  LLVMContext& context = module->getContext();
+  Instruction* insertBefore = call;
+
+  // dx.op.threadId(opcode, component): component 0 is x, component 1 is y.
+  Function* DTidFunc = FunctionBuilder(module, "dx.op.threadId.i32").i32().i32().i32().build();
+  Value* DTidx = CallInst::Create(DTidFunc, { makeInt32((int)hlsl::OP::OpCode::ThreadId, context), makeInt32(0, context) }, "DTidx", insertBefore);
+  Value* DTidy = CallInst::Create(DTidFunc, { makeInt32((int)hlsl::OP::OpCode::ThreadId, context), makeInt32(1, context) }, "DTidy", insertBefore);
+
+  Value* dimx = call->getArgOperand(1);
+  Value* dimy = call->getArgOperand(2);
+
+  // Use the named opcode rather than its numeric value, as for ThreadId above.
+  Function* groupIndexFunc = FunctionBuilder(module, "dx.op.flattenedThreadIdInGroup.i32").i32().i32().build();
+  Value* groupIndex = CallInst::Create(groupIndexFunc, { makeInt32((int)hlsl::OP::OpCode::FlattenedThreadIdInGroup, context) }, "groupIndex", insertBefore);
+
+  Function* fb_setLaunchParams = module->getFunction("fb_Fallback_SetLaunchParams");
+  Value* runtimeDataArg = call->getArgOperand(0);
+  CallInst::Create(fb_setLaunchParams, { runtimeDataArg, DTidx, DTidy, dimx, dimy, groupIndex }, "", insertBefore);
+
+  call->eraseFromParent();
+  rewrite_setLaunchParams->eraseFromParent();
+}
+
+// Builds the state dispatch function and substitutes it for every use of the
+// "rewrite_dispatch" placeholder in func's module, then erases the placeholder.
+void DxrFallbackCompiler::createStateDispatch(Function* func, const IntToFuncMap& stateFunctionMap, Type* runtimeDataArgTy)
+{
+  Function* const realDispatch = createDispatchFunction(stateFunctionMap, runtimeDataArgTy);
+
+  Function* const placeholder = func->getParent()->getFunction("rewrite_dispatch");
+  placeholder->replaceAllUsesWith(realDispatch);
+  placeholder->eraseFromParent();
+}
+
+// Replaces the rewrite_createStack placeholder call with an alloca named
+// "theStack" and the rewrite_getStackSize placeholder call with the constant
+// stack size in bytes. Both placeholder functions are erased afterwards.
+void DxrFallbackCompiler::createStack(Function* func)
+{
+  LLVMContext& context = func->getContext();
+
+  // We would like to allocate the properly sized stack here, but DXIL doesn't
+  // allow bitcasts between objects of different sizes. So we have to use the
+  // default size from the runtime and replace all the accesses later.
+  Function* rewrite_createStack = m_module->getFunction("rewrite_createStack");
+  // Only the first user of the placeholder is rewritten.
+  CallInst* call = dyn_cast<CallInst>(*rewrite_createStack->user_begin());
+  AllocaInst* stack = new AllocaInst(call->getType()->getPointerElementType(), "theStack", call);
+  stack->setAlignment(sizeof(int));
+  call->replaceAllUsesWith(stack);
+  call->eraseFromParent();
+  rewrite_createStack->eraseFromParent();
+
+  // The default size is the element count of the runtime's stack array times
+  // sizeof(int).
+  if (m_stackSizeInBytes == 0) // Take the default
+    m_stackSizeInBytes = stack->getType()->getPointerElementType()->getArrayNumElements() * sizeof(int);
+  Function* rewrite_getStackSize = m_module->getFunction("rewrite_getStackSize");
+  call = dyn_cast<CallInst>(*rewrite_getStackSize->user_begin());
+  Value* stackSizeVal = makeInt32(m_stackSizeInBytes, context);
+  call->replaceAllUsesWith(stackSizeVal);
+  call->eraseFromParent();
+  rewrite_getStackSize->eraseFromParent();
+}
+
+// Workaround for the awkward <3 x float> code emitted by vanilla clang in the
+// runtime: a 3-element vector argument is passed as three scalars instead.
+// When `arg` is a 3-element vector, appends its x, y and z elements (extracted
+// before `insertBefore`) to `args` and returns true; otherwise leaves `args`
+// untouched and returns false.
+static bool expandFloat3(std::vector<Value*>& args, Value* arg, Instruction* insertBefore)
+{
+  VectorType* vecTy = dyn_cast<VectorType>(arg->getType());
+  const bool isVec3 = vecTy && vecTy->getVectorNumElements() == 3;
+  if (!isVec3)
+    return false;
+
+  static const char* const componentNames[] = { "vec.x", "vec.y", "vec.z" };
+  LLVMContext& ctx = arg->getContext();
+  for (int component = 0; component < 3; ++component)
+    args.push_back(ExtractElementInst::Create(arg, makeInt32(component, ctx), componentNames[component], insertBefore));
+
+  return true;
+}
+
+// When `arg` is a struct named "class.matrix.float.3.4", stores it to a
+// temporary alloca in the entry block, reloads it as <12 x float> through a
+// bitcast pointer, appends the loaded vector to `args` and returns true.
+// Returns false, leaving `args` untouched, for any other type.
+static bool float3x4ToFloat12(std::vector<Value*>& args, Value* arg, Instruction* insertBefore)
+{
+  StructType* STy = dyn_cast<StructType>(arg->getType());
+  if (!STy || STy->getName() != "class.matrix.float.3.4")
+    return false;
+
+  // The temporary goes at the top of the enclosing function's entry block;
+  // the store, bitcast and load go right before `insertBefore`.
+  BasicBlock& entryBlock = insertBefore->getParent()->getParent()->getEntryBlock();
+  AllocaInst* alloca = new AllocaInst(arg->getType(), "tmp", entryBlock.begin());
+  new StoreInst(arg, alloca, insertBefore);
+  VectorType* VTy = VectorType::get(Type::getFloatTy(arg->getContext()), 12);
+  Value* vec12Ptr = new BitCastInst(alloca, VTy->getPointerTo(), "vec12.ptr", insertBefore);
+  Value* vec12 = new LoadInst(vec12Ptr, "vec12.", insertBefore);
+  args.push_back(vec12);
+
+  return true;
+}
+
+// Redirects calls to Fallback_* functions and dx.op.* intrinsics to their
+// "fb_" implementations.
+//   - "fb_Fallback_X" implements the function whose name starts with "\x1?Fallback_X",
+//   - "fb_dxop_X" implements the intrinsic whose name starts with "dx.op.X".
+// Except for Fallback_Scheduler, only calls made from functions carrying the
+// "state_function" attribute are rewritten, and the caller's first argument
+// (the runtime data) is prepended to the argument list. The pending variant of
+// an intrinsic is used where shouldUsePendingValue() says so.
+void DxrFallbackCompiler::lowerIntrinsics()
+{
+  std::vector<Function*> intrinsics = getFunctionsWithPrefix(m_module, "fb_");
+  assert(intrinsics.size() > 0);
+
+
+  // Replace intrinsics in anyhit shaders with their pending versions
+  LLVMContext& C = m_module->getContext();
+  // Maps the name of a non-pending intrinsic to its pending implementation.
+  std::map<std::string, Function*> pendingIntrinsics;
+  std::string pendingPrefixes[] = { "fb_dxop_pending_",  "fb_Fallback_Pending" };
+  for (auto& F : intrinsics)
+  {
+    std::string intrinsicName;
+    if (F->getName().startswith(pendingPrefixes[0]))
+      intrinsicName = F->getName().substr(pendingPrefixes[0].length());
+    else if (F->getName().startswith(pendingPrefixes[1]))
+      intrinsicName = "Fallback_" + F->getName().substr(pendingPrefixes[1].length()).str();
+    else
+      continue;
+
+    pendingIntrinsics[intrinsicName] = F;
+  }
+
+  for (Function* func : intrinsics)
+  {
+    StringRef intrinsicName;
+    std::string name;
+    bool isDxilOp = false;
+    if (func->getName().startswith("fb_Fallback_"))
+    {
+      intrinsicName = func->getName().substr(3); // after the "fb_" prefix
+      name = "\x1?" + intrinsicName.str();
+    }
+    else if (func->getName().startswith("fb_dxop_"))
+    {
+      intrinsicName = func->getName().substr(8);
+      name = "dx.op." + intrinsicName.str();
+      isDxilOp = true;
+    }
+    else
+    {
+      assert(0 && "Bad intrinsic");
+      // With asserts compiled out `name` would stay empty, and an empty prefix
+      // matches every function in the module. Skip the function instead.
+      continue;
+    }
+    std::vector<Function*> calledFunc = getFunctionsWithPrefix(m_module, name);
+    if (calledFunc.empty())
+      continue;
+    // Only the first function matching the prefix is considered.
+    std::vector<CallInst*> calls = getCallsToFunction(calledFunc[0]);
+    if (calls.empty())
+      continue;
+
+
+    bool needsRuntimeDataArg = (intrinsicName != "Fallback_Scheduler");
+    Function* pendingFunc = get(pendingIntrinsics, intrinsicName.str());
+    // Declarations in m_module, created lazily on first use.
+    Function* funcInModule = nullptr;
+    Function* pendingFuncInModule = nullptr;
+    for (CallInst* call : calls)
+    {
+      Function* caller = call->getParent()->getParent();
+      if (needsRuntimeDataArg && !caller->hasFnAttribute("state_function"))
+        continue;
+
+      Function* F = nullptr;
+      if (pendingFunc && shouldUsePendingValue(caller, intrinsicName))
+      {
+        if (!pendingFuncInModule)
+          pendingFuncInModule = getOrInsertFunction(m_module, pendingFunc);
+        F = pendingFuncInModule;
+      }
+      else
+      {
+        if (!funcInModule)
+          funcInModule = getOrInsertFunction(m_module, func);
+        F = funcInModule;
+      }
+
+      // insert runtime data and the rest of the arguments
+      std::vector<Value*> args;
+      if (needsRuntimeDataArg)
+        args.push_back(caller->arg_begin());
+      int argIdx = 0;
+      for (auto& arg : call->arg_operands())
+      {
+        if (argIdx++ == 0 && isDxilOp)
+          continue; // skip the intrinsic number
+        // 3-element vectors become three scalars and 3x4 matrices become one
+        // <12 x float>; everything else is passed through unchanged.
+        if (!expandFloat3(args, arg, call) && !float3x4ToFloat12(args, arg, call))
+          args.push_back(arg);
+      }
+
+      CallInst* newCall = CallInst::Create(F, args, "", call);
+      if (F->getFunctionType()->getReturnType() != Type::getVoidTy(C))
+      {
+        newCall->takeName(call);
+        call->replaceAllUsesWith(newCall);
+      }
+      call->eraseFromParent();
+    }
+  }
+}
+
+// Returns the type of the first argument of the "stackIntPtr" function in
+// m_module, which is used as the runtime data argument type.
+// NOTE(review): the lookup result is dereferenced without a null check, so the
+// runtime must be present in the module before this is called.
+Type* DxrFallbackCompiler::getRuntimeDataArgType()
+{
+  // Get the first argument from a known runtime function (assuming the runtime
+  // has already been linked in).
+  Function* F = m_module->getFunction("stackIntPtr");
+  return F->arg_begin()->getType();
+}
+
+// Builds "i32 dispatch(runtimeData, i32 stateID)": a switch on stateID with
+// one case per entry of `stateFunctionMap`. Each case calls its state function
+// with runtimeData and returns the call's i32 result (named "nextStateId").
+// An unknown stateID returns -3.
+Function* DxrFallbackCompiler::createDispatchFunction(const IntToFuncMap &stateFunctionMap, Type* runtimeDataArgTy)
+{
+  LLVMContext& context = m_module->getContext();
+  // Signature used for every state function: i32 (runtimeData).
+  FunctionType* stateFuncTy = FunctionType::get(Type::getInt32Ty(context), { runtimeDataArgTy }, false);
+
+  Function* dispatchFunc = FunctionBuilder(m_module, "dispatch").i32().type(runtimeDataArgTy, "runtimeData").i32("stateID").build();
+  Value* runtimeDataArg = dispatchFunc->arg_begin();
+  Value* stateIdArg = ++dispatchFunc->arg_begin();
+  BasicBlock* entryBlock = BasicBlock::Create(context, "entry", dispatchFunc);
+  BasicBlock* badBlock = BasicBlock::Create(context, "badStateID", dispatchFunc);
+  IRBuilder<> builder(badBlock);
+  builder.SetInsertPoint(badBlock);
+  builder.CreateRet(makeInt32(-3, context)); // return an error value
+
+  builder.SetInsertPoint(entryBlock);
+  SwitchInst* switchInst = builder.CreateSwitch(stateIdArg, badBlock, stateFunctionMap.size());
+  // Case blocks are inserted before the error block.
+  BasicBlock* endBlock = badBlock;
+  for (auto& kv : stateFunctionMap)
+  {
+    int stateId = kv.first;
+    Function* stateFunc = kv.second;
+
+    // Refer to the state function by name so a declaration is created in
+    // m_module if none exists yet.
+    Value* stateFuncInModule = m_module->getOrInsertFunction(stateFunc->getName(), stateFuncTy);
+    BasicBlock* block = BasicBlock::Create(context, "state_" + Twine(stateId) + "." + stateFunc->getName(), dispatchFunc, endBlock);
+    builder.SetInsertPoint(block);
+    Value* nextStateId = builder.CreateCall(stateFuncInModule, { runtimeDataArg }, "nextStateId");
+    builder.CreateRet(nextStateId);
+
+    switchInst->addCase(makeInt32(stateId, context), block);
+  }
+
+  return dispatchFunc;
+}
+
+// Collects the call instructions that use the function named `funcName` and
+// sit inside a function whose clean name is a key of m_shaderMap. Returns an
+// empty list when the module has no function of that name.
+std::vector<CallInst*> DxrFallbackCompiler::getCallsInShadersToFunction(const std::string& funcName)
+{
+  std::vector<CallInst*> result;
+  if (Function* target = m_module->getFunction(funcName))
+  {
+    for (User* user : target->users())
+    {
+      if (CallInst* callSite = dyn_cast<CallInst>(user))
+      {
+        Function* enclosing = callSite->getParent()->getParent();
+        if (m_shaderMap.count(cleanName(enclosing->getName())) != 0)
+          result.push_back(callSite);
+      }
+    }
+  }
+  return result;
+}
+
+// Same as getCallsInShadersToFunction(), but considers every function whose
+// name starts with `funcNamePrefix`.
+std::vector<CallInst*> DxrFallbackCompiler::getCallsInShadersToFunctionWithPrefix(const std::string& funcNamePrefix)
+{
+  std::vector<CallInst*> result;
+  const std::vector<Function*> targets = getFunctionsWithPrefix(m_module, funcNamePrefix);
+  for (Function* target : targets)
+  {
+    for (User* user : target->users())
+    {
+      if (CallInst* callSite = dyn_cast<CallInst>(user))
+      {
+        Function* enclosing = callSite->getParent()->getParent();
+        const bool inKnownShader = m_shaderMap.find(cleanName(enclosing->getName())) != m_shaderMap.end();
+        if (inKnownShader)
+          result.push_back(callSite);
+      }
+    }
+  }
+  return result;
+}
+
+// Replaces the "theStack" alloca in F's entry block with an i32 array alloca
+// of sizeInBytes / sizeof(int) elements and re-creates every GEP on the old
+// stack against the new one. Does nothing when F has no such alloca.
+void DxrFallbackCompiler::resizeStack(Function* F, unsigned sizeInBytes)
+{
+  // Find the stack
+  AllocaInst* stack = nullptr;
+  for (auto& I : F->getEntryBlock().getInstList())
+  {
+    AllocaInst* alloc = dyn_cast<AllocaInst>(&I);
+    if (alloc && alloc->getName().startswith("theStack"))
+    {
+      stack = alloc;
+      break;
+    }
+  }
+  if (!stack)
+    return;
+
+  // Create a new stack
+  LLVMContext& C = F->getContext();
+  ArrayType* newStackTy = ArrayType::get(Type::getInt32Ty(C), sizeInBytes / sizeof(int));
+  AllocaInst* newStack = new AllocaInst(newStackTy, "", stack);
+  newStack->takeName(stack);
+
+  // Remap all GEPs - replaceAllUsesWith() won't change types
+  for (auto U = stack->user_begin(), UE = stack->user_end(); U != UE; )
+  {
+    // Advance the iterator before the user is erased below.
+    GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(*U++);
+    assert(gep && "theStack has non-gep user.");
+
+    std::vector<Value*> idxList(gep->idx_begin(), gep->idx_end());
+    GetElementPtrInst* newGep = GetElementPtrInst::CreateInBounds(newStack, idxList, "", gep);
+    newGep->takeName(gep);
+    gep->replaceAllUsesWith(newGep);
+    gep->eraseFromParent();
+  }
+
+  stack->eraseFromParent();
+}

+ 148 - 0
lib/DxrFallback/FunctionBuilder.h

@@ -0,0 +1,148 @@
+#pragma once
+
+#include "llvm/IR/Module.h"
+
+#include <string>
+#include <vector>
+
+//==============================================================================
+// Simplifies the creation of functions.
+//
+// To create a function 'void foo( userType, i32, float* )' use the following
+// code:
+//   FunctionBuilder(module, "foo").voidTy().type(userType).i32().floatPtr().build()
+//
+// The first type specified is the return type; every later type adds one
+// parameter. Each type is recorded together with a (possibly empty) argument
+// name so that the name list and the type list always have the same length.
+class FunctionBuilder
+{
+public:
+  // module receives the function on build(); name is the function name.
+  FunctionBuilder(llvm::Module* module, const std::string& name)
+    : m_context(module->getContext())
+    , m_module(module)
+    , m_name(name)
+  {}
+
+  FunctionBuilder& voidTy()
+  {
+    return append(llvm::Type::getVoidTy(m_context), "");
+  }
+  FunctionBuilder& floatTy(const std::string& argName = "")
+  {
+    return append(llvm::Type::getFloatTy(m_context), argName);
+  }
+  FunctionBuilder& floatPtr(const std::string& argName = "")
+  {
+    return append(llvm::Type::getFloatPtrTy(m_context), argName);
+  }
+  FunctionBuilder& doubleTy(const std::string& argName = "")
+  {
+    return append(llvm::Type::getDoubleTy(m_context), argName);
+  }
+  FunctionBuilder& doublePtr(const std::string& argName = "")
+  {
+    return append(llvm::Type::getDoublePtrTy(m_context), argName);
+  }
+  FunctionBuilder& i32(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt32Ty(m_context), argName);
+  }
+  FunctionBuilder& i32Ptr(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt32PtrTy(m_context), argName);
+  }
+  FunctionBuilder& i16(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt16Ty(m_context), argName);
+  }
+  FunctionBuilder& i16Ptr(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt16PtrTy(m_context), argName);
+  }
+  FunctionBuilder& i8(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt8Ty(m_context), argName);
+  }
+  FunctionBuilder& i8Ptr(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt8PtrTy(m_context), argName);
+  }
+  FunctionBuilder& i1(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt1Ty(m_context), argName);
+  }
+  FunctionBuilder& i1Ptr(const std::string& argName = "")
+  {
+    return append(llvm::Type::getInt1PtrTy(m_context), argName);
+  }
+
+  // Adds an arbitrary type with an optional argument name.
+  FunctionBuilder& type(llvm::Type* ty, const std::string& argName = "")
+  {
+    return append(ty, argName);
+  }
+
+  // Adds several types at once. argNames may be empty or shorter than ty;
+  // types without a matching name get an empty name. (Previously supplied
+  // names were silently dropped, leaving the name list shorter than the type
+  // list and making build() read past its end.)
+  FunctionBuilder& types(const std::vector<llvm::Type*>& ty, const std::vector<std::string>& argNames)
+  {
+    for (size_t i = 0; i < ty.size(); ++i)
+      append(ty[i], i < argNames.size() ? argNames[i] : std::string());
+    return *this;
+  }
+
+  // Gets or inserts the function in the module and names its arguments.
+  // Requires that at least the return type has been specified.
+  llvm::Function* build()
+  {
+    using namespace llvm;
+
+    Type*        retTy = m_types[0];
+    AttributeSet attributes;
+    Type**       argsBegin = (&m_types[0]) + 1;
+    Type**       argsEnd = argsBegin + m_types.size() - 1;
+    Constant*    funcC =
+      m_module->getOrInsertFunction(m_name, FunctionType::get(retTy, ArrayRef<Type*>(argsBegin, argsEnd), false), attributes);
+    Function* func = cast<Function>(funcC);
+
+    // Slot 0 of m_argNames belongs to the return type, so arguments start at 1.
+    size_t nameIdx = 1;
+    for (auto& arg : func->args())
+    {
+      if (nameIdx >= m_argNames.size())
+        break;
+      arg.setName(m_argNames[nameIdx++]);
+    }
+
+    return func;
+  }
+
+private:
+  // Records one type together with its argument name.
+  FunctionBuilder& append(llvm::Type* ty, const std::string& argName)
+  {
+    m_argNames.push_back(argName);
+    m_types.push_back(ty);
+    return *this;
+  }
+
+  llvm::LLVMContext&       m_context;
+  llvm::Module*            m_module = nullptr;
+  std::string              m_name;
+  std::vector<std::string> m_argNames;
+  std::vector<llvm::Type*> m_types;
+
+  // forbidden
+  FunctionBuilder();
+  FunctionBuilder(const FunctionBuilder&);
+};

+ 16 - 0
lib/DxrFallback/LLVMBuild.txt

@@ -0,0 +1,16 @@
+; Copyright (C) Microsoft Corporation. All rights reserved.
+; This file is distributed under the University of Illinois Open Source License. See LICENSE.TXT for details.
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DxrFallback
+parent = Libraries
+required_libraries = Core Support

+ 122 - 0
lib/DxrFallback/LLVMUtils.cpp

@@ -0,0 +1,122 @@
+#include "llvm/Analysis/CFGPrinter.h"  // needed for DOTGraphTraits<const Function*>
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/GraphWriter.h"
+
+
+using namespace llvm;
+
+// Returns the call instructions found among the users of callee. When caller
+// is non-null only calls located inside caller are returned. A null callee
+// yields an empty vector.
+std::vector<CallInst*> getCallsToFunction(Function* callee, const Function* caller)
+{
+  std::vector<CallInst*> found;
+  if (!callee)
+    return found;
+
+  for (User* user : callee->users())
+  {
+    CallInst* callSite = dyn_cast<CallInst>(user);
+    if (callSite == nullptr)
+      continue; // uses that are not calls are ignored
+    assert(callSite->getCalledFunction() == callee);
+
+    const bool anyCaller = (caller == nullptr);
+    if (anyCaller || callSite->getParent()->getParent() == caller)
+      found.push_back(callSite);
+  }
+  return found;
+}
+
+// Builds an i32 constant holding val in the given context.
+ConstantInt* makeInt32(int val, LLVMContext& context)
+{
+  auto* i32Ty = Type::getInt32Ty(context);
+  return ConstantInt::get(i32Ty, val);
+}
+
+// Returns whatever follows inst in its basic block's instruction list.
+Instruction* getInstructionAfter(Instruction* inst)
+{
+  BasicBlock::iterator next(inst);
+  ++next;
+  return next;
+}
+
+// Parses the IR file into a module. On failure the diagnostic is printed to
+// errs() and the process exits with status 1.
+std::unique_ptr<Module> loadModuleFromAsmFile(LLVMContext& context, const std::string& filename)
+{
+  SMDiagnostic diag;
+  std::unique_ptr<Module> loaded = parseIRFile(filename, diag, context);
+  if (loaded)
+    return loaded;
+
+  diag.print(filename.c_str(), errs());
+  exit(1);
+}
+
+// Parses IR held in str (buffer identifier "id"). The diagnostic is discarded,
+// so a parse failure is only visible as a null result.
+std::unique_ptr<Module> loadModuleFromAsmString(LLVMContext& context, const std::string& str)
+{
+  SMDiagnostic diag;
+  MemoryBufferRef buffer(str, "id");
+  return parseIR(buffer, diag, context);
+}
+
+// Prints the module as textual IR to filename. If the file cannot be opened
+// or written, an error is printed to errs() and the process exits with
+// status 1.
+void saveModuleToAsmFile(const llvm::Module* module, const std::string& filename)
+{
+  std::error_code EC;
+  raw_fd_ostream out(filename, EC, sys::fs::F_Text);
+  // An open failure is reported through EC, not has_error(); check it so the
+  // module is never printed to a stream that was not opened.
+  if (!EC)
+  {
+    module->print(out, nullptr);
+    out.close();
+  }
+  if (EC || out.has_error())
+  {
+    errs() << "Error saving to " << filename << "\n";
+    exit(1);
+  }
+}
+
+
+// Writes the CFG of F to "cfg.<function name>.<suffix>.dot". If the file
+// cannot be opened or written, an error is printed to errs() and the process
+// exits with status 1.
+void dumpCFG(const Function* F, const std::string& suffix)
+{
+  std::string filename = ("cfg." + F->getName() + "." + suffix + ".dot").str();
+
+  std::error_code EC;
+  raw_fd_ostream out(filename, EC, sys::fs::F_Text);
+  // An open failure is reported through EC, not has_error(); check it so the
+  // graph is never written to a stream that was not opened.
+  if (!EC)
+  {
+    errs() << "Writing '" << filename << "'...\n";
+    WriteGraph(out, F, true, F->getName());
+    out.close();
+  }
+  if (EC || out.has_error())
+  {
+    errs() << "Error saving to " << filename << "\n";
+    exit(1);
+  }
+}
+
+// Returns the function cached for funcType in typeToFuncMap. On a miss a
+// function named name + <current map size> is requested from the module and
+// the result is cached.
+Function* getOrCreateFunction(const std::string& name, Module* module, FunctionType* funcType, std::map<FunctionType*, Function*>& typeToFuncMap)
+{
+  auto cached = typeToFuncMap.find(funcType);
+  if (cached == typeToFuncMap.end())
+  {
+    // The numerical suffix keeps names distinct per function type.
+    const std::string uniqueName = name + std::to_string(typeToFuncMap.size());
+    Function* created = dyn_cast<Function>(module->getOrInsertFunction(uniqueName, funcType));
+    cached = typeToFuncMap.insert(std::make_pair(funcType, created)).first;
+  }
+  return cached->second;
+}
+
+// Runs the given passes, in order, on F through a function pass manager
+// created for F's module.
+void runPasses(llvm::Function* F, const std::vector<llvm::Pass*>& passes)
+{
+  legacy::FunctionPassManager manager(F->getParent());
+  for (size_t i = 0; i < passes.size(); ++i)
+    manager.add(passes[i]);
+
+  manager.doInitialization();
+  manager.run(*F);
+  manager.doFinalization();
+}

+ 34 - 0
lib/DxrFallback/LLVMUtils.h

@@ -0,0 +1,34 @@
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+  class CallInst;
+  class ConstantInt;
+  class Function;
+  class FunctionType;
+  class Instruction;
+  class LLVMContext;
+  class Module;
+  class Pass;
+}
+
+std::vector<llvm::CallInst*> getCallsToFunction(llvm::Function* callee, const llvm::Function* caller = nullptr); // call instructions among callee's users; limited to those inside caller when caller is non-null
+
+llvm::Function* getOrCreateFunction(const std::string& name, llvm::Module* module, llvm::FunctionType* funcType, std::map<llvm::FunctionType*, llvm::Function*>& typeToFuncMap); // one function per funcType, cached in typeToFuncMap; new functions are named name + map size
+
+llvm::ConstantInt* makeInt32(int val, llvm::LLVMContext& context); // i32 constant with value val
+
+llvm::Instruction* getInstructionAfter(llvm::Instruction* inst); // successor of inst in its block's instruction list
+
+std::unique_ptr<llvm::Module> loadModuleFromAsmFile(llvm::LLVMContext& context, const std::string& filename); // exits the process with status 1 if parsing fails
+std::unique_ptr<llvm::Module> loadModuleFromAsmString(llvm::LLVMContext& context, const std::string& str); // returns the parseIR result without reporting diagnostics
+void saveModuleToAsmFile(const llvm::Module* module, const std::string& filename); // exits the process with status 1 on stream error
+
+void dumpCFG(const llvm::Function* F, const std::string& suffix); // writes cfg.<function name>.<suffix>.dot
+
+void runPasses(llvm::Function*, const std::vector<llvm::Pass*>& passes); // runs the passes on the function via a legacy FunctionPassManager

+ 337 - 0
lib/DxrFallback/LiveValues.cpp

@@ -0,0 +1,337 @@
+#include "LiveValues.h"
+
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+// Replaces every instruction in iset that appears as a key of imap by the
+// instruction it is mapped to.
+static void applyMapping(InstructionSetVector& iset, llvm::DenseMap<llvm::Instruction *, llvm::Instruction *>& imap)
+{
+  // imap is expected to hold only a few entries, so iterate over the map and
+  // probe the set rather than the other way around.
+  for (auto& entry : imap)
+  {
+    Instruction* from = entry.first;
+    Instruction* to = entry.second;
+    if (iset.count(from) == 0)
+      continue;
+
+    iset.remove(from);
+    iset.insert(to);
+  }
+}
+
+// Compute liveness of a value at basic blocks. Roughly based on
+// Algorithms 6 & 7 from the paper "Computing Liveness Sets for SSA-
+// Form Programs" by Brandner et al., 2011.
+
+// Prepares the analysis for the given marker instructions: one live set per
+// marker, an instruction -> index lookup, and the set of blocks that contain
+// markers. The function is taken from the first marker, if any.
+LiveValues::LiveValues(ArrayRef<Instruction*> computeLiveAt)
+{
+  m_liveSets.resize(computeLiveAt.size());
+
+  // Build index and set of active blocks
+  unsigned int index = 0;
+  for (Instruction* marker : computeLiveAt)
+  {
+    m_computeLiveAtIndex.insert(std::make_pair(marker, index));
+    m_activeBlocks.insert(marker->getParent());
+    ++index;
+  }
+
+  if (!computeLiveAt.empty())
+    m_function = computeLiveAt.front()->getParent()->getParent();
+}
+
+// Walks the instructions in [begin, end) and records the given value as live
+// at every marker instruction found in that range.
+void LiveValues::markLiveRange(Instruction* value, BasicBlock::iterator begin, BasicBlock::iterator end)
+{
+  BasicBlock* block = begin->getParent();
+  if (!m_activeBlocks.count(block))
+    return;  // No marker lives in this block
+
+  for (BasicBlock::iterator cursor = begin; cursor != end; ++cursor)
+  {
+    Instruction* inst = &*cursor;
+    auto marker = m_computeLiveAtIndex.find(inst);
+    if (marker == m_computeLiveAtIndex.end())
+      continue;
+
+    const unsigned int index = marker->second;
+    m_liveSets[index].insert(value);
+    m_allLiveSet.insert(value);
+    // Also remember, per value, the markers at which it is live.
+    m_liveAtIndices[value].insert(index);
+  }
+}
+
+void LiveValues::upAndMark(Instruction* def, Use& use, BlockSet& scanned) // marks def live on all paths walked backwards from this use up to def's block
+{
+  // Determine the starting point for the backwards search.
+  // (Remember that Use represents an edge between the definition of a value and its use)
+  // In the case in which the user of the use is a phi node we start the search from the terminator
+  // of the preceding block.
+  // This allows us to avoid going through loop back-edges in cases like these:
+  //                 |
+  //                 | (y)
+  //                 v
+  //          -----------------
+  //     (x)  | z = phi(x, y) |
+  //    ----> | ...           |
+  //    |     | x = z + 1     |
+  //    |     -----------------
+  //    |             |
+  //    |             |
+  //    |             |
+  //    |             v
+  //    |     -----------------
+  //    |     |               |
+  //    ------| INDIRECT CALL |
+  //          |               |
+  //          -----------------
+  //                  | (Start the search for the definition of x (backwards) from here!)
+  //                  v
+  //
+  // Notice that here x is live across the call. This case is tricky because the def comes 'after'
+  // the use. The def still dominates the use because phi nodes logically use their input values on the
+  // edges, i.e. on the terminator of the preceding blocks.
+  //
+  // This has the advantage of being able to traverse edges strictly backwards.
+
+  Instruction* startingPoint = dyn_cast<Instruction>(use.getUser()); // NOTE(review): result is used unchecked; run() only passes uses whose user is an Instruction
+  if (PHINode* usePHI = dyn_cast<PHINode>(startingPoint))
+  {
+    BasicBlock* predecessor = usePHI->getIncomingBlock(use);
+    startingPoint = predecessor->getTerminator();
+  }
+
+  BasicBlock* startingPointBB = startingPoint->getParent();
+  BasicBlock* defBB = def->getParent();
+
+  // Start a bottom-up search (iterative, worklist based) from startingPoint to the definition of
+  // the current value. Mark all the code ranges that we encounter on the way as having the current
+  // value 'live'. 'scanned' contains the blocks that we have scanned to the bottom of the block and
+  // that we already know to have the current value 'live'.
+
+  SmallVector<BasicBlock*, 16> worklist;
+  worklist.push_back(startingPointBB);
+
+  BlockSet visited; // blocks other than defBB that this call has already processed
+
+  while (!worklist.empty())
+  {
+    BasicBlock* B = worklist.pop_back_val();
+
+    if (scanned.count(B) != 0)
+      continue;
+
+    // We have reached the block that contains the definition of the value. We are done for this
+    // branch of the search.
+    if (B == defBB)
+    {
+      if (defBB == startingPointBB)
+      {
+        // If the first block that we visit is also the last, mark only the range of instructions
+        // between the def and the starting point.
+        //    -----------------
+        //    |               |
+        //    | x = // def    |  <--
+        //    |               |    !
+        //    |               |    ! This is the range in which x is live.
+        //    |               |    !
+        //    | = x // use    |  <--
+        //    |               |
+        //    -----------------
+
+        markLiveRange(def, ++BasicBlock::iterator(def), BasicBlock::iterator(startingPoint));
+      }
+      else
+      {
+        markLiveRange(def, ++BasicBlock::iterator(def), defBB->end());
+        scanned.insert(B);
+      }
+    }
+    else
+    {
+      if (B == startingPointBB)
+      {
+        // We are in the starting-point block.
+        // This can mean two things:
+        // 1. We are in the first iteration, mark the range between begin and starting point as
+        // live.
+        if (visited.count(B) == 0)
+        {
+          markLiveRange(def, B->begin(), BasicBlock::iterator(startingPoint));
+        }
+        // 2. We came back here because the starting point is in a loop.
+        // In this case mark the whole block as live range and don't come back anymore.
+        else
+        {
+          markLiveRange(def, B->begin(), B->end());
+          scanned.insert(B);
+        }
+
+        // The if statement above handles situations like this:
+        //         BB0
+        //        -----------------
+        //        | x = ...       |
+        //        -----------------
+        //                |
+        //                |
+        //                |
+        //         BB1    v
+        //        -----------------<--                     <--
+        //        |               |  !                       !
+        //  ----->|               |  ! First range marked    !
+        //  |     |               |  !                       !
+        //  |     | ... = x       |<--                       ! Second and final range marked
+        //  |     |               |                          !
+        //  |     | INDIRECT CALL |                          !
+        //  |     |               |                          !
+        //  |     -----------------                        <--
+        //  |              |
+        //  ---------------
+        // x is defined outside a loop and used inside a loop. This means that it is live inside the
+        // whole loop.
+        // So, we first mark the range from the use of x to the top of BB1 and, when we visit BB1
+        // again (because BB1 is a predecessor of BB1) we mark the whole block as live range.
+        // <rant>
+        // This case could have been managed much more easily and efficiently if we had access to
+        // LLVM LoopInfo analysis pass.
+        // We could have done the following: x is used in a loop and defined outside of it => mark
+        // the whole loop body as live range.
+        // </rant>
+      }
+      else
+      {
+        // We are in an intermediate block on the way to the definition: mark all of it as live range.
+        markLiveRange(def, B->begin(), B->end());
+        scanned.insert(B);
+      }
+
+      visited.insert(B);
+
+      for (pred_iterator P = pred_begin(B), PE = pred_end(B); P != PE; ++P)
+      {
+        worklist.push_back(*P);
+      }
+    }
+  }
+}
+
+// Computes liveness at the marker instructions by walking backwards from
+// every use of every instruction in the function. Does nothing when there are
+// no markers.
+void LiveValues::run()
+{
+  if (m_computeLiveAtIndex.empty())
+    return;
+
+  // Visit every value defined by an instruction of the function.
+  inst_iterator it = inst_begin(m_function);
+  const inst_iterator itEnd = inst_end(m_function);
+  for (; it != itEnd; ++it)
+  {
+    Instruction* def = &*it;
+    assert(def->getParent()->getParent() == m_function);
+
+    // 'scanned' is shared by all uses of the same definition.
+    BlockSet scanned;
+    for (Use& use : def->uses())
+    {
+      Instruction* user = cast<Instruction>(use.getUser());
+      assert(user->getParent()->getParent() == m_function);
+      (void)user;
+
+      upAndMark(def, use, scanned);
+    }
+  }
+}
+
+// Applies imap to the set of all live values and to every per-marker live set.
+void LiveValues::remapLiveValues(llvm::DenseMap<llvm::Instruction*, llvm::Instruction*>& imap)
+{
+  applyMapping(m_allLiveSet, imap);
+
+  for (size_t i = 0; i < m_liveSets.size(); ++i)
+    applyMapping(m_liveSets[i], imap);
+}
+
+// Returns the marker indices recorded for value, or nullptr when the value
+// has no entry.
+const LiveValues::Indices* LiveValues::getIndicesWhereLive(const Value* value) const
+{
+  auto found = m_liveAtIndices.find(value);
+  return (found != m_liveAtIndices.end()) ? &found->second : nullptr;
+}
+
+// Marks value live at every index contained in *indices (indices must not be
+// null; it is dereferenced unconditionally).
+void LiveValues::setIndicesWhereLive(Value* value, const Indices* indices)
+{
+  for (auto it = indices->begin(), itEnd = indices->end(); it != itEnd; ++it)
+    setLiveAtIndex(value, *it, true);
+}
+
+// Returns true when the two values share no marker index, which includes the
+// case where either value has no recorded indices at all.
+bool LiveValues::liveInDisjointRegions(const Value* valueA, const Value* valueB) const
+{
+  const Indices* lhs = getIndicesWhereLive(valueA);
+  const Indices* rhs = lhs ? getIndicesWhereLive(valueB) : nullptr;
+  if (!lhs || !rhs)
+    return true;
+
+  for (auto it = lhs->begin(), itEnd = lhs->end(); it != itEnd; ++it)
+  {
+    if (rhs->count(*it))
+      return false;
+  }
+  return true;
+}
+
+// Manually marks value (which must be an Instruction) as live or not live at
+// the given marker index, keeping the per-marker sets, the per-value index
+// sets and the set of all live values in sync.
+void LiveValues::setLiveAtIndex(Value* value, unsigned int index, bool live)
+{
+  // index is used to subscript m_liveSets, so it must be strictly below its
+  // size (the previous '<=' check let index == size through).
+  assert(index < m_liveSets.size());
+  Instruction* inst = cast<Instruction>(value);
+  if (live)
+  {
+    m_liveAtIndices[value].insert(index);
+    m_liveSets[index].insert(inst);
+    m_allLiveSet.insert(inst);
+  }
+  else
+  {
+    m_liveAtIndices[value].remove(index);
+    m_liveSets[index].remove(inst);
+    // Only drop the value from the global set once it is live nowhere.
+    if (m_liveAtIndices[value].empty())
+      m_allLiveSet.remove(inst);
+  }
+}
+
+// Manually marks value (which must be an Instruction) as live or not live at
+// every marker index.
+void LiveValues::setLiveAtAllIndices(llvm::Value* value, bool live)
+{
+  Instruction* inst = cast<Instruction>(value);
+  const unsigned int numIndices = m_computeLiveAtIndex.size();
+
+  if (!live)
+  {
+    for (unsigned int i = 0; i != numIndices; ++i)
+    {
+      m_liveAtIndices[value].remove(i);
+      m_liveSets[i].remove(inst);
+    }
+    // Drop the value from the global set once no index is left for it.
+    if (m_liveAtIndices[value].empty())
+      m_allLiveSet.remove(inst);
+    return;
+  }
+
+  for (unsigned int i = 0; i != numIndices; ++i)
+  {
+    m_liveAtIndices[value].insert(i);
+    m_liveSets[i].insert(inst);
+  }
+  m_allLiveSet.insert(inst);
+}
+
+// Returns true if value is recorded as live at the given marker index.
+bool LiveValues::getLiveAtIndex(const Value* value, unsigned int index) const
+{
+  // Valid marker indices are [0, m_liveSets.size()); the previous '<=' check
+  // accepted one index past the end.
+  assert(index < m_liveSets.size());
+  const auto it = m_liveAtIndices.find(value);
+  if (it == m_liveAtIndices.end())
+    return false;
+  return (it->second.count(index) != 0);
+}

+ 81 - 0
lib/DxrFallback/LiveValues.h

@@ -0,0 +1,81 @@
+#pragma once
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/BasicBlock.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace llvm
+{
+  class AllocaInst;
+  class BasicBlock;
+  class Function;
+  class Instruction;
+  class Use;
+  class Value;
+}
+
+typedef std::set<llvm::BasicBlock*> BasicBlockSet;
+typedef llvm::SetVector<llvm::Instruction*> InstructionSetVector;
+
+// Computes which instruction values are live at a given list of marker instructions.
+class LiveValues
+{
+public:
+  LiveValues(llvm::ArrayRef<llvm::Instruction*> computeLiveAt); // the position of a marker in computeLiveAt is its index below
+
+  // Compute live values at the marker instructions passed to the constructor (computeLiveAt)
+  void run();
+
+  // Returns all values that are live at the index.
+  const InstructionSetVector& getLiveValues(unsigned int index) const { return m_liveSets[index]; }
+
+  // Returns all live values. NOTE(review): the original comment said "excluding allocas", but no filtering is visible in LiveValues.cpp - confirm.
+  const InstructionSetVector& getAllLiveValues() const { return m_allLiveSet; }
+
+  // Update the live sets using the map (each key found in a set is replaced by its mapped value)
+  void remapLiveValues(llvm::DenseMap<llvm::Instruction*, llvm::Instruction*>& imap);
+
+  typedef llvm::SetVector<unsigned int> Indices;
+
+  // Return all indices at which the given value is live, or nullptr if none were recorded.
+  const Indices* getIndicesWhereLive(const llvm::Value* value) const;
+
+
+  // For the two given values, check if they are both live at any of the
+  // marker instructions. This does not perform a true "lifetime overlap"
+  // test, it considers values to be disjoint if they have disjoint sets of
+  // markers.
+  // For example, value A is live at call sites 0, 1, 2, value B is live at
+  // 3, 4, where A is used for the last time between 2 and 3 and B is defined
+  // before that use. A and B will be considered "disjoint" in the sense of
+  // this method, even though the lifetimes of their values overlap.
+  bool liveInDisjointRegions(const llvm::Value* valueA, const llvm::Value* valueB) const;
+
+  // Return true if the given value is live at the given index.
+  bool getLiveAtIndex(const llvm::Value* value, unsigned int index) const;
+
+  // Update the analysis manually. Use only if you know exactly what you are
+  // doing and document the reason thoroughly. The value must be an Instruction.
+  void setLiveAtIndex(llvm::Value* value, unsigned int index, bool live);
+  void setLiveAtAllIndices(llvm::Value* value, bool live);
+  void setIndicesWhereLive(llvm::Value* value, const Indices* indices);
+
+
+private:
+  llvm::Function*                   m_function = nullptr; // function of the first marker; stays null without markers
+  std::vector<InstructionSetVector> m_liveSets;           // live values per marker index
+  InstructionSetVector              m_allLiveSet;         // union of values live at any marker
+  llvm::SmallSet<llvm::BasicBlock*, 8>             m_activeBlocks;       // blocks containing at least one marker
+  llvm::DenseMap<llvm::Instruction*, unsigned int> m_computeLiveAtIndex; // marker instruction -> index
+  llvm::DenseMap<const llvm::Value*, Indices>      m_liveAtIndices;      // value -> marker indices where it is live
+
+  typedef llvm::SmallSet<llvm::BasicBlock*, 8> BlockSet;
+
+  void markLiveRange(llvm::Instruction* value, llvm::BasicBlock::iterator begin, llvm::BasicBlock::iterator end);
+  void upAndMark(llvm::Instruction* v, llvm::Use& use, BlockSet& scanned);
+};

+ 356 - 0
lib/DxrFallback/Reducibility.cpp

@@ -0,0 +1,356 @@
+#include "Reducibility.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include "LLVMUtils.h"
+
+#include <fstream>
+#include <vector>
+#include <map>
+
+#define DBGS errs
+//#define DBGS dbgs
+
+using namespace llvm;
+
+// A node of the graph that makeReducible() reduces: a group of basic blocks
+// together with its predecessor and successor nodes.
+struct Node
+{
+  SetVector<Node*> in;            // predecessor nodes
+  SetVector<Node*> out;           // successor nodes
+  SetVector<BasicBlock*> blocks;  // block 0 dominates all others in this node
+  size_t numInstructions = 0;     // sum of the sizes of the inserted blocks
+
+  Node() {}
+  Node(BasicBlock* B) { insert(B); }
+
+  // Adds a block to the node and accounts for its instructions.
+  void insert(BasicBlock* B)
+  {
+    blocks.insert(B);
+    numInstructions += B->size();
+  }
+};
+
+
+// Writes the node graph to filename in Graphviz dot format. Progress and any
+// inconsistency between the in/out edge sets is reported on DBGS(). The node
+// list is taken by const reference so it is no longer copied on every call.
+static void printDotGraph(const std::vector<Node*>& nodes, const std::string& filename)
+{
+  DBGS() << "Writing " << filename << " ...";
+  std::ofstream out(filename);
+  if (!out)
+  {
+    DBGS() << "FAILED\n";
+    return;
+  }
+
+  // Give nodes a numerical index to make the output cleaner
+  std::map<Node*, int> idxMap;
+  for (size_t i = 0; i < nodes.size(); ++i)
+    idxMap[nodes[i]] = static_cast<int>(i);
+
+
+  // Error check - make sure that all the out/in nodes are in the map
+  for (Node* N : nodes)
+  {
+    for (Node* P : N->in)
+    {
+      if (idxMap.find(P) == idxMap.end())
+        DBGS() << "MISSING INPUT NODE\n";
+      if (P->out.count(N) == 0)
+        DBGS() << "MISSING OUTGOING EDGE FROM PREDECESSOR.\n";
+    }
+    for (Node* S : N->out)
+    {
+      if (idxMap.find(S) == idxMap.end())
+        DBGS() << "MISSING OUTPUT NODE\n";
+      if (S->in.count(N) == 0)
+        DBGS() << "MISSING INCOMING EDGE FROM SUCCESSOR.\n";
+    }
+  }
+
+
+  // Print header
+  out << "digraph g {\n";
+  out << "node [\n";
+  out << "  fontsize = \"12\"\n";
+  out << "  labeljust = \"l\"\n";
+  out << "]\n";
+
+  for (unsigned i = 0; i < nodes.size(); ++i)
+  {
+    Node* N = nodes[i];
+
+    // node: one record listing the names of all blocks it contains
+    out << "  N" << i << " [shape=record,label=\"";
+    for (BasicBlock* B : N->blocks)
+      out << B->getName().str() << "\\n";
+    out << "\"];\n";
+
+    // out edges
+    for (Node* S : N->out)
+      out << "  N" << i << " -> N" << idxMap[S] << ";\n";
+
+    // in edges
+    //for( Node* P : N->in )
+    //  out << "  N" << idxMap[P] << " -> N" << i << " [style=dotted];\n";
+  }
+
+  out << "}\n";
+
+  DBGS() << "\n";
+}
+
+// Writes the node graph to "red.<function name>_<step>.dot". The node list is
+// taken by const reference so it is no longer copied on every call.
+static void printDotGraph(const std::vector<Node*>& nodes, Function* F, int step)
+{
+  printDotGraph(nodes, ("red." + F->getName() + "_" + std::to_string(step) + ".dot").str());
+}
+
+
+static Node* split(Node* N, std::map<BasicBlock*, Node*>& bbToNode, bool firstSplit) // clones node N for one of its predecessors and returns the clone; bbToNode is not used in the body
+{
+  // Remove one predecessor P from N
+  assert(N->in.size() > 1);
+  Node* P = N->in.pop_back_val();
+  P->out.remove(N);
+
+  // Point P to the clone of N, Np
+  Node* Np = new Node(); // returned to the caller at the end of the function
+  P->out.insert(Np);
+  Np->in.insert(P);
+
+  // Copy successors of N to Np
+  for (Node* S : N->out)
+  {
+    Np->out.insert(S);
+    S->in.insert(Np);
+  }
+
+#if 1
+  // Run reg2mem on the whole function so we don't have to deal with phis
+  if (firstSplit)
+  {
+    runPasses(N->blocks[0]->getParent(), {
+      createDemoteRegisterToMemoryPass()
+    });
+  }
+
+
+  // Clone N into Np
+  ValueToValueMapTy VMap;
+  for (BasicBlock* B : N->blocks)
+  {
+    BasicBlock* Bp = CloneBasicBlock(B, VMap, ".c", B->getParent());
+    Np->insert(Bp);
+    VMap[B] = Bp;
+  }
+  for (BasicBlock* B : Np->blocks)
+    for (Instruction& I : *B)
+      RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+  // Remap terminators of P from N to Np
+  for (BasicBlock* B : P->blocks)
+    RemapInstruction(B->getTerminator(), VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+#else
+  // Disabled alternative that keeps phi nodes instead of running reg2mem.
+  // Clone N into Np
+  ValueToValueMapTy VMap;
+  for (BasicBlock* B : N->blocks)
+  {
+    BasicBlock* Bp = CloneBasicBlock(B, VMap, ".c", B->getParent());
+    Np->insert(Bp);
+    VMap[B] = Bp;
+  }
+  for (BasicBlock* B : Np->blocks)
+    for (Instruction& I : *B)
+      RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+
+  // Remove incoming values from phis in Np that don't come from actual predecessors
+  BasicBlock* NpEntry = Np->blocks[0];
+  std::set<BasicBlock*> predSet(pred_begin(NpEntry), pred_end(NpEntry));
+  auto I = NpEntry->begin();
+  while (PHINode* phi = dyn_cast<PHINode>(I++))
+  {
+    if (phi->getNumIncomingValues() == predSet.size())
+      continue;
+    for (unsigned i = 0; i < phi->getNumIncomingValues(); )
+    {
+      BasicBlock* B = phi->getIncomingBlock(i);
+      if (!predSet.count(B))
+      {
+        phi->removeIncomingValue(B);
+        continue;
+      }
+      ++i;
+    }
+  }
+
+
+  // Remove phi references to P in N. (Do this before remapping terminators.)
+  BasicBlock* Nentry = N->blocks[0];
+  for (BasicBlock* PB : predecessors(Nentry))
+  {
+    if (P->blocks.count(PB))
+      Nentry->removePredecessor(PB);
+  }
+
+  // Remap terminators of P from N to Np
+  for (BasicBlock* B : P->blocks)
+    RemapInstruction(B->getTerminator(), VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+
+  // Update phis in successors of Np.
+  // There are several cases for a value Vs reaching S. Vs may be defined in N and
+  // a clone Vsp in Np or only passing through one or the other. Furthermore, Vs may
+  // either appear in a phi in the entry block of S or not.
+  // 1) Vs defined in N (and clone Vsp in Np) and in phi:
+  //    Add incoming value [Vsp, Bp] for cloned value Vsp from predecessor basic
+  //    block Bp in Np wherever [Vs, B] appears
+  // 2) Vs defined in N (and clone Vsp in Np) and not in phi:
+  //    Add phi [Vs,B],[Vsp,Bp] if Vs reaches a use in or through S
+  // 3) Vs passing through N or Np and in phi
+  //    Change [Vs,B] to [Vs,Bp] in phis in S if Vs reached S through P
+  // 4) Vs passing through N or Np and not in a phi
+  //    Do nothing
+  //
+  // TODO: Only 1) is implemented below and it isn't checking for definition in N
+  for (Node* S : Np->out)
+  {
+    BasicBlock* Sentry = S->blocks[0];
+    auto I = Sentry->begin();
+    while (PHINode* phi = dyn_cast<PHINode>(I++))
+    {
+      for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i)
+      {
+        BasicBlock* B = phi->getIncomingBlock(i);
+        if (N->blocks.count(B))
+        {
+          Value* V = phi->getIncomingValue(i);
+          Value* Vp = VMap[V];
+          if (!Vp)
+            Vp = V; // Def not in N
+          BasicBlock* Bp = dyn_cast<BasicBlock>(VMap[B]);
+          phi->addIncoming(Vp, Bp);
+        }
+      }
+    }
+  }
+#endif
+
+  return Np;
+}
+
+// Makes the control flow graph of F reducible by node splitting.
+// The CFG is mirrored in a graph of Node objects which is repeatedly reduced
+// (self edges removed, isolated nodes dropped, single-predecessor nodes
+// folded into their predecessor). Whenever the reduction gets stuck, the
+// smallest node with several predecessors is split. Returns the number of
+// splits.
+int makeReducible(Function* F)
+{
+  // Break critical edges now in case split() needs to run reg2mem (the pass
+  // it actually uses). The CFG needs to remain unchanged while the node graph
+  // built below is being reduced.
+  runPasses(F, {
+    createBreakCriticalEdgesPass()
+  });
+
+  // Owns every Node created here or returned by split(). Nodes were previously
+  // leaked because they were only ever erased from 'nodes', never deleted.
+  std::vector<std::unique_ptr<Node>> owned;
+
+  // initialize nodes
+  std::vector<Node*> nodes;
+  std::map<BasicBlock*, Node*> bbToNode;
+  for (BasicBlock& B : *F)
+  {
+    owned.push_back(std::unique_ptr<Node>(new Node(&B)));
+    nodes.push_back(owned.back().get());
+    bbToNode[&B] = nodes.back();
+  }
+
+  // initialize edges
+  for (Node* N : nodes)
+  {
+    for (BasicBlock* B : successors(N->blocks[0]))
+    {
+      Node* BN = bbToNode[B];
+      N->out.insert(BN);
+      BN->in.insert(N);
+    }
+  }
+
+  int step = 0;
+  bool print = false;
+  if (print) printDotGraph(nodes, F, step++);
+
+  int numSplits = 0;
+  while (!nodes.empty())
+  {
+    bool changed;
+    do
+    {
+      // It might be more efficient to use a worklist based implementation
+      // instead of iterating over the vector.
+      changed = false;
+      for (size_t i = 0; i < nodes.size(); )
+      {
+        Node* N = nodes[i];
+
+        // Remove self references
+        if (N->in.count(N))
+        {
+          N->in.remove(N);
+          N->out.remove(N);
+          changed = true;
+        }
+
+        // Remove singletons
+        if (N->in.size() == 0 && N->out.size() == 0)
+        {
+          nodes.erase(nodes.begin() + i);
+          changed = true;
+          if (print) printDotGraph(nodes, F, step++);
+          continue;
+        }
+
+        // Remove nodes with only one incoming edge
+        if (N->in.size() == 1)
+        {
+          // fold into predecessor
+          Node* P = N->in.back();
+          P->blocks.insert(N->blocks.begin(), N->blocks.end());
+          P->out.remove(N);
+          for (Node* S : N->out)
+          {
+            S->in.remove(N);
+            P->out.insert(S);
+            S->in.insert(P);
+          }
+          P->numInstructions += N->numInstructions;
+          nodes.erase(nodes.begin() + i);
+          changed = true;
+          if (print) printDotGraph(nodes, F, step++);
+          continue;
+        }
+
+        i++;
+      }
+    } while (changed);
+
+    if (!nodes.empty())
+    {
+      // Duplicate the smallest node with more than one incoming edge. Better
+      // methods exist for picking the node to split, e.g. "Making Graphs Reducible
+      // with Controlled Node Splitting" by Janssen and Corporaal.
+      const size_t kNone = ~static_cast<size_t>(0);
+      size_t idxMin = kNone;
+      for (size_t i = 0; i < nodes.size(); ++i)
+      {
+        if (nodes[i]->in.size() <= 1)
+          continue;
+
+        if (idxMin == kNone || nodes[i]->numInstructions < nodes[idxMin]->numInstructions)
+          idxMin = i;
+      }
+      // Guard the subscript below against an unexpected stuck graph.
+      assert(idxMin != kNone && "no node with more than one incoming edge to split");
+
+      Node* clone = split(nodes[idxMin], bbToNode, numSplits == 0);
+      owned.push_back(std::unique_ptr<Node>(clone));
+      nodes.push_back(clone);
+      numSplits++;
+      if (print) printDotGraph(nodes, F, step++);
+    }
+  }
+  return numSplits;
+}

+ 10 - 0
lib/DxrFallback/Reducibility.h

@@ -0,0 +1,10 @@
+#pragma once
+
+namespace llvm
+{
+  class Function;
+}
+
+// Analyzes the reducibility of the control flow graph of F and uses node splitting
+// to make an irreducible CFG reducible. Returns the number of node splits.
+int makeReducible(llvm::Function* F);

+ 1797 - 0
lib/DxrFallback/StateFunctionTransform.cpp

@@ -0,0 +1,1797 @@
+#include "StateFunctionTransform.h"
+
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include "FunctionBuilder.h"
+#include "LiveValues.h"
+#include "LLVMUtils.h"
+#include "Reducibility.h"
+
+#define DBGS dbgs
+//#define DBGS errs
+
+using namespace llvm;
+
+// Exact name of the function whose calls are recorded as call sites.
+static const char* const CALL_INDIRECT_NAME = "\x1?Fallback_CallIndirect@@YAXH@Z";
+// Name prefix of the functions whose calls are lowered as SetPendingAttr().
+static const char* const SET_PENDING_ATTR_PREFIX = "\x1?Fallback_SetPendingAttr@@";
+
+
+// Builds a std::string from a printf-style format string and arguments.
+// The arguments are walked twice: once to measure the formatted length and
+// once to write the characters. Returns an empty string when the measured
+// length is not positive.
+inline std::string stringf(const char* fmt, ...)
+{
+  // Pass 1: measure the output length (terminating NUL not included).
+  va_list measureArgs;
+  va_start(measureArgs, fmt);
+#ifdef WIN32
+  const int length = _vscprintf(fmt, measureArgs);
+#else
+  const int length = vsnprintf(0, 0, fmt, measureArgs);
+#endif
+  va_end(measureArgs);
+
+  std::string formatted;
+  if (length <= 0)
+    return formatted;
+
+  // Pass 2: format into a buffer of exactly the measured size. vsnprintf is
+  // given length + 1 so it has room for the NUL it writes after the text.
+  formatted.resize(length);
+  va_list writeArgs;
+  va_start(writeArgs, fmt);
+  vsnprintf(&formatted[0], length + 1, fmt, writeArgs);
+  va_end(writeArgs);
+  return formatted;
+}
+
+
+// Strips name decoration: for a name that starts with "\x1?" and contains
+// "@@", returns the characters between the two-character prefix and the first
+// "@@". Any other name is returned unchanged.
+static std::string cleanName(StringRef name)
+{
+  if (name.startswith("\x1?"))
+  {
+    const size_t end = name.find("@@");
+    if (end != StringRef::npos)
+      return name.substr(2, end - 2).str();
+  }
+
+  return name.str();
+}
+
+
+// Appends suffix to valueName and returns the result as a std::string.
+// An empty valueName yields an empty string (this avoids names like ".ptr").
+// When valueName ends with '.' and suffix begins with '.', one of the two dots
+// is dropped so the result does not contain "..".
+static std::string addSuffix(StringRef valueName, StringRef suffix)
+{
+  if (valueName.empty())
+    return std::string();
+
+  // endswith/startswith are safe on empty strings, unlike back()/front(),
+  // so an empty suffix is handled without reading out-of-range characters.
+  if (valueName.endswith(".") && suffix.startswith(".")) // avoid double dots
+    return (valueName + suffix.substr(1)).str();
+
+  return (valueName + suffix).str();
+}
+
+
+// Truncates name at the last occurrence of suffix and returns the part before
+// it. Anything that follows that occurrence is dropped as well. If suffix does
+// not occur in name, name is returned unchanged.
+static std::string stripSuffix(StringRef name, StringRef suffix)
+{
+  const size_t cut = name.rfind(suffix);
+  if (cut == StringRef::npos)
+    return name.str();
+
+  return name.substr(0, cut).str();
+}
+
+
+// Truncates name at the first occurrence of suffixStart and returns the part
+// before it. If suffixStart does not occur in name, name is returned unchanged.
+static std::string stripAfter(StringRef name, StringRef suffixStart)
+{
+  const size_t cut = name.find(suffixStart);
+  if (cut == StringRef::npos)
+    return name.str();
+
+  return name.substr(0, cut).str();
+}
+
+
+// Inserts str before the extension of filename, i.e. before the final "." of
+// the last path component. If the last path component has no ".", str is
+// appended to the end instead. Dots that belong to directory names (as in
+// "out.d/dump" or "./dump") are not treated as an extension separator.
+static std::string insertBeforeExtension(const std::string& filename, const std::string& str)
+{
+  std::string result = filename;
+
+  const size_t dotPos = filename.rfind('.');
+  const size_t sepPos = filename.find_last_of("/\\");
+
+  // The dot marks an extension only if it comes after the last separator.
+  const bool hasExtension =
+    dotPos != std::string::npos && (sepPos == std::string::npos || dotPos > sepPos);
+
+  if (hasExtension)
+    result.insert(dotPos, str);
+  else
+    result += str;
+
+  return result;
+}
+
+
+// Builds a dump file path from baseName by inserting
+// "-<functionName>-<id>-<suffix>" before its extension. The "-<functionName>"
+// part is omitted when functionName is empty; id is zero-padded to two digits.
+static std::string createDumpPath(
+  const std::string& baseName,
+  unsigned           id,
+  const std::string& suffix,
+  const std::string& functionName)
+{
+  std::string tag;
+  if (!functionName.empty())
+    tag = "-" + functionName;
+
+  // id is unsigned, so it is formatted with %u rather than %d.
+  tag += stringf("-%02u-", id) + suffix;
+  return insertBeforeExtension(baseName, tag);
+}
+
+
+// Rounds offset up to the alignment required by inst. For an alloca with an
+// explicit (non-zero) alignment that alignment is used; in every other case
+// the DataLayout's preferred alignment for the type of inst is used.
+// NOTE(review): for an alloca, inst->getType() is the pointer type rather than
+// the allocated type - confirm that this is the intended fallback.
+static uint64_t align(uint64_t offset, Instruction* inst, DataLayout& DL)
+{
+  AllocaInst* alloc = dyn_cast<AllocaInst>(inst);
+  unsigned requiredAlign = alloc ? alloc->getAlignment() : 0;
+  if (requiredAlign == 0)
+    requiredAlign = DL.getPrefTypeAlignment(inst->getType());
+
+  return RoundUpToAlignment(offset, requiredAlign);
+}
+
+
+// Returns ptr viewed as a pointer to targetPtrElemType in the address space
+// ptr already has. If ptr has that type already it is returned as is;
+// otherwise a bitcast carrying the same name is inserted before insertBefore.
+template <class T>  // T can be Value* or Instruction*
+T createCastForStack(T ptr, llvm::Type* targetPtrElemType, llvm::Instruction* insertBefore)
+{
+  const unsigned addrSpace = ptr->getType()->getPointerAddressSpace();
+  llvm::PointerType* wantedTy = llvm::PointerType::get(targetPtrElemType, addrSpace);
+
+  const bool needsCast = ptr->getType() != wantedTy;
+  if (needsCast)
+    return new llvm::BitCastInst(ptr, wantedTy, ptr->getName(), insertBefore);
+
+  return ptr;
+}
+
+
+// Converts val to an i32 value. An i32 is returned untouched, an i1 is
+// zero-extended and every other type is bitcast. New instructions are inserted
+// before insertBefore and named after val with an ".int" suffix.
+static Value* createCastToInt(Value* val, Instruction* insertBefore)
+{
+  LLVMContext& ctx = val->getContext();
+  Type* int32Ty = Type::getInt32Ty(ctx);
+  Type* valTy = val->getType();
+
+  if (valTy == int32Ty)
+    return val;
+
+  const std::string intName = addSuffix(val->getName(), ".int");
+  if (valTy == Type::getInt1Ty(ctx))
+    return new ZExtInst(val, int32Ty, intName, insertBefore);
+
+  return new BitCastInst(val, int32Ty, intName, insertBefore);
+}
+
+
+// Converts the i32 value intVal back to type ty. For i32 the value is returned
+// untouched. Otherwise intVal is renamed with an ".int" suffix and the new
+// instruction (inserted before insertBefore) takes over the original name.
+static Value* createCastFromInt(Value* intVal, Type* ty, Instruction* insertBefore)
+{
+  LLVMContext& ctx = intVal->getContext();
+  if (ty == Type::getInt32Ty(ctx))
+    return intVal;
+
+  const std::string resultName = intVal->getName();
+  intVal->setName(addSuffix(resultName, ".int"));
+
+  if (ty != Type::getInt1Ty(ctx))
+    return new BitCastInst(intVal, ty, resultName, insertBefore);
+
+  // Booleans are rebuilt with a signed "intVal > 0" comparison.
+  return new ICmpInst(insertBefore, CmpInst::ICMP_SGT, intVal, makeInt32(0, ctx), resultName);
+}
+
+
+// Debugging aid: gives a name to every instruction in func that has a
+// non-void result and no name yet.
+static void dbgNameUnnamedVals(Function* func)
+{
+  for (Instruction& inst : inst_range(func))
+  {
+    if (inst.hasName() || inst.getType()->isVoidTy())
+      continue;
+
+    // LLVM will uniquify the name by adding a numeric suffix
+    inst.setName("v");
+  }
+}
+
+
+// Returns an iterator to the first instruction of the entry block that is not
+// an alloca, scanning from the top of the block. This is the instruction after
+// the last alloca provided the allocas are grouped at the top of the block.
+static BasicBlock::iterator afterEntryBlockAllocas(Function* function)
+{
+  BasicBlock& entry = function->getEntryBlock();
+  BasicBlock::iterator cursor = entry.begin();
+  for (;;)
+  {
+    if (!isa<AllocaInst>(cursor))
+      return cursor;
+    ++cursor;
+  }
+}
+
+
+// Collects every block that can be reached from entryBlock by following
+// terminator successors. entryBlock itself is the first element of the result
+// and each block appears exactly once.
+static BasicBlockVector getReachableBlocks(BasicBlock* entryBlock)
+{
+  BasicBlockVector        reachable;
+  ::BasicBlockSet         seen = { entryBlock };
+  std::deque<BasicBlock*> pending = { entryBlock };
+
+  while (!pending.empty())
+  {
+    // LIFO order: always continue with the most recently discovered block.
+    BasicBlock* current = pending.back();
+    pending.pop_back();
+    reachable.push_back(current);
+
+    TerminatorInst* terminator = current->getTerminator();
+    const unsigned int numSuccessors = terminator->getNumSuccessors();
+    for (unsigned int s = 0; s != numSuccessors; ++s)
+    {
+      BasicBlock* next = terminator->getSuccessor(s);
+      const bool firstVisit = seen.insert(next).second;
+      if (firstVisit)
+        pending.push_back(next);
+    }
+  }
+
+  return reachable;
+}
+
+
+// Creates a new, body-less function with the same type, linkage, name,
+// argument names and attributes as oldFunction. The new function is not
+// inserted into a module. VMap receives an old-argument -> new-argument entry
+// for every argument.
+static Function* cloneFunctionPrototype(const Function* oldFunction, ValueToValueMapTy& VMap)
+{
+  // Rebuild the function type from the old argument types.
+  FunctionType* oldTy = oldFunction->getFunctionType();
+  std::vector<Type*> paramTypes;
+  for (auto arg = oldFunction->arg_begin(), argEnd = oldFunction->arg_end(); arg != argEnd; ++arg)
+    paramTypes.push_back(arg->getType());
+
+  FunctionType* newTy = FunctionType::get(oldTy->getReturnType(), paramTypes, oldTy->isVarArg());
+  Function* newFunction = Function::Create(newTy, oldFunction->getLinkage(), oldFunction->getName());
+
+  // Walk old and new arguments in lock step: copy the name, record the mapping
+  // and carry over the parameter attributes (attribute index is arg number + 1).
+  AttributeSet oldAttrs = oldFunction->getAttributes();
+  Function::arg_iterator newArg = newFunction->arg_begin();
+  for (auto oldArg = oldFunction->arg_begin(), oldEnd = oldFunction->arg_end(); oldArg != oldEnd; ++oldArg, ++newArg)
+  {
+    newArg->setName(oldArg->getName());
+    VMap[oldArg] = newArg;
+
+    AttributeSet paramAttrs = oldAttrs.getParamAttributes(oldArg->getArgNo() + 1);
+    if (paramAttrs.getNumSlots() > 0)
+      newArg->addAttr(paramAttrs);
+  }
+
+  // Add the return and function attributes on top of what is already there.
+  LLVMContext& ctx = newFunction->getContext();
+  AttributeSet merged = newFunction->getAttributes();
+  merged = merged.addAttributes(ctx, AttributeSet::ReturnIndex, oldAttrs.getRetAttributes());
+  merged = merged.addAttributes(ctx, AttributeSet::FunctionIndex, oldAttrs.getFnAttributes());
+  newFunction->setAttributes(merged);
+  return newFunction;
+}
+
+
+// Creates a new function (same prototype as the function containing
+// entryBlock) whose body consists of clones of the blocks reachable from
+// entryBlock. VMap is filled with the old -> new mapping of arguments, blocks
+// and instructions. Phi operands coming from blocks that were not cloned are
+// removed.
+static Function* cloneBlocksReachableFrom(BasicBlock* entryBlock, ValueToValueMapTy& VMap)
+{
+  Function* newFunction = cloneFunctionPrototype(entryBlock->getParent(), VMap);
+
+  // getReachableBlocks() lists entryBlock first, so cloning in list order
+  // makes the clone of entryBlock the first block of the new function.
+  BasicBlockVector reachable = getReachableBlocks(entryBlock);
+  for (BasicBlock* original : reachable)
+  {
+    // Clone into a temporary first: CloneBasicBlock itself adds entries to VMap.
+    BasicBlock* copy = CloneBasicBlock(original, VMap, "", newFunction);
+    VMap[original] = copy;
+  }
+
+  // Remap new instructions to reference blocks and instructions of the new function.
+  for (BasicBlock* original : reachable)
+  {
+    BasicBlock* copy = cast<BasicBlock>(VMap[original]);
+    for (Instruction& inst : *copy)
+      RemapInstruction(&inst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+  }
+
+  // Remove phi operands incoming from blocks that are not present in the new function anymore.
+  for (BasicBlock& block : *newFunction)
+  {
+    // Phi instructions only occur at the beginning of a block.
+    PHINode* leadPhi = dyn_cast<PHINode>(block.begin());
+    if (!leadPhi)
+      continue;
+
+    // Nothing to prune if the phi already has one operand per actual predecessor.
+    BasicBlockSet actualPreds(pred_begin(&block), pred_end(&block));
+    if (actualPreds.size() == leadPhi->getNumIncomingValues())
+      continue;
+
+    for (auto it = block.begin(); isa<PHINode>(it); ++it)
+    {
+      PHINode* phi = cast<PHINode>(it);
+      // Walk from the highest index down so that a removal never shifts an
+      // operand that still has to be examined.
+      for (unsigned int op = phi->getNumIncomingValues(); op-- > 0; )
+      {
+        if (actualPreds.count(phi->getIncomingBlock(op)) == 0)
+          phi->removeIncomingValue(op, false);
+      }
+    }
+  }
+
+  return newFunction;
+}
+
+
+// oldVal must be a call instruction. Every call to the same callee found in
+// caller (via getCallsToFunction) has its uses replaced by newVal and is
+// erased. The callee itself is erased once it has no uses left.
+static void replaceValAndRemoveUnusedDummyFunc(Value* oldVal, Value* newVal, Function* caller)
+{
+  CallInst* dummyCall = dyn_cast<CallInst>(oldVal);
+  assert(dummyCall != nullptr && "Must be a call");
+
+  Function* dummyFunc = dummyCall->getCalledFunction();
+  for (CallInst* site : getCallsToFunction(dummyFunc, caller))
+  {
+    site->replaceAllUsesWith(newVal);
+    site->eraseFromParent();
+  }
+
+  if (dummyFunc->use_empty())
+    dummyFunc->eraseFromParent();
+}
+
+
+// Writes the sign-extended integer value of val to constant and returns true.
+// Returns false, leaving constant untouched, if val is not a ConstantInt or is
+// wider than 32 bits.
+static bool getConstantValue(int& constant, const Value* val)
+{
+  if (const ConstantInt* constInt = dyn_cast<ConstantInt>(val))
+  {
+    if (constInt->getBitWidth() <= 32)
+    {
+      constant = static_cast<int>(constInt->getSExtValue());
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns the sign-extended value of val as an int. val is expected to be a
+// ConstantInt of at most 32 bits; this is only checked by an assert.
+static int getConstantValue(const Value* val)
+{
+  const ConstantInt* constInt = dyn_cast<ConstantInt>(val);
+  assert(constInt && constInt->getBitWidth() <= 32);
+
+  const int64_t wide = constInt->getSExtValue();
+  return static_cast<int>(wide);
+}
+
+
+// State shared by the recursive store() helper while it flattens a value into
+// consecutive i32 stack slots. All pointers default to null so that a
+// default-constructed StoreInfo never holds indeterminate values.
+struct StoreInfo
+{
+  Function* stackIntPtrFunc = nullptr;  // called with (runtimeDataArg, baseOffset, index) to get a slot pointer
+  Value* runtimeDataArg = nullptr;      // first argument passed to stackIntPtrFunc
+  Value* baseOffset = nullptr;          // second argument passed to stackIntPtrFunc
+  Instruction* insertBefore = nullptr;  // new instructions are inserted before this one
+
+  Value* val = nullptr;                 // value (or pointer to the value) currently being stored
+  std::vector<Value*> idxList;          // GEP indices selecting the current element inside val
+};
+
+// Recursively stores the part of SI.val selected by SI.idxList, whose type is
+// ty, into consecutive i32 stack slots starting at slot index offset.
+// Structs, arrays and vectors are flattened element by element; a pointer is
+// followed (index 0) so that the pointee is stored. Every scalar is converted
+// with createCastToInt and occupies one slot.
+// Returns the next available slot index. SI.val and SI.idxList are the same on
+// return as they were on entry.
+static int store(int offset, StoreInfo& SI, Type* ty)
+{
+  if (StructType* STy = dyn_cast<StructType>(ty))
+  {
+    SI.idxList.push_back(nullptr);
+    int elIdx = 0;
+    for (auto& elTy : STy->elements())
+    {
+      SI.idxList.back() = makeInt32(elIdx++, ty->getContext());
+      offset = store(offset, SI, elTy);
+    }
+    SI.idxList.pop_back();
+  }
+  else if (ArrayType* ATy = dyn_cast<ArrayType>(ty))
+  {
+    Type* elTy = ATy->getArrayElementType();
+    SI.idxList.push_back(nullptr);
+    for (int elIdx = 0; elIdx < (int)ATy->getArrayNumElements(); ++elIdx)
+    {
+      SI.idxList.back() = makeInt32(elIdx, ty->getContext());
+      offset = store(offset, SI, elTy);
+    }
+    SI.idxList.pop_back();
+  }
+  else if (PointerType* PTy = dyn_cast<PointerType>(ty))
+  {
+    SI.idxList.push_back(makeInt32(0, ty->getContext()));
+    offset = store(offset, SI, PTy->getPointerElementType());
+    SI.idxList.pop_back();
+  }
+  else
+  {
+    // Leaf (scalar or vector). With a non-empty index list SI.val is a
+    // pointer and the element has to be loaded through a GEP first.
+    Value* val = SI.val;
+    if (!SI.idxList.empty())
+    {
+      Value* gep = GetElementPtrInst::CreateInBounds(SI.val, SI.idxList, "", SI.insertBefore);
+      val = new LoadInst(gep, "", SI.insertBefore);
+    }
+    if (VectorType* VTy = dyn_cast<VectorType>(ty))
+    {
+      // Each extracted element is stored as a plain value, so the recursion
+      // has to see an empty index list and the element in SI.val. Both are
+      // saved here and restored afterwards; without restoring SI.val the next
+      // member of an enclosing aggregate would build its GEP on the last
+      // vector element instead of on the aggregate pointer.
+      std::vector<Value*> savedIdxList = std::move(SI.idxList);
+      SI.idxList.clear();
+      Value* savedVal = SI.val;
+
+      Type* elTy = VTy->getVectorElementType();
+      for (int elIdx = 0; elIdx < (int)VTy->getVectorNumElements(); ++elIdx)
+      {
+        Value* idxVal = makeInt32(elIdx, ty->getContext());
+        Value* el = ExtractElementInst::Create(val, idxVal, "", SI.insertBefore);
+        SI.val = el;
+        offset = store(offset, SI, elTy);
+      }
+
+      SI.val = savedVal;
+      SI.idxList = std::move(savedIdxList);
+    }
+    else
+    {
+      Value* idxVal = makeInt32(offset, val->getContext());
+      Value* intVal = createCastToInt(val, SI.insertBefore);
+      Value* intPtr = CallInst::Create(SI.stackIntPtrFunc, { SI.runtimeDataArg, SI.baseOffset, idxVal }, addSuffix(val->getName(), ".ptr"), SI.insertBefore);
+      new StoreInst(intVal, intPtr, SI.insertBefore);
+      offset += 1;
+    }
+  }
+  return offset;
+}
+
+// Stores val to the stack starting at slot baseOffset + offset, flattening
+// aggregates and vectors. For a pointer val, what it points to is stored.
+// Instructions are inserted before insertBefore. Returns the slot offset at
+// which writing left off.
+static int store(Value* val, Function* stackIntPtrFunc, Value* runtimeDataArg, Value* baseOffset, int offset, Instruction* insertBefore)
+{
+  StoreInfo info;
+  info.val = val;
+  info.insertBefore = insertBefore;
+  info.stackIntPtrFunc = stackIntPtrFunc;
+  info.runtimeDataArg = runtimeDataArg;
+  info.baseOffset = baseOffset;
+
+  Type* valTy = val->getType();
+  return store(offset, info, valTy);
+}
+
+
+// Loads a value of type ty from the stack. Slot pointers are obtained by
+// calling m_stackIntPtrFunc(runtimeDataArg, offset, <slot index>); each loaded
+// i32 is converted with createCastFromInt. A scalar reads the slot idx. A
+// vector reads one slot per element starting at idx, which must then be a
+// constant integer. Instructions are inserted before insertBefore and the
+// result is named name.
+static Value* load(llvm::Function* m_stackIntPtrFunc, Value* runtimeDataArg, Value* offset, Value* idx, const std::string& name, Type* ty, Instruction* insertBefore)
+{
+  VectorType* VTy = dyn_cast<VectorType>(ty);
+  if (!VTy)
+  {
+    // Scalar: a single slot.
+    Value* slotPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, offset, idx }, addSuffix(name, ".ptr"), insertBefore);
+    Value* slotVal = new LoadInst(slotPtr, name, insertBefore);
+    return createCastFromInt(slotVal, ty, insertBefore);
+  }
+
+  // Vector: rebuild it element by element from consecutive slots.
+  LLVMContext& C = ty->getContext();
+  const int firstSlot = getConstantValue(idx);
+  const int numElements = (int)VTy->getVectorNumElements();
+  Type* elTy = VTy->getVectorElementType();
+
+  Value* vec = UndefValue::get(VTy);
+  for (int i = 0; i < numElements; ++i)
+  {
+    const std::string elName = stringf("el%d.", i);
+    Value* slotPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, offset, makeInt32(firstSlot + i, C) }, elName + "ptr", insertBefore);
+    Value* slotVal = new LoadInst(slotPtr, elName, insertBefore);
+    Value* element = createCastFromInt(slotVal, elTy, insertBefore);
+    vec = InsertElementInst::Create(vec, element, makeInt32(i, C), "tmpvec", insertBefore);
+  }
+  vec->setName(name);
+  return vec;
+}
+
+// Demotes inst to a stack slot with DemoteRegToStack unless valToAlloca
+// already has an entry for it. If an alloca was created, both maps are updated
+// so the value and its alloca can be looked up from each other.
+static void reg2Mem(DenseMap<Instruction*, AllocaInst*>& valToAlloca, DenseMap<AllocaInst*, Instruction*>& allocaToVal, Instruction* inst)
+{
+  const bool alreadyDemoted = valToAlloca.count(inst) != 0;
+  if (alreadyDemoted)
+    return;
+
+  AllocaInst* slot = DemoteRegToStack(*inst, false);
+  if (!slot)
+    return;
+
+  valToAlloca[inst] = slot;
+  allocaToVal[slot] = inst;
+}
+
+
+// Utility class for rematerializing values at a callsite, i.e. re-creating
+// them from clones of their defining instructions.
+class Rematerializer
+{
+public:
+  // allocaToVal: maps allocas of reg2mem'd live values back to the value.
+  // liveHere:    values live at this callsite.
+  // resources:   values for resources like SRVs, UAVs, etc.
+  // All three are stored by reference and must outlive the Rematerializer.
+  Rematerializer(
+    DenseMap<AllocaInst*, Instruction*>& allocaToVal,
+    const InstructionSetVector& liveHere,
+    const std::set<Value*>& resources
+  )
+    : m_allocaToVal(allocaToVal)
+    , m_liveHere(liveHere)
+    , m_resources(resources)
+  {}
+
+
+  // Returns true if inst can be rematerialized: calls to a known set of
+  // functions (matched by name prefix), loads from a resource (directly or
+  // through a GEP) and GEPs.
+  bool canRematerialize(Instruction* inst)
+  {
+    if (CallInst* call = dyn_cast<CallInst>(inst))
+    {
+      // getCalledFunction() is null for an indirect call; there is no name to
+      // match in that case, so the call is not rematerializable.
+      Function* callee = call->getCalledFunction();
+      if (!callee)
+        return false;
+
+      StringRef funcName = callee->getName();
+      if (funcName.startswith("dummyStackFrameSize"))
+        return true;
+      if (funcName.startswith("stack.ptr"))
+        return true;
+      if (funcName.startswith("stack.load"))
+        return true;
+      if (funcName.startswith("dx.op.createHandle"))
+        return true;
+    }
+    else if (LoadInst* load = dyn_cast<LoadInst>(inst))
+    {
+      Value* op = load->getOperand(0);
+      if (GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(op)) // for descriptor tables
+        op = gep->getOperand(0);
+      if (m_resources.count(op))
+        return true;
+    }
+    else if (GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(inst))
+    {
+      assert(gep->hasAllConstantIndices() && "Unhandled non-constant index"); // Should have been changed to stack.ptr
+      return true;
+    }
+
+    return false;
+  }
+
+
+  // Rematerialize the given instruction and its dependency graph before
+  // insertBefore and return the rematerialized instruction (allocas are
+  // returned as is, they are not cloned).
+  //
+  // Any nonrematerializable value that is live in the function, but not at
+  // this callsite, is appended to workList to ensure that its value is
+  // restored. workList is taken by reference for exactly that reason: with a
+  // by-value parameter those additions would be lost when the call returns.
+  //
+  // depth is the recursion depth and is only used for sanity checks.
+  Instruction* rematerialize(Instruction* inst, std::vector<Instruction *>& workList, Instruction* insertBefore, int depth = 0)
+  {
+    // Signal if we hit a complex case. Deep rematerialization needs more analysis.
+    // To make this robust we would need to make it possible to run the current
+    // value through the live value handling pipeline: figure out where it is live,
+    // reg2mem, save/restore at appropriate callsites, etc.
+    assert(depth < 8);
+
+    // Reuse an already rematerialized value?
+    auto rematIt = m_rematMap.find(inst);
+    if (rematIt != m_rematMap.end())
+      return rematIt->second;
+
+    // Handle allocas
+    if (AllocaInst* alloc = dyn_cast<AllocaInst>(inst))
+    {
+      assert(depth > 0); // Should only be an operand to another rematerialized value
+      auto valIt = m_allocaToVal.find(alloc);
+      if (valIt != m_allocaToVal.end()) // Is it a value that is live at some callsite (and reg2mem'd)?
+      {
+        Instruction* val = valIt->second;
+        if (canRematerialize(val))
+        {
+          // Rematerialize here and store to the alloca. We may have already rematerialized a load
+          // from the alloca. Any future uses will use the rematerialized value directly.
+          Instruction* remat = rematerialize(val, workList, insertBefore, depth + 1);
+          new StoreInst(remat, alloc, insertBefore);
+        }
+        else
+        {
+          // Value has to be restored, but rematerialization may have extended
+          // the liveness of this value to this callsite. Make sure it gets restored.
+          if (!m_liveHere.count(val))
+            workList.push_back(val);
+        }
+      }
+
+      // Allocas are not cloned.
+      return inst;
+    }
+
+    Instruction* clone = inst->clone();
+    clone->setName(addSuffix(inst->getName(), ".remat"));
+    for (unsigned i = 0; i < inst->getNumOperands(); ++i)
+    {
+      Value* op = inst->getOperand(i);
+      if (Instruction* opInst = dyn_cast<Instruction>(op))
+        clone->setOperand(i, rematerialize(opInst, workList, insertBefore, depth + 1));
+      else
+        clone->setOperand(i, op);
+    }
+    clone->insertBefore(insertBefore); // insert after any instructions cloned for operands
+    m_rematMap[inst] = clone;
+    return clone;
+  }
+
+
+  // Returns the instruction that was created by rematerialize() for val, or
+  // nullptr if val has not been rematerialized by this object.
+  Instruction* getRematerializedValueFor(Instruction* val)
+  {
+    auto it = m_rematMap.find(val);
+    if (it == m_rematMap.end())
+      return nullptr;
+
+    return it->second;
+  }
+
+
+private:
+  DenseMap<Instruction*, Instruction*> m_rematMap;    // Map instructions to their rematerialized counterparts
+  DenseMap<AllocaInst*, Instruction*>& m_allocaToVal; // Map allocas for reg2mem'd live values back to the value
+  const InstructionSetVector& m_liveHere;             // Values live at this callsite
+  const std::set<Value*>& m_resources;                // Values for resources like SRVs, UAVs, etc.
+};
+
+
+
+// Stores the function to transform, the candidate function names and the
+// runtime data argument type. Also records the cleaned-up function name and
+// its index within candidateFuncNames; the name must be present in that list
+// (checked by an assert only).
+StateFunctionTransform::StateFunctionTransform(Function* func, const std::vector<std::string>& candidateFuncNames, Type* runtimeDataArgTy)
+  : m_function(func)
+  , m_candidateFuncNames(candidateFuncNames)
+  , m_runtimeDataArgTy(runtimeDataArgTy)
+{
+  m_functionName = cleanName(m_function->getName());
+
+  const auto namesBegin = m_candidateFuncNames.begin();
+  const auto namesEnd = m_candidateFuncNames.end();
+  const auto match = std::find(namesBegin, namesEnd, m_functionName);
+  assert(match != namesEnd);
+  m_functionIdx = match - namesBegin;
+}
+
+// Records the attribute size in bytes. changeCallingConvention() treats a
+// value >= 0 as a request to allocate the stack frame and the trace frame.
+void StateFunctionTransform::setAttributeSize(int size)
+{
+  this->m_attributeSizeInBytes = size;
+}
+
+// Records the semantic type of each parameter (copied) and whether committed
+// attributes are to be used.
+void StateFunctionTransform::setParameterInfo(const std::vector<ParameterSemanticType>& paramTypes, bool useCommittedAttr)
+{
+  this->m_useCommittedAttr = useCommittedAttr;
+  this->m_paramTypes = paramTypes;
+}
+
+// Records the set of resource values. Only the address of resources is kept,
+// so the set must stay alive for as long as the transform uses it.
+void StateFunctionTransform::setResourceGlobals(const std::set<llvm::Value*>& resources)
+{
+  this->m_resources = &resources;
+}
+
+// Uses FunctionBuilder to build the "dummyRuntimeDataArg" function for module,
+// configured with runtimeDataArgTy, and returns it.
+Function* StateFunctionTransform::createDummyRuntimeDataArgFunc(Module* module, Type* runtimeDataArgTy)
+{
+  Function* dummyFunc = FunctionBuilder(module, "dummyRuntimeDataArg").type(runtimeDataArgTy).build();
+  return dummyFunc;
+}
+
+// Records the verbose flag.
+void StateFunctionTransform::setVerbose(bool val)
+{
+  this->m_verbose = val;
+}
+
+// Records the dump file name (copied).
+void StateFunctionTransform::setDumpFilename(const std::string& dumpFilename)
+{
+  this->m_dumpFilename = dumpFilename;
+}
+
+// Runs the transform steps in order, calling printFunction()/printFunctions()
+// with a stage label after each one. stateFunctions is passed to
+// createSubstateFunctions() and shaderStackSize to
+// preserveLiveValuesAcrossCallsites().
+void StateFunctionTransform::run(std::vector<Function*>& stateFunctions, _Out_ unsigned int &shaderStackSize)
+{
+  printFunction("Initial");
+
+  // Step 1: preparation.
+  init();
+  printFunction("AfterInit");
+
+  // Step 2: calling convention.
+  changeCallingConvention();
+  printFunction("AfterCallingConvention");
+
+  // Step 3: live values across call sites.
+  preserveLiveValuesAcrossCallsites(shaderStackSize);
+  printFunction("AfterPreserveLiveValues");
+
+  // Step 4: substate functions.
+  createSubstateFunctions(stateFunctions);
+  printFunctions(stateFunctions, "AfterSubstateFunctions");
+
+  // Step 5: stack function lowering.
+  lowerStackFuncs();
+  printFunctions(stateFunctions, "AfterLowerStackFuncs");
+}
+
+// Replaces every call to "dummyStateId"(functionIdx, substate) in module with
+// the constant candidateFuncEntryStateIds[functionIdx] + substate and then
+// erases the calls and the dummy function. Does nothing if the module has no
+// "dummyStateId" function. Both call arguments are expected to be constant
+// integers and functionIdx a valid index; this is checked by asserts.
+void StateFunctionTransform::finalizeStateIds(llvm::Module* module, const std::vector<int>& candidateFuncEntryStateIds)
+{
+  Function* func = module->getFunction("dummyStateId");
+  if (!func)
+    return;
+
+  LLVMContext& context = module->getContext();
+
+  // Calls are erased only after the walk over the users has finished.
+  std::vector<Instruction*> toRemove;
+  for (User* U : func->users())
+  {
+    CallInst* call = dyn_cast<CallInst>(U);
+    if (!call)
+      continue;
+
+    int  functionIdx = 0;
+    int  substate = 0;
+    const bool haveFunctionIdx = getConstantValue(functionIdx, call->getArgOperand(0));
+    const bool haveSubstate = getConstantValue(substate, call->getArgOperand(1));
+    assert(haveFunctionIdx && haveSubstate && "dummyStateId arguments must be constant integers");
+    assert(functionIdx >= 0 && (size_t)functionIdx < candidateFuncEntryStateIds.size() && "dummyStateId function index out of range");
+    (void)haveFunctionIdx; // only used by the asserts above
+    (void)haveSubstate;
+
+    int stateId = candidateFuncEntryStateIds[functionIdx] + substate;
+
+    call->replaceAllUsesWith(makeInt32(stateId, context));
+    toRemove.push_back(call);
+  }
+
+  for (Instruction* v : toRemove)
+    v->eraseFromParent();
+  func->eraseFromParent();
+}
+
+// Prepares m_function for the transform: cleans its name, runs mem2reg, names
+// unnamed values, collects call sites/intrinsics/returns, creates the helper
+// functions and the calls that yield the runtime data argument and the stack
+// offsets, and finally lowers the collected SetPendingAttr() calls to stack
+// stores.
+void StateFunctionTransform::init()
+{
+  Module* module = m_function->getParent();
+  m_function->setName(cleanName(m_function->getName()));
+
+  // Run preparatory passes
+  runPasses(m_function, {
+    //createBreakCriticalEdgesPass(),
+    //createLoopSimplifyPass(),
+    //createLCSSAPass(),
+    createPromoteMemoryToRegisterPass()
+  });
+
+  // Make debugging a little easier by giving things names
+  dbgNameUnnamedVals(m_function);
+
+  findCallSitesIntrinsicsAndReturns();
+
+  // Create a bunch of functions that we are going to need
+  m_stackIntPtrFunc = FunctionBuilder(module, "stackIntPtr").i32Ptr().type(m_runtimeDataArgTy, "runtimeData").i32("baseOffset").i32("offset").build();
+
+  // All calls created below are placed after the entry block allocas.
+  Instruction* entryInsertPt = afterEntryBlockAllocas(m_function);
+
+  Function* runtimeDataArgFunc = createDummyRuntimeDataArgFunc(module, m_runtimeDataArgTy);
+  m_runtimeDataArg = CallInst::Create(runtimeDataArgFunc, "runtimeData", entryInsertPt);
+
+  Function* stackFrameSizeFunc = FunctionBuilder(module, "dummyStackFrameSize").i32().build();
+  m_stackFrameSizeVal = CallInst::Create(stackFrameSizeFunc, "stackFrame.size", entryInsertPt);
+
+  // The four offset functions share one shape: an i32 result and the runtime
+  // data as their only parameter.
+  auto declareOffsetFunc = [&](const char* funcName) -> Function*
+  {
+    return FunctionBuilder(module, funcName).i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  };
+
+  // TODO only create the values that are actually needed
+  Function* payloadOffsetFunc = declareOffsetFunc("payloadOffset");
+  m_payloadOffset = CallInst::Create(payloadOffsetFunc, { m_runtimeDataArg }, "payload.offset", entryInsertPt);
+
+  Function* committedAttrOffsetFunc = declareOffsetFunc("committedAttrOffset");
+  m_committedAttrOffset = CallInst::Create(committedAttrOffsetFunc, { m_runtimeDataArg }, "committedAttr.offset", entryInsertPt);
+
+  Function* pendingAttrOffsetFunc = declareOffsetFunc("pendingAttrOffset");
+  m_pendingAttrOffset = CallInst::Create(pendingAttrOffsetFunc, { m_runtimeDataArg }, "pendingAttr.offset", entryInsertPt);
+
+  Function* stackFrameOffsetFunc = declareOffsetFunc("stackFrameOffset");
+  m_stackFrameOffset = CallInst::Create(stackFrameOffsetFunc, { m_runtimeDataArg }, "stackFrame.offset", entryInsertPt);
+
+  // lower SetPendingAttr() now
+  for (CallInst* setAttrCall : m_setPendingAttrCalls)
+  {
+    // Get the current pending attribute offset. It can change when a hit is committed
+    Value* currentPendingAttrOffset = CallInst::Create(pendingAttrOffsetFunc, { m_runtimeDataArg }, "cur.pendingAttr.offset", setAttrCall);
+    Value* attr = setAttrCall->getArgOperand(0);
+    createStackStore(currentPendingAttrOffset, attr, 0, setAttrCall);
+    setAttrCall->eraseFromParent();
+  }
+}
+
+// Scans all instructions of m_function and sorts them into the member lists:
+//  - returns                                      -> m_returns
+//  - calls whose callee name starts with
+//    SET_PENDING_ATTR_PREFIX                      -> m_setPendingAttrCalls
+//  - calls whose callee name starts with
+//    "movePayloadToStack"                         -> m_movePayloadToStackCalls
+//  - calls to CALL_INDIRECT_NAME                  -> m_callSites
+//  - calls to a candidate function                -> m_callSites, with the
+//    candidate index appended to m_callSiteFunctionIdx
+// Calls without a statically known callee (indirect calls) cannot match any of
+// these names and are skipped.
+void StateFunctionTransform::findCallSitesIntrinsicsAndReturns()
+{
+  // Create a map for log N lookup
+  std::map<std::string, int> candidateFuncMap;
+  for (int i = 0; i < (int)m_candidateFuncNames.size(); ++i)
+    candidateFuncMap[m_candidateFuncNames[i]] = i;
+
+  for (auto& I : inst_range(m_function))
+  {
+    if (ReturnInst* ret = dyn_cast<ReturnInst>(&I))
+    {
+      m_returns.push_back(ret);
+      continue;
+    }
+
+    CallInst* call = dyn_cast<CallInst>(&I);
+    if (!call)
+      continue;
+
+    // getCalledFunction() returns null for indirect calls; dereferencing it
+    // would crash, and there is no name to classify such a call by.
+    Function* callee = call->getCalledFunction();
+    if (!callee)
+      continue;
+
+    StringRef calledFuncName = callee->getName();
+    if (calledFuncName.startswith(SET_PENDING_ATTR_PREFIX))
+      m_setPendingAttrCalls.push_back(call);
+    else if (calledFuncName.startswith("movePayloadToStack"))
+      m_movePayloadToStackCalls.push_back(call);
+    else if (calledFuncName == CALL_INDIRECT_NAME)
+      m_callSites.push_back(call);
+    else
+    {
+      auto it = candidateFuncMap.find(cleanName(calledFuncName));
+      if (it == candidateFuncMap.end())
+        continue;
+
+      assert(callee->getReturnType() == Type::getVoidTy(call->getContext()) && "Continuations with returns not supported");
+      m_callSites.push_back(call);
+      m_callSiteFunctionIdx.push_back(it->second);
+    }
+  }
+}
+
+// Allocates the stack frame (when there are call sites or an attribute size
+// has been set) and the trace frame (when an attribute size has been set),
+// then creates the argument frames and changes the function signature.
+void StateFunctionTransform::changeCallingConvention()
+{
+  const bool hasAttributeSize = m_attributeSizeInBytes >= 0;
+  const bool hasCallSites = !m_callSites.empty();
+
+  if (hasCallSites || hasAttributeSize)
+    allocateStackFrame();
+
+  if (hasAttributeSize)
+    allocateTraceFrame();
+
+  createArgFrames();
+
+  changeFunctionSignature();
+}
+
+// Returns true if inst is a direct call to a function whose name starts with
+// "stack.ptr". Non-calls and indirect calls (no statically known callee)
+// yield false.
+static bool isCallToStackPtr(Value* inst)
+{
+  CallInst* call = dyn_cast<CallInst>(inst);
+  if (!call)
+    return false;
+
+  // getCalledFunction() is null for indirect calls and must not be dereferenced.
+  Function* callee = call->getCalledFunction();
+  if (!callee)
+    return false;
+
+  return callee->getName().startswith("stack.ptr");
+}
+
+// For every live pointer value that is a GEP based on an alloca, makes the
+// alloca live at the same indices as the GEP. Live allocas, "stack.ptr" calls
+// and GEPs based on a "stack.ptr" call are left alone; any other live pointer
+// trips an assert.
+static void extendAllocaLifetimes(LiveValues& lv)
+{
+  for (Instruction* inst : lv.getAllLiveValues())
+  {
+    // Only pointers that are neither allocas nor stack.ptr calls matter here.
+    const bool isPointer = inst->getType()->isPointerTy();
+    if (!isPointer || isa<AllocaInst>(inst) || isCallToStackPtr(inst))
+      continue;
+
+    GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(inst);
+    assert(gep && "Unhandled live pointer");
+
+    Value* base = gep->getPointerOperand();
+    if (isCallToStackPtr(base))
+      continue;
+
+    AllocaInst* alloc = dyn_cast<AllocaInst>(base);
+    assert(alloc && "GEP of non-alloca pointer");
+
+    // TODO: We need to set indices of the uses of the gep, not the gep itself
+    const LiveValues::Indices* gepIndices = lv.getIndicesWhereLive(gep);
+    const LiveValues::Indices* allocIndices = lv.getIndicesWhereLive(alloc);
+    const bool needsUpdate = !allocIndices || *allocIndices != *gepIndices;
+    if (needsUpdate)
+      lv.setIndicesWhereLive(alloc, gepIndices);
+  }
+}
+
+
+void StateFunctionTransform::preserveLiveValuesAcrossCallsites(_Out_ unsigned int &shaderStackSize)
+{
+  if (m_callSites.empty())
+  {
+    // No stack frame. Nothing to do.
+    rewriteDummyStackSize(0);
+    return;
+  }
+
+  SetVector<Instruction*> stackOffsets;
+  stackOffsets.insert(m_stackFrameOffset);
+  if (m_payloadOffset && !m_payloadOffset->user_empty())
+    stackOffsets.insert(m_payloadOffset);
+  if (m_committedAttrOffset && !m_committedAttrOffset->user_empty())
+    stackOffsets.insert(m_committedAttrOffset);
+  if (m_pendingAttrOffset && !m_pendingAttrOffset->user_empty())
+    stackOffsets.insert(m_pendingAttrOffset);
+
+  // Do liveness analysis
+  ArrayRef<Instruction*> instructions((Instruction**)m_callSites.data(), m_callSites.size());
+  LiveValues lv(instructions);
+  lv.run();
+
+  // Make sure alloca lifetimes match their uses
+  extendAllocaLifetimes(lv);
+
+  // Make sure stack offsets get included
+  for (auto o : stackOffsets)
+    lv.setLiveAtAllIndices(o, true);
+
+  // Add payload allocas, if any
+  for (CallInst* call : m_movePayloadToStackCalls)
+  {
+    if (AllocaInst* payloadAlloca = dyn_cast<AllocaInst>(call->getArgOperand(0)))
+      lv.setLiveAtAllIndices(payloadAlloca, true);
+  }
+
+  printSet(lv.getAllLiveValues(), "live values");
+
+
+
+  //
+  // Carve up the stack frame. 
+  //
+  uint64_t offsetInBytes = 0;
+
+  // ... argument frame
+  offsetInBytes += m_maxCallerArgFrameSizeInBytes;
+
+
+  // ... live allocas. 
+  Module* module = m_function->getParent();
+  DataLayout DL(module);
+  DenseMap<Instruction*, Instruction*> allocaToStack;
+  Instruction* insertBefore = getInstructionAfter(m_stackFrameOffset);
+  for (Instruction* inst : lv.getAllLiveValues())
+  {
+    AllocaInst* alloc = dyn_cast<AllocaInst>(inst);
+    if (!alloc)
+      continue;
+
+    // Allocate a slot in the stack frame for the alloca
+    offsetInBytes = align(offsetInBytes, inst, DL);
+    Instruction* stackAlloca = createStackPtr(m_stackFrameOffset, alloc, offsetInBytes, insertBefore);
+    alloc->replaceAllUsesWith(stackAlloca);
+    allocaToStack[inst] = stackAlloca;
+
+    offsetInBytes += DL.getTypeAllocSize(alloc->getAllocatedType());
+  }
+  lv.remapLiveValues(allocaToStack); // replace old allocas with stackAllocas
+  for (auto& kv : allocaToStack)
+    kv.first->eraseFromParent(); // delete old allocas
+
+  // Set payload offsets now that they are all on the stack
+  for (CallInst* call : m_movePayloadToStackCalls)
+  {
+    CallInst* payloadStackPtr = dyn_cast<CallInst>(call->getArgOperand(0));
+    assert(payloadStackPtr->getCalledFunction()->getName().startswith("stack.ptr"));
+    Value* baseOffset = payloadStackPtr->getArgOperand(0);
+    Value* idx = payloadStackPtr->getArgOperand(1);
+    Value* payloadOffset = BinaryOperator::Create(Instruction::Add, baseOffset, idx, "", call);
+    call->replaceAllUsesWith(payloadOffset);
+    payloadOffset->takeName(call);
+    call->eraseFromParent();
+  }
+  //printFunction("AfterStackAllocas");
+
+
+  // ... saves/restores for each call site
+  // Create allocas for live values. This makes it easier to generate code because
+  // we don't have to maintain the use-def chains of SSA form. We can just
+  // load/store from/to the alloca for a particular value. A subsequent mem2reg
+  // pass will rebuild the SSA form.
+  DenseMap<Instruction*, AllocaInst*> valToAlloca;
+  DenseMap<AllocaInst*, Instruction*> allocaToVal;
+  for (Instruction* inst : lv.getAllLiveValues())
+    reg2Mem(valToAlloca, allocaToVal, inst);
+  //printFunction("AfterReg2Mem");
+
+  uint64_t baseOffsetInBytes = offsetInBytes;
+  uint64_t maxOffsetInBytes = offsetInBytes;
+  for (size_t i = 0; i < m_callSites.size(); ++i)
+  {
+    offsetInBytes = baseOffsetInBytes;
+
+    const InstructionSetVector& liveHere = lv.getLiveValues(i);
+    std::vector<Instruction*> workList(liveHere.begin(), liveHere.end());
+    std::set<Instruction*> visited;
+    Rematerializer R(allocaToVal, liveHere, *m_resources);
+    Instruction* saveInsertBefore = m_callSites[i];
+    Instruction* restoreInsertBefore = getInstructionAfter(m_callSites[i]);
+    Instruction* rematInsertBefore = nullptr; // create only if needed
+
+    // Rematerialize stack offsets after the continuation before other restores
+    for (Instruction* inst : stackOffsets)
+    {
+      visited.insert(inst);
+      Instruction* remat = R.rematerialize(inst, workList, restoreInsertBefore);
+      new StoreInst(remat, valToAlloca[inst], restoreInsertBefore);
+    }
+    Instruction* saveStackFrameOffset = new LoadInst(valToAlloca[m_stackFrameOffset], "stackFrame.offset", saveInsertBefore);
+    Instruction* restoreStackFrameOffset = R.getRematerializedValueFor(m_stackFrameOffset);
+
+    while (!workList.empty())
+    {
+      Instruction* inst = workList.back();
+      workList.pop_back();
+      if (!visited.insert(inst).second)
+        continue;
+
+      if (!R.canRematerialize(inst))
+      {
+        assert(!inst->getType()->isPointerTy() && "Can not save pointers");
+
+        offsetInBytes = align(offsetInBytes, inst, DL);
+        AllocaInst* alloca = valToAlloca[inst];
+
+        Value* saveVal = new LoadInst(alloca, addSuffix(inst->getName(), ".save"), saveInsertBefore);
+        createStackStore(saveStackFrameOffset, saveVal, offsetInBytes, saveInsertBefore);
+
+        Value* restoreVal = createStackLoad(restoreStackFrameOffset, inst, offsetInBytes, restoreInsertBefore);
+        new StoreInst(restoreVal, alloca, restoreInsertBefore);
+
+        offsetInBytes += DL.getTypeAllocSize(inst->getType());
+      }
+      else if (R.getRematerializedValueFor(inst) == nullptr)
+      {
+        if (!rematInsertBefore)
+        {
+          // Create a new block after restores for rematerialized values. This 
+          // ensures that we can use restored values (through their allocas) even
+          // if we haven't generated the actual restore yet.
+          rematInsertBefore = restoreInsertBefore->getParent()->splitBasicBlock(restoreInsertBefore, "remat_begin")->begin();
+          restoreInsertBefore = m_callSites[i]->getParent()->getTerminator();
+        }
+        Instruction* remat = R.rematerialize(inst, workList, rematInsertBefore);
+        new StoreInst(remat, valToAlloca[inst], rematInsertBefore);
+      }
+    }
+
+    // Take the max offset over all call sites
+    maxOffsetInBytes = std::max(maxOffsetInBytes, offsetInBytes);
+  }
+
+
+  // ... traceFrame (if any)
+  maxOffsetInBytes += m_traceFrameSizeInBytes;
+
+
+  // Set the stack size
+  rewriteDummyStackSize(maxOffsetInBytes);
+  shaderStackSize = maxOffsetInBytes;
+}
+
+void StateFunctionTransform::createSubstateFunctions(std::vector<Function*>& stateFunctions)
+{
+  // Splits m_function into one substate function per entry point (the function
+  // entry plus the block following each call site), stores them in
+  // stateFunctions in substate order, and finally deletes m_function.
+
+  // split() gets slower as the number of blocks grows, so simplify the CFG first.
+  runPasses(m_function, {
+    createCFGSimplificationPass()
+  });
+
+  const size_t substateCount = m_callSites.size() + 1;
+  stateFunctions.resize(substateCount);
+  BasicBlockVector entryBlocks = replaceCallSites();
+  for (size_t substate = 0; substate != substateCount; ++substate)
+  {
+    Function* substateFunc = split(m_function, entryBlocks[substate], substate);
+
+    // Tag the function so that an intrinsic which is not called from a state
+    // function (and so has no access to the runtimeData pointer) can be detected.
+    substateFunc->addFnAttr("state_function", "true");
+    stateFunctions[substate] = substateFunc;
+  }
+
+  // The base function has been split completely; remove it.
+  m_function->eraseFromParent();
+  m_function = nullptr;
+}
+
+void StateFunctionTransform::allocateStackFrame()
+{
+  // Emits a stackFramePush call before the stack frame offset instruction and
+  // a stackFramePop call before every return. Both receive the runtime data
+  // pointer and the stack frame size value.
+  Module* M = m_function->getParent();
+
+  // Push: inserted immediately before m_stackFrameOffset.
+  Instruction* pushPoint = m_stackFrameOffset;
+  Function* pushFunc = FunctionBuilder(M, "stackFramePush").voidTy().type(m_runtimeDataArgTy, "runtimeData").i32("size").build();
+  m_stackFramePush = CallInst::Create(pushFunc, { m_runtimeDataArg, m_stackFrameSizeVal }, "", pushPoint);
+
+  // Pop: one call in front of each return instruction.
+  Function* popFunc = FunctionBuilder(M, "stackFramePop").voidTy().type(m_runtimeDataArgTy, "runtimeData").i32("size").build();
+  for (Instruction* ret : m_returns)
+    CallInst::Create(popFunc, { m_runtimeDataArg, m_stackFrameSizeVal }, "", ret);
+}
+
+void StateFunctionTransform::allocateTraceFrame()
+{
+  assert(m_attributeSizeInBytes >= 0 && "Attribute size has not been specified");
+
+  // The trace frame has room for the committed and pending attributes plus
+  // the two old committed/pending attribute offsets.
+  m_traceFrameSizeInBytes = 2 * m_attributeSizeInBytes + 2 * sizeof(int);
+
+  // The push is emitted right after the entry block allocas so that the
+  // runtime can do its setup relative to the entry stack offset. It is given
+  // the attribute size expressed in ints.
+  Module* M = m_function->getParent();
+  Instruction* pushPoint = afterEntryBlockAllocas(m_function);
+  int attrSizeInInts = m_attributeSizeInBytes / sizeof(int);
+  Value* attrSizeVal = makeInt32(attrSizeInInts, M->getContext());
+  Function* pushFunc = FunctionBuilder(M, "traceFramePush").voidTy().type(m_runtimeDataArgTy, "runtimeData").i32("attrSize").build();
+  CallInst::Create(pushFunc, { m_runtimeDataArg, attrSizeVal }, "", pushPoint);
+
+  // The matching pop goes in front of every return.
+  Function* popFunc = FunctionBuilder(M, "traceFramePop").voidTy().type(m_runtimeDataArgTy, "runtimeData").build();
+  for (Instruction* ret : m_returns)
+    CallInst::Create(popFunc, { m_runtimeDataArg }, "", ret);
+}
+
+bool isTemporaryAlloca(Value* op)
+{
+  // Placeholder: op is not inspected and the answer is always true.
+  // TODO: Needs some analysis to figure this out properly. The alloca can be
+  // put on the caller stack if:
+  //  there is only a single callsite OR
+  //  there is no callsite between the stores/loads and this callsite
+  const bool isTemporary = true;
+  return isTemporary;
+}
+
+void StateFunctionTransform::createArgFrames() // Routes argument passing through the stack: this function's parameters are read from it, and arguments of each call site are written to it.
+{
+  Module* module = m_function->getParent();
+  DataLayout DL(module);
+  Instruction* stackAllocaInsertBefore = getInstructionAfter(m_stackFrameOffset); // insertion point for the stack.ptr/stack.load calls created below
+
+  // Retrieve this function's arguments from the stack: each parameter is replaced by a stack.ptr (pointers) or a stack.load (everything else).
+  if (m_function->getFunctionType()->getNumParams() > 0)
+  {
+    if (m_paramTypes.empty())
+      m_paramTypes.assign(m_function->getFunctionType()->getNumParams(), PST_NONE); // assume standard argument types
+
+    static_assert(PST_COUNT == 3, "Expected 3 parameter semantic types");
+    int offsetInBytes[PST_COUNT] = { 0, 0, 0 }; // running byte offset per parameter semantic type
+    Value* baseOffset[PST_COUNT] = { nullptr, nullptr, nullptr }; // base offset value per parameter semantic type; nullptr until set
+
+    Instruction* insertBefore = stackAllocaInsertBefore;
+    for (auto pst : m_paramTypes)
+    {
+      if (baseOffset[pst])
+        continue; // base for this semantic type has already been set
+
+      if (pst == PST_NONE)
+      {
+        baseOffset[pst] = BinaryOperator::Create(Instruction::Add, m_stackFrameOffset, m_stackFrameSizeVal, "callerArgFrame.offset", insertBefore); // m_stackFrameOffset + m_stackFrameSizeVal
+        offsetInBytes[pst] = sizeof(int); // skip the first element in caller arg frame (returnStateID)
+      }
+      else if (pst == PST_PAYLOAD)
+      {
+        baseOffset[pst] = m_payloadOffset;
+      }
+      else if (pst == PST_ATTRIBUTE)
+      {
+        baseOffset[pst] = (m_useCommittedAttr) ? m_committedAttrOffset : m_pendingAttrOffset;
+      }
+      else
+      {
+        assert(0 && "Bad parameter type");
+      }
+    }
+
+    int argIdx = 0;
+    for (auto& arg : m_function->args())
+    {
+      ParameterSemanticType pst = m_paramTypes[argIdx];
+      Value* val = nullptr;
+      if (arg.getType()->isPointerTy())
+      {
+        // Assume that pointed to memory is on the stack.
+        val = createStackPtr(baseOffset[pst], &arg, offsetInBytes[pst], insertBefore);
+        offsetInBytes[pst] += DL.getTypeAllocSize(arg.getType()->getPointerElementType()); // advance by the size of the pointee
+      }
+      else
+      {
+        val = createStackLoad(baseOffset[pst], &arg, offsetInBytes[pst], insertBefore);
+        offsetInBytes[pst] += DL.getTypeAllocSize(arg.getType());
+      }
+
+      // Replace use of the argument with the loaded value
+      if (arg.hasName())
+        val->takeName(&arg);
+      else
+        val->setName("arg" + std::to_string(argIdx));
+      arg.replaceAllUsesWith(val);
+
+      argIdx++;
+    }
+  }
+
+
+  // Process function arguments for each call site: write them to the stack relative to m_stackFrameOffset and track the largest arg frame seen.
+  m_maxCallerArgFrameSizeInBytes = 0;
+  for (size_t i = 0; i < m_callSites.size(); ++i)
+  {
+    int offsetInBytes = 0; // byte offset within the arg frame of this call site
+    CallInst* call = m_callSites[i];
+    FunctionType* FT = call->getCalledFunction()->getFunctionType();
+    StringRef calledFuncName = call->getCalledFunction()->getName();
+
+    Instruction* insertBefore = call;
+
+    // Set the return stateId (next substate of this function)
+    int nextSubstate = i + 1;
+    Value* nextStateId = getDummyStateId(m_functionIdx, nextSubstate, insertBefore);
+    createStackStore(m_stackFrameOffset, nextStateId, offsetInBytes, insertBefore);
+    offsetInBytes += DL.getTypeAllocSize(nextStateId->getType());
+    if (FT->getNumParams() && calledFuncName != CALL_INDIRECT_NAME) // arguments of CALL_INDIRECT_NAME calls are left as they are
+    {
+      for (unsigned index = 0; index < FT->getNumParams(); ++index)
+      {
+        // Write the argument into the arg frame
+        Value* op = call->getArgOperand(index);
+        Type* opTy = op->getType();
+        if (opTy->isPointerTy())
+        {
+          // TODO: Until we have callable shaders we should not get here except
+          // in tests.
+          if (isTemporaryAlloca(op))
+          {
+            // We can just replace the alloca with space in the arg frame
+            assert(isa<AllocaInst>(op));
+            Value* stackAlloca = createStackPtr(m_stackFrameOffset, op, offsetInBytes, stackAllocaInsertBefore);
+            op->replaceAllUsesWith(stackAlloca);
+            cast<AllocaInst>(op)->eraseFromParent(); // all uses now go through the stack.ptr
+          }
+          else
+          {
+            // copy in/out: not implemented
+            assert(0);
+          }
+          offsetInBytes += DL.getTypeAllocSize(opTy->getPointerElementType());
+        }
+        else
+        {
+          createStackStore(m_stackFrameOffset, op, offsetInBytes, insertBefore);
+          offsetInBytes += DL.getTypeAllocSize(opTy);
+        }
+
+        // Replace use of the argument with undef
+        call->setArgOperand(index, UndefValue::get(opTy));
+
+      }
+    }
+
+    if (offsetInBytes > m_maxCallerArgFrameSizeInBytes)
+      m_maxCallerArgFrameSizeInBytes = offsetInBytes;
+  }
+}
+
+void StateFunctionTransform::changeFunctionSignature()
+{
+  // Build a replacement function that takes the runtime data pointer and
+  // returns an i32 (the next state ID), then move the body of the old
+  // function into it.
+  Module* M = m_function->getParent();
+  Function* replacement = FunctionBuilder(M, m_functionName + "_tmp").i32().type(m_runtimeDataArgTy, "runtimeData").build();
+  replacement->getBasicBlockList().splice(replacement->begin(), m_function->getBasicBlockList());
+  m_function = replacement;
+
+  // Switch over to the real runtime data argument and drop the dummy function.
+  Value* runtimeDataArg = m_function->arg_begin();
+  replaceValAndRemoveUnusedDummyFunc(m_runtimeDataArg, runtimeDataArg, m_function);
+  m_runtimeDataArg = runtimeDataArg;
+
+  // Every return now yields the state ID read from slot 0 of the stack frame.
+  LLVMContext& ctx = m_function->getContext();
+  Value* slotZero = makeInt32(0, ctx);
+  CallInst* frameOffset = m_stackFrameOffset;
+  for (ReturnInst*& ret : m_returns)
+  {
+    Instruction* oldRet = ret;
+
+    // When a stack frame push was emitted, the offset is queried again right
+    // before the return instead of reusing m_stackFrameOffset.
+    if (m_stackFramePush)
+      frameOffset = CallInst::Create(m_stackFrameOffset->getCalledFunction(), { m_runtimeDataArg }, "ret.stackFrame.offset", oldRet);
+
+    Instruction* stateIdPtr = CallInst::Create(m_stackIntPtrFunc, { m_runtimeDataArg, frameOffset, slotZero }, "ret.stateId.ptr", oldRet);
+    Value* stateId = new LoadInst(stateIdPtr, "ret.stateId", oldRet);
+    ReturnInst* replacementRet = ReturnInst::Create(ctx, stateId);
+    ReplaceInstWithInst(ret, replacementRet);
+    ret = replacementRet; // keep m_returns pointing at the live instruction
+  }
+}
+
+
+void StateFunctionTransform::rewriteDummyStackSize(uint64_t frameSizeInBytes)
+{
+  // Replaces the placeholder stack frame size value with the final constant.
+  // The constant is expressed in ints, so the byte size must be int-aligned.
+  assert(frameSizeInBytes % sizeof(int) == 0);
+  const uint64_t frameSizeInInts = frameSizeInBytes / sizeof(int);
+  Value* finalSizeVal = makeInt32(frameSizeInInts, m_function->getContext());
+  replaceValAndRemoveUnusedDummyFunc(m_stackFrameSizeVal, finalSizeVal, m_function);
+  m_stackFrameSizeVal = finalSizeVal;
+}
+
+static inline Value* toIntIndex(int offsetInBytes, Value* baseOffset, Instruction* insertBefore)
+{
+  // Converts an int-aligned byte offset into an index counted in ints. When
+  // baseOffset is given, an add of the index and baseOffset is emitted before
+  // insertBefore and returned instead of the bare constant.
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* constIndex = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+  if (!baseOffset)
+    return constIndex;
+  return BinaryOperator::Create(Instruction::Add, constIndex, baseOffset, "", insertBefore);
+}
+
+void StateFunctionTransform::createStackStore(Value* baseOffset, Value* val, int offsetInBytes, Instruction* insertBefore)
+{
+  // Emits a call to a "stack.store" placeholder, taking (val, baseOffset,
+  // index in ints), before insertBefore. The placeholder function is looked
+  // up or created for this exact argument type list.
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* slot = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+
+  Type* paramTys[] = { val->getType(), baseOffset->getType(), slot->getType() };
+  FunctionType* storeFnTy = FunctionType::get(Type::getVoidTy(val->getContext()), paramTys, false);
+  Function* storeFn = getOrCreateFunction("stack.store", insertBefore->getModule(), storeFnTy, m_stackStoreFuncs);
+
+  Value* callArgs[] = { val, baseOffset, slot };
+  CallInst::Create(storeFn, callArgs, "", insertBefore);
+}
+
+Instruction* StateFunctionTransform::createStackLoad(Value* baseOffset, Value* val, int offsetInBytes, Instruction* insertBefore)
+{
+  // Emits a call to a "stack.load" placeholder, taking (baseOffset, index in
+  // ints) and returning a value of val's type, before insertBefore. The
+  // result is named after val with a ".restore" suffix.
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* slot = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+
+  Type* paramTys[] = { baseOffset->getType(), slot->getType() };
+  FunctionType* loadFnTy = FunctionType::get(val->getType(), paramTys, false);
+  Function* loadFn = getOrCreateFunction("stack.load", insertBefore->getModule(), loadFnTy, m_stackLoadFuncs);
+
+  Value* callArgs[] = { baseOffset, slot };
+  return CallInst::Create(loadFn, callArgs, addSuffix(val->getName(), ".restore"), insertBefore);
+}
+
+Instruction* StateFunctionTransform::createStackPtr(Value* baseOffset, Type* valTy, Value* intIndex, Instruction* insertBefore)
+{
+  // Emits a call to a "stack.ptr" placeholder, taking (baseOffset, intIndex)
+  // and returning valTy, before insertBefore. The call is left unnamed.
+  Type* paramTys[] = { baseOffset->getType(), intIndex->getType() };
+  FunctionType* ptrFnTy = FunctionType::get(valTy, paramTys, false);
+  Function* ptrFn = getOrCreateFunction("stack.ptr", insertBefore->getModule(), ptrFnTy, m_stackPtrFuncs);
+
+  Value* callArgs[] = { baseOffset, intIndex };
+  return CallInst::Create(ptrFn, callArgs, "", insertBefore);
+}
+
+Instruction* StateFunctionTransform::createStackPtr(Value* baseOffset, Value* val, int offsetInBytes, Instruction* insertBefore)
+{
+  // Convenience overload: creates a stack.ptr of val's type at an int-aligned
+  // constant byte offset and transfers val's name to the new pointer.
+  assert(offsetInBytes % sizeof(int) == 0);
+  Value* slot = makeInt32(offsetInBytes / sizeof(int), insertBefore->getContext());
+  Instruction* stackPtr = createStackPtr(baseOffset, val->getType(), slot, insertBefore);
+  stackPtr->takeName(val);
+  return stackPtr;
+}
+
+// Returns true if val is a direct call to a function whose name starts with
+// "stack.ptr". Anything that is not a call, and any call without a statically
+// known callee (getCalledFunction() returns null for indirect calls), yields
+// false.
+static bool isStackIntPtr(Value* val)
+{
+  CallInst* call = dyn_cast<CallInst>(val);
+  if (!call)
+    return false;
+
+  // Guard against indirect calls before looking at the callee's name.
+  Function* callee = call->getCalledFunction();
+  return callee && callee->getName().startswith("stack.ptr");
+}
+
+// Adapted from GetElementPtrInst::accumulateConstantOffset(), but emits
+// instructions (inserted before gep) so that non-constant indices are
+// supported. The returned offset is counted in ints, not bytes.
+// TODO: Use a single function for both constant and dynamic offsets? Could do
+// some constant folding along the way for dynamic offsets.
+Value* accumulateDynamicOffset(GetElementPtrInst* gep, const DataLayout &DL)
+{
+  LLVMContext& C = gep->getContext();
+  Value* total = makeInt32(0, C);
+
+  gep_type_iterator it = gep_type_begin(gep);
+  gep_type_iterator itEnd = gep_type_end(gep);
+  for (; it != itEnd; ++it)
+  {
+    // A constant zero index contributes nothing.
+    ConstantInt *constIdx = dyn_cast<ConstantInt>(it.getOperand());
+    if (constIdx && constIdx->isZero())
+      continue;
+
+    Value* term = nullptr;
+    StructType *structTy = dyn_cast<StructType>(*it);
+    if (structTy)
+    {
+      // Struct index: contributes the offset of the selected field.
+      assert(constIdx && "Structure indices must be constant");
+      unsigned fieldIdx = constIdx->getZExtValue();
+      const StructLayout *layout = DL.getStructLayout(structTy);
+      term = makeInt32(layout->getElementOffset(fieldIdx) / sizeof(int), C);
+    }
+    else
+    {
+      // Array or vector index: scale the index by the size of the indexed type.
+      Value* stride = makeInt32(DL.getTypeAllocSize(it.getIndexedType()) / sizeof(int), C);
+      term = BinaryOperator::Create(Instruction::Mul, it.getOperand(), stride, "elOffs", gep);
+    }
+
+    total = BinaryOperator::Create(Instruction::Add, total, term, "offs", gep);
+  }
+  return total;
+}
+
+
+// Adds gep offset to offsetVal and returns the result. Offsets are counted in
+// ints (bytes / sizeof(int)); the add instruction is inserted before gep.
+static Value* accumulateGepOffset(GetElementPtrInst* gep, Value* offsetVal)
+{
+  Module* M = gep->getModule();
+  const DataLayout& DL = M->getDataLayout();
+
+  Value* elementOffsetVal = nullptr;
+  APInt constOffset(DL.getPointerSizeInBits(), 0);
+  if (gep->accumulateConstantOffset(DL, constOffset))
+  {
+    // The accumulated byte offset may be negative, so read it sign-extended
+    // and divide in signed arithmetic. Dividing by sizeof(int) directly would
+    // convert the offset to size_t first, which yields a wrong index for
+    // negative offsets when size_t is 32 bits wide.
+    const int64_t offsetInInts = constOffset.getSExtValue() / static_cast<int64_t>(sizeof(int));
+    elementOffsetVal = makeInt32(static_cast<int>(offsetInInts), M->getContext());
+  }
+  else
+  {
+    // At least one index is not constant: build the offset with instructions.
+    elementOffsetVal = accumulateDynamicOffset(gep, DL);
+  }
+
+  return BinaryOperator::Create(Instruction::Add, offsetVal, elementOffsetVal, "offs", gep);
+}
+
+// Turn GEPs on a stack.ptr of aggregate type into stack.ptrs of scalar type. val is the pointer whose users are rewritten; offsetVal is the index (in ints) accumulated so far, relative to baseOffset.
+void StateFunctionTransform::flattenGepsOnValue(Value* val, Value* baseOffset, Value* offsetVal)
+{
+  for (auto U = val->user_begin(), UE = val->user_end(); U != UE;)
+  {
+    User* user = *U++; // advance first: the current user may be erased below
+    if (CallInst* call = dyn_cast<CallInst>(user))
+    {
+      // inline the call to expose GEPs and restart the loop from the first user of val.
+      InlineFunctionInfo IFI;
+      bool success = InlineFunction(call, IFI, false);
+      assert(success);
+      (void)success; 
+
+      U = val->user_begin();
+      UE = val->user_end();
+      continue;
+    }
+
+    GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(user);
+    if (!gep)
+      continue; // users that are neither calls nor geps are left untouched
+
+    Value* elementOffsetVal = accumulateGepOffset(gep, offsetVal); // offsetVal plus the offset added by this gep
+    Type* gepElTy = gep->getType()->getPointerElementType();
+    if (gepElTy->isAggregateType())
+    {
+      // still an aggregate: recurse to flatten geps on this gep
+      flattenGepsOnValue(gep, baseOffset, elementOffsetVal);
+    }
+    else if (isa<VectorType>(gepElTy))
+      scalarizeVectorStackAccess(gep, baseOffset, elementOffsetVal); // vector: rewritten into per-element accesses
+    else 
+    {
+      Value* ptr = createStackPtr(baseOffset, gep->getType(), elementOffsetVal, gep);
+      ptr->takeName(gep); // could use a name that encodes the gep type and indices
+      gep->replaceAllUsesWith(ptr);
+    }
+
+    gep->eraseFromParent(); // the gep has been replaced in every branch above
+  }
+}
+
+
+// Replaces the loads and stores that use vecPtr (a pointer to a vector) with
+// per-element loads/stores through scalar stack.ptr calls. The element
+// pointers are created before vecPtr, starting at offsetVal relative to
+// baseOffset and advancing by one for each element. Any user of vecPtr other
+// than a load or a store is unexpected and asserts.
+// NOTE(review): advancing the index by one per element presumes every element
+// occupies exactly one int-sized slot - confirm for non-32-bit element types.
+void StateFunctionTransform::scalarizeVectorStackAccess(Instruction* vecPtr, Value* baseOffset, Value* offsetVal)
+{
+  std::vector<Value*> elPtrs;
+  Type* VTy = vecPtr->getType()->getPointerElementType();
+  Type* elTy = VTy->getVectorElementType();
+  LLVMContext& C = vecPtr->getContext();
+  Value* curOffsetVal = offsetVal;
+  Value* one = makeInt32(1, C);
+  offsetVal->setName("offs0.");
+  const unsigned numElements = VTy->getVectorNumElements();
+  for (unsigned i = 0; i < numElements; ++i)
+  {
+    // TODO: If offsetVal is a constant we could just create constants instead of add instructions
+    if (i > 0)
+      curOffsetVal = BinaryOperator::Create(Instruction::Add, curOffsetVal, one, stringf("offs%d.", i), vecPtr);
+    elPtrs.push_back(createStackPtr(baseOffset, elTy->getPointerTo(), curOffsetVal, vecPtr));
+    elPtrs.back()->setName(addSuffix(vecPtr->getName(), stringf(".el%d.", i)));
+  }
+
+  // Scalarize load/stores. The iterator is advanced before the user is
+  // handled because each handled user is erased.
+  for (auto U = vecPtr->user_begin(), UE = vecPtr->user_end(); U != UE;)
+  {
+    User* user = *U++;
+    if (LoadInst* load = dyn_cast<LoadInst>(user))
+    {
+      // Load every element and rebuild the vector with insertelement.
+      // The index is an unsigned (not size_t) so that it matches the %d
+      // conversion passed to stringf.
+      Value* vec = UndefValue::get(VTy);
+      for (unsigned i = 0; i < numElements; ++i)
+      {
+        Value* el = new LoadInst(elPtrs[i], stringf("el%d.", i), load);
+        vec = InsertElementInst::Create(vec, el, makeInt32(i, C), "vec", load);
+      }
+      load->replaceAllUsesWith(vec);
+      load->eraseFromParent();
+    }
+    else if (StoreInst* store = dyn_cast<StoreInst>(user))
+    {
+      // Extract every element of the stored vector and store it separately.
+      Value* vec = store->getOperand(0);
+      for (unsigned i = 0; i < numElements; ++i)
+      {
+        Value* el = ExtractElementInst::Create(vec, makeInt32(i, C), stringf("el%d.", i), store);
+        new StoreInst(el, elPtrs[i], store);
+      }
+      store->eraseFromParent();
+    }
+    else
+    {
+      assert(0 && "Unhandled user");
+    }
+  }
+}
+
+
+void StateFunctionTransform::lowerStackFuncs() // Replaces every call to the stack.store / stack.load / stack.ptr placeholder functions with code based on m_stackIntPtrFunc, then erases the placeholder functions.
+{
+  LLVMContext& C = m_stackIntPtrFunc->getContext();
+  const DataLayout& DL = m_stackIntPtrFunc->getParent()->getDataLayout();
+
+  // stack.store functions: call operands are (value, base offset, constant int index)
+  for (auto& kv : m_stackStoreFuncs)
+  {
+    Function* F = kv.second;
+    for (auto U = F->user_begin(); U != F->user_end(); )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++)); // advance before the call is erased below
+      assert(call);
+
+      Value* runtimeDataArg = call->getParent()->getParent()->arg_begin(); // first argument of the function containing the call
+      Value* val = call->getArgOperand(0);
+      Value* offset = call->getArgOperand(1);
+      int idx = getConstantValue(call->getArgOperand(2));
+
+      Instruction* insertBefore = call;
+      if (isStackIntPtr(val))
+      {
+        // Copy from one part of the stack to another, one int at a time
+        CallInst* valCall = dyn_cast<CallInst>(val);
+        Value* srcOffset = valCall->getArgOperand(0);
+        int srcIdx = getConstantValue(valCall->getArgOperand(1));
+        Value* dstOffset = offset;
+        int dstIdx = idx;
+        int intCount = (int)DL.getTypeAllocSize(val->getType()->getPointerElementType()) / sizeof(int); // size of the pointee, in ints
+        for (int i = 0; i < intCount; ++i)
+        {
+          std::string idxStr = stringf("%d.", i);
+          Value* srcPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, srcOffset, makeInt32(srcIdx + i, C) }, addSuffix(val->getName(), ".ptr" + idxStr), insertBefore);
+          Value* dstPtr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, dstOffset, makeInt32(dstIdx + i, C) }, "dst.ptr" + idxStr, insertBefore);
+          Value* intVal = new LoadInst(srcPtr, "copy.val" + idxStr, insertBefore);
+          new StoreInst(intVal, dstPtr, insertBefore);
+        }
+      }
+      else
+      {
+        store(val, m_stackIntPtrFunc, runtimeDataArg, offset, idx, insertBefore);
+      }
+
+      call->eraseFromParent();
+    }
+    F->eraseFromParent();
+  }
+
+  // stack.load functions: call operands are (base offset, int index)
+  for (auto& kv : m_stackLoadFuncs)
+  {
+    Function* F = kv.second;
+    for (auto U = F->user_begin(); U != F->user_end(); )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      std::string name = stripSuffix(call->getName(), ".restore"); // createStackLoad() names its calls "<value>.restore"
+      call->setName("");
+      Value* runtimeDataArg = call->getParent()->getParent()->arg_begin();
+      Value* offset = call->getArgOperand(0);
+      Value* idx = call->getArgOperand(1);
+
+      Instruction* insertBefore = call;
+      Value* val = load(m_stackIntPtrFunc, runtimeDataArg, offset, idx, name, call->getType(), insertBefore);
+      call->replaceAllUsesWith(val);
+      call->eraseFromParent();
+    }
+    F->eraseFromParent();
+  }
+
+
+  // Scalarize accesses based on a stack.ptr func. Only stack.ptr functions that return a pointer to an aggregate are handled here.
+  for (auto& kv : m_stackPtrFuncs)
+  {
+    Function* F = kv.second;
+    if (!F->getReturnType()->getPointerElementType()->isAggregateType())
+      continue;
+    for (auto U = F->user_begin(), UE = F->user_end(); U != UE; )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      Value* offset = call->getArgOperand(0);
+      Value* idx = call->getArgOperand(1);
+      flattenGepsOnValue(call, offset, idx);
+      call->eraseFromParent();
+    }
+  }
+
+
+  // stack.ptr functions: each call becomes a call to m_stackIntPtrFunc, bitcast to the original pointer type when the types differ
+  for (auto& kv : m_stackPtrFuncs)
+  {
+    Function* F = kv.second;
+    for (auto U = F->user_begin(); U != F->user_end(); )
+    {
+      CallInst* call = dyn_cast<CallInst>(*(U++));
+      assert(call);
+
+      std::string name = call->getName();
+      Value* runtimeDataArg = call->getParent()->getParent()->arg_begin();
+      Value* offset = call->getArgOperand(0);
+      Value* idx = call->getArgOperand(1);
+
+      Instruction* insertBefore = call;
+      Value* ptr = CallInst::Create(m_stackIntPtrFunc, { runtimeDataArg, offset, idx }, addSuffix(name, ".ptr"), insertBefore);
+      if (ptr->getType() != call->getType())
+        ptr = new BitCastInst(ptr, call->getType(), "", insertBefore);
+      ptr->takeName(call);
+      call->replaceAllUsesWith(ptr);
+      call->eraseFromParent();
+    }
+    F->eraseFromParent();
+  }
+}
+
+// Creates the substate function for one entry point: clones the blocks
+// reachable from substateEntryBlock into a new function named
+// "<m_functionName>.ss_<substateIndex>", adds it to baseFunc's module, and
+// cleans it up (makeReducible, verifier, mem2reg). For every substate but the
+// first, allocas of baseFunc's entry block that are still used by the new
+// function are cloned into the new function's entry block.
+Function* StateFunctionTransform::split(Function* baseFunc, BasicBlock* substateEntryBlock, int substateIndex)
+{
+  ValueToValueMapTy VMap;
+  Function*         substateFunc = cloneBlocksReachableFrom(substateEntryBlock, VMap);
+  Module*           module = baseFunc->getParent();
+  module->getFunctionList().push_back(substateFunc);
+  substateFunc->setName(m_functionName + ".ss_" + std::to_string(substateIndex));
+
+  if (substateIndex != 0)
+  {
+    // Collect allocas from entry block
+    SmallVector<Instruction*, 16> allocasToClone;
+    for (auto& I : baseFunc->getEntryBlock().getInstList())
+    {
+      if (isa<AllocaInst>(&I))
+        allocasToClone.push_back(&I);
+    }
+
+    // Clone collected allocas
+    BasicBlock* newEntryBlock = &substateFunc->getEntryBlock();
+    for (auto I : allocasToClone)
+    {
+      // Collect users of original instruction in substateFunc. dyn_cast may
+      // return null for a user that is not an instruction; such a user cannot
+      // belong to substateFunc, so it is skipped instead of dereferenced.
+      std::vector<Instruction*> users;
+      for (auto U : I->users())
+      {
+        Instruction* inst = dyn_cast<Instruction>(U);
+        if (inst && inst->getParent()->getParent() == substateFunc)
+          users.push_back(inst);
+      }
+
+      if (users.empty())
+        continue;
+
+      // Clone instruction
+      Instruction* clone = I->clone();
+      if (I->hasName())
+        clone->setName(I->getName());
+      clone->insertBefore(newEntryBlock->getFirstInsertionPt()); // allocas first in entry block
+      RemapInstruction(clone, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+      // Replaces uses
+      for (auto user : users)
+        user->replaceUsesOfWith(I, clone);
+    }
+  }
+
+  //printFunction( substateFunc, substateFunc->getName().str() + "-BeforeSplittingOpt", m_dumpId++ );
+
+  makeReducible(substateFunc);
+
+  // Undo the reg2mem done in preserveLiveValuesAcrossCallSites()
+  runPasses(substateFunc, {
+    createVerifierPass(),
+    createPromoteMemoryToRegisterPass()
+  });
+
+  //printFunction( substateFunc, substateFunc->getName().str() + "-AfterSplitting", m_dumpId++ );
+
+  return substateFunc;
+}
+
+BasicBlockVector StateFunctionTransform::replaceCallSites()
+{
+  // Splits the function after every call site and replaces each call with a
+  // return of a state ID. Returns the entry block of every substate: the
+  // function entry first, then the block following each call site.
+  LLVMContext& ctx = m_function->getContext();
+
+  BasicBlock* entry = &m_function->getEntryBlock();
+  BasicBlockVector entryPoints{ entry };
+  entryPoints[0]->setName(m_functionName + ".BB0");
+
+  for (size_t site = 0; site < m_callSites.size(); ++site)
+  {
+    CallInst*   call = m_callSites[site];
+    BasicBlock* callBlock = call->getParent();
+    StringRef calledFuncName = call->getCalledFunction()->getName();
+
+    // Everything after the call moves to a new block, which becomes the
+    // entry point of the next substate.
+    BasicBlock* continuation =
+      callBlock->splitBasicBlock(call->getNextNode(), m_functionName + ".BB" + std::to_string(site + 1) + ".from."
+        + cleanName(calledFuncName));
+    entryPoints.push_back(continuation);
+
+    // The state ID to return: for CALL_INDIRECT_NAME it is the call's first
+    // operand, otherwise a dummy state ID for substate 0 of the callee.
+    Value* nextState = (calledFuncName == CALL_INDIRECT_NAME)
+      ? call->getArgOperand(0)
+      : getDummyStateId(m_callSiteFunctionIdx[site], 0, call);
+
+    // The terminator of the call's block is replaced by the return, and the
+    // call itself is removed.
+    ReplaceInstWithInst(callBlock->getTerminator(), ReturnInst::Create(ctx, nextState));
+    call->eraseFromParent();
+  }
+  return entryPoints;
+}
+
+llvm::Value* StateFunctionTransform::getDummyStateId(int functionIdx, int substate, llvm::Instruction* insertBefore)
+{
+  // Emits a call dummyStateId(functionIdx, substate), named "stateId", before
+  // insertBefore. The dummyStateId function is built on first use.
+  if (m_dummyStateIdFunc == nullptr)
+    m_dummyStateIdFunc = FunctionBuilder(m_function->getParent(), "dummyStateId").i32().i32("functionIdx").i32("substate").build();
+
+  LLVMContext& ctx = insertBefore->getContext();
+  Value* callArgs[] = { makeInt32(functionIdx, ctx), makeInt32(substate, ctx) };
+  return CallInst::Create(m_dummyStateIdFunc, callArgs, "stateId", insertBefore);
+}
+
+raw_ostream& StateFunctionTransform::getOutputStream(const std::string functionName, const std::string& suffix, unsigned int dumpId)
+{
+  // Returns the stream to dump to: DBGS() when no dump filename is set or the
+  // dump file cannot be opened, otherwise a heap-allocated file stream. The
+  // returned file stream is not deleted here; callers compare against DBGS()
+  // to decide whether to delete it.
+  if (m_dumpFilename.empty())
+    return DBGS();
+
+  const std::string path = createDumpPath(m_dumpFilename, dumpId, suffix, functionName);
+  std::error_code  openError;
+  raw_ostream* fileStream = new raw_fd_ostream(path, openError, sys::fs::OpenFlags::F_None);
+  if (!openError)
+    return *fileStream;
+
+  // Opening failed: report it on the debug stream and fall back to it.
+  DBGS() << "Failed to open " << path << " for writing sft output. " << openError.message() << "\n";
+  delete fileStream;
+  return DBGS();
+}
+
+void StateFunctionTransform::printFunction(const Function* function, const std::string& suffix, unsigned int dumpId)
+{
+  // Dumps the given function under a banner carrying suffix. Does nothing
+  // unless verbose output is enabled.
+  if (m_verbose)
+  {
+    raw_ostream& os = getOutputStream(m_functionName, suffix, dumpId);
+    os << "; ########################### " << suffix << "\n";
+    os << *function << "\n";
+
+    // Anything other than the debug stream was allocated by getOutputStream().
+    const bool ownsStream = (&os != &DBGS());
+    if (ownsStream)
+      delete &os;
+  }
+}
+
+void StateFunctionTransform::printFunction(const std::string& suffix)
+{
+  // Dumps the function currently being transformed; the dump id is consumed
+  // whether or not verbose output is enabled.
+  const unsigned int dumpId = m_dumpId++;
+  printFunction(m_function, suffix, dumpId);
+}
+
+void StateFunctionTransform::printFunctions(const std::vector<Function*>& funcs, const char* suffix)
+{
+  // Dumps all given functions to one stream under a single banner. Does
+  // nothing unless verbose output is enabled.
+  if (m_verbose)
+  {
+    raw_ostream& os = getOutputStream(m_functionName, suffix, m_dumpId++);
+    os << "; ########################### " << suffix << "\n";
+    for (size_t idx = 0; idx != funcs.size(); ++idx)
+      os << *funcs[idx] << "\n";
+
+    // Anything other than the debug stream was allocated by getOutputStream().
+    const bool ownsStream = (&os != &DBGS());
+    if (ownsStream)
+      delete &os;
+  }
+}
+
+// Dumps the whole module under a banner carrying suffix, using "module" as
+// the function name for the dump path. Does nothing unless verbose output is
+// enabled.
+void StateFunctionTransform::printModule(const Module* module, const std::string& suffix)
+{
+  if (!m_verbose)
+    return;
+
+  raw_ostream& out = getOutputStream("module", suffix, m_dumpId++);
+  out << "; ########################### " << suffix << "\n";
+  out << *module << "\n";
+
+  // getOutputStream() hands back a heap-allocated file stream whenever it does
+  // not return DBGS(); release it here, like the other print functions do,
+  // so the stream is not leaked.
+  if (&out != &DBGS())
+    delete &out;
+}
+
+// Prints every instruction in vals to the debug stream, prefixed with the
+// alloc size of its type in bytes, followed by the element count and the
+// total size. msg, when non-null, is printed first as a heading. Does nothing
+// unless verbose output is enabled.
+void StateFunctionTransform::printSet(const InstructionSetVector& vals, const char* msg)
+{
+  if (!m_verbose)
+    return;
+
+  raw_ostream& out = DBGS();
+  if (msg)
+    out << msg << " --------------------\n";
+
+  uint64_t totalBytes = 0;
+  if (vals.size() > 0)
+  {
+    Module*    module = m_function->getParent();
+    DataLayout DL(module);
+    for (const Instruction* inst : vals)
+    {
+      uint64_t size = DL.getTypeAllocSize(inst->getType());
+      // The format uses %d, so pass an int rather than the 64-bit size.
+      out << stringf("%3dB: ", static_cast<int>(size)) << *inst << '\n';
+      totalBytes += size;
+    }
+  }
+  out << "Count:" << vals.size() << "  Bytes:" << totalBytes << "\n\n";
+}

+ 295 - 0
lib/DxrFallback/StateFunctionTransform.h

@@ -0,0 +1,295 @@
+#pragma once
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+  class AllocaInst;
+  class BasicBlock;
+  class CallInst;
+  class Function;
+  class FunctionType;
+  class Instruction;
+  class Module;
+  class raw_ostream;
+  class ReturnInst;
+  class StructType;
+  class Type;
+  class Value;
+}
+
+class LiveValues;
+
+// Container aliases used by the state function transform.
+using BasicBlockVector     = std::vector<llvm::BasicBlock*>;
+using InstructionSetVector = llvm::SetVector<llvm::Instruction*>;
+
+
+//==============================================================================
+// Transforms the given function into a number of state functions to be 
+// used in a state machine. 
+//
+// State functions have the following signature: 
+//    int (<RuntimeDataTy> runtimeData). 
+// They take a runtime data argument with a given type used by the runtime and
+// return the state ID of the next state. If the function contains calls to other  
+// candidate functions that are to be transformed into state functions, the 
+// function is split into multiple substate functions at call sites and the calls 
+// are replaced with continuations. For example candidate funcA() calling candidate 
+// funcB():
+//   void funcA(int param0)
+//   {
+//      // code moved to funcA_ss0()
+//      int foo = 10;
+//      ...
+//
+//      funcB(arg0, arg1); 
+//
+//      // code moved to funcA_ss1()
+//      int bar = someFunc(foo);
+//      
+//   } 
+// will be split into two substate functions, funcA_ss0() and funcA_ss1(). 
+// funcA_ss0() pushes the stateID for funcA_ss1() onto the stack, and
+// returns the state ID for the entry substate of funcB, funcB_ss0(). 
+// A substate of funcB will eventually pop the stack and return the state ID
+// for funcA_ss1(). funcA_ss1() in turn pops the stack to get the state ID
+// placed there by its caller. 
+//
+// If candidate functions, like funcB(), have arguments they are moved to the stack.
+// Any values that are live across continuations, like foo in this example,
+// must also be saved to the stack before the continuation and restored before use. 
+// Some values, like DXIL buffer handles, should not be saved and must be
+// rematerialized after a continuation. The stack frame in a state function has
+// the following layout:
+//   
+//   |               |
+//   +---------------+  
+//   | argN          |  
+//   | ...           |   
+//   | arg0          |  
+//   | returnStateID | caller arg frame
+//   +---------------+ <-- entry stack pointer
+//   |               |
+//   | saved values  |
+//   |               |
+//   +---------------+
+//   | argN          |
+//   | ...           |
+//   | arg0          |
+//   | returnStateID | callee arg frame
+//   +---------------+ <-- stack frame pointer
+//           |
+//           V stack grows downward towards smaller addresses
+//
+// The return state ID is stored at the base of the argument frame, followed by
+// function arguments, if any. The saved values follow the argument frame. Instead
+// of adjusting the size of the stack frame for the saved values and argument
+// frames of each continuation a single allocation is made with enough space to
+// accommodate all continuations in the function.
+//
+// Several placeholder functions are used during the process of the state function
+// transform to break dependency cycles. A placeholder for the runtime data pointer
+// is used to allocate the stack frame before the function signature is changed
+// and the pointer parameter is created. The stack frame is also allocated before
+// its size has been determined, so a placeholder is used. The state IDs corresponding
+// to function entry substates may also not be known before the transform has been 
+// run on all the candidate functions. Therefore a placeholder is used for state 
+// IDs as well. These are replaced by calling StateFunctionTransform::finalizeStateIds()
+// after all the candidate functions have been transformed.
+//
+// If the intrinsic Internal_CallIndirect(int stateId) appears in the body of
+// the function then it is treated as a continuation with a transition to the
+// specified stateId.
+//
+// When an attribute size is specified, space is allocated on the stack frame for
+// committed/pending attributes, as well as the previous offsets for the committed/
+// pending attributes. The attribute size should be set if the 
+// function is TraceRay(). The payload offset needs to be set by the caller. The 
+// stack frame for TraceRay() has the following layout:
+//
+//   |                         |
+//   +-------------------------+ 
+//   |                         |
+//   | TraceRay() args         |
+//   |                         |
+//   +-------------------------+
+//   | returnStateID           | caller arg frame
+//   +-------------------------+ <-- entry stack offset
+//   | old committed attr offs |
+//   | old pending attr offset |
+//   +-------------------------+ 
+//   |                         |
+//   | committed attributes    |
+//   |                         |
+//   +-------------------------+ <-- new committed attribute offset
+//   |                         |
+//   | pending attributes      |
+//   |                         |
+//   +-------------------------+ <-- new pending attribute offset
+//   |                         |
+//   | saved values            |
+//   |                         |
+//   +-------------------------+
+//   | argN                    |
+//   | ...                     |
+//   | arg0                    |
+//   | returnStateID           | callee arg frame
+//   +-------------------------+ <-- stack frame offset
+//      
+// The arguments to some functions (e.g. closesthit, anyhit, and miss shaders)
+// come from the payload or attributes. The positions of these arguments can be 
+// specified to SFT, which will redirect the defs from the args to corresponding
+// values on the stack.
+//
+// The following runtime (LLVM) functions are used by SFT (all sizes and offsets
+// are in terms of ints):
+//   void stackFramePush(<RuntimeDataTy> runtimeData, i32 size)
+//   void stackFramePop(<RuntimeDataTy> runtimeData, i32 size)
+//
+//   i32 stackFrameOffset(<RuntimeDataTy> runtimeData)
+//   i32 payloadOffset(<RuntimeDataTy> runtimeData) 
+//   i32 committedAttrOffset(<RuntimeDataTy> runtimeData)
+//   i32 pendingAttrOffset(<RuntimeDataTy> runtimeData)
+//
+//   i32* stackIntPtr(<RuntimeDataTy> runtimeData, i32 baseOffset, i32 offset)
+//   
+// Called before/after stackFramePush()/stackFramePop():
+//   void traceFramePush(<RuntimeDataTy> runtimeData, i32 attrSize) 
+//   void traceFramePop(<RuntimeDataTy> runtimeData)               
+
+// Transforms one candidate function into one or more state functions for the
+// state machine described above. Usage: construct, apply any of the optional
+// set*() calls, then call run(). After every candidate function has been
+// transformed, call finalizeStateIds() on the module to replace the state ID
+// placeholders.
+class StateFunctionTransform
+{
+public:
+  // Describes where a function parameter gets its value from; see
+  // setParameterInfo().
+  enum ParameterSemanticType
+  {
+    PST_NONE = 0,
+    PST_PAYLOAD,
+    PST_ATTRIBUTE,
+
+    PST_COUNT     // number of enumerators above
+  };
+
+  // func is the function to be transformed. candidateFuncNames is a list of all 
+  // functions that have been or will be transformed to state functions, 
+  // including func. The runtimeDataArgTy is the type to use for the first argument
+  // in state functions.
+  StateFunctionTransform(llvm::Function* func, const std::vector<std::string>& candidateFuncNames, llvm::Type* runtimeDataArgTy);
+
+  // Optional parameters to be specified before run()
+  void setAttributeSize(int sizeInBytes); // needed for TraceRay()
+  void setParameterInfo(const std::vector<ParameterSemanticType>& paramTypes, bool useCommittedAttr = true);
+  void setResourceGlobals(const std::set<llvm::Value*>& resources);
+
+  // Creates the placeholder function standing in for the runtime data argument.
+  // NOTE(review): summary inferred from the name and the placeholder
+  // description in the header comment - confirm against the definition.
+  static llvm::Function* createDummyRuntimeDataArgFunc(llvm::Module* M, llvm::Type* runtimeDataArgTy);
+
+  // Generates state functions from func into the same module. The original function
+  // is left only as a declaration.
+  void run(std::vector<llvm::Function*>& stateFunctions, _Out_ unsigned int &shaderStackSize);
+
+  // candidateFuncEntryStateIds corresponding to the candidateFuncNames passed to
+  // the constructor. stateIDs are computed as candidateFuncEntryStateIds[functionIdx]
+  // + substateIdx, where functionIdx and substateIdx come from the arguments to
+  // the placeholder stateID function.
+  static void finalizeStateIds(llvm::Module* module, const std::vector<int>& candidateFuncEntryStateIds);
+
+  // Outputs detailed diagnostic information if set to true.
+  void setVerbose(bool val);
+
+  // Sets the file name used for diagnostic dumps.
+  // NOTE(review): exact use is decided by getOutputStream() - confirm there.
+  void setDumpFilename(const std::string& dumpFilename);
+
+
+private:
+  // Function to transform
+  llvm::Function* m_function = nullptr;
+
+  // Name of the function to transform
+  std::string m_functionName;
+
+  // Index of the function to transform in m_candidateFuncNames
+  int m_functionIdx = 0;
+
+  // candidateFuncNames is a list of all functions that have been or will 
+  // be transformed to state functions. Used to create function index used
+  // by the stateID placeholder function.
+  // NOTE(review): this is a reference member, so the referenced vector must
+  // outlive this object - presumably it is the constructor argument; confirm.
+  const std::vector<std::string>& m_candidateFuncNames;
+
+  llvm::Type* m_runtimeDataArgTy = nullptr;
+  llvm::Value* m_runtimeDataArg = nullptr;     // set in init() and changeFunctionSignature()
+  llvm::Value* m_stackFrameSizeVal = nullptr;  // set in init() and preserveLiveValuesAcrossCallsites()
+
+  int m_attributeSizeInBytes = -1;
+  std::vector<ParameterSemanticType> m_paramTypes;
+  bool m_useCommittedAttr = false;
+  // Initialized to nullptr like every other pointer member, so it never holds
+  // an indeterminate value (presumably assigned by setResourceGlobals(), which
+  // is optional).
+  const std::set<llvm::Value*>* m_resources = nullptr;
+
+  std::vector<llvm::CallInst*> m_callSites;
+  std::vector<int> m_callSiteFunctionIdx;
+  std::vector<llvm::CallInst*> m_movePayloadToStackCalls;
+  std::vector<llvm::CallInst*> m_setPendingAttrCalls;
+  std::vector<llvm::ReturnInst*> m_returns;
+
+  // Diagnostic output state. m_dumpId is post-incremented by the print*()
+  // helpers to number successive dumps.
+  bool m_verbose = false;
+  std::string m_dumpFilename;
+  unsigned int m_dumpId = 0;
+
+  llvm::Function* m_stackIntPtrFunc = nullptr;
+
+  llvm::CallInst* m_stackFramePush = nullptr;
+  llvm::CallInst* m_stackFrameOffset = nullptr;
+  llvm::CallInst* m_payloadOffset = nullptr;          // Offset at beginning of function
+  llvm::CallInst* m_committedAttrOffset = nullptr;    // Offset at beginning of function
+  llvm::CallInst* m_pendingAttrOffset = nullptr;      // Offset at beginning of function
+
+  // Placeholder function taking constant values functionIdx and substate. 
+  // These are later translated to a stateId by finalizeStateIds().
+  llvm::Function* m_dummyStateIdFunc = nullptr;
+
+  int m_maxCallerArgFrameSizeInBytes = 0;
+  int m_traceFrameSizeInBytes = 0;
+
+  // Functions used to abstract stack operations. These make intermediate stages
+  // in the transform a little bit cleaner. 
+  std::map<llvm::FunctionType*, llvm::Function*> m_stackStoreFuncs;
+  std::map<llvm::FunctionType*, llvm::Function*> m_stackLoadFuncs;
+  std::map<llvm::FunctionType*, llvm::Function*> m_stackPtrFuncs;
+
+  // Main stages of the transformation 
+  void init();
+  void findCallSitesIntrinsicsAndReturns();
+  void changeCallingConvention();
+  void preserveLiveValuesAcrossCallsites(_Out_ unsigned int &shaderStackSize);
+  void createSubstateFunctions(std::vector<llvm::Function*>& stateFunctions);
+  void lowerStackFuncs();
+
+  // Emits a call to the stateID placeholder for (functionIdx, substate).
+  // NOTE(review): inferred from the m_dummyStateIdFunc comment - confirm.
+  llvm::Value* getDummyStateId(int functionIdx, int substate, llvm::Instruction* insertBefore);
+
+  // Helpers for the stages above
+  void allocateStackFrame();
+  void allocateTraceFrame();
+  void createArgFrames();
+  void changeFunctionSignature();
+
+  // Builders for the abstract stack store/load/pointer operations, inserted
+  // before insertBefore.
+  void createStackStore(llvm::Value* baseOffset, llvm::Value* val, int offsetInBytes, llvm::Instruction* insertBefore);
+  llvm::Instruction* createStackLoad(llvm::Value* baseOffset, llvm::Value* val, int offsetInBytes, llvm::Instruction* insertBefore);
+  llvm::Instruction* createStackPtr(llvm::Value* baseOffset, llvm::Value* val, int offsetInBytes, llvm::Instruction* insertBefore);
+  llvm::Instruction* createStackPtr(llvm::Value* baseOffset, llvm::Type* valTy, llvm::Value* intIndex, llvm::Instruction* insertBefore);
+  void rewriteDummyStackSize(uint64_t frameSizeInBytes);
+
+  BasicBlockVector replaceCallSites();
+  llvm::Function* split(llvm::Function* baseFunc, llvm::BasicBlock* subStateEntryBlock, int substateIndex);
+
+  void flattenGepsOnValue(llvm::Value* val, llvm::Value* baseOffset, llvm::Value* offset);
+  void scalarizeVectorStackAccess(llvm::Instruction* vecPtr, llvm::Value* baseOffset, llvm::Value* offsetVal);
+
+  // Diagnostic printing functions. All but getOutputStream() are no-ops unless
+  // m_verbose is set.
+  llvm::raw_ostream& getOutputStream(const std::string functionName, const std::string& suffix, unsigned int dumpId);
+  void printFunction(const llvm::Function* function, const std::string& suffix, unsigned int dumpId);
+  void printFunction(const std::string& suffix);
+  void printFunctions(const std::vector<llvm::Function*>& funcs, const char* suffix);
+  void printModule(const llvm::Module* module, const std::string& suffix);
+  void printSet(const InstructionSetVector& vals, const char* msg = nullptr);
+};

+ 26 - 0
lib/DxrFallback/readme.md

@@ -0,0 +1,26 @@
+# DXR Fallback Compiler
+The DXR Fallback Compiler is a specialized compiler that's a part of the [D3D12 Raytracing Fallback Layer](https://github.com/Microsoft/DirectX-Graphics-Samples/tree/master/Libraries/D3D12RaytracingFallback). The purpose of the DXR Fallback Compiler is to take input DXR shader libs and link them into a single compute shader that is runnable on DX12 hardware (even without DXR driver support).
+
+## Building the DXR Fallback Compiler
+In order to build the DXR Fallback Compiler in Visual Studio, simply build the dxrfallbackcompiler project in the *Clang Libraries* folder.
+
+## Using with the D3D12 Raytracing Fallback Layer
+To use the DXR Fallback Compiler with the [DirectX Graphics Samples](https://github.com/Microsoft/DirectX-Graphics-Samples/blob/master/Samples/Desktop/D3D12Raytracing/readme.md), build a dxrfallbackcompiler.dll using the Build instructions and place the output dll in Samples/Desktop/D3D12Raytracing/tools/x64. 
+
+If you're incorporating the Fallback Layer into your own personal project, you need to ensure that the dll is either alongside your executable or in the working directory.
+
+## Overview
+Note that the overview below and all subsequent documentation assume familiarity with the DirectX Raytracing API.
+
+The DXR Fallback Compiler addresses several challenges that native DX12 compute shaders are not normally capable of handling:
+ * Combining multiple orthogonal shaders into a single large compute shader
+ * Uses of all new DXR HLSL intrinsics
+ * Invocation of another shader in the middle of shader code - *i.e. TraceRay and CallShader*
+ * Recursive invocations of shader calls
+
+These challenges are handled by abstractly viewing GPU execution of a DXR pipeline as State Machine traversal, where each shader is transformed into one or more state functions. Further technical details are described in the header of [StateFunctionTransform.h](../DxrFallback/StateFunctionTransform.h).
+
+## Building runtime.h
+1. Download LLVM 3.7: http://releases.llvm.org/3.7.0/LLVM-3.7.0-win64.exe
+2. You may need to adjust BINPATH in script.cmd to point to your LLVM binaries.
+3. Run script.cmd; it should output a patched runtime.h.

+ 1974 - 0
lib/DxrFallback/runtime.h

@@ -0,0 +1,1974 @@
+
+// This file generated by compiling the following source (runtime.c) as follows:
+//    clang -S -emit-llvm -target nvptx runtime.c
+//    opt -S -mem2reg runtime.ll -o runtime.opt.ll
+// The resulting LLVM-IR is stripped of its datalayout and replaced with one
+// compatible with DXIL.
+
+// runtime.c
+#if 0 
+#include <stddef.h>
+
+static const int STACK_SIZE_IN_BYTES = 1024;
+
+typedef float float3 __attribute__((vector_size(3*sizeof(float))));
+typedef float float4 __attribute__((vector_size(4*sizeof(float))));
+typedef float float12 __attribute__((vector_size(12*sizeof(float))));
+typedef float (M3x4)[12];
+typedef int   (StackType)[STACK_SIZE_IN_BYTES/sizeof(int)];
+typedef unsigned char byte;
+
+
+typedef struct RuntimeDataStruct // State handed (as RuntimeDataType) to every runtime function in this file.
+{
+  int DispatchRaysIndex[2];
+  int DispatchRaysDimensions[2];
+
+  float RayTMin;
+  float RayTCurrent;
+  unsigned RayFlags;
+  float WorldRayOrigin[3];
+  float WorldRayDirection[3];
+  float ObjectRayOrigin[3];
+  float ObjectRayDirection[3];
+  M3x4 ObjectToWorld;
+  M3x4 WorldToObject;
+
+  unsigned PrimitiveIndex;
+  unsigned InstanceIndex;
+  unsigned InstanceID;
+  unsigned HitKind;
+  unsigned ShaderRecordOffset;
+
+
+  // Pending hit values - accessed in anyHit and intersection shaders before a hit has been committed
+  float PendingRayTCurrent;
+  unsigned PendingPrimitiveIndex;
+  unsigned PendingInstanceIndex;
+  unsigned PendingInstanceID;
+  unsigned PendingHitKind;
+  unsigned PendingShaderRecordOffset; 
+
+  int GroupIndex; 
+  int AnyHitResult;
+  int AnyHitStateId;  // Originally temporary. We needed to avoid resource usage
+                      // in ReportHit() because of linking issues so we set the value here first. 
+                      // May be worth retaining to cache the value when fetching the intersection 
+                      // stateId (fetch them both at once). 
+
+  int PayloadOffset;            // these three offsets are int indices into Stack, like StackOffset
+  int CommittedAttrOffset;      
+  int PendingAttrOffset;        
+  
+  int StackOffset; // offset from the start of the stack
+  StackType* Stack;
+} RuntimeData;
+
+typedef RuntimeData* RuntimeDataType;
+
+typedef struct TraceRaySpills_ClosestHit
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+  float ObjectRayOrigin[3];      
+  float ObjectRayDirection[3];   
+
+  unsigned PrimitiveIndex;       
+  unsigned InstanceIndex;        
+  unsigned InstanceID;           
+  unsigned HitKind;              
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_ClosestHit;
+
+typedef struct TraceRaySpills_Miss
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+            
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_Miss;
+
+
+#define REF(x) (runtimeData->x)
+#define REF_FLT(x) (runtimeData->x)
+#define REF_STACK(offset) ((*runtimeData->Stack)[runtimeData->StackOffset + offset])
+#define REF_FLT_OFS(x, offset) (runtimeData->x[offset])
+
+// Return next stateID
+int rewrite_dispatch(RuntimeDataType runtimeData, int stateID);
+void* rewrite_setLaunchParams(RuntimeDataType runtimeData, unsigned dimx, unsigned dimy);
+unsigned rewrite_getStackSize(void);
+StackType* rewrite_createStack(void);
+
+void stackInit(RuntimeDataType runtimeData, StackType* theStack, unsigned stackSize) // Attaches theStack to runtimeData and resets all stack-related offsets.
+{
+  REF(Stack) = theStack;
+  REF(StackOffset) = stackSize/sizeof(int) - 1; // index of the last int slot; stackSize is presumably in bytes
+  REF(PayloadOffset)       = 1111; // recognizable bogus values
+  REF(CommittedAttrOffset) = 2222;
+  REF(PendingAttrOffset)   = 3333;
+}
+
+void stackFramePush(RuntimeDataType runtimeData, int size)
+{
+  REF(StackOffset) -= size;
+}
+
+void stackFramePop(RuntimeDataType runtimeData, int size)
+{ 
+  REF(StackOffset) += size;
+}
+
+int stackFrameOffset(RuntimeDataType runtimeData)
+{
+  return REF(StackOffset);
+}
+
+int payloadOffset(RuntimeDataType runtimeData)
+{
+  return REF(PayloadOffset);
+}
+
+int committedAttrOffset(RuntimeDataType runtimeData)
+{
+  return REF(CommittedAttrOffset);
+}
+
+int pendingAttrOffset(RuntimeDataType runtimeData)
+{
+  return REF(PendingAttrOffset);
+}
+
+int* stackIntPtr(RuntimeDataType runtimeData, int baseOffset, int offset)
+{
+  return &(*runtimeData->Stack)[baseOffset + offset];
+}
+
+
+void traceFramePush(RuntimeDataType runtimeData, int attrSize) // attrSize is subtracted from int stack indices, i.e. it is counted in ints
+{
+  // Save the old committed and pending attribute offsets in the two ints just
+  // below the current stack offset (the payload offset is not saved here)
+  REF_STACK(-1) = REF(CommittedAttrOffset);
+  REF_STACK(-2) = REF(PendingAttrOffset);
+
+  // Set new offsets: committed attributes sit directly below the two saved
+  // offsets, pending attributes directly below the committed ones
+  REF(CommittedAttrOffset) = REF(StackOffset) - 2 - attrSize; 
+  REF(PendingAttrOffset)   = REF(StackOffset) - 2 - 2 * attrSize; 
+}
+
+void traceFramePop(RuntimeDataType runtimeData) // Reads back the two slots written by traceFramePush(); StackOffset must equal its value at push time.
+{
+  // Restore the old attribute offsets
+  REF(CommittedAttrOffset) = REF_STACK(-1); 
+  REF(PendingAttrOffset) = REF_STACK(-2);
+}
+
+void traceRaySave_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  spills->RayFlags              = REF(RayFlags);
+  spills->RayTCurrent           = REF_FLT(RayTCurrent);
+  spills->RayTMin               = REF_FLT(RayTMin);
+  spills->WorldRayOrigin[0]     = REF_FLT(WorldRayOrigin[0]);
+  spills->WorldRayOrigin[1]     = REF_FLT(WorldRayOrigin[1]);
+  spills->WorldRayOrigin[2]     = REF_FLT(WorldRayOrigin[2]);
+  spills->WorldRayDirection[0]  = REF_FLT(WorldRayDirection[0]);
+  spills->WorldRayDirection[1]  = REF_FLT(WorldRayDirection[1]);
+  spills->WorldRayDirection[2]  = REF_FLT(WorldRayDirection[2]);
+  spills->ObjectRayOrigin[0]    = REF_FLT(ObjectRayOrigin[0]);
+  spills->ObjectRayOrigin[1]    = REF_FLT(ObjectRayOrigin[1]);
+  spills->ObjectRayOrigin[2]    = REF_FLT(ObjectRayOrigin[2]);
+  spills->ObjectRayDirection[0] = REF_FLT(ObjectRayDirection[0]);
+  spills->ObjectRayDirection[1] = REF_FLT(ObjectRayDirection[1]);
+  spills->ObjectRayDirection[2] = REF_FLT(ObjectRayDirection[2]);
+
+  spills->PrimitiveIndex      = REF(PrimitiveIndex);       
+  spills->InstanceIndex       = REF(InstanceIndex);        
+  spills->InstanceID          = REF(InstanceID);           
+  spills->HitKind             = REF(HitKind);              
+  spills->ShaderRecordOffset  = REF(ShaderRecordOffset);  
+}
+
+void traceRayRestore_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  REF(RayFlags)                  = spills->RayFlags;               
+  REF_FLT(RayTCurrent)           = spills->RayTCurrent;            
+  REF_FLT(RayTMin)               = spills->RayTMin;                
+  REF_FLT(WorldRayOrigin[0])     = spills->WorldRayOrigin[0];      
+  REF_FLT(WorldRayOrigin[1])     = spills->WorldRayOrigin[1];      
+  REF_FLT(WorldRayOrigin[2])     = spills->WorldRayOrigin[2];      
+  REF_FLT(WorldRayDirection[0])  = spills->WorldRayDirection[0];   
+  REF_FLT(WorldRayDirection[1])  = spills->WorldRayDirection[1];   
+  REF_FLT(WorldRayDirection[2])  = spills->WorldRayDirection[2];   
+  REF_FLT(ObjectRayOrigin[0])    = spills->ObjectRayOrigin[0];     
+  REF_FLT(ObjectRayOrigin[1])    = spills->ObjectRayOrigin[1];     
+  REF_FLT(ObjectRayOrigin[2])    = spills->ObjectRayOrigin[2];     
+  REF_FLT(ObjectRayDirection[0]) = spills->ObjectRayDirection[0];  
+  REF_FLT(ObjectRayDirection[1]) = spills->ObjectRayDirection[1];  
+  REF_FLT(ObjectRayDirection[2]) = spills->ObjectRayDirection[2];  
+
+  REF(PrimitiveIndex)     = spills->PrimitiveIndex;          
+  REF(InstanceIndex)      = spills->InstanceIndex;           
+  REF(InstanceID)         = spills->InstanceID;              
+  REF(HitKind)            = spills->HitKind;                 
+  REF(ShaderRecordOffset) = spills->ShaderRecordOffset;    
+}
+
+void traceRaySave_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  spills->RayFlags              = REF(RayFlags);
+  spills->RayTCurrent           = REF_FLT(RayTCurrent);
+  spills->RayTMin               = REF_FLT(RayTMin);
+  spills->WorldRayOrigin[0]     = REF_FLT(WorldRayOrigin[0]);
+  spills->WorldRayOrigin[1]     = REF_FLT(WorldRayOrigin[1]);
+  spills->WorldRayOrigin[2]     = REF_FLT(WorldRayOrigin[2]);
+  spills->WorldRayDirection[0]  = REF_FLT(WorldRayDirection[0]);
+  spills->WorldRayDirection[1]  = REF_FLT(WorldRayDirection[1]);
+  spills->WorldRayDirection[2]  = REF_FLT(WorldRayDirection[2]);
+
+  spills->ShaderRecordOffset    = REF(ShaderRecordOffset);  
+}
+
+void traceRayRestore_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  REF(RayFlags)                  = spills->RayFlags;               
+  REF_FLT(RayTCurrent)           = spills->RayTCurrent;            
+  REF_FLT(RayTMin)               = spills->RayTMin;                
+  REF_FLT(WorldRayOrigin[0])     = spills->WorldRayOrigin[0];      
+  REF_FLT(WorldRayOrigin[1])     = spills->WorldRayOrigin[1];      
+  REF_FLT(WorldRayOrigin[2])     = spills->WorldRayOrigin[2];      
+  REF_FLT(WorldRayDirection[0])  = spills->WorldRayDirection[0];   
+  REF_FLT(WorldRayDirection[1])  = spills->WorldRayDirection[1];   
+  REF_FLT(WorldRayDirection[2])  = spills->WorldRayDirection[2];   
+
+  REF(ShaderRecordOffset) = spills->ShaderRecordOffset;    
+}
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Intrinsics for the fallback layer
+//
+//////////////////////////////////////////////////////////////////////////
+
+void fb_Fallback_Scheduler(int initialStateId, unsigned dimx, unsigned dimy) // Runs the state machine starting at initialStateId until a negative state ID is returned.
+{
+  StackType* theStack = rewrite_createStack();
+  RuntimeData theRuntimeData;
+  RuntimeDataType runtimeData = &theRuntimeData;
+
+  rewrite_setLaunchParams(runtimeData, dimx, dimy);
+  if(REF(DispatchRaysIndex[0]) >= REF(DispatchRaysDimensions[0]) ||
+     REF(DispatchRaysIndex[1]) >= REF(DispatchRaysDimensions[1]))
+  { 
+    return; // launch index lies outside the dispatch dimensions: nothing to run
+  }
+
+
+  // Set final return stateID into reserved area at stack top
+  unsigned stackSize = rewrite_getStackSize();
+  stackInit(runtimeData, theStack, stackSize);
+  int stackFrameOffs = stackFrameOffset(runtimeData);
+  *stackIntPtr(runtimeData, stackFrameOffs, 0) = -1; // -1 is negative, so returning to it ends the loop below
+
+  int stateId = initialStateId;
+  int count = 0; // NOTE(review): never read or modified in this function - appears to be unused
+  while( stateId >= 0 )
+  {
+    stateId = rewrite_dispatch(runtimeData, stateId); // each state function returns the next state ID
+  }
+}
+
+void fb_Fallback_SetLaunchParams(RuntimeDataType runtimeData, unsigned DTidx, unsigned DTidy, unsigned dimx, unsigned dimy, unsigned groupIndex)
+{ 
+  REF(DispatchRaysIndex[0]) = DTidx;
+  REF(DispatchRaysIndex[1]) = DTidy;
+  REF(DispatchRaysDimensions[0]) = dimx;
+  REF(DispatchRaysDimensions[1]) = dimy;
+
+  REF(GroupIndex) = groupIndex;
+}
+
+int fb_Fallback_TraceRayBegin(RuntimeDataType runtimeData, unsigned rayFlags, float ox, float oy, float oz, float tmin, float dx, float dy, float dz, float tmax, int newPayloadOffset) // Stores the ray description, installs newPayloadOffset and returns the previous payload offset.
+{ 
+  REF(RayFlags) = rayFlags;
+  REF_FLT(WorldRayOrigin[0]) = ox;
+  REF_FLT(WorldRayOrigin[1]) = oy;
+  REF_FLT(WorldRayOrigin[2]) = oz;
+  REF_FLT(WorldRayDirection[0]) = dx;
+  REF_FLT(WorldRayDirection[1]) = dy;
+  REF_FLT(WorldRayDirection[2]) = dz;
+  REF_FLT(RayTCurrent) = tmax; // RayTCurrent starts out at tmax
+  REF_FLT(RayTMin) = tmin;
+
+  int oldOffset = REF(PayloadOffset);
+  REF(PayloadOffset) = newPayloadOffset;
+  return oldOffset; // presumably handed back to fb_Fallback_TraceRayEnd() by the caller
+}
+
+void fb_Fallback_TraceRayEnd(RuntimeDataType runtimeData, int oldPayloadOffset) // Reinstates a payload offset, e.g. the value returned by fb_Fallback_TraceRayBegin().
+{
+  REF(PayloadOffset) = oldPayloadOffset;
+}
+
+void fb_Fallback_SetPendingTriVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID, float t, unsigned hitKind)
+{
+  REF(PendingShaderRecordOffset) = shaderRecordOffset;
+  REF(PendingPrimitiveIndex) = primitiveIndex;
+  REF(PendingInstanceIndex) = instanceIndex;
+  REF(PendingInstanceID) = instanceID;
+  REF_FLT(PendingRayTCurrent) = t;
+  REF(PendingHitKind) = hitKind;
+}
+
+void fb_Fallback_SetPendingCustomVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID)
+{
+  REF(PendingShaderRecordOffset) = shaderRecordOffset;
+  REF(PendingPrimitiveIndex) = primitiveIndex;
+  REF(PendingInstanceIndex) = instanceIndex;
+  REF(PendingInstanceID) = instanceID;
+}
+
+void fb_Fallback_CommitHit(RuntimeDataType runtimeData) // Copies every Pending* hit value into its committed counterpart.
+{
+  REF_FLT(RayTCurrent)    = REF_FLT(PendingRayTCurrent);
+  REF(ShaderRecordOffset) = REF(PendingShaderRecordOffset);
+  REF(PrimitiveIndex)     = REF(PendingPrimitiveIndex);
+  REF(InstanceIndex)      = REF(PendingInstanceIndex);
+  REF(InstanceID)         = REF(PendingInstanceID);
+  REF(HitKind)            = REF(PendingHitKind);  
+
+  int PendingAttrOffset = REF(PendingAttrOffset); // swap the committed and pending attribute offsets
+  REF(PendingAttrOffset) = REF(CommittedAttrOffset);
+  REF(CommittedAttrOffset) = PendingAttrOffset;
+}
+
+
+int fb_Fallback_RuntimeDataLoadInt(RuntimeDataType runtimeData, int offset)
+{
+  return (*runtimeData->Stack)[offset];
+}
+
+void fb_Fallback_RuntimeDataStoreInt(RuntimeDataType runtimeData, int offset, int val)
+{
+  (*runtimeData->Stack)[offset] = val;
+}
+
+unsigned fb_dxop_dispatchRaysIndex(RuntimeDataType runtimeData, byte i)
+{  
+  return REF(DispatchRaysIndex[i]);
+}
+
+unsigned fb_dxop_dispatchRaysDimensions(RuntimeDataType runtimeData, byte i)
+{  
+  return REF(DispatchRaysDimensions[i]);
+}
+
+float fb_dxop_rayTMin(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTMin);
+}
+
+float fb_Fallback_RayTMin(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTMin);
+}
+
+void fb_Fallback_SetRayTMin(RuntimeDataType runtimeData, float t)
+{
+  REF_FLT(RayTMin) = t;
+}
+
+float fb_dxop_rayTCurrent(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTCurrent);
+}
+
+float fb_Fallback_RayTCurrent(RuntimeDataType runtimeData)
+{
+  return REF_FLT(RayTCurrent);
+}
+
+void fb_Fallback_SetRayTCurrent(RuntimeDataType runtimeData, float t)
+{
+  REF_FLT(RayTCurrent) = t;
+}
+
+unsigned fb_dxop_rayFlags(RuntimeDataType runtimeData)
+{
+  return REF(RayFlags);
+}
+
+unsigned fb_Fallback_RayFlags(RuntimeDataType runtimeData)
+{
+  return REF(RayFlags);
+}
+
+void fb_Fallback_SetRayFlags(RuntimeDataType runtimeData, unsigned flags)
+{
+  REF(RayFlags) = flags;
+}
+
+float fb_dxop_worldRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(WorldRayOrigin[i]);
+}
+
+float fb_Fallback_WorldRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(WorldRayOrigin[i]);
+}
+
+void fb_Fallback_SetWorldRayOrigin(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(WorldRayOrigin[0]) = x;
+  REF_FLT(WorldRayOrigin[1]) = y;
+  REF_FLT(WorldRayOrigin[2]) = z;
+}
+
+float fb_dxop_worldRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(WorldRayDirection[i]);
+}
+
+float fb_Fallback_WorldRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(WorldRayDirection[i]);
+}
+
+void fb_Fallback_SetWorldRayDirection(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(WorldRayDirection[0]) = x;
+  REF_FLT(WorldRayDirection[1]) = y;
+  REF_FLT(WorldRayDirection[2]) = z;
+}
+
+float fb_dxop_objectRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(ObjectRayOrigin[i]);
+}
+
+float fb_Fallback_ObjectRayOrigin(RuntimeDataType runtimeData, byte i)
+{ 
+  return REF_FLT(ObjectRayOrigin[i]);
+}
+
+void fb_Fallback_SetObjectRayOrigin(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(ObjectRayOrigin[0]) = x;
+  REF_FLT(ObjectRayOrigin[1]) = y;
+  REF_FLT(ObjectRayOrigin[2]) = z;
+}
+
+float fb_dxop_objectRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(ObjectRayDirection[i]);
+}
+
+float fb_Fallback_ObjectRayDirection(RuntimeDataType runtimeData, byte i)
+{  
+  return REF_FLT(ObjectRayDirection[i]);
+}
+
+void fb_Fallback_SetObjectRayDirection(RuntimeDataType runtimeData, float x, float y, float z)
+{ 
+  REF_FLT(ObjectRayDirection[0]) = x;
+  REF_FLT(ObjectRayDirection[1]) = y;
+  REF_FLT(ObjectRayDirection[2]) = z;
+}
+
+float fb_dxop_objectToWorld(RuntimeDataType runtimeData, int r, byte c)
+{
+  int i = r * 4 + c;
+  return REF_FLT_OFS(ObjectToWorld, i);
+}
+
+void fb_Fallback_SetObjectToWorld(RuntimeDataType runtimeData, float12 M)
+{
+  REF_FLT_OFS(ObjectToWorld, 0)  = M[0]; 
+  REF_FLT_OFS(ObjectToWorld, 1)  = M[1]; 
+  REF_FLT_OFS(ObjectToWorld, 2)  = M[2]; 
+  REF_FLT_OFS(ObjectToWorld, 3)  = M[3]; 
+  REF_FLT_OFS(ObjectToWorld, 4)  = M[4]; 
+  REF_FLT_OFS(ObjectToWorld, 5)  = M[5]; 
+  REF_FLT_OFS(ObjectToWorld, 6)  = M[6]; 
+  REF_FLT_OFS(ObjectToWorld, 7)  = M[7]; 
+  REF_FLT_OFS(ObjectToWorld, 8)  = M[8]; 
+  REF_FLT_OFS(ObjectToWorld, 9)  = M[9]; 
+  REF_FLT_OFS(ObjectToWorld, 10) = M[10];
+  REF_FLT_OFS(ObjectToWorld, 11) = M[11];
+}
+
+float fb_dxop_worldToObject(RuntimeDataType runtimeData, int r, byte c)
+{
+  int i = r * 4 + c;
+  return REF_FLT_OFS(WorldToObject, i);
+}
+
+void fb_Fallback_SetWorldToObject(RuntimeDataType runtimeData, float12 M) // Copies M[0]..M[11] into WorldToObject offsets 0..11.
+{ // Element k of M is stored at flat offset k, the same r * 4 + c addressing that fb_dxop_worldToObject reads with.
+  REF_FLT_OFS(WorldToObject, 0)  = M[0];  // offsets 0-3 correspond to r == 0
+  REF_FLT_OFS(WorldToObject, 1)  = M[1]; 
+  REF_FLT_OFS(WorldToObject, 2)  = M[2]; 
+  REF_FLT_OFS(WorldToObject, 3)  = M[3]; 
+  REF_FLT_OFS(WorldToObject, 4)  = M[4];  // offsets 4-7 correspond to r == 1
+  REF_FLT_OFS(WorldToObject, 5)  = M[5]; 
+  REF_FLT_OFS(WorldToObject, 6)  = M[6]; 
+  REF_FLT_OFS(WorldToObject, 7)  = M[7]; 
+  REF_FLT_OFS(WorldToObject, 8)  = M[8];  // offsets 8-11 correspond to r == 2
+  REF_FLT_OFS(WorldToObject, 9)  = M[9]; 
+  REF_FLT_OFS(WorldToObject, 10) = M[10];
+  REF_FLT_OFS(WorldToObject, 11) = M[11];
+}
+
+// Returns the PrimitiveIndex value read through the REF macro.
+unsigned fb_dxop_primitiveIndex(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(PrimitiveIndex);
+  return current;
+}
+
+// Same read as fb_dxop_primitiveIndex: returns PrimitiveIndex obtained via REF.
+unsigned fb_Fallback_PrimitiveIndex(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(PrimitiveIndex);
+  return current;
+}
+
+// Assigns i to PrimitiveIndex through the REF macro.
+void fb_Fallback_SetPrimitiveIndex(RuntimeDataType runtimeData, unsigned i)
+{
+  const unsigned updated = i;
+  REF(PrimitiveIndex) = updated;
+}
+
+// Returns the ShaderRecordOffset value read through the REF macro.
+unsigned fb_Fallback_ShaderRecordOffset(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(ShaderRecordOffset);
+  return current;
+}
+
+// Assigns shaderRecordOffset to ShaderRecordOffset through the REF macro.
+void fb_Fallback_SetShaderRecordOffset(RuntimeDataType runtimeData, unsigned shaderRecordOffset)
+{
+  const unsigned updated = shaderRecordOffset;
+  REF(ShaderRecordOffset) = updated;
+}
+
+// Returns the InstanceIndex value read through the REF macro.
+unsigned fb_dxop_instanceIndex(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(InstanceIndex);
+  return current;
+}
+
+// Same read as fb_dxop_instanceIndex: returns InstanceIndex obtained via REF.
+unsigned fb_Fallback_InstanceIndex(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(InstanceIndex);
+  return current;
+}
+
+// Assigns i to InstanceIndex through the REF macro.
+void fb_Fallback_SetInstanceIndex(RuntimeDataType runtimeData, unsigned i)
+{
+  const unsigned updated = i;
+  REF(InstanceIndex) = updated;
+}
+
+// Returns the InstanceID value read through the REF macro.
+unsigned fb_dxop_instanceID(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(InstanceID);
+  return current;
+}
+
+// Same read as fb_dxop_instanceID: returns InstanceID obtained via REF.
+unsigned fb_Fallback_InstanceID(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(InstanceID);
+  return current;
+}
+
+// Assigns i to InstanceID through the REF macro.
+void fb_Fallback_SetInstanceID(RuntimeDataType runtimeData, unsigned i)
+{
+  const unsigned updated = i;
+  REF(InstanceID) = updated;
+}
+
+// Returns the HitKind value read through the REF macro.
+unsigned fb_dxop_hitKind(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(HitKind);
+  return current;
+}
+
+// Same read as fb_dxop_hitKind: returns HitKind obtained via REF.
+unsigned fb_Fallback_HitKind(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(HitKind);
+  return current;
+}
+
+// Assigns i to HitKind through the REF macro.
+void fb_Fallback_SetHitKind(RuntimeDataType runtimeData, unsigned i)
+{
+  const unsigned updated = i;
+  REF(HitKind) = updated;
+}
+
+// Returns the PendingRayTCurrent value read through the REF_FLT macro.
+float fb_dxop_pending_rayTCurrent(RuntimeDataType runtimeData)
+{
+  const float pendingT = REF_FLT(PendingRayTCurrent);
+  return pendingT;
+}
+
+// Assigns t to PendingRayTCurrent through the REF_FLT macro.
+void fb_Fallback_SetPendingRayTCurrent(RuntimeDataType runtimeData, float t)
+{
+  const float updated = t;
+  REF_FLT(PendingRayTCurrent) = updated;
+}
+
+// Returns the PendingPrimitiveIndex value read through the REF macro.
+// Note the naming mismatch: the entry point says "primitiveID" while the value
+// it reads is "PendingPrimitiveIndex"; the alternative name is kept, commented
+// out, directly below the signature.
+unsigned fb_dxop_pending_primitiveID(RuntimeDataType runtimeData)
+//unsigned fb_dxop_pending_primitiveIndex(RuntimeDataType runtimeData)
+{
+  const unsigned pendingValue = REF(PendingPrimitiveIndex);
+  return pendingValue;
+}
+
+// Returns the PendingShaderRecordOffset value read through the REF macro.
+unsigned fb_Fallback_PendingShaderRecordOffset(RuntimeDataType runtimeData)
+{
+  const unsigned pendingValue = REF(PendingShaderRecordOffset);
+  return pendingValue;
+}
+
+// Returns the PendingInstanceIndex value read through the REF macro.
+unsigned fb_dxop_pending_instanceIndex(RuntimeDataType runtimeData)
+{
+  const unsigned pendingValue = REF(PendingInstanceIndex);
+  return pendingValue;
+}
+
+// Returns the PendingInstanceID value read through the REF macro.
+unsigned fb_dxop_pending_instanceID(RuntimeDataType runtimeData)
+{
+  const unsigned pendingValue = REF(PendingInstanceID);
+  return pendingValue;
+}
+
+// Returns the PendingHitKind value read through the REF macro.
+unsigned fb_dxop_pending_hitKind(RuntimeDataType runtimeData)
+{
+  const unsigned pendingValue = REF(PendingHitKind);
+  return pendingValue;
+}
+
+// Assigns i to PendingHitKind through the REF macro.
+void fb_Fallback_SetPendingHitKind(RuntimeDataType runtimeData, unsigned i)
+{
+  const unsigned updated = i;
+  REF(PendingHitKind) = updated;
+}
+
+// Returns the GroupIndex value read through the REF macro.
+unsigned fb_Fallback_GroupIndex(RuntimeDataType runtimeData)
+{
+  const unsigned current = REF(GroupIndex);
+  return current;
+}
+
+// Returns the AnyHitResult value read through the REF macro.
+int fb_Fallback_AnyHitResult(RuntimeDataType runtimeData)
+{
+  const int current = REF(AnyHitResult);
+  return current;
+}
+
+// Assigns result to AnyHitResult through the REF macro.
+void fb_Fallback_SetAnyHitResult(RuntimeDataType runtimeData, int result)
+{
+  const int updated = result;
+  REF(AnyHitResult) = updated;
+}
+
+// Returns the AnyHitStateId value read through the REF macro.
+int fb_Fallback_AnyHitStateId(RuntimeDataType runtimeData)
+{
+  const int current = REF(AnyHitStateId);
+  return current;
+}
+
+// Assigns id to AnyHitStateId through the REF macro.
+void fb_Fallback_SetAnyHitStateId(RuntimeDataType runtimeData, int id)
+{
+  const int updated = id;
+  REF(AnyHitStateId) = updated;
+}
+
+#endif
+
+static const char* runtimeString[] = { R"AAA(
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f:64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+
+
+%struct.RuntimeDataStruct = type { [2 x i32], [2 x i32], float, float, i32, [3 x float], [3 x float], [3 x float], [3 x float], [12 x float], [12 x float], i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [256 x i32]* }
+%struct.TraceRaySpills_ClosestHit = type { float, float, i32, [3 x float], [3 x float], [3 x float], [3 x float], i32, i32, i32, i32, i32 }
+%struct.TraceRaySpills_Miss = type { float, float, i32, [3 x float], [3 x float], i32 }
+
+; Function Attrs: nounwind
+define void @stackInit(%struct.RuntimeDataStruct* %runtimeData, [256 x i32]* %theStack, i32 %stackSize) #0 {
+entry:
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  store [256 x i32]* %theStack, [256 x i32]** %Stack, align 4
+  %div = udiv i32 %stackSize, 4
+  %sub = sub i32 %div, 1
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  store i32 %sub, i32* %StackOffset, align 4
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  store i32 1111, i32* %PayloadOffset, align 4
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 2222, i32* %CommittedAttrOffset, align 4
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 3333, i32* %PendingAttrOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @stackFramePush(%struct.RuntimeDataStruct* %runtimeData, i32 %size) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  %sub = sub nsw i32 %0, %size
+  store i32 %sub, i32* %StackOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @stackFramePop(%struct.RuntimeDataStruct* %runtimeData, i32 %size) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  %add = add nsw i32 %0, %size
+  store i32 %add, i32* %StackOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @stackFrameOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @payloadOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  %0 = load i32, i32* %PayloadOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @committedAttrOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  %0 = load i32, i32* %CommittedAttrOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @pendingAttrOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  %0 = load i32, i32* %PendingAttrOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32* @stackIntPtr(%struct.RuntimeDataStruct* %runtimeData, i32 %baseOffset, i32 %offset) #0 {
+entry:
+  %add = add nsw i32 %baseOffset, %offset
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %0 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %0, i32 0, i32 %add
+  ret i32* %arrayidx
+}
+
+; Function Attrs: nounwind
+define void @traceFramePush(%struct.RuntimeDataStruct* %runtimeData, i32 %attrSize) #0 {
+entry:
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  %0 = load i32, i32* %CommittedAttrOffset, align 4
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %1 = load i32, i32* %StackOffset, align 4
+  %add = add nsw i32 %1, -1
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %2 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %2, i32 0, i32 %add
+  store i32 %0, i32* %arrayidx, align 4
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  %3 = load i32, i32* %PendingAttrOffset, align 4
+  %StackOffset1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %4 = load i32, i32* %StackOffset1, align 4
+  %add2 = add nsw i32 %4, -2
+  %Stack3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %5 = load [256 x i32]*, [256 x i32]** %Stack3, align 4
+  %arrayidx4 = getelementptr inbounds [256 x i32], [256 x i32]* %5, i32 0, i32 %add2
+  store i32 %3, i32* %arrayidx4, align 4
+  %StackOffset5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %6 = load i32, i32* %StackOffset5, align 4
+  %sub = sub nsw i32 %6, 2
+  %sub6 = sub nsw i32 %sub, %attrSize
+  %CommittedAttrOffset7 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 %sub6, i32* %CommittedAttrOffset7, align 4
+  %StackOffset8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %7 = load i32, i32* %StackOffset8, align 4
+  %sub9 = sub nsw i32 %7, 2
+  %mul = mul nsw i32 2, %attrSize
+  %sub10 = sub nsw i32 %sub9, %mul
+  %PendingAttrOffset11 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 %sub10, i32* %PendingAttrOffset11, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceFramePop(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %StackOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %0 = load i32, i32* %StackOffset, align 4
+  %add = add nsw i32 %0, -1
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %1 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %1, i32 0, i32 %add
+  %2 = load i32, i32* %arrayidx, align 4
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 %2, i32* %CommittedAttrOffset, align 4
+  %StackOffset1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 28
+  %3 = load i32, i32* %StackOffset1, align 4
+  %add2 = add nsw i32 %3, -2
+  %Stack3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %4 = load [256 x i32]*, [256 x i32]** %Stack3, align 4
+  %arrayidx4 = getelementptr inbounds [256 x i32], [256 x i32]* %4, i32 0, i32 %add2
+  %5 = load i32, i32* %arrayidx4, align 4
+  %PendingAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 %5, i32* %PendingAttrOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceRaySave_ClosestHit(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_ClosestHit* %spills) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 2
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 1
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 0
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+)AAA",
+R"AAA(
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ObjectRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx25 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 0
+  %9 = load float, float* %arrayidx25, align 4
+  %ObjectRayOrigin26 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx27 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin26, i32 0, i32 0
+  store float %9, float* %arrayidx27, align 4
+  %ObjectRayOrigin28 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx29 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin28, i32 0, i32 1
+  %10 = load float, float* %arrayidx29, align 4
+  %ObjectRayOrigin30 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx31 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin30, i32 0, i32 1
+  store float %10, float* %arrayidx31, align 4
+  %ObjectRayOrigin32 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx33 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin32, i32 0, i32 2
+  %11 = load float, float* %arrayidx33, align 4
+  %ObjectRayOrigin34 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx35 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin34, i32 0, i32 2
+  store float %11, float* %arrayidx35, align 4
+  %ObjectRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx36 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 0
+  %12 = load float, float* %arrayidx36, align 4
+  %ObjectRayDirection37 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx38 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection37, i32 0, i32 0
+  store float %12, float* %arrayidx38, align 4
+  %ObjectRayDirection39 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx40 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection39, i32 0, i32 1
+  %13 = load float, float* %arrayidx40, align 4
+  %ObjectRayDirection41 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx42 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection41, i32 0, i32 1
+  store float %13, float* %arrayidx42, align 4
+  %ObjectRayDirection43 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx44 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection43, i32 0, i32 2
+  %14 = load float, float* %arrayidx44, align 4
+  %ObjectRayDirection45 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx46 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection45, i32 0, i32 2
+  store float %14, float* %arrayidx46, align 4
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  %15 = load i32, i32* %PrimitiveIndex, align 4
+  %PrimitiveIndex47 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 7
+  store i32 %15, i32* %PrimitiveIndex47, align 4
+  %InstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  %16 = load i32, i32* %InstanceIndex, align 4
+  %InstanceIndex48 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 8
+  store i32 %16, i32* %InstanceIndex48, align 4
+  %InstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  %17 = load i32, i32* %InstanceID, align 4
+  %InstanceID49 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 9
+  store i32 %17, i32* %InstanceID49, align 4
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  %18 = load i32, i32* %HitKind, align 4
+  %HitKind50 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 10
+  store i32 %18, i32* %HitKind50, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  %19 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset51 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 11
+  store i32 %19, i32* %ShaderRecordOffset51, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceRayRestore_ClosestHit(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_ClosestHit* %spills) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 2
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 1
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 0
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 3
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+)AAA",
+R"AAA(
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 4
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ObjectRayOrigin = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx25 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 0
+  %9 = load float, float* %arrayidx25, align 4
+  %ObjectRayOrigin26 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx27 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin26, i32 0, i32 0
+  store float %9, float* %arrayidx27, align 4
+  %ObjectRayOrigin28 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx29 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin28, i32 0, i32 1
+  %10 = load float, float* %arrayidx29, align 4
+  %ObjectRayOrigin30 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx31 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin30, i32 0, i32 1
+  store float %10, float* %arrayidx31, align 4
+  %ObjectRayOrigin32 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 5
+  %arrayidx33 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin32, i32 0, i32 2
+  %11 = load float, float* %arrayidx33, align 4
+  %ObjectRayOrigin34 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx35 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin34, i32 0, i32 2
+  store float %11, float* %arrayidx35, align 4
+  %ObjectRayDirection = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx36 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 0
+  %12 = load float, float* %arrayidx36, align 4
+  %ObjectRayDirection37 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx38 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection37, i32 0, i32 0
+  store float %12, float* %arrayidx38, align 4
+  %ObjectRayDirection39 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx40 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection39, i32 0, i32 1
+  %13 = load float, float* %arrayidx40, align 4
+  %ObjectRayDirection41 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx42 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection41, i32 0, i32 1
+  store float %13, float* %arrayidx42, align 4
+  %ObjectRayDirection43 = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 6
+  %arrayidx44 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection43, i32 0, i32 2
+  %14 = load float, float* %arrayidx44, align 4
+  %ObjectRayDirection45 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx46 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection45, i32 0, i32 2
+  store float %14, float* %arrayidx46, align 4
+  %PrimitiveIndex = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 7
+  %15 = load i32, i32* %PrimitiveIndex, align 4
+  %PrimitiveIndex47 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  store i32 %15, i32* %PrimitiveIndex47, align 4
+  %InstanceIndex = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 8
+  %16 = load i32, i32* %InstanceIndex, align 4
+  %InstanceIndex48 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  store i32 %16, i32* %InstanceIndex48, align 4
+  %InstanceID = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 9
+  %17 = load i32, i32* %InstanceID, align 4
+  %InstanceID49 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  store i32 %17, i32* %InstanceID49, align 4
+  %HitKind = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 10
+  %18 = load i32, i32* %HitKind, align 4
+  %HitKind50 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  store i32 %18, i32* %HitKind50, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.TraceRaySpills_ClosestHit, %struct.TraceRaySpills_ClosestHit* %spills, i32 0, i32 11
+  %19 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset51 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %19, i32* %ShaderRecordOffset51, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceRaySave_Miss(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_Miss* %spills) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 2
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 1
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 0
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+)AAA",
+R"AAA(
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  %9 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset25 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 5
+  store i32 %9, i32* %ShaderRecordOffset25, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @traceRayRestore_Miss(%struct.RuntimeDataStruct* %runtimeData, %struct.TraceRaySpills_Miss* %spills) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 2
+  %0 = load i32, i32* %RayFlags, align 4
+  %RayFlags1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %0, i32* %RayFlags1, align 4
+  %RayTCurrent = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 1
+  %1 = load float, float* %RayTCurrent, align 4
+  %RayTCurrent2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %1, float* %RayTCurrent2, align 4
+  %RayTMin = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 0
+  %2 = load float, float* %RayTMin, align 4
+  %RayTMin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %2, float* %RayTMin3, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  %3 = load float, float* %arrayidx, align 4
+  %WorldRayOrigin4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin4, i32 0, i32 0
+  store float %3, float* %arrayidx5, align 4
+  %WorldRayOrigin6 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin6, i32 0, i32 1
+  %4 = load float, float* %arrayidx7, align 4
+  %WorldRayOrigin8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin8, i32 0, i32 1
+  store float %4, float* %arrayidx9, align 4
+  %WorldRayOrigin10 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 3
+  %arrayidx11 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin10, i32 0, i32 2
+  %5 = load float, float* %arrayidx11, align 4
+  %WorldRayOrigin12 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx13 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin12, i32 0, i32 2
+  store float %5, float* %arrayidx13, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx14 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  %6 = load float, float* %arrayidx14, align 4
+  %WorldRayDirection15 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx16 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection15, i32 0, i32 0
+  store float %6, float* %arrayidx16, align 4
+  %WorldRayDirection17 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx18 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection17, i32 0, i32 1
+  %7 = load float, float* %arrayidx18, align 4
+  %WorldRayDirection19 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx20 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection19, i32 0, i32 1
+  store float %7, float* %arrayidx20, align 4
+  %WorldRayDirection21 = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 4
+  %arrayidx22 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection21, i32 0, i32 2
+  %8 = load float, float* %arrayidx22, align 4
+  %WorldRayDirection23 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx24 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection23, i32 0, i32 2
+  store float %8, float* %arrayidx24, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.TraceRaySpills_Miss, %struct.TraceRaySpills_Miss* %spills, i32 0, i32 5
+  %9 = load i32, i32* %ShaderRecordOffset, align 4
+  %ShaderRecordOffset25 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %9, i32* %ShaderRecordOffset25, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_Scheduler(i32 %initialStateId, i32 %dimx, i32 %dimy) #0 {
+entry:
+  %theRuntimeData = alloca %struct.RuntimeDataStruct, align 4
+  %call = call [256 x i32]* @rewrite_createStack()
+  %call1 = call i8* @rewrite_setLaunchParams(%struct.RuntimeDataStruct* %theRuntimeData, i32 %dimx, i32 %dimy)
+  %DispatchRaysIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex, i32 0, i32 0
+  %0 = load i32, i32* %arrayidx, align 4
+  %DispatchRaysDimensions = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 1
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions, i32 0, i32 0
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp = icmp sge i32 %0, %1
+  br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %DispatchRaysIndex3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 0
+  %arrayidx4 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex3, i32 0, i32 1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %DispatchRaysDimensions5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %theRuntimeData, i32 0, i32 1
+  %arrayidx6 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions5, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx6, align 4
+  %cmp7 = icmp sge i32 %2, %3
+  br i1 %cmp7, label %if.then, label %if.end
+
+if.then:                                          ; preds = %lor.lhs.false, %entry
+  br label %while.end
+
+if.end:                                           ; preds = %lor.lhs.false
+  %call8 = call i32 @rewrite_getStackSize()
+  call void @stackInit(%struct.RuntimeDataStruct* %theRuntimeData, [256 x i32]* %call, i32 %call8)
+  %call9 = call i32 @stackFrameOffset(%struct.RuntimeDataStruct* %theRuntimeData)
+  %call10 = call i32* @stackIntPtr(%struct.RuntimeDataStruct* %theRuntimeData, i32 %call9, i32 0)
+  store i32 -1, i32* %call10, align 4
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %if.end
+  %stateId.0 = phi i32 [ %initialStateId, %if.end ], [ %call12, %while.body ]
+  %cmp11 = icmp sge i32 %stateId.0, 0
+  br i1 %cmp11, label %while.body, label %while.end
+
+while.body:                                       ; preds = %while.cond
+  %call12 = call i32 @rewrite_dispatch(%struct.RuntimeDataStruct* %theRuntimeData, i32 %stateId.0)
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond, %if.then
+  ret void
+}
+
+declare [256 x i32]* @rewrite_createStack() #1
+
+declare i8* @rewrite_setLaunchParams(%struct.RuntimeDataStruct*, i32, i32) #1
+
+declare i32 @rewrite_getStackSize() #1
+
+declare i32 @rewrite_dispatch(%struct.RuntimeDataStruct*, i32) #1
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetLaunchParams(%struct.RuntimeDataStruct* %runtimeData, i32 %DTidx, i32 %DTidy, i32 %dimx, i32 %dimy, i32 %groupIndex) #0 {
+entry:
+  %DispatchRaysIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex, i32 0, i32 0
+  store i32 %DTidx, i32* %arrayidx, align 4
+  %DispatchRaysIndex1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 0
+  %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex1, i32 0, i32 1
+  store i32 %DTidy, i32* %arrayidx2, align 4
+  %DispatchRaysDimensions = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 1
+  %arrayidx3 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions, i32 0, i32 0
+  store i32 %dimx, i32* %arrayidx3, align 4
+  %DispatchRaysDimensions4 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 1
+  %arrayidx5 = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions4, i32 0, i32 1
+)AAA",
+R"AAA(
+  store i32 %dimy, i32* %arrayidx5, align 4
+  %GroupIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 22
+  store i32 %groupIndex, i32* %GroupIndex, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_TraceRayBegin(%struct.RuntimeDataStruct* %runtimeData, i32 %rayFlags, float %ox, float %oy, float %oz, float %tmin, float %dx, float %dy, float %dz, float %tmax, i32 %newPayloadOffset) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %rayFlags, i32* %RayFlags, align 4
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  store float %ox, float* %arrayidx, align 4
+  %WorldRayOrigin1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin1, i32 0, i32 1
+  store float %oy, float* %arrayidx2, align 4
+  %WorldRayOrigin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin3, i32 0, i32 2
+  store float %oz, float* %arrayidx4, align 4
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx5 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  store float %dx, float* %arrayidx5, align 4
+  %WorldRayDirection6 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx7 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection6, i32 0, i32 1
+  store float %dy, float* %arrayidx7, align 4
+  %WorldRayDirection8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx9 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection8, i32 0, i32 2
+  store float %dz, float* %arrayidx9, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %tmax, float* %RayTCurrent, align 4
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %tmin, float* %RayTMin, align 4
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  %0 = load i32, i32* %PayloadOffset, align 4
+  %PayloadOffset10 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  store i32 %newPayloadOffset, i32* %PayloadOffset10, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_TraceRayEnd(%struct.RuntimeDataStruct* %runtimeData, i32 %oldPayloadOffset) #0 {
+entry:
+  %PayloadOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 25
+  store i32 %oldPayloadOffset, i32* %PayloadOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPendingTriVals(%struct.RuntimeDataStruct* %runtimeData, i32 %shaderRecordOffset, i32 %primitiveIndex, i32 %instanceIndex, i32 %instanceID, float %t, i32 %hitKind) #0 {
+entry:
+  %PendingShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  store i32 %shaderRecordOffset, i32* %PendingShaderRecordOffset, align 4
+  %PendingPrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  store i32 %primitiveIndex, i32* %PendingPrimitiveIndex, align 4
+  %PendingInstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  store i32 %instanceIndex, i32* %PendingInstanceIndex, align 4
+  %PendingInstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  store i32 %instanceID, i32* %PendingInstanceID, align 4
+  %PendingRayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  store float %t, float* %PendingRayTCurrent, align 4
+  %PendingHitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  store i32 %hitKind, i32* %PendingHitKind, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPendingCustomVals(%struct.RuntimeDataStruct* %runtimeData, i32 %shaderRecordOffset, i32 %primitiveIndex, i32 %instanceIndex, i32 %instanceID) #0 {
+entry:
+  %PendingShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  store i32 %shaderRecordOffset, i32* %PendingShaderRecordOffset, align 4
+  %PendingPrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  store i32 %primitiveIndex, i32* %PendingPrimitiveIndex, align 4
+  %PendingInstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  store i32 %instanceIndex, i32* %PendingInstanceIndex, align 4
+  %PendingInstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  store i32 %instanceID, i32* %PendingInstanceID, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_CommitHit(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingRayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  %0 = load float, float* %PendingRayTCurrent, align 4
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %0, float* %RayTCurrent, align 4
+  %PendingShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  %1 = load i32, i32* %PendingShaderRecordOffset, align 4
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %1, i32* %ShaderRecordOffset, align 4
+  %PendingPrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  %2 = load i32, i32* %PendingPrimitiveIndex, align 4
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  store i32 %2, i32* %PrimitiveIndex, align 4
+  %PendingInstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  %3 = load i32, i32* %PendingInstanceIndex, align 4
+  %InstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  store i32 %3, i32* %InstanceIndex, align 4
+  %PendingInstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  %4 = load i32, i32* %PendingInstanceID, align 4
+  %InstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  store i32 %4, i32* %InstanceID, align 4
+  %PendingHitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  %5 = load i32, i32* %PendingHitKind, align 4
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  store i32 %5, i32* %HitKind, align 4
+  %PendingAttrOffset1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  %6 = load i32, i32* %PendingAttrOffset1, align 4
+  %CommittedAttrOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  %7 = load i32, i32* %CommittedAttrOffset, align 4
+  %PendingAttrOffset2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 27
+  store i32 %7, i32* %PendingAttrOffset2, align 4
+  %CommittedAttrOffset3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 26
+  store i32 %6, i32* %CommittedAttrOffset3, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_RuntimeDataLoadInt(%struct.RuntimeDataStruct* %runtimeData, i32 %offset) #0 {
+entry:
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %0 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %0, i32 0, i32 %offset
+  %1 = load i32, i32* %arrayidx, align 4
+  ret i32 %1
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_RuntimeDataStoreInt(%struct.RuntimeDataStruct* %runtimeData, i32 %offset, i32 %val) #0 {
+entry:
+  %Stack = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 29
+  %0 = load [256 x i32]*, [256 x i32]** %Stack, align 4
+  %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %0, i32 0, i32 %offset
+  store i32 %val, i32* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_dispatchRaysIndex(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %DispatchRaysIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysIndex, i32 0, i32 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_dispatchRaysDimensions(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+)AAA",
+R"AAA(
+entry:
+  %idxprom = zext i8 %i to i32
+  %DispatchRaysDimensions = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 1
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %DispatchRaysDimensions, i32 0, i32 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_rayTMin(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %0 = load float, float* %RayTMin, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define float @fb_Fallback_RayTMin(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  %0 = load float, float* %RayTMin, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetRayTMin(%struct.RuntimeDataStruct* %runtimeData, float %t) #0 {
+entry:
+  %RayTMin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 2
+  store float %t, float* %RayTMin, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_rayTCurrent(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %0 = load float, float* %RayTCurrent, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define float @fb_Fallback_RayTCurrent(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  %0 = load float, float* %RayTCurrent, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetRayTCurrent(%struct.RuntimeDataStruct* %runtimeData, float %t) #0 {
+entry:
+  %RayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 3
+  store float %t, float* %RayTCurrent, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_rayFlags(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %0 = load i32, i32* %RayFlags, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_RayFlags(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  %0 = load i32, i32* %RayFlags, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetRayFlags(%struct.RuntimeDataStruct* %runtimeData, i32 %flags) #0 {
+entry:
+  %RayFlags = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 4
+  store i32 %flags, i32* %RayFlags, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_worldRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define float @fb_Fallback_WorldRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetWorldRayOrigin(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %WorldRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin, i32 0, i32 0
+  store float %x, float* %arrayidx, align 4
+  %WorldRayOrigin1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin1, i32 0, i32 1
+  store float %y, float* %arrayidx2, align 4
+  %WorldRayOrigin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 5
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayOrigin3, i32 0, i32 2
+  store float %z, float* %arrayidx4, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_worldRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define float @fb_Fallback_WorldRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetWorldRayDirection(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %WorldRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection, i32 0, i32 0
+  store float %x, float* %arrayidx, align 4
+  %WorldRayDirection1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection1, i32 0, i32 1
+  store float %y, float* %arrayidx2, align 4
+  %WorldRayDirection3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 6
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %WorldRayDirection3, i32 0, i32 2
+  store float %z, float* %arrayidx4, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_objectRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %ObjectRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define float @fb_Fallback_ObjectRayOrigin(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %ObjectRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetObjectRayOrigin(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %ObjectRayOrigin = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin, i32 0, i32 0
+  store float %x, float* %arrayidx, align 4
+  %ObjectRayOrigin1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin1, i32 0, i32 1
+  store float %y, float* %arrayidx2, align 4
+  %ObjectRayOrigin3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 7
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayOrigin3, i32 0, i32 2
+  store float %z, float* %arrayidx4, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_objectRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %ObjectRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define float @fb_Fallback_ObjectRayDirection(%struct.RuntimeDataStruct* %runtimeData, i8 zeroext %i) #0 {
+entry:
+  %idxprom = zext i8 %i to i32
+  %ObjectRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetObjectRayDirection(%struct.RuntimeDataStruct* %runtimeData, float %x, float %y, float %z) #0 {
+entry:
+  %ObjectRayDirection = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection, i32 0, i32 0
+  store float %x, float* %arrayidx, align 4
+  %ObjectRayDirection1 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx2 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection1, i32 0, i32 1
+)AAA",
+R"AAA(
+  store float %y, float* %arrayidx2, align 4
+  %ObjectRayDirection3 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 8
+  %arrayidx4 = getelementptr inbounds [3 x float], [3 x float]* %ObjectRayDirection3, i32 0, i32 2
+  store float %z, float* %arrayidx4, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_objectToWorld(%struct.RuntimeDataStruct* %runtimeData, i32 %r, i8 zeroext %c) #0 {
+entry:
+  %mul = mul nsw i32 %r, 4
+  %conv = zext i8 %c to i32
+  %add = add nsw i32 %mul, %conv
+  %ObjectToWorld = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld, i32 0, i32 %add
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetObjectToWorld(%struct.RuntimeDataStruct* %runtimeData, <12 x float> %M) #0 {
+entry:
+  %vecext = extractelement <12 x float> %M, i32 0
+  %ObjectToWorld = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld, i32 0, i32 0
+  store float %vecext, float* %arrayidx, align 4
+  %vecext1 = extractelement <12 x float> %M, i32 1
+  %ObjectToWorld2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx3 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld2, i32 0, i32 1
+  store float %vecext1, float* %arrayidx3, align 4
+  %vecext4 = extractelement <12 x float> %M, i32 2
+  %ObjectToWorld5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx6 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld5, i32 0, i32 2
+  store float %vecext4, float* %arrayidx6, align 4
+  %vecext7 = extractelement <12 x float> %M, i32 3
+  %ObjectToWorld8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx9 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld8, i32 0, i32 3
+  store float %vecext7, float* %arrayidx9, align 4
+  %vecext10 = extractelement <12 x float> %M, i32 4
+  %ObjectToWorld11 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx12 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld11, i32 0, i32 4
+  store float %vecext10, float* %arrayidx12, align 4
+  %vecext13 = extractelement <12 x float> %M, i32 5
+  %ObjectToWorld14 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx15 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld14, i32 0, i32 5
+  store float %vecext13, float* %arrayidx15, align 4
+  %vecext16 = extractelement <12 x float> %M, i32 6
+  %ObjectToWorld17 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx18 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld17, i32 0, i32 6
+  store float %vecext16, float* %arrayidx18, align 4
+  %vecext19 = extractelement <12 x float> %M, i32 7
+  %ObjectToWorld20 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx21 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld20, i32 0, i32 7
+  store float %vecext19, float* %arrayidx21, align 4
+  %vecext22 = extractelement <12 x float> %M, i32 8
+  %ObjectToWorld23 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx24 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld23, i32 0, i32 8
+  store float %vecext22, float* %arrayidx24, align 4
+  %vecext25 = extractelement <12 x float> %M, i32 9
+  %ObjectToWorld26 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx27 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld26, i32 0, i32 9
+  store float %vecext25, float* %arrayidx27, align 4
+  %vecext28 = extractelement <12 x float> %M, i32 10
+  %ObjectToWorld29 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx30 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld29, i32 0, i32 10
+  store float %vecext28, float* %arrayidx30, align 4
+  %vecext31 = extractelement <12 x float> %M, i32 11
+  %ObjectToWorld32 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 9
+  %arrayidx33 = getelementptr inbounds [12 x float], [12 x float]* %ObjectToWorld32, i32 0, i32 11
+  store float %vecext31, float* %arrayidx33, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_worldToObject(%struct.RuntimeDataStruct* %runtimeData, i32 %r, i8 zeroext %c) #0 {
+entry:
+  %mul = mul nsw i32 %r, 4
+  %conv = zext i8 %c to i32
+  %add = add nsw i32 %mul, %conv
+  %WorldToObject = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject, i32 0, i32 %add
+  %0 = load float, float* %arrayidx, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetWorldToObject(%struct.RuntimeDataStruct* %runtimeData, <12 x float> %M) #0 {
+entry:
+  %vecext = extractelement <12 x float> %M, i32 0
+  %WorldToObject = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject, i32 0, i32 0
+  store float %vecext, float* %arrayidx, align 4
+  %vecext1 = extractelement <12 x float> %M, i32 1
+  %WorldToObject2 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx3 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject2, i32 0, i32 1
+  store float %vecext1, float* %arrayidx3, align 4
+  %vecext4 = extractelement <12 x float> %M, i32 2
+  %WorldToObject5 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx6 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject5, i32 0, i32 2
+  store float %vecext4, float* %arrayidx6, align 4
+  %vecext7 = extractelement <12 x float> %M, i32 3
+  %WorldToObject8 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx9 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject8, i32 0, i32 3
+  store float %vecext7, float* %arrayidx9, align 4
+  %vecext10 = extractelement <12 x float> %M, i32 4
+  %WorldToObject11 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx12 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject11, i32 0, i32 4
+  store float %vecext10, float* %arrayidx12, align 4
+  %vecext13 = extractelement <12 x float> %M, i32 5
+  %WorldToObject14 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx15 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject14, i32 0, i32 5
+  store float %vecext13, float* %arrayidx15, align 4
+  %vecext16 = extractelement <12 x float> %M, i32 6
+  %WorldToObject17 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx18 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject17, i32 0, i32 6
+  store float %vecext16, float* %arrayidx18, align 4
+  %vecext19 = extractelement <12 x float> %M, i32 7
+  %WorldToObject20 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx21 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject20, i32 0, i32 7
+  store float %vecext19, float* %arrayidx21, align 4
+  %vecext22 = extractelement <12 x float> %M, i32 8
+  %WorldToObject23 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx24 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject23, i32 0, i32 8
+  store float %vecext22, float* %arrayidx24, align 4
+  %vecext25 = extractelement <12 x float> %M, i32 9
+  %WorldToObject26 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx27 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject26, i32 0, i32 9
+  store float %vecext25, float* %arrayidx27, align 4
+  %vecext28 = extractelement <12 x float> %M, i32 10
+  %WorldToObject29 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx30 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject29, i32 0, i32 10
+  store float %vecext28, float* %arrayidx30, align 4
+  %vecext31 = extractelement <12 x float> %M, i32 11
+  %WorldToObject32 = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 10
+  %arrayidx33 = getelementptr inbounds [12 x float], [12 x float]* %WorldToObject32, i32 0, i32 11
+  store float %vecext31, float* %arrayidx33, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_primitiveIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  %0 = load i32, i32* %PrimitiveIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_PrimitiveIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  %0 = load i32, i32* %PrimitiveIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPrimitiveIndex(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+)AAA",
+R"AAA(
+entry:
+  %PrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 11
+  store i32 %i, i32* %PrimitiveIndex, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_ShaderRecordOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  %0 = load i32, i32* %ShaderRecordOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetShaderRecordOffset(%struct.RuntimeDataStruct* %runtimeData, i32 %shaderRecordOffset) #0 {
+entry:
+  %ShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 15
+  store i32 %shaderRecordOffset, i32* %ShaderRecordOffset, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_instanceIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %InstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  %0 = load i32, i32* %InstanceIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_InstanceIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %InstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  %0 = load i32, i32* %InstanceIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetInstanceIndex(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %InstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 12
+  store i32 %i, i32* %InstanceIndex, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_instanceID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %InstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  %0 = load i32, i32* %InstanceID, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_InstanceID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %InstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  %0 = load i32, i32* %InstanceID, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetInstanceID(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %InstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 13
+  store i32 %i, i32* %InstanceID, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_hitKind(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  %0 = load i32, i32* %HitKind, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_HitKind(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  %0 = load i32, i32* %HitKind, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetHitKind(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %HitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 14
+  store i32 %i, i32* %HitKind, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define float @fb_dxop_pending_rayTCurrent(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingRayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  %0 = load float, float* %PendingRayTCurrent, align 4
+  ret float %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPendingRayTCurrent(%struct.RuntimeDataStruct* %runtimeData, float %t) #0 {
+entry:
+  %PendingRayTCurrent = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 16
+  store float %t, float* %PendingRayTCurrent, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_primitiveID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingPrimitiveIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 17
+  %0 = load i32, i32* %PendingPrimitiveIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_PendingShaderRecordOffset(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingShaderRecordOffset = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 21
+  %0 = load i32, i32* %PendingShaderRecordOffset, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_instanceIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingInstanceIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 18
+  %0 = load i32, i32* %PendingInstanceIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_instanceID(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingInstanceID = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 19
+  %0 = load i32, i32* %PendingInstanceID, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_dxop_pending_hitKind(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %PendingHitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  %0 = load i32, i32* %PendingHitKind, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetPendingHitKind(%struct.RuntimeDataStruct* %runtimeData, i32 %i) #0 {
+entry:
+  %PendingHitKind = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 20
+  store i32 %i, i32* %PendingHitKind, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_GroupIndex(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %GroupIndex = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 22
+  %0 = load i32, i32* %GroupIndex, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_AnyHitResult(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %AnyHitResult = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 23
+  %0 = load i32, i32* %AnyHitResult, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetAnyHitResult(%struct.RuntimeDataStruct* %runtimeData, i32 %result) #0 {
+entry:
+  %AnyHitResult = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 23
+  store i32 %result, i32* %AnyHitResult, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define i32 @fb_Fallback_AnyHitStateId(%struct.RuntimeDataStruct* %runtimeData) #0 {
+entry:
+  %AnyHitStateId = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 24
+  %0 = load i32, i32* %AnyHitStateId, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+define void @fb_Fallback_SetAnyHitStateId(%struct.RuntimeDataStruct* %runtimeData, i32 %id) #0 {
+entry:
+  %AnyHitStateId = getelementptr inbounds %struct.RuntimeDataStruct, %struct.RuntimeDataStruct* %runtimeData, i32 0, i32 24
+  store i32 %id, i32* %AnyHitStateId, align 4
+  ret void
+}
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }
+)AAA"
+};
+
+#include <sstream>
+// Concatenates every chunk of runtimeString into one std::string.
+// The runtime IR is stored as several raw string literals, so callers use this
+// function to obtain the complete text. The array is iterated with a range-based
+// for so the loop does not depend on the MSVC-specific _countof macro.
+// This function is also part of the template in rewriteRuntime.py; keep the two
+// in sync when changing it.
+static std::string getRuntimeString()
+{
+  std::ostringstream out;
+  for (const char* chunk : runtimeString)
+    out << chunk;
+
+  return out.str();
+}

+ 62 - 0
lib/DxrFallback/runtime/rewriteRuntime.py

@@ -0,0 +1,62 @@
+import re
+
+inputFilename = 'runtime.opt.ll'
+sourceFilename = r'C:/Users/chwallis/Desktop/DXILShaderPatch/runtime.c'
+outputFilename = 'C:/Users/chwallis/Desktop/DXILShaderPatch/runtime.h'
+
+source = open(sourceFilename).read()
+
+input = open(inputFilename).read()
+m = re.search(r'"nvptx"(.*?)attributes #', input, re.DOTALL)
+dxil = m.group(1)
+
+# split the string up to avoid  error C2026: string too big, trailing characters truncated
+lines = dxil.splitlines()
+dxil = []
+count = 0
+for line in lines:
+    count += len(line)
+    dxil.append(line)
+    if count > 10000:
+        dxil.append(')AAA",')
+        dxil.append('R"AAA(')
+        count = 0
+dxil = '\n'.join(dxil)
+
+template = """
+// This file generated by compiling the following source (runtime.c) as follows:
+//    clang -S -emit-llvm -target nvptr runtime.c
+//    opt -S -mem2reg runtime.ll -o runtime.opt.ll
+// The resulting LLVM-IR is stripped of its datalayout and replaced with one
+// compatible with DXIL.
+
+// runtime.c
+#if 0 
+%SOURCE%
+#endif
+
+static const char* runtimeString[] = { R"AAA(
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f:64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%DXIL%
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }
+)AAA"
+};
+
+#include <sstream>
+static std::string getRuntimeString()
+{
+  std::ostringstream out;
+  for( size_t i=0; i < _countof(runtimeString); ++i)
+    out << runtimeString[i];
+
+  return out.str();
+}
+"""
+
+output = re.sub(r'%SOURCE%', source, template)
+output = re.sub(r'%DXIL%', dxil, output)
+open(outputFilename, 'w').write(output)

+ 658 - 0
lib/DxrFallback/runtime/runtime.c

@@ -0,0 +1,658 @@
+#include <stddef.h>
+
+// Byte size used to dimension StackType below.
+static const int STACK_SIZE_IN_BYTES = 1024;
+
+// Float vector types declared with the compiler's vector_size attribute.
+typedef float float3 __attribute__((vector_size(3*sizeof(float))));
+typedef float float4 __attribute__((vector_size(4*sizeof(float))));
+typedef float float12 __attribute__((vector_size(12*sizeof(float))));
+// 12 floats stored as a flat array; the generated objectToWorld/worldToObject
+// accessors index it as row * 4 + column.
+typedef float (M3x4)[12];
+// The software stack: an array of int-sized slots covering STACK_SIZE_IN_BYTES bytes.
+typedef int   (StackType)[STACK_SIZE_IN_BYTES/sizeof(int)];
+typedef unsigned char byte;
+
+
+// State that the runtime functions in this file read and write through a
+// RuntimeDataType pointer.
+// NOTE: the LLVM IR generated from this file addresses these members by field
+// index (e.g. ObjectRayOrigin is field 7, AnyHitStateId is field 24), so the
+// member order must not change without regenerating runtime.h.
+typedef struct RuntimeDataStruct
+{
+  int DispatchRaysIndex[2];
+  int DispatchRaysDimensions[2];
+
+  float RayTMin;
+  float RayTCurrent;
+  unsigned RayFlags;
+  float WorldRayOrigin[3];
+  float WorldRayDirection[3];
+  float ObjectRayOrigin[3];
+  float ObjectRayDirection[3];
+  M3x4 ObjectToWorld;
+  M3x4 WorldToObject;
+
+  unsigned PrimitiveIndex;
+  unsigned InstanceIndex;
+  unsigned InstanceID;
+  unsigned HitKind;
+  unsigned ShaderRecordOffset;
+
+
+  // Pending hit values - accessed in anyHit and intersection shaders before a hit has been committed
+  float PendingRayTCurrent;
+  unsigned PendingPrimitiveIndex;
+  unsigned PendingInstanceIndex;
+  unsigned PendingInstanceID;
+  unsigned PendingHitKind;
+  unsigned PendingShaderRecordOffset; 
+
+  int GroupIndex; 
+  int AnyHitResult;
+  int AnyHitStateId;  // Originally temporary. We needed to avoid resource usage
+                      // in ReportHit() because of linking issues, so we set the value here first.
+                      // May be worth retaining to cache the value when fetching the intersection
+                      // stateId (fetch them both at once).
+
+  // Offsets into the stack, in int slots (see stackIntPtr).
+  int PayloadOffset;            
+  int CommittedAttrOffset;      
+  int PendingAttrOffset;        
+  
+  int StackOffset; // offset from the start of the stack
+  StackType* Stack;
+} RuntimeData;
+
+// Pointer type taken as the first parameter by the runtime functions.
+typedef RuntimeData* RuntimeDataType;
+
+// Copy of the ray and committed-hit members of RuntimeData, filled by
+// traceRaySave_ClosestHit() and written back by traceRayRestore_ClosestHit().
+typedef struct TraceRaySpills_ClosestHit
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+  float ObjectRayOrigin[3];      
+  float ObjectRayDirection[3];   
+
+  unsigned PrimitiveIndex;       
+  unsigned InstanceIndex;        
+  unsigned InstanceID;           
+  unsigned HitKind;              
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_ClosestHit;
+
+// Copy of the world-space ray members and ShaderRecordOffset of RuntimeData,
+// filled by traceRaySave_Miss() and written back by traceRayRestore_Miss().
+// Unlike TraceRaySpills_ClosestHit it has no object-space or hit members.
+typedef struct TraceRaySpills_Miss
+{
+  float RayTMin;                 
+  float RayTCurrent;             
+  unsigned RayFlags;             
+  float WorldRayOrigin[3];       
+  float WorldRayDirection[3];    
+            
+  unsigned ShaderRecordOffset;
+} TraceRaySpills_Miss;
+
+
+// Accessors for the RuntimeData pointer named `runtimeData` in the enclosing scope.
+#define REF(x) (runtimeData->x)
+#define REF_FLT(x) (runtimeData->x)
+// Stack slot `offset` ints away from the current stack offset. The argument is
+// parenthesized so that operator expressions (e.g. a ? b : c, a << b) expand
+// as a single operand of the addition.
+#define REF_STACK(offset) ((*runtimeData->Stack)[runtimeData->StackOffset + (offset)])
+#define REF_FLT_OFS(x, offset) (runtimeData->x[offset])
+
+// Functions declared here but not defined in this file.
+// NOTE(review): presumably their bodies are supplied when the runtime is linked
+// with the rewritten shaders - confirm against the fallback compiler.
+
+// Return next stateID
+int rewrite_dispatch(RuntimeDataType runtimeData, int stateID);
+void* rewrite_setLaunchParams(RuntimeDataType runtimeData, unsigned dimx, unsigned dimy);
+// Its result is passed to stackInit() as the stack size.
+unsigned rewrite_getStackSize(void);
+StackType* rewrite_createStack(void);
+
+// Attaches theStack to the runtime data and resets the stack bookkeeping.
+// The stack offset starts at the last int slot of a stack of stackSize bytes.
+void stackInit(RuntimeDataType runtimeData, StackType* theStack, unsigned stackSize)
+{
+  runtimeData->Stack       = theStack;
+  runtimeData->StackOffset = stackSize/sizeof(int) - 1;
+
+  // Recognizable bogus values until real offsets are assigned.
+  runtimeData->PayloadOffset       = 1111;
+  runtimeData->CommittedAttrOffset = 2222;
+  runtimeData->PendingAttrOffset   = 3333;
+}
+
+// Reserves `size` stack slots by lowering the stack offset.
+void stackFramePush(RuntimeDataType runtimeData, int size)
+{
+  runtimeData->StackOffset = runtimeData->StackOffset - size;
+}
+
+// Releases `size` stack slots by raising the stack offset.
+void stackFramePop(RuntimeDataType runtimeData, int size)
+{
+  runtimeData->StackOffset = runtimeData->StackOffset + size;
+}
+
+// Returns the current stack offset.
+int stackFrameOffset(RuntimeDataType runtimeData)
+{
+  const int currentOffset = runtimeData->StackOffset;
+  return currentOffset;
+}
+
+// Returns the stored payload offset.
+int payloadOffset(RuntimeDataType runtimeData)
+{
+  const int currentOffset = runtimeData->PayloadOffset;
+  return currentOffset;
+}
+
+// Returns the stored offset of the committed attributes.
+int committedAttrOffset(RuntimeDataType runtimeData)
+{
+  const int currentOffset = runtimeData->CommittedAttrOffset;
+  return currentOffset;
+}
+
+// Returns the stored offset of the pending attributes.
+int pendingAttrOffset(RuntimeDataType runtimeData)
+{
+  const int currentOffset = runtimeData->PendingAttrOffset;
+  return currentOffset;
+}
+
+// Returns the address of the stack slot at index baseOffset + offset.
+int* stackIntPtr(RuntimeDataType runtimeData, int baseOffset, int offset)
+{
+  int* const firstSlot = *runtimeData->Stack; // the array decays to a pointer to slot 0
+  return firstSlot + (baseOffset + offset);
+}
+
+
+// Stores the current attribute offsets in the two slots just below the stack
+// offset, then points the committed/pending attribute offsets at two regions of
+// attrSize slots each below those two slots.
+void traceFramePush(RuntimeDataType runtimeData, int attrSize)
+{
+  int* const slots = *runtimeData->Stack;
+  const int top = runtimeData->StackOffset;
+
+  // Remember the previous attribute offsets so traceFramePop() can restore them.
+  slots[top - 1] = runtimeData->CommittedAttrOffset;
+  slots[top - 2] = runtimeData->PendingAttrOffset;
+
+  // Switch to the new attribute regions.
+  runtimeData->CommittedAttrOffset = top - 2 - attrSize;
+  runtimeData->PendingAttrOffset   = top - 2 - 2 * attrSize;
+}
+
+// Reloads the attribute offsets that traceFramePush() stored in the two slots
+// just below the stack offset.
+void traceFramePop(RuntimeDataType runtimeData)
+{
+  int* const slots = *runtimeData->Stack;
+  const int top = runtimeData->StackOffset;
+
+  runtimeData->CommittedAttrOffset = slots[top - 1];
+  runtimeData->PendingAttrOffset   = slots[top - 2];
+}
+
+// Copies the ray description and committed-hit members from the runtime data
+// into *spills.
+void traceRaySave_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  // Ray parameters
+  spills->RayTMin     = runtimeData->RayTMin;
+  spills->RayTCurrent = runtimeData->RayTCurrent;
+  spills->RayFlags    = runtimeData->RayFlags;
+
+  // World-space ray
+  spills->WorldRayOrigin[0]    = runtimeData->WorldRayOrigin[0];
+  spills->WorldRayOrigin[1]    = runtimeData->WorldRayOrigin[1];
+  spills->WorldRayOrigin[2]    = runtimeData->WorldRayOrigin[2];
+  spills->WorldRayDirection[0] = runtimeData->WorldRayDirection[0];
+  spills->WorldRayDirection[1] = runtimeData->WorldRayDirection[1];
+  spills->WorldRayDirection[2] = runtimeData->WorldRayDirection[2];
+
+  // Object-space ray
+  spills->ObjectRayOrigin[0]    = runtimeData->ObjectRayOrigin[0];
+  spills->ObjectRayOrigin[1]    = runtimeData->ObjectRayOrigin[1];
+  spills->ObjectRayOrigin[2]    = runtimeData->ObjectRayOrigin[2];
+  spills->ObjectRayDirection[0] = runtimeData->ObjectRayDirection[0];
+  spills->ObjectRayDirection[1] = runtimeData->ObjectRayDirection[1];
+  spills->ObjectRayDirection[2] = runtimeData->ObjectRayDirection[2];
+
+  // Committed hit
+  spills->PrimitiveIndex     = runtimeData->PrimitiveIndex;
+  spills->InstanceIndex      = runtimeData->InstanceIndex;
+  spills->InstanceID         = runtimeData->InstanceID;
+  spills->HitKind            = runtimeData->HitKind;
+  spills->ShaderRecordOffset = runtimeData->ShaderRecordOffset;
+}
+
+// Writes the members held in *spills back into the runtime data; the inverse
+// of traceRaySave_ClosestHit().
+void traceRayRestore_ClosestHit(RuntimeDataType runtimeData, TraceRaySpills_ClosestHit* spills)
+{
+  // Ray parameters
+  runtimeData->RayTMin     = spills->RayTMin;
+  runtimeData->RayTCurrent = spills->RayTCurrent;
+  runtimeData->RayFlags    = spills->RayFlags;
+
+  // World-space ray
+  runtimeData->WorldRayOrigin[0]    = spills->WorldRayOrigin[0];
+  runtimeData->WorldRayOrigin[1]    = spills->WorldRayOrigin[1];
+  runtimeData->WorldRayOrigin[2]    = spills->WorldRayOrigin[2];
+  runtimeData->WorldRayDirection[0] = spills->WorldRayDirection[0];
+  runtimeData->WorldRayDirection[1] = spills->WorldRayDirection[1];
+  runtimeData->WorldRayDirection[2] = spills->WorldRayDirection[2];
+
+  // Object-space ray
+  runtimeData->ObjectRayOrigin[0]    = spills->ObjectRayOrigin[0];
+  runtimeData->ObjectRayOrigin[1]    = spills->ObjectRayOrigin[1];
+  runtimeData->ObjectRayOrigin[2]    = spills->ObjectRayOrigin[2];
+  runtimeData->ObjectRayDirection[0] = spills->ObjectRayDirection[0];
+  runtimeData->ObjectRayDirection[1] = spills->ObjectRayDirection[1];
+  runtimeData->ObjectRayDirection[2] = spills->ObjectRayDirection[2];
+
+  // Committed hit
+  runtimeData->PrimitiveIndex     = spills->PrimitiveIndex;
+  runtimeData->InstanceIndex      = spills->InstanceIndex;
+  runtimeData->InstanceID         = spills->InstanceID;
+  runtimeData->HitKind            = spills->HitKind;
+  runtimeData->ShaderRecordOffset = spills->ShaderRecordOffset;
+}
+
+// Copies the world-space ray description and ShaderRecordOffset from the
+// runtime data into *spills.
+void traceRaySave_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  // Ray parameters
+  spills->RayTMin     = runtimeData->RayTMin;
+  spills->RayTCurrent = runtimeData->RayTCurrent;
+  spills->RayFlags    = runtimeData->RayFlags;
+
+  // World-space ray
+  spills->WorldRayOrigin[0]    = runtimeData->WorldRayOrigin[0];
+  spills->WorldRayOrigin[1]    = runtimeData->WorldRayOrigin[1];
+  spills->WorldRayOrigin[2]    = runtimeData->WorldRayOrigin[2];
+  spills->WorldRayDirection[0] = runtimeData->WorldRayDirection[0];
+  spills->WorldRayDirection[1] = runtimeData->WorldRayDirection[1];
+  spills->WorldRayDirection[2] = runtimeData->WorldRayDirection[2];
+
+  spills->ShaderRecordOffset = runtimeData->ShaderRecordOffset;
+}
+
+// Writes the members held in *spills back into the runtime data; the inverse
+// of traceRaySave_Miss().
+void traceRayRestore_Miss(RuntimeDataType runtimeData, TraceRaySpills_Miss* spills)
+{
+  // Ray parameters
+  runtimeData->RayTMin     = spills->RayTMin;
+  runtimeData->RayTCurrent = spills->RayTCurrent;
+  runtimeData->RayFlags    = spills->RayFlags;
+
+  // World-space ray
+  runtimeData->WorldRayOrigin[0]    = spills->WorldRayOrigin[0];
+  runtimeData->WorldRayOrigin[1]    = spills->WorldRayOrigin[1];
+  runtimeData->WorldRayOrigin[2]    = spills->WorldRayOrigin[2];
+  runtimeData->WorldRayDirection[0] = spills->WorldRayDirection[0];
+  runtimeData->WorldRayDirection[1] = spills->WorldRayDirection[1];
+  runtimeData->WorldRayDirection[2] = spills->WorldRayDirection[2];
+
+  runtimeData->ShaderRecordOffset = spills->ShaderRecordOffset;
+}
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Intrinsics for the fallback layer
+//
+//////////////////////////////////////////////////////////////////////////
+
+// Runs the state machine: creates the stack and runtime data, sets the launch
+// parameters, and then calls rewrite_dispatch() repeatedly, starting with
+// initialStateId, until it returns a negative state id.
+//   initialStateId - first state passed to rewrite_dispatch()
+//   dimx, dimy     - forwarded to rewrite_setLaunchParams()
+void fb_Fallback_Scheduler(int initialStateId, unsigned dimx, unsigned dimy)
+{
+  StackType* theStack = rewrite_createStack();
+  RuntimeData theRuntimeData;
+  RuntimeDataType runtimeData = &theRuntimeData;
+
+  rewrite_setLaunchParams(runtimeData, dimx, dimy);
+
+  // Nothing to do when the index lies outside the dispatch dimensions.
+  if(REF(DispatchRaysIndex[0]) >= REF(DispatchRaysDimensions[0]) ||
+     REF(DispatchRaysIndex[1]) >= REF(DispatchRaysDimensions[1]))
+  {
+    return;
+  }
+
+
+  // Set final return stateID into reserved area at stack top.
+  // -1 is negative, which is the exit condition of the loop below.
+  unsigned stackSize = rewrite_getStackSize();
+  stackInit(runtimeData, theStack, stackSize);
+  int stackFrameOffs = stackFrameOffset(runtimeData);
+  *stackIntPtr(runtimeData, stackFrameOffs, 0) = -1;
+
+  int stateId = initialStateId;
+  while( stateId >= 0 )
+  {
+    stateId = rewrite_dispatch(runtimeData, stateId);
+  }
+}
+
+// Stores the dispatch index (DTidx, DTidy), the dispatch dimensions
+// (dimx, dimy) and the group index in the runtime data.
+void fb_Fallback_SetLaunchParams(RuntimeDataType runtimeData, unsigned DTidx, unsigned DTidy, unsigned dimx, unsigned dimy, unsigned groupIndex)
+{
+  runtimeData->DispatchRaysIndex[0] = DTidx;
+  runtimeData->DispatchRaysIndex[1] = DTidy;
+
+  runtimeData->DispatchRaysDimensions[0] = dimx;
+  runtimeData->DispatchRaysDimensions[1] = dimy;
+
+  runtimeData->GroupIndex = groupIndex;
+}
+
+int fb_Fallback_TraceRayBegin(RuntimeDataType runtimeData, unsigned rayFlags, float ox, float oy, float oz, float tmin, float dx, float dy, float dz, float tmax, int newPayloadOffset)
+{ // Stores the new ray's parameters and payload offset; returns the payload offset that was active before the call.
+  REF(RayFlags) = rayFlags;
+  REF_FLT(WorldRayOrigin[0]) = ox;
+  REF_FLT(WorldRayOrigin[1]) = oy;
+  REF_FLT(WorldRayOrigin[2]) = oz;
+  REF_FLT(WorldRayDirection[0]) = dx;
+  REF_FLT(WorldRayDirection[1]) = dy;
+  REF_FLT(WorldRayDirection[2]) = dz;
+  REF_FLT(RayTCurrent) = tmax; // RayTCurrent starts out at tmax
+  REF_FLT(RayTMin) = tmin;
+
+  int oldOffset = REF(PayloadOffset); // read the previous offset before overwriting it
+  REF(PayloadOffset) = newPayloadOffset;
+  return oldOffset;
+}
+
+void fb_Fallback_TraceRayEnd(RuntimeDataType runtimeData, int oldPayloadOffset) // Writes oldPayloadOffset back to PayloadOffset; presumably paired with the value returned by fb_Fallback_TraceRayBegin.
+{
+  REF(PayloadOffset) = oldPayloadOffset;
+}
+
+void fb_Fallback_SetPendingTriVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID, float t, unsigned hitKind)
+{ // Stores all six Pending* values (record offset, primitive, instance index/ID, t, hit kind) in one call.
+  REF(PendingShaderRecordOffset) = shaderRecordOffset;
+  REF(PendingPrimitiveIndex) = primitiveIndex;
+  REF(PendingInstanceIndex) = instanceIndex;
+  REF(PendingInstanceID) = instanceID;
+  REF_FLT(PendingRayTCurrent) = t;
+  REF(PendingHitKind) = hitKind;
+}
+
+void fb_Fallback_SetPendingCustomVals(RuntimeDataType runtimeData, unsigned shaderRecordOffset, unsigned primitiveIndex, unsigned instanceIndex, unsigned instanceID)
+{ // Like fb_Fallback_SetPendingTriVals, but leaves PendingRayTCurrent and PendingHitKind untouched.
+  REF(PendingShaderRecordOffset) = shaderRecordOffset;
+  REF(PendingPrimitiveIndex) = primitiveIndex;
+  REF(PendingInstanceIndex) = instanceIndex;
+  REF(PendingInstanceID) = instanceID;
+}
+
+// Promotes the pending hit to the current hit: every Pending* value is copied
+// to its non-pending counterpart, then the pending and committed attribute
+// offsets are exchanged.
+void fb_Fallback_CommitHit(RuntimeDataType runtimeData)
+{
+  REF_FLT(RayTCurrent)    = REF_FLT(PendingRayTCurrent);
+  REF(ShaderRecordOffset) = REF(PendingShaderRecordOffset);
+  REF(PrimitiveIndex)     = REF(PendingPrimitiveIndex);
+  REF(InstanceIndex)      = REF(PendingInstanceIndex);
+  REF(InstanceID)         = REF(PendingInstanceID);
+  REF(HitKind)            = REF(PendingHitKind);
+
+  // Swap PendingAttrOffset and CommittedAttrOffset. The temporary is named
+  // differently from the field so it cannot be confused with the token
+  // passed to REF.
+  int prevPendingAttrOffset = REF(PendingAttrOffset);
+  REF(PendingAttrOffset) = REF(CommittedAttrOffset);
+  REF(CommittedAttrOffset) = prevPendingAttrOffset;
+}
+
+
+int fb_Fallback_RuntimeDataLoadInt(RuntimeDataType runtimeData, int offset) // Returns element `offset` of the array that runtimeData->Stack points to; no bounds check is made here.
+{
+  return (*runtimeData->Stack)[offset];
+}
+
+void fb_Fallback_RuntimeDataStoreInt(RuntimeDataType runtimeData, int offset, int val) // Writes val to element `offset` of the array that runtimeData->Stack points to; no bounds check is made here.
+{
+  (*runtimeData->Stack)[offset] = val;
+}
+
+unsigned fb_dxop_dispatchRaysIndex(RuntimeDataType runtimeData, byte i) // Returns component i of DispatchRaysIndex; i is not range-checked here.
+{  
+  return REF(DispatchRaysIndex[i]);
+}
+
+unsigned fb_dxop_dispatchRaysDimensions(RuntimeDataType runtimeData, byte i) // Returns component i of DispatchRaysDimensions; i is not range-checked here.
+{  
+  return REF(DispatchRaysDimensions[i]);
+}
+
+float fb_dxop_rayTMin(RuntimeDataType runtimeData) // Returns RayTMin as read through REF_FLT; same body as fb_Fallback_RayTMin.
+{
+  return REF_FLT(RayTMin);
+}
+
+float fb_Fallback_RayTMin(RuntimeDataType runtimeData) // Returns RayTMin as read through REF_FLT.
+{
+  return REF_FLT(RayTMin);
+}
+
+void fb_Fallback_SetRayTMin(RuntimeDataType runtimeData, float t) // Overwrites RayTMin with t.
+{
+  REF_FLT(RayTMin) = t;
+}
+
+float fb_dxop_rayTCurrent(RuntimeDataType runtimeData) // Returns RayTCurrent as read through REF_FLT; same body as fb_Fallback_RayTCurrent.
+{
+  return REF_FLT(RayTCurrent);
+}
+
+float fb_Fallback_RayTCurrent(RuntimeDataType runtimeData) // Returns RayTCurrent as read through REF_FLT.
+{
+  return REF_FLT(RayTCurrent);
+}
+
+void fb_Fallback_SetRayTCurrent(RuntimeDataType runtimeData, float t) // Overwrites RayTCurrent with t.
+{
+  REF_FLT(RayTCurrent) = t;
+}
+
+unsigned fb_dxop_rayFlags(RuntimeDataType runtimeData) // Returns RayFlags as read through REF; same body as fb_Fallback_RayFlags.
+{
+  return REF(RayFlags);
+}
+
+unsigned fb_Fallback_RayFlags(RuntimeDataType runtimeData) // Returns RayFlags as read through REF.
+{
+  return REF(RayFlags);
+}
+
+void fb_Fallback_SetRayFlags(RuntimeDataType runtimeData, unsigned flags) // Overwrites RayFlags with flags.
+{
+  REF(RayFlags) = flags;
+}
+
+float fb_dxop_worldRayOrigin(RuntimeDataType runtimeData, byte i) // Returns component i of WorldRayOrigin; i is not range-checked here.
+{ 
+  return REF_FLT(WorldRayOrigin[i]);
+}
+
+float fb_Fallback_WorldRayOrigin(RuntimeDataType runtimeData, byte i) // Returns component i of WorldRayOrigin; same body as fb_dxop_worldRayOrigin.
+{ 
+  return REF_FLT(WorldRayOrigin[i]);
+}
+
+void fb_Fallback_SetWorldRayOrigin(RuntimeDataType runtimeData, float x, float y, float z) // Stores (x, y, z) into WorldRayOrigin[0..2].
+{ 
+  REF_FLT(WorldRayOrigin[0]) = x;
+  REF_FLT(WorldRayOrigin[1]) = y;
+  REF_FLT(WorldRayOrigin[2]) = z;
+}
+
+float fb_dxop_worldRayDirection(RuntimeDataType runtimeData, byte i) // Returns component i of WorldRayDirection; i is not range-checked here.
+{  
+  return REF_FLT(WorldRayDirection[i]);
+}
+
+float fb_Fallback_WorldRayDirection(RuntimeDataType runtimeData, byte i) // Returns component i of WorldRayDirection; same body as fb_dxop_worldRayDirection.
+{  
+  return REF_FLT(WorldRayDirection[i]);
+}
+
+void fb_Fallback_SetWorldRayDirection(RuntimeDataType runtimeData, float x, float y, float z) // Stores (x, y, z) into WorldRayDirection[0..2].
+{ 
+  REF_FLT(WorldRayDirection[0]) = x;
+  REF_FLT(WorldRayDirection[1]) = y;
+  REF_FLT(WorldRayDirection[2]) = z;
+}
+
+float fb_dxop_objectRayOrigin(RuntimeDataType runtimeData, byte i) // Returns component i of ObjectRayOrigin; i is not range-checked here.
+{ 
+  return REF_FLT(ObjectRayOrigin[i]);
+}
+
+float fb_Fallback_ObjectRayOrigin(RuntimeDataType runtimeData, byte i) // Returns component i of ObjectRayOrigin; same body as fb_dxop_objectRayOrigin.
+{ 
+  return REF_FLT(ObjectRayOrigin[i]);
+}
+
+void fb_Fallback_SetObjectRayOrigin(RuntimeDataType runtimeData, float x, float y, float z) // Stores (x, y, z) into ObjectRayOrigin[0..2].
+{ 
+  REF_FLT(ObjectRayOrigin[0]) = x;
+  REF_FLT(ObjectRayOrigin[1]) = y;
+  REF_FLT(ObjectRayOrigin[2]) = z;
+}
+
+float fb_dxop_objectRayDirection(RuntimeDataType runtimeData, byte i) // Returns component i of ObjectRayDirection; i is not range-checked here.
+{  
+  return REF_FLT(ObjectRayDirection[i]);
+}
+
+float fb_Fallback_ObjectRayDirection(RuntimeDataType runtimeData, byte i) // Returns component i of ObjectRayDirection; same body as fb_dxop_objectRayDirection.
+{  
+  return REF_FLT(ObjectRayDirection[i]);
+}
+
+void fb_Fallback_SetObjectRayDirection(RuntimeDataType runtimeData, float x, float y, float z) // Stores (x, y, z) into ObjectRayDirection[0..2].
+{ 
+  REF_FLT(ObjectRayDirection[0]) = x;
+  REF_FLT(ObjectRayDirection[1]) = y;
+  REF_FLT(ObjectRayDirection[2]) = z;
+}
+
+float fb_dxop_objectToWorld(RuntimeDataType runtimeData, int r, byte c) // Returns the ObjectToWorld entry for row r, column c; neither index is range-checked here.
+{
+  int i = r * 4 + c; // flat index: each row spans 4 entries
+  return REF_FLT_OFS(ObjectToWorld, i);
+}
+
+void fb_Fallback_SetObjectToWorld(RuntimeDataType runtimeData, float12 M) // Copies the 12 entries M[0..11] into ObjectToWorld at the same flat offsets.
+{
+  REF_FLT_OFS(ObjectToWorld, 0)  = M[0]; 
+  REF_FLT_OFS(ObjectToWorld, 1)  = M[1]; 
+  REF_FLT_OFS(ObjectToWorld, 2)  = M[2]; 
+  REF_FLT_OFS(ObjectToWorld, 3)  = M[3]; 
+  REF_FLT_OFS(ObjectToWorld, 4)  = M[4]; 
+  REF_FLT_OFS(ObjectToWorld, 5)  = M[5]; 
+  REF_FLT_OFS(ObjectToWorld, 6)  = M[6]; 
+  REF_FLT_OFS(ObjectToWorld, 7)  = M[7]; 
+  REF_FLT_OFS(ObjectToWorld, 8)  = M[8]; 
+  REF_FLT_OFS(ObjectToWorld, 9)  = M[9]; 
+  REF_FLT_OFS(ObjectToWorld, 10) = M[10];
+  REF_FLT_OFS(ObjectToWorld, 11) = M[11];
+}
+
+float fb_dxop_worldToObject(RuntimeDataType runtimeData, int r, byte c) // Returns the WorldToObject entry for row r, column c; neither index is range-checked here.
+{
+  int i = r * 4 + c; // flat index: each row spans 4 entries
+  return REF_FLT_OFS(WorldToObject, i);
+}
+
+void fb_Fallback_SetWorldToObject(RuntimeDataType runtimeData, float12 M) // Copies the 12 entries M[0..11] into WorldToObject at the same flat offsets.
+{
+  REF_FLT_OFS(WorldToObject, 0)  = M[0]; 
+  REF_FLT_OFS(WorldToObject, 1)  = M[1]; 
+  REF_FLT_OFS(WorldToObject, 2)  = M[2]; 
+  REF_FLT_OFS(WorldToObject, 3)  = M[3]; 
+  REF_FLT_OFS(WorldToObject, 4)  = M[4]; 
+  REF_FLT_OFS(WorldToObject, 5)  = M[5]; 
+  REF_FLT_OFS(WorldToObject, 6)  = M[6]; 
+  REF_FLT_OFS(WorldToObject, 7)  = M[7]; 
+  REF_FLT_OFS(WorldToObject, 8)  = M[8]; 
+  REF_FLT_OFS(WorldToObject, 9)  = M[9]; 
+  REF_FLT_OFS(WorldToObject, 10) = M[10];
+  REF_FLT_OFS(WorldToObject, 11) = M[11];
+}
+
+unsigned fb_dxop_primitiveID(RuntimeDataType runtimeData) // Returns PrimitiveIndex; note the function is named ...primitiveID while the value read is PrimitiveIndex.
+//unsigned fb_dxop_primitiveIndex(RuntimeDataType runtimeData)
+{
+  return REF(PrimitiveIndex);
+}
+
+unsigned fb_Fallback_PrimitiveIndex(RuntimeDataType runtimeData) // Returns PrimitiveIndex as read through REF.
+{
+  return REF(PrimitiveIndex);
+}
+
+void fb_Fallback_SetPrimitiveIndex(RuntimeDataType runtimeData, unsigned i) // Overwrites PrimitiveIndex with i.
+{
+  REF(PrimitiveIndex) = i;
+}
+
+unsigned fb_Fallback_ShaderRecordOffset(RuntimeDataType runtimeData) // Returns ShaderRecordOffset as read through REF.
+{
+  return REF(ShaderRecordOffset);
+}
+
+void fb_Fallback_SetShaderRecordOffset(RuntimeDataType runtimeData, unsigned shaderRecordOffset) // Overwrites ShaderRecordOffset with shaderRecordOffset.
+{
+  REF(ShaderRecordOffset) = shaderRecordOffset;
+}
+
+unsigned fb_dxop_instanceIndex(RuntimeDataType runtimeData) // Returns InstanceIndex as read through REF; same body as fb_Fallback_InstanceIndex.
+{
+  return REF(InstanceIndex);
+}
+
+unsigned fb_Fallback_InstanceIndex(RuntimeDataType runtimeData) // Returns InstanceIndex as read through REF.
+{
+  return REF(InstanceIndex);
+}
+
+void fb_Fallback_SetInstanceIndex(RuntimeDataType runtimeData, unsigned i) // Overwrites InstanceIndex with i.
+{
+  REF(InstanceIndex) = i;
+}
+
+unsigned fb_dxop_instanceID(RuntimeDataType runtimeData) // Returns InstanceID as read through REF; same body as fb_Fallback_InstanceID.
+{
+  return REF(InstanceID);
+}
+
+unsigned fb_Fallback_InstanceID(RuntimeDataType runtimeData) // Returns InstanceID as read through REF.
+{
+  return REF(InstanceID);
+}
+
+void fb_Fallback_SetInstanceID(RuntimeDataType runtimeData, unsigned i) // Overwrites InstanceID with i.
+{
+  REF(InstanceID) = i;
+}
+
+unsigned fb_dxop_hitKind(RuntimeDataType runtimeData) // Returns HitKind as read through REF; same body as fb_Fallback_HitKind.
+{
+  return REF(HitKind);
+}
+
+unsigned fb_Fallback_HitKind(RuntimeDataType runtimeData) // Returns HitKind as read through REF.
+{
+  return REF(HitKind);
+}
+
+void fb_Fallback_SetHitKind(RuntimeDataType runtimeData, unsigned i) // Overwrites HitKind with i.
+{
+  REF(HitKind) = i;
+}
+
+float fb_dxop_pending_rayTCurrent(RuntimeDataType runtimeData) // Returns PendingRayTCurrent as read through REF_FLT.
+{
+  return REF_FLT(PendingRayTCurrent);
+}
+
+void fb_Fallback_SetPendingRayTCurrent(RuntimeDataType runtimeData, float t) // Overwrites PendingRayTCurrent with t.
+{
+  REF_FLT(PendingRayTCurrent) = t;
+}
+
+unsigned fb_dxop_pending_primitiveID(RuntimeDataType runtimeData) // Returns PendingPrimitiveIndex; note the function is named ...primitiveID while the value read is PendingPrimitiveIndex.
+//unsigned fb_dxop_pending_primitiveIndex(RuntimeDataType runtimeData)
+{
+  return REF(PendingPrimitiveIndex);
+}
+
+unsigned fb_Fallback_PendingShaderRecordOffset(RuntimeDataType runtimeData) // Returns PendingShaderRecordOffset as read through REF.
+{
+  return REF(PendingShaderRecordOffset);
+}
+
+unsigned fb_dxop_pending_instanceIndex(RuntimeDataType runtimeData) // Returns PendingInstanceIndex as read through REF.
+{
+  return REF(PendingInstanceIndex);
+}
+
+unsigned fb_dxop_pending_instanceID(RuntimeDataType runtimeData) // Returns PendingInstanceID as read through REF.
+{
+  return REF(PendingInstanceID);
+}
+
+unsigned fb_dxop_pending_hitKind(RuntimeDataType runtimeData) // Returns PendingHitKind as read through REF.
+{
+  return REF(PendingHitKind);
+}
+
+void fb_Fallback_SetPendingHitKind(RuntimeDataType runtimeData, unsigned i) // Overwrites PendingHitKind with i.
+{
+  REF(PendingHitKind) = i;
+}
+
+unsigned fb_Fallback_GroupIndex(RuntimeDataType runtimeData) // Returns GroupIndex, the value stored by fb_Fallback_SetLaunchParams.
+{ 
+  return REF(GroupIndex);
+}
+
+int fb_Fallback_AnyHitResult(RuntimeDataType runtimeData) // Returns AnyHitResult as read through REF.
+{
+  return REF(AnyHitResult);
+}
+
+void fb_Fallback_SetAnyHitResult(RuntimeDataType runtimeData, int result) // Overwrites AnyHitResult with result.
+{
+  REF(AnyHitResult) = result;
+}
+
+int fb_Fallback_AnyHitStateId(RuntimeDataType runtimeData) // Returns AnyHitStateId as read through REF.
+{
+  return REF(AnyHitStateId);
+}
+
+void fb_Fallback_SetAnyHitStateId(RuntimeDataType runtimeData, int id) // Overwrites AnyHitStateId with id.
+{
+  REF(AnyHitStateId) = id;
+}

+ 9 - 0
lib/DxrFallback/runtime/script.cmd

@@ -0,0 +1,9 @@
+@rem Rebuilds the fallback runtime: compiles runtime.c to LLVM IR, runs mem2reg on it, then runs rewriteRuntime.py.
+@setlocal
+@rem NOTE(review): hard-coded LLVM 3.7 install location - adjust for the local machine.
+@set BINPATH=C:\Program Files\LLVM3.7\bin
+@set CLANG="%BINPATH%\clang"
+@set OPT="%BINPATH%\opt"
+
+
+@rem Emit textual LLVM IR for the nvptx target (presumably runtime.ll, which the next step reads).
+%CLANG% -S -emit-llvm -target nvptx runtime.c 
+@rem Promote allocas to registers and write the result to runtime.opt.ll.
+%OPT% -S -mem2reg  runtime.ll -o runtime.opt.ll
+@rem Post-process the IR; presumably consumes runtime.opt.ll - confirm in rewriteRuntime.py.
+python rewriteRuntime.py

+ 3 - 0
lib/HLSL/CMakeLists.txt

@@ -26,12 +26,14 @@ add_llvm_library(LLVMHLSL
   DxilPreparePasses.cpp
   DxilRemoveDiscards.cpp
   DxilReduceMSAAToSingleSample.cpp
+  DxilPatchShaderRecordBindings.cpp
   DxilPreserveAllOutputs.cpp
   DxilResource.cpp
   DxilResourceBase.cpp
   DxilRootSignature.cpp
   DxilSampler.cpp
   DxilSemantic.cpp
+  DxilShaderFlags.cpp
   DxilShaderAccessTracking.cpp
   DxilShaderModel.cpp
   DxilSignature.cpp
@@ -40,6 +42,7 @@ add_llvm_library(LLVMHLSL
   DxilTargetTransformInfo.cpp
   DxilTypeSystem.cpp
   DxilUtil.cpp
+  DxilExportMap.cpp
   DxilValidation.cpp
   DxcOptimizer.cpp
   HLMatrixLowerPass.cpp

+ 7 - 2
lib/HLSL/DxcOptimizer.cpp

@@ -86,6 +86,7 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeDSEPass(Registry);
     initializeDeadInstEliminationPass(Registry);
     initializeDxilAddPixelHitInstrumentationPass(Registry);
+    initializeDxilAllocateResourcesForLibPass(Registry);
     initializeDxilCondenseResourcesPass(Registry);
     initializeDxilConvergentClearPass(Registry);
     initializeDxilConvergentMarkPass(Registry);
@@ -98,13 +99,15 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeDxilForceEarlyZPass(Registry);
     initializeDxilGenerationPassPass(Registry);
     initializeDxilLegalizeEvalOperationsPass(Registry);
-    initializeDxilLegalizeResourceUsePassPass(Registry);
+    initializeDxilLegalizeResourcesPass(Registry);
     initializeDxilLegalizeSampleOffsetPassPass(Registry);
-    initializeDxilLegalizeStaticResourceUsePassPass(Registry);
     initializeDxilLoadMetadataPass(Registry);
+    initializeDxilLowerCreateHandleForLibPass(Registry);
     initializeDxilOutputColorBecomesConstantPass(Registry);
     initializeDxilPrecisePropagatePassPass(Registry);
     initializeDxilPreserveAllOutputsPass(Registry);
+    initializeDxilPromoteLocalResourcesPass(Registry);
+    initializeDxilPromoteStaticResourcesPass(Registry);
     initializeDxilReduceMSAAToSingleSamplePass(Registry);
     initializeDxilRemoveDiscardsPass(Registry);
     initializeDxilShaderAccessTrackingPass(Registry);
@@ -112,6 +115,7 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeDynamicIndexingVectorToArrayPass(Registry);
     initializeEarlyCSELegacyPassPass(Registry);
     initializeEliminateAvailableExternallyPass(Registry);
+    initializeFailUndefResourcePass(Registry);
     initializeFloat2IntPass(Registry);
     initializeFunctionAttrsPass(Registry);
     initializeGVNPass(Registry);
@@ -140,6 +144,7 @@ HRESULT SetupRegistryPassForHLSL() {
     initializeLowerBitSetsPass(Registry);
     initializeLowerExpectIntrinsicPass(Registry);
     initializeLowerStaticGlobalIntoAllocaPass(Registry);
+    initializeMatrixBitcastLowerPassPass(Registry);
     initializeMergeFunctionsPass(Registry);
     initializeMergedLoadStoreMotionPass(Registry);
     initializeMultiDimArrayToOneDimArrayPass(Registry);

+ 2 - 1
lib/HLSL/DxilAddPixelHitInstrumentation.cpp

@@ -15,6 +15,7 @@
 #include "dxc/HLSL/DxilInstructions.h"
 #include "dxc/HLSL/DxilModule.h"
 #include "dxc/HLSL/DxilPIXPasses.h"
+#include "dxc/HLSL/DxilUtil.h"
 
 #include "llvm/IR/PassManager.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -93,7 +94,7 @@ bool DxilAddPixelHitInstrumentation::runOnModule(Module &M)
 
   CallInst *HandleForUAV;
   {
-    IRBuilder<> Builder(DM.GetEntryFunction()->getEntryBlock().getFirstInsertionPt());
+    IRBuilder<> Builder(dxilutil::FirstNonAllocaInsertionPt(DM.GetEntryFunction()));
     
     unsigned int UAVResourceHandle = static_cast<unsigned int>(DM.GetUAVs().size());
 

+ 1851 - 271
lib/HLSL/DxilCondenseResources.cpp

@@ -17,13 +17,18 @@
 #include "dxc/HLSL/DxilTypeSystem.h"
 #include "dxc/HLSL/DxilInstructions.h"
 #include "dxc/HLSL/DxilSpanAllocator.h"
+#include "dxc/HLSL/HLMatrixLowerHelper.h"
+#include "dxc/HLSL/DxilUtil.h"
+#include "dxc/HLSL/HLModule.h"
 
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <memory>
@@ -32,14 +37,19 @@
 using namespace llvm;
 using namespace hlsl;
 
+// Resource rangeID remap.
+namespace {
 struct ResourceID {
-  DXIL::ResourceClass Class;  // Resource class.
-  unsigned ID;                // Resource ID, as specified on entry.
-
-  bool operator<(const ResourceID& other) const {
-    if (Class < other.Class) return true;
-    if (Class > other.Class) return false;
-    if (ID < other.ID) return true;
+  DXIL::ResourceClass Class; // Resource class.
+  unsigned ID;               // Resource ID, as specified on entry.
+
+  bool operator<(const ResourceID &other) const {
+    if (Class < other.Class)
+      return true;
+    if (Class > other.Class)
+      return false;
+    if (ID < other.ID)
+      return true;
     return false;
   }
 };
@@ -47,11 +57,143 @@ struct ResourceID {
 struct RemapEntry {
   ResourceID ResID;           // Resource identity, as specified on entry.
   DxilResourceBase *Resource; // In-memory resource representation.
-  unsigned Index;             // Index in resource vector - new ID for the resource.
+  unsigned Index; // Index in resource vector - new ID for the resource.
 };
 
 typedef std::map<ResourceID, RemapEntry> RemapEntryCollection;
 
+// Records a remap entry in C for every resource in Rs whose current ID
+// differs from its position in the vector. The position becomes the new ID.
+template <typename TResource>
+void BuildRewrites(const std::vector<std::unique_ptr<TResource>> &Rs,
+                   RemapEntryCollection &C) {
+  const unsigned count = (unsigned)Rs.size();
+  unsigned denseId = 0;
+  while (denseId < count) {
+    TResource *pRes = Rs[denseId].get();
+    const unsigned currentId = pRes->GetID();
+    if (currentId != denseId) {
+      // Key the entry by the (class, old ID) pair the resource has right now.
+      ResourceID key = {pRes->GetClass(), currentId};
+      RemapEntry remap = {key, pRes, denseId};
+      C[key] = remap;
+    }
+    ++denseId;
+  }
+}
+
+// Fills 'rewrites' with remap entries for the cbuffers, SRVs, UAVs and
+// samplers of DM (in that order). Returns 'true' if any rewrites are needed.
+bool BuildRewriteMap(RemapEntryCollection &rewrites, DxilModule &DM) {
+  BuildRewrites(DM.GetCBuffers(), rewrites);
+  BuildRewrites(DM.GetSRVs(), rewrites);
+  BuildRewrites(DM.GetUAVs(), rewrites);
+  BuildRewrites(DM.GetSamplers(), rewrites);
+
+  const bool bNeedsRewrite = !rewrites.empty();
+  return bNeedsRewrite;
+}
+
+// Assigns each remapped resource its new ID (the Index stored in its entry).
+// DM is not used by this function.
+void ApplyRewriteMapOnResTable(RemapEntryCollection &rewrites, DxilModule &DM) {
+  for (auto it = rewrites.begin(), end = rewrites.end(); it != end; ++it) {
+    RemapEntry &remap = it->second;
+    remap.Resource->SetID(remap.Index);
+  }
+}
+
+} // namespace
+
+// Resource lowerBound allocation.
+namespace {
+
+// Validates the explicit bindings in resourceList and assigns registers to
+// the resources that have none.
+//
+// resourceList:     resources of one kind to process.
+// Ctx:              context that receives diagnostics through emitError.
+// AutoBindingSpace: register space used for automatically allocated
+//                   resources.
+//
+// Returns true when at least one resource received a new lower bound and
+// space. Problems are reported through Ctx.emitError; the function itself
+// keeps going so that every conflict is diagnosed.
+template <typename T>
+static bool
+AllocateDxilResource(const std::vector<std::unique_ptr<T>> &resourceList,
+                     LLVMContext &Ctx, unsigned AutoBindingSpace=0) {
+  bool bChanged = false;
+  SpacesAllocator<unsigned, T> SAlloc;
+
+  // Pass 1: register every resource that already has a binding and report
+  // overlaps between them.
+  for (auto &res : resourceList) {
+    const unsigned space = res->GetSpaceID();
+    typename SpacesAllocator<unsigned, T>::Allocator &alloc = SAlloc.Get(space);
+
+    if (res->IsAllocated()) {
+      const unsigned reg = res->GetLowerBound();
+      const T *conflict = nullptr;
+      if (res->IsUnbounded()) {
+        const T *unbounded = alloc.GetUnbounded();
+        if (unbounded) {
+          Ctx.emitError(Twine("more than one unbounded resource (") +
+                        unbounded->GetGlobalName() + (" and ") +
+                        res->GetGlobalName() + (") in space ") + Twine(space));
+        } else {
+          conflict = alloc.Insert(res.get(), reg, res->GetUpperBound());
+          if (!conflict)
+            alloc.SetUnbounded(res.get());
+        }
+      } else {
+        conflict = alloc.Insert(res.get(), reg, res->GetUpperBound());
+      }
+      if (conflict) {
+        Ctx.emitError(((res->IsUnbounded()) ? Twine("unbounded ") : Twine("")) +
+                      Twine("resource ") + res->GetGlobalName() +
+                      Twine(" at register ") + Twine(reg) +
+                      Twine(" overlaps with resource ") +
+                      conflict->GetGlobalName() + Twine(" at register ") +
+                      Twine(conflict->GetLowerBound()) + Twine(", space ") +
+                      Twine(space));
+      }
+    }
+  }
+
+  // Pass 2: allocate the remaining resources in the auto-binding space.
+  const unsigned space = AutoBindingSpace;
+  typename SpacesAllocator<unsigned, T>::Allocator &alloc0 = SAlloc.Get(space);
+  for (auto &res : resourceList) {
+    if (!res->IsAllocated()) {
+      DXASSERT(res->GetSpaceID() == 0,
+               "otherwise non-zero space has no user register assignment");
+      unsigned reg = 0;
+      bool success = false;
+      if (res->IsUnbounded()) {
+        const T *unbounded = alloc0.GetUnbounded();
+        if (unbounded) {
+          Ctx.emitError(Twine("more than one unbounded resource (") +
+                        unbounded->GetGlobalName() + Twine(" and ") +
+                        res->GetGlobalName() + Twine(") in space ") +
+                        Twine(space));
+          // The specific cause has been reported; skip the generic
+          // "could not be allocated" error for the same resource.
+          continue;
+        }
+        success = alloc0.AllocateUnbounded(res.get(), reg);
+        if (success)
+          alloc0.SetUnbounded(res.get());
+      } else {
+        success = alloc0.Allocate(res.get(), res->GetRangeSize(), reg);
+      }
+      if (success) {
+        res->SetLowerBound(reg);
+        res->SetSpaceID(space);
+        bChanged = true;
+      } else {
+        Ctx.emitError(((res->IsUnbounded()) ? Twine("unbounded ") : Twine("")) +
+                      Twine("resource ") + res->GetGlobalName() +
+                      Twine(" could not be allocated"));
+      }
+    }
+  }
+
+  return bChanged;
+}
+
+// Runs AllocateDxilResource over the cbuffers, samplers, UAVs and SRVs of DM
+// (in that order). Returns true if any of them changed a resource.
+bool AllocateDxilResources(DxilModule &DM) {
+  uint32_t bindingSpace = DM.GetAutoBindingSpace();
+  if (bindingSpace == UINT_MAX) {
+    // No auto-binding space was set: libraries are left unallocated, while
+    // shaders fall back to space 0.
+    if (DM.GetShaderModel()->IsLib())
+      return false;
+    bindingSpace = 0;
+  }
+
+  // Every call must run, so the results are collected before being combined.
+  const bool bCBuffers =
+      AllocateDxilResource(DM.GetCBuffers(), DM.GetCtx(), bindingSpace);
+  const bool bSamplers =
+      AllocateDxilResource(DM.GetSamplers(), DM.GetCtx(), bindingSpace);
+  const bool bUAVs =
+      AllocateDxilResource(DM.GetUAVs(), DM.GetCtx(), bindingSpace);
+  const bool bSRVs =
+      AllocateDxilResource(DM.GetSRVs(), DM.GetCtx(), bindingSpace);
+  return bCBuffers || bSamplers || bUAVs || bSRVs;
+}
+} // namespace
+
 class DxilCondenseResources : public ModulePass {
 private:
   RemapEntryCollection m_rewrites;
@@ -64,16 +206,15 @@ public:
 
   bool runOnModule(Module &M) override {
     DxilModule &DM = M.GetOrCreateDxilModule();
-
-    // Switch tbuffers to SRVs, as they have been treated as cbuffers up to this point.
-    if (DM.GetCBuffers().size())
-      PatchTBuffers(DM);
+    // Skip lib.
+    if (DM.GetShaderModel()->IsLib())
+      return false;
 
     // Remove unused resource.
     DM.RemoveUnusedResources();
 
     // Make sure all resource types are dense; build a map of rewrites.
-    if (BuildRewriteMap(DM)) {
+    if (BuildRewriteMap(m_rewrites, DM)) {
       // Rewrite all instructions that refer to resources in the map.
       ApplyRewriteMap(DM);
     }
@@ -85,16 +226,11 @@ public:
       if (!DM.GetShaderModel()->IsLib()) {
         AllocateDxilResources(DM);
         PatchCreateHandle(DM);
-      } else {
-        PatchCreateHandleForLib(DM);
       }
     }
     return true;
   }
 
-  // Build m_rewrites, returns 'true' if any rewrites are needed.
-  bool BuildRewriteMap(DxilModule &DM);
-
   DxilResourceBase &GetFirstRewrite() const {
     DXASSERT_NOMSG(!m_rewrites.empty());
     return *m_rewrites.begin()->second.Resource;
@@ -102,13 +238,8 @@ public:
 
 private:
   void ApplyRewriteMap(DxilModule &DM);
-  void AllocateDxilResources(DxilModule &DM);
   // Add lowbound to create handle range index.
   void PatchCreateHandle(DxilModule &DM);
-  // Add lowbound to create handle range index for library.
-  void PatchCreateHandleForLib(DxilModule &DM);
-  // Switch CBuffer for SRV for TBuffers.
-  void PatchTBuffers(DxilModule &DM);
 };
 
 void DxilCondenseResources::ApplyRewriteMap(DxilModule &DM) {
@@ -139,111 +270,11 @@ void DxilCondenseResources::ApplyRewriteMap(DxilModule &DM) {
     }
   }
 
-  for (auto &entry : m_rewrites) {
-    entry.second.Resource->SetID(entry.second.Index);
-  }
-}
-
-template <typename TResource>
-static void BuildRewrites(const std::vector<std::unique_ptr<TResource>> &Rs,
-                          RemapEntryCollection &C) {
-  const unsigned s = (unsigned)Rs.size();
-  for (unsigned i = 0; i < s; ++i) {
-    const std::unique_ptr<TResource> &R = Rs[i];
-    if (R->GetID() != i) {
-      ResourceID RId = {R->GetClass(), R->GetID()};
-      RemapEntry RE = {RId, R.get(), i};
-      C[RId] = RE;
-    }
-  }
-}
-
-bool DxilCondenseResources::BuildRewriteMap(DxilModule &DM) {
-  BuildRewrites(DM.GetCBuffers(), m_rewrites);
-  BuildRewrites(DM.GetSRVs(), m_rewrites);
-  BuildRewrites(DM.GetUAVs(), m_rewrites);
-  BuildRewrites(DM.GetSamplers(), m_rewrites);
-
-  return !m_rewrites.empty();
+  ApplyRewriteMapOnResTable(m_rewrites, DM);
 }
 
 namespace {
 
-template<typename T>
-static void AllocateDxilResource(const std::vector<std::unique_ptr<T> > &resourceList, LLVMContext &Ctx) {
-  SpacesAllocator<unsigned, T> SAlloc;
-
-  for (auto &res : resourceList) {
-    const unsigned space = res->GetSpaceID();
-    typename SpacesAllocator<unsigned, T>::Allocator &alloc = SAlloc.Get(space);
-
-    if (res->IsAllocated()) {
-      const unsigned reg = res->GetLowerBound();
-      const T *conflict = nullptr;
-      if (res->IsUnbounded()) {
-        const T *unbounded = alloc.GetUnbounded();
-        if (unbounded) {
-          Ctx.emitError(
-            Twine("more than one unbounded resource (") +
-            unbounded->GetGlobalName() +
-            (" and ") + res->GetGlobalName() +
-            (") in space ") + Twine(space));
-        } else {
-          conflict = alloc.Insert(res.get(), reg, res->GetUpperBound());
-          if (!conflict)
-            alloc.SetUnbounded(res.get());
-        }
-      } else {
-        conflict = alloc.Insert(res.get(), reg, res->GetUpperBound());
-      }
-      if (conflict) {
-        Ctx.emitError(
-          ((res->IsUnbounded()) ? Twine("unbounded ") : Twine("")) +
-          Twine("resource ") + res->GetGlobalName() +
-          Twine(" at register ") + Twine(reg) +
-          Twine(" overlaps with resource ") + conflict->GetGlobalName() +
-          Twine(" at register ") + Twine(conflict->GetLowerBound()) +
-          Twine(", space ") + Twine(space));
-      }
-    }
-  }
-
-  // Allocate.
-  const unsigned space = 0;
-  typename SpacesAllocator<unsigned, T>::Allocator &alloc0 = SAlloc.Get(space);
-  for (auto &res : resourceList) {
-    if (!res->IsAllocated()) {
-      DXASSERT(res->GetSpaceID() == 0, "otherwise non-zero space has no user register assignment");
-      unsigned reg = 0;
-      bool success = false;
-      if (res->IsUnbounded()) {
-        const T *unbounded = alloc0.GetUnbounded();
-        if (unbounded) {
-          Ctx.emitError(
-            Twine("more than one unbounded resource (") +
-            unbounded->GetGlobalName() +
-            Twine(" and ") + res->GetGlobalName() +
-            Twine(") in space ") + Twine(space));
-        } else {
-          success = alloc0.AllocateUnbounded(res.get(), reg);
-          if (success)
-            alloc0.SetUnbounded(res.get());
-        }
-      } else {
-        success = alloc0.Allocate(res.get(), res->GetRangeSize(), reg);
-      }
-      if (success) {
-        res->SetLowerBound(reg);
-      } else {
-        Ctx.emitError(
-          ((res->IsUnbounded()) ? Twine("unbounded ") : Twine("")) +
-          Twine("resource ") + res->GetGlobalName() +
-          Twine(" could not be allocated"));
-      }
-    }
-  }
-}
-
 void PatchLowerBoundOfCreateHandle(CallInst *handle, DxilModule &DM) {
   DxilInst_CreateHandle createHandle(handle);
   DXASSERT_NOMSG(createHandle);
@@ -397,56 +428,6 @@ static void PatchTBufferCreateHandle(CallInst *handle, DxilModule &DM, std::unor
 
 }
 
-
-void DxilCondenseResources::AllocateDxilResources(DxilModule &DM) {
-  AllocateDxilResource(DM.GetCBuffers(), DM.GetCtx());
-  AllocateDxilResource(DM.GetSamplers(), DM.GetCtx());
-  AllocateDxilResource(DM.GetUAVs(), DM.GetCtx());
-  AllocateDxilResource(DM.GetSRVs(), DM.GetCtx());
-}
-
-void InitTBuffer(const DxilCBuffer *pSource, DxilResource *pDest) {
-  pDest->SetKind(pSource->GetKind());
-  pDest->SetCompType(DXIL::ComponentType::U32);
-  pDest->SetSampleCount(0);
-  pDest->SetElementStride(0);
-  pDest->SetGloballyCoherent(false);
-  pDest->SetHasCounter(false);
-  pDest->SetRW(false);
-  pDest->SetROV(false);
-  pDest->SetID(pSource->GetID());
-  pDest->SetSpaceID(pSource->GetSpaceID());
-  pDest->SetLowerBound(pSource->GetLowerBound());
-  pDest->SetRangeSize(pSource->GetRangeSize());
-  pDest->SetGlobalSymbol(pSource->GetGlobalSymbol());
-  pDest->SetGlobalName(pSource->GetGlobalName());
-  pDest->SetHandle(pSource->GetHandle());
-}
-
-void DxilCondenseResources::PatchTBuffers(DxilModule &DM) {
-  Function *createHandle = DM.GetOP()->GetOpFunc(DXIL::OpCode::CreateHandle,
-                                                 Type::getVoidTy(DM.GetCtx()));
-
-  std::unordered_set<unsigned> tbufferIDs;
-  for (User *U : createHandle->users()) {
-    PatchTBufferCreateHandle(cast<CallInst>(U), DM, tbufferIDs);
-  }
-
-  // move tbuffer resources to SRVs
-  unsigned offset = DM.GetSRVs().size();
-  for (auto it = DM.GetCBuffers().begin(); it != DM.GetCBuffers().end(); it++) {
-    DxilCBuffer *CB = it->get();
-    unsigned resID = CB->GetID();
-    if (tbufferIDs.find(resID) != tbufferIDs.end()) {
-      auto srv = make_unique<DxilResource>();
-      InitTBuffer(CB, srv.get());
-      srv->SetID(resID + offset);
-      DM.AddSRV(std::move(srv));
-      // cbuffer should get cleaned up since it's now unused.
-    }
-  }
-}
-
 void DxilCondenseResources::PatchCreateHandle(DxilModule &DM) {
   Function *createHandle = DM.GetOP()->GetOpFunc(DXIL::OpCode::CreateHandle,
                                                  Type::getVoidTy(DM.GetCtx()));
@@ -456,107 +437,13 @@ void DxilCondenseResources::PatchCreateHandle(DxilModule &DM) {
   }
 }
 
-static Value *PatchRangeIDForLib(DxilModule &DM, IRBuilder<> &Builder,
-                                 Value *rangeIdVal,
-                                 std::unordered_map<PHINode *, Value *> &phiMap,
-                                 DXIL::ResourceClass ResClass) {
-  Value *linkRangeID = nullptr;
-  if (isa<ConstantInt>(rangeIdVal)) {
-    unsigned rangeId = cast<ConstantInt>(rangeIdVal)->getLimitedValue();
-
-    const DxilModule::ResourceLinkInfo &linkInfo =
-        DM.GetResourceLinkInfo(ResClass, rangeId);
-    linkRangeID = Builder.CreateLoad(linkInfo.ResRangeID);
-  } else {
-    if (PHINode *phi = dyn_cast<PHINode>(rangeIdVal)) {
-      auto it = phiMap.find(phi);
-      if (it == phiMap.end()) {
-        unsigned numOperands = phi->getNumOperands();
-
-        PHINode *phiRangeID = Builder.CreatePHI(phi->getType(), numOperands);
-        phiMap[phi] = phiRangeID;
-
-        std::vector<Value *> rangeIDs(numOperands);
-        for (unsigned i = 0; i < numOperands; i++) {
-          Value *V = phi->getOperand(i);
-          BasicBlock *BB = phi->getIncomingBlock(i);
-          IRBuilder<> Builder(BB->getTerminator());
-          rangeIDs[i] = PatchRangeIDForLib(DM, Builder, V, phiMap, ResClass);
-        }
-
-        for (unsigned i = 0; i < numOperands; i++) {
-          Value *V = rangeIDs[i];
-          BasicBlock *BB = phi->getIncomingBlock(i);
-          phiRangeID->addIncoming(V, BB);
-        }
-        linkRangeID = phiRangeID;
-      } else {
-        linkRangeID = it->second;
-      }
-    } else if (SelectInst *si = dyn_cast<SelectInst>(rangeIdVal)) {
-      IRBuilder<> Builder(si);
-      Value *trueVal =
-          PatchRangeIDForLib(DM, Builder, si->getTrueValue(), phiMap, ResClass);
-      Value *falseVal = PatchRangeIDForLib(DM, Builder, si->getFalseValue(),
-                                           phiMap, ResClass);
-      linkRangeID = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
-    } else if (CastInst *cast = dyn_cast<CastInst>(rangeIdVal)) {
-      if (cast->getOpcode() == CastInst::CastOps::ZExt &&
-          cast->getOperand(0)->getType() == Type::getInt1Ty(DM.GetCtx())) {
-        // select cond, 1, 0.
-        IRBuilder<> Builder(cast);
-        Value *trueVal = PatchRangeIDForLib(
-            DM, Builder, ConstantInt::get(cast->getType(), 1), phiMap,
-            ResClass);
-        Value *falseVal = PatchRangeIDForLib(
-            DM, Builder, ConstantInt::get(cast->getType(), 0), phiMap,
-            ResClass);
-        linkRangeID =
-            Builder.CreateSelect(cast->getOperand(0), trueVal, falseVal);
-      }
-    }
-  }
-  return linkRangeID;
-}
-
-void DxilCondenseResources::PatchCreateHandleForLib(DxilModule &DM) {
-  Function *createHandle = DM.GetOP()->GetOpFunc(DXIL::OpCode::CreateHandle,
-                                                 Type::getVoidTy(DM.GetCtx()));
-  DM.CreateResourceLinkInfo();
-  for (User *U : createHandle->users()) {
-    CallInst *handle = cast<CallInst>(U);
-    DxilInst_CreateHandle createHandle(handle);
-    DXASSERT_NOMSG(createHandle);
-
-    DXIL::ResourceClass ResClass =
-        static_cast<DXIL::ResourceClass>(createHandle.get_resourceClass_val());
-
-    std::unordered_map<PHINode *, Value*> phiMap;
-    Value *rangeID = createHandle.get_rangeId();
-    IRBuilder<> Builder(handle);
-    Value *linkRangeID = PatchRangeIDForLib(
-        DM, Builder, rangeID, phiMap, ResClass);
-
-    // Dynamic rangeId is not supported - skip and let validation report the
-    // error.
-    if (!linkRangeID)
-      continue;
-    // Update rangeID to linkinfo rangeID.
-    handle->setArgOperand(DXIL::OperandIndex::kCreateHandleResIDOpIdx,
-                          linkRangeID);
-    if (rangeID->user_empty() && isa<Instruction>(rangeID)) {
-      cast<Instruction>(rangeID)->eraseFromParent();
-    }
-  }
-}
-
 char DxilCondenseResources::ID = 0;
 
 bool llvm::AreDxilResourcesDense(llvm::Module *M, hlsl::DxilResourceBase **ppNonDense) {
   DxilModule &DM = M->GetOrCreateDxilModule();
-  DxilCondenseResources Pass;
-  if (Pass.BuildRewriteMap(DM)) {
-    *ppNonDense = &Pass.GetFirstRewrite();
+  RemapEntryCollection rewrites;
+  if (BuildRewriteMap(rewrites, DM)) {
+    *ppNonDense = rewrites.begin()->second.Resource;
     return false;
   }
   else {
@@ -570,3 +457,1696 @@ ModulePass *llvm::createDxilCondenseResourcesPass() {
 }
 
 INITIALIZE_PASS(DxilCondenseResources, "hlsl-dxil-condense", "DXIL Condense Resources", false, false)
+
+namespace {
+// Module pass that allocates DXIL resources and, for non-library targets,
+// lowers createHandleForLib resource uses into resource handles.
+class DxilLowerCreateHandleForLib : public ModulePass {
+private:
+  RemapEntryCollection m_rewrites;
+  // All state below is (re)assigned in runOnModule; the default values only
+  // guarantee that no member is ever read uninitialized.
+  DxilModule *m_DM = nullptr;
+  bool m_HasDbgInfo = false;
+  bool m_bIsLib = false;
+  bool m_bLegalizationFailed = false;
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilLowerCreateHandleForLib() : ModulePass(ID) {}
+
+  const char *getPassName() const override {
+    return "DXIL Lower createHandleForLib";
+  }
+
+  // Returns true when the module was changed.
+  bool runOnModule(Module &M) override {
+    DxilModule &DM = M.GetOrCreateDxilModule();
+    m_DM = &DM;
+    // Clear llvm used to remove unused resource.
+    m_DM->ClearLLVMUsed();
+    m_bIsLib = DM.GetShaderModel()->IsLib();
+    m_bLegalizationFailed = false;
+
+    bool bChanged = false;
+    unsigned numResources = DM.GetCBuffers().size() + DM.GetUAVs().size() +
+                            DM.GetSRVs().size() + DM.GetSamplers().size();
+
+    // Nothing to allocate or lower.
+    if (!numResources)
+      return false;
+
+    // Switch tbuffers to SRVs, as they have been treated as cbuffers up to this
+    // point.
+    if (DM.GetCBuffers().size())
+      bChanged = PatchTBuffers(DM) || bChanged;
+
+    // Remove unused resource.
+    DM.RemoveUnusedResourceSymbols();
+
+    // A change in the resource count means symbols were removed above.
+    unsigned newResources = DM.GetCBuffers().size() + DM.GetUAVs().size() +
+                            DM.GetSRVs().size() + DM.GetSamplers().size();
+    bChanged = bChanged || (numResources != newResources);
+
+    if (0 == newResources)
+      return bChanged;
+
+    bChanged |= AllocateDxilResources(DM);
+
+    // Offline library targets stop after resource allocation.
+    if (m_bIsLib && DM.GetShaderModel()->GetMinor() == ShaderModel::kOfflineMinor)
+      return bChanged;
+
+    // Make sure no select on resource.
+    bChanged |= RemovePhiOnResource();
+
+    // Libraries keep createHandleForLib; also stop if legalization failed.
+    if (m_bIsLib || m_bLegalizationFailed)
+      return bChanged;
+
+    // Everything below rewrites the module unconditionally.
+    bChanged = true;
+
+    // Load up debug information, to cross-reference values and the instructions
+    // used to load them.
+    m_HasDbgInfo = getDebugMetadataVersionFromModule(M) != 0;
+
+    GenerateDxilResourceHandles();
+
+    if (DM.GetOP()->UseMinPrecision())
+      UpdateStructTypeForLegacyLayout();
+    // Change resource symbol into undef.
+    UpdateResourceSymbols();
+
+    // Remove unused createHandleForLib functions.
+    dxilutil::RemoveUnusedFunctions(M, DM.GetEntryFunction(),
+                                    DM.GetPatchConstantFunction(), m_bIsLib);
+
+    return bChanged;
+  }
+
+private:
+  bool RemovePhiOnResource();
+  void UpdateResourceSymbols();
+  void TranslateDxilResourceUses(DxilResourceBase &res);
+  void GenerateDxilResourceHandles();
+  void UpdateStructTypeForLegacyLayout();
+  // Switch CBuffer for SRV for TBuffers.
+  bool PatchTBuffers(DxilModule &DM);
+  void PatchTBufferUse(Value *V, DxilModule &DM);
+};
+
+} // namespace
+
+// Phi on resource.
+namespace {
+
+typedef std::unordered_map<Value*, Value*> ValueToValueMap;
+typedef llvm::SetVector<Value*> ValueSetVector;
+typedef llvm::SmallVector<Value*, 4> IndexVector;
+typedef std::unordered_map<Value*, IndexVector> ValueToIdxMap;
+
+//#define SUPPORT_SELECT_ON_ALLOCA
+
+// Errors:
+// Collects and reports resource-use legalization errors, de-duplicated per
+// (error code, value) pair.
+class ResourceUseErrors
+{
+  bool m_bErrorsReported;
+public:
+  ResourceUseErrors() : m_bErrorsReported(false) {}
+
+  enum ErrorCode {
+    // Collision between use of one resource GV and another.
+    // All uses must be guaranteed to resolve to only one GV.
+    // Additionally, when writing resource to alloca, all uses
+    // of that alloca are considered resolving to a single GV.
+    GVConflicts,
+
+    // static global resources are disallowed for libraries at this time.
+    // for non-library targets, they should have been eliminated already.
+    StaticGVUsed,
+
+    // user function calls with resource params or return type
+    // are currently disallowed for libraries.
+    UserCallsWithResources,
+
+    // When searching up from store pointer looking for alloca,
+    // we encountered an unexpected value type
+    UnexpectedValuesFromStorePointer,
+
+    // When remapping values to be replaced, we add them to RemappedValues
+    // so we don't use dead values stored in other sets/maps.  Circular
+    // remaps that should not happen are added to RemappingCyclesDetected.
+    RemappingCyclesDetected,
+
+    // Without SUPPORT_SELECT_ON_ALLOCA, phi/select on alloca based
+    // pointer is disallowed, since this scenario is still untested.
+    // This error also covers any other unknown alloca pointer uses.
+    // Supported:
+    // alloca (-> gep)? -> load -> ...
+    // alloca (-> gep)? -> store.
+    // Unsupported without SUPPORT_SELECT_ON_ALLOCA:
+    // alloca (-> gep)? -> phi/select -> ...
+    AllocaUserDisallowed,
+
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+    // Conflict in select/phi between GV pointer and alloca pointer.  This
+    // algorithm can't handle this case.
+    AllocaSelectConflict,
+#endif
+
+    ErrorCodeCount
+  };
+
+  // Message text, indexed by ErrorCode; order must match the enum above.
+  const StringRef ErrorText[ErrorCodeCount] = {
+    "local resource not guaranteed to map to unique global resource.",
+    "static global resource use is disallowed for library functions.",
+    "exported library functions cannot have resource parameters or return value.",
+    "internal error: unexpected instruction type when looking for alloca from store.",
+    "internal error: cycles detected in value remapping.",
+    "phi/select disallowed on pointers to local resources."
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+    ,"unable to resolve merge of global and local resource pointers."
+#endif
+  };
+
+  // Values already reported, one set per error code.
+  ValueSetVector ErrorSets[ErrorCodeCount];
+
+  // Ultimately, the goal of ErrorUsers is to mark all create handles
+  // so we don't try to report errors on them again later.
+  std::unordered_set<Value*> ErrorUsers;  // users of error values
+
+  // Mark V, and transitively the values reachable from it, as tainted by an
+  // error.  Returns false if V was already marked.
+  bool AddErrorUsers(Value* V) {
+    auto it = ErrorUsers.insert(V);
+    if (!it.second)
+      return false;   // already there
+    if (isa<GEPOperator>(V) ||
+        isa<LoadInst>(V) ||
+        isa<PHINode>(V) ||
+        isa<SelectInst>(V) ||
+        isa<AllocaInst>(V)) {
+      for (auto U : V->users()) {
+        AddErrorUsers(U);
+      }
+    } else if(isa<StoreInst>(V)) {
+      // A store taints the location it writes to.
+      AddErrorUsers(cast<StoreInst>(V)->getPointerOperand());
+    }
+    // create handle will be marked, but users not followed
+    return true;
+  }
+
+  // Report error ec on V once.  Instructions are reported through
+  // dxilutil::EmitErrorOnInstruction; other values through the LLVMContext
+  // with the value name appended.
+  void ReportError(ErrorCode ec, Value* V) {
+    DXASSERT_NOMSG(ec < ErrorCodeCount);
+    if (!ErrorSets[ec].insert(V))
+      return;   // Error already reported
+    AddErrorUsers(V);
+    m_bErrorsReported = true;
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      dxilutil::EmitErrorOnInstruction(I, ErrorText[ec]);
+    } else {
+      StringRef Name = V->getName();
+      std::string escName;
+      if (isa<Function>(V)) {
+        llvm::raw_string_ostream os(escName);
+        dxilutil::PrintEscapedString(Name, os);
+        os.flush();
+        Name = escName;
+      }
+      // Build the Twine inside the call expression: a Twine only references
+      // its operands, so storing a concatenation in a local would leave it
+      // pointing at destroyed temporaries.
+      V->getContext().emitError(Twine(ErrorText[ec]) + " Value: " + Name);
+    }
+  }
+
+  // True once any error has been reported.
+  bool ErrorsReported() const {
+    return m_bErrorsReported;
+  }
+};
+
+// Count the array nesting depth of Ty, looking through one pointer level.
+// When dims is non-null it is cleared and then filled with the element count
+// of each array level, outermost first.
+unsigned CountArrayDimensions(Type* Ty,
+    SmallVector<unsigned, 4> *dims = nullptr) {
+  Type *Cur = Ty->isPointerTy() ? Ty->getPointerElementType() : Ty;
+  if (dims)
+    dims->clear();
+  unsigned depth = 0;
+  for (; Cur->isArrayTy(); Cur = Cur->getArrayElementType(), ++depth) {
+    if (dims)
+      dims->push_back(Cur->getArrayNumElements());
+  }
+  return depth;
+}
+
+// Helper class for legalizing resource use
+// Convert select/phi on resources to select/phi on index to GEP on GV.
+// Convert resource alloca to index alloca.
+// Assumes createHandleForLib has no select/phi
+class LegalizeResourceUseHelper {
+  // Change:
+  //  gep1 = GEP gRes, i1
+  //  res1 = load gep1
+  //  gep2 = GEP gRes, i2
+  //  gep3 = GEP gRes, i3
+  //  gep4 = phi gep2, gep3           <-- handle select/phi on GEP
+  //  res4 = load gep4
+  //  res5 = phi res1, res4
+  //  res6 = load GEP gRes, 23        <-- handle constant GepExpression
+  //  res = select cnd2, res5, res6
+  //  handle = createHandleForLib(res)
+  // To:
+  //  i4 = phi i2, i3
+  //  i5 = phi i1, i4
+  //  i6 = select cnd, i5, 23
+  //  gep = GEP gRes, i6
+  //  res = load gep
+  //  handle = createHandleForLib(res)
+
+  // Also handles alloca
+  //  resArray = alloca [2 x Resource]
+  //  gep1 = GEP gRes, i1
+  //  res1 = load gep1
+  //  gep2 = GEP gRes, i2
+  //  gep3 = GEP gRes, i3
+  //  phi4 = phi gep2, gep3
+  //  res4 = load phi4
+  //  gep5 = GEP resArray, 0
+  //  gep6 = GEP resArray, 1
+  //  store gep5, res1
+  //  store gep6, res4
+  //  gep7 = GEP resArray, i7   <-- dynamically index array
+  //  res = load gep7
+  //  handle = createHandleForLib(res)
+  // Desired result:
+  //  idxArray = alloca [2 x i32]
+  //  phi4 = phi i2, i3
+  //  gep5 = GEP idxArray, 0
+  //  gep6 = GEP idxArray, 1
+  //  store gep5, i1
+  //  store gep6, phi4
+  //  gep7 = GEP idxArray, i7
+  //  gep8 = GEP gRes, gep7
+  //  res = load gep8
+  //  handle = createHandleForLib(res)
+
+  // Also handles multi-dim resource index and multi-dim resource array allocas
+
+  // Basic algorithm:
+  // - recursively mark each GV user with GV (ValueToResourceGV)
+  //  - verify only one GV used for any given value
+  // - handle allocas by searching up from store for alloca
+  //  - then recursively mark alloca users
+  // - ResToIdxReplacement keeps track of vector of indices that
+  //   will be used to replace a given resource value or pointer
+  // - Next, create selects/phis for indices corresponding to
+  //   selects/phis on resource pointers or values.
+  //  - leave incoming index values undef for now
+  // - Create index allocas to replace resource allocas
+  // - Create GEPs on index allocas to replace GEPs on resource allocas
+  // - Create index loads on index allocas to replace loads on resource alloca GEP
+  // - Fill in replacements for GEPs on resource GVs
+  //  - copy replacement index vectors to corresponding loads
+  // - Create index stores to replace resource stores to alloca/GEPs
+  // - Update selects/phis incoming index values
+  // - SimplifyMerges: replace index phis/selects on same value with that value
+  //  - RemappedValues[phi/select] set to replacement value
+  //  - use LookupValue from now on when reading from ResToIdxReplacement
+  // - Update handles by replacing load/GEP chains that go through select/phi
+  //   with direct GV GEP + load, with select/phi on GEP indices instead.
+
+public:
+  ResourceUseErrors m_Errors;
+
+  ValueToValueMap ValueToResourceGV;
+  ValueToIdxMap ResToIdxReplacement;
+  // Value sets we can use to iterate
+  ValueSetVector Selects, GEPs, Stores, Handles;
+  ValueSetVector Allocas, AllocaGEPs, AllocaLoads;
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+  ValueSetVector AllocaSelects;
+#endif
+
+  std::unordered_set<Value *> NonUniformSet;
+
+  // New index selects created by pass, so we can try simplifying later
+  ValueSetVector NewSelects;
+
+  // Values that have been replaced with other values need remapping
+  ValueToValueMap RemappedValues;
+
+  // Things to clean up if no users:
+  std::unordered_set<Instruction*> CleanupInsts;
+
+  // Return the resource global variable recorded for V in ValueToResourceGV,
+  // or nullptr when V has no entry.
+  GlobalVariable *LookupResourceGV(Value *V) {
+    auto found = ValueToResourceGV.find(V);
+    return found != ValueToResourceGV.end()
+               ? cast<GlobalVariable>(found->second)
+               : nullptr;
+  }
+
+  // Resolve V through the RemappedValues chain and return the final value;
+  // a value with no remapping is returned unchanged.  A cycle in the chain is
+  // reported as RemappingCyclesDetected and stops the walk.
+  Value *LookupValue(Value *V) {
+    SmallPtrSet<Value*, 4> seen;
+    for (auto found = RemappedValues.find(V); found != RemappedValues.end();) {
+      Value *next = found->second;
+      // Cycles should not happen, but are bad if they do.
+      if (seen.count(next)) {
+        DXASSERT(false, "otherwise, circular remapping");
+        m_Errors.ReportError(ResourceUseErrors::RemappingCyclesDetected, V);
+        break;
+      }
+      V = next;
+      found = RemappedValues.find(V);
+      // Only intermediate links (those that are remapped again) are recorded.
+      if (found != RemappedValues.end())
+        seen.insert(V);
+    }
+    return V;
+  }
+
+  // Returns true when every user of LI is a call to a DXIL op function of
+  // class CreateHandleForLib (vacuously true when LI has no users).
+  bool AreLoadUsersTrivial(LoadInst *LI) {
+    for (auto U : LI->users()) {
+      if (CallInst *CI = dyn_cast<CallInst>(U)) {
+        Function *F = CI->getCalledFunction();
+        // getCalledFunction() is null for an indirect call; that can never be
+        // a DXIL op, so the load is not trivial.
+        if (!F)
+          return false;
+        DxilModule &DM = F->getParent()->GetDxilModule();
+        hlsl::OP *hlslOP = DM.GetOP();
+        if (hlslOP->IsDxilOpFunc(F)) {
+          hlsl::OP::OpCodeClass opClass;
+          if (hlslOP->GetOpCodeClass(F, opClass) &&
+            opClass == DXIL::OpCodeClass::CreateHandleForLib) {
+            continue;
+          }
+        }
+      }
+      // Any other user means extra work is needed for this load.
+      return false;
+    }
+    return true;
+  }
+
+  // This is used to quickly skip the common case where no work is needed.
+  // Returns true when GEP (if it has a recorded resource GV) indexes that GV
+  // directly, and every user of GEP is a load for which AreLoadUsersTrivial
+  // holds.
+  bool AreGEPUsersTrivial(GEPOperator *GEP) {
+    if (GlobalVariable *GV = LookupResourceGV(GEP)) {
+      // Reuse the lookup result instead of querying the map a second time.
+      if (GEP->getPointerOperand() != GV)
+        return false;
+    }
+    for (auto U : GEP->users()) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+        if (AreLoadUsersTrivial(LI))
+          continue;
+      }
+      return false;
+    }
+    return true;
+  }
+
+  // AssignResourceGVFromStore is used on pointer being stored to.
+  // Follow GEP/Phi/Select up to Alloca, then CollectResourceGVUsers on Alloca.
+  // Phi/Select are only followed with SUPPORT_SELECT_ON_ALLOCA; otherwise they
+  // are reported as AllocaUserDisallowed.  Any other pointer source is
+  // reported through m_Errors.
+  //   GV          - resource global the stored value resolves to
+  //   V           - pointer (or pointer ancestor) being stored to
+  //   visited     - values already walked during this search
+  //   bNonUniform - forwarded to CollectResourceGVUsers
+  void AssignResourceGVFromStore(GlobalVariable *GV, Value *V,
+                                 SmallPtrSet<Value*, 4> &visited,
+                                 bool bNonUniform) {
+    // Prevent cycles as we search up: record V, and stop if it was already
+    // walked (the set must actually be populated for this guard to work).
+    if (!visited.insert(V).second)
+      return;
+    // Verify and skip if already processed
+    auto it = ValueToResourceGV.find(V);
+    if (it != ValueToResourceGV.end()) {
+      if (it->second != GV) {
+        m_Errors.ReportError(ResourceUseErrors::GVConflicts, V);
+      }
+      return;
+    }
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+      CollectResourceGVUsers(GV, AI, /*bAlloca*/true, bNonUniform);
+      return;
+    } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      // follow the pointer up
+      AssignResourceGVFromStore(GV, GEP->getPointerOperand(), visited, bNonUniform);
+      return;
+    } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+      // follow all incoming values
+      for (auto it : Phi->operand_values())
+        AssignResourceGVFromStore(GV, it, visited, bNonUniform);
+#else
+      m_Errors.ReportError(ResourceUseErrors::AllocaUserDisallowed, V);
+#endif
+      return;
+    } else if (SelectInst *Sel = dyn_cast<SelectInst>(V)) {
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+      // follow all incoming values
+      AssignResourceGVFromStore(GV, Sel->getTrueValue(), visited, bNonUniform);
+      AssignResourceGVFromStore(GV, Sel->getFalseValue(), visited, bNonUniform);
+#else
+      m_Errors.ReportError(ResourceUseErrors::AllocaUserDisallowed, V);
+#endif
+      return;
+    } else if (isa<GlobalVariable>(V) &&
+               cast<GlobalVariable>(V)->getLinkage() ==
+                    GlobalVariable::LinkageTypes::InternalLinkage) {
+      // this is writing to global static, which is disallowed at this point.
+      m_Errors.ReportError(ResourceUseErrors::StaticGVUsed, V);
+      return;
+    } else {
+      // Most likely storing to output parameter
+      m_Errors.ReportError(ResourceUseErrors::UserCallsWithResources, V);
+      return;
+    }
+  }
+
+  // Recursively mark values with GV, following users.
+  // Starting value V should be GV itself.
+  // Each visited value is recorded in ValueToResourceGV and sorted into the
+  // matching work set (GEPs, Selects, Stores, Handles, Allocas, ...).
+  // A value already tagged with a different GV is reported as GVConflicts.
+  //   bAlloca     - V was reached through a resource alloca
+  //   bNonUniform - a GEP marked non-uniform was crossed on the way to V
+  void CollectResourceGVUsers(GlobalVariable *GV, Value *V, bool bAlloca = false, bool bNonUniform = false) {
+    // Recursively tag value V and its users as using GV.
+    auto it = ValueToResourceGV.find(V);
+    if (it != ValueToResourceGV.end()) {
+      if (it->second != GV) {
+        m_Errors.ReportError(ResourceUseErrors::GVConflicts, V);
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+      } else {
+        // if select/phi, make sure bAlloca is consistent
+        if (isa<PHINode>(V) || isa<SelectInst>(V))
+          if ((bAlloca && AllocaSelects.count(V) == 0) ||
+              (!bAlloca && Selects.count(V) == 0))
+            m_Errors.ReportError(ResourceUseErrors::AllocaSelectConflict, V);
+#endif
+      }
+      return;
+    }
+    ValueToResourceGV[V] = GV;
+    if (GV == V) {
+      // Just add and recurse users
+      // make sure bAlloca is clear for users
+      bAlloca = false;
+    } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      if (bAlloca)
+        AllocaGEPs.insert(GEP);
+      else if (!AreGEPUsersTrivial(GEP))
+        GEPs.insert(GEP);
+      else
+        return; // Optimization: skip trivial GV->GEP->load->createHandle
+      if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP)) {
+        if (DxilMDHelper::IsMarkedNonUniform(GEPInst))
+          bNonUniform = true;
+      }
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+      if (bAlloca)
+        AllocaLoads.insert(LI);
+      // clear bAlloca for users
+      bAlloca = false;
+      if (bNonUniform)
+        NonUniformSet.insert(LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(V)) {
+      Stores.insert(SI);
+      if (!bAlloca) {
+        // Find and mark allocas this store could be storing to
+        SmallPtrSet<Value*, 4> visited;
+        AssignResourceGVFromStore(GV, SI->getPointerOperand(), visited, bNonUniform);
+      }
+      return;
+    } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+      if (bAlloca) {
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+        AllocaSelects.insert(Phi);
+#else
+        m_Errors.ReportError(ResourceUseErrors::AllocaUserDisallowed, V);
+#endif
+      } else {
+        Selects.insert(Phi);
+      }
+    } else if (SelectInst *Sel = dyn_cast<SelectInst>(V)) {
+      if (bAlloca) {
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+        AllocaSelects.insert(Sel);
+#else
+        m_Errors.ReportError(ResourceUseErrors::AllocaUserDisallowed, V);
+#endif
+      } else {
+        Selects.insert(Sel);
+      }
+    } else if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+      Allocas.insert(AI);
+      // set bAlloca for users
+      bAlloca = true;
+    } else if (isa<Constant>(V)) {
+      // skip @llvm.used entry
+      return;
+    } else if (bAlloca) {
+      m_Errors.ReportError(ResourceUseErrors::AllocaUserDisallowed, V);
+    } else {
+      // Must be createHandleForLib or user function call.
+      CallInst *CI = cast<CallInst>(V);
+      Function *F = CI->getCalledFunction();
+      DxilModule &DM = GV->getParent()->GetDxilModule();
+      hlsl::OP *hlslOP = DM.GetOP();
+      // F is null for an indirect call; that is never a DXIL op, so fall
+      // through to the user-call error instead of querying a null function.
+      if (F && hlslOP->IsDxilOpFunc(F)) {
+        hlsl::OP::OpCodeClass opClass;
+        if (hlslOP->GetOpCodeClass(F, opClass) &&
+            opClass == DXIL::OpCodeClass::CreateHandleForLib) {
+          Handles.insert(CI);
+          if (bNonUniform)
+            NonUniformSet.insert(CI);
+          return;
+        }
+      }
+      // This could be user call with resource param, which is disallowed for lib_6_3
+      m_Errors.ReportError(ResourceUseErrors::UserCallsWithResources, V);
+      return;
+    }
+
+    // Recurse users
+    for (auto U : V->users())
+      CollectResourceGVUsers(GV, U, bAlloca, bNonUniform);
+  }
+
+  // Drop a conflicting value from whichever work set tracks it, so that only
+  // the non-conflicting remainder is transformed.  When the value was actually
+  // tracked, its users are dropped as well.
+  void RemoveConflictingValue(Value* V) {
+    // Handles are terminal: remove and stop.
+    if (isa<CallInst>(V)) {
+      Handles.remove(V);
+      return; // don't recurse
+    }
+
+    bool bWasTracked = false;
+    if (isa<GEPOperator>(V)) {
+      // Only fall back to AllocaGEPs when V was not in GEPs.
+      bWasTracked = GEPs.remove(V);
+      if (!bWasTracked)
+        bWasTracked = AllocaGEPs.remove(V);
+    } else if (isa<LoadInst>(V)) {
+      bWasTracked = AllocaLoads.remove(V);
+    } else if (isa<StoreInst>(V)) {
+      bWasTracked = Stores.remove(V);
+    } else if (isa<PHINode>(V) || isa<SelectInst>(V)) {
+      bWasTracked = Selects.remove(V);
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+      bWasTracked |= AllocaSelects.remove(V);
+#endif
+    } else if (isa<AllocaInst>(V)) {
+      bWasTracked = Allocas.remove(V);
+    }
+
+    if (!bWasTracked)
+      return;
+    // Recurse users
+    for (auto U : V->users())
+      RemoveConflictingValue(U);
+  }
+  // For every value reported as GVConflicts: remove it (and its tracked users)
+  // from the work sets and forget its resource GV association.
+  void RemoveConflicts() {
+    auto &conflicts = m_Errors.ErrorSets[ResourceUseErrors::GVConflicts];
+    for (auto it = conflicts.begin(), e = conflicts.end(); it != e; ++it) {
+      Value *conflicting = *it;
+      RemoveConflictingValue(conflicting);
+      ValueToResourceGV.erase(conflicting);
+    }
+  }
+
+  // Create one index phi/select per GV array dimension for every resource
+  // phi/select collected in Selects (and AllocaSelects when enabled).
+  // The new instructions are recorded in NewSelects and in
+  // ResToIdxReplacement; their incoming values are left undef here and are
+  // filled in by a later step.
+  void CreateSelects() {
+    if (Selects.empty()
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+        && AllocaSelects.empty()
+#endif
+        )
+      return;
+    LLVMContext &Ctx =
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+      Selects.empty() ? AllocaSelects[0]->getContext() :
+#endif
+      Selects[0]->getContext();
+    Type *i32Ty = IntegerType::getInt32Ty(Ctx);
+    // Iterate pointers to the sets: a braced list of the sets themselves would
+    // copy them, which is wasteful and makes the address comparison used to
+    // derive bAlloca meaningless.
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+    for (ValueSetVector *pSelectSet : {&Selects, &AllocaSelects}) {
+      const bool bAlloca = pSelectSet != &Selects;
+#else
+    for (ValueSetVector *pSelectSet : {&Selects}) {
+#endif
+      for (auto pValue : *pSelectSet) {
+        Type *SelectTy = i32Ty;
+#ifdef SUPPORT_SELECT_ON_ALLOCA
+        // For alloca case, type needs to match dimensionality of incoming value
+        if (bAlloca) {
+          // TODO: Not sure if this case will actually work
+          //      (or whether it can even be generated from HLSL)
+          Type *Ty = pValue->getType();
+          SmallVector<unsigned, 4> dims;
+          unsigned dim = CountArrayDimensions(Ty, &dims);
+          for (unsigned i = 0; i < dim; i++)
+            SelectTy = ArrayType::get(SelectTy, (uint64_t)dims[dim - i - 1]);
+          if (Ty->isPointerTy())
+            SelectTy = PointerType::get(SelectTy, 0);
+        }
+#endif
+        // Placeholder operand; named so it does not shadow class UndefValue.
+        Value *undefIdx = UndefValue::get(SelectTy);
+        if (PHINode *Phi = dyn_cast<PHINode>(pValue)) {
+          GlobalVariable *GV = LookupResourceGV(Phi);
+          if (!GV)
+            continue; // skip value removed due to conflict
+          IRBuilder<> PhiBuilder(Phi);
+          unsigned gvDim = CountArrayDimensions(GV->getType());
+          IndexVector &idxVector = ResToIdxReplacement[Phi];
+          idxVector.resize(gvDim, nullptr);
+          unsigned numIncoming = Phi->getNumIncomingValues();
+          for (unsigned i = 0; i < gvDim; i++) {
+            PHINode *newPhi = PhiBuilder.CreatePHI(SelectTy, numIncoming);
+            NewSelects.insert(newPhi);
+            idxVector[i] = newPhi;
+            for (unsigned j = 0; j < numIncoming; j++) {
+              // Set incoming values to undef until next pass
+              newPhi->addIncoming(undefIdx, Phi->getIncomingBlock(j));
+            }
+          }
+        } else if (SelectInst *Sel = dyn_cast<SelectInst>(pValue)) {
+          GlobalVariable *GV = LookupResourceGV(Sel);
+          if (!GV)
+            continue; // skip value removed due to conflict
+          IRBuilder<> Builder(Sel);
+          unsigned gvDim = CountArrayDimensions(GV->getType());
+          IndexVector &idxVector = ResToIdxReplacement[Sel];
+          idxVector.resize(gvDim, nullptr);
+          for (unsigned i = 0; i < gvDim; i++) {
+            Value *newSel = Builder.CreateSelect(Sel->getCondition(), undefIdx, undefIdx);
+            NewSelects.insert(newSel);
+            idxVector[i] = newSel;
+          }
+        } else {
+          DXASSERT(false, "otherwise, non-select/phi in Selects set");
+        }
+      }
+    }
+  }
+
+  // For each resource alloca, create one i32-based alloca of the same array
+  // shape per GV array dimension, and record them in ResToIdxReplacement.
+  void CreateIndexAllocas() {
+    if (Allocas.empty())
+      return;
+    Type *i32Ty = IntegerType::getInt32Ty(Allocas[0]->getContext());
+    for (auto pValue : Allocas) {
+      AllocaInst *pResAlloca = cast<AllocaInst>(pValue);
+      GlobalVariable *GV = LookupResourceGV(pResAlloca);
+      if (!GV)
+        continue; // skip value removed due to conflict
+
+      // Rebuild the alloca's array shape with i32 as the element type,
+      // wrapping from the innermost dimension outwards.
+      SmallVector<unsigned, 4> dimVector;
+      unsigned allocaTyDim = CountArrayDimensions(pResAlloca->getType(), &dimVector);
+      Type *pIndexType = i32Ty;
+      for (unsigned level = allocaTyDim; level > 0; --level)
+        pIndexType = ArrayType::get(pIndexType, dimVector[level - 1]);
+
+      IRBuilder<> AllocaBuilder(pResAlloca);
+      Value *arraySize = pResAlloca->getArraySize();
+      unsigned gvDim = CountArrayDimensions(GV->getType());
+      IndexVector &idxVector = ResToIdxReplacement[pResAlloca];
+      idxVector.resize(gvDim, nullptr);
+      for (unsigned i = 0; i < gvDim; i++) {
+        AllocaInst *pIdxAlloca = AllocaBuilder.CreateAlloca(pIndexType, arraySize);
+        pIdxAlloca->setAlignment(4);
+        idxVector[i] = pIdxAlloca;
+      }
+    }
+  }
+
+  // Mirror a GEP on a resource alloca with GEPs on the matching index allocas.
+  // Returns (and caches in ResToIdxReplacement) one new GEP per index pointer
+  // of the base; a non-empty cached entry is returned as is.
+  IndexVector &ReplaceAllocaGEP(GetElementPtrInst *GEP) {
+    IndexVector &newGEPs = ResToIdxReplacement[GEP];
+    if (!newGEPs.empty())
+      return newGEPs;
+
+    // Recurse for partial GEPs
+    Value *basePtr = GEP->getPointerOperand();
+    GetElementPtrInst *baseGEP = dyn_cast<GetElementPtrInst>(basePtr);
+    IndexVector &baseIndices =
+        baseGEP ? ReplaceAllocaGEP(baseGEP) : ResToIdxReplacement[basePtr];
+
+    IRBuilder<> Builder(GEP);
+    // Same index list as the original GEP, applied to each index pointer.
+    SmallVector<Value*, 4> gepIndices(GEP->idx_begin(), GEP->idx_end());
+    const unsigned count = baseIndices.size();
+    newGEPs.resize(count, nullptr);
+    for (unsigned slot = 0; slot != count; ++slot)
+      newGEPs[slot] = Builder.CreateInBoundsGEP(baseIndices[slot], gepIndices);
+    return newGEPs;
+  }
+
+  // Run ReplaceAllocaGEP over every GEP collected in AllocaGEPs.
+  void ReplaceAllocaGEPs() {
+    for (auto it = AllocaGEPs.begin(), e = AllocaGEPs.end(); it != e; ++it)
+      ReplaceAllocaGEP(cast<GetElementPtrInst>(*it));
+  }
+
+  // For every load from a resource alloca, emit a load from each replacement
+  // index pointer of its address and record them as the load's replacement.
+  void ReplaceAllocaLoads() {
+    for (auto V : AllocaLoads) {
+      LoadInst *resLoad = cast<LoadInst>(V);
+      IRBuilder<> Builder(resLoad);
+      IndexVector &loadedIndices = ResToIdxReplacement[V];
+      IndexVector &srcPtrs = ResToIdxReplacement[resLoad->getPointerOperand()];
+      const unsigned count = srcPtrs.size();
+      loadedIndices.resize(count, nullptr);
+      for (unsigned slot = 0; slot != count; ++slot)
+        loadedIndices[slot] = Builder.CreateLoad(srcPtrs[slot]);
+    }
+  }
+
+  // Add GEP to ResToIdxReplacement with indices from incoming + GEP.
+  // The resulting vector holds one index value per GV array dimension:
+  // leading entries are inherited from the pointer operand (GEP/phi/select),
+  // the rest come from this GEP's own indices, defaulting to zero.
+  // Returns the (cached) vector for GEP.
+  IndexVector &ReplaceGVGEPs(GEPOperator *GEP) {
+    IndexVector &idxVector = ResToIdxReplacement[GEP];
+    // Skip if already done
+    // (we recurse into partial GEP and iterate all GEPs)
+    if (!idxVector.empty())
+      return idxVector;
+
+    Type *i32Ty = IntegerType::getInt32Ty(GEP->getContext());
+    Constant *Zero = Constant::getIntegerValue(i32Ty, APInt(32, 0));
+
+    Value *Ptr = GEP->getPointerOperand();
+
+    unsigned idx = 0;
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+      unsigned gvDim = CountArrayDimensions(GV->getType());
+      idxVector.resize(gvDim, Zero);
+    } else if (isa<GEPOperator>(Ptr) || isa<PHINode>(Ptr) || isa<SelectInst>(Ptr)) {
+      // Recurse for partial GEPs
+      IndexVector &ptrIndices = isa<GEPOperator>(Ptr) ?
+        ReplaceGVGEPs(cast<GEPOperator>(Ptr)) : ResToIdxReplacement[Ptr];
+      unsigned ptrDim = CountArrayDimensions(Ptr->getType());
+      unsigned gvDim = ptrIndices.size();
+      DXASSERT(ptrDim <= gvDim, "otherwise incoming pointer has more dimensions than associated GV");
+      unsigned gepStart = gvDim - ptrDim;
+      // Copy indices and add ours
+      idxVector.resize(ptrIndices.size(), Zero);
+      for (; idx < gepStart; idx++)
+        idxVector[idx] = ptrIndices[idx];
+    }
+    if (GEP->hasIndices()) {
+      auto itIdx = GEP->idx_begin();
+      ++itIdx;  // Always skip leading zero (we don't support GV+n pointer arith)
+      for (auto idxEnd = GEP->idx_end(); itIdx != idxEnd; ++itIdx) {
+        // Never write past the vector: this would happen if the GEP carried
+        // more indices than the GV has dimensions, or if Ptr was of a kind
+        // not handled above (leaving idxVector empty).
+        DXASSERT(idx < idxVector.size(), "otherwise, GEP has more indices than associated GV has dimensions");
+        if (idx >= idxVector.size())
+          break;
+        idxVector[idx++] = *itIdx;
+      }
+    }
+    return idxVector;
+  }
+
+  // Compute replacement indices for every collected GV GEP and copy them to
+  // each load that reads through that GEP.
+  void ReplaceGVGEPs() {
+    for (auto V : GEPs) {
+      GEPOperator *gvGEP = cast<GEPOperator>(V);
+      IndexVector &gepVector = ReplaceGVGEPs(gvGEP);
+      for (auto U : gvGEP->users()) {
+        if (!isa<LoadInst>(U))
+          continue;
+        // Just copy incoming indices
+        ResToIdxReplacement[U] = gepVector;
+      }
+    }
+  }
+
+  // For each collected resource store, emit stores of the value's replacement
+  // indices to the pointer's replacement index pointers, and record the new
+  // stores as the replacement for the original store.
+  void ReplaceStores() {
+    for (auto V : Stores) {
+      StoreInst *resStore = cast<StoreInst>(V);
+      IRBuilder<> Builder(resStore);
+      IndexVector &newStores = ResToIdxReplacement[resStore];
+      IndexVector &dstPtrs = ResToIdxReplacement[resStore->getPointerOperand()];
+      IndexVector &srcVals = ResToIdxReplacement[resStore->getValueOperand()];
+      DXASSERT_NOMSG(dstPtrs.size() == srcVals.size());
+      const unsigned count = dstPtrs.size();
+      newStores.resize(count, nullptr);
+      for (unsigned slot = 0; slot != count; ++slot)
+        newStores[slot] = Builder.CreateStore(srcVals[slot], dstPtrs[slot]);
+    }
+  }
+
+  // For each resource Phi/Select: set the operands of its replacement index
+  // phis/selects from the replacement indices of the matching incoming
+  // resource value, then replace that incoming resource operand with undef.
+  void UpdateSelects() {
+    for (auto V : Selects) {
+      Instruction *resMerge = cast<Instruction>(V);
+      IndexVector &idxMerges = ResToIdxReplacement[V];
+      // A select's operand 0 is its condition; phi operands all carry values.
+      const unsigned firstOp = isa<PHINode>(V) ? 0 : 1;
+      const unsigned opCount = resMerge->getNumOperands();
+      for (unsigned iOp = firstOp; iOp < opCount; iOp++) {
+        Value *incoming = resMerge->getOperand(iOp);
+        IndexVector &incomingIndices = ResToIdxReplacement[incoming];
+        DXASSERT_NOMSG(idxMerges.size() == incomingIndices.size());
+        for (unsigned dim = 0; dim < idxMerges.size(); dim++) {
+          // must be instruction (phi/select)
+          cast<Instruction>(idxMerges[dim])->setOperand(iOp, incomingIndices[dim]);
+        }
+
+        // Now clear incoming operand (adding to cleanup) to break cycles
+        if (Instruction *incomingI = dyn_cast<Instruction>(incoming))
+          CleanupInsts.insert(incomingI);
+        resMerge->setOperand(iOp, UndefValue::get(resMerge->getType()));
+      }
+    }
+  }
+
+  // ReplaceHandles
+  //  For every collected createHandleForLib call whose resource operand is
+  //  not already a direct load from the GV (optionally through one GEP):
+  //    - build a GEP on the GV from the replacement indices of the resource
+  //    - load the resource from that GEP
+  //    - make the call use the new load, and queue the old value for cleanup
+  // Assumes: no users of handle are phi/select or store
+  void ReplaceHandles() {
+    if (Handles.empty())
+      return;
+    Type *i32Ty = IntegerType::getInt32Ty(Handles[0]->getContext());
+    Constant *Zero = Constant::getIntegerValue(i32Ty, APInt(32, 0));
+    for (auto V : Handles) {
+      CallInst *handleCall = cast<CallInst>(V);
+      DxilInst_CreateHandleForLib createHandle(handleCall);
+      Value *res = createHandle.get_Resource();
+
+      // Skip extra work if nothing between load and create handle
+      if (LoadInst *directLoad = dyn_cast<LoadInst>(res)) {
+        Value *loadSrc = directLoad->getPointerOperand();
+        if (GEPOperator *srcGEP = dyn_cast<GEPOperator>(loadSrc))
+          loadSrc = srcGEP->getPointerOperand();
+        if (isa<GlobalVariable>(loadSrc))
+          continue;
+      }
+
+      GlobalVariable *GV = LookupResourceGV(res);
+      if (!GV)
+        continue; // skip value removed due to conflict
+
+      IRBuilder<> Builder(handleCall);
+      IndexVector &idxVector = ResToIdxReplacement[res];
+      DXASSERT(idxVector.size() == CountArrayDimensions(GV->getType()), "replacements empty or invalid");
+
+      // Leading zero steps through the GV pointer; the remaining indices are
+      // the (possibly remapped) replacement values.
+      SmallVector<Value*, 4> gepIndices;
+      gepIndices.push_back(Zero);
+      for (unsigned dim = 0; dim < idxVector.size(); ++dim)
+        gepIndices.push_back(LookupValue(idxVector[dim]));
+      Value *newGEP = Builder.CreateInBoundsGEP(GV, gepIndices);
+
+      // Mark new GEP instruction non-uniform if necessary
+      const bool bNonUniform =
+          NonUniformSet.count(res) != 0 || NonUniformSet.count(handleCall) != 0;
+      if (bNonUniform) {
+        if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(newGEP))
+          DxilMDHelper::MarkNonUniform(GEPInst);
+      }
+
+      LoadInst *newLoad = Builder.CreateLoad(newGEP);
+      createHandle.set_Resource(newLoad);
+      if (Instruction *resI = dyn_cast<Instruction>(res))
+        CleanupInsts.insert(resI);
+    }
+  }
+
+  // Delete every instruction in CleanupInsts that has no users and,
+  // transitively, any operand instruction that becomes unused as a result.
+  // Return true if something was deleted
+  //
+  // A worklist is used instead of iterating CleanupInsts directly: operands
+  // of erased instructions are inserted into CleanupInsts during the scan,
+  // and inserting into a hash-based set while iterating it can invalidate the
+  // iterator that is being advanced.
+  bool CleanupUnusedValues() {
+    SmallVector<Instruction *, 16> Worklist(CleanupInsts.begin(),
+                                            CleanupInsts.end());
+    bool bAnyChanges = false;
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      // Erased instructions are removed from CleanupInsts, so a stale
+      // worklist entry is recognized here without being dereferenced.
+      if (CleanupInsts.count(I) == 0)
+        continue;
+      if (!I->user_empty())
+        continue;
+      // Operands may lose their last user once I is gone: track them and
+      // revisit them after I has been erased.
+      for (unsigned iOp = 0; iOp < I->getNumOperands(); iOp++) {
+        if (Instruction *opI = dyn_cast<Instruction>(I->getOperand(iOp))) {
+          CleanupInsts.insert(opI);
+          Worklist.push_back(opI);
+        }
+      }
+      CleanupInsts.erase(I);
+      I->eraseFromParent();
+      bAnyChanges = true;
+    }
+    return bAnyChanges;
+  }
+
+  // Walks NewSelects until a full pass records no new replacement. For each
+  // entry that is not already remapped, dxilutil::MergeSelectOnSameValue is
+  // asked for a replacement over the merged operands; a non-null result is
+  // stored in RemappedValues.
+  void SimplifyMerges() {
+    bool bProgress;
+    do {
+      bProgress = false;
+      for (auto Merge : NewSelects) {
+        // Already remapped to another value: nothing left to simplify.
+        if (LookupValue(Merge) != Merge)
+          continue;
+        Instruction *MergeInst = cast<Instruction>(Merge);
+        // For a phi every operand is considered; otherwise operand 0 is
+        // skipped.
+        const unsigned FirstOp = isa<PHINode>(MergeInst) ? 0 : 1;
+        Value *Simplified = dxilutil::MergeSelectOnSameValue(
+          MergeInst, FirstOp, MergeInst->getNumOperands());
+        if (!Simplified)
+          continue;
+        RemappedValues[Merge] = Simplified;
+        bProgress = true;
+      }
+    } while (bProgress);
+  }
+
+  // Assuming everything was successful: erases every recorded store (which
+  // removes the cycles through allocas), queues the instructions that fed
+  // each store for cleanup, then deletes whatever ended up unused.
+  void CleanupDeadInsts() {
+    for (auto StoreV : Stores) {
+      StoreInst *Store = cast<StoreInst>(StoreV);
+      Value *StoreOperands[] = {Store->getValueOperand(),
+                                Store->getPointerOperand()};
+      for (Value *Operand : StoreOperands) {
+        if (Instruction *OperandInst = dyn_cast<Instruction>(Operand))
+          CleanupInsts.insert(OperandInst);
+      }
+      Store->eraseFromParent();
+    }
+    CleanupUnusedValues();
+  }
+
+  // Check that all handles now resolve to a global variable, otherwise,
+  // they are likely loading from resource function parameter, which
+  // is disallowed. Each offending createHandleForLib call that has not
+  // already been reported is reported as UserCallsWithResources.
+  void VerifyComplete(DxilModule &DM) {
+    hlsl::OP *hlslOP = DM.GetOP();
+    for (Function &F : DM.GetModule()->functions()) {
+      if (!hlslOP->IsDxilOpFunc(&F))
+        continue;
+      hlsl::OP::OpCodeClass opClass;
+      if (!hlslOP->GetOpCodeClass(&F, opClass) ||
+          opClass != DXIL::OpCodeClass::CreateHandleForLib)
+        continue;
+
+      for (auto U : F.users()) {
+        CallInst *HandleCall = cast<CallInst>(U);
+        if (m_Errors.ErrorUsers.count(HandleCall))
+          continue;   // Error already reported
+        DxilInst_CreateHandleForLib HandleInst(HandleCall);
+        Value *Res = HandleInst.get_Resource();
+        // Accept a load from a global variable, directly or through a GEP.
+        if (LoadInst *ResLoad = dyn_cast<LoadInst>(Res)) {
+          Value *Base = ResLoad->getPointerOperand();
+          if (GEPOperator *BaseGEP = dyn_cast<GEPOperator>(Base))
+            Base = BaseGEP->getPointerOperand();
+          if (isa<GlobalVariable>(Base))
+            continue;
+        }
+        // handle wasn't processed
+        // Right now, the most likely cause is user call with resources, but
+        // this should be updated if there are other reasons for this to happen.
+        m_Errors.ReportError(ResourceUseErrors::UserCallsWithResources, U);
+      }
+    }
+  }
+
+  // Fix resource global variable properties to external constant.
+  // Returns true when GV had an initializer, was not constant, or did not
+  // have external linkage, i.e. when it had to be modified.
+  bool SetExternalConstant(GlobalVariable *GV) {
+    const bool bAlreadyExternalConstant =
+        !GV->hasInitializer() && GV->isConstant() &&
+        GV->getLinkage() == GlobalVariable::LinkageTypes::ExternalLinkage;
+    if (bAlreadyExternalConstant)
+      return false;
+
+    GV->setInitializer(nullptr);
+    GV->setConstant(true);
+    GV->setLinkage(GlobalVariable::LinkageTypes::ExternalLinkage);
+    return true;
+  }
+
+  // Visits the global symbol of every cbuffer, SRV, UAV and sampler of DM (in
+  // that order). Symbols that are global variables are normalized with
+  // SetExternalConstant and their users are collected.
+  // Returns true if any global variable had to be modified.
+  bool CollectResources(DxilModule &DM) {
+    bool bChanged = false;
+    auto CollectSymbol = [this, &bChanged](Value *Symbol) {
+      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Symbol)) {
+        bChanged |= SetExternalConstant(GV);
+        CollectResourceGVUsers(GV, GV);
+      }
+    };
+
+    for (const auto &CBuf : DM.GetCBuffers())
+      CollectSymbol(CBuf->GetGlobalSymbol());
+    for (const auto &Srv : DM.GetSRVs())
+      CollectSymbol(Srv->GetGlobalSymbol());
+    for (const auto &Uav : DM.GetUAVs())
+      CollectSymbol(Uav->GetGlobalSymbol());
+    for (const auto &Sampler : DM.GetSamplers())
+      CollectSymbol(Sampler->GetGlobalSymbol());
+    return bChanged;
+  }
+
+  // Runs the legalization steps in their required order. Dead instructions
+  // are only cleaned up when no error was reported along the way.
+  void DoTransform() {
+    RemoveConflicts();
+    CreateSelects();
+    CreateIndexAllocas();
+    ReplaceAllocaGEPs();
+    ReplaceAllocaLoads();
+    ReplaceGVGEPs();
+    ReplaceStores();
+    UpdateSelects();
+    SimplifyMerges();
+    ReplaceHandles();
+
+    if (m_Errors.ErrorsReported())
+      return;
+    CleanupDeadInsts();
+  }
+
+  bool ErrorsReported() {
+    return m_Errors.ErrorsReported();
+  }
+
+  // Entry point of the helper. Collects the module's resources, and when any
+  // select or alloca is involved, runs the transform followed by the
+  // completeness check.
+  // Returns true when the transform ran; otherwise returns whether
+  // collecting the resources modified a global variable.
+  bool runOnModule(llvm::Module &M) {
+    DxilModule &DM = M.GetOrCreateDxilModule();
+    const bool bGlobalsFixed = CollectResources(DM);
+
+    // If no selects or allocas are involved, there isn't anything to do
+    const bool bNeedsLegalization = !Selects.empty() || !Allocas.empty();
+    if (bNeedsLegalization) {
+      DoTransform();
+      VerifyComplete(DM);
+      return true;
+    }
+    return bGlobalsFixed;
+  }
+};
+
+// Module pass wrapper: each run creates a fresh LegalizeResourceUseHelper
+// and returns the helper's result.
+class DxilLegalizeResources : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  explicit DxilLegalizeResources() : ModulePass(ID) {}
+
+  const char *getPassName() const override {
+    return "DXIL Legalize Resource Use";
+  }
+
+  bool runOnModule(Module &M) override {
+    LegalizeResourceUseHelper Legalizer;
+    const bool bModified = Legalizer.runOnModule(M);
+    return bModified;
+  }
+};
+
+} // namespace
+
+// Definition of the static pass ID declared in the class.
+char DxilLegalizeResources::ID = 0;
+
+// Factory: returns a newly allocated DxilLegalizeResources pass.
+ModulePass *llvm::createDxilLegalizeResources() {
+  ModulePass *Pass = new DxilLegalizeResources();
+  return Pass;
+}
+
+// Registers the pass under the command-line name
+// "hlsl-dxil-legalize-resources".
+INITIALIZE_PASS(DxilLegalizeResources, "hlsl-dxil-legalize-resources",
+                "DXIL legalize resource use", false, false)
+
+
+// Runs resource-use legalization on the module owned by m_DM.
+// Sets m_bLegalizationFailed when the helper reported errors (the flag is
+// never cleared here) and returns the helper's "changed" result.
+bool DxilLowerCreateHandleForLib::RemovePhiOnResource() {
+  LegalizeResourceUseHelper Legalizer;
+  const bool bModified = Legalizer.runOnModule(*m_DM->GetModule());
+  if (Legalizer.ErrorsReported()) {
+    m_bLegalizationFailed = true;
+  }
+  return bModified;
+}
+
+
+// LegacyLayout.
+namespace {
+
+StructType *UpdateStructTypeForLegacyLayout(StructType *ST, bool IsCBuf,
+                                            DxilTypeSystem &TypeSys, Module &M);
+
+// Returns the legacy-layout type for struct field type Ty:
+//  - array / vector: rebuilt around the updated element type (Ty itself is
+//    returned when the element type does not change)
+//  - matrix: array of row vectors, with rows/cols taken from the field's
+//    matrix annotation and cols forced to 4 when IsCBuf is set
+//  - struct: delegated to UpdateStructTypeForLegacyLayout
+//  - half becomes float, integers narrower than 32 bits become i32
+//  - anything else is returned unchanged
+// Ty must not be a pointer type.
+Type *UpdateFieldTypeForLegacyLayout(Type *Ty, bool IsCBuf,
+                                     DxilFieldAnnotation &annotation,
+                                     DxilTypeSystem &TypeSys, Module &M) {
+  DXASSERT(!Ty->isPointerTy(), "struct field should not be a pointer");
+
+  if (Ty->isArrayTy()) {
+    Type *OldEltTy = Ty->getArrayElementType();
+    Type *NewEltTy = UpdateFieldTypeForLegacyLayout(OldEltTy, IsCBuf,
+                                                    annotation, TypeSys, M);
+    if (OldEltTy == NewEltTy)
+      return Ty;
+    return ArrayType::get(NewEltTy, Ty->getArrayNumElements());
+  }
+
+  if (HLMatrixLower::IsMatrixType(Ty)) {
+    DXASSERT(annotation.HasMatrixAnnotation(), "must a matrix");
+    unsigned rows, cols;
+    Type *CompTy = HLMatrixLower::GetMatrixInfo(Ty, cols, rows);
+
+    // Get cols and rows from annotation.
+    const DxilMatrixAnnotation &matrix = annotation.GetMatrixAnnotation();
+    if (matrix.Orientation == MatrixOrientation::RowMajor) {
+      rows = matrix.Rows;
+      cols = matrix.Cols;
+    } else {
+      DXASSERT(matrix.Orientation == MatrixOrientation::ColumnMajor, "");
+      cols = matrix.Rows;
+      rows = matrix.Cols;
+    }
+    // A cbuffer matrix row must be aligned to 4 * 4 bytes.
+    if (IsCBuf)
+      cols = 4;
+
+    CompTy =
+        UpdateFieldTypeForLegacyLayout(CompTy, IsCBuf, annotation, TypeSys, M);
+    Type *RowVecTy = VectorType::get(CompTy, cols);
+    return ArrayType::get(RowVecTy, rows);
+  }
+
+  if (StructType *ST = dyn_cast<StructType>(Ty))
+    return UpdateStructTypeForLegacyLayout(ST, IsCBuf, TypeSys, M);
+
+  if (Ty->isVectorTy()) {
+    Type *OldEltTy = Ty->getVectorElementType();
+    Type *NewEltTy = UpdateFieldTypeForLegacyLayout(OldEltTy, IsCBuf,
+                                                    annotation, TypeSys, M);
+    if (OldEltTy == NewEltTy)
+      return Ty;
+    return VectorType::get(NewEltTy, Ty->getVectorNumElements());
+  }
+
+  // Basic types.
+  if (Ty->isHalfTy())
+    return Type::getFloatTy(Ty->getContext());
+  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+    if (ITy->getBitWidth() < 32)
+      return Type::getInt32Ty(Ty->getContext());
+  }
+  return Ty;
+}
+
+// Computes the legacy-layout version of ST by updating each field type.
+// Returns ST itself when no field changes. Otherwise returns the struct type
+// named "dx.alignment.legacy.<name of ST>", reusing it if the module already
+// has one, or creating it and giving it a copy of ST's annotation.
+// ST must have a struct annotation in TypeSys.
+StructType *UpdateStructTypeForLegacyLayout(StructType *ST, bool IsCBuf,
+                                            DxilTypeSystem &TypeSys,
+                                            Module &M) {
+  DxilStructAnnotation *SA = TypeSys.GetStructAnnotation(ST);
+  DXASSERT(SA, "must have annotation for struct type");
+
+  const unsigned NumFields = ST->getNumElements();
+  std::vector<Type *> NewFieldTys;
+  NewFieldTys.reserve(NumFields);
+  bool bAnyFieldChanged = false;
+  for (unsigned Idx = 0; Idx != NumFields; ++Idx) {
+    Type *OldTy = ST->getElementType(Idx);
+    Type *NewTy = UpdateFieldTypeForLegacyLayout(
+        OldTy, IsCBuf, SA->GetFieldAnnotation(Idx), TypeSys, M);
+    NewFieldTys.push_back(NewTy);
+    bAnyFieldChanged |= (OldTy != NewTy);
+  }
+
+  if (!bAnyFieldChanged)
+    return ST;
+
+  const std::string legacyName = "dx.alignment.legacy." + ST->getName().str();
+  if (StructType *ExistingST = M.getTypeByName(legacyName))
+    return ExistingST;
+
+  StructType *NewST =
+      StructType::create(ST->getContext(), NewFieldTys, legacyName);
+  // Clone annotation.
+  DxilStructAnnotation *NewSA = TypeSys.AddStructAnnotation(NewST);
+  *NewSA = *SA;
+  return NewST;
+}
+
+// Switches Res to a "<name>_legacy" global variable when the struct type of
+// its global symbol changes under the legacy layout rules. The old global
+// variable is then deleted: instruction users are replaced by undef and
+// erased, constant-expression users are replaced by undef.
+// Opaque struct types are left untouched.
+void UpdateStructTypeForLegacyLayout(DxilResourceBase &Res,
+                                     DxilTypeSystem &TypeSys, Module &M) {
+  GlobalVariable *GV = cast<GlobalVariable>(Res.GetGlobalSymbol());
+  Type *GVValueTy = GV->getType()->getPointerElementType();
+  const bool IsResourceArray = Res.GetRangeSize() != 1;
+  // Support Array of struct buffer.
+  const bool bWrappedInArray = IsResourceArray && GVValueTy->isArrayTy();
+  Type *EltTy = bWrappedInArray ? GVValueTy->getArrayElementType() : GVValueTy;
+
+  StructType *ST = cast<StructType>(EltTy);
+  if (ST->isOpaque()) {
+    DXASSERT(Res.GetClass() == DxilResourceBase::Class::CBuffer,
+             "Only cbuffer can have opaque struct.");
+    return;
+  }
+
+  // NOTE(review): IsResourceArray is what gets passed as the IsCBuf argument
+  // here - confirm this is intended.
+  Type *UpdatedTy =
+      UpdateStructTypeForLegacyLayout(ST, IsResourceArray, TypeSys, M);
+  if (ST == UpdatedTy)
+    return;
+
+  // Re-wrap the updated struct in an array of the original length.
+  if (bWrappedInArray)
+    UpdatedTy = ArrayType::get(UpdatedTy, GVValueTy->getArrayNumElements());
+
+  GlobalVariable *NewGV = cast<GlobalVariable>(
+      M.getOrInsertGlobal(GV->getName().str() + "_legacy", UpdatedTy));
+  Res.SetGlobalSymbol(NewGV);
+
+  // Delete old GV.
+  for (auto UserIt = GV->user_begin(); UserIt != GV->user_end();) {
+    // Step the iterator first; the current user may be erased below.
+    Value *GVUser = *(UserIt++);
+    if (Instruction *I = dyn_cast<Instruction>(GVUser)) {
+      if (!I->user_empty())
+        I->replaceAllUsesWith(UndefValue::get(I->getType()));
+      I->eraseFromParent();
+      continue;
+    }
+    ConstantExpr *CE = cast<ConstantExpr>(GVUser);
+    if (!CE->user_empty())
+      CE->replaceAllUsesWith(UndefValue::get(CE->getType()));
+  }
+  GV->removeDeadConstantUsers();
+  GV->eraseFromParent();
+}
+
+// Applies the legacy-layout update to every cbuffer of DM, and to those UAVs
+// and SRVs whose kind is StructuredBuffer.
+void UpdateStructTypeForLegacyLayoutOnDM(DxilModule &DM) {
+  Module &M = *DM.GetModule();
+  DxilTypeSystem &TypeSys = DM.GetTypeSystem();
+
+  for (auto &CBuf : DM.GetCBuffers())
+    UpdateStructTypeForLegacyLayout(*CBuf, TypeSys, M);
+
+  for (auto &UAV : DM.GetUAVs()) {
+    if (UAV->GetKind() != DxilResourceBase::Kind::StructuredBuffer)
+      continue;
+    UpdateStructTypeForLegacyLayout(*UAV, TypeSys, M);
+  }
+
+  for (auto &SRV : DM.GetSRVs()) {
+    if (SRV->GetKind() != DxilResourceBase::Kind::StructuredBuffer)
+      continue;
+    UpdateStructTypeForLegacyLayout(*SRV, TypeSys, M);
+  }
+}
+
+} // namespace
+
+// Member entry point: runs the legacy-layout update on the pass's module.
+void DxilLowerCreateHandleForLib::UpdateStructTypeForLegacyLayout() {
+  DxilModule &DM = *m_DM;
+  UpdateStructTypeForLegacyLayoutOnDM(DM);
+}
+
+// Replace the global symbol of every resource (cbuffers, SRVs, UAVs and
+// samplers) with undef once it is no longer needed.
+// Each resource's global variable is expected to have no users left by now.
+// When debug info is present, the variable is also appended to the vector
+// returned by DxilModule::GetLLVMUsed().
+void DxilLowerCreateHandleForLib::UpdateResourceSymbols() {
+  std::vector<GlobalVariable *> &LLVMUsed = m_DM->GetLLVMUsed();
+
+  auto UpdateResourceSymbol = [&LLVMUsed, this](DxilResourceBase *res) {
+    GlobalVariable *GV = cast<GlobalVariable>(res->GetGlobalSymbol());
+    // Drop dead constant users first so the check below is meaningful.
+    GV->removeDeadConstantUsers();
+    DXASSERT(GV->user_empty(), "else resource not lowered");
+    if (m_HasDbgInfo)
+      LLVMUsed.emplace_back(GV);
+
+    // Reset the symbol exactly once.
+    res->SetGlobalSymbol(UndefValue::get(GV->getType()));
+  };
+
+  for (auto &&C : m_DM->GetCBuffers()) {
+    UpdateResourceSymbol(C.get());
+  }
+  for (auto &&Srv : m_DM->GetSRVs()) {
+    UpdateResourceSymbol(Srv.get());
+  }
+  for (auto &&Uav : m_DM->GetUAVs()) {
+    UpdateResourceSymbol(Uav.get());
+  }
+  for (auto &&S : m_DM->GetSamplers()) {
+    UpdateResourceSymbol(S.get());
+  }
+}
+
+// Lower createHandleForLib
+namespace {
+
+// Replaces every user of the resource load Res with handle, then erases the
+// users and finally Res itself. Every user must be a createHandleForLib
+// call.
+void ReplaceResourceUserWithHandle(
+    LoadInst *Res, Value *handle) {
+  for (auto resUser = Res->user_begin(); resUser != Res->user_end();) {
+    // Step the iterator before the current user is erased.
+    Value *V = *(resUser++);
+    // cast<> rather than dyn_cast<>: the result is dereferenced
+    // unconditionally, so a non-call user should fail the cast's own
+    // assertion instead of producing a null pointer.
+    CallInst *CI = cast<CallInst>(V);
+    DxilInst_CreateHandleForLib createHandle(CI);
+    DXASSERT(createHandle, "must be createHandle");
+    CI->replaceAllUsesWith(handle);
+    CI->eraseFromParent();
+  }
+  Res->eraseFromParent();
+}
+
+// Returns the first debug-info global variable known to DbgInfoFinder whose
+// getVariable() is GV, or nullptr when there is none.
+DIGlobalVariable *FindGlobalVariableDebugInfo(GlobalVariable *GV,
+                                              DebugInfoFinder &DbgInfoFinder) {
+  for (DIGlobalVariable *DIGV : DbgInfoFinder.global_variables()) {
+    if (DIGV->getVariable() == GV)
+      return DIGV;
+  }
+  return nullptr;
+}
+} // namespace
+void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
+    DxilResourceBase &res) { /*
+  Replaces the loads of res's global symbol (and the createHandleForLib
+  calls using them, via ReplaceResourceUserWithHandle) with calls to the
+  CreateHandle DXIL op built from res's class, ID and lower bound.
+   - When GetRangeSize() > 1 is false: one CreateHandle call is inserted at
+     the first non-alloca insertion point of every defined function and is
+     shared by all loads in that function; calls left unused are erased at
+     the end.
+   - Otherwise users are expected to be GEPs: the array indices are
+     linearized, added to the lower bound, and a CreateHandle call is made
+     per GEP instruction (or per load when the GEP is not an instruction).
+  Users of the global symbol that themselves have no users are skipped. */
+  OP *hlslOP = m_DM->GetOP();
+  Function *createHandle = hlslOP->GetOpFunc(
+      OP::OpCode::CreateHandle, llvm::Type::getVoidTy(m_DM->GetCtx()));
+  Value *opArg = hlslOP->GetU32Const((unsigned)OP::OpCode::CreateHandle);
+  bool isViewResource = res.GetClass() == DXIL::ResourceClass::SRV ||
+                        res.GetClass() == DXIL::ResourceClass::UAV;
+  bool isROV = isViewResource && static_cast<DxilResource &>(res).IsROV();
+  std::string handleName = // <global name>_<class>[_<dim>][_ROV]
+      (res.GetGlobalName() + Twine("_") + Twine(res.GetResClassName())).str();
+  if (isViewResource)
+    handleName += (Twine("_") + Twine(res.GetResDimName())).str();
+  if (isROV)
+    handleName += "_ROV";
+
+  Value *resClassArg = hlslOP->GetU8Const(
+      static_cast<std::underlying_type<DxilResourceBase::Class>::type>(
+          res.GetClass()));
+  Value *resIDArg = hlslOP->GetU32Const(res.GetID());
+  // resLowerBound will be added after allocation in DxilCondenseResources.
+  Value *resLowerBound = hlslOP->GetU32Const(res.GetLowerBound());
+
+  Value *isUniformRes = hlslOP->GetI1Const(0); // default for the kCreateHandleIsUniformOpIdx operand; replaced by 1 below for GEPs marked non-uniform
+
+  Value *GV = res.GetGlobalSymbol();
+  Module *pM = m_DM->GetModule();
+  // TODO: add debug info to create handle.
+  DIVariable *DIV = nullptr;
+  DILocation *DL = nullptr; // computed below but not attached to anything yet
+  if (m_HasDbgInfo) {
+    DebugInfoFinder &Finder = m_DM->GetOrCreateDebugInfoFinder();
+    DIV = FindGlobalVariableDebugInfo(cast<GlobalVariable>(GV), Finder);
+    if (DIV)
+      // TODO: how to get col?
+      DL =
+          DILocation::get(pM->getContext(), DIV->getLine(), 1, DIV->getScope());
+  }
+
+  bool isResArray = res.GetRangeSize() > 1;
+  std::unordered_map<Function *, Instruction *> handleMapOnFunction; // shared per-function handle; only filled when !isResArray
+
+  Value *createHandleArgs[] = {opArg, resClassArg, resIDArg, resLowerBound,
+                               isUniformRes};
+
+  for (iplist<Function>::iterator F : pM->getFunctionList()) {
+    if (!F->isDeclaration()) {
+      if (!isResArray) {
+        IRBuilder<> Builder(dxilutil::FirstNonAllocaInsertionPt(F));
+        if (m_HasDbgInfo) {
+          // TODO: set debug info.
+          // Builder.SetCurrentDebugLocation(DL);
+        }
+        handleMapOnFunction[F] =
+            Builder.CreateCall(createHandle, createHandleArgs, handleName);
+      }
+    }
+  }
+
+  for (auto U = GV->user_begin(), E = GV->user_end(); U != E;) {
+    User *user = *(U++); // step first: the user may be erased below
+    // Skip unused user.
+    if (user->user_empty())
+      continue;
+
+    if (LoadInst *ldInst = dyn_cast<LoadInst>(user)) {
+      Function *userF = ldInst->getParent()->getParent();
+      DXASSERT(handleMapOnFunction.count(userF), "must exist");
+      Value *handle = handleMapOnFunction[userF];
+      ReplaceResourceUserWithHandle(ldInst, handle);
+    } else {
+      DXASSERT(dyn_cast<GEPOperator>(user) != nullptr,
+               "else AddOpcodeParamForIntrinsic in CodeGen did not patch uses "
+               "to only have ld/st refer to temp object");
+      GEPOperator *GEP = cast<GEPOperator>(user);
+      Value *idx = nullptr;
+      if (GEP->getNumIndices() == 2) {
+        // one dim array of resource
+        idx = (GEP->idx_begin() + 1)->get();
+      } else {
+        gep_type_iterator GEPIt = gep_type_begin(GEP), E = gep_type_end(GEP);
+        // Must be instruction for multi dim array.
+        std::unique_ptr<IRBuilder<> > Builder;
+        if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP)) {
+          Builder = llvm::make_unique<IRBuilder<> >(GEPInst);
+        } else {
+          Builder = llvm::make_unique<IRBuilder<> >(GV->getContext()); // builder created without an insertion point
+        }
+        for (; GEPIt != E; ++GEPIt) {
+          if (GEPIt->isArrayTy()) {
+            unsigned arraySize = GEPIt->getArrayNumElements();
+            Value * tmpIdx = GEPIt.getOperand();
+            if (idx == nullptr)
+              idx = tmpIdx;
+            else {
+              idx = Builder->CreateMul(idx, Builder->getInt32(arraySize)); // linearize: idx = idx * arraySize + tmpIdx
+              idx = Builder->CreateAdd(idx, tmpIdx);
+            }
+          }
+        }
+      }
+
+      createHandleArgs[DXIL::OperandIndex::kCreateHandleResIndexOpIdx] = idx; // overwritten with idx + resLowerBound before each CreateCall below
+
+      createHandleArgs[DXIL::OperandIndex::kCreateHandleIsUniformOpIdx] =
+          isUniformRes; // reset: a previous GEP may have set it to 1
+
+      Value *handle = nullptr;
+      if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP)) {
+        IRBuilder<> Builder = IRBuilder<>(GEPInst);
+        if (DxilMDHelper::IsMarkedNonUniform(GEPInst)) {
+          // Mark nonUniform.
+          createHandleArgs[DXIL::OperandIndex::kCreateHandleIsUniformOpIdx] =
+              hlslOP->GetI1Const(1);
+          // Clear nonUniform on GEP.
+          GEPInst->setMetadata(DxilMDHelper::kDxilNonUniformAttributeMDName, nullptr);
+        }
+        createHandleArgs[DXIL::OperandIndex::kCreateHandleResIndexOpIdx] =
+            Builder.CreateAdd(idx, resLowerBound);
+        handle = Builder.CreateCall(createHandle, createHandleArgs, handleName); // one handle shared by all loads of this GEP
+      }
+
+      for (auto GEPU = GEP->user_begin(), GEPE = GEP->user_end();
+           GEPU != GEPE;) {
+        // Must be load inst.
+        LoadInst *ldInst = cast<LoadInst>(*(GEPU++));
+        if (handle) {
+          ReplaceResourceUserWithHandle(ldInst, handle);
+        } else {
+          IRBuilder<> Builder = IRBuilder<>(ldInst); // GEP is not an instruction: build the handle next to each load
+          createHandleArgs[DXIL::OperandIndex::kCreateHandleResIndexOpIdx] =
+              Builder.CreateAdd(idx, resLowerBound);
+          Value *localHandle =
+              Builder.CreateCall(createHandle, createHandleArgs, handleName);
+          ReplaceResourceUserWithHandle(ldInst, localHandle);
+        }
+      }
+
+      if (Instruction *I = dyn_cast<Instruction>(GEP)) {
+        I->eraseFromParent();
+      }
+    }
+  }
+  // Erase unused handle.
+  for (auto It : handleMapOnFunction) {
+    Instruction *I = It.second;
+    if (I->user_empty())
+      I->eraseFromParent();
+  }
+}
+
+// Lowers the uses of every resource of the module, one resource class at a
+// time: cbuffers, then samplers, then SRVs, then UAVs.
+void DxilLowerCreateHandleForLib::GenerateDxilResourceHandles() {
+  for (auto &CBuf : m_DM->GetCBuffers())
+    TranslateDxilResourceUses(*CBuf);
+
+  // Create sampler handle first, may be used by SRV operations.
+  for (auto &Sampler : m_DM->GetSamplers())
+    TranslateDxilResourceUses(*Sampler);
+
+  for (auto &Srv : m_DM->GetSRVs())
+    TranslateDxilResourceUses(*Srv);
+
+  for (auto &Uav : m_DM->GetUAVs())
+    TranslateDxilResourceUses(*Uav);
+}
+
+// TBuffer.
+namespace {
+// Initializes the resource pDest from the cbuffer pSource: identity, binding
+// range, symbol, name and handle are copied; the remaining properties are set
+// to fixed values (U32 component type, read-only, no counter, not ROV).
+void InitTBuffer(const DxilCBuffer *pSource, DxilResource *pDest) {
+  // Fixed properties.
+  pDest->SetKind(pSource->GetKind());
+  pDest->SetCompType(DXIL::ComponentType::U32);
+  pDest->SetSampleCount(0);
+  pDest->SetElementStride(0);
+  pDest->SetGloballyCoherent(false);
+  pDest->SetHasCounter(false);
+  pDest->SetRW(false);
+  pDest->SetROV(false);
+
+  // Properties copied from the source cbuffer.
+  pDest->SetID(pSource->GetID());
+  pDest->SetSpaceID(pSource->GetSpaceID());
+  pDest->SetLowerBound(pSource->GetLowerBound());
+  pDest->SetRangeSize(pSource->GetRangeSize());
+  pDest->SetGlobalSymbol(pSource->GetGlobalSymbol());
+  pDest->SetGlobalName(pSource->GetGlobalName());
+  pDest->SetHandle(pSource->GetHandle());
+}
+
+void PatchTBufferLoad(CallInst *handle, DxilModule &DM) { /*
+  Rewrites every CBufferLoadLegacy call that uses handle into a BufferLoad
+  (i32 overload) on the same handle, then rebuilds each extractvalue user
+  of the old call from the new load's i32 components:
+   - i32 users extract the component directly,
+   - double / i64 users combine components idx*2 (low) and idx*2+1 (high),
+   - users of any other type extract the component and bitcast it.
+  Every user of handle is erased at the end of its iteration, including the
+  unexpected ones that only trip a DXASSERT. */
+  hlsl::OP *hlslOP = DM.GetOP();
+  llvm::LLVMContext &Ctx = DM.GetCtx();
+  Type *doubleTy = Type::getDoubleTy(Ctx);
+  Type *i64Ty = Type::getInt64Ty(Ctx);
+
+  // Replace corresponding cbuffer loads with typed buffer loads
+  for (auto U = handle->user_begin(); U != handle->user_end();) {
+    CallInst *I = cast<CallInst>(*(U++)); // step first: I is erased at the end of the iteration
+    DXASSERT(I && OP::IsDxilOpFuncCallInst(I),
+             "otherwise unexpected user of CreateHandle value");
+    DXIL::OpCode opcode = OP::GetDxilOpFuncCallInst(I);
+    if (opcode == DXIL::OpCode::CBufferLoadLegacy) {
+      DxilInst_CBufferLoadLegacy cbLoad(I);
+
+      // Replace with appropriate buffer load instruction
+      IRBuilder<> Builder(I);
+      opcode = OP::OpCode::BufferLoad;
+      Type *Ty = Type::getInt32Ty(Ctx);
+      Function *BufLoad = hlslOP->GetOpFunc(opcode, Ty);
+      Constant *opArg = hlslOP->GetU32Const((unsigned)opcode);
+      Value *undefI = UndefValue::get(Type::getInt32Ty(Ctx));
+      Value *offset = cbLoad.get_regIndex(); // the cbuffer register index is reused as the buffer load index
+      CallInst *load =
+          Builder.CreateCall(BufLoad, {opArg, handle, offset, undefI});
+
+      // Find extractvalue uses of cbuffer load and replace + generate bitcast
+      // as necessary
+      for (auto LU = I->user_begin(); LU != I->user_end();) {
+        ExtractValueInst *evInst = dyn_cast<ExtractValueInst>(*(LU++));
+        DXASSERT(evInst && evInst->getNumIndices() == 1,
+                 "user of cbuffer load result should be extractvalue");
+        uint64_t idx = evInst->getIndices()[0];
+        Type *EltTy = evInst->getType();
+        IRBuilder<> EEBuilder(evInst);
+        Value *result = nullptr;
+        if (EltTy != Ty) {
+          // extract two values and DXIL::OpCode::MakeDouble or construct i64
+          if ((EltTy == doubleTy) || (EltTy == i64Ty)) {
+            DXASSERT(idx < 2, "64-bit component index out of range");
+
+            // The low 32 bits are taken from component idx*2 and the high
+            // 32 bits from idx*2+1 (low word first). This ordering was
+            // flagged as unverified by the original author.
+            Value *low = EEBuilder.CreateExtractValue(load, idx * 2);
+            Value *high = EEBuilder.CreateExtractValue(load, idx * 2 + 1);
+            if (EltTy == doubleTy) {
+              opcode = OP::OpCode::MakeDouble;
+              Function *MakeDouble = hlslOP->GetOpFunc(opcode, doubleTy);
+              Constant *opArg = hlslOP->GetU32Const((unsigned)opcode);
+              result = EEBuilder.CreateCall(MakeDouble, {opArg, low, high});
+            } else {
+              high = EEBuilder.CreateZExt(high, i64Ty);
+              low = EEBuilder.CreateZExt(low, i64Ty);
+              high = EEBuilder.CreateShl(high, hlslOP->GetU64Const(32));
+              result = EEBuilder.CreateOr(high, low); // i64 = (high << 32) | low
+            }
+          } else {
+            result = EEBuilder.CreateExtractValue(load, idx);
+            result = EEBuilder.CreateBitCast(result, EltTy);
+          }
+        } else {
+          result = EEBuilder.CreateExtractValue(load, idx);
+        }
+
+        evInst->replaceAllUsesWith(result);
+        evInst->eraseFromParent();
+      }
+    } else if (opcode == DXIL::OpCode::CBufferLoad) {
+      // TODO: Handle this, or prevent this for tbuffer
+      DXASSERT(false, "otherwise CBufferLoad used for tbuffer rather than "
+                      "CBufferLoadLegacy");
+    } else {
+      DXASSERT(false, "otherwise unexpected user of CreateHandle value");
+    }
+    I->eraseFromParent();
+  }
+}
+
+} // namespace
+
+// Walks the users of V recursively. Each call user that is a DXIL op call is
+// handed to PatchTBufferLoad; other call users are ignored; every non-call
+// user is searched in turn.
+void DxilLowerCreateHandleForLib::PatchTBufferUse(Value *V, DxilModule &DM) {
+  for (User *U : V->users()) {
+    CallInst *CI = dyn_cast<CallInst>(U);
+    if (!CI) {
+      PatchTBufferUse(U, DM);
+      continue;
+    }
+    // Patch dxil call.
+    if (hlsl::OP::IsDxilOpFuncCallInst(CI))
+      PatchTBufferLoad(CI, DM);
+  }
+}
+
+// Moves every cbuffer whose kind is TBuffer to the module's SRV list.
+// For each one, an SRV initialized from the cbuffer is added with the next
+// free SRV ID, the loads through the cbuffer's global variable are patched,
+// and the cbuffer is pointed at a fresh global variable of the same type.
+// Returns true if at least one tbuffer was found.
+bool DxilLowerCreateHandleForLib::PatchTBuffers(DxilModule &DM) {
+  bool bChanged = false;
+  // New SRVs are numbered after the existing ones.
+  unsigned nextSrvID = DM.GetSRVs().size();
+  Module &M = *DM.GetModule();
+  for (auto &CBuf : DM.GetCBuffers()) {
+    DxilCBuffer *CB = CBuf.get();
+    if (CB->GetKind() != DXIL::ResourceKind::TBuffer)
+      continue;
+
+    auto srv = make_unique<DxilResource>();
+    InitTBuffer(CB, srv.get());
+    srv->SetID(nextSrvID++);
+    DM.AddSRV(std::move(srv));
+
+    GlobalVariable *GV = cast<GlobalVariable>(CB->GetGlobalSymbol());
+    PatchTBufferUse(GV, DM);
+
+    // Set global symbol for cbuffer to an unused value so it can be removed
+    // in RemoveUnusedResourceSymbols.
+    Type *ValueTy = GV->getType()->getElementType();
+    GlobalVariable *UnusedGV = new GlobalVariable(
+        M, ValueTy, GV->isConstant(), GV->getLinkage(),
+        /*Initializer*/ nullptr, GV->getName(),
+        /*InsertBefore*/ nullptr, GV->getThreadLocalMode(),
+        GV->getType()->getAddressSpace(), GV->isExternallyInitialized());
+    CB->SetGlobalSymbol(UnusedGV);
+    bChanged = true;
+  }
+  return bChanged;
+}
+
+
+
+// Definition of the static pass ID declared in the class.
+char DxilLowerCreateHandleForLib::ID = 0;
+
+// Factory: returns a newly allocated DxilLowerCreateHandleForLib pass.
+ModulePass *llvm::createDxilLowerCreateHandleForLibPass() {
+  ModulePass *Pass = new DxilLowerCreateHandleForLib();
+  return Pass;
+}
+
+// Registers the pass under the command-line name
+// "hlsl-dxil-lower-handle-for-lib".
+INITIALIZE_PASS(DxilLowerCreateHandleForLib, "hlsl-dxil-lower-handle-for-lib",
+                "DXIL Lower createHandleForLib", false, false)
+
+
+// Module pass that runs AllocateDxilResources on library modules.
+// It only acts when the "auto-binding-space" option was supplied and the
+// module's shader model is a library; shaders are expected to go through
+// DxilCondenseResources instead.
+class DxilAllocateResourcesForLib : public ModulePass {
+private:
+  RemapEntryCollection m_rewrites;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilAllocateResourcesForLib() : ModulePass(ID), m_AutoBindingSpace(UINT_MAX) {}
+
+  // Reads the "auto-binding-space" option; UINT_MAX means "not specified".
+  void applyOptions(PassOptions O) override {
+    GetPassOptionUInt32(O, "auto-binding-space", &m_AutoBindingSpace, UINT_MAX);
+  }
+  // Matches the description this pass is registered with.
+  const char *getPassName() const override { return "DXIL Allocate Resources For Library"; }
+
+  // Returns true only when resources were actually allocated.
+  bool runOnModule(Module &M) override {
+    DxilModule &DM = M.GetOrCreateDxilModule();
+    // Must specify a default space, and must apply to library.
+    // Use DxilCondenseResources instead for shaders.
+    if ((m_AutoBindingSpace == UINT_MAX) || !DM.GetShaderModel()->IsLib())
+      return false;
+
+    bool hasResource = DM.GetCBuffers().size() ||
+      DM.GetUAVs().size() || DM.GetSRVs().size() || DM.GetSamplers().size();
+
+    // Without resources nothing is touched, so report "unchanged".
+    if (!hasResource)
+      return false;
+
+    DM.SetAutoBindingSpace(m_AutoBindingSpace);
+    AllocateDxilResources(DM);
+    return true;
+  }
+
+private:
+  uint32_t m_AutoBindingSpace;
+};
+
+// Definition of the static pass ID declared in the class.
+char DxilAllocateResourcesForLib::ID = 0;
+
+// Factory: returns a newly allocated DxilAllocateResourcesForLib pass.
+ModulePass *llvm::createDxilAllocateResourcesForLibPass() {
+  ModulePass *Pass = new DxilAllocateResourcesForLib();
+  return Pass;
+}
+
+// Registers the pass under the command-line name
+// "hlsl-dxil-allocate-resources-for-lib".
+INITIALIZE_PASS(DxilAllocateResourcesForLib,
+                "hlsl-dxil-allocate-resources-for-lib",
+                "DXIL Allocate Resources For Library", false, false)

+ 583 - 35
lib/HLSL/DxilContainerAssembler.cpp

@@ -10,6 +10,7 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/Bitcode/ReaderWriter.h"
@@ -18,17 +19,22 @@
 #include "dxc/HLSL/DxilModule.h"
 #include "dxc/HLSL/DxilShaderModel.h"
 #include "dxc/HLSL/DxilRootSignature.h"
+#include "dxc/HLSL/DxilUtil.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
+#include "dxc/HLSL/DxilOperations.h"
 #include "dxc/Support/Global.h"
 #include "dxc/Support/Unicode.h"
 #include "dxc/Support/WinIncludes.h"
 #include "dxc/Support/FileIOHelper.h"
 #include "dxc/Support/dxcapi.impl.h"
 #include "dxc/HLSL/DxilPipelineStateValidation.h"
+#include "dxc/HLSL/DxilRuntimeReflection.h"
 #include <algorithm>
 #include <functional>
 
 using namespace llvm;
 using namespace hlsl;
+using namespace hlsl::RDAT;
 
 static DxilProgramSigSemantic KindToSystemValue(Semantic::Kind kind, DXIL::TessellatorDomain domain) {
   switch (kind) {
@@ -290,20 +296,23 @@ public:
 };
 
 DxilPartWriter *hlsl::NewProgramSignatureWriter(const DxilModule &M, DXIL::SignatureKind Kind) {
+  DXIL::TessellatorDomain domain = DXIL::TessellatorDomain::Undefined;
+  if (M.GetShaderModel()->IsHS() || M.GetShaderModel()->IsDS())
+    domain = M.GetTessellatorDomain();
   switch (Kind) {
   case DXIL::SignatureKind::Input:
     return new DxilProgramSignatureWriter(
-        M.GetInputSignature(), M.GetTessellatorDomain(), true,
-        !M.m_ShaderFlags.GetUseNativeLowPrecision());
+        M.GetInputSignature(), domain, true,
+        M.GetUseMinPrecision());
   case DXIL::SignatureKind::Output:
     return new DxilProgramSignatureWriter(
-        M.GetOutputSignature(), M.GetTessellatorDomain(), false,
-        !M.m_ShaderFlags.GetUseNativeLowPrecision());
+        M.GetOutputSignature(), domain, false,
+        M.GetUseMinPrecision());
   case DXIL::SignatureKind::PatchConstant:
     return new DxilProgramSignatureWriter(
-        M.GetPatchConstantSignature(), M.GetTessellatorDomain(),
+        M.GetPatchConstantSignature(), domain,
         /*IsInput*/ M.GetShaderModel()->IsDS(),
-        /*UseMinPrecision*/!M.m_ShaderFlags.GetUseNativeLowPrecision());
+        /*UseMinPrecision*/M.GetUseMinPrecision());
   case DXIL::SignatureKind::Invalid:
     return nullptr;
   }
@@ -446,6 +455,7 @@ public:
     UINT uSRVs = m_Module.GetSRVs().size();
     UINT uUAVs = m_Module.GetUAVs().size();
     m_PSVInitInfo.ResourceCount = uCBuffers + uSamplers + uSRVs + uUAVs;
+    // TODO: for >= 6.2 version, create more efficient structure
     if (m_PSVInitInfo.PSVVersion > 0) {
       m_PSVInitInfo.ShaderStage = (PSVShaderKind)SM->GetKind();
       // Copy Dxil Signatures
@@ -710,10 +720,528 @@ public:
   }
 };
 
+// Size-checked writer over a caller-provided byte buffer.
+//  on overrun: throw buffer_overrun{};
+//  on overlap: throw buffer_overlap{};
+// The writer only tracks a cursor (Offset) into [Ptr, Ptr + Size) and checks
+// every access against the remaining space before handing out a pointer.
+class CheckedWriter {
+  char *Ptr;
+  size_t Size;
+  size_t Offset;
+
+  // Byte size of `count` elements of T; throws buffer_overrun when the
+  // multiplication would wrap, since a wrapped size could pass CheckBounds.
+  template <typename T>
+  static size_t ArrayBytes(size_t count) {
+    if (count > static_cast<size_t>(-1) / sizeof(T))
+      throw buffer_overrun{};
+    return sizeof(T) * count;
+  }
+
+public:
+  class exception : public std::exception {};
+  class buffer_overrun : public exception {
+  public:
+    buffer_overrun() noexcept {}
+    virtual const char * what() const noexcept override {
+      return ("buffer_overrun");
+    }
+  };
+  class buffer_overlap : public exception {
+  public:
+    buffer_overlap() noexcept {}
+    virtual const char * what() const noexcept override {
+      return ("buffer_overlap");
+    }
+  };
+
+  CheckedWriter(void *ptr, size_t size) :
+    Ptr(reinterpret_cast<char*>(ptr)), Size(size), Offset(0) {}
+
+  size_t GetOffset() const { return Offset; }
+  // Moves the cursor to an absolute offset; moving backwards is allowed.
+  void Reset(size_t offset = 0) {
+    if (offset >= Size) throw buffer_overrun{};
+    Offset = offset;
+  }
+  // offset is absolute, ensure offset is >= current offset
+  void Advance(size_t offset = 0) {
+    if (offset < Offset) throw buffer_overlap{};
+    if (offset >= Size) throw buffer_overrun{};
+    Offset = offset;
+  }
+  // Throws buffer_overrun unless `size` bytes fit between Offset and Size.
+  void CheckBounds(size_t size) const {
+    assert(Offset <= Size && "otherwise, offset larger than size");
+    if (size > Size - Offset)
+      throw buffer_overrun{};
+  }
+  // Returns a typed pointer at the cursor without moving it.
+  // A size of 0 means "sizeof(T)".
+  template <typename T>
+  T *Cast(size_t size = 0) {
+    if (0 == size) size = sizeof(T);
+    CheckBounds(size);
+    return reinterpret_cast<T*>(Ptr + Offset);
+  }
+
+  // Map and Write advance Offset:
+  template <typename T>
+  T &Map() {
+    const size_t size = sizeof(T);
+    T * p = Cast<T>(size);
+    Offset += size;
+    return *p;
+  }
+  // Reserves `count` elements of T at the cursor. The bounds check is done
+  // here rather than through Cast(), because Cast() treats a byte size of 0
+  // as sizeof(T) and would reject an empty array at the end of the buffer.
+  template <typename T>
+  T *MapArray(size_t count = 1) {
+    const size_t size = ArrayBytes<T>(count);
+    CheckBounds(size);
+    T *p = reinterpret_cast<T*>(Ptr + Offset);
+    Offset += size;
+    return p;
+  }
+  template <typename T>
+  void Write(const T &obj) {
+    const size_t size = sizeof(T);
+    *Cast<T>(size) = obj;
+    Offset += size;
+  }
+  // Copies `count` elements from pArray to the cursor. An empty array is a
+  // no-op, so pArray may be null when count is 0.
+  template <typename T>
+  void WriteArray(const T *pArray, size_t count = 1) {
+    const size_t size = ArrayBytes<T>(count);
+    CheckBounds(size);
+    if (size != 0)
+      memcpy(Ptr + Offset, pArray, size);
+    Offset += size;
+  }
+};
+
+// RDAT is itself a small container, much like the DXIL container: it holds
+// several RDAT parts. This is the common interface for those parts; the
+// defaults describe an empty part of type Invalid that writes nothing.
+class RDATPart {
+public:
+  virtual ~RDATPart() = default;
+
+  // Identifies which kind of part this is.
+  virtual RuntimeDataPartType GetType() const {
+    return RuntimeDataPartType::Invalid;
+  }
+
+  // Number of bytes Write() produces.
+  virtual uint32_t GetPartSize() const {
+    return 0;
+  }
+
+  // Serializes the part into the destination buffer.
+  virtual void Write(void * /*ptr*/) {
+  }
+};
+
+// Most RDAT parts are tables, each containing a list of structures of the
+// same type. The exceptions are the string table and the index table, because
+// each string or list of indices can have a different size.
+//
+// Serialized form: a RuntimeDataTableHeader (record count and stride)
+// followed by the rows, copied verbatim in insertion order.
+template <class T>
+class RDATTable : public RDATPart {
+protected:
+  std::vector<T> m_rows;
+public:
+  // Pointer overload; the base implementation ignores its argument.
+  virtual void Insert(T *data) {}
+  virtual ~RDATTable() {}
+
+  // Appends a copy of data as the next row.
+  void Insert(const T &data) {
+    m_rows.push_back(data);
+  }
+
+  // Writes header + rows to ptr, which must have room for GetPartSize()
+  // bytes. An empty table reports a size of zero, so nothing is written for
+  // it (not even the header).
+  void Write(void *ptr) override {
+    if (m_rows.empty())
+      return;
+    char *pCur = static_cast<char*>(ptr);
+    RuntimeDataTableHeader &header = *reinterpret_cast<RuntimeDataTableHeader*>(pCur);
+    header.RecordCount = m_rows.size();
+    header.RecordStride = sizeof(T);
+    pCur += sizeof(RuntimeDataTableHeader);
+    memcpy(pCur, m_rows.data(), m_rows.size() * sizeof(T));
+  }
+
+  // Size in bytes of the serialized table; 0 when there are no rows.
+  uint32_t GetPartSize() const override {
+    if (m_rows.empty())
+      return 0;
+    return sizeof(RuntimeDataTableHeader) + m_rows.size() * sizeof(T);
+  }
+};
+
+// Table of RuntimeDataResourceInfo rows. Rows are expected in resource-class
+// order: CBuffer, Sampler, SRV, then UAV.
+class ResourceTable : public RDATTable<RuntimeDataResourceInfo> {
+public:
+  RuntimeDataPartType GetType() const override {
+    return RuntimeDataPartType::ResourceTable;
+  }
+};
+
+// Table of RuntimeDataFunctionInfo rows, one per function recorded in RDAT.
+class FunctionTable : public RDATTable<RuntimeDataFunctionInfo> {
+public:
+  RuntimeDataPartType GetType() const override {
+    return RuntimeDataPartType::FunctionTable;
+  }
+};
+
+// Pool of null-terminated strings, each stored once. Strings are identified
+// by their byte offset into the pool; offset 0 is always the empty string.
+class StringBufferPart : public RDATPart {
+private:
+  StringMap<uint32_t> m_StringMap;        // string -> offset in m_StringBuffer
+  SmallVector<char, 256> m_StringBuffer;  // concatenated "str\0" entries
+public:
+  StringBufferPart() : m_StringMap(), m_StringBuffer() {
+    // Always start string table with null so empty/null strings have offset of zero
+    m_StringBuffer.push_back('\0');
+  }
+  // returns the offset of the name inserted
+  uint32_t Insert(StringRef name) {
+    if (name.empty())
+      return 0;
+
+    // Don't add duplicate strings: try to claim the next free offset for
+    // this name, and reuse the recorded offset if the name is already known.
+    const uint32_t newOffset = (uint32_t)m_StringBuffer.size();
+    auto inserted = m_StringMap.insert(std::make_pair(name, newOffset));
+    if (!inserted.second)
+      return inserted.first->second;
+
+    m_StringBuffer.reserve(m_StringBuffer.size() + name.size() + 1);
+    m_StringBuffer.append(name.begin(), name.end());
+    m_StringBuffer.push_back('\0');
+    return newOffset;
+  }
+  RuntimeDataPartType GetType() const override { return RuntimeDataPartType::StringBuffer; }
+  // Size in bytes of the pool, including every terminating null.
+  uint32_t GetPartSize() const override { return (uint32_t)m_StringBuffer.size(); }
+  // Copies the whole pool to ptr, which must have room for GetPartSize() bytes.
+  void Write(void *ptr) override { memcpy(ptr, m_StringBuffer.data(), m_StringBuffer.size()); }
+};
+
+// Pool of length-prefixed uint32_t index arrays. Each array is stored as
+// [count, index0, index1, ...] and is identified by the position (in
+// uint32_t elements) of its count. Identical arrays are stored only once.
+struct IndexArraysPart : public RDATPart {
+private:
+  std::vector<uint32_t> m_IndexBuffer;
+
+  // Use m_IndexSet with CmpIndices to avoid duplicate index arrays
+  // Orders two array offsets by count first, then element by element.
+  struct CmpIndices {
+    const IndexArraysPart &Table;
+    CmpIndices(const IndexArraysPart &table) : Table(table) {}
+    bool operator()(uint32_t left, uint32_t right) const {
+      const uint32_t *pLeft = Table.m_IndexBuffer.data() + left;
+      const uint32_t *pRight = Table.m_IndexBuffer.data() + right;
+      if (*pLeft != *pRight)
+        return (*pLeft < *pRight);
+      uint32_t count = *pLeft;
+      for (unsigned i = 0; i < count; i++) {
+        ++pLeft; ++pRight;
+        if (*pLeft != *pRight)
+          return (*pLeft < *pRight);
+      }
+      return false;
+    }
+  };
+  std::set<uint32_t, CmpIndices> m_IndexSet;
+
+public:
+  IndexArraysPart() : m_IndexBuffer(), m_IndexSet(*this) {}
+  // The comparator inside m_IndexSet keeps a reference to this object, so a
+  // copied set would keep comparing against the source object's buffer.
+  IndexArraysPart(const IndexArraysPart &) = delete;
+  IndexArraysPart &operator=(const IndexArraysPart &) = delete;
+
+  // Adds the array [begin, end) and returns its offset. If an identical
+  // array is already stored, the new copy is discarded and the existing
+  // offset is returned.
+  template <class iterator>
+  uint32_t AddIndex(iterator begin, iterator end) {
+    uint32_t newOffset = m_IndexBuffer.size();
+    m_IndexBuffer.push_back(0); // Size: update after insertion
+    m_IndexBuffer.insert(m_IndexBuffer.end(), begin, end);
+    m_IndexBuffer[newOffset] = (m_IndexBuffer.size() - newOffset) - 1;
+    // Check for duplicate, return new offset if not duplicate
+    auto insertResult = m_IndexSet.insert(newOffset);
+    if (insertResult.second)
+      return newOffset;
+    // Otherwise it was a duplicate, so chop off the size and return the original
+    m_IndexBuffer.resize(newOffset);
+    return *insertResult.first;
+  }
+
+  RuntimeDataPartType GetType() const override { return RuntimeDataPartType::IndexArrays; }
+  // Size in bytes of all stored arrays, counts included.
+  uint32_t GetPartSize() const override {
+    return sizeof(uint32_t) * m_IndexBuffer.size();
+  }
+
+  // Copies the whole pool to ptr, which must have room for GetPartSize() bytes.
+  void Write(void *ptr) override {
+    memcpy(ptr, m_IndexBuffer.data(), m_IndexBuffer.size() * sizeof(uint32_t));
+  }
+};
+
+using namespace DXIL;
+
+// Writer for the RDAT (runtime data) container part.
+//
+// The constructor walks the module once and gathers everything into
+// in-memory parts (string buffer, resource table, function table, index
+// arrays). size() and write() then serialize those parts as:
+//   RuntimeDataHeader
+//   uint32_t offsets[PartCount]      (offset of each part header)
+//   per part: RuntimeDataPartHeader + payload (size rounded by PSVALIGN4)
+class DxilRDATWriter : public DxilPartWriter {
+private:
+  const DxilModule &m_Module;
+  SmallVector<char, 1024> m_RDATBuffer;
+
+  std::vector<std::unique_ptr<RDATPart>> m_Parts;
+  typedef llvm::SmallSetVector<uint32_t, 8> Indices;
+  typedef std::unordered_map<const llvm::Function *, Indices> FunctionIndexMap;
+  FunctionIndexMap m_FuncToResNameOffset; // list of resources used
+  FunctionIndexMap m_FuncToDependencies;  // list of unresolved functions used
+
+  // Minimum shader model and allowed shader-stage mask implied by the DXIL
+  // operations a function calls. Starts at 6.0 with every stage allowed.
+  struct ShaderCompatInfo {
+    ShaderCompatInfo()
+      : minMajor(6), minMinor(0),
+        mask(((unsigned)1 << (unsigned)DXIL::ShaderKind::Invalid) - 1)
+      {}
+    unsigned minMajor, minMinor, mask;
+  };
+  typedef std::unordered_map<const llvm::Function*, ShaderCompatInfo> FunctionShaderCompatMap;
+  FunctionShaderCompatMap m_FuncToShaderCompat;
+
+  // Raises (curMajor, curMinor) to (major, minor) when the latter is the
+  // newer version. The minor number is only compared when the majors are
+  // equal, so a lower major can never bump the current minor.
+  static void RaiseMinVersion(unsigned major, unsigned minor,
+                              unsigned &curMajor, unsigned &curMinor) {
+    if (major > curMajor || (major == curMajor && minor > curMinor)) {
+      curMajor = major;
+      curMinor = minor;
+    }
+  }
+
+  // For every instruction that uses the DXIL op function dxilFunc, folds the
+  // op's minimum shader model and stage mask into the calling function's
+  // ShaderCompatInfo.
+  void UpdateFunctionToShaderCompat(const llvm::Function* dxilFunc) {
+    for (const auto &user : dxilFunc->users()) {
+      if (const llvm::Instruction *I = dyn_cast<const llvm::Instruction>(user)) {
+        // Find calling function
+        const llvm::Function *F = cast<const llvm::Function>(I->getParent()->getParent());
+        // Insert or lookup info
+        ShaderCompatInfo &info = m_FuncToShaderCompat[F];
+        OpCode opcode = OP::GetDxilOpFuncCallInst(I);
+        unsigned major, minor, mask;
+        // bWithTranslation = true for library modules
+        OP::GetMinShaderModelAndMask(opcode, /*bWithTranslation*/true, major, minor, mask);
+        RaiseMinVersion(major, minor, info.minMajor, info.minMinor);
+        info.mask &= mask;
+      }
+    }
+  }
+
+  // Returns the function containing User, following the first user of each
+  // constant on the way; returns nullptr when a constant has no users.
+  const llvm::Function *FindUsingFunction(const llvm::Value *User) {
+    if (const llvm::Instruction *I = dyn_cast<const llvm::Instruction>(User)) {
+      // Instruction should be inside a basic block, which is in a function
+      return cast<const llvm::Function>(I->getParent()->getParent());
+    }
+    // User can be either instruction, constant, or operator. But User is an
+    // operator only if constant is a scalar value, not resource pointer.
+    const llvm::Constant *CU = cast<const llvm::Constant>(User);
+    if (!CU->user_empty())
+      return FindUsingFunction(*CU->user_begin());
+    else
+      return nullptr;
+  }
+
+  // Records `offset` (the resource's row index in the resource table) for
+  // every function that uses the resource's global symbol.
+  void UpdateFunctionToResourceInfo(const DxilResourceBase *resource,
+                                    uint32_t offset) {
+    Constant *var = resource->GetGlobalSymbol();
+    if (!var)
+      return;
+    for (auto user : var->users()) {
+      // Find the function.
+      const llvm::Function *F = FindUsingFunction(user);
+      if (!F)
+        continue;
+      // operator[] creates the empty index set on first use.
+      m_FuncToResNameOffset[F].insert(offset);
+    }
+  }
+
+  // Appends one row describing `resource` to the resource table, interning
+  // its name in the string buffer, and advances resourceIndex.
+  void InsertToResourceTable(DxilResourceBase &resource,
+                             ResourceClass resourceClass,
+                             ResourceTable &resourceTable,
+                             StringBufferPart &stringBufferPart,
+                             uint32_t &resourceIndex) {
+    uint32_t stringIndex = stringBufferPart.Insert(resource.GetGlobalName());
+    UpdateFunctionToResourceInfo(&resource, resourceIndex++);
+    RuntimeDataResourceInfo info = {};
+    info.ID = resource.GetID();
+    info.Class = static_cast<uint32_t>(resourceClass);
+    info.Kind = static_cast<uint32_t>(resource.GetKind());
+    info.Space = resource.GetSpaceID();
+    info.LowerBound = resource.GetLowerBound();
+    info.UpperBound = resource.GetUpperBound();
+    info.Name = stringIndex;
+    info.Flags = 0;
+    if (ResourceClass::UAV == resourceClass) {
+      DxilResource *pRes = static_cast<DxilResource*>(&resource);
+      if (pRes->HasCounter())
+        info.Flags |= static_cast<uint32_t>(DxilResourceFlag::UAVCounter);
+      if (pRes->IsGloballyCoherent())
+        info.Flags |= static_cast<uint32_t>(DxilResourceFlag::UAVGloballyCoherent);
+      if (pRes->IsROV())
+        info.Flags |= static_cast<uint32_t>(DxilResourceFlag::UAVRasterizerOrderedView);
+      // TODO: add dynamic index flag
+    }
+    resourceTable.Insert(info);
+  }
+
+  // Creates the resource table part and fills it in the order CBuffer,
+  // Sampler, SRV, UAV. Resource names are interned in stringBufferPart.
+  void UpdateResourceInfo(StringBufferPart &stringBufferPart) {
+    m_Parts.emplace_back(llvm::make_unique<ResourceTable>());
+    ResourceTable &resourceTable = *static_cast<ResourceTable*>(m_Parts.back().get());
+    uint32_t resourceIndex = 0;
+    for (auto &resource : m_Module.GetCBuffers()) {
+      InsertToResourceTable(*resource.get(), ResourceClass::CBuffer, resourceTable, stringBufferPart,
+                            resourceIndex);
+    }
+    for (auto &resource : m_Module.GetSamplers()) {
+      InsertToResourceTable(*resource.get(), ResourceClass::Sampler, resourceTable, stringBufferPart,
+                            resourceIndex);
+    }
+    for (auto &resource : m_Module.GetSRVs()) {
+      InsertToResourceTable(*resource.get(), ResourceClass::SRV, resourceTable, stringBufferPart,
+                            resourceIndex);
+    }
+    for (auto &resource : m_Module.GetUAVs()) {
+      InsertToResourceTable(*resource.get(), ResourceClass::UAV, resourceTable, stringBufferPart,
+                            resourceIndex);
+    }
+  }
+
+  // Records F (an unresolved declaration) as a dependency of every function
+  // that uses it, identified by the string-buffer offset of F's name.
+  void UpdateFunctionDependency(llvm::Function *F, StringBufferPart &stringBufferPart) {
+    for (const auto &user : F->users()) {
+      const llvm::Function *userFunction = FindUsingFunction(user);
+      // A use that cannot be traced back to a function has nothing to
+      // depend on F; skip it, as UpdateFunctionToResourceInfo does.
+      if (!userFunction)
+        continue;
+      uint32_t index = stringBufferPart.Insert(F->getName());
+      m_FuncToDependencies[userFunction].insert(index);
+    }
+  }
+
+  // Creates the function table and index-array parts, then adds one
+  // RuntimeDataFunctionInfo row per function definition in the module.
+  void UpdateFunctionInfo(StringBufferPart &stringBufferPart) {
+    m_Parts.emplace_back(llvm::make_unique<FunctionTable>());
+    FunctionTable &functionTable = *static_cast<FunctionTable*>(m_Parts.back().get());
+    m_Parts.emplace_back(llvm::make_unique<IndexArraysPart>());
+    IndexArraysPart &indexArraysPart = *static_cast<IndexArraysPart*>(m_Parts.back().get());
+    // First pass over declarations: gather per-caller information.
+    for (auto &function : m_Module.GetModule()->getFunctionList()) {
+      if (function.isDeclaration() && !function.isIntrinsic()) {
+        if (OP::IsDxilOpFunc(&function)) {
+          // update min shader model and shader stage mask per function
+          UpdateFunctionToShaderCompat(&function);
+        } else {
+          // collect unresolved dependencies per function
+          UpdateFunctionDependency(&function, stringBufferPart);
+        }
+      }
+    }
+    // Second pass over definitions: emit one row each.
+    for (auto &function : m_Module.GetModule()->getFunctionList()) {
+      if (function.isDeclaration())
+        continue;
+      StringRef mangled = function.getName();
+      StringRef unmangled = hlsl::dxilutil::DemangleFunctionName(function.getName());
+      uint32_t mangledIndex = stringBufferPart.Insert(mangled);
+      uint32_t unmangledIndex = stringBufferPart.Insert(unmangled);
+      // UINT_MAX marks "no index array" for resources and dependencies.
+      uint32_t resourceIndex = UINT_MAX;
+      uint32_t functionDependencies = UINT_MAX;
+      uint32_t payloadSizeInBytes = 0;
+      uint32_t attrSizeInBytes = 0;
+      uint32_t shaderKind = static_cast<uint32_t>(DXIL::ShaderKind::Library);
+
+      auto resIt = m_FuncToResNameOffset.find(&function);
+      if (resIt != m_FuncToResNameOffset.end())
+        resourceIndex =
+            indexArraysPart.AddIndex(resIt->second.begin(), resIt->second.end());
+      auto depIt = m_FuncToDependencies.find(&function);
+      if (depIt != m_FuncToDependencies.end())
+        functionDependencies =
+            indexArraysPart.AddIndex(depIt->second.begin(), depIt->second.end());
+      if (m_Module.HasDxilFunctionProps(&function)) {
+        auto props = m_Module.GetDxilFunctionProps(&function);
+        if (props.IsClosestHit() || props.IsAnyHit()) {
+          payloadSizeInBytes = props.ShaderProps.Ray.payloadSizeInBytes;
+          attrSizeInBytes = props.ShaderProps.Ray.attributeSizeInBytes;
+        }
+        else if (props.IsMiss()) {
+          payloadSizeInBytes = props.ShaderProps.Ray.payloadSizeInBytes;
+        }
+        else if (props.IsCallable()) {
+          // Callable shaders report their parameter size in the payload field.
+          payloadSizeInBytes = props.ShaderProps.Ray.paramSizeInBytes;
+        }
+        shaderKind = (uint32_t)props.shaderKind;
+      }
+      ShaderFlags flags = ShaderFlags::CollectShaderFlags(&function, &m_Module);
+      RuntimeDataFunctionInfo info = {};
+      info.Name = mangledIndex;
+      info.UnmangledName = unmangledIndex;
+      info.ShaderKind = shaderKind;
+      info.Resources = resourceIndex;
+      info.FunctionDependencies = functionDependencies;
+      info.PayloadSizeInBytes = payloadSizeInBytes;
+      info.AttributeSizeInBytes = attrSizeInBytes;
+      // The 64-bit feature mask is stored as two 32-bit halves.
+      uint64_t featureFlags = flags.GetFeatureInfo();
+      info.FeatureInfo1 = featureFlags & 0xffffffff;
+      info.FeatureInfo2 = (featureFlags >> 32) & 0xffffffff;
+      // Init min target 6.0
+      unsigned minMajor = 6, minMinor = 0;
+      // Increase min target based on feature flags:
+      if (flags.GetUseNativeLowPrecision() && flags.GetLowPrecisionPresent()) {
+        minMinor = 2;
+      } else if (flags.GetBarycentrics() || flags.GetViewID()) {
+        minMinor = 1;
+      }
+      if ((DXIL::ShaderKind)shaderKind == DXIL::ShaderKind::Library) {
+        // Init mask to all kinds for library functions
+        info.ShaderStageFlag = ((unsigned)1 << (unsigned)DXIL::ShaderKind::Invalid) - 1;
+      } else {
+        // Init mask to current kind for shader functions
+        info.ShaderStageFlag = (unsigned)1 << shaderKind;
+      }
+      // Fold in the requirements of the DXIL operations this function calls.
+      auto it = m_FuncToShaderCompat.find(&function);
+      if (it != m_FuncToShaderCompat.end()) {
+        auto &compatInfo = it->second;
+        RaiseMinVersion(compatInfo.minMajor, compatInfo.minMinor, minMajor, minMinor);
+        info.ShaderStageFlag &= compatInfo.mask;
+      }
+      info.MinShaderTarget = EncodeVersion((DXIL::ShaderKind)shaderKind, minMajor, minMinor);
+      functionTable.Insert(info);
+    }
+  }
+
+public:
+  // Collects all RDAT parts for `module`. InfoVersion is currently unused.
+  DxilRDATWriter(const DxilModule &module, uint32_t InfoVersion = 0)
+      : m_Module(module), m_RDATBuffer(), m_Parts(), m_FuncToResNameOffset() {
+    // It's important to keep the order of this update
+    m_Parts.emplace_back(llvm::make_unique<StringBufferPart>());
+    StringBufferPart &stringBufferPart = *static_cast<StringBufferPart*>(m_Parts.back().get());
+    UpdateResourceInfo(stringBufferPart);
+    UpdateFunctionInfo(stringBufferPart);
+
+    // Delete any empty parts (the relative order of the rest is kept):
+    m_Parts.erase(
+        std::remove_if(m_Parts.begin(), m_Parts.end(),
+                       [](const std::unique_ptr<RDATPart> &part) {
+                         return part->GetPartSize() == 0;
+                       }),
+        m_Parts.end());
+  }
+
+  // Total serialized size in bytes of the RDAT part.
+  uint32_t size() const override {
+    // header + offset array
+    uint32_t total = sizeof(RuntimeDataHeader) + m_Parts.size() * sizeof(uint32_t);
+    // For each part: part header + part size
+    for (auto &part : m_Parts)
+      total += sizeof(RuntimeDataPartHeader) + PSVALIGN4(part->GetPartSize());
+    return total;
+  }
+
+  // Serializes all parts into m_RDATBuffer and writes the buffer to pStream.
+  // A CheckedWriter bounds failure is rethrown as hlsl::Exception.
+  void write(AbstractMemoryStream *pStream) override {
+    try {
+      m_RDATBuffer.resize(size(), 0);
+      CheckedWriter W(m_RDATBuffer.data(), m_RDATBuffer.size());
+      // write RDAT header
+      RuntimeDataHeader &header = W.Map<RuntimeDataHeader>();
+      header.Version = RDAT_Version_0;
+      header.PartCount = m_Parts.size();
+      // map offsets
+      uint32_t *offsets = W.MapArray<uint32_t>(header.PartCount);
+      // write parts
+      unsigned i = 0;
+      for (auto &part : m_Parts) {
+        offsets[i++] = W.GetOffset();
+        RuntimeDataPartHeader &partHeader = W.Map<RuntimeDataPartHeader>();
+        partHeader.Type = part->GetType();
+        partHeader.Size = PSVALIGN4(part->GetPartSize());
+        DXASSERT(partHeader.Size, "otherwise, failed to remove empty part");
+        char *bytes = W.MapArray<char>(partHeader.Size);
+        part->Write(bytes);
+      }
+    }
+    // Catch by reference: catching by value would slice the exception down
+    // to the base class and lose the overridden what() message.
+    catch (const CheckedWriter::exception &e) {
+      throw hlsl::Exception(DXC_E_GENERAL_INTERNAL_ERROR, e.what());
+    }
+
+    ULONG cbWritten;
+    IFT(pStream->Write(m_RDATBuffer.data(), m_RDATBuffer.size(), &cbWritten));
+    DXASSERT_NOMSG(cbWritten == m_RDATBuffer.size());
+  }
+};
+
 DxilPartWriter *hlsl::NewPSVWriter(const DxilModule &M, uint32_t PSVVersion) {
   return new DxilPSVWriter(M, PSVVersion);
 }
 
+// Creates a writer for the RDAT part of module M. The returned object is
+// allocated with new and handed to the caller as a raw pointer.
+DxilPartWriter *hlsl::NewRDATWriter(const DxilModule &M, uint32_t InfoVersion) {
+  return new DxilRDATWriter(M, InfoVersion);
+}
+
 class DxilContainerWriter_impl : public DxilContainerWriter  {
 private:
   class DxilPart {
@@ -828,15 +1356,6 @@ void hlsl::SerializeDxilContainerForModule(DxilModule *pModule,
   if (ValMajor == 1 && ValMinor == 0)
     Flags &= ~SerializeDxilFlags::IncludeDebugNamePart;
 
-  DxilProgramSignatureWriter inputSigWriter(
-      pModule->GetInputSignature(), pModule->GetTessellatorDomain(),
-      /*IsInput*/ true,
-      /*UseMinPrecision*/ !pModule->m_ShaderFlags.GetUseNativeLowPrecision());
-  DxilProgramSignatureWriter outputSigWriter(
-      pModule->GetOutputSignature(), pModule->GetTessellatorDomain(),
-      /*IsInput*/ false,
-      /*UseMinPrecision*/ !pModule->m_ShaderFlags.GetUseNativeLowPrecision());
-  DxilPSVWriter PSVWriter(*pModule);
   DxilContainerWriter_impl writer;
 
   // Write the feature part.
@@ -845,30 +1364,59 @@ void hlsl::SerializeDxilContainerForModule(DxilModule *pModule,
     featureInfoWriter.write(pStream);
   });
 
-  // Write the input and output signature parts.
-  writer.AddPart(DFCC_InputSignature, inputSigWriter.size(), [&](AbstractMemoryStream *pStream) {
-    inputSigWriter.write(pStream);
-  });
-  writer.AddPart(DFCC_OutputSignature, outputSigWriter.size(), [&](AbstractMemoryStream *pStream) {
-    outputSigWriter.write(pStream);
-  });
-
-  DxilProgramSignatureWriter patchConstantSigWriter(
-      pModule->GetPatchConstantSignature(), pModule->GetTessellatorDomain(),
-      /*IsInput*/ pModule->GetShaderModel()->IsDS(),
-      /*UseMinPrecision*/ !pModule->m_ShaderFlags.GetUseNativeLowPrecision());
-  if (pModule->GetPatchConstantSignature().GetElements().size()) {
-    writer.AddPart(DFCC_PatchConstantSignature, patchConstantSigWriter.size(),
+  std::unique_ptr<DxilProgramSignatureWriter> pInputSigWriter = nullptr;
+  std::unique_ptr<DxilProgramSignatureWriter> pOutputSigWriter = nullptr;
+  std::unique_ptr<DxilProgramSignatureWriter> pPatchConstantSigWriter = nullptr;
+  if (!pModule->GetShaderModel()->IsLib()) {
+    DXIL::TessellatorDomain domain = DXIL::TessellatorDomain::Undefined;
+    if (pModule->GetShaderModel()->IsHS() || pModule->GetShaderModel()->IsDS())
+      domain = pModule->GetTessellatorDomain();
+    pInputSigWriter = llvm::make_unique<DxilProgramSignatureWriter>(
+        pModule->GetInputSignature(), domain,
+        /*IsInput*/ true,
+        /*UseMinPrecision*/ pModule->GetUseMinPrecision());
+    pOutputSigWriter = llvm::make_unique<DxilProgramSignatureWriter>(
+        pModule->GetOutputSignature(), domain,
+        /*IsInput*/ false,
+        /*UseMinPrecision*/ pModule->GetUseMinPrecision());
+    // Write the input and output signature parts.
+    writer.AddPart(DFCC_InputSignature, pInputSigWriter->size(),
                    [&](AbstractMemoryStream *pStream) {
-                     patchConstantSigWriter.write(pStream);
+                     pInputSigWriter->write(pStream);
+                   });
+    writer.AddPart(DFCC_OutputSignature, pOutputSigWriter->size(),
+                   [&](AbstractMemoryStream *pStream) {
+                     pOutputSigWriter->write(pStream);
                    });
-  }
 
+    pPatchConstantSigWriter = llvm::make_unique<DxilProgramSignatureWriter>(
+        pModule->GetPatchConstantSignature(), domain,
+        /*IsInput*/ pModule->GetShaderModel()->IsDS(),
+        /*UseMinPrecision*/ pModule->GetUseMinPrecision());
+    if (pModule->GetPatchConstantSignature().GetElements().size()) {
+      writer.AddPart(DFCC_PatchConstantSignature,
+                     pPatchConstantSigWriter->size(),
+                     [&](AbstractMemoryStream *pStream) {
+                       pPatchConstantSigWriter->write(pStream);
+                     });
+    }
+  }
   // Write the DxilPipelineStateValidation (PSV0) part.
-  writer.AddPart(DFCC_PipelineStateValidation, PSVWriter.size(), [&](AbstractMemoryStream *pStream) {
-    PSVWriter.write(pStream);
-  });
-
+  std::unique_ptr<DxilRDATWriter> pRDATWriter = nullptr;
+  std::unique_ptr<DxilPSVWriter> pPSVWriter = nullptr;
+  unsigned int major, minor;
+  pModule->GetDxilVersion(major, minor);
+  if (pModule->GetShaderModel()->IsLib()) {
+    pRDATWriter = llvm::make_unique<DxilRDATWriter>(*pModule);
+    writer.AddPart(
+        DFCC_RuntimeData, pRDATWriter->size(),
+        [&](AbstractMemoryStream *pStream) { pRDATWriter->write(pStream); });
+  } else if (!pModule->GetShaderModel()->IsLib()) {
+    pPSVWriter = llvm::make_unique<DxilPSVWriter>(*pModule);
+    writer.AddPart(
+        DFCC_PipelineStateValidation, pPSVWriter->size(),
+        [&](AbstractMemoryStream *pStream) { pPSVWriter->write(pStream); });
+  }
   // Write the root signature (RTS0) part.
   DxilProgramRootSignatureWriter rootSigWriter(pModule->GetRootSignature());
   CComPtr<AbstractMemoryStream> pInputProgramStream = pModuleBitcode;

+ 448 - 58
lib/HLSL/DxilContainerReflection.cpp

@@ -23,8 +23,10 @@
 #include "dxc/Support/microcom.h"
 #include "dxc/Support/FileIOHelper.h"
 #include "dxc/Support/dxcapi.impl.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
 
 #include <unordered_set>
+#include "llvm/ADT/SetVector.h"
 
 #include "dxc/dxcapi.h"
 
@@ -70,39 +72,60 @@ public:
 
 class CShaderReflectionConstantBuffer;
 class CShaderReflectionType;
-class DxilShaderReflection : public ID3D12ShaderReflection {
-private:
-  DXC_MICROCOM_TM_REF_FIELDS()
+
+enum class PublicAPI { D3D12 = 0, D3D11_47 = 1, D3D11_43 = 2 };
+
+class DxilModuleReflection {
+public:
   CComPtr<IDxcBlob> m_pContainer;
   LLVMContext Context;
   std::unique_ptr<Module> m_pModule; // Must come after LLVMContext, otherwise unique_ptr will over-delete.
   DxilModule *m_pDxilModule = nullptr;
-  std::vector<CShaderReflectionConstantBuffer>    m_CBs;
+  std::vector<std::unique_ptr<CShaderReflectionConstantBuffer>>    m_CBs;
   std::vector<D3D12_SHADER_INPUT_BIND_DESC>       m_Resources;
+  std::vector<std::unique_ptr<CShaderReflectionType>> m_Types;
+  void CreateReflectionObjects();
+  void CreateReflectionObjectForResource(DxilResourceBase *R);
+
+  HRESULT LoadModule(IDxcBlob *pBlob, const DxilPartHeader *pPart);
+
+  // Common code
+  ID3D12ShaderReflectionConstantBuffer* _GetConstantBufferByIndex(UINT Index);
+  ID3D12ShaderReflectionConstantBuffer* _GetConstantBufferByName(LPCSTR Name);
+
+  HRESULT _GetResourceBindingDesc(UINT ResourceIndex,
+                                  _Out_ D3D12_SHADER_INPUT_BIND_DESC *pDesc,
+                                  PublicAPI api = PublicAPI::D3D12);
+
+  ID3D12ShaderReflectionVariable* _GetVariableByName(LPCSTR Name);
+
+  HRESULT _GetResourceBindingDescByName(LPCSTR Name,
+                                        D3D12_SHADER_INPUT_BIND_DESC *pDesc,
+                                        PublicAPI api = PublicAPI::D3D12);
+};
+
+class DxilShaderReflection : public DxilModuleReflection, public ID3D12ShaderReflection {
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
   std::vector<D3D12_SIGNATURE_PARAMETER_DESC>     m_InputSignature;
   std::vector<D3D12_SIGNATURE_PARAMETER_DESC>     m_OutputSignature;
   std::vector<D3D12_SIGNATURE_PARAMETER_DESC>     m_PatchConstantSignature;
   std::vector<std::unique_ptr<char[]>>            m_UpperCaseNames;
-  std::vector<std::unique_ptr<CShaderReflectionType>> m_Types;
-  void CreateReflectionObjects();
   void SetCBufferUsage();
-  void CreateReflectionObjectForResource(DxilResourceBase *R);
   void CreateReflectionObjectsForSignature(
       const DxilSignature &Sig,
       std::vector<D3D12_SIGNATURE_PARAMETER_DESC> &Descs);
   LPCSTR CreateUpperCase(LPCSTR pValue);
   void MarkUsedSignatureElements();
 public:
-  enum class PublicAPI { D3D12 = 0, D3D11_47 = 1, D3D11_43 = 2 };
   PublicAPI m_PublicAPI;
   void SetPublicAPI(PublicAPI value) { m_PublicAPI = value; }
   static PublicAPI IIDToAPI(REFIID iid) {
-    DxilShaderReflection::PublicAPI api =
-        DxilShaderReflection::PublicAPI::D3D12;
+    PublicAPI api = PublicAPI::D3D12;
     if (IsEqualIID(IID_ID3D11ShaderReflection_43, iid))
-      api = DxilShaderReflection::PublicAPI::D3D11_43;
+      api = PublicAPI::D3D11_43;
     else if (IsEqualIID(IID_ID3D11ShaderReflection_47, iid))
-      api = DxilShaderReflection::PublicAPI::D3D11_47;
+      api = PublicAPI::D3D11_47;
     return api;
   }
   DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
@@ -164,6 +187,37 @@ public:
   STDMETHODIMP_(UINT64) GetRequiresFlags(THIS);
 };
 
+class CFunctionReflection;
+// ID3D12LibraryReflection implementation built on the shared
+// DxilModuleReflection state. GetPartReflection creates one of these when
+// the program header's shader kind is Library.
+class DxilLibraryReflection : public DxilModuleReflection, public ID3D12LibraryReflection {
+private:
+  DXC_MICROCOM_TM_REF_FIELDS()
+
+  // Storage, and function by name:
+  typedef DenseMap<StringRef, std::unique_ptr<CFunctionReflection> > FunctionMap;
+  typedef DenseMap<const Function*, CFunctionReflection*> FunctionsByPtr;
+  // Owns the CFunctionReflection objects; the containers below hold raw
+  // pointers only.
+  FunctionMap m_FunctionMap;
+  FunctionsByPtr m_FunctionsByPtr;
+  // Enable indexing into functions in deterministic order:
+  std::vector<CFunctionReflection*> m_FunctionVector;
+
+  void AddResourceUseToFunctions(DxilResourceBase &resource, unsigned resIndex);
+  void AddResourceDependencies();
+
+public:
+  DXC_MICROCOM_TM_ADDREF_RELEASE_IMPL()
+  DXC_MICROCOM_TM_CTOR(DxilLibraryReflection)
+  // Only ID3D12LibraryReflection is named in the interface query below.
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void **ppvObject) {
+    return DoBasicQueryInterface<ID3D12LibraryReflection>(this, iid, ppvObject);
+  }
+
+  // Called by GetPartReflection with the container blob and the selected
+  // part before the object is returned to the caller.
+  HRESULT Load(IDxcBlob *pBlob, const DxilPartHeader *pPart);
+
+  // ID3D12LibraryReflection
+  STDMETHOD(GetDesc)(THIS_ _Out_ D3D12_LIBRARY_DESC * pDesc);
+
+  STDMETHOD_(ID3D12FunctionReflection *, GetFunctionByIndex)(THIS_ _In_ INT FunctionIndex);
+};
+
 _Use_decl_annotations_
 HRESULT DxilContainerReflection::Load(IDxcBlob *pContainer) {
   if (pContainer == nullptr) {
@@ -246,13 +300,28 @@ HRESULT DxilContainerReflection::GetPartReflection(UINT32 idx, REFIID iid, void
   
   DxcThreadMalloc TM(m_pMalloc);
   HRESULT hr = S_OK;
-  CComPtr<DxilShaderReflection> pReflection = DxilShaderReflection::Alloc(m_pMalloc);
-  IFCOOM(pReflection.p);
-  DxilShaderReflection::PublicAPI api = DxilShaderReflection::IIDToAPI(iid);
-  pReflection->SetPublicAPI(api);
+  const DxilProgramHeader *pProgramHeader =
+    reinterpret_cast<const DxilProgramHeader*>(GetDxilPartData(pPart));
+  if (!IsValidDxilProgramHeader(pProgramHeader, pPart->PartSize)) {
+    return E_INVALIDARG;
+  }
+
+  DXIL::ShaderKind SK = GetVersionShaderType(pProgramHeader->ProgramVersion);
+  if (SK == DXIL::ShaderKind::Library) {
+    CComPtr<DxilLibraryReflection> pReflection = DxilLibraryReflection::Alloc(m_pMalloc);
+    IFCOOM(pReflection.p);
+    IFC(pReflection->Load(m_container, pPart));
+    IFC(pReflection.p->QueryInterface(iid, ppvObject));
+  } else {
+    CComPtr<DxilShaderReflection> pReflection = DxilShaderReflection::Alloc(m_pMalloc);
+    IFCOOM(pReflection.p);
+    PublicAPI api = DxilShaderReflection::IIDToAPI(iid);
+    pReflection->SetPublicAPI(api);
+
+    IFC(pReflection->Load(m_container, pPart));
+    IFC(pReflection.p->QueryInterface(iid, ppvObject));
+  }
 
-  IFC(pReflection->Load(m_container, pPart));
-  IFC(pReflection.p->QueryInterface(iid, ppvObject));
 Cleanup:
   return hr;
 }
@@ -412,6 +481,30 @@ class CInvalidSRConstantBuffer : public ID3D12ShaderReflectionConstantBuffer {
 };
 static CInvalidSRConstantBuffer g_InvalidSRConstantBuffer;
 
+// Placeholder ID3D12FunctionParameterReflection whose only method always
+// fails. A single shared instance (g_InvalidFunctionParameter) is returned by
+// the GetFunctionParameter implementations in this file.
+class CInvalidFunctionParameter : public ID3D12FunctionParameterReflection {
+  // Always returns E_FAIL; pDesc is not written.
+  STDMETHOD(GetDesc)(THIS_ _Out_ D3D12_PARAMETER_DESC * pDesc) { return E_FAIL; }
+};
+CInvalidFunctionParameter g_InvalidFunctionParameter;
+
+// Placeholder ID3D12FunctionReflection returned for out-of-range function
+// lookups. Every HRESULT method fails with E_FAIL and every pointer-returning
+// method hands back another invalid placeholder object, so callers can chain
+// calls on the result without null checks.
+class CInvalidFunction : public ID3D12FunctionReflection {
+  // Always fails; pDesc is not written.
+  STDMETHOD(GetDesc)(THIS_ _Out_ D3D12_FUNCTION_DESC * pDesc) { return E_FAIL; }
+
+  STDMETHOD_(ID3D12ShaderReflectionConstantBuffer *, GetConstantBufferByIndex)(THIS_ _In_ UINT BufferIndex) { return &g_InvalidSRConstantBuffer; }
+  STDMETHOD_(ID3D12ShaderReflectionConstantBuffer *, GetConstantBufferByName)(THIS_ _In_ LPCSTR Name) { return &g_InvalidSRConstantBuffer; }
+
+  STDMETHOD(GetResourceBindingDesc)(THIS_ _In_ UINT ResourceIndex,
+    _Out_ D3D12_SHADER_INPUT_BIND_DESC * pDesc) { return E_FAIL; }
+
+  // Return the invalid-variable placeholder instead of nullptr, matching the
+  // other accessors of this class, so a chained call such as
+  // GetVariableByName(...)->GetDesc(...) fails cleanly rather than crashing.
+  STDMETHOD_(ID3D12ShaderReflectionVariable *, GetVariableByName)(THIS_ _In_ LPCSTR Name) { return &g_InvalidSRVariable; }
+
+  STDMETHOD(GetResourceBindingDescByName)(THIS_ _In_ LPCSTR Name,
+    _Out_ D3D12_SHADER_INPUT_BIND_DESC * pDesc) { return E_FAIL; }
+
+  // Use D3D_RETURN_PARAMETER_INDEX to get description of the return value.
+  STDMETHOD_(ID3D12FunctionParameterReflection *, GetFunctionParameter)(THIS_ _In_ INT ParameterIndex) { return &g_InvalidFunctionParameter; }
+};
+CInvalidFunction g_InvalidFunction;
+
 void CShaderReflectionVariable::Initialize(
     CShaderReflectionConstantBuffer *pBuffer, D3D12_SHADER_VARIABLE_DESC *pDesc,
     CShaderReflectionType *pType, BYTE *pDefaultValue) {
@@ -833,13 +926,15 @@ HRESULT CShaderReflectionType::Initialize(
       name = name.ltrim("struct.");
       m_Name = name;
 
-      unsigned int fieldCount = type->getStructNumElements();
-
       // Fields may have annotations, and we need to look at these
       // in order to decode their types properly.
       DxilTypeSystem &typeSys = M.GetTypeSystem();
       DxilStructAnnotation *structAnnotation = typeSys.GetStructAnnotation(structType);
-      DXASSERT(structAnnotation, "else type system is missing annotations for user-defined struct");
+
+      // There is no annotation for empty structs
+      unsigned int fieldCount = 0;
+      if (structAnnotation)
+        fieldCount = type->getStructNumElements();
 
       // The DXBC reflection info computes `Columns` for a
       // `struct` type from the fields (see below)
@@ -1133,8 +1228,8 @@ static D3D_SHADER_INPUT_TYPE ResourceToShaderInputType(DxilResourceBase *RB) {
     if (R->HasCounter()) return D3D_SIT_UAV_RWSTRUCTURED_WITH_COUNTER;
     return D3D_SIT_UAV_RWSTRUCTURED;
   }
+  case DxilResource::Kind::TBuffer:
   case DxilResource::Kind::TypedBuffer:
-    return isUAV ? D3D_SIT_UAV_RWTYPED : D3D_SIT_STRUCTURED;
   case DxilResource::Kind::Texture1D:
   case DxilResource::Kind::Texture1DArray:
   case DxilResource::Kind::Texture2D:
@@ -1144,9 +1239,11 @@ static D3D_SHADER_INPUT_TYPE ResourceToShaderInputType(DxilResourceBase *RB) {
   case DxilResource::Kind::Texture3D:
   case DxilResource::Kind::TextureCube:
   case DxilResource::Kind::TextureCubeArray:
-    return R->IsRW() ? D3D_SIT_UAV_RWTYPED : D3D_SIT_TEXTURE;
+    return isUAV ? D3D_SIT_UAV_RWTYPED : D3D_SIT_TEXTURE;
+  case DxilResource::Kind::RTAccelerationStructure:
+    return (D3D_SHADER_INPUT_TYPE)D3D_SIT_RTACCELERATIONSTRUCTURE;
   default:
-    return (D3D_SHADER_INPUT_TYPE)0;
+    return (D3D_SHADER_INPUT_TYPE)-1;
   }
 }
 
@@ -1174,6 +1271,7 @@ static D3D_SRV_DIMENSION ResourceToDimension(DxilResourceBase *RB) {
   switch (RB->GetKind()) {
   case DxilResource::Kind::StructuredBuffer:
   case DxilResource::Kind::TypedBuffer:
+  case DxilResource::Kind::TBuffer:
     return D3D_SRV_DIMENSION_BUFFER;
   case DxilResource::Kind::Texture1D:
     return D3D_SRV_DIMENSION_TEXTURE1D;
@@ -1230,7 +1328,7 @@ static UINT ResourceToFlags(DxilResourceBase *RB) {
   return result;
 }
 
-void DxilShaderReflection::CreateReflectionObjectForResource(DxilResourceBase *RB) {
+void DxilModuleReflection::CreateReflectionObjectForResource(DxilResourceBase *RB) {
   DxilResourceBase::Class C = RB->GetClass();
   DxilResource *R =
       (C == DXIL::ResourceClass::UAV || C == DXIL::ResourceClass::SRV)
@@ -1387,7 +1485,10 @@ static void SetCBufVarUsage(CShaderReflectionConstantBuffer &cb,
 void DxilShaderReflection::SetCBufferUsage() {
   hlsl::OP *hlslOP = m_pDxilModule->GetOP();
   LLVMContext &Ctx = m_pDxilModule->GetCtx();
-  unsigned cbSize = m_CBs.size();
+
+  // Indexes >= cbuffer size from DxilModule are SRV or UAV structured buffers.
+  // We only collect usage for actual cbuffers, so don't go clearing usage on other buffers.
+  unsigned cbSize = std::min(m_CBs.size(), m_pDxilModule->GetCBuffers().size());
   std::vector< std::vector<unsigned> > cbufUsage(cbSize);
 
   Function *createHandle = hlslOP->GetOpFunc(DXIL::OpCode::CreateHandle, Type::getVoidTy(Ctx));
@@ -1409,38 +1510,36 @@ void DxilShaderReflection::SetCBufferUsage() {
   }
 
   for (unsigned i=0;i<cbSize;i++) {
-    SetCBufVarUsage(m_CBs[i], cbufUsage[i]);
+    SetCBufVarUsage(*m_CBs[i], cbufUsage[i]);
   }
 }
 
-void DxilShaderReflection::CreateReflectionObjects() {
+void DxilModuleReflection::CreateReflectionObjects() {
   DXASSERT_NOMSG(m_pDxilModule != nullptr);
 
   // Create constant buffers, resources and signatures.
   for (auto && cb : m_pDxilModule->GetCBuffers()) {
-    CShaderReflectionConstantBuffer rcb;
-    rcb.Initialize(*m_pDxilModule, *(cb.get()), m_Types);
-    m_CBs.push_back(std::move(rcb));
+    std::unique_ptr<CShaderReflectionConstantBuffer> rcb(new CShaderReflectionConstantBuffer());
+    rcb->Initialize(*m_pDxilModule, *(cb.get()), m_Types);
+    m_CBs.emplace_back(std::move(rcb));
   }
-  // Set cbuf usage.
-  SetCBufferUsage();
 
   // TODO: add tbuffers into m_CBs
   for (auto && uav : m_pDxilModule->GetUAVs()) {
     if (uav->GetKind() != DxilResource::Kind::StructuredBuffer) {
       continue;
     }
-    CShaderReflectionConstantBuffer rcb;
-    rcb.InitializeStructuredBuffer(*m_pDxilModule, *(uav.get()), m_Types);
-    m_CBs.push_back(std::move(rcb));
+    std::unique_ptr<CShaderReflectionConstantBuffer> rcb(new CShaderReflectionConstantBuffer());
+    rcb->InitializeStructuredBuffer(*m_pDxilModule, *(uav.get()), m_Types);
+    m_CBs.emplace_back(std::move(rcb));
   }
   for (auto && srv : m_pDxilModule->GetSRVs()) {
     if (srv->GetKind() != DxilResource::Kind::StructuredBuffer) {
       continue;
     }
-    CShaderReflectionConstantBuffer rcb;
-    rcb.InitializeStructuredBuffer(*m_pDxilModule, *(srv.get()), m_Types);
-    m_CBs.push_back(std::move(rcb));
+    std::unique_ptr<CShaderReflectionConstantBuffer> rcb(new CShaderReflectionConstantBuffer());
+    rcb->InitializeStructuredBuffer(*m_pDxilModule, *(srv.get()), m_Types);
+    m_CBs.emplace_back(std::move(rcb));
   }
 
   // Populate all resources.
@@ -1456,12 +1555,6 @@ void DxilShaderReflection::CreateReflectionObjects() {
   for (auto && uavRes : m_pDxilModule->GetUAVs()) {
     CreateReflectionObjectForResource(uavRes.get());
   }
-
-  // Populate input/output/patch constant signatures.
-  CreateReflectionObjectsForSignature(m_pDxilModule->GetInputSignature(), m_InputSignature);
-  CreateReflectionObjectsForSignature(m_pDxilModule->GetOutputSignature(), m_OutputSignature);
-  CreateReflectionObjectsForSignature(m_pDxilModule->GetPatchConstantSignature(), m_PatchConstantSignature);
-  MarkUsedSignatureElements();
 }
 
 static D3D_REGISTER_COMPONENT_TYPE CompTypeToRegisterComponentType(CompType CT) {
@@ -1636,8 +1729,8 @@ LPCSTR DxilShaderReflection::CreateUpperCase(LPCSTR pValue) {
   return m_UpperCaseNames.back().get();
 }
 
-HRESULT DxilShaderReflection::Load(IDxcBlob *pBlob,
-                                   const DxilPartHeader *pPart) {
+HRESULT DxilModuleReflection::LoadModule(IDxcBlob *pBlob,
+                                         const DxilPartHeader *pPart) {
   DXASSERT_NOMSG(pBlob != nullptr);
   DXASSERT_NOMSG(pPart != nullptr);
   m_pContainer = pBlob;
@@ -1666,6 +1759,24 @@ HRESULT DxilShaderReflection::Load(IDxcBlob *pBlob,
   CATCH_CPP_RETURN_HRESULT();
 };
 
+// Loads reflection data for a single-entry shader part. LoadModule runs first;
+// the shader-specific steps (cbuffer usage, then input/output/patch-constant
+// signature reflection) follow it and return S_OK when they complete.
+// NOTE(review): IFR and CATCH_CPP_RETURN_HRESULT are project macros not shown
+// here; presumably IFR returns early on a failed HRESULT and the CATCH macro
+// converts C++ exceptions into an HRESULT - confirm against their definitions.
+HRESULT DxilShaderReflection::Load(IDxcBlob *pBlob,
+                                   const DxilPartHeader *pPart) {
+  IFR(LoadModule(pBlob, pPart));
+
+  try {
+    // Set cbuf usage.
+    SetCBufferUsage();
+
+    // Populate input/output/patch constant signatures.
+    CreateReflectionObjectsForSignature(m_pDxilModule->GetInputSignature(), m_InputSignature);
+    CreateReflectionObjectsForSignature(m_pDxilModule->GetOutputSignature(), m_OutputSignature);
+    CreateReflectionObjectsForSignature(m_pDxilModule->GetPatchConstantSignature(), m_PatchConstantSignature);
+    // Runs after all three signatures have been populated above.
+    MarkUsedSignatureElements();
+    return S_OK;
+  }
+  CATCH_CPP_RETURN_HRESULT();
+}
+
 _Use_decl_annotations_
 HRESULT DxilShaderReflection::GetDesc(D3D12_SHADER_DESC *pDesc) {
   IFR(ZeroMemoryToOut(pDesc));
@@ -1790,20 +1901,26 @@ void DxilShaderReflection::MarkUsedSignatureElements() {
 
 _Use_decl_annotations_
 ID3D12ShaderReflectionConstantBuffer* DxilShaderReflection::GetConstantBufferByIndex(UINT Index) {
+  return DxilModuleReflection::_GetConstantBufferByIndex(Index);
+}
+ID3D12ShaderReflectionConstantBuffer* DxilModuleReflection::_GetConstantBufferByIndex(UINT Index) {
   if (Index >= m_CBs.size()) {
     return &g_InvalidSRConstantBuffer;
   }
-  return &m_CBs[Index];
+  return m_CBs[Index].get();
 }
 
 _Use_decl_annotations_
 ID3D12ShaderReflectionConstantBuffer* DxilShaderReflection::GetConstantBufferByName(LPCSTR Name) {
+  return DxilModuleReflection::_GetConstantBufferByName(Name);
+}
+ID3D12ShaderReflectionConstantBuffer* DxilModuleReflection::_GetConstantBufferByName(LPCSTR Name) {
   if (!Name) {
     return &g_InvalidSRConstantBuffer;
   }
   for (UINT index = 0; index < m_CBs.size(); ++index) {
-    if (0 == strcmp(m_CBs[index].GetName(), Name)) {
-      return &m_CBs[index];
+    if (0 == strcmp(m_CBs[index]->GetName(), Name)) {
+      return m_CBs[index].get();
     }
   }
   return &g_InvalidSRConstantBuffer;
@@ -1812,9 +1929,13 @@ ID3D12ShaderReflectionConstantBuffer* DxilShaderReflection::GetConstantBufferByN
 _Use_decl_annotations_
 HRESULT DxilShaderReflection::GetResourceBindingDesc(UINT ResourceIndex,
   _Out_ D3D12_SHADER_INPUT_BIND_DESC *pDesc) {
+  return DxilModuleReflection::_GetResourceBindingDesc(ResourceIndex, pDesc, m_PublicAPI);
+}
+HRESULT DxilModuleReflection::_GetResourceBindingDesc(UINT ResourceIndex,
+  _Out_ D3D12_SHADER_INPUT_BIND_DESC *pDesc, PublicAPI api) {
   IFRBOOL(pDesc != nullptr, E_INVALIDARG);
   IFRBOOL(ResourceIndex < m_Resources.size(), E_INVALIDARG);
-  if (m_PublicAPI != PublicAPI::D3D12) {
+  if (api != PublicAPI::D3D12) {
     memcpy(pDesc, &m_Resources[ResourceIndex], sizeof(D3D11_SHADER_INPUT_BIND_DESC));
   }
   else {
@@ -1870,10 +1991,13 @@ HRESULT DxilShaderReflection::GetPatchConstantParameterDesc(UINT ParameterIndex,
 
 _Use_decl_annotations_
 ID3D12ShaderReflectionVariable* DxilShaderReflection::GetVariableByName(LPCSTR Name) {
+  return DxilModuleReflection::_GetVariableByName(Name);
+}
+ID3D12ShaderReflectionVariable* DxilModuleReflection::_GetVariableByName(LPCSTR Name) {
   if (Name != nullptr) {
     // Iterate through all cbuffers to find the variable.
     for (UINT i = 0; i < m_CBs.size(); i++) {
-      ID3D12ShaderReflectionVariable *pVar = m_CBs[i].GetVariableByName(Name);
+      ID3D12ShaderReflectionVariable *pVar = m_CBs[i]->GetVariableByName(Name);
       if (pVar != &g_InvalidSRVariable) {
         return pVar;
       }
@@ -1886,11 +2010,15 @@ ID3D12ShaderReflectionVariable* DxilShaderReflection::GetVariableByName(LPCSTR N
 _Use_decl_annotations_
 HRESULT DxilShaderReflection::GetResourceBindingDescByName(LPCSTR Name,
   D3D12_SHADER_INPUT_BIND_DESC *pDesc) {
+  return DxilModuleReflection::_GetResourceBindingDescByName(Name, pDesc, m_PublicAPI);
+}
+HRESULT DxilModuleReflection::_GetResourceBindingDescByName(LPCSTR Name,
+  D3D12_SHADER_INPUT_BIND_DESC *pDesc, PublicAPI api) {
   IFRBOOL(Name != nullptr, E_INVALIDARG);
 
   for (UINT i = 0; i < m_Resources.size(); i++) {
     if (strcmp(m_Resources[i].Name, Name) == 0) {
-      if (m_PublicAPI != PublicAPI::D3D12) {
+      if (api != PublicAPI::D3D12) {
         memcpy(pDesc, &m_Resources[i], sizeof(D3D11_SHADER_INPUT_BIND_DESC));
       }
       else {
@@ -1909,6 +2037,8 @@ UINT DxilShaderReflection::GetConversionInstructionCount() { return 0; }
 UINT DxilShaderReflection::GetBitwiseInstructionCount() { return 0; }
 
 D3D_PRIMITIVE DxilShaderReflection::GetGSInputPrimitive() {
+  if (!m_pDxilModule->GetShaderModel()->IsGS())
+    return D3D_PRIMITIVE::D3D10_PRIMITIVE_UNDEFINED;
   return (D3D_PRIMITIVE)m_pDxilModule->GetInputPrimitive();
 }
 
@@ -1927,11 +2057,19 @@ HRESULT DxilShaderReflection::GetMinFeatureLevel(enum D3D_FEATURE_LEVEL* pLevel)
 
 _Use_decl_annotations_
 UINT DxilShaderReflection::GetThreadGroupSize(UINT *pSizeX, UINT *pSizeY, UINT *pSizeZ) {
-  UINT *pNumThreads = m_pDxilModule->m_NumThreads;
-  AssignToOutOpt(pNumThreads[0], pSizeX);
-  AssignToOutOpt(pNumThreads[1], pSizeY);
-  AssignToOutOpt(pNumThreads[2], pSizeZ);
-  return pNumThreads[0] * pNumThreads[1] * pNumThreads[2];
+  if (!m_pDxilModule->GetShaderModel()->IsCS()) {
+    AssignToOutOpt((UINT)0, pSizeX);
+    AssignToOutOpt((UINT)0, pSizeY);
+    AssignToOutOpt((UINT)0, pSizeZ);
+    return 0;
+  }
+  unsigned x = m_pDxilModule->GetNumThreads(0);
+  unsigned y = m_pDxilModule->GetNumThreads(1);
+  unsigned z = m_pDxilModule->GetNumThreads(2);
+  AssignToOutOpt(x, pSizeX);
+  AssignToOutOpt(y, pSizeY);
+  AssignToOutOpt(z, pSizeZ);
+  return x * y * z;
 }
 
 UINT64 DxilShaderReflection::GetRequiresFlags() {
@@ -1953,6 +2091,258 @@ UINT64 DxilShaderReflection::GetRequiresFlags() {
   return result;
 }
 
+
+// ID3D12FunctionReflection
+
+// Per-function reflection object for a DXIL library. It records which of the
+// owning library's resources and constant buffers the function references and
+// answers ID3D12FunctionReflection queries by forwarding to the library.
+class CFunctionReflection : public ID3D12FunctionReflection {
+protected:
+  // All pointers are non-owning and start out null so that an object which
+  // has not been through Initialize() never holds indeterminate values.
+  DxilLibraryReflection * m_pLibraryReflection = nullptr;
+  const Function *m_pFunction = nullptr;
+  const DxilFunctionProps *m_pProps = nullptr;  // nullptr if non-shader library function or patch constant function
+  std::string m_Name;
+  typedef SmallSetVector<UINT32, 8> ResourceUseSet;
+  ResourceUseSet m_UsedResources;   // indices passed to AddResourceReference
+  ResourceUseSet m_UsedCBs;         // indices passed to AddCBReference
+
+public:
+  // Binds this object to its owning library and to the function it describes.
+  // Caches the function name and, when the module has DxilFunctionProps for
+  // the function, a pointer to them.
+  void Initialize(DxilLibraryReflection* pLibraryReflection, Function *pFunction) {
+    DXASSERT_NOMSG(pLibraryReflection);
+    DXASSERT_NOMSG(pFunction);
+    m_pLibraryReflection = pLibraryReflection;
+    m_pFunction = pFunction;
+
+    const DxilModule &M = *m_pLibraryReflection->m_pDxilModule;
+    m_Name = m_pFunction->getName().str();
+    // Reset explicitly so a repeated Initialize() does not keep stale props.
+    m_pProps = nullptr;
+    if (M.HasDxilFunctionProps(m_pFunction)) {
+      m_pProps = &M.GetDxilFunctionProps(m_pFunction);
+    }
+  }
+  // Records a resource index as used by this function (duplicates collapse).
+  void AddResourceReference(UINT resIndex) {
+    m_UsedResources.insert(resIndex);
+  }
+  // Records a constant buffer index as used by this function.
+  void AddCBReference(UINT cbIndex) {
+    m_UsedCBs.insert(cbIndex);
+  }
+
+  // ID3D12FunctionReflection
+  STDMETHOD(GetDesc)(THIS_ _Out_ D3D12_FUNCTION_DESC * pDesc);
+
+  // BufferIndex relative to used constant buffers here
+  STDMETHOD_(ID3D12ShaderReflectionConstantBuffer *, GetConstantBufferByIndex)(THIS_ _In_ UINT BufferIndex);
+  STDMETHOD_(ID3D12ShaderReflectionConstantBuffer *, GetConstantBufferByName)(THIS_ _In_ LPCSTR Name);
+
+  STDMETHOD(GetResourceBindingDesc)(THIS_ _In_ UINT ResourceIndex,
+    _Out_ D3D12_SHADER_INPUT_BIND_DESC * pDesc);
+
+  STDMETHOD_(ID3D12ShaderReflectionVariable *, GetVariableByName)(THIS_ _In_ LPCSTR Name);
+
+  STDMETHOD(GetResourceBindingDescByName)(THIS_ _In_ LPCSTR Name,
+    _Out_ D3D12_SHADER_INPUT_BIND_DESC * pDesc);
+
+  // Use D3D_RETURN_PARAMETER_INDEX to get description of the return value.
+  // Parameter reflection is not implemented: the invalid placeholder is
+  // returned for every index.
+  STDMETHOD_(ID3D12FunctionParameterReflection *, GetFunctionParameter)(THIS_ _In_ INT ParameterIndex) {
+    return &g_InvalidFunctionParameter;
+  }
+};
+
+_Use_decl_annotations_
+HRESULT CFunctionReflection::GetDesc(D3D12_FUNCTION_DESC *pDesc) {
+  // Fills the function description. Only Version, ConstantBuffers,
+  // BoundResources and Name are populated; every other field keeps the zero
+  // value written by ZeroMemoryToOut.
+  DXASSERT_NOMSG(m_pLibraryReflection);
+  IFR(ZeroMemoryToOut(pDesc));
+
+  // Functions without DxilFunctionProps are reported with the Library kind.
+  const ShaderModel *pShaderModel = m_pLibraryReflection->m_pDxilModule->GetShaderModel();
+  const DXIL::ShaderKind shaderKind =
+      m_pProps ? m_pProps->shaderKind : DXIL::ShaderKind::Library;
+  pDesc->Version = EncodeVersion(shaderKind, pShaderModel->GetMajor(), pShaderModel->GetMinor());
+
+  // Counts are relative to what this function references, not the whole library.
+  pDesc->ConstantBuffers = (UINT)m_UsedCBs.size();
+  pDesc->BoundResources = (UINT)m_UsedResources.size();
+
+  // The string is owned by this object.
+  pDesc->Name = m_Name.c_str();
+
+  // Fields intentionally left unset (zero):
+  //   Creator, Flags,
+  //   InstructionCount, TempRegisterCount, TempArrayCount, DefCount, DclCount,
+  //   TextureNormalInstructions, TextureLoadInstructions,
+  //   TextureCompInstructions, TextureBiasInstructions,
+  //   TextureGradientInstructions,
+  //   FloatInstructionCount, IntInstructionCount, UintInstructionCount,
+  //   StaticFlowControlCount, DynamicFlowControlCount, MacroInstructionCount,
+  //   ArrayInstructionCount, MovInstructionCount, MovcInstructionCount,
+  //   ConversionInstructionCount, BitwiseInstructionCount,
+  //   MinFeatureLevel, RequiredFeatureFlags,
+  //   FunctionParameterCount, HasReturn,
+  //   Has10Level9VertexShader, Has10Level9PixelShader.
+  return S_OK;
+}
+
+// Looks up a constant buffer by its position in this function's used-CB set.
+// The stored entry is then resolved through the owning library; an
+// out-of-range position yields the invalid placeholder.
+ID3D12ShaderReflectionConstantBuffer *CFunctionReflection::GetConstantBufferByIndex(UINT BufferIndex) {
+  DXASSERT_NOMSG(m_pLibraryReflection);
+  if (BufferIndex < m_UsedCBs.size()) {
+    const UINT32 libraryCBIndex = m_UsedCBs[BufferIndex];
+    return m_pLibraryReflection->_GetConstantBufferByIndex(libraryCBIndex);
+  }
+  return &g_InvalidSRConstantBuffer;
+}
+
+// Name lookup is delegated unchanged to the owning library; it is not
+// restricted to the buffers this function uses.
+ID3D12ShaderReflectionConstantBuffer *CFunctionReflection::GetConstantBufferByName(LPCSTR Name) {
+  DXASSERT_NOMSG(m_pLibraryReflection);
+  DxilLibraryReflection *const pLibrary = m_pLibraryReflection;
+  return pLibrary->_GetConstantBufferByName(Name);
+}
+
+// ResourceIndex is a position in this function's used-resource set. The stored
+// library-level index is forwarded to the owning library, and an out-of-range
+// position fails with E_INVALIDARG.
+HRESULT CFunctionReflection::GetResourceBindingDesc(UINT ResourceIndex,
+  D3D12_SHADER_INPUT_BIND_DESC * pDesc) {
+  DXASSERT_NOMSG(m_pLibraryReflection);
+  if (ResourceIndex < m_UsedResources.size()) {
+    const UINT32 libraryResIndex = m_UsedResources[ResourceIndex];
+    return m_pLibraryReflection->_GetResourceBindingDesc(libraryResIndex, pDesc);
+  }
+  return E_INVALIDARG;
+}
+
+// Variable lookup by name is delegated unchanged to the owning library.
+ID3D12ShaderReflectionVariable * CFunctionReflection::GetVariableByName(LPCSTR Name) {
+  DXASSERT_NOMSG(m_pLibraryReflection);
+  DxilLibraryReflection *const pLibrary = m_pLibraryReflection;
+  return pLibrary->_GetVariableByName(Name);
+}
+
+// Binding lookup by resource name is delegated unchanged to the owning
+// library; it is not restricted to the resources this function uses.
+HRESULT CFunctionReflection::GetResourceBindingDescByName(LPCSTR Name,
+  D3D12_SHADER_INPUT_BIND_DESC * pDesc) {
+  DXASSERT_NOMSG(m_pLibraryReflection);
+  DxilLibraryReflection *const pLibrary = m_pLibraryReflection;
+  return pLibrary->_GetResourceBindingDescByName(Name, pDesc);
+}
+
+
+// DxilLibraryReflection
+
+// From DxilContainerAssembler:
+// Returns a function that (transitively) uses `User`, or nullptr when no
+// instruction can be reached from it.
+//   - If `User` is an instruction, the result is the function containing it.
+//   - Otherwise `User` is treated as a constant and its own users are searched
+//     recursively. All users are tried in order, not just the first one, so a
+//     dead constant expression at the head of the use list no longer hides a
+//     real use further down.
+static llvm::Function *FindUsingFunction(llvm::Value *User) {
+  if (llvm::Instruction *I = dyn_cast<llvm::Instruction>(User)) {
+    // Instruction should be inside a basic block, which is in a function
+    return cast<llvm::Function>(I->getParent()->getParent());
+  }
+  // User can be either instruction, constant, or operator. But User is an
+  // operator only if constant is a scalar value, not resource pointer.
+  llvm::Constant *CU = cast<llvm::Constant>(User);
+  for (llvm::Value *ConstUser : CU->users()) {
+    if (llvm::Function *F = FindUsingFunction(ConstUser))
+      return F;
+  }
+  return nullptr;
+}
+
+// Records `resIndex` as used by every function that references the resource's
+// global symbol. Constant buffers are additionally recorded in the function's
+// CB set, keyed by the resource ID.
+//   resource - module resource whose users are scanned.
+//   resIndex - value passed on to AddResourceReference for each using function.
+void DxilLibraryReflection::AddResourceUseToFunctions(DxilResourceBase &resource, unsigned resIndex) {
+  Constant *var = resource.GetGlobalSymbol();
+  if (!var)
+    return;
+
+  // Loop-invariant: evaluate the resource class once.
+  const bool isCBuffer = resource.GetClass() == DXIL::ResourceClass::CBuffer;
+
+  for (auto user : var->users()) {
+    // Find the function.
+    llvm::Function *F = FindUsingFunction(user);
+    if (!F)
+      continue;
+    // Use find() rather than operator[]: indexing would insert a null entry
+    // for an unregistered function and then dereference it.
+    auto entry = m_FunctionsByPtr.find(F);
+    if (entry == m_FunctionsByPtr.end() || !entry->second)
+      continue;
+    auto funcReflector = entry->second;
+    funcReflector->AddResourceReference(resIndex);
+    if (isCBuffer) {
+      funcReflector->AddCBReference(resource.GetID());
+    }
+  }
+}
+
+// Builds the per-function reflection objects for every defined function in the
+// module, orders them by name, and then attributes each reflected resource to
+// the functions that use it.
+void DxilLibraryReflection::AddResourceDependencies() {
+  // Step 1: create one reflector per function definition and index it by
+  // name and by Function pointer. The std::map keeps names sorted.
+  std::map<StringRef, CFunctionReflection*> reflectorsByName;
+  for (auto &fn : m_pModule->functions()) {
+    if (!fn.isDeclaration()) {
+      auto &slot = m_FunctionMap[fn.getName()];
+      DXASSERT(!slot.get(), "otherwise duplicate named functions");
+      slot.reset(new CFunctionReflection());
+      slot->Initialize(this, &fn);
+      m_FunctionsByPtr[&fn] = slot.get();
+      reflectorsByName[fn.getName()] = slot.get();
+    }
+  }
+
+  // Step 2: fill in function vector sorted by name.
+  m_FunctionVector.clear();
+  m_FunctionVector.reserve(reflectorsByName.size());
+  for (auto &entry : reflectorsByName) {
+    m_FunctionVector.push_back(entry.second);
+  }
+
+  // Step 3: walk the reflected resources in order; the position in
+  // m_Resources is the index recorded on each using function. The bind type
+  // selects which module resource list uID refers to.
+  for (UINT resIndex = 0; resIndex < (UINT)m_Resources.size(); ++resIndex) {
+    auto &resource = m_Resources[resIndex];
+    switch ((UINT32)resource.Type) {
+    case D3D_SIT_CBUFFER:
+      AddResourceUseToFunctions(m_pDxilModule->GetCBuffer(resource.uID), resIndex);
+      break;
+    case D3D_SIT_TBUFFER:   // TODO: Handle when TBuffers are added to CB list
+    case D3D_SIT_TEXTURE:
+    case D3D_SIT_STRUCTURED:
+    case D3D_SIT_BYTEADDRESS:
+    case D3D_SIT_RTACCELERATIONSTRUCTURE:
+      AddResourceUseToFunctions(m_pDxilModule->GetSRV(resource.uID), resIndex);
+      break;
+    case D3D_SIT_UAV_RWTYPED:
+    case D3D_SIT_UAV_RWSTRUCTURED:
+    case D3D_SIT_UAV_RWBYTEADDRESS:
+    case D3D_SIT_UAV_APPEND_STRUCTURED:
+    case D3D_SIT_UAV_CONSUME_STRUCTURED:
+    case D3D_SIT_UAV_RWSTRUCTURED_WITH_COUNTER:
+      AddResourceUseToFunctions(m_pDxilModule->GetUAV(resource.uID), resIndex);
+      break;
+    case D3D_SIT_SAMPLER:
+      AddResourceUseToFunctions(m_pDxilModule->GetSampler(resource.uID), resIndex);
+      break;
+    }
+  }
+}
+
+// ID3D12LibraryReflection
+
+// Loads reflection data for a library part: LoadModule runs first, then the
+// per-function resource dependency tables are built.
+// NOTE(review): IFR and CATCH_CPP_RETURN_HRESULT are project macros not shown
+// here; presumably IFR returns early on a failed HRESULT and the CATCH macro
+// converts C++ exceptions into an HRESULT - confirm against their definitions.
+HRESULT DxilLibraryReflection::Load(IDxcBlob *pBlob,
+  const DxilPartHeader *pPart) {
+  IFR(LoadModule(pBlob, pPart));
+
+  try {
+    // Creates the function reflectors and records which resources each uses.
+    AddResourceDependencies();
+    return S_OK;
+  }
+  CATCH_CPP_RETURN_HRESULT();
+}
+
+_Use_decl_annotations_
+HRESULT DxilLibraryReflection::GetDesc(D3D12_LIBRARY_DESC * pDesc) {
+  // Only FunctionCount is reported. Creator (the name of the originator of
+  // the library) and Flags (compilation flags) keep the zero value written by
+  // ZeroMemoryToOut.
+  IFR(ZeroMemoryToOut(pDesc));
+  const UINT reflectedFunctionCount = (UINT)m_FunctionVector.size();
+  pDesc->FunctionCount = reflectedFunctionCount;
+  return S_OK;
+}
+
+_Use_decl_annotations_
+ID3D12FunctionReflection *DxilLibraryReflection::GetFunctionByIndex(INT FunctionIndex) {
+  // The unsigned conversion makes negative indices compare as out of range,
+  // so a single bound check covers both ends. Out-of-range lookups return the
+  // shared invalid-function placeholder.
+  const UINT position = (UINT)FunctionIndex;
+  if (position < m_FunctionVector.size())
+    return m_FunctionVector[position];
+  return &g_InvalidFunction;
+}
+
+// DxilRuntimeReflection implementation
+#include "dxc/HLSL/DxilRuntimeReflection.inl"
+
 #else
 void hlsl::CreateDxcContainerReflection(IDxcContainerReflection **ppResult) {
   *ppResult = nullptr;

+ 2 - 1
lib/HLSL/DxilDebugInstrumentation.cpp

@@ -13,6 +13,7 @@
 #include "dxc/HLSL/DxilModule.h"
 #include "dxc/HLSL/DxilOperations.h"
 #include "dxc/HLSL/DxilPIXPasses.h"
+#include "dxc/HLSL/DxilUtil.h"
 
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Constants.h"
@@ -728,7 +729,7 @@ bool DxilDebugInstrumentation::runOnModule(Module &M) {
   //  value at (UAVSize) - (Small Amount) * 2 (which is actually a conservative definition of overflow).
   //
 
-  Instruction* firstInsertionPt = DM.GetEntryFunction()->getEntryBlock().getFirstInsertionPt();
+  Instruction* firstInsertionPt = dxilutil::FirstNonAllocaInsertionPt(DM.GetEntryFunction());
   IRBuilder<> Builder(firstInsertionPt);
 
   BuilderContext BC{ M, DM, Ctx, HlslOP, Builder };

+ 8 - 6
lib/HLSL/DxilEliminateOutputDynamicIndexing.cpp

@@ -13,6 +13,7 @@
 #include "dxc/HLSL/DxilOperations.h"
 #include "dxc/HLSL/DxilSignatureElement.h"
 #include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/DxilUtil.h"
 #include "dxc/Support/Global.h"
 #include "dxc/HLSL/DxilInstructions.h"
 
@@ -100,11 +101,12 @@ public:
 bool DxilEliminateOutputDynamicIndexing::EliminateDynamicOutput(
     hlsl::OP *hlslOP, DXIL::OpCode opcode, DxilSignature &outputSig,
     Function *Entry) {
-  ArrayRef<llvm::Function *> storeOutputs =
+  auto &storeOutputs =
       hlslOP->GetOpFuncList(opcode);
 
   MapVector<Value *, Type *> dynamicSigSet;
-  for (Function *F : storeOutputs) {
+  for (auto it : storeOutputs) {
+    Function *F = it.second;
     // Skip overload not used.
     if (!F)
       continue;
@@ -122,10 +124,10 @@ bool DxilEliminateOutputDynamicIndexing::EliminateDynamicOutput(
   if (dynamicSigSet.empty())
     return false;
 
-  IRBuilder<> Builder(Entry->getEntryBlock().getFirstInsertionPt());
+  IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(Entry));
 
-  Value *opcodeV = Builder.getInt32(static_cast<unsigned>(opcode));
-  Value *zero = Builder.getInt32(0);
+  Value *opcodeV = AllocaBuilder.getInt32(static_cast<unsigned>(opcode));
+  Value *zero = AllocaBuilder.getInt32(0);
 
   for (auto sig : dynamicSigSet) {
     Value *sigID = sig.first;
@@ -138,7 +140,7 @@ bool DxilEliminateOutputDynamicIndexing::EliminateDynamicOutput(
 
     std::vector<Value *> tmpSigElts(col);
     for (unsigned c = 0; c < col; c++) {
-      Value *newCol = Builder.CreateAlloca(AT);
+      Value *newCol = AllocaBuilder.CreateAlloca(AT);
       tmpSigElts[c] = newCol;
     }
 

+ 27 - 0
lib/HLSL/DxilEntryProps.h

@@ -0,0 +1,27 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilEntryProps.h                                                          //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Put entry signature and function props together.                          //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+#include "dxc/HLSL/DxilSignature.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
+
+namespace hlsl {
+
+// Bundles an entry point's signature with its function properties.
+class DxilEntryProps {
+public:
+  DxilEntrySignature sig;   // entry signature, built for props' shader kind
+  DxilFunctionProps props;  // copy of the function properties
+  // Creates an empty signature for p.shaderKind and copies the props.
+  // Takes a const reference so const objects and temporaries are accepted.
+  DxilEntryProps(const DxilFunctionProps &p, bool bUseMinPrecision)
+      : sig(p.shaderKind, bUseMinPrecision), props(p) {}
+  // Copy constructor: copies both signature and props. Takes a const
+  // reference so const objects and temporaries can be copied.
+  DxilEntryProps(const DxilEntryProps &p)
+      : sig(p.sig), props(p.props) {}
+};
+}

+ 221 - 0
lib/HLSL/DxilExportMap.cpp

@@ -0,0 +1,221 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilExportMap.cpp                                                         //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// dxilutil::ExportMap for handling -exports option.                         //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/Support/Global.h"
+#include "dxc/HLSL/DxilUtil.h"
+#include "dxc/HLSL/DxilExportMap.h"
+#include "dxc/HLSL/DxilTypeSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include <string>
+#include <vector>
+#include <set>
+
+using namespace llvm;
+using namespace hlsl;
+
+namespace hlsl {
+namespace dxilutil {
+
+// Removes every entry from the internal-name -> export-names map.
+// Only m_ExportMap is cleared here; the other members are left untouched.
+void ExportMap::clear() {
+  m_ExportMap.clear();
+}
+// Returns true when no export definitions have been added to m_ExportMap.
+bool ExportMap::empty() const {
+  return m_ExportMap.empty();
+}
+
+// Parses the values given to the -exports option and records them in the map.
+//
+// Each string in exportOpts holds one or more definitions separated by ';'.
+// A definition is either a bare name, or "export1[,export2,...]=internal",
+// which registers every listed export name for `internal`.  Empty names
+// between commas are skipped.
+//
+// Returns true on success.  On the first malformed definition (nothing before
+// '=', or an empty internal name -- which includes an empty definition such as
+// the one produced by a trailing ';') a message is written to `errors` and
+// false is returned.  The malformed definition adds nothing to the map;
+// definitions parsed before it remain.
+bool ExportMap::ParseExports(const std::vector<std::string> &exportOpts, llvm::raw_ostream &errors) {
+  for (auto &str : exportOpts) {
+    llvm::StringRef exports = StoreString(str);
+    size_t start = 0;
+    // def1;def2;...
+    while (true) {
+      const size_t end = exports.find_first_of(';', start);
+      llvm::StringRef exportDef = exports.slice(start, end);
+
+      // def: export1[[,export2,...]=internal]
+      const size_t equals = exportDef.find_first_of('=');
+      const bool hasInternal = equals != llvm::StringRef::npos;
+      llvm::StringRef internalName =
+          hasInternal ? exportDef.substr(equals + 1) : exportDef;
+
+      // Validate before touching the map, so a rejected definition cannot
+      // leave a partial (or empty-key) entry behind.
+      if (equals == 0 || internalName.empty()) {
+        errors << "Invalid syntax for -exports: '" << exportDef
+          << "'.  Syntax is: export1[[,export2,...]=internal][;...]";
+        return false;
+      }
+
+      if (hasInternal) {
+        size_t exportStart = 0;
+        while (true) {
+          size_t comma = exportDef.find_first_of(',', exportStart);
+          // Commas after '=' belong to the internal name, not the export list.
+          if (comma == llvm::StringRef::npos || comma > equals)
+            break;
+          if (exportStart < comma)
+            Add(exportDef.slice(exportStart, comma), internalName);
+          exportStart = comma + 1;
+        }
+        // Last (or only) export name before '='.
+        if (exportStart < equals)
+          Add(exportDef.slice(exportStart, equals), internalName);
+      } else {
+        Add(internalName);
+      }
+
+      if (end == llvm::StringRef::npos)
+        break;
+      start = end + 1;
+    }
+  }
+  return true;
+}
+
+// Registers exportName as an export of internalName.
+// Either name is unescaped first when it begins with a backslash.  An empty
+// internalName means "same as exportName".  The export name is stored in the
+// form returned by DemangleFunctionName.
+void ExportMap::Add(llvm::StringRef exportName, llvm::StringRef internalName) {
+  // The names originally come from command-line arguments and may be escaped.
+  // Unescaped text is placed in string storage so the returned view stays
+  // backed by this object.
+  auto unescapeIfNeeded = [this](llvm::StringRef name) -> llvm::StringRef {
+    if (!name.startswith("\\"))
+      return name;
+    std::string buffer;
+    llvm::raw_string_ostream stream(buffer);
+    PrintUnescapedString(name, stream);
+    return StoreString(stream.str());
+  };
+
+  exportName = unescapeIfNeeded(exportName);
+  internalName = unescapeIfNeeded(internalName);
+
+  llvm::StringRef key = internalName.empty() ? exportName : internalName;
+  m_ExportMap[key].insert(DemangleFunctionName(exportName));
+}
+
+// Finds the export entry for Name.
+// An exact key match is preferred.  Otherwise a name starting with
+// ManglingPrefix is retried under its demangled form, and a name starting
+// with EntryPrefix is retried with that prefix removed.  When no lookup
+// succeeds the returned iterator is the "not found" result of the last
+// lookup attempted.
+ExportMap::const_iterator ExportMap::GetExportsByName(llvm::StringRef Name) const {
+  ExportMap::const_iterator exact = m_ExportMap.find(Name);
+  StringRef demangled = DemangleFunctionName(Name);
+  if (exact != end())
+    return exact;
+  if (Name.startswith(ManglingPrefix))
+    return m_ExportMap.find(demangled);
+  if (Name.startswith(EntryPrefix))
+    return m_ExportMap.find(Name.substr(strlen(EntryPrefix)));
+  return exact;
+}
+
+// Reports whether `original` should be exported.
+// With no export definitions at all, every name counts as exported.
+bool ExportMap::IsExported(llvm::StringRef original) const {
+  return m_ExportMap.empty() || GetExportsByName(original) != end();
+}
+
+// Resets the per-run bookkeeping: export names and collisions start empty,
+// and every key of the export map starts out marked as unused.
+void ExportMap::BeginProcessing() {
+  m_ExportNames.clear();
+  m_NameCollisions.clear();
+  m_UnusedExports.clear();
+  for (auto &entry : m_ExportMap)
+    m_UnusedExports.emplace(entry.getKey());
+}
+
+// Decides whether F is exported, sets its linkage accordingly, and records
+// the names it is to be exported under.
+//
+// Returns true when F was already recorded in m_RenameMap or when an export
+// entry matches its name; F then gets external linkage.  Returns false when
+// no entry matches; F then gets internal linkage.
+//
+// When collisionAvoidanceRenaming is set, F itself is renamed in two cases:
+// a non-exported function gets a name built from "internal." + unmangled
+// name, and an exported function whose own (unmangled) name is not one of
+// its export names gets a name built from "temp." + unmangled name.
+bool ExportMap::ProcessFunction(llvm::Function *F, bool collisionAvoidanceRenaming) {
+  // Skip if already added.  This can happen due to patch constant functions.
+  if (m_RenameMap.find(F) != m_RenameMap.end())
+    return true;
+
+  // NOTE(review): originalName/unmangled are views taken before any
+  // F->setName() below and are not read after a rename -- presumably they
+  // would no longer be valid then; keep that ordering.
+  StringRef originalName = F->getName();
+  StringRef unmangled = DemangleFunctionName(originalName);
+  auto it = GetExportsByName(F->getName());
+
+  // Early out if not exported, and do optional collision avoidance
+  if (it == end()) {
+    F->setLinkage(GlobalValue::LinkageTypes::InternalLinkage);
+    if (collisionAvoidanceRenaming) {
+      std::string internalName = (Twine("internal.") + unmangled).str();
+      internalName = dxilutil::ReplaceFunctionName(originalName, internalName);
+      F->setName(internalName);
+    }
+    return false;
+  }
+
+  F->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage);
+
+  // Add entry to m_RenameMap:
+  auto &renames = m_RenameMap[F];
+  const llvm::StringSet<> &exportRenames = it->getValue();
+  llvm::StringRef internalName = it->getKey();
+
+  // mark export used
+  UseExport(internalName);
+
+  // Add identity first
+  // "Identity" means the function's own unmangled name appears among its
+  // export names, so it is exported under its current name as well.
+  auto itIdentity = exportRenames.find(unmangled);
+  if (exportRenames.empty() || itIdentity != exportRenames.end()) {
+    // The original name is only recorded as a rename when there are other
+    // export names too; with a single name the rename set stays empty.
+    if (exportRenames.size() > 1)
+      renames.insert(originalName);
+    ExportName(originalName);
+  } else if (collisionAvoidanceRenaming) {
+    // do optional collision avoidance for exports being renamed
+    std::string tempName = (Twine("temp.") + unmangled).str();
+    tempName = dxilutil::ReplaceFunctionName(originalName, tempName);
+    F->setName(tempName);
+  }
+
+  for (auto itName = exportRenames.begin(); itName != exportRenames.end(); itName++) {
+    // Now add actual renames
+    // The identity entry was handled above; every other export name is
+    // turned into a full function name based on F's current name.
+    if (itName != itIdentity) {
+      StringRef newName = StoreString(dxilutil::ReplaceFunctionName(F->getName(), itName->getKey()));
+      renames.insert(newName);
+      ExportName(newName);
+    }
+  }
+
+  return true;
+}
+
+// Marks F as exported under its current name without any renames.
+// Does nothing when F already has an entry in the rename map.
+void ExportMap::RegisterExportedFunction(llvm::Function *F) {
+  const bool alreadyRegistered = m_RenameMap.find(F) != m_RenameMap.end();
+  if (alreadyRegistered)
+    return;
+  F->setLinkage(GlobalValue::LinkageTypes::ExternalLinkage);
+  // Create the (empty) rename entry so F is known; no names are added to it.
+  NameSet &noRenames = m_RenameMap[F];
+  (void)(noRenames);
+  ExportName(F->getName());
+}
+
+// Removes internalName from the set of unused exports, if it is present.
+void ExportMap::UseExport(llvm::StringRef internalName) {
+  auto found = m_UnusedExports.find(internalName);
+  if (found == m_UnusedExports.end())
+    return;
+  m_UnusedExports.erase(found);
+}
+// Records exportName as a final export name.  A name that was already
+// recorded is additionally added to the collision set.
+void ExportMap::ExportName(llvm::StringRef exportName) {
+  const bool isNew = m_ExportNames.insert(exportName).second;
+  if (isNew)
+    return;
+  // Already present, report collision
+  m_NameCollisions.insert(exportName);
+}
+
+// Returns true when processing finished cleanly: no export was left unused
+// and no export name collided with another.
+bool ExportMap::EndProcessing() const {
+  if (!m_UnusedExports.empty())
+    return false;
+  return m_NameCollisions.empty();
+}
+
+// Inserts str into m_StringStorage (or finds the equal element already
+// there) and returns a view of the stored element.
+llvm::StringRef ExportMap::StoreString(llvm::StringRef str) {
+  auto inserted = m_StringStorage.insert(str);
+  return *inserted.first;
+}
+
+} // dxilutil
+} // hlsl

File diff suppressed because it is too large
+ 169 - 616
lib/HLSL/DxilGenerationPass.cpp


+ 3 - 2
lib/HLSL/DxilLegalizeSampleOffsetPass.cpp

@@ -184,8 +184,9 @@ void DxilLegalizeSampleOffsetPass::CollectIllegalOffsets(
 void DxilLegalizeSampleOffsetPass::CollectIllegalOffsets(
     std::vector<Instruction *> &illegalOffsets, Function &CurF,
     DXIL::OpCode opcode, hlsl::OP *hlslOP) {
-  ArrayRef<Function *> intrFuncList = hlslOP->GetOpFuncList(opcode);
-  for (Function *intrFunc : intrFuncList) {
+  auto &intrFuncList = hlslOP->GetOpFuncList(opcode);
+  for (auto it : intrFuncList) {
+    Function *intrFunc = it.second;
     if (!intrFunc)
       continue;
     for (User *U : intrFunc->users()) {

+ 560 - 157
lib/HLSL/DxilLinker.cpp

@@ -10,10 +10,12 @@
 #include "dxc/HLSL/DxilLinker.h"
 #include "dxc/HLSL/DxilCBuffer.h"
 #include "dxc/HLSL/DxilFunctionProps.h"
+#include "DxilEntryProps.h"
 #include "dxc/HLSL/DxilModule.h"
 #include "dxc/HLSL/DxilOperations.h"
 #include "dxc/HLSL/DxilResource.h"
 #include "dxc/HLSL/DxilSampler.h"
+#include "dxc/HLSL/DxilUtil.h"
 #include "dxc/Support/Global.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/DenseSet.h"
@@ -23,18 +25,22 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/SetVector.h"
 #include <memory>
 #include <vector>
 
 #include "dxc/HLSL/DxilContainer.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DebugInfo.h"
 
 #include "dxc/HLSL/DxilGenerationPass.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
 
+#include "dxc/HLSL/DxilExportMap.h"
+
 using namespace llvm;
 using namespace hlsl;
 
@@ -58,13 +64,13 @@ void AddResourceMap(
     std::unordered_map<const llvm::Constant *, DxilResourceBase *> &resMap,
     DxilModule &DM) {
   for (auto &Res : resTab) {
-    const DxilModule::ResourceLinkInfo &linkInfo =
-        DM.GetResourceLinkInfo(resClass, Res->GetID());
-    resMap[linkInfo.ResRangeID] = Res.get();
+    resMap[Res->GetGlobalSymbol()] = Res.get();
   }
 }
 
-void CloneFunction(Function *F, Function *NewF, ValueToValueMapTy &vmap) {
+void CloneFunction(Function *F, Function *NewF, ValueToValueMapTy &vmap,
+                   hlsl::DxilTypeSystem *TypeSys = nullptr,
+                   hlsl::DxilTypeSystem *SrcTypeSys = nullptr) {
   SmallVector<ReturnInst *, 2> Returns;
   // Map params.
   auto paramIt = NewF->arg_begin();
@@ -73,6 +79,11 @@ void CloneFunction(Function *F, Function *NewF, ValueToValueMapTy &vmap) {
   }
 
   llvm::CloneFunctionInto(NewF, F, vmap, /*ModuleLevelChanges*/ true, Returns);
+  if (TypeSys) {
+    if (SrcTypeSys == nullptr)
+      SrcTypeSys = TypeSys;
+    TypeSys->CopyFunctionAnnotation(NewF, F, *SrcTypeSys);
+  }
 
   // Remove params from vmap.
   for (Argument &param : F->args()) {
@@ -136,15 +147,16 @@ public:
   bool DetachLib(StringRef name) override;
   void DetachAll() override;
 
-  std::unique_ptr<llvm::Module> Link(StringRef entry,
-                                     StringRef profile) override;
+  std::unique_ptr<llvm::Module>
+  Link(StringRef entry, StringRef profile, dxilutil::ExportMap &exportMap) override;
 
 private:
   bool AttachLib(DxilLib *lib);
   bool DetachLib(DxilLib *lib);
   bool AddFunctions(SmallVector<StringRef, 4> &workList,
                     DenseSet<DxilLib *> &libSet, StringSet<> &addedFunctionSet,
-                    DxilLinkJob &linkJob, bool bLazyLoadDone);
+                    DxilLinkJob &linkJob, bool bLazyLoadDone,
+                    bool bAllowFuncionDecls);
   // Attached libs to link.
   std::unordered_set<DxilLib *> m_attachedLibs;
   // Owner of all DxilLib.
@@ -315,19 +327,30 @@ DxilResourceBase *DxilLib::GetResource(const llvm::Constant *GV) {
 namespace {
 // Create module from link defines.
 struct DxilLinkJob {
-  DxilLinkJob(LLVMContext &Ctx, unsigned valMajor, unsigned valMinor) : m_ctx(Ctx), m_valMajor(valMajor), m_valMinor(valMinor) {}
+  DxilLinkJob(LLVMContext &Ctx, dxilutil::ExportMap &exportMap,
+              unsigned valMajor, unsigned valMinor)
+      : m_ctx(Ctx), m_exportMap(exportMap), m_valMajor(valMajor),
+        m_valMinor(valMinor) {}
   std::unique_ptr<llvm::Module>
   Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
-       StringRef profile);
+       const ShaderModel *pSM);
+  std::unique_ptr<llvm::Module> LinkToLib(const ShaderModel *pSM);
+  void StripDeadDebugInfo(llvm::Module &M);
   void RunPreparePass(llvm::Module &M);
   void AddFunction(std::pair<DxilFunctionLinkInfo *, DxilLib *> &linkPair);
   void AddFunction(llvm::Function *F);
 
 private:
+  void LinkNamedMDNodes(Module *pM, ValueToValueMapTy &vmap);
+  void AddFunctionDecls(Module *pM);
+  bool AddGlobals(DxilModule &DM, ValueToValueMapTy &vmap);
+  void CloneFunctions(ValueToValueMapTy &vmap);
+  void AddFunctions(DxilModule &DM, ValueToValueMapTy &vmap,
+                    std::unordered_set<Function *> &initFuncSet);
   bool AddResource(DxilResourceBase *res, llvm::GlobalVariable *GV);
   void AddResourceToDM(DxilModule &DM);
   std::unordered_map<DxilFunctionLinkInfo *, DxilLib *> m_functionDefs;
-  llvm::StringMap<llvm::Function *> m_dxilFunctions;
+  llvm::StringMap<llvm::Function *> m_functionDecls;
   // New created functions.
   llvm::StringMap<llvm::Function *> m_newFunctions;
   // New created globals.
@@ -336,6 +359,7 @@ private:
   llvm::StringMap<std::pair<DxilResourceBase *, llvm::GlobalVariable *>>
       m_resourceMap;
   LLVMContext &m_ctx;
+  dxilutil::ExportMap &m_exportMap;
   unsigned m_valMajor, m_valMinor;
 };
 } // namespace
@@ -345,6 +369,7 @@ const char kUndefFunction[] = "Cannot find definition of function ";
 const char kRedefineFunction[] = "Definition already exists for function ";
 const char kRedefineGlobal[] = "Definition already exists for global variable ";
 const char kInvalidProfile[] = " is invalid profile to link";
+const char kExportOnlyForLib[] = "export map is only for library";
 const char kShaderKindMismatch[] =
     "Profile mismatch between entry function and target profile:";
 const char kNoEntryProps[] =
@@ -352,6 +377,9 @@ const char kNoEntryProps[] =
 const char kRedefineResource[] =
     "Resource already exists as ";
 const char kInvalidValidatorVersion[] = "Validator version does not support target profile ";
+const char kExportNameCollision[] = "Export name collides with another export: ";
+const char kExportFunctionMissing[] = "Could not find target for export: ";
+const char kNoFunctionsToExport[] = "Library has no functions to export";
 } // namespace
 //------------------------------------------------------------------------------
 //
@@ -488,138 +516,77 @@ void DxilLinkJob::AddResourceToDM(DxilModule &DM) {
     }
     // Update ID.
     basePtr->SetID(ID);
-    Constant *rangeID = ConstantInt::get(GV->getType()->getElementType(), ID);
-    for (User *U : GV->users()) {
-      LoadInst *LI = cast<LoadInst>(U);
-      LI->replaceAllUsesWith(rangeID);
-    }
+
+    basePtr->SetGlobalSymbol(GV);
+    DM.GetLLVMUsed().push_back(GV);
   }
+  // Prevent global vars used for resources from being deleted through optimizations
+  // while we still have hidden uses (pointers in resource vectors).
+  DM.EmitLLVMUsed();
 }
 
-std::unique_ptr<Module>
-DxilLinkJob::Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
-                  StringRef profile) {
-
-  Function *entryFunc = entryLinkPair.first->func;
-  DxilModule &entryDM = entryLinkPair.second->GetDxilModule();
-  if (!entryDM.HasDxilFunctionProps(entryFunc)) {
-    // Cannot get function props.
-    m_ctx.emitError(Twine(kNoEntryProps) + entryFunc->getName());
-    return nullptr;
+void DxilLinkJob::LinkNamedMDNodes(Module *pM, ValueToValueMapTy &vmap) {
+  SetVector<Module *> moduleSet;
+  for (auto &it : m_functionDefs) {
+    DxilLib *pLib = it.second;
+    moduleSet.insert(pLib->GetDxilModule().GetModule());
   }
-
-  DxilFunctionProps props = entryDM.GetDxilFunctionProps(entryFunc);
-  if (props.shaderKind == DXIL::ShaderKind::Library ||
-      props.shaderKind == DXIL::ShaderKind::Invalid) {
-    m_ctx.emitError(profile + Twine(kInvalidProfile));
-    // Invalid profile.
-    return nullptr;
+  // Link normal NamedMDNode.
+  // TODO: skip duplicate operands.
+  for (Module *pSrcM : moduleSet) {
+    const NamedMDNode *pSrcModFlags = pSrcM->getModuleFlagsMetadata();
+    for (const NamedMDNode &NMD : pSrcM->named_metadata()) {
+      // Don't link module flags here. Do them separately.
+      if (&NMD == pSrcModFlags)
+        continue;
+      // Skip dxil metadata which will be regenerated.
+      if (DxilMDHelper::IsKnownNamedMetaData(NMD))
+        continue;
+      NamedMDNode *DestNMD = pM->getOrInsertNamedMetadata(NMD.getName());
+      // Add Src elements into Dest node.
+      for (const MDNode *op : NMD.operands())
+        DestNMD->addOperand(MapMetadata(op, vmap, RF_None, /*TypeMap*/ nullptr,
+                                        /*ValMaterializer*/ nullptr));
+    }
   }
-
-  const ShaderModel *pSM = ShaderModel::GetByName(profile.data());
-  if (pSM->GetKind() != props.shaderKind) {
-    // Shader kind mismatch.
-    m_ctx.emitError(Twine(kShaderKindMismatch) + profile + " and " +
-                    ShaderModel::GetKindName(props.shaderKind));
-    return nullptr;
+  // Link mod flags.
+  SetVector<MDNode *> flagSet;
+  for (Module *pSrcM : moduleSet) {
+    NamedMDNode *pSrcModFlags = pSrcM->getModuleFlagsMetadata();
+    if (pSrcModFlags) {
+      for (MDNode *flag : pSrcModFlags->operands()) {
+        flagSet.insert(flag);
+      }
+    }
+  }
+  // TODO: check conflict in flags.
+  if (!flagSet.empty()) {
+    NamedMDNode *ModFlags = pM->getOrInsertModuleFlagsMetadata();
+    for (MDNode *flag : flagSet) {
+      ModFlags->addOperand(flag);
+    }
   }
+}
 
-  // Create new module.
-  std::unique_ptr<Module> pM =
-      llvm::make_unique<Module>(entryFunc->getName(), entryDM.GetCtx());
-  // Set target.
-  pM->setTargetTriple(entryDM.GetModule()->getTargetTriple());
-  // Add dxil operation functions before create DxilModule.
-  for (auto &it : m_dxilFunctions) {
+void DxilLinkJob::AddFunctionDecls(Module *pM) {
+  for (auto &it : m_functionDecls) {
     Function *F = it.second;
     Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
-                                      F->getName(), pM.get());
+                                      F->getName(), pM);
     NewF->setAttributes(F->getAttributes());
     m_newFunctions[NewF->getName()] = NewF;
   }
+}
 
-  // Create DxilModule.
-  const bool bSkipInit = true;
-  DxilModule &DM = pM->GetOrCreateDxilModule(bSkipInit);
-  DM.SetShaderModel(pSM);
-
-  // Set Validator version, verifying that it supports the requested profile
-  unsigned minValMajor, minValMinor;
-  DM.GetMinValidatorVersion(minValMajor, minValMinor);
-  if (minValMajor > m_valMajor || (minValMajor == m_valMajor && minValMinor > m_valMinor)) {
-    m_ctx.emitError(Twine(kInvalidValidatorVersion) + profile);
-    return nullptr;
-  }
-  DM.SetValidatorVersion(m_valMajor, m_valMinor);
-
-  // Add type sys
+bool DxilLinkJob::AddGlobals(DxilModule &DM, ValueToValueMapTy &vmap) {
   DxilTypeSystem &typeSys = DM.GetTypeSystem();
-
-  ValueToValueMapTy vmap;
-
-  std::unordered_set<Function *> initFuncSet;
-  // Add function
+  Module *pM = DM.GetModule();
+  bool bSuccess = true;
   for (auto &it : m_functionDefs) {
     DxilFunctionLinkInfo *linkInfo = it.first;
     DxilLib *pLib = it.second;
     DxilModule &tmpDM = pLib->GetDxilModule();
     DxilTypeSystem &tmpTypeSys = tmpDM.GetTypeSystem();
-
-    Function *F = linkInfo->func;
-    Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
-                                      F->getName(), pM.get());
-    NewF->setAttributes(F->getAttributes());
-
-    if (!NewF->hasFnAttribute(llvm::Attribute::NoInline))
-      NewF->addFnAttr(llvm::Attribute::AlwaysInline);
-
-    if (tmpTypeSys.GetFunctionAnnotation(F)) {
-      // Clone funcAnnotation to typeSys.
-      typeSys.CopyFunctionAnnotation(NewF, F, tmpTypeSys);
-    }
-
-    // Add to function map.
-    m_newFunctions[NewF->getName()] = NewF;
-    if (pLib->IsInitFunc(F))
-      initFuncSet.insert(NewF);
-
-    vmap[F] = NewF;
-  }
-
-  // Set Entry
-  Function *NewEntryFunc = m_newFunctions[entryFunc->getName()];
-  DM.SetEntryFunction(NewEntryFunc);
-  DM.SetEntryFunctionName(entryFunc->getName());
-  if (entryDM.HasDxilEntrySignature(entryFunc)) {
-    // Add signature.
-    DxilEntrySignature &entrySig = entryDM.GetDxilEntrySignature(entryFunc);
-    std::unique_ptr<DxilEntrySignature> newSig =
-        llvm::make_unique<DxilEntrySignature>(entrySig);
-    DM.ResetEntrySignature(newSig.release());
-  }
-
-  if (NewEntryFunc->hasFnAttribute(llvm::Attribute::AlwaysInline))
-    NewEntryFunc->removeFnAttr(llvm::Attribute::AlwaysInline);
-  if (props.IsHS()) {
-    Function *patchConstantFunc = props.ShaderProps.HS.patchConstantFunc;
-    Function *newPatchConstantFunc =
-        m_newFunctions[patchConstantFunc->getName()];
-    props.ShaderProps.HS.patchConstantFunc = newPatchConstantFunc;
-
-    if (newPatchConstantFunc->hasFnAttribute(llvm::Attribute::AlwaysInline))
-      newPatchConstantFunc->removeFnAttr(llvm::Attribute::AlwaysInline);
-  }
-  // Set EntryProps
-  DM.SetShaderProperties(&props);
-
-  // Debug info.
-
-  // Add global
-  bool bSuccess = true;
-  for (auto &it : m_functionDefs) {
-    DxilFunctionLinkInfo *linkInfo = it.first;
-    DxilLib *pLib = it.second;
-
     for (GlobalVariable *GV : linkInfo->usedGVs) {
       // Skip added globals.
       if (m_newGlobals.count(GV->getName())) {
@@ -646,9 +613,10 @@ DxilLinkJob::Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
       if (GV->hasInitializer())
         Initializer = GV->getInitializer();
 
+      Type *Ty = GV->getType()->getElementType();
       GlobalVariable *NewGV = new GlobalVariable(
-          *pM, GV->getType()->getElementType(), GV->isConstant(),
-          GV->getLinkage(), Initializer, GV->getName(),
+          *pM, Ty, GV->isConstant(), GV->getLinkage(), Initializer,
+          GV->getName(),
           /*InsertBefore*/ nullptr, GV->getThreadLocalMode(),
           GV->getType()->getAddressSpace(), GV->isExternallyInitialized());
 
@@ -656,16 +624,17 @@ DxilLinkJob::Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
 
       vmap[GV] = NewGV;
 
+      typeSys.CopyTypeAnnotation(Ty, tmpTypeSys);
+
       if (DxilResourceBase *res = pLib->GetResource(GV)) {
         bSuccess &= AddResource(res, NewGV);
       }
     }
   }
+  return bSuccess;
+}
 
-  if (!bSuccess)
-    return nullptr;
-
-  // Clone functions.
+void DxilLinkJob::CloneFunctions(ValueToValueMapTy &vmap) {
   for (auto &it : m_functionDefs) {
     DxilFunctionLinkInfo *linkInfo = it.first;
 
@@ -684,10 +653,120 @@ DxilLinkJob::Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
 
     CloneFunction(F, NewF, vmap);
   }
+}
+
+// Creates, in DM's module, an empty function for every function definition
+// added to this link job (bodies are filled in later by CloneFunctions).
+//
+// For each definition this copies the type, linkage, name and attributes,
+// adds AlwaysInline unless the function is NoInline, copies the function
+// annotation (when the source type system has one) into DM's type system,
+// records the new function in m_newFunctions, and maps the source function
+// to the new one in vmap.  New functions whose source is an init function of
+// its library are also added to initFuncSet.
+void DxilLinkJob::AddFunctions(DxilModule &DM, ValueToValueMapTy &vmap,
+                               std::unordered_set<Function *> &initFuncSet) {
+  DxilTypeSystem &typeSys = DM.GetTypeSystem();
+  Module *pM = DM.GetModule();
+  for (auto &it : m_functionDefs) {
+    DxilFunctionLinkInfo *linkInfo = it.first;
+    DxilLib *pLib = it.second;
+    DxilModule &tmpDM = pLib->GetDxilModule();
+    DxilTypeSystem &tmpTypeSys = tmpDM.GetTypeSystem();
+
+    Function *F = linkInfo->func;
+    Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
+                                      F->getName(), pM);
+    NewF->setAttributes(F->getAttributes());
+
+    if (!NewF->hasFnAttribute(llvm::Attribute::NoInline))
+      NewF->addFnAttr(llvm::Attribute::AlwaysInline);
+
+    // Only the presence of an annotation matters here; the copy itself looks
+    // the annotation up again through tmpTypeSys.
+    if (tmpTypeSys.GetFunctionAnnotation(F)) {
+      // Clone funcAnnotation to typeSys.
+      typeSys.CopyFunctionAnnotation(NewF, F, tmpTypeSys);
+    }
+
+    // Add to function map.
+    m_newFunctions[NewF->getName()] = NewF;
+    if (pLib->IsInitFunc(F))
+      initFuncSet.insert(NewF);
+
+    vmap[F] = NewF;
+  }
+}
+
+std::unique_ptr<Module>
+DxilLinkJob::Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
+                  const ShaderModel *pSM) {
+  Function *entryFunc = entryLinkPair.first->func;
+  DxilModule &entryDM = entryLinkPair.second->GetDxilModule();
+  if (!entryDM.HasDxilFunctionProps(entryFunc)) {
+    // Cannot get function props.
+    m_ctx.emitError(Twine(kNoEntryProps) + entryFunc->getName());
+    return nullptr;
+  }
+
+  DxilFunctionProps props = entryDM.GetDxilFunctionProps(entryFunc);
+
+  if (pSM->GetKind() != props.shaderKind) {
+    // Shader kind mismatch.
+    m_ctx.emitError(Twine(kShaderKindMismatch) +
+                    ShaderModel::GetKindName(pSM->GetKind()) + " and " +
+                    ShaderModel::GetKindName(props.shaderKind));
+    return nullptr;
+  }
+
+  // Create new module.
+  std::unique_ptr<Module> pM =
+      llvm::make_unique<Module>(entryFunc->getName(), entryDM.GetCtx());
+  // Set target.
+  pM->setTargetTriple(entryDM.GetModule()->getTargetTriple());
+  // Add dxil operation functions before create DxilModule.
+  AddFunctionDecls(pM.get());
+
+  // Create DxilModule.
+  const bool bSkipInit = true;
+  DxilModule &DM = pM->GetOrCreateDxilModule(bSkipInit);
+  DM.SetShaderModel(pSM, entryDM.GetUseMinPrecision());
+
+  // Set Validator version.
+  DM.SetValidatorVersion(m_valMajor, m_valMinor);
+
+  ValueToValueMapTy vmap;
+
+  std::unordered_set<Function *> initFuncSet;
+  // Add function
+  AddFunctions(DM, vmap, initFuncSet);
+
+  // Set Entry
+  Function *NewEntryFunc = m_newFunctions[entryFunc->getName()];
+  DM.SetEntryFunction(NewEntryFunc);
+  DM.SetEntryFunctionName(entryFunc->getName());
+
+  DxilEntryPropsMap EntryPropMap;
+  std::unique_ptr<DxilEntryProps> pProps =
+      llvm::make_unique<DxilEntryProps>(entryDM.GetDxilEntryProps(entryFunc));
+  EntryPropMap[NewEntryFunc] = std::move(pProps);
+  DM.ResetEntryPropsMap(std::move(EntryPropMap));
+
+
+  if (NewEntryFunc->hasFnAttribute(llvm::Attribute::AlwaysInline))
+    NewEntryFunc->removeFnAttr(llvm::Attribute::AlwaysInline);
+  if (props.IsHS()) {
+    Function *patchConstantFunc = props.ShaderProps.HS.patchConstantFunc;
+    Function *newPatchConstantFunc =
+        m_newFunctions[patchConstantFunc->getName()];
+    props.ShaderProps.HS.patchConstantFunc = newPatchConstantFunc;
+
+    if (newPatchConstantFunc->hasFnAttribute(llvm::Attribute::AlwaysInline))
+      newPatchConstantFunc->removeFnAttr(llvm::Attribute::AlwaysInline);
+  }
+  // Set EntryProps
+  DM.SetShaderProperties(&props);
+
+  // Add global
+  bool bSuccess = AddGlobals(DM, vmap);
+  if (!bSuccess)
+    return nullptr;
+
+  // Clone functions.
+  CloneFunctions(vmap);
 
   // Call global constrctor.
-  IRBuilder<> Builder(
-      DM.GetEntryFunction()->getEntryBlock().getFirstInsertionPt());
+  IRBuilder<> Builder(dxilutil::FirstNonAllocaInsertionPt(DM.GetEntryFunction()));
   for (auto &it : m_functionDefs) {
     DxilFunctionLinkInfo *linkInfo = it.first;
     DxilLib *pLib = it.second;
@@ -706,35 +785,268 @@ DxilLinkJob::Link(std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair,
   // This should be after functions cloned.
   AddResourceToDM(DM);
 
+  // Link metadata like debug info.
+  LinkNamedMDNodes(pM.get(), vmap);
+
   RunPreparePass(*pM);
 
   return pM;
 }
 
+// Links every function definition added to this job into one new library
+// module named "merged_lib" and returns it.
+//
+// Returns nullptr (after emitting an error on the context) when no function
+// definitions were added, and when export processing reports name collisions
+// or unused exports.  Also returns nullptr when AddGlobals fails.
+//
+// When the export map is not empty, defined functions that are not exported
+// are removed from the module, and exported functions are renamed and/or
+// cloned so that one function exists per requested export name.
+std::unique_ptr<Module>
+DxilLinkJob::LinkToLib(const ShaderModel *pSM) {
+  if (m_functionDefs.empty()) {
+    m_ctx.emitError(Twine(kNoFunctionsToExport));
+    return nullptr;
+  }
+  // The library of the first function definition supplies the context,
+  // target triple and min-precision setting for the merged module.
+  DxilLib *pLib = m_functionDefs.begin()->second;
+  DxilModule &tmpDM = pLib->GetDxilModule();
+  // Create new module.
+  std::unique_ptr<Module> pM =
+      llvm::make_unique<Module>("merged_lib", tmpDM.GetCtx());
+  // Set target.
+  pM->setTargetTriple(tmpDM.GetModule()->getTargetTriple());
+  // Add dxil operation functions and external decls before create DxilModule.
+  AddFunctionDecls(pM.get());
+
+  // Create DxilModule.
+  const bool bSkipInit = true;
+  DxilModule &DM = pM->GetOrCreateDxilModule(bSkipInit);
+  DM.SetShaderModel(pSM, tmpDM.GetUseMinPrecision());
+
+  // Set Validator version.
+  DM.SetValidatorVersion(m_valMajor, m_valMinor);
+
+  ValueToValueMapTy vmap;
+
+  std::unordered_set<Function *> initFuncSet;
+  // Add function
+  AddFunctions(DM, vmap, initFuncSet);
+
+  // Set DxilFunctionProps.
+  // Entry props are copied for each source function that has them and are
+  // keyed by the corresponding new function.
+  DxilEntryPropsMap EntryPropMap;
+  for (auto &it : m_functionDefs) {
+    DxilFunctionLinkInfo *linkInfo = it.first;
+    // Note: pLib/tmpDM below shadow the outer variables of the same name and
+    // refer to the library of the function currently being visited.
+    DxilLib *pLib = it.second;
+    DxilModule &tmpDM = pLib->GetDxilModule();
+
+    Function *F = linkInfo->func;
+    if (tmpDM.HasDxilEntryProps(F)) {
+      Function *NewF = m_newFunctions[F->getName()];
+      DxilEntryProps &props = tmpDM.GetDxilEntryProps(F);
+      std::unique_ptr<DxilEntryProps> pProps =
+          llvm::make_unique<DxilEntryProps>(props);
+      EntryPropMap[NewF] = std::move(pProps);
+    }
+  }
+  DM.ResetEntryPropsMap(std::move(EntryPropMap));
+
+  // Add global
+  bool bSuccess = AddGlobals(DM, vmap);
+  if (!bSuccess)
+    return nullptr;
+
+  // Clone functions.
+  CloneFunctions(vmap);
+
+  // Refresh intrinsic cache.
+  DM.GetOP()->RefreshCache();
+
+  // Add resource to DM.
+  // This should be after functions cloned.
+  AddResourceToDM(DM);
+
+  // Link metadata like debug info.
+  LinkNamedMDNodes(pM.get(), vmap);
+
+  RunPreparePass(*pM);
+
+  if (!m_exportMap.empty()) {
+    m_exportMap.BeginProcessing();
+
+    DM.ClearDxilMetadata(*pM);
+    for (auto it = pM->begin(); it != pM->end();) {
+      // Advance before possibly erasing F so the iterator stays usable.
+      Function *F = it++;
+      if (F->isDeclaration())
+        continue;
+      if (!m_exportMap.ProcessFunction(F, true)) {
+        // Remove Function not in exportMap.
+        DM.RemoveFunction(F);
+        F->eraseFromParent();
+      }
+    }
+
+    if(!m_exportMap.EndProcessing()) {
+      // Report every collision and every unused export, then fail the link.
+      for (auto &name : m_exportMap.GetNameCollisions()) {
+        std::string escaped;
+        llvm::raw_string_ostream os(escaped);
+        dxilutil::PrintEscapedString(name, os);
+        m_ctx.emitError(Twine(kExportNameCollision) + os.str());
+      }
+      for (auto &name : m_exportMap.GetUnusedExports()) {
+        std::string escaped;
+        llvm::raw_string_ostream os(escaped);
+        dxilutil::PrintEscapedString(name, os);
+        m_ctx.emitError(Twine(kExportFunctionMissing) + os.str());
+      }
+      return nullptr;
+    }
+
+    // Rename the original, if necessary, then clone the rest
+    for (auto &it : m_exportMap.GetFunctionRenames()) {
+      Function *F = it.first;
+      auto &renames = it.second;
+
+      if (renames.empty())
+        continue;
+
+      auto itName = renames.begin();
+
+      // Rename the original, if necessary, then clone the rest
+      // If F's current name is not one of the requested names, F takes the
+      // first requested name; otherwise F keeps its name.
+      if (renames.find(F->getName()) == renames.end())
+        F->setName(*(itName++));
+
+      while (itName != renames.end()) {
+        // Every remaining name other than F's own gets a clone of F.
+        if (F->getName() != *itName) {
+          Function *NewF = Function::Create(F->getFunctionType(),
+            GlobalValue::LinkageTypes::ExternalLinkage,
+            *itName, DM.GetModule());
+          // A fresh map per clone; it shadows the outer vmap on purpose so
+          // mappings from one clone do not leak into the next.
+          ValueToValueMapTy vmap;
+          CloneFunction(F, NewF, vmap, &DM.GetTypeSystem());
+          // add DxilFunctionProps if entry
+          if (DM.HasDxilFunctionProps(F)) {
+            DM.CloneDxilEntryProps(F, NewF);
+          }
+        }
+        itName++;
+      }
+    }
+
+    DM.EmitDxilMetadata();
+  }
+
+  return pM;
+}
+
 void DxilLinkJob::AddFunction(
     std::pair<DxilFunctionLinkInfo *, DxilLib *> &linkPair) {
   m_functionDefs[linkPair.first] = linkPair.second;
 }
 
 void DxilLinkJob::AddFunction(llvm::Function *F) {
-  m_dxilFunctions[F->getName()] = F;
+  m_functionDecls[F->getName()] = F;
+}
+
+// Clone of StripDeadDebugInfo::runOnModule.
+// In addition to dropping debug entries with no function/global variable,
+// this also drops subprograms and global variables whose function or
+// GlobalVariable does not belong to the current Module.
+void DxilLinkJob::StripDeadDebugInfo(Module &M) {
+  LLVMContext &C = M.getContext();
+  // Find all debug info in F. This is actually overkill in terms of what we
+  // want to do, but we want to try and be as resilient as possible in the face
+  // of potential debug info changes by using the formal interfaces given to us
+  // as much as possible.
+  DebugInfoFinder F;
+  F.processModule(M);
+
+  // For each compile unit, find the live set of global variables/functions and
+  // replace the current list of potentially dead global variables/functions
+  // with the live list.
+  SmallVector<Metadata *, 64> LiveGlobalVariables;
+  SmallVector<Metadata *, 64> LiveSubprograms;
+  DenseSet<const MDNode *> VisitedSet;
+
+  for (DICompileUnit *DIC : F.compile_units()) {
+    // Create our live subprogram list.
+    bool SubprogramChange = false;
+    for (DISubprogram *DISP : DIC->getSubprograms()) {
+      // Make sure we visit each subprogram only once.
+      if (!VisitedSet.insert(DISP).second)
+        continue;
+
+      // If the function referenced by DISP is not null, the function is live.
+      // It is only kept when that function also lives in M.
+      if (Function *Func = DISP->getFunction()) {
+        if (Func->getParent() == &M)
+          LiveSubprograms.push_back(DISP);
+        else
+          SubprogramChange = true;
+      } else {
+        SubprogramChange = true;
+      }
+    }
+
+    // Create our live global variable list.
+    bool GlobalVariableChange = false;
+    for (DIGlobalVariable *DIG : DIC->getGlobalVariables()) {
+      // Make sure we only visit each global variable only once.
+      if (!VisitedSet.insert(DIG).second)
+        continue;
+
+      // If the global variable referenced by DIG is not null, the global
+      // variable is live.
+      // A GlobalVariable is only kept when it belongs to M; any other kind
+      // of constant is kept unconditionally.
+      if (Constant *CV = DIG->getVariable()) {
+        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CV)) {
+          if (GV->getParent() == &M) {
+            LiveGlobalVariables.push_back(DIG);
+          } else {
+            GlobalVariableChange = true;
+          }
+        } else {
+          LiveGlobalVariables.push_back(DIG);
+        }
+      } else {
+        GlobalVariableChange = true;
+      }
+    }
+
+    // If we found dead subprograms or global variables, replace the current
+    // subprogram list/global variable list with our new live subprogram/global
+    // variable list.
+    if (SubprogramChange) {
+      DIC->replaceSubprograms(MDTuple::get(C, LiveSubprograms));
+    }
+
+    if (GlobalVariableChange) {
+      DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
+    }
+
+    // Reset lists for the next iteration.
+    LiveSubprograms.clear();
+    LiveGlobalVariables.clear();
+  }
 }
 
 void DxilLinkJob::RunPreparePass(Module &M) {
+  StripDeadDebugInfo(M);
   legacy::PassManager PM;
-
   PM.add(createAlwaysInlinerPass(/*InsertLifeTime*/ false));
+
+  // Remove unused functions.
   PM.add(createDxilDeadFunctionEliminationPass());
+
+  // SROA
+  PM.add(createSROAPass(/*RequiresDomTree*/false));
+
+  // Remove MultiDimArray from function call arg.
+  PM.add(createMultiDimArrayToOneDimArrayPass());
+
+  // Lower matrix bitcast.
+  PM.add(createMatrixBitcastLowerPass());
+
   // mem2reg.
   PM.add(createPromoteMemoryToRegisterPass());
-  // Remove unused functions.
-  PM.add(createDeadCodeEliminationPass());
-  PM.add(createGlobalDCEPass());
+
+  // Clean up vectors, and run mem2reg again
+  PM.add(createScalarizerPass());
+  PM.add(createPromoteMemoryToRegisterPass());
 
   PM.add(createSimplifyInstPass());
   PM.add(createCFGSimplificationPass());
 
-  PM.add(createDxilCondenseResourcesPass());
+  PM.add(createDeadCodeEliminationPass());
+  PM.add(createGlobalDCEPass());
+
+  PM.add(createDxilLowerCreateHandleForLibPass());
+  PM.add(createDxilTranslateRawBuffer());
   PM.add(createDxilFinalizeModulePass());
   PM.add(createComputeViewIdStatePass());
   PM.add(createDxilDeadFunctionEliminationPass());
@@ -859,7 +1171,8 @@ bool DxilLinkerImpl::DetachLib(DxilLib *lib) {
 bool DxilLinkerImpl::AddFunctions(SmallVector<StringRef, 4> &workList,
                                   DenseSet<DxilLib *> &libSet,
                                   StringSet<> &addedFunctionSet,
-                                  DxilLinkJob &linkJob, bool bLazyLoadDone) {
+                                  DxilLinkJob &linkJob, bool bLazyLoadDone,
+                                  bool bAllowFuncionDecls) {
   while (!workList.empty()) {
     StringRef name = workList.pop_back_val();
     // Ignore added function.
@@ -882,12 +1195,17 @@ bool DxilLinkerImpl::AddFunctions(SmallVector<StringRef, 4> &workList,
       pLib->LazyLoadFunction(F);
     }
     for (Function *F : linkPair.first->usedFunctions) {
-      if (hlsl::OP::IsDxilOpFunc(F)) {
+      if (hlsl::OP::IsDxilOpFunc(F) || F->isIntrinsic()) {
         // Add dxil operations directly.
         linkJob.AddFunction(F);
-      } else {
-        // Push function name to work list.
-        workList.emplace_back(F->getName());
+      } else if (addedFunctionSet.count(F->getName()) == 0) {
+        if (bAllowFuncionDecls && F->isDeclaration() && !m_functionNameMap.count(F->getName())) {
+          // When linking to lib, use of undefined function is allowed; add directly.
+          linkJob.AddFunction(F);
+        } else {
+          // Push function name to work list.
+          workList.emplace_back(F->getName());
+        }
       }
     }
 
@@ -896,39 +1214,124 @@ bool DxilLinkerImpl::AddFunctions(SmallVector<StringRef, 4> &workList,
   return true;
 }
 
-std::unique_ptr<llvm::Module> DxilLinkerImpl::Link(StringRef entry,
-                                               StringRef profile) {
-  StringSet<> addedFunctionSet;
-  SmallVector<StringRef, 4> workList;
-  workList.emplace_back(entry);
+std::unique_ptr<llvm::Module>
+DxilLinkerImpl::Link(StringRef entry, StringRef profile, dxilutil::ExportMap &exportMap) {
+  const ShaderModel *pSM = ShaderModel::GetByName(profile.data());
+  DXIL::ShaderKind kind = pSM->GetKind();
+  if (kind == DXIL::ShaderKind::Invalid ||
+      (kind >= DXIL::ShaderKind::RayGeneration &&
+       kind <= DXIL::ShaderKind::Callable)) {
+    m_ctx.emitError(profile + Twine(kInvalidProfile));
+    // Invalid profile.
+    return nullptr;
+  }
 
-  DxilLinkJob linkJob(m_ctx, m_valMajor, m_valMinor);
+  if (!exportMap.empty() && kind != DXIL::ShaderKind::Library) {
+    m_ctx.emitError(Twine(kExportOnlyForLib));
+    return nullptr;
+  }
 
-  DenseSet<DxilLib *> libSet;
-  if (!AddFunctions(workList, libSet, addedFunctionSet, linkJob,
-                    /*bLazyLoadDone*/ false))
+  // Verify that the validator version supports the requested profile.
+  unsigned minValMajor, minValMinor;
+  pSM->GetMinValidatorVersion(minValMajor, minValMinor);
+  if (minValMajor > m_valMajor ||
+      (minValMajor == m_valMajor && minValMinor > m_valMinor)) {
+    m_ctx.emitError(Twine(kInvalidValidatorVersion) + profile);
     return nullptr;
+  }
+
+  DxilLinkJob linkJob(m_ctx, exportMap, m_valMajor, m_valMinor);
+
+  DenseSet<DxilLib *> libSet;
+  StringSet<> addedFunctionSet;
+
+  bool bIsLib = pSM->IsLib();
+  if (!bIsLib) {
+    SmallVector<StringRef, 4> workList;
+    workList.emplace_back(entry);
+
+    if (!AddFunctions(workList, libSet, addedFunctionSet, linkJob,
+                      /*bLazyLoadDone*/ false,
+                      /*bAllowFuncionDecls*/ false))
+      return nullptr;
+
+  } else {
+    if (exportMap.empty()) {
+      // Add every function for lib profile.
+      for (auto &it : m_functionNameMap) {
+        StringRef name = it.getKey();
+        std::pair<DxilFunctionLinkInfo *, DxilLib *> &linkPair = it.second;
+        DxilFunctionLinkInfo *linkInfo = linkPair.first;
+        DxilLib *pLib = linkPair.second;
+
+        Function *F = linkInfo->func;
+        pLib->LazyLoadFunction(F);
+
+        linkJob.AddFunction(linkPair);
+
+        libSet.insert(pLib);
+
+        addedFunctionSet.insert(name);
+      }
+      // Add every dxil function and llvm intrinsic.
+      for (auto *pLib : libSet) {
+        auto &DM = pLib->GetDxilModule();
+        DM.GetOP();
+        auto *pM = DM.GetModule();
+        for (Function &F : pM->functions()) {
+          if (hlsl::OP::IsDxilOpFunc(&F) || F.isIntrinsic() ||
+            (F.isDeclaration() && m_functionNameMap.count(F.getName()) == 0)) {
+            // Add intrinsics and function decls still not defined in any lib
+            linkJob.AddFunction(&F);
+          }
+        }
+      }
+    } else {
+      SmallVector<StringRef, 4> workList;
+
+      // Only add exported functions.
+      for (auto &it : m_functionNameMap) {
+        StringRef name = it.getKey();
+        // Only add names that exist in exportMap.
+        if (exportMap.IsExported(name))
+          workList.emplace_back(name);
+      }
+
+      if (!AddFunctions(workList, libSet, addedFunctionSet, linkJob,
+                        /*bLazyLoadDone*/ false,
+                        /*bAllowFuncionDecls*/ true))
+        return nullptr;
+    }
+  }
 
   // Save global users.
   for (auto &pLib : libSet) {
     pLib->BuildGlobalUsage();
   }
 
+  SmallVector<StringRef, 4> workList;
   // Save global ctor users.
   for (auto &pLib : libSet) {
     pLib->CollectUsedInitFunctions(addedFunctionSet, workList);
   }
   // Add init functions if used.
-  // All init function already loaded in BuildGlobalUsage, so set bLazyLoad
-  // false here.
+  // All init functions were already loaded in BuildGlobalUsage,
+  // so set bLazyLoadDone to true here.
+  // For a lib, decls should already have been added to addedFunctionSet,
+  // so set bAllowFuncionDecls to false here.
   if (!AddFunctions(workList, libSet, addedFunctionSet, linkJob,
-                    /*bLazyLoadDone*/ true))
+                    /*bLazyLoadDone*/ true,
+                    /*bAllowFuncionDecls*/ false))
     return nullptr;
 
-  std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair =
-      m_functionNameMap[entry];
+  if (!bIsLib) {
+    std::pair<DxilFunctionLinkInfo *, DxilLib *> &entryLinkPair =
+        m_functionNameMap[entry];
 
-  return linkJob.Link(entryLinkPair, profile);
+    return linkJob.Link(entryLinkPair, pSM);
+  } else {
+    return linkJob.LinkToLib(pSM);
+  }
 }
 
 namespace hlsl {

+ 283 - 57
lib/HLSL/DxilMetadataHelper.cpp

@@ -19,6 +19,7 @@
 #include "dxc/HLSL/DxilRootSignature.h"
 #include "dxc/HLSL/ComputeViewIdState.h"
 #include "dxc/HLSL/DxilFunctionProps.h"
+#include "dxc/HLSL/DxilShaderFlags.h"
 
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -44,19 +45,17 @@ const char DxilMDHelper::kDxilVersionMDName[]                         = "dx.vers
 const char DxilMDHelper::kDxilShaderModelMDName[]                     = "dx.shaderModel";
 const char DxilMDHelper::kDxilEntryPointsMDName[]                     = "dx.entryPoints";
 const char DxilMDHelper::kDxilResourcesMDName[]                       = "dx.resources";
-const char DxilMDHelper::kDxilResourcesLinkInfoMDName[]               = "dx.resources.link.info";
 const char DxilMDHelper::kDxilTypeSystemMDName[]                      = "dx.typeAnnotations";
 const char DxilMDHelper::kDxilTypeSystemHelperVariablePrefix[]        = "dx.typevar.";
 const char DxilMDHelper::kDxilControlFlowHintMDName[]                 = "dx.controlflow.hints";
 const char DxilMDHelper::kDxilPreciseAttributeMDName[]                = "dx.precise";
+const char DxilMDHelper::kDxilNonUniformAttributeMDName[]             = "dx.nonuniform";
 const char DxilMDHelper::kHLDxilResourceAttributeMDName[]             = "dx.hl.resource.attribute";
 const char DxilMDHelper::kDxilValidatorVersionMDName[]                = "dx.valver";
 
 // This named metadata is not valid in final module (should be moved to DxilContainer)
 const char DxilMDHelper::kDxilRootSignatureMDName[]                   = "dx.rootSignature";
 const char DxilMDHelper::kDxilViewIdStateMDName[]                     = "dx.viewIdState";
-const char DxilMDHelper::kDxilFunctionPropertiesMDName[]              = "dx.func.props";
-const char DxilMDHelper::kDxilEntrySignaturesMDName[]                 = "dx.func.signatures";
 
 const char DxilMDHelper::kDxilSourceContentsMDName[]                  = "dx.source.contents";
 const char DxilMDHelper::kDxilSourceDefinesMDName[]                   = "dx.source.defines";
@@ -165,7 +164,7 @@ void DxilMDHelper::EmitDxilShaderModel(const ShaderModel *pSM) {
   pShaderModelNamedMD = m_pModule->getOrInsertNamedMetadata(kDxilShaderModelMDName);
 
   Metadata *MDVals[kDxilShaderModelNumFields];
-  MDVals[kDxilShaderModelTypeIdx ] = MDString::get(m_Ctx, pSM->GetKindName().c_str());
+  MDVals[kDxilShaderModelTypeIdx ] = MDString::get(m_Ctx, pSM->GetKindName());
   MDVals[kDxilShaderModelMajorIdx] = Uint32ToConstMD(pSM->GetMajor());
   MDVals[kDxilShaderModelMinorIdx] = Uint32ToConstMD(pSM->GetMinor());
 
@@ -185,7 +184,8 @@ void DxilMDHelper::LoadDxilShaderModel(const ShaderModel *&pSM) {
   unsigned Major = ConstMDToUint32(pShaderModelMD->getOperand(kDxilShaderModelMajorIdx));
   unsigned Minor = ConstMDToUint32(pShaderModelMD->getOperand(kDxilShaderModelMinorIdx));
   string ShaderModelName = pShaderTypeMD->getString();
-  ShaderModelName += "_" + std::to_string(Major) + "_" + std::to_string(Minor);
+  ShaderModelName += "_" + std::to_string(Major) + "_" +
+    (Minor == ShaderModel::kOfflineMinor ? "x" : std::to_string(Minor));
   pSM = ShaderModel::GetByName(ShaderModelName.c_str());
   if (!pSM->IsValidForDxil()) {
     char ErrorMsgTxt[40];
@@ -200,7 +200,8 @@ void DxilMDHelper::LoadDxilShaderModel(const ShaderModel *&pSM) {
 // Entry points.
 //
 void DxilMDHelper::EmitDxilEntryPoints(vector<MDNode *> &MDEntries) {
-  DXASSERT(MDEntries.size() == 1, "only one entry point is supported for now");
+  DXASSERT(MDEntries.size() == 1 || GetShaderModel()->IsLib(),
+           "only one entry point is supported for now");
   NamedMDNode *pEntryPointsNamedMD = m_pModule->getNamedMetadata(kDxilEntryPointsMDName);
   IFTBOOL(pEntryPointsNamedMD == nullptr, DXC_E_INCORRECT_DXIL_METADATA);
   pEntryPointsNamedMD = m_pModule->getOrInsertNamedMetadata(kDxilEntryPointsMDName);
@@ -490,52 +491,6 @@ void DxilMDHelper::UpdateDxilResources(llvm::MDTuple *pDxilResourceTuple) {
   }
 }
 
-void DxilMDHelper::EmitDxilResourceLinkInfoTuple(MDTuple *pSRVs, MDTuple *pUAVs,
-                                             MDTuple *pCBuffers,
-                                             MDTuple *pSamplers) {
-  DXASSERT(pSRVs != nullptr || pUAVs != nullptr || pCBuffers != nullptr ||
-               pSamplers != nullptr,
-           "resource tuple should not be emitted if there are no resources");
-  Metadata *MDVals[kDxilNumResourceFields];
-  MDVals[kDxilResourceSRVs] = pSRVs;
-  MDVals[kDxilResourceUAVs] = pUAVs;
-  MDVals[kDxilResourceCBuffers] = pCBuffers;
-  MDVals[kDxilResourceSamplers] = pSamplers;
-  MDTuple *pTupleMD = MDNode::get(m_Ctx, MDVals);
-
-  NamedMDNode *pResourcesNamedMD =
-      m_pModule->getNamedMetadata(kDxilResourcesLinkInfoMDName);
-  IFTBOOL(pResourcesNamedMD == nullptr, DXC_E_INCORRECT_DXIL_METADATA);
-  pResourcesNamedMD =
-      m_pModule->getOrInsertNamedMetadata(kDxilResourcesLinkInfoMDName);
-  pResourcesNamedMD->addOperand(pTupleMD);
-}
-
-void DxilMDHelper::LoadDxilResourceLinkInfoTuple(const llvm::MDTuple *&pSRVs,
-                                             const llvm::MDTuple *&pUAVs,
-                                             const llvm::MDTuple *&pCBuffers,
-                                             const llvm::MDTuple *&pSamplers) {
-  NamedMDNode *pResourcesNamedMD =
-      m_pModule->getNamedMetadata(kDxilResourcesLinkInfoMDName);
-  if (!pResourcesNamedMD) {
-    pSRVs = pUAVs = pCBuffers = pSamplers = nullptr;
-    return;
-  }
-
-  IFTBOOL(pResourcesNamedMD->getNumOperands() == 1,
-          DXC_E_INCORRECT_DXIL_METADATA);
-
-  const MDTuple *pTupleMD = dyn_cast<MDTuple>(pResourcesNamedMD->getOperand(0));
-  IFTBOOL(pTupleMD != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
-  IFTBOOL(pTupleMD->getNumOperands() == kDxilNumResourceFields,
-          DXC_E_INCORRECT_DXIL_METADATA);
-
-  pSRVs = CastToTupleOrNull(pTupleMD->getOperand(kDxilResourceSRVs));
-  pUAVs = CastToTupleOrNull(pTupleMD->getOperand(kDxilResourceUAVs));
-  pCBuffers = CastToTupleOrNull(pTupleMD->getOperand(kDxilResourceCBuffers));
-  pSamplers = CastToTupleOrNull(pTupleMD->getOperand(kDxilResourceSamplers));
-}
-
 void DxilMDHelper::GetDxilResources(const MDOperand &MDO, const MDTuple *&pSRVs,
                                     const MDTuple *&pUAVs, const MDTuple *&pCBuffers,
                                     const MDTuple *&pSamplers) {
@@ -958,14 +913,15 @@ void DxilMDHelper::LoadDxilFieldAnnotation(const MDOperand &MDO, DxilFieldAnnota
   }
 }
 
-Function *DxilMDHelper::LoadDxilFunctionProps(MDTuple *pProps,
+const Function *DxilMDHelper::LoadDxilFunctionProps(const MDTuple *pProps,
                                               hlsl::DxilFunctionProps *props) {
   unsigned idx = 0;
-  Function *F = dyn_cast<Function>(
+  const Function *F = dyn_cast<Function>(
       dyn_cast<ValueAsMetadata>(pProps->getOperand(idx++))->getValue());
   DXIL::ShaderKind shaderKind =
       static_cast<DXIL::ShaderKind>(ConstMDToUint32(pProps->getOperand(idx++)));
 
+  bool bRayAttributes = false;
   props->shaderKind = shaderKind;
   switch (shaderKind) {
   case DXIL::ShaderKind::Compute:
@@ -1016,19 +972,253 @@ Function *DxilMDHelper::LoadDxilFunctionProps(MDTuple *pProps,
     props->ShaderProps.PS.EarlyDepthStencil =
         ConstMDToUint32(pProps->getOperand(idx++));
     break;
+  case DXIL::ShaderKind::AnyHit:
+  case DXIL::ShaderKind::ClosestHit:
+    bRayAttributes = true;
+  case DXIL::ShaderKind::Miss:
+  case DXIL::ShaderKind::Callable:
+    // payload/params unioned and first:
+    props->ShaderProps.Ray.payloadSizeInBytes =
+      ConstMDToUint32(pProps->getOperand(idx++));
+    if (bRayAttributes)
+      props->ShaderProps.Ray.attributeSizeInBytes =
+        ConstMDToUint32(pProps->getOperand(idx++));
+    break;
   default:
     break;
   }
   return F;
 }
 
+MDTuple *DxilMDHelper::EmitDxilEntryProperties(uint64_t rawShaderFlag,
+                                                const DxilFunctionProps &props,
+                                                unsigned autoBindingSpace) {
+  vector<Metadata *> MDVals;
+
+  // DXIL shader flags.
+  if (props.IsPS()) {
+    if (props.ShaderProps.PS.EarlyDepthStencil) {
+      ShaderFlags flags;
+      flags.SetShaderFlagsRaw(rawShaderFlag);
+      flags.SetForceEarlyDepthStencil(true);
+      rawShaderFlag = flags.GetShaderFlagsRaw();
+    }
+  }
+  if (rawShaderFlag != 0) {
+    MDVals.emplace_back(Uint32ToConstMD(kDxilShaderFlagsTag));
+    MDVals.emplace_back(Uint64ToConstMD(rawShaderFlag));
+  }
+
+  // Add shader kind for lib entries.
+  if (m_pSM->IsLib() && props.shaderKind != DXIL::ShaderKind::Library) {
+    MDVals.emplace_back(Uint32ToConstMD(kDxilShaderKindTag));
+    MDVals.emplace_back(
+        Uint32ToConstMD(static_cast<unsigned>(props.shaderKind)));
+  }
+
+  switch (props.shaderKind) {
+  // Compute shader.
+  case DXIL::ShaderKind::Compute: {
+    auto &CS = props.ShaderProps.CS;
+    MDVals.emplace_back(Uint32ToConstMD(DxilMDHelper::kDxilNumThreadsTag));
+    vector<Metadata *> NumThreadVals;
+    NumThreadVals.emplace_back(Uint32ToConstMD(CS.numThreads[0]));
+    NumThreadVals.emplace_back(Uint32ToConstMD(CS.numThreads[1]));
+    NumThreadVals.emplace_back(Uint32ToConstMD(CS.numThreads[2]));
+    MDVals.emplace_back(MDNode::get(m_Ctx, NumThreadVals));
+  } break;
+  // Geometry shader.
+  case DXIL::ShaderKind::Geometry: {
+    MDVals.emplace_back(Uint32ToConstMD(DxilMDHelper::kDxilGSStateTag));
+    DXIL::PrimitiveTopology topo = DXIL::PrimitiveTopology::Undefined;
+    unsigned activeStreamMask = 0;
+    for (size_t i = 0;
+         i < _countof(props.ShaderProps.GS.streamPrimitiveTopologies); ++i) {
+      if (props.ShaderProps.GS.streamPrimitiveTopologies[i] !=
+          DXIL::PrimitiveTopology::Undefined) {
+        activeStreamMask |= 1 << i;
+        DXASSERT_NOMSG(topo == DXIL::PrimitiveTopology::Undefined ||
+                       topo ==
+                           props.ShaderProps.GS.streamPrimitiveTopologies[i]);
+        topo = props.ShaderProps.GS.streamPrimitiveTopologies[i];
+      }
+    }
+    MDTuple *pMDTuple =
+        EmitDxilGSState(props.ShaderProps.GS.inputPrimitive,
+                        props.ShaderProps.GS.maxVertexCount, activeStreamMask,
+                        topo, props.ShaderProps.GS.instanceCount);
+    MDVals.emplace_back(pMDTuple);
+  } break;
+  // Domain shader.
+  case DXIL::ShaderKind::Domain: {
+    auto &DS = props.ShaderProps.DS;
+    MDVals.emplace_back(Uint32ToConstMD(DxilMDHelper::kDxilDSStateTag));
+    MDTuple *pMDTuple = EmitDxilDSState(DS.domain, DS.inputControlPoints);
+    MDVals.emplace_back(pMDTuple);
+  } break;
+  // Hull shader.
+  case DXIL::ShaderKind::Hull: {
+    auto &HS = props.ShaderProps.HS;
+    MDVals.emplace_back(Uint32ToConstMD(DxilMDHelper::kDxilHSStateTag));
+    MDTuple *pMDTuple = EmitDxilHSState(
+        HS.patchConstantFunc, HS.inputControlPoints, HS.outputControlPoints,
+        HS.domain, HS.partition, HS.outputPrimitive, HS.maxTessFactor);
+    MDVals.emplace_back(pMDTuple);
+  } break;
+  // Raytracing.
+  case DXIL::ShaderKind::AnyHit:
+  case DXIL::ShaderKind::ClosestHit: {
+    MDVals.emplace_back(Uint32ToConstMD(kDxilRayPayloadSizeTag));
+    MDVals.emplace_back(
+        Uint32ToConstMD(props.ShaderProps.Ray.payloadSizeInBytes));
+
+    MDVals.emplace_back(Uint32ToConstMD(kDxilRayAttribSizeTag));
+    MDVals.emplace_back(
+        Uint32ToConstMD(props.ShaderProps.Ray.attributeSizeInBytes));
+  } break;
+  case DXIL::ShaderKind::Miss:
+  case DXIL::ShaderKind::Callable: {
+    MDVals.emplace_back(Uint32ToConstMD(kDxilRayPayloadSizeTag));
+
+    MDVals.emplace_back(
+        Uint32ToConstMD(props.ShaderProps.Ray.payloadSizeInBytes));
+  } break;
+  default:
+    break;
+  }
+
+  if (autoBindingSpace != UINT_MAX && m_pSM->IsSMAtLeast(6, 3)) {
+    MDVals.emplace_back(Uint32ToConstMD(kDxilAutoBindingSpaceTag));
+    MDVals.emplace_back(
+        MDNode::get(m_Ctx, {Uint32ToConstMD(autoBindingSpace)}));
+  }
+
+  if (!MDVals.empty())
+    return MDNode::get(m_Ctx, MDVals);
+  else
+    return nullptr;
+}
+
+void DxilMDHelper::LoadDxilEntryProperties(const MDOperand &MDO,
+                                            uint64_t &rawShaderFlag,
+                                            DxilFunctionProps &props,
+                                            uint32_t &autoBindingSpace) {
+  if (MDO.get() == nullptr)
+    return;
+
+  const MDTuple *pTupleMD = dyn_cast<MDTuple>(MDO.get());
+  IFTBOOL(pTupleMD != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
+  IFTBOOL((pTupleMD->getNumOperands() & 0x1) == 0,
+          DXC_E_INCORRECT_DXIL_METADATA);
+  bool bEarlyDepth = false;
+
+  if (!m_pSM->IsLib()) {
+    props.shaderKind = m_pSM->GetKind();
+  } else {
+    props.shaderKind = DXIL::ShaderKind::Library;
+  }
+
+  for (unsigned iNode = 0; iNode < pTupleMD->getNumOperands(); iNode += 2) {
+    unsigned Tag = DxilMDHelper::ConstMDToUint32(pTupleMD->getOperand(iNode));
+    const MDOperand &MDO = pTupleMD->getOperand(iNode + 1);
+    IFTBOOL(MDO.get() != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
+
+    switch (Tag) {
+    case DxilMDHelper::kDxilShaderFlagsTag: {
+      rawShaderFlag = ConstMDToUint64(MDO);
+      ShaderFlags flags;
+      flags.SetShaderFlagsRaw(rawShaderFlag);
+      bEarlyDepth = flags.GetForceEarlyDepthStencil();
+    } break;
+
+    case DxilMDHelper::kDxilNumThreadsTag: {
+      DXASSERT(props.IsCS(), "else invalid shader kind");
+      auto &CS = props.ShaderProps.CS;
+      MDNode *pNode = cast<MDNode>(MDO.get());
+      CS.numThreads[0] = ConstMDToUint32(pNode->getOperand(0));
+      CS.numThreads[1] = ConstMDToUint32(pNode->getOperand(1));
+      CS.numThreads[2] = ConstMDToUint32(pNode->getOperand(2));
+    } break;
+
+    case DxilMDHelper::kDxilGSStateTag: {
+      DXASSERT(props.IsGS(), "else invalid shader kind");
+      auto &GS = props.ShaderProps.GS;
+      DXIL::PrimitiveTopology topo = DXIL::PrimitiveTopology::Undefined;
+      unsigned activeStreamMask;
+      LoadDxilGSState(MDO, GS.inputPrimitive, GS.maxVertexCount,
+                      activeStreamMask, topo, GS.instanceCount);
+      if (topo != DXIL::PrimitiveTopology::Undefined) {
+        for (size_t i = 0; i < _countof(GS.streamPrimitiveTopologies); ++i) {
+          unsigned mask = 1 << i;
+          if (activeStreamMask & mask) {
+            GS.streamPrimitiveTopologies[i] = topo;
+          } else {
+            GS.streamPrimitiveTopologies[i] =
+                DXIL::PrimitiveTopology::Undefined;
+          }
+        }
+      }
+    } break;
+
+    case DxilMDHelper::kDxilDSStateTag: {
+      DXASSERT(props.IsDS(), "else invalid shader kind");
+      auto &DS = props.ShaderProps.DS;
+      LoadDxilDSState(MDO, DS.domain, DS.inputControlPoints);
+    } break;
+
+    case DxilMDHelper::kDxilHSStateTag: {
+      DXASSERT(props.IsHS(), "else invalid shader kind");
+      auto &HS = props.ShaderProps.HS;
+      LoadDxilHSState(MDO, HS.patchConstantFunc, HS.inputControlPoints,
+                      HS.outputControlPoints, HS.domain, HS.partition,
+                      HS.outputPrimitive, HS.maxTessFactor);
+    } break;
+
+    case DxilMDHelper::kDxilAutoBindingSpaceTag: {
+      MDNode *pNode = cast<MDNode>(MDO.get());
+      autoBindingSpace = ConstMDToUint32(pNode->getOperand(0));
+      break;
+    }
+    case DxilMDHelper::kDxilRayPayloadSizeTag: {
+      DXASSERT(props.IsAnyHit() || props.IsClosestHit() || props.IsMiss() ||
+                   props.IsCallable(),
+               "else invalid shader kind");
+      props.ShaderProps.Ray.payloadSizeInBytes =
+          ConstMDToUint32(MDO);
+    } break;
+    case DxilMDHelper::kDxilRayAttribSizeTag: {
+      DXASSERT(props.IsAnyHit() || props.IsClosestHit(),
+               "else invalid shader kind");
+      props.ShaderProps.Ray.attributeSizeInBytes =
+          ConstMDToUint32(MDO);
+    } break;
+    case DxilMDHelper::kDxilShaderKindTag: {
+      DXIL::ShaderKind kind =
+          static_cast<DXIL::ShaderKind>(ConstMDToUint32(MDO));
+      DXASSERT(props.shaderKind == DXIL::ShaderKind::Library,
+               "else invalid shader kind");
+      props.shaderKind = kind;
+    } break;
+    default:
+      DXASSERT(false, "Unknown extended shader properties tag");
+      break;
+    }
+  }
+
+  if (bEarlyDepth) {
+    DXASSERT(props.IsPS(), "else invalid shader kind");
+    props.ShaderProps.PS.EarlyDepthStencil = true;
+  }
+}
+
 MDTuple *
 DxilMDHelper::EmitDxilFunctionProps(const hlsl::DxilFunctionProps *props,
-                                    Function *F) {
+                                   const Function *F) {
+  bool bRayAttributes = false;
   Metadata *MDVals[30];
   std::fill(MDVals, MDVals + _countof(MDVals), nullptr);
   unsigned valIdx = 0;
-  MDVals[valIdx++] = ValueAsMetadata::get(F);
+  MDVals[valIdx++] = ValueAsMetadata::get(const_cast<Function*>(F));
   MDVals[valIdx++] = Uint32ToConstMD(static_cast<unsigned>(props->shaderKind));
   switch (props->shaderKind) {
   case DXIL::ShaderKind::Compute:
@@ -1067,6 +1257,16 @@ DxilMDHelper::EmitDxilFunctionProps(const hlsl::DxilFunctionProps *props,
   case DXIL::ShaderKind::Pixel:
     MDVals[valIdx++] = BoolToConstMD(props->ShaderProps.PS.EarlyDepthStencil);
     break;
+  case DXIL::ShaderKind::AnyHit:
+  case DXIL::ShaderKind::ClosestHit:
+    bRayAttributes = true;
+  case DXIL::ShaderKind::Miss:
+  case DXIL::ShaderKind::Callable:
+    // payload/params unioned and first:
+    MDVals[valIdx++] = Uint32ToConstMD(props->ShaderProps.Ray.payloadSizeInBytes);
+    if (bRayAttributes)
+      MDVals[valIdx++] = Uint32ToConstMD(props->ShaderProps.Ray.attributeSizeInBytes);
+    break;
   default:
     break;
   }
@@ -1519,7 +1719,7 @@ void DxilExtraPropertyHelper::LoadSignatureElementProperties(const MDOperand &MD
 //
 // Utilities.
 //
-bool DxilMDHelper::IsKnownNamedMetaData(llvm::NamedMDNode &Node) {
+bool DxilMDHelper::IsKnownNamedMetaData(const llvm::NamedMDNode &Node) {
   StringRef name = Node.getName();
   for (unsigned i = 0; i < DxilMDNames.size(); i++) {
     if (name == DxilMDNames[i]) {
@@ -1529,6 +1729,14 @@ bool DxilMDHelper::IsKnownNamedMetaData(llvm::NamedMDNode &Node) {
   return false;
 }
 
+void DxilMDHelper::combineDxilMetadata(llvm::Instruction *K,
+                                       const llvm::Instruction *J) {
+  if (IsMarkedNonUniform(J))
+    MarkNonUniform(K);
+  if (IsMarkedPrecise(J))
+    MarkPrecise(K);
+}
+
 ConstantAsMetadata *DxilMDHelper::Int32ToConstMD(int32_t v, LLVMContext &Ctx) {
   return ConstantAsMetadata::get(Constant::getIntegerValue(IntegerType::get(Ctx, 32), APInt(32, v)));
 }
@@ -1663,4 +1871,22 @@ void DxilMDHelper::MarkPrecise(Instruction *I) {
   I->setMetadata(DxilMDHelper::kDxilPreciseAttributeMDName, preciseNode);
 }
 
+bool DxilMDHelper::IsMarkedNonUniform(const Instruction *inst) {
+  int32_t val = 0;
+  if (MDNode *precise = inst->getMetadata(kDxilNonUniformAttributeMDName)) {
+    assert(precise->getNumOperands() == 1);
+    val = ConstMDToInt32(precise->getOperand(0));
+  }
+  return val;
+}
+
+void DxilMDHelper::MarkNonUniform(Instruction *I) {
+  LLVMContext &Ctx = I->getContext();
+  MDNode *preciseNode = MDNode::get(
+    Ctx,
+    { ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1)) });
+
+  I->setMetadata(DxilMDHelper::kDxilNonUniformAttributeMDName, preciseNode);
+}
+
 } // namespace hlsl

File diff suppressed because it is too large
+ 389 - 460
lib/HLSL/DxilModule.cpp


+ 525 - 242
lib/HLSL/DxilOperations.cpp

@@ -41,234 +41,281 @@ import hctdb_instrhelp
 /* <py::lines('OPCODE-OLOADS')>hctdb_instrhelp.get_oloads_props()</py>*/
 // OPCODE-OLOADS:BEGIN
 const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
-//   OpCode                       OpCode name,                OpCodeClass                    OpCodeClass name,              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  // Temporary, indexable, input, output registers                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::TempRegLoad,             "TempRegLoad",              OCC::TempRegLoad,              "tempRegLoad",               {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadOnly, },
-  {  OC::TempRegStore,            "TempRegStore",             OCC::TempRegStore,             "tempRegStore",              {false,  true,  true, false, false, false,  true,  true, false}, Attribute::None,     },
-  {  OC::MinPrecXRegLoad,         "MinPrecXRegLoad",          OCC::MinPrecXRegLoad,          "minPrecXRegLoad",           {false,  true, false, false, false, false,  true, false, false}, Attribute::ReadOnly, },
-  {  OC::MinPrecXRegStore,        "MinPrecXRegStore",         OCC::MinPrecXRegStore,         "minPrecXRegStore",          {false,  true, false, false, false, false,  true, false, false}, Attribute::None,     },
-  {  OC::LoadInput,               "LoadInput",                OCC::LoadInput,                "loadInput",                 {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadNone, },
-  {  OC::StoreOutput,             "StoreOutput",              OCC::StoreOutput,              "storeOutput",               {false,  true,  true, false, false, false,  true,  true, false}, Attribute::None,     },
-
-  // Unary float                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::FAbs,                    "FAbs",                     OCC::Unary,                    "unary",                     {false,  true,  true,  true, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Saturate,                "Saturate",                 OCC::Unary,                    "unary",                     {false,  true,  true,  true, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::IsNaN,                   "IsNaN",                    OCC::IsSpecialFloat,           "isSpecialFloat",            {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::IsInf,                   "IsInf",                    OCC::IsSpecialFloat,           "isSpecialFloat",            {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::IsFinite,                "IsFinite",                 OCC::IsSpecialFloat,           "isSpecialFloat",            {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::IsNormal,                "IsNormal",                 OCC::IsSpecialFloat,           "isSpecialFloat",            {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Cos,                     "Cos",                      OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Sin,                     "Sin",                      OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Tan,                     "Tan",                      OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Acos,                    "Acos",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Asin,                    "Asin",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Atan,                    "Atan",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Hcos,                    "Hcos",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Hsin,                    "Hsin",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Htan,                    "Htan",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Exp,                     "Exp",                      OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Frc,                     "Frc",                      OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Log,                     "Log",                      OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Sqrt,                    "Sqrt",                     OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Rsqrt,                   "Rsqrt",                    OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Unary float - rounding                                                                                                 void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Round_ne,                "Round_ne",                 OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Round_ni,                "Round_ni",                 OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Round_pi,                "Round_pi",                 OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Round_z,                 "Round_z",                  OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Unary int                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Bfrev,                   "Bfrev",                    OCC::Unary,                    "unary",                     {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-  {  OC::Countbits,               "Countbits",                OCC::UnaryBits,                "unaryBits",                 {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-  {  OC::FirstbitLo,              "FirstbitLo",               OCC::UnaryBits,                "unaryBits",                 {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Unary uint                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::FirstbitHi,              "FirstbitHi",               OCC::UnaryBits,                "unaryBits",                 {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Unary int                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::FirstbitSHi,             "FirstbitSHi",              OCC::UnaryBits,                "unaryBits",                 {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Binary float                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::FMax,                    "FMax",                     OCC::Binary,                   "binary",                    {false,  true,  true,  true, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::FMin,                    "FMin",                     OCC::Binary,                   "binary",                    {false,  true,  true,  true, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Binary int                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::IMax,                    "IMax",                     OCC::Binary,                   "binary",                    {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-  {  OC::IMin,                    "IMin",                     OCC::Binary,                   "binary",                    {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Binary uint                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::UMax,                    "UMax",                     OCC::Binary,                   "binary",                    {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-  {  OC::UMin,                    "UMin",                     OCC::Binary,                   "binary",                    {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Binary int with two outputs                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::IMul,                    "IMul",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",         {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Binary uint with two outputs                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::UMul,                    "UMul",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",         {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::UDiv,                    "UDiv",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",         {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Binary uint with carry or borrow                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::UAddc,                   "UAddc",                    OCC::BinaryWithCarryOrBorrow,  "binaryWithCarryOrBorrow",   {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::USubb,                   "USubb",                    OCC::BinaryWithCarryOrBorrow,  "binaryWithCarryOrBorrow",   {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Tertiary float                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::FMad,                    "FMad",                     OCC::Tertiary,                 "tertiary",                  {false,  true,  true,  true, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Fma,                     "Fma",                      OCC::Tertiary,                 "tertiary",                  {false, false, false,  true, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Tertiary int                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::IMad,                    "IMad",                     OCC::Tertiary,                 "tertiary",                  {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Tertiary uint                                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::UMad,                    "UMad",                     OCC::Tertiary,                 "tertiary",                  {false, false, false, false, false, false,  true,  true,  true}, Attribute::ReadNone, },
-
-  // Tertiary int                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Msad,                    "Msad",                     OCC::Tertiary,                 "tertiary",                  {false, false, false, false, false, false, false,  true,  true}, Attribute::ReadNone, },
-  {  OC::Ibfe,                    "Ibfe",                     OCC::Tertiary,                 "tertiary",                  {false, false, false, false, false, false, false,  true,  true}, Attribute::ReadNone, },
-
-  // Tertiary uint                                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Ubfe,                    "Ubfe",                     OCC::Tertiary,                 "tertiary",                  {false, false, false, false, false, false, false,  true,  true}, Attribute::ReadNone, },
-
-  // Quaternary                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Bfi,                     "Bfi",                      OCC::Quaternary,               "quaternary",                {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Dot                                                                                                                    void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Dot2,                    "Dot2",                     OCC::Dot2,                     "dot2",                      {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Dot3,                    "Dot3",                     OCC::Dot3,                     "dot3",                      {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::Dot4,                    "Dot4",                     OCC::Dot4,                     "dot4",                      {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Resources                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::CreateHandle,            "CreateHandle",             OCC::CreateHandle,             "createHandle",              { true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::CBufferLoad,             "CBufferLoad",              OCC::CBufferLoad,              "cbufferLoad",               {false,  true,  true,  true, false,  true,  true,  true,  true}, Attribute::ReadOnly, },
-  {  OC::CBufferLoadLegacy,       "CBufferLoadLegacy",        OCC::CBufferLoadLegacy,        "cbufferLoadLegacy",         {false,  true,  true,  true, false, false,  true,  true,  true}, Attribute::ReadOnly, },
-
-  // Resources - sample                                                                                                     void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Sample,                  "Sample",                   OCC::Sample,                   "sample",                    {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::SampleBias,              "SampleBias",               OCC::SampleBias,               "sampleBias",                {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::SampleLevel,             "SampleLevel",              OCC::SampleLevel,              "sampleLevel",               {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::SampleGrad,              "SampleGrad",               OCC::SampleGrad,               "sampleGrad",                {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::SampleCmp,               "SampleCmp",                OCC::SampleCmp,                "sampleCmp",                 {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::SampleCmpLevelZero,      "SampleCmpLevelZero",       OCC::SampleCmpLevelZero,       "sampleCmpLevelZero",        {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-
-  // Resources                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::TextureLoad,             "TextureLoad",              OCC::TextureLoad,              "textureLoad",               {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadOnly, },
-  {  OC::TextureStore,            "TextureStore",             OCC::TextureStore,             "textureStore",              {false,  true,  true, false, false, false,  true,  true, false}, Attribute::None,     },
-  {  OC::BufferLoad,              "BufferLoad",               OCC::BufferLoad,               "bufferLoad",                {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadOnly, },
-  {  OC::BufferStore,             "BufferStore",              OCC::BufferStore,              "bufferStore",               {false,  true,  true, false, false, false,  true,  true, false}, Attribute::None,     },
-  {  OC::BufferUpdateCounter,     "BufferUpdateCounter",      OCC::BufferUpdateCounter,      "bufferUpdateCounter",       { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::CheckAccessFullyMapped,  "CheckAccessFullyMapped",   OCC::CheckAccessFullyMapped,   "checkAccessFullyMapped",    {false, false, false, false, false, false, false,  true, false}, Attribute::ReadOnly, },
-  {  OC::GetDimensions,           "GetDimensions",            OCC::GetDimensions,            "getDimensions",             { true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
-
-  // Resources - gather                                                                                                     void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::TextureGather,           "TextureGather",            OCC::TextureGather,            "textureGather",             {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadOnly, },
-  {  OC::TextureGatherCmp,        "TextureGatherCmp",         OCC::TextureGatherCmp,         "textureGatherCmp",          {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadOnly, },
-
-  // Resources - sample                                                                                                     void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Texture2DMSGetSamplePosition, "Texture2DMSGetSamplePosition", OCC::Texture2DMSGetSamplePosition, "texture2DMSGetSamplePosition",   {true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::RenderTargetGetSamplePosition, "RenderTargetGetSamplePosition", OCC::RenderTargetGetSamplePosition, "renderTargetGetSamplePosition",   {true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::RenderTargetGetSampleCount, "RenderTargetGetSampleCount", OCC::RenderTargetGetSampleCount, "renderTargetGetSampleCount",   {true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
-
-  // Synchronization                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::AtomicBinOp,             "AtomicBinOp",              OCC::AtomicBinOp,              "atomicBinOp",               {false, false, false, false, false, false, false,  true, false}, Attribute::None,     },
-  {  OC::AtomicCompareExchange,   "AtomicCompareExchange",    OCC::AtomicCompareExchange,    "atomicCompareExchange",     {false, false, false, false, false, false, false,  true, false}, Attribute::None,     },
-  {  OC::Barrier,                 "Barrier",                  OCC::Barrier,                  "barrier",                   { true, false, false, false, false, false, false, false, false}, Attribute::NoDuplicate, },
-
-  // Pixel shader                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::CalculateLOD,            "CalculateLOD",             OCC::CalculateLOD,             "calculateLOD",              {false, false,  true, false, false, false, false, false, false}, Attribute::ReadOnly, },
-  {  OC::Discard,                 "Discard",                  OCC::Discard,                  "discard",                   { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::DerivCoarseX,            "DerivCoarseX",             OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::DerivCoarseY,            "DerivCoarseY",             OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::DerivFineX,              "DerivFineX",               OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::DerivFineY,              "DerivFineY",               OCC::Unary,                    "unary",                     {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::EvalSnapped,             "EvalSnapped",              OCC::EvalSnapped,              "evalSnapped",               {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::EvalSampleIndex,         "EvalSampleIndex",          OCC::EvalSampleIndex,          "evalSampleIndex",           {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::EvalCentroid,            "EvalCentroid",             OCC::EvalCentroid,             "evalCentroid",              {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::SampleIndex,             "SampleIndex",              OCC::SampleIndex,              "sampleIndex",               {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::Coverage,                "Coverage",                 OCC::Coverage,                 "coverage",                  {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::InnerCoverage,           "InnerCoverage",            OCC::InnerCoverage,            "innerCoverage",             {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Compute shader                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::ThreadId,                "ThreadId",                 OCC::ThreadId,                 "threadId",                  {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::GroupId,                 "GroupId",                  OCC::GroupId,                  "groupId",                   {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::ThreadIdInGroup,         "ThreadIdInGroup",          OCC::ThreadIdInGroup,          "threadIdInGroup",           {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::FlattenedThreadIdInGroup, "FlattenedThreadIdInGroup", OCC::FlattenedThreadIdInGroup, "flattenedThreadIdInGroup", {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Geometry shader                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::EmitStream,              "EmitStream",               OCC::EmitStream,               "emitStream",                { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::CutStream,               "CutStream",                OCC::CutStream,                "cutStream",                 { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::EmitThenCutStream,       "EmitThenCutStream",        OCC::EmitThenCutStream,        "emitThenCutStream",         { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::GSInstanceID,            "GSInstanceID",             OCC::GSInstanceID,             "gsInstanceID",              {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Double precision                                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::MakeDouble,              "MakeDouble",               OCC::MakeDouble,               "makeDouble",                {false, false, false,  true, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::SplitDouble,             "SplitDouble",              OCC::SplitDouble,              "splitDouble",               {false, false, false,  true, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Domain and hull shader                                                                                                 void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::LoadOutputControlPoint,  "LoadOutputControlPoint",   OCC::LoadOutputControlPoint,   "loadOutputControlPoint",    {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadNone, },
-  {  OC::LoadPatchConstant,       "LoadPatchConstant",        OCC::LoadPatchConstant,        "loadPatchConstant",         {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadNone, },
-
-  // Domain shader                                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::DomainLocation,          "DomainLocation",           OCC::DomainLocation,           "domainLocation",            {false, false,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Hull shader                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::StorePatchConstant,      "StorePatchConstant",       OCC::StorePatchConstant,       "storePatchConstant",        {false,  true,  true, false, false, false,  true,  true, false}, Attribute::None,     },
-  {  OC::OutputControlPointID,    "OutputControlPointID",     OCC::OutputControlPointID,     "outputControlPointID",      {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-  {  OC::PrimitiveID,             "PrimitiveID",              OCC::PrimitiveID,              "primitiveID",               {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Other                                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::CycleCounterLegacy,      "CycleCounterLegacy",       OCC::CycleCounterLegacy,       "cycleCounterLegacy",        { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-
-  // Wave                                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::WaveIsFirstLane,         "WaveIsFirstLane",          OCC::WaveIsFirstLane,          "waveIsFirstLane",           { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::WaveGetLaneIndex,        "WaveGetLaneIndex",         OCC::WaveGetLaneIndex,         "waveGetLaneIndex",          { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::WaveGetLaneCount,        "WaveGetLaneCount",         OCC::WaveGetLaneCount,         "waveGetLaneCount",          { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::WaveAnyTrue,             "WaveAnyTrue",              OCC::WaveAnyTrue,              "waveAnyTrue",               { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::WaveAllTrue,             "WaveAllTrue",              OCC::WaveAllTrue,              "waveAllTrue",               { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::WaveActiveAllEqual,      "WaveActiveAllEqual",       OCC::WaveActiveAllEqual,       "waveActiveAllEqual",        {false,  true,  true,  true,  true,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::WaveActiveBallot,        "WaveActiveBallot",         OCC::WaveActiveBallot,         "waveActiveBallot",          { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::WaveReadLaneAt,          "WaveReadLaneAt",           OCC::WaveReadLaneAt,           "waveReadLaneAt",            {false,  true,  true,  true,  true,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::WaveReadLaneFirst,       "WaveReadLaneFirst",        OCC::WaveReadLaneFirst,        "waveReadLaneFirst",         {false,  true,  true, false,  true,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::WaveActiveOp,            "WaveActiveOp",             OCC::WaveActiveOp,             "waveActiveOp",              {false,  true,  true,  true,  true,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::WaveActiveBit,           "WaveActiveBit",            OCC::WaveActiveBit,            "waveActiveBit",             {false, false, false, false, false,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::WavePrefixOp,            "WavePrefixOp",             OCC::WavePrefixOp,             "wavePrefixOp",              {false,  true,  true,  true, false,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::QuadReadLaneAt,          "QuadReadLaneAt",           OCC::QuadReadLaneAt,           "quadReadLaneAt",            {false,  true,  true,  true,  true,  true,  true,  true,  true}, Attribute::None,     },
-  {  OC::QuadOp,                  "QuadOp",                   OCC::QuadOp,                   "quadOp",                    {false,  true,  true,  true, false,  true,  true,  true,  true}, Attribute::None,     },
-
-  // Bitcasts with different sizes                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::BitcastI16toF16,         "BitcastI16toF16",          OCC::BitcastI16toF16,          "bitcastI16toF16",           { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::BitcastF16toI16,         "BitcastF16toI16",          OCC::BitcastF16toI16,          "bitcastF16toI16",           { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::BitcastI32toF32,         "BitcastI32toF32",          OCC::BitcastI32toF32,          "bitcastI32toF32",           { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::BitcastF32toI32,         "BitcastF32toI32",          OCC::BitcastF32toI32,          "bitcastF32toI32",           { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::BitcastI64toF64,         "BitcastI64toF64",          OCC::BitcastI64toF64,          "bitcastI64toF64",           { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::BitcastF64toI64,         "BitcastF64toI64",          OCC::BitcastF64toI64,          "bitcastF64toI64",           { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Legacy floating-point                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::LegacyF32ToF16,          "LegacyF32ToF16",           OCC::LegacyF32ToF16,           "legacyF32ToF16",            { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::LegacyF16ToF32,          "LegacyF16ToF32",           OCC::LegacyF16ToF32,           "legacyF16ToF32",            { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Double precision                                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::LegacyDoubleToFloat,     "LegacyDoubleToFloat",      OCC::LegacyDoubleToFloat,      "legacyDoubleToFloat",       { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::LegacyDoubleToSInt32,    "LegacyDoubleToSInt32",     OCC::LegacyDoubleToSInt32,     "legacyDoubleToSInt32",      { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-  {  OC::LegacyDoubleToUInt32,    "LegacyDoubleToUInt32",     OCC::LegacyDoubleToUInt32,     "legacyDoubleToUInt32",      { true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Wave                                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::WaveAllBitCount,         "WaveAllBitCount",          OCC::WaveAllOp,                "waveAllOp",                 { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-  {  OC::WavePrefixBitCount,      "WavePrefixBitCount",       OCC::WavePrefixOp,             "wavePrefixOp",              { true, false, false, false, false, false, false, false, false}, Attribute::None,     },
-
-  // Pixel shader                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::AttributeAtVertex,       "AttributeAtVertex",        OCC::AttributeAtVertex,        "attributeAtVertex",         {false,  true,  true, false, false, false, false, false, false}, Attribute::ReadNone, },
-
-  // Graphics shader                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::ViewID,                  "ViewID",                   OCC::ViewID,                   "viewID",                    {false, false, false, false, false, false, false,  true, false}, Attribute::ReadNone, },
-
-  // Resources                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::RawBufferLoad,           "RawBufferLoad",            OCC::RawBufferLoad,            "rawBufferLoad",             {false,  true,  true, false, false, false,  true,  true, false}, Attribute::ReadOnly, },
-  {  OC::RawBufferStore,          "RawBufferStore",           OCC::RawBufferStore,           "rawBufferStore",            {false,  true,  true, false, false, false,  true,  true, false}, Attribute::None,     },
+//   OpCode                       OpCode name,                OpCodeClass                    OpCodeClass name,              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj,  function attribute
+  // Temporary, indexable, input, output registers                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::TempRegLoad,             "TempRegLoad",              OCC::TempRegLoad,              "tempRegLoad",               { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadOnly, },
+  {  OC::TempRegStore,            "TempRegStore",             OCC::TempRegStore,             "tempRegStore",              { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::None,     },
+  {  OC::MinPrecXRegLoad,         "MinPrecXRegLoad",          OCC::MinPrecXRegLoad,          "minPrecXRegLoad",           { false,  true, false, false, false, false,  true, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::MinPrecXRegStore,        "MinPrecXRegStore",         OCC::MinPrecXRegStore,         "minPrecXRegStore",          { false,  true, false, false, false, false,  true, false, false, false, false}, Attribute::None,     },
+  {  OC::LoadInput,               "LoadInput",                OCC::LoadInput,                "loadInput",                 { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::StoreOutput,             "StoreOutput",              OCC::StoreOutput,              "storeOutput",               { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::None,     },
+
+  // Unary float                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::FAbs,                    "FAbs",                     OCC::Unary,                    "unary",                     { false,  true,  true,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Saturate,                "Saturate",                 OCC::Unary,                    "unary",                     { false,  true,  true,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::IsNaN,                   "IsNaN",                    OCC::IsSpecialFloat,           "isSpecialFloat",            { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::IsInf,                   "IsInf",                    OCC::IsSpecialFloat,           "isSpecialFloat",            { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::IsFinite,                "IsFinite",                 OCC::IsSpecialFloat,           "isSpecialFloat",            { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::IsNormal,                "IsNormal",                 OCC::IsSpecialFloat,           "isSpecialFloat",            { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Cos,                     "Cos",                      OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Sin,                     "Sin",                      OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Tan,                     "Tan",                      OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Acos,                    "Acos",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Asin,                    "Asin",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Atan,                    "Atan",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Hcos,                    "Hcos",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Hsin,                    "Hsin",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Htan,                    "Htan",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Exp,                     "Exp",                      OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Frc,                     "Frc",                      OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Log,                     "Log",                      OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Sqrt,                    "Sqrt",                     OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Rsqrt,                   "Rsqrt",                    OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Unary float - rounding                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Round_ne,                "Round_ne",                 OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Round_ni,                "Round_ni",                 OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Round_pi,                "Round_pi",                 OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Round_z,                 "Round_z",                  OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Unary int                                                                                                               void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Bfrev,                   "Bfrev",                    OCC::Unary,                    "unary",                     { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+  {  OC::Countbits,               "Countbits",                OCC::UnaryBits,                "unaryBits",                 { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+  {  OC::FirstbitLo,              "FirstbitLo",               OCC::UnaryBits,                "unaryBits",                 { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Unary uint                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::FirstbitHi,              "FirstbitHi",               OCC::UnaryBits,                "unaryBits",                 { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Unary int                                                                                                               void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::FirstbitSHi,             "FirstbitSHi",              OCC::UnaryBits,                "unaryBits",                 { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Binary float                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::FMax,                    "FMax",                     OCC::Binary,                   "binary",                    { false,  true,  true,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::FMin,                    "FMin",                     OCC::Binary,                   "binary",                    { false,  true,  true,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Binary int                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::IMax,                    "IMax",                     OCC::Binary,                   "binary",                    { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+  {  OC::IMin,                    "IMin",                     OCC::Binary,                   "binary",                    { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Binary uint                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::UMax,                    "UMax",                     OCC::Binary,                   "binary",                    { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+  {  OC::UMin,                    "UMin",                     OCC::Binary,                   "binary",                    { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Binary int with two outputs                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::IMul,                    "IMul",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",         { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Binary uint with two outputs                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::UMul,                    "UMul",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",         { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::UDiv,                    "UDiv",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",         { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Binary uint with carry or borrow                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::UAddc,                   "UAddc",                    OCC::BinaryWithCarryOrBorrow,  "binaryWithCarryOrBorrow",   { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::USubb,                   "USubb",                    OCC::BinaryWithCarryOrBorrow,  "binaryWithCarryOrBorrow",   { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Tertiary float                                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::FMad,                    "FMad",                     OCC::Tertiary,                 "tertiary",                  { false,  true,  true,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Fma,                     "Fma",                      OCC::Tertiary,                 "tertiary",                  { false, false, false,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Tertiary int                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::IMad,                    "IMad",                     OCC::Tertiary,                 "tertiary",                  { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Tertiary uint                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::UMad,                    "UMad",                     OCC::Tertiary,                 "tertiary",                  { false, false, false, false, false, false,  true,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Tertiary int                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Msad,                    "Msad",                     OCC::Tertiary,                 "tertiary",                  { false, false, false, false, false, false, false,  true,  true, false, false}, Attribute::ReadNone, },
+  {  OC::Ibfe,                    "Ibfe",                     OCC::Tertiary,                 "tertiary",                  { false, false, false, false, false, false, false,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Tertiary uint                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Ubfe,                    "Ubfe",                     OCC::Tertiary,                 "tertiary",                  { false, false, false, false, false, false, false,  true,  true, false, false}, Attribute::ReadNone, },
+
+  // Quaternary                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Bfi,                     "Bfi",                      OCC::Quaternary,               "quaternary",                { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Dot                                                                                                                     void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Dot2,                    "Dot2",                     OCC::Dot2,                     "dot2",                      { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Dot3,                    "Dot3",                     OCC::Dot3,                     "dot3",                      { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::Dot4,                    "Dot4",                     OCC::Dot4,                     "dot4",                      { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Resources                                                                                                               void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::CreateHandle,            "CreateHandle",             OCC::CreateHandle,             "createHandle",              {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::CBufferLoad,             "CBufferLoad",              OCC::CBufferLoad,              "cbufferLoad",               { false,  true,  true,  true, false,  true,  true,  true,  true, false, false}, Attribute::ReadOnly, },
+  {  OC::CBufferLoadLegacy,       "CBufferLoadLegacy",        OCC::CBufferLoadLegacy,        "cbufferLoadLegacy",         { false,  true,  true,  true, false, false,  true,  true,  true, false, false}, Attribute::ReadOnly, },
+
+  // Resources - sample                                                                                                      void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Sample,                  "Sample",                   OCC::Sample,                   "sample",                    { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::SampleBias,              "SampleBias",               OCC::SampleBias,               "sampleBias",                { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::SampleLevel,             "SampleLevel",              OCC::SampleLevel,              "sampleLevel",               { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::SampleGrad,              "SampleGrad",               OCC::SampleGrad,               "sampleGrad",                { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::SampleCmp,               "SampleCmp",                OCC::SampleCmp,                "sampleCmp",                 { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::SampleCmpLevelZero,      "SampleCmpLevelZero",       OCC::SampleCmpLevelZero,       "sampleCmpLevelZero",        { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+
+  // Resources                                                                                                               void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::TextureLoad,             "TextureLoad",              OCC::TextureLoad,              "textureLoad",               { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadOnly, },
+  {  OC::TextureStore,            "TextureStore",             OCC::TextureStore,             "textureStore",              { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::None,     },
+  {  OC::BufferLoad,              "BufferLoad",               OCC::BufferLoad,               "bufferLoad",                { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadOnly, },
+  {  OC::BufferStore,             "BufferStore",              OCC::BufferStore,              "bufferStore",               { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::None,     },
+  {  OC::BufferUpdateCounter,     "BufferUpdateCounter",      OCC::BufferUpdateCounter,      "bufferUpdateCounter",       {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::CheckAccessFullyMapped,  "CheckAccessFullyMapped",   OCC::CheckAccessFullyMapped,   "checkAccessFullyMapped",    { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadOnly, },
+  {  OC::GetDimensions,           "GetDimensions",            OCC::GetDimensions,            "getDimensions",             {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+
+  // Resources - gather                                                                                                      void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::TextureGather,           "TextureGather",            OCC::TextureGather,            "textureGather",             { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadOnly, },
+  {  OC::TextureGatherCmp,        "TextureGatherCmp",         OCC::TextureGatherCmp,         "textureGatherCmp",          { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadOnly, },
+
+  // Resources - sample                                                                                                      void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Texture2DMSGetSamplePosition, "Texture2DMSGetSamplePosition", OCC::Texture2DMSGetSamplePosition, "texture2DMSGetSamplePosition", {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::RenderTargetGetSamplePosition, "RenderTargetGetSamplePosition", OCC::RenderTargetGetSamplePosition, "renderTargetGetSamplePosition", {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::RenderTargetGetSampleCount, "RenderTargetGetSampleCount", OCC::RenderTargetGetSampleCount, "renderTargetGetSampleCount", {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+
+  // Synchronization                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::AtomicBinOp,             "AtomicBinOp",              OCC::AtomicBinOp,              "atomicBinOp",               { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::None,     },
+  {  OC::AtomicCompareExchange,   "AtomicCompareExchange",    OCC::AtomicCompareExchange,    "atomicCompareExchange",     { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::None,     },
+  {  OC::Barrier,                 "Barrier",                  OCC::Barrier,                  "barrier",                   {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::NoDuplicate, },
+
+  // Pixel shader                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::CalculateLOD,            "CalculateLOD",             OCC::CalculateLOD,             "calculateLOD",              { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+  {  OC::Discard,                 "Discard",                  OCC::Discard,                  "discard",                   {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::DerivCoarseX,            "DerivCoarseX",             OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::DerivCoarseY,            "DerivCoarseY",             OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::DerivFineX,              "DerivFineX",               OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::DerivFineY,              "DerivFineY",               OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::EvalSnapped,             "EvalSnapped",              OCC::EvalSnapped,              "evalSnapped",               { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::EvalSampleIndex,         "EvalSampleIndex",          OCC::EvalSampleIndex,          "evalSampleIndex",           { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::EvalCentroid,            "EvalCentroid",             OCC::EvalCentroid,             "evalCentroid",              { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::SampleIndex,             "SampleIndex",              OCC::SampleIndex,              "sampleIndex",               { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::Coverage,                "Coverage",                 OCC::Coverage,                 "coverage",                  { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::InnerCoverage,           "InnerCoverage",            OCC::InnerCoverage,            "innerCoverage",             { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Compute shader                                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::ThreadId,                "ThreadId",                 OCC::ThreadId,                 "threadId",                  { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::GroupId,                 "GroupId",                  OCC::GroupId,                  "groupId",                   { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::ThreadIdInGroup,         "ThreadIdInGroup",          OCC::ThreadIdInGroup,          "threadIdInGroup",           { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::FlattenedThreadIdInGroup, "FlattenedThreadIdInGroup", OCC::FlattenedThreadIdInGroup, "flattenedThreadIdInGroup",  { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Geometry shader                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::EmitStream,              "EmitStream",               OCC::EmitStream,               "emitStream",                {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::CutStream,               "CutStream",                OCC::CutStream,                "cutStream",                 {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::EmitThenCutStream,       "EmitThenCutStream",        OCC::EmitThenCutStream,        "emitThenCutStream",         {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::GSInstanceID,            "GSInstanceID",             OCC::GSInstanceID,             "gsInstanceID",              { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Double precision                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::MakeDouble,              "MakeDouble",               OCC::MakeDouble,               "makeDouble",                { false, false, false,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::SplitDouble,             "SplitDouble",              OCC::SplitDouble,              "splitDouble",               { false, false, false,  true, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Domain and hull shader                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::LoadOutputControlPoint,  "LoadOutputControlPoint",   OCC::LoadOutputControlPoint,   "loadOutputControlPoint",    { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::LoadPatchConstant,       "LoadPatchConstant",        OCC::LoadPatchConstant,        "loadPatchConstant",         { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Domain shader                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::DomainLocation,          "DomainLocation",           OCC::DomainLocation,           "domainLocation",            { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Hull shader                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::StorePatchConstant,      "StorePatchConstant",       OCC::StorePatchConstant,       "storePatchConstant",        { false,  true,  true, false, false, false,  true,  true, false, false, false}, Attribute::None,     },
+  {  OC::OutputControlPointID,    "OutputControlPointID",     OCC::OutputControlPointID,     "outputControlPointID",      { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Hull, Domain and Geometry shaders                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::PrimitiveID,             "PrimitiveID",              OCC::PrimitiveID,              "primitiveID",               { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Other                                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::CycleCounterLegacy,      "CycleCounterLegacy",       OCC::CycleCounterLegacy,       "cycleCounterLegacy",        {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+
+  // Wave                                                                                                                    void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::WaveIsFirstLane,         "WaveIsFirstLane",          OCC::WaveIsFirstLane,          "waveIsFirstLane",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::WaveGetLaneIndex,        "WaveGetLaneIndex",         OCC::WaveGetLaneIndex,         "waveGetLaneIndex",          {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::WaveGetLaneCount,        "WaveGetLaneCount",         OCC::WaveGetLaneCount,         "waveGetLaneCount",          {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::WaveAnyTrue,             "WaveAnyTrue",              OCC::WaveAnyTrue,              "waveAnyTrue",               {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::WaveAllTrue,             "WaveAllTrue",              OCC::WaveAllTrue,              "waveAllTrue",               {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::WaveActiveAllEqual,      "WaveActiveAllEqual",       OCC::WaveActiveAllEqual,       "waveActiveAllEqual",        { false,  true,  true,  true,  true,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::WaveActiveBallot,        "WaveActiveBallot",         OCC::WaveActiveBallot,         "waveActiveBallot",          {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::WaveReadLaneAt,          "WaveReadLaneAt",           OCC::WaveReadLaneAt,           "waveReadLaneAt",            { false,  true,  true,  true,  true,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::WaveReadLaneFirst,       "WaveReadLaneFirst",        OCC::WaveReadLaneFirst,        "waveReadLaneFirst",         { false,  true,  true, false,  true,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::WaveActiveOp,            "WaveActiveOp",             OCC::WaveActiveOp,             "waveActiveOp",              { false,  true,  true,  true,  true,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::WaveActiveBit,           "WaveActiveBit",            OCC::WaveActiveBit,            "waveActiveBit",             { false, false, false, false, false,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::WavePrefixOp,            "WavePrefixOp",             OCC::WavePrefixOp,             "wavePrefixOp",              { false,  true,  true,  true, false,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::QuadReadLaneAt,          "QuadReadLaneAt",           OCC::QuadReadLaneAt,           "quadReadLaneAt",            { false,  true,  true,  true,  true,  true,  true,  true,  true, false, false}, Attribute::None,     },
+  {  OC::QuadOp,                  "QuadOp",                   OCC::QuadOp,                   "quadOp",                    { false,  true,  true,  true, false,  true,  true,  true,  true, false, false}, Attribute::None,     },
+
+  // Bitcasts with different sizes                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::BitcastI16toF16,         "BitcastI16toF16",          OCC::BitcastI16toF16,          "bitcastI16toF16",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::BitcastF16toI16,         "BitcastF16toI16",          OCC::BitcastF16toI16,          "bitcastF16toI16",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::BitcastI32toF32,         "BitcastI32toF32",          OCC::BitcastI32toF32,          "bitcastI32toF32",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::BitcastF32toI32,         "BitcastF32toI32",          OCC::BitcastF32toI32,          "bitcastF32toI32",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::BitcastI64toF64,         "BitcastI64toF64",          OCC::BitcastI64toF64,          "bitcastI64toF64",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::BitcastF64toI64,         "BitcastF64toI64",          OCC::BitcastF64toI64,          "bitcastF64toI64",           {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Legacy floating-point                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::LegacyF32ToF16,          "LegacyF32ToF16",           OCC::LegacyF32ToF16,           "legacyF32ToF16",            {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::LegacyF16ToF32,          "LegacyF16ToF32",           OCC::LegacyF16ToF32,           "legacyF16ToF32",            {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Double precision                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::LegacyDoubleToFloat,     "LegacyDoubleToFloat",      OCC::LegacyDoubleToFloat,      "legacyDoubleToFloat",       {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::LegacyDoubleToSInt32,    "LegacyDoubleToSInt32",     OCC::LegacyDoubleToSInt32,     "legacyDoubleToSInt32",      {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::LegacyDoubleToUInt32,    "LegacyDoubleToUInt32",     OCC::LegacyDoubleToUInt32,     "legacyDoubleToUInt32",      {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Wave                                                                                                                    void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::WaveAllBitCount,         "WaveAllBitCount",          OCC::WaveAllOp,                "waveAllOp",                 {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+  {  OC::WavePrefixBitCount,      "WavePrefixBitCount",       OCC::WavePrefixOp,             "wavePrefixOp",              {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+
+  // Pixel shader                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::AttributeAtVertex,       "AttributeAtVertex",        OCC::AttributeAtVertex,        "attributeAtVertex",         { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Graphics shader                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::ViewID,                  "ViewID",                   OCC::ViewID,                   "viewID",                    { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Resources                                                                                                               void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::RawBufferLoad,           "RawBufferLoad",            OCC::RawBufferLoad,            "rawBufferLoad",             { false,  true,  true,  true, false, false,  true,  true,  true, false, false}, Attribute::ReadOnly, },
+  {  OC::RawBufferStore,          "RawBufferStore",           OCC::RawBufferStore,           "rawBufferStore",            { false,  true,  true,  true, false, false,  true,  true,  true, false, false}, Attribute::None,     },
+
+  // Raytracing object space uint System Values                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::InstanceID,              "InstanceID",               OCC::InstanceID,               "instanceID",                { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::InstanceIndex,           "InstanceIndex",            OCC::InstanceIndex,            "instanceIndex",             { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Raytracing hit uint System Values                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::HitKind,                 "HitKind",                  OCC::HitKind,                  "hitKind",                   { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Raytracing uint System Values                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::RayFlags,                "RayFlags",                 OCC::RayFlags,                 "rayFlags",                  { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Ray Dispatch Arguments                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::DispatchRaysIndex,       "DispatchRaysIndex",        OCC::DispatchRaysIndex,        "dispatchRaysIndex",         { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+  {  OC::DispatchRaysDimensions,  "DispatchRaysDimensions",   OCC::DispatchRaysDimensions,   "dispatchRaysDimensions",    { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Ray Vectors                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::WorldRayOrigin,          "WorldRayOrigin",           OCC::WorldRayOrigin,           "worldRayOrigin",            { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::WorldRayDirection,       "WorldRayDirection",        OCC::WorldRayDirection,        "worldRayDirection",         { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Ray object space Vectors                                                                                                void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::ObjectRayOrigin,         "ObjectRayOrigin",          OCC::ObjectRayOrigin,          "objectRayOrigin",           { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::ObjectRayDirection,      "ObjectRayDirection",       OCC::ObjectRayDirection,       "objectRayDirection",        { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Ray Transforms                                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::ObjectToWorld,           "ObjectToWorld",            OCC::ObjectToWorld,            "objectToWorld",             { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::WorldToObject,           "WorldToObject",            OCC::WorldToObject,            "worldToObject",             { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // RayT                                                                                                                    void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::RayTMin,                 "RayTMin",                  OCC::RayTMin,                  "rayTMin",                   { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::RayTCurrent,             "RayTCurrent",              OCC::RayTCurrent,              "rayTCurrent",               { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+
+  // AnyHit Terminals                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::IgnoreHit,               "IgnoreHit",                OCC::IgnoreHit,                "ignoreHit",                 {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::NoReturn, },
+  {  OC::AcceptHitAndEndSearch,   "AcceptHitAndEndSearch",    OCC::AcceptHitAndEndSearch,    "acceptHitAndEndSearch",     {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::NoReturn, },
+
+  // Indirect Shader Invocation                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::TraceRay,                "TraceRay",                 OCC::TraceRay,                 "traceRay",                  { false, false, false, false, false, false, false, false, false,  true, false}, Attribute::None,     },
+  {  OC::ReportHit,               "ReportHit",                OCC::ReportHit,                "reportHit",                 { false, false, false, false, false, false, false, false, false,  true, false}, Attribute::None,     },
+  {  OC::CallShader,              "CallShader",               OCC::CallShader,               "callShader",                { false, false, false, false, false, false, false, false, false,  true, false}, Attribute::None,     },
+
+  // Library create handle from resource struct (like HL intrinsic)                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::CreateHandleForLib,      "CreateHandleForLib",       OCC::CreateHandleForLib,       "createHandleForLib",        { false, false, false, false, false, false, false, false, false, false,  true}, Attribute::ReadOnly, },
+
+  // Raytracing object space uint System Values                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::PrimitiveIndex,          "PrimitiveIndex",           OCC::PrimitiveIndex,           "primitiveIndex",            { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadNone, },
 };
 // OPCODE-OLOADS:END
 
 const char *OP::m_OverloadTypeName[kNumTypeOverloads] = {
-  "void", "f16", "f32", "f64", "i1", "i8", "i16", "i32", "i64"
+  "void", "f16", "f32", "f64", "i1", "i8", "i16", "i32", "i64", "udt",  // indexed by type slot; GetOverloadTypeName only accepts slots below kUserDefineTypeSlot
 };
 
 const char *OP::m_NamePrefix = "dx.op.";
@@ -307,6 +354,8 @@ unsigned OP::GetTypeSlot(Type *pType) {
     case 64:              return 8;
     }
   }
+  case Type::PointerTyID: return 9;
+  case Type::StructTyID:  return 10;
   default:
     break;
   }
@@ -314,10 +363,30 @@ unsigned OP::GetTypeSlot(Type *pType) {
 }
 
 const char *OP::GetOverloadTypeName(unsigned TypeSlot) {
-  DXASSERT(TypeSlot < kNumTypeOverloads, "otherwise caller passed OOB index");
+  DXASSERT(TypeSlot < kUserDefineTypeSlot, "otherwise caller passed OOB index");  // slots at or above kUserDefineTypeSlot are rejected here; GetTypeName names those types
   return m_OverloadTypeName[TypeSlot];
 }
 
+llvm::StringRef OP::GetTypeName(Type *Ty, std::string &str) {  // Returns the overload-name suffix for Ty; str is only written when the type has to be printed
+  unsigned TypeSlot = OP::GetTypeSlot(Ty);
+  if (TypeSlot < kUserDefineTypeSlot) {
+    return GetOverloadTypeName(TypeSlot);  // fixed name taken from m_OverloadTypeName
+  } else if (TypeSlot == kUserDefineTypeSlot) {
+    if (Ty->isPointerTy())
+      Ty = Ty->getPointerElementType();  // use the pointee's struct name
+    StructType *ST = cast<StructType>(Ty);  // cast<> requires Ty to be a StructType at this point
+    return ST->getStructName();
+  } else if (TypeSlot == kObjectTypeSlot) {
+    StructType *ST = cast<StructType>(Ty);  // no pointer stripping on this path
+    return ST->getStructName();
+  } else {
+    raw_string_ostream os(str);  // any other slot value: fall back to the printed form of the type
+    Ty->print(os);
+    os.flush();
+    return str;  // the StringRef refers to str's buffer, so the caller must keep str alive while using it
+  }
+}
+
 const char *OP::GetOpCodeName(OpCode opCode) {
   DXASSERT(0 <= (unsigned)opCode && opCode < OpCode::NumOpCodes, "otherwise caller passed OOB index");
   return m_OpCodeProps[(unsigned)opCode].pOpCodeName;
@@ -440,6 +509,141 @@ bool OP::IsDxilOpGradient(OpCode C) {
   // OPCODE-GRADIENT:END
 }
 
+void OP::GetMinShaderModelAndMask(OpCode C, bool bWithTranslation,  // bWithTranslation: when true, the opcodes that branch on it below report 6.0
+                                  unsigned &major, unsigned &minor,  // out: minimum shader model as major.minor
+                                  unsigned &mask) {  // out: stage bitmask, bit (1 << ShaderKind) per stage
+  unsigned op = (unsigned)C;  // numeric opcode, compared against the literal ranges below
+  // Reports the minimum shader model and the shader-stage mask for opcode C. Default is 6.0, all stages
+  major = 6;  minor = 0;
+  mask = ((unsigned)1 << (unsigned)DXIL::ShaderKind::Invalid) - 1;  // sets every bit below ShaderKind::Invalid
+#define SFLAG(stage) ((unsigned)1 << (unsigned)DXIL::ShaderKind::stage)
+  /* <py::lines('OPCODE-SMMASK')>hctdb_instrhelp.get_min_sm_and_mask_text()</py>*/
+  // OPCODE-SMMASK:BEGIN
+  // Instructions: ThreadId=93, GroupId=94, ThreadIdInGroup=95,
+  // FlattenedThreadIdInGroup=96
+  if ((93 <= op && op <= 96)) {
+    mask = SFLAG(Compute);
+    return;
+  }
+  // Instructions: DomainLocation=105
+  if (op == 105) {
+    mask = SFLAG(Domain);
+    return;
+  }
+  // Instructions: LoadOutputControlPoint=103, LoadPatchConstant=104
+  if ((103 <= op && op <= 104)) {
+    mask = SFLAG(Domain) | SFLAG(Hull);
+    return;
+  }
+  // Instructions: EmitStream=97, CutStream=98, EmitThenCutStream=99,
+  // GSInstanceID=100
+  if ((97 <= op && op <= 100)) {
+    mask = SFLAG(Geometry);
+    return;
+  }
+  // Instructions: PrimitiveID=108
+  if (op == 108) {
+    mask = SFLAG(Geometry) | SFLAG(Domain) | SFLAG(Hull);
+    return;
+  }
+  // Instructions: StorePatchConstant=106, OutputControlPointID=107
+  if ((106 <= op && op <= 107)) {
+    mask = SFLAG(Hull);
+    return;
+  }
+  // Instructions: Sample=60, SampleBias=61, SampleCmp=64, CalculateLOD=81,
+  // DerivCoarseX=83, DerivCoarseY=84, DerivFineX=85, DerivFineY=86
+  if ((60 <= op && op <= 61) || op == 64 || op == 81 || (83 <= op && op <= 86)) {
+    mask = SFLAG(Library) | SFLAG(Pixel);
+    return;
+  }
+  // Instructions: RenderTargetGetSamplePosition=76,
+  // RenderTargetGetSampleCount=77, Discard=82, EvalSnapped=87,
+  // EvalSampleIndex=88, EvalCentroid=89, SampleIndex=90, Coverage=91,
+  // InnerCoverage=92
+  if ((76 <= op && op <= 77) || op == 82 || (87 <= op && op <= 92)) {
+    mask = SFLAG(Pixel);
+    return;
+  }
+  // Instructions: AttributeAtVertex=137
+  if (op == 137) {
+    major = 6;  minor = 1;
+    mask = SFLAG(Pixel);
+    return;
+  }
+  // Instructions: ViewID=138
+  if (op == 138) {
+    major = 6;  minor = 1;
+    mask = SFLAG(Vertex) | SFLAG(Hull) | SFLAG(Domain) | SFLAG(Geometry) | SFLAG(Pixel);
+    return;
+  }
+  // Instructions: RawBufferLoad=139, RawBufferStore=140
+  if ((139 <= op && op <= 140)) {
+    if (bWithTranslation) {
+      major = 6;  minor = 0;
+    } else {
+      major = 6;  minor = 2;
+    }
+    return;
+  }
+  // Instructions: IgnoreHit=155, AcceptHitAndEndSearch=156
+  if ((155 <= op && op <= 156)) {
+    major = 6;  minor = 3;
+    mask = SFLAG(AnyHit);
+    return;
+  }
+  // Instructions: CallShader=159
+  if (op == 159) {
+    major = 6;  minor = 3;
+    mask = SFLAG(Library) | SFLAG(ClosestHit) | SFLAG(RayGeneration) | SFLAG(Miss) | SFLAG(Callable);
+    return;
+  }
+  // Instructions: ReportHit=158
+  if (op == 158) {
+    major = 6;  minor = 3;
+    mask = SFLAG(Library) | SFLAG(Intersection);
+    return;
+  }
+  // Instructions: InstanceID=141, InstanceIndex=142, HitKind=143,
+  // ObjectRayOrigin=149, ObjectRayDirection=150, ObjectToWorld=151,
+  // WorldToObject=152, PrimitiveIndex=161
+  if ((141 <= op && op <= 143) || (149 <= op && op <= 152) || op == 161) {
+    major = 6;  minor = 3;
+    mask = SFLAG(Library) | SFLAG(Intersection) | SFLAG(AnyHit) | SFLAG(ClosestHit);
+    return;
+  }
+  // Instructions: RayFlags=144, WorldRayOrigin=147, WorldRayDirection=148,
+  // RayTMin=153, RayTCurrent=154
+  if (op == 144 || (147 <= op && op <= 148) || (153 <= op && op <= 154)) {
+    major = 6;  minor = 3;
+    mask = SFLAG(Library) | SFLAG(Intersection) | SFLAG(AnyHit) | SFLAG(ClosestHit) | SFLAG(Miss);
+    return;
+  }
+  // Instructions: TraceRay=157
+  if (op == 157) {
+    major = 6;  minor = 3;
+    mask = SFLAG(Library) | SFLAG(RayGeneration) | SFLAG(ClosestHit) | SFLAG(Miss);
+    return;
+  }
+  // Instructions: DispatchRaysIndex=145, DispatchRaysDimensions=146
+  if ((145 <= op && op <= 146)) {
+    major = 6;  minor = 3;
+    mask = SFLAG(Library) | SFLAG(RayGeneration) | SFLAG(Intersection) | SFLAG(AnyHit) | SFLAG(ClosestHit) | SFLAG(Miss) | SFLAG(Callable);
+    return;
+  }
+  // Instructions: CreateHandleForLib=160
+  if (op == 160) {
+    if (bWithTranslation) {
+      major = 6;  minor = 0;
+    } else {
+      major = 6;  minor = 3;
+    }
+    return;
+  }
+  // OPCODE-SMMASK:END
+#undef SFLAG
+}
+
 static Type *GetOrCreateStructType(LLVMContext &Ctx, ArrayRef<Type*> types, StringRef Name, Module *pModule) {
   if (StructType *ST = pModule->getTypeByName(Name)) {
     // TODO: validate the exist type match types if needed.
@@ -498,8 +702,8 @@ void OP::RefreshCache() {
   }
 }
 
-void OP::UpdateCache(OpCodeClass opClass, unsigned typeSlot, llvm::Function *F) {
-  m_OpCodeClassCache[(unsigned)opClass].pOverloads[typeSlot] = F;
+void OP::UpdateCache(OpCodeClass opClass, Type * Ty, llvm::Function *F) {  // Records F as the overload of opClass for type Ty and maps F back to opClass
+  m_OpCodeClassCache[(unsigned)opClass].pOverloads[Ty] = F;  // the overload entry is keyed by the Type* itself
   m_FunctionToOpClass[F] = opClass;
 }
 
@@ -507,11 +711,10 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   DXASSERT(0 <= (unsigned)opCode && opCode < OpCode::NumOpCodes, "otherwise caller passed OOB OpCode");
   _Analysis_assume_(0 <= (unsigned)opCode && opCode < OpCode::NumOpCodes);
   DXASSERT(IsOverloadLegal(opCode, pOverloadType), "otherwise the caller requested illegal operation overload (eg HLSL function with unsupported types for mapped intrinsic function)");
-  unsigned TypeSlot = GetTypeSlot(pOverloadType);
   OpCodeClass opClass = m_OpCodeProps[(unsigned)opCode].opCodeClass;
-  Function *&F = m_OpCodeClassCache[(unsigned)opClass].pOverloads[TypeSlot];
+  Function *&F = m_OpCodeClassCache[(unsigned)opClass].pOverloads[pOverloadType];
   if (F != nullptr) {
-    UpdateCache(opClass, TypeSlot, F);
+    UpdateCache(opClass, pOverloadType, F);
     return F;
   }
 
@@ -535,16 +738,19 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   Type *pF64 = Type::getDoubleTy(m_Ctx);
   Type *pSDT = GetSplitDoubleType();  // Split double type.
   Type *pI4S = GetInt4Type(); // 4 i32s in a struct.
+  Type *udt = pOverloadType;
+  Type *obj = pOverloadType;
 
   std::string funcName = (Twine(OP::m_NamePrefix) + Twine(GetOpCodeClassName(opCode))).str();
   // Add ret type to the name.
   if (pOverloadType != pV) {
-    funcName = Twine(funcName).concat(".").concat(GetOverloadTypeName(TypeSlot)).str();
+    std::string typeName;
+    funcName = Twine(funcName).concat(".").concat(GetTypeName(pOverloadType, typeName)).str();
   } 
   // Try to find exist function with the same name in the module.
   if (Function *existF = m_pModule->getFunction(funcName)) {
     F = existF;
-    UpdateCache(opClass, TypeSlot, F);
+    UpdateCache(opClass, pOverloadType, F);
     return F;
   }
 
@@ -726,6 +932,8 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     // Hull shader
   case OpCode::StorePatchConstant:     A(pV);       A(pI32); A(pI32); A(pI32); A(pI8);  A(pETy); break;
   case OpCode::OutputControlPointID:   A(pI32);     A(pI32); break;
+
+    // Hull, Domain and Geometry shaders
   case OpCode::PrimitiveID:            A(pI32);     A(pI32); break;
 
     // Other
@@ -777,6 +985,51 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     // Resources
   case OpCode::RawBufferLoad:          RRT(pETy);   A(pI32); A(pRes); A(pI32); A(pI32); A(pI8);  A(pI32); break;
   case OpCode::RawBufferStore:         A(pV);       A(pI32); A(pRes); A(pI32); A(pI32); A(pETy); A(pETy); A(pETy); A(pETy); A(pI8);  A(pI32); break;
+
+    // Raytracing object space uint System Values
+  case OpCode::InstanceID:             A(pI32);     A(pI32); break;
+  case OpCode::InstanceIndex:          A(pI32);     A(pI32); break;
+
+    // Raytracing hit uint System Values
+  case OpCode::HitKind:                A(pI32);     A(pI32); break;
+
+    // Raytracing uint System Values
+  case OpCode::RayFlags:               A(pI32);     A(pI32); break;
+
+    // Ray Dispatch Arguments
+  case OpCode::DispatchRaysIndex:      A(pI32);     A(pI32); A(pI8);  break;
+  case OpCode::DispatchRaysDimensions: A(pI32);     A(pI32); A(pI8);  break;
+
+    // Ray Vectors
+  case OpCode::WorldRayOrigin:         A(pF32);     A(pI32); A(pI8);  break;
+  case OpCode::WorldRayDirection:      A(pF32);     A(pI32); A(pI8);  break;
+
+    // Ray object space Vectors
+  case OpCode::ObjectRayOrigin:        A(pF32);     A(pI32); A(pI8);  break;
+  case OpCode::ObjectRayDirection:     A(pF32);     A(pI32); A(pI8);  break;
+
+    // Ray Transforms
+  case OpCode::ObjectToWorld:          A(pF32);     A(pI32); A(pI32); A(pI8);  break;
+  case OpCode::WorldToObject:          A(pF32);     A(pI32); A(pI32); A(pI8);  break;
+
+    // RayT
+  case OpCode::RayTMin:                A(pF32);     A(pI32); break;
+  case OpCode::RayTCurrent:            A(pF32);     A(pI32); break;
+
+    // AnyHit Terminals
+  case OpCode::IgnoreHit:              A(pV);       A(pI32); break;
+  case OpCode::AcceptHitAndEndSearch:  A(pV);       A(pI32); break;
+
+    // Indirect Shader Invocation
+  case OpCode::TraceRay:               A(pV);       A(pI32); A(pRes); A(pI32); A(pI32); A(pI32); A(pI32); A(pI32); A(pF32); A(pF32); A(pF32); A(pF32); A(pF32); A(pF32); A(pF32); A(pF32); A(udt);  break;
+  case OpCode::ReportHit:              A(pI1);      A(pI32); A(pF32); A(pI32); A(udt);  break;
+  case OpCode::CallShader:             A(pV);       A(pI32); A(pI32); A(udt);  break;
+
+    // Library create handle from resource struct (like HL intrinsic)
+  case OpCode::CreateHandleForLib:     A(pRes);     A(pI32); A(obj);  break;
+
+    // Raytracing object space uint System Values
+  case OpCode::PrimitiveIndex:         A(pI32);     A(pI32); break;
   // OPCODE-OLOAD-FUNCS:END
   default: DXASSERT(false, "otherwise unhandled case"); break;
   }
@@ -789,7 +1042,7 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
 
   F = cast<Function>(m_pModule->getOrInsertFunction(funcName, pFT));
 
-  UpdateCache(opClass, TypeSlot, F);
+  UpdateCache(opClass, pOverloadType, F);
   F->setCallingConv(CallingConv::C);
   F->addFnAttr(Attribute::NoUnwind);
   if (m_OpCodeProps[(unsigned)opCode].FuncAttr != Attribute::None)
@@ -798,18 +1051,22 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   return F;
 }
 
-llvm::ArrayRef<llvm::Function *> OP::GetOpFuncList(OpCode opCode) const {
-  DXASSERT(0 <= (unsigned)opCode && opCode < OpCode::NumOpCodes, "otherwise caller passed OOB OpCode");
+const SmallDenseMap<llvm::Type *, llvm::Function *, 8> &
+OP::GetOpFuncList(OpCode opCode) const {
+  DXASSERT(0 <= (unsigned)opCode && opCode < OpCode::NumOpCodes,
+           "otherwise caller passed OOB OpCode");
   _Analysis_assume_(0 <= (unsigned)opCode && opCode < OpCode::NumOpCodes);
-  return llvm::ArrayRef<llvm::Function *>(m_OpCodeClassCache[(unsigned)m_OpCodeProps[(unsigned)opCode].opCodeClass].pOverloads);
+  return m_OpCodeClassCache[(unsigned)m_OpCodeProps[(unsigned)opCode]
+                                .opCodeClass]
+      .pOverloads;
 }
 
 void OP::RemoveFunction(Function *F) {
   if (OP::IsDxilOpFunc(F)) {
     OpCodeClass opClass = m_FunctionToOpClass[F];
-    for (unsigned i=0;i<kNumTypeOverloads;i++) {
-      if (F == m_OpCodeClassCache[(unsigned)opClass].pOverloads[i]) {
-        m_OpCodeClassCache[(unsigned)opClass].pOverloads[i] = nullptr;
+    for (auto it : m_OpCodeClassCache[(unsigned)opClass].pOverloads) {
+      if (it.second == F) {
+        m_OpCodeClassCache[(unsigned)opClass].pOverloads.erase(it.first);
         m_FunctionToOpClass.erase(F);
         break;
       }
@@ -820,7 +1077,8 @@ void OP::RemoveFunction(Function *F) {
 bool OP::GetOpCodeClass(const Function *F, OP::OpCodeClass &opClass) {
   auto iter = m_FunctionToOpClass.find(F);
   if (iter == m_FunctionToOpClass.end()) {
-    DXASSERT(!IsDxilOpFunc(F), "dxil function without an opcode class mapping?");
+    // With no users the opcode cannot be determined, so a missing mapping is allowed.
+    DXASSERT(F->user_empty() || !IsDxilOpFunc(F), "dxil function without an opcode class mapping?");
     return false;
   }
   opClass = iter->second;
@@ -830,8 +1088,8 @@ bool OP::GetOpCodeClass(const Function *F, OP::OpCodeClass &opClass) {
 bool OP::UseMinPrecision() {
   if (m_LowPrecisionMode == DXIL::LowPrecisionMode::Undefined) {
     if (m_pModule->HasDxilModule()) {
-      m_LowPrecisionMode = m_pModule->GetDxilModule().m_ShaderFlags.GetUseNativeLowPrecision() ?
-        DXIL::LowPrecisionMode::UseNativeLowPrecision : DXIL::LowPrecisionMode::UseMinPrecision;
+      m_LowPrecisionMode = m_pModule->GetDxilModule().GetUseMinPrecision() ?
+        DXIL::LowPrecisionMode::UseMinPrecision : DXIL::LowPrecisionMode::UseNativeLowPrecision;
     }
     else if (m_pModule->HasHLModule()) {
       m_LowPrecisionMode = m_pModule->GetHLModule().GetHLOptions().bUseMinPrecision ?
@@ -856,6 +1114,7 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   switch (opCode) {            // return     OpCode
   // OPCODE-OLOAD-TYPES:BEGIN
   case OpCode::TempRegStore:
+  case OpCode::CallShader:
     DXASSERT_NOMSG(FT->getNumParams() > 2);
     return FT->getParamType(2);
   case OpCode::MinPrecXRegStore:
@@ -879,11 +1138,18 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::UAddc:
   case OpCode::USubb:
   case OpCode::WaveActiveAllEqual:
+  case OpCode::CreateHandleForLib:
     DXASSERT_NOMSG(FT->getNumParams() > 1);
     return FT->getParamType(1);
   case OpCode::TextureStore:
     DXASSERT_NOMSG(FT->getNumParams() > 5);
     return FT->getParamType(5);
+  case OpCode::TraceRay:
+    DXASSERT_NOMSG(FT->getNumParams() > 15);
+    return FT->getParamType(15);
+  case OpCode::ReportHit:
+    DXASSERT_NOMSG(FT->getNumParams() > 3);
+    return FT->getParamType(3);
   case OpCode::CreateHandle:
   case OpCode::BufferUpdateCounter:
   case OpCode::GetDimensions:
@@ -915,6 +1181,8 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::LegacyDoubleToUInt32:
   case OpCode::WaveAllBitCount:
   case OpCode::WavePrefixBitCount:
+  case OpCode::IgnoreHit:
+  case OpCode::AcceptHitAndEndSearch:
     return Type::getVoidTy(m_Ctx);
   case OpCode::CheckAccessFullyMapped:
   case OpCode::AtomicBinOp:
@@ -930,9 +1198,24 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::OutputControlPointID:
   case OpCode::PrimitiveID:
   case OpCode::ViewID:
+  case OpCode::InstanceID:
+  case OpCode::InstanceIndex:
+  case OpCode::HitKind:
+  case OpCode::RayFlags:
+  case OpCode::DispatchRaysIndex:
+  case OpCode::DispatchRaysDimensions:
+  case OpCode::PrimitiveIndex:
     return IntegerType::get(m_Ctx, 32);
   case OpCode::CalculateLOD:
   case OpCode::DomainLocation:
+  case OpCode::WorldRayOrigin:
+  case OpCode::WorldRayDirection:
+  case OpCode::ObjectRayOrigin:
+  case OpCode::ObjectRayDirection:
+  case OpCode::ObjectToWorld:
+  case OpCode::WorldToObject:
+  case OpCode::RayTMin:
+  case OpCode::RayTCurrent:
     return Type::getFloatTy(m_Ctx);
   case OpCode::MakeDouble:
   case OpCode::SplitDouble:

+ 1155 - 0
lib/HLSL/DxilPatchShaderRecordBindings.cpp

@@ -0,0 +1,1155 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilPatchShaderRecordBindings.cpp                                        //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides a pass used by the RayTracing Fallback Layer to modify           //
+// bindings to pull local root signature parameters from a global            //
+// "shader table" buffer instead                                             //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilGenerationPass.h"
+#include "dxc/HLSL/DxilFallbackLayerPass.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilSignatureElement.h"
+#include "dxc/HLSL/DxilFunctionProps.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/Support/Global.h"
+
+#include "dxc/Support/Unicode.h"
+#include "dxc/HLSL/DxilTypeSystem.h"
+#include "dxc/HLSL/DxilConstants.h"
+#include "dxc/HLSL/DxilInstructions.h"
+#include "dxc/HLSL/DxilSpanAllocator.h"
+#include "dxc/HLSL/DxilRootSignature.h"
+#include "dxc/HLSL/DxilUtil.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
+#include <memory>
+#include <unordered_set>
+#include <functional>
+#include <unordered_map>
+#include <array>
+
+struct D3D12_VERSIONED_ROOT_SIGNATURE_DESC;
+#include "DxilPatchShaderRecordBindingsShared.h"
+
+
+using namespace llvm;
+using namespace hlsl;
+
+
+bool operator==(const ViewKey &a, const ViewKey &b) {
+  return memcmp(&a, &b, sizeof(a)) == 0;
+}
+
+const size_t SizeofD3D12GpuVA = sizeof(uint64_t);
+const size_t SizeofD3D12GpuDescriptorHandle = sizeof(uint64_t);
+
+Function *CloneFunction(Function *Orig,
+    const llvm::Twine &Name,
+    llvm::Module *llvmModule) {
+
+    Function *F = Function::Create(Orig->getFunctionType(),
+        GlobalValue::LinkageTypes::ExternalLinkage,
+        Name, llvmModule);
+
+    SmallVector<ReturnInst *, 2> Returns;
+    ValueToValueMapTy vmap;
+    // Map params.
+    auto entryParamIt = F->arg_begin();
+    for (Argument &param : Orig->args()) {
+        vmap[&param] = (entryParamIt++);
+    }
+
+    DxilModule &DM = llvmModule->GetOrCreateDxilModule();
+
+    llvm::CloneFunctionInto(F, Orig, vmap, /*ModuleLevelChagnes*/ false, Returns);
+    DM.GetTypeSystem().CopyFunctionAnnotation(F, Orig, DM.GetTypeSystem());
+
+    if (DM.HasDxilFunctionProps(F)) {
+        DM.CloneDxilEntryProps(Orig, F);
+    }
+    return F;
+}
+
+
+struct ShaderRecordEntry {
+  DxilRootParameterType ParameterType;
+  unsigned int RecordOffsetInBytes;
+  unsigned int OffsetInDescriptors; // Only valid for descriptor tables
+
+  static ShaderRecordEntry InvalidEntry() { return { (DxilRootParameterType)-1, (unsigned int)-1 }; }
+  bool IsInvalid() { return (unsigned int)ParameterType == (unsigned int)-1; }
+};
+
+struct D3D12_VERSIONED_ROOT_SIGNATURE_DESC;
+class DxilPatchShaderRecordBindings : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilPatchShaderRecordBindings() : ModulePass(ID) {}
+  const char *getPassName() const override { return "DXIL Patch Shader Record Binding"; }
+  void applyOptions(PassOptions O) override;
+  bool runOnModule(Module &M) override;
+
+private:
+  void ValidateParameters();
+  void AddInputBinding(Module &M);
+  void PatchShaderBindings(Module &M);
+  void InitializeViewTable();
+
+  unsigned int AddSRVRawBuffer(Module &M, unsigned int registerIndex, unsigned int registerSpace, const std::string &bufferName);
+  unsigned int AddHandle(Module &M, unsigned int baseRegisterIndex, unsigned int rangeSize, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type = nullptr, unsigned int constantBufferSize = 0);
+  unsigned int AddAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type);
+  unsigned int AddCBufferAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, const std::string &bufferName);
+
+  llvm::Value *CreateOffsetToShaderRecord(Module &M, IRBuilder<> &Builder, unsigned int RecordOffsetInBytes, llvm::Value *CbufferOffsetInBytes);
+  llvm::Value *CreateShaderRecordBufferLoad(Module &M, IRBuilder<> &Builder, llvm::Value *ShaderRecordOffsetInBytes, llvm::Type* type);
+  llvm::Value *CreateCBufferLoadOffsetInBytes(Module &M, IRBuilder<> &Builder, llvm::Instruction *instruction);
+  llvm::Value *CreateCBufferLoadLegacy(Module &M, IRBuilder<> &Builder, llvm::Value *ResourceHandle, unsigned int RowToLoad = 0);
+
+  llvm::Value *LoadShaderRecordData(Module &M, IRBuilder<> &Builder,
+                                    llvm::Value *offsetToShaderRecord,
+                                    unsigned int dataOffsetInShaderRecord);
+
+  void PatchCreateHandleToUseDescriptorIndex(
+      _In_ Module &M,
+      _In_ IRBuilder<> &Builder,
+      _In_ DXIL::ResourceKind &resourceKind,
+      _In_ DXIL::ResourceClass &resourceClass,
+      _In_ llvm::Type *resourceType,
+      _In_ llvm::Value *descriptorIndex,
+      _Inout_ DxilInst_CreateHandleForLib &createHandleInstr);
+
+
+  bool GetHandleInfo(
+    Module &M, 
+    DxilInst_CreateHandleForLib &createHandleStructForLib, 
+    _Out_ unsigned int &shaderRegister, 
+    _Out_ unsigned int &registerSpace, 
+    _Out_ DXIL::ResourceKind &kind, 
+    _Out_ DXIL::ResourceClass &resClass,
+    _Out_ llvm::Type *&resType);
+
+  llvm::Value * GetAliasedDescriptorHeapHandle(Module &M, llvm::Type *, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind);
+
+  unsigned int GetConstantBufferOffsetToShaderRecord();
+
+  bool IsCBufferLoad(llvm::Instruction *instruction);
+
+  // Unlike the LLVM version of this function, this does not require the InstructionToReplace and the ValueToReplaceWith to be the same instruction type
+  static void ReplaceUsesOfWith(llvm::Instruction *InstructionToReplace, llvm::Value *ValueToReplaceWith);
+
+  static ShaderRecordEntry FindRootSignatureDescriptor(const DxilVersionedRootSignatureDesc &rootSignatureDescriptor, unsigned int ShaderRecordIdentifierSizeInBytes, DXIL::ResourceClass resourceClass, unsigned int baseRegisterIndex, unsigned int registerSpace);
+
+  // TODO: I would like to see these prefixed with m_
+  llvm::Value *ShaderTableHandle = nullptr;
+  llvm::Value *DispatchRaysConstantsHandle = nullptr;
+  llvm::Value *BaseShaderRecordOffset = nullptr;
+
+  static const unsigned int NumViewTypes = 4;
+  struct ViewKeyHasher
+  {
+  public:
+      std::size_t operator()(const ViewKey &x) const {
+        return std::hash<unsigned int>()((unsigned int)x.ViewType) ^ 
+            std::hash<unsigned int>()((unsigned int)x.StructuredStride);
+      }
+  };
+
+
+  std::unordered_map<ViewKey, llvm::Value *, ViewKeyHasher>
+      TypeToAliasedDescriptorHeap[NumViewTypes];
+
+  llvm::Function *EntryPointFunction;
+
+  ShaderInfo *pInputShaderInfo;
+  DxilVersionedRootSignatureDesc *pRootSignatureDesc;
+  DXIL::ShaderKind ShaderKind;
+};
+
+char DxilPatchShaderRecordBindings::ID = 0;
+
+// TODO: Find the right thing to do on failure
+void ThrowFailure() {
+  throw std::exception();
+}
+
+// TODO: Stolen from Brandon's code, merge
+// Remove name mangling: strip the leading "\x1?" and everything from the first "@@" onward
+static inline std::string GetUnmangledName(StringRef name) {
+  if (!name.startswith("\x1?"))
+      return name;
+
+  size_t pos = name.find("@@");
+  if (pos == name.npos)
+    return name;
+
+
+  return name.substr(2, pos - 2);
+}
+
+static Function* getFunctionFromName(Module &M, const std::wstring& exportName) {
+  for (auto F = M.begin(), E = M.end(); F != E; ++F) {
+    std::wstring functionName = Unicode::UTF8ToUTF16StringOrThrow(GetUnmangledName(F->getName()).c_str());
+    if (exportName == functionName) {
+      return F;
+    }
+  }
+  return nullptr;
+}
+
+ModulePass *llvm::createDxilPatchShaderRecordBindingsPass() {
+  return new DxilPatchShaderRecordBindings();
+}
+
+INITIALIZE_PASS(DxilPatchShaderRecordBindings, "hlsl-dxil-patch-shader-record-bindings", "Patch shader record bindings to instead pull from the fallback provided bindings", false, false)
+
+void DxilPatchShaderRecordBindings::applyOptions(PassOptions O) {
+  for (const auto & option : O) {
+    if (0 == option.first.compare("root-signature")) {
+      unsigned int cHexRadix = 16;
+      pInputShaderInfo = (ShaderInfo*)strtoull(option.second.data(), nullptr, cHexRadix);
+      pRootSignatureDesc = (DxilVersionedRootSignatureDesc*)pInputShaderInfo->pRootSignatureDesc;
+    }
+  }
+}
+
+void AddAnnoationsIfNeeded(DxilModule &DM, llvm::StructType *StructTy, const std::string &FieldName, unsigned int numFields = 1)
+{
+    auto pAnnotation = DM.GetTypeSystem().GetStructAnnotation(StructTy);
+    if (pAnnotation == nullptr)
+    {
+        pAnnotation = DM.GetTypeSystem().AddStructAnnotation(StructTy);
+        pAnnotation->SetCBufferSize(sizeof(uint32_t) * numFields);
+        for (unsigned int i = 0; i < numFields; i++)
+        {
+            pAnnotation->GetFieldAnnotation(i).SetCBufferOffset(sizeof(uint32_t) * i);
+            pAnnotation->GetFieldAnnotation(i).SetCompType(hlsl::DXIL::ComponentType::I32);
+            pAnnotation->GetFieldAnnotation(i).SetFieldName(FieldName + std::to_string(i));
+        }
+    }
+}
+
+unsigned int DxilPatchShaderRecordBindings::AddHandle(Module &M, unsigned int baseRegisterIndex, unsigned int rangeSize, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type, unsigned int constantBufferSize) {
+  LLVMContext & Ctx = M.getContext();
+  DxilModule &DM = M.GetOrCreateDxilModule();
+
+  // Create the resource object for resClass; its ID is the current size of that class's list in the module
+  unsigned int resourceHandle;
+  std::unique_ptr<DxilResource> pHandle;
+  std::unique_ptr<DxilCBuffer> pCBuf;
+  std::unique_ptr<DxilSampler> pSampler;
+  DxilResourceBase *pBaseHandle;
+  switch (resClass) {
+  case DXIL::ResourceClass::SRV:
+    resourceHandle = static_cast<unsigned int>(DM.GetSRVs().size());
+    pHandle = llvm::make_unique<DxilResource>();
+    pHandle->SetRW(false);
+    pBaseHandle = pHandle.get();
+    break;
+  case DXIL::ResourceClass::UAV:
+    resourceHandle = static_cast<unsigned int>(DM.GetUAVs().size());
+    pHandle = llvm::make_unique<DxilResource>();
+    pHandle->SetRW(true);
+    pBaseHandle = pHandle.get();
+    break;
+  case DXIL::ResourceClass::CBuffer:
+    resourceHandle = static_cast<unsigned int>(DM.GetCBuffers().size());
+    pCBuf = llvm::make_unique<DxilCBuffer>();
+    pCBuf->SetSize(constantBufferSize);
+    pBaseHandle = pCBuf.get();
+    break;
+  case DXIL::ResourceClass::Sampler:
+    resourceHandle = static_cast<unsigned int>(DM.GetSamplers().size());
+    pSampler = llvm::make_unique<DxilSampler>();
+    // TODO: Is this okay? What if one of the samplers in the table is a comparison sampler?
+    pSampler->SetSamplerKind(DxilSampler::SamplerKind::Default);
+    pBaseHandle = pSampler.get();
+    break;
+  }
+
+  if (!type) {
+    SmallVector<llvm::Type*, 1> Elements{ Type::getInt32Ty(Ctx) };
+    std::string ByteAddressBufferName = "struct.ByteAddressBuffer";
+    type = M.getTypeByName(ByteAddressBufferName);
+    if (!type)
+    {
+        StructType *StructTy;
+        type = StructTy = StructType::create(Elements, ByteAddressBufferName);
+  
+        AddAnnoationsIfNeeded(DM, StructTy, ByteAddressBufferName);
+    }
+  }
+
+  GlobalVariable *GV = M.getGlobalVariable(bufferName);
+  if (!GV) {
+    GV = cast<GlobalVariable>(M.getOrInsertGlobal(bufferName, type));
+  }
+
+  pBaseHandle->SetGlobalName(bufferName.c_str());
+  pBaseHandle->SetGlobalSymbol(GV);
+  pBaseHandle->SetID(resourceHandle);
+  pBaseHandle->SetSpaceID(registerSpace);
+  pBaseHandle->SetLowerBound(baseRegisterIndex);
+  pBaseHandle->SetRangeSize(rangeSize);
+  pBaseHandle->SetKind(resKind);
+
+  if (pHandle) {
+    pHandle->SetGloballyCoherent(false);
+    pHandle->SetHasCounter(false);
+    pHandle->SetCompType(CompType::getF32()); // TODO: Need to handle all types
+  }
+
+  unsigned int ID;
+  switch (resClass) {
+  case DXIL::ResourceClass::SRV:
+    ID = DM.AddSRV(std::move(pHandle));
+    break;
+  case DXIL::ResourceClass::UAV:
+    ID = DM.AddUAV(std::move(pHandle));
+    break;
+  case DXIL::ResourceClass::CBuffer:
+    ID = DM.AddCBuffer(std::move(pCBuf));
+    break;
+  case DXIL::ResourceClass::Sampler:
+    ID = DM.AddSampler(std::move(pSampler));
+    break;
+  }
+
+  assert(ID == resourceHandle);
+  return ID;
+}
+
+unsigned int DxilPatchShaderRecordBindings::GetConstantBufferOffsetToShaderRecord()
+{
+    switch (ShaderKind)
+    {
+    case DXIL::ShaderKind::ClosestHit:
+    case DXIL::ShaderKind::AnyHit:
+    case DXIL::ShaderKind::Intersection:
+        return offsetof(DispatchRaysConstants, HitGroupShaderRecordStride);
+    case DXIL::ShaderKind::Miss:
+        return offsetof(DispatchRaysConstants, MissShaderRecordStride);
+    default:
+        ThrowFailure();
+        return -1;
+    }
+}
+
+
+unsigned int DxilPatchShaderRecordBindings::AddSRVRawBuffer(Module &M, unsigned int registerIndex, unsigned int registerSpace, const std::string &bufferName) {
+  return AddHandle(M, registerIndex, 1, registerSpace, DXIL::ResourceClass::SRV, DXIL::ResourceKind::RawBuffer, bufferName);
+}
+
+llvm::Constant *GetArraySymbol(Module &M, const std::string &bufferName) {
+  LLVMContext & Ctx = M.getContext();
+
+  SmallVector<llvm::Type*, 1> Elements{ Type::getInt32Ty(Ctx) };
+  llvm::StructType *StructTy = llvm::StructType::create(Elements, bufferName);
+  llvm::ArrayType *ArrayTy = ArrayType::get(StructTy, -1);
+
+  return UndefValue::get(ArrayTy->getPointerTo());
+}
+
+unsigned int DxilPatchShaderRecordBindings::AddCBufferAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, const std::string &bufferName) {
+  const unsigned int maxConstantBufferSize = 4096 * 16;
+  return AddHandle(M, baseRegisterIndex, UINT_MAX, registerSpace, DXIL::ResourceClass::CBuffer, DXIL::ResourceKind::CBuffer, bufferName, GetArraySymbol(M, bufferName)->getType(), maxConstantBufferSize);
+}
+
+unsigned int DxilPatchShaderRecordBindings::AddAliasedHandle(Module &M, unsigned int baseRegisterIndex, unsigned int registerSpace, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind, const std::string &bufferName, llvm::Type *type) {
+  return AddHandle(M, baseRegisterIndex, UINT_MAX, registerSpace, resClass, resKind, bufferName, type);
+}
+
+// TODO: Stolen from Brandon's code
+DXIL::ShaderKind GetRayShaderKindCopy(Function* F)
+{
+    if (F->hasFnAttribute("exp-shader"))
+        return DXIL::ShaderKind::RayGeneration;
+
+    DxilModule& DM = F->getParent()->GetDxilModule();
+    if (DM.HasDxilFunctionProps(F) && DM.GetDxilFunctionProps(F).IsRay())
+        return DM.GetDxilFunctionProps(F).shaderKind;
+
+    return DXIL::ShaderKind::Invalid;
+}
+
+static std::string ws2s(const std::wstring& wide)
+{
+    return std::string(wide.begin(), wide.end());
+}
+
+bool DxilPatchShaderRecordBindings::runOnModule(Module &M) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  EntryPointFunction = pInputShaderInfo->ExportName ? getFunctionFromName(M, pInputShaderInfo->ExportName) : DM.GetEntryFunction();
+  ShaderKind = GetRayShaderKindCopy(EntryPointFunction);
+
+  ValidateParameters();
+  InitializeViewTable();
+
+  PatchShaderBindings(M);
+  DM.ReEmitDxilResources();
+  return true;
+}
+
+void DxilPatchShaderRecordBindings::ValidateParameters() {
+  if (!pInputShaderInfo || !pInputShaderInfo->pRootSignatureDesc) {
+    throw std::exception();
+  }
+}
+
+DxilResourceBase &GetResourceFromID(DxilModule &DM, DXIL::ResourceClass resClass, unsigned int id)
+{
+    switch (resClass)
+    {
+    case DXIL::ResourceClass::CBuffer:
+        return DM.GetCBuffer(id);
+        break;
+    case DXIL::ResourceClass::SRV:
+        return DM.GetSRV(id);
+        break;
+    case DXIL::ResourceClass::UAV:
+        return DM.GetUAV(id);
+        break;
+    case DXIL::ResourceClass::Sampler:
+        return DM.GetSampler(id);
+        break;
+    default:
+        ThrowFailure();
+        return *(DxilResourceBase*)nullptr;
+    }
+}
+
+unsigned int FindOrInsertViewIntoList(const ViewKey &key, ViewKey *pViewList, unsigned int &numViews, unsigned int maxViews)
+{
+    unsigned int viewIndex = 0;
+    for (; viewIndex < numViews; viewIndex++)
+    {
+        if (pViewList[viewIndex] == key)
+        {
+            break;
+        }
+    }
+
+    if (viewIndex == numViews)
+    {
+        if (viewIndex >= maxViews) {
+            ThrowFailure();
+        }
+
+        pViewList[viewIndex] = key;
+        numViews++;
+    }
+    return viewIndex;
+}
+
+llvm::Value *DxilPatchShaderRecordBindings::GetAliasedDescriptorHeapHandle(Module &M, llvm::Type *type, DXIL::ResourceClass resClass, DXIL::ResourceKind resKind)
+{
+    DxilModule &DM = M.GetOrCreateDxilModule();
+    unsigned int resClassIndex = (unsigned int)resClass;
+    
+    ViewKey key = {};
+    key.ViewType = (unsigned int)resKind;
+    if (resKind == DXIL::ResourceKind::StructuredBuffer)
+    {
+      key.StructuredStride = type->getPrimitiveSizeInBits();
+    } else if (resKind != DXIL::ResourceKind::RawBuffer)
+    {
+      auto containedType = type->getContainedType(0);
+      // If it's a vector, get the type of just a single element
+      if (containedType->getNumContainedTypes() > 0)
+      {
+        assert(containedType->getNumContainedTypes() <= 4);
+        containedType = containedType->getContainedType(0);
+      }
+      key.SRVComponentType = (unsigned int)CompType::GetCompType(containedType).GetKind();
+    }
+    auto aliasedDescriptorHeapHandle = TypeToAliasedDescriptorHeap[resClassIndex].find(key);
+    if (aliasedDescriptorHeapHandle == TypeToAliasedDescriptorHeap[resClassIndex].end())
+    {
+        unsigned int registerSpaceOffset = 0;
+        std::string HandleName;
+
+        if (resClass == DXIL::ResourceClass::SRV)
+        {
+          registerSpaceOffset = FindOrInsertViewIntoList(
+              key, 
+              pInputShaderInfo->pSRVRegisterSpaceArray, 
+              *pInputShaderInfo->pNumSRVSpaces, 
+              FallbackLayerNumDescriptorHeapSpacesPerView);
+
+          HandleName = std::string("SRVDescriptorHeapTable") +
+                       std::to_string(registerSpaceOffset);
+        }
+        else if (resClass == DXIL::ResourceClass::UAV)
+        {
+          registerSpaceOffset = FindOrInsertViewIntoList(
+              key,
+              pInputShaderInfo->pUAVRegisterSpaceArray,
+              *pInputShaderInfo->pNumUAVSpaces,
+              FallbackLayerNumDescriptorHeapSpacesPerView);
+
+          if (registerSpaceOffset == 0)
+          {
+              // Using the descriptor heap declared by the fallback for handling emulated pointers,
+              // make sure the name is an exact match
+              assert(key.ViewType == (unsigned int)hlsl::DXIL::ResourceKind::RawBuffer);
+              HandleName = "\01?DescriptorHeapBufferTable@@3PAURWByteAddressBuffer@@A";
+          }
+          else
+          {
+              HandleName = std::string("UAVDescriptorHeapTable") +
+                  std::to_string(registerSpaceOffset);
+          }
+        }
+        else if (resClass == DXIL::ResourceClass::CBuffer)
+        {
+          HandleName = std::string("CBVDescriptorHeapTable");
+
+        } else {
+          HandleName = std::string("SamplerDescriptorHeapTable");
+        }
+
+
+        llvm::ArrayType *descriptorHeapType = ArrayType::get(type, 0);
+        static unsigned int i = 0;
+        unsigned int id = AddAliasedHandle(M, FallbackLayerDescriptorHeapTable, FallbackLayerRegisterSpace + FallbackLayerDescriptorHeapSpaceOffset + registerSpaceOffset, resClass, resKind, HandleName, descriptorHeapType);
+        
+        TypeToAliasedDescriptorHeap[resClassIndex][key] = GetResourceFromID(DM, resClass, id).GetGlobalSymbol();
+    }
+    return TypeToAliasedDescriptorHeap[resClassIndex][key];
+}
+
+// Declares the fallback-layer inputs the patched shader reads from and emits,
+// at the top of the entry block, the values later patching relies on:
+//  - ShaderTableHandle: handle to the raw-buffer SRV holding this shader
+//    kind's shader table.
+//  - DispatchRaysConstantsHandle: handle to the "Constants" cbuffer, sized
+//    from DispatchRaysConstants.
+//  - BaseShaderRecordOffset: result of calling Fallback_ShaderRecordOffset(),
+//    or the constant 0 for ray generation shaders.
+// Fails (ThrowFailure) when ShaderKind is not one of the shader kinds handled
+// by the switch below.
+void DxilPatchShaderRecordBindings::AddInputBinding(Module &M) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  auto &EntryBlock = EntryPointFunction->getEntryBlock();
+  auto &Instructions = EntryBlock.getInstList();
+
+  // Select the shader table (register and mangled global name) for this kind.
+  std::string bufferName;
+  unsigned int bufferRegister = 0;
+
+  switch (ShaderKind) {
+  case DXIL::ShaderKind::AnyHit:
+  case DXIL::ShaderKind::ClosestHit:
+  case DXIL::ShaderKind::Intersection:
+    bufferRegister = FallbackLayerHitGroupRecordByteAddressBufferRegister;
+    bufferName = "\01?HitGroupShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  case DXIL::ShaderKind::Miss:
+    bufferRegister = FallbackLayerMissShaderRecordByteAddressBufferRegister;
+    bufferName = "\01?MissShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  case DXIL::ShaderKind::RayGeneration:
+    bufferRegister = FallbackLayerRayGenShaderRecordByteAddressBufferRegister;
+    bufferName = "\01?RayGenShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  case DXIL::ShaderKind::Callable:
+    bufferRegister = FallbackLayerCallableShaderRecordByteAddressBufferRegister;
+    bufferName = "\01?CallableShaderTable@@3UByteAddressBuffer@@A";
+    break;
+  default:
+    // No shader table exists for other shader kinds; without this the code
+    // below would bind an uninitialized register under an empty name.
+    ThrowFailure();
+    break;
+  }
+  unsigned int ShaderRecordID = AddSRVRawBuffer(M, bufferRegister, FallbackLayerRegisterSpace, bufferName);
+
+  auto It = Instructions.begin();
+  OP *HlslOP = DM.GetOP();
+  LLVMContext &Ctx = M.getContext();
+
+  // Everything below is inserted before the first instruction of the entry block.
+  IRBuilder<> Builder(It);
+
+  // Handle to the shader table SRV.
+  {
+    llvm::Value *Symbol = DM.GetSRV(ShaderRecordID).GetGlobalSymbol();
+    llvm::Value *Load = Builder.CreateLoad(Symbol, "LoadShaderTableHandle");
+
+    Function *CreateHandleForLib = HlslOP->GetOpFunc(DXIL::OpCode::CreateHandleForLib, Load->getType());
+    Constant *CreateHandleOpcodeArg = HlslOP->GetU32Const((unsigned)DXIL::OpCode::CreateHandleForLib);
+    ShaderTableHandle = Builder.CreateCall(CreateHandleForLib, { CreateHandleOpcodeArg, Load }, "ShaderTableHandle");
+  }
+
+  // Handle to the dispatch constants cbuffer.
+  {
+    const char *CbufferName = "Constants";
+    const unsigned int sizeOfConstantsInBytes = sizeof(DispatchRaysConstants);
+    llvm::StructType *StructTy = M.getTypeByName(CbufferName);
+    if (!StructTy)
+    {
+        // The type is not in the module yet: model the constants as a flat
+        // struct of 32-bit integers covering sizeof(DispatchRaysConstants).
+        const unsigned int numUintsInConstants = sizeOfConstantsInBytes / sizeof(unsigned int);
+        SmallVector<llvm::Type*, numUintsInConstants> Elements(numUintsInConstants, Type::getInt32Ty(Ctx));
+        StructTy = llvm::StructType::create(Elements, CbufferName);
+        AddAnnoationsIfNeeded(DM, StructTy, std::string(CbufferName), numUintsInConstants);
+    }
+
+    unsigned int handle = AddHandle(M, FallbackLayerDispatchConstantsRegister, 1, FallbackLayerRegisterSpace, DXIL::ResourceClass::CBuffer, DXIL::ResourceKind::CBuffer, CbufferName, StructTy, sizeOfConstantsInBytes);
+
+    llvm::Value *Symbol = DM.GetCBuffer(handle).GetGlobalSymbol();
+    llvm::Value *Load = Builder.CreateLoad(Symbol, "DispatchRaysConstants");
+
+    Function *CreateHandleForLib = HlslOP->GetOpFunc(DXIL::OpCode::CreateHandleForLib, Load->getType());
+    Constant *CreateHandleOpcodeArg = HlslOP->GetU32Const((unsigned)DXIL::OpCode::CreateHandleForLib);
+    DispatchRaysConstantsHandle = Builder.CreateCall(CreateHandleForLib, { CreateHandleOpcodeArg, Load }, CbufferName);
+  }
+
+  // Raygen always reads from the start so no offset calculations needed
+  if (ShaderKind != DXIL::ShaderKind::RayGeneration)
+  {
+      std::string ShaderRecordOffsetFuncName = "\x1?Fallback_ShaderRecordOffset@@YAIXZ";
+      Function *ShaderRecordOffsetFunc = M.getFunction(ShaderRecordOffsetFuncName);
+      if (!ShaderRecordOffsetFunc)
+      {
+          // Declare the external function; its definition is not created here.
+          FunctionType *ShaderRecordOffsetFuncType = FunctionType::get(llvm::Type::getInt32Ty(Ctx), {}, false);
+          ShaderRecordOffsetFunc = Function::Create(ShaderRecordOffsetFuncType, GlobalValue::LinkageTypes::ExternalLinkage, ShaderRecordOffsetFuncName, &M);
+      }
+      BaseShaderRecordOffset = Builder.CreateCall(ShaderRecordOffsetFunc, {}, "shaderRecordOffset");
+  }
+  else
+  {
+      BaseShaderRecordOffset = HlslOP->GetU32Const(0);
+  }
+}
+
+// Emits `CbufferOffsetInBytes + RecordOffsetInBytes` and returns the sum.
+// RecordOffsetInBytes is materialized as a 32-bit constant; the add is named
+// "ShaderRecordOffsetInBytes".
+llvm::Value *DxilPatchShaderRecordBindings::CreateOffsetToShaderRecord(Module &M, IRBuilder<> &Builder, unsigned int RecordOffsetInBytes, llvm::Value *CbufferOffsetInBytes) {
+  OP *DxilOps = M.GetOrCreateDxilModule().GetOP();
+
+  // Offset of the constants inside the shader record buffer
+  Constant *RecordOffsetConst = DxilOps->GetU32Const(RecordOffsetInBytes);
+  return Builder.CreateAdd(CbufferOffsetInBytes, RecordOffsetConst, "ShaderRecordOffsetInBytes");
+}
+
+// Emits a CBufferLoadLegacy call (i32 overload) that reads row `RowToLoad`
+// through `ResourceHandle`, and returns the call.
+llvm::Value *DxilPatchShaderRecordBindings::CreateCBufferLoadLegacy(Module &M, IRBuilder<> &Builder, llvm::Value *ResourceHandle, unsigned int RowToLoad) {
+  OP *DxilOps = M.GetOrCreateDxilModule().GetOP();
+  llvm::Type *i32Ty = Type::getInt32Ty(M.getContext());
+
+  Function *loadFn = DxilOps->GetOpFunc(DXIL::OpCode::CBufferLoadLegacy, i32Ty);
+  Constant *opcodeArg = DxilOps->GetU32Const((unsigned)DXIL::OpCode::CBufferLoadLegacy);
+  Constant *rowArg = DxilOps->GetU32Const(RowToLoad);
+  return Builder.CreateCall(loadFn, { opcodeArg, ResourceHandle, rowArg }, "ConstantBuffer");
+}
+
+// Emits a BufferLoad from ShaderTableHandle at `ShaderRecordOffsetInBytes`
+// and returns the call. When `type` contains more than one type, its first
+// contained type selects the BufferLoad overload.
+llvm::Value *DxilPatchShaderRecordBindings::CreateShaderRecordBufferLoad(Module &M, IRBuilder<> &Builder, llvm::Value *ShaderRecordOffsetInBytes, llvm::Type* type) {
+  OP *DxilOps = M.GetOrCreateDxilModule().GetOP();
+  LLVMContext &Context = M.getContext();
+
+  // TODO: Buffer loads aren't legal with container types, check if this is the right way to handle this
+  llvm::Type *overloadTy = type;
+  if (overloadTy->getNumContainedTypes() > 1)
+  {
+      overloadTy = overloadTy->getContainedType(0);
+  }
+
+  // TODO Do I need to check the result? Hopefully not
+  Function *loadFn = DxilOps->GetOpFunc(DXIL::OpCode::BufferLoad, overloadTy);
+  Constant *opcodeArg = DxilOps->GetU32Const((unsigned)DXIL::OpCode::BufferLoad);
+  // The last coordinate operand is passed as undef.
+  Constant *undefCoord = UndefValue::get(llvm::Type::getInt32Ty(Context));
+  return Builder.CreateCall(loadFn, { opcodeArg, ShaderTableHandle, ShaderRecordOffsetInBytes, undefCoord }, "ShaderRecordBuffer");
+}
+
+// Rewrites every user of `InstructionToReplace` to use `ValueToReplaceWith`
+// instead, then erases `InstructionToReplace` from its parent.
+// Operands are replaced per user through User::replaceUsesOfWith.
+void DxilPatchShaderRecordBindings::ReplaceUsesOfWith(llvm::Instruction *InstructionToReplace, llvm::Value *ValueToReplaceWith) {
+  // Snapshot the users before touching any operand. Replacing operands edits
+  // the use list being walked, and a user that references the instruction in
+  // several operands appears several times in that list, so stepping a live
+  // iterator past "the next use" is not safe. Duplicate entries in the
+  // snapshot are harmless: the second call finds nothing left to replace.
+  SmallVector<User *, 8> Users(InstructionToReplace->user_begin(), InstructionToReplace->user_end());
+  for (User *U : Users) {
+    U->replaceUsesOfWith(InstructionToReplace, ValueToReplaceWith);
+  }
+  InstructionToReplace->eraseFromParent();
+}
+
+// Returns the byte offset read by a cbuffer load instruction:
+//  - CBufferLoad: its byteOffset operand, unchanged.
+//  - CBufferLoadLegacy: its regIndex operand multiplied by 16 (emitted via Builder).
+// Any other instruction is a failure.
+llvm::Value *DxilPatchShaderRecordBindings::CreateCBufferLoadOffsetInBytes(Module &M, IRBuilder<> &Builder, llvm::Instruction *instruction) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  OP *DxilOps = DM.GetOP();
+
+  DxilInst_CBufferLoad byteAddressedLoad(instruction);
+  if (byteAddressedLoad)
+    return byteAddressedLoad.get_byteOffset();
+
+  DxilInst_CBufferLoadLegacy rowAddressedLoad(instruction);
+  if (rowAddressedLoad) {
+    Constant *bytesPerRow = DxilOps->GetU32Const(16);
+    return Builder.CreateMul(rowAddressedLoad.get_regIndex(), bytesPerRow);
+  }
+
+  ThrowFailure();
+  return nullptr;
+}
+
+// True when `instruction` is a CBufferLoad or a CBufferLoadLegacy call.
+bool DxilPatchShaderRecordBindings::IsCBufferLoad(llvm::Instruction *instruction) {
+  DxilInst_CBufferLoad asCBufferLoad(instruction);
+  if (asCBufferLoad)
+    return true;
+
+  DxilInst_CBufferLoadLegacy asCBufferLoadLegacy(instruction);
+  if (asCBufferLoadLegacy)
+    return true;
+
+  return false;
+}
+
+// Returns the zero-extended value of `rangeIdVal` when it is a ConstantInt.
+// Any other value asserts and yields 0. `resClass` is not used.
+const unsigned int GetResolvedRangeID(DXIL::ResourceClass resClass, Value *rangeIdVal)
+{
+  ConstantInt *constantRangeId = dyn_cast<ConstantInt>(rangeIdVal);
+  if (!constantRangeId)
+  {
+    assert(false);
+    return 0;
+  }
+  return constantRangeId->getZExtValue();
+}
+
+// TODO: This code is quite inefficient
+// Resolves the module resource behind a CreateHandleForLib call.
+// The call's resource operand must be a load; the loaded pointer is compared
+// against the global symbol of every cbuffer, SRV, UAV and sampler in the
+// module (in that order) and the search stops at the first match.
+// On success the out-parameters receive the resource's lower bound, space,
+// kind, class and the pointee type of its global symbol, and true is returned.
+// When nothing matches, the out-parameters are left untouched and false is
+// returned.
+// TODO: This is still a linear scan per handle
+bool DxilPatchShaderRecordBindings::GetHandleInfo(
+  Module &M,
+  DxilInst_CreateHandleForLib &createHandleStructForLib,
+  _Out_ unsigned int &shaderRegister,
+  _Out_ unsigned int &registerSpace,
+  _Out_ DXIL::ResourceKind &kind,
+  _Out_ DXIL::ResourceClass &resClass,
+  _Out_ llvm::Type *&resType)
+{
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  LoadInst *resourceLoad = cast<LoadInst>(createHandleStructForLib.get_Resource());
+  Value *ResourceSymbol = resourceLoad->getPointerOperand();
+
+  hlsl::DxilResourceBase *Resource = nullptr;
+
+  for (auto &cbuffer : DM.GetCBuffers())
+  {
+    if (cbuffer->GetGlobalSymbol() == ResourceSymbol)
+    {
+      Resource = cbuffer.get();
+      break;
+    }
+  }
+
+  if (!Resource)
+  {
+    for (auto &srv : DM.GetSRVs())
+    {
+      if (srv->GetGlobalSymbol() == ResourceSymbol)
+      {
+        Resource = srv.get();
+        break;
+      }
+    }
+  }
+
+  if (!Resource)
+  {
+    for (auto &uav : DM.GetUAVs())
+    {
+      if (uav->GetGlobalSymbol() == ResourceSymbol)
+      {
+        Resource = uav.get();
+        break;
+      }
+    }
+  }
+
+  if (!Resource)
+  {
+    for (auto &sampler : DM.GetSamplers())
+    {
+      if (sampler->GetGlobalSymbol() == ResourceSymbol)
+      {
+        Resource = sampler.get();
+        break;
+      }
+    }
+  }
+
+  if (!Resource)
+  {
+    return false;
+  }
+
+  registerSpace = Resource->GetSpaceID();
+  shaderRegister = Resource->GetLowerBound();
+  kind = Resource->GetKind();
+  resClass = Resource->GetClass();
+  resType = cast<GlobalVariable>(Resource->GetGlobalSymbol())->getType()->getPointerElementType();
+  return true;
+}
+
+// Emits a shader-table load (i32 overload) at
+// `dataOffsetInShaderRecord + offsetToShaderRecord` and returns the load.
+llvm::Value *DxilPatchShaderRecordBindings::LoadShaderRecordData(
+    Module &M,
+    IRBuilder<> &Builder,
+    llvm::Value *offsetToShaderRecord,
+    unsigned int dataOffsetInShaderRecord)
+{
+  OP *DxilOps = M.GetOrCreateDxilModule().GetOP();
+  llvm::Type *i32Ty = llvm::Type::getInt32Ty(M.getContext());
+
+  Constant *offsetWithinRecord = DxilOps->GetU32Const(dataOffsetInShaderRecord);
+  Value *offsetWithinTable = Builder.CreateAdd(offsetWithinRecord, offsetToShaderRecord);
+  return CreateShaderRecordBufferLoad(M, Builder, offsetWithinTable, i32Ty);
+}
+
+// Redirects an existing CreateHandleForLib call so that its resource operand
+// is element `descriptorIndex` of the aliased descriptor heap array obtained
+// for (resourceType, resourceClass, resourceKind). The GEP into the heap is
+// marked non-uniform, and the call's callee is switched to the
+// CreateHandleForLib overload for the loaded element's type.
+void DxilPatchShaderRecordBindings::PatchCreateHandleToUseDescriptorIndex(
+    _In_ Module &M,
+    _In_ IRBuilder<> &Builder,
+    _In_ DXIL::ResourceKind &resourceKind,
+    _In_ DXIL::ResourceClass &resourceClass,
+    _In_ llvm::Type *resourceType,
+    _In_ llvm::Value *descriptorIndex,
+    _Inout_ DxilInst_CreateHandleForLib &createHandleInstr)
+{
+    OP *DxilOps = M.GetOrCreateDxilModule().GetOP();
+
+    // Address of heap[descriptorIndex]
+    llvm::Value *heapArray = GetAliasedDescriptorHeapHandle(M, resourceType, resourceClass, resourceKind);
+    llvm::Value *heapElementPtr = Builder.CreateGEP(heapArray, { DxilOps->GetU32Const(0), descriptorIndex }, "IndexIntoDH");
+    DxilMDHelper::MarkNonUniform(cast<Instruction>(heapElementPtr));
+    llvm::Value *heapElement = Builder.CreateLoad(heapElementPtr);
+
+    // Retarget the existing call: new overload first, then the new operand.
+    Function *createHandleFn = DxilOps->GetOpFunc(
+        DXIL::OpCode::CreateHandleForLib,
+        heapElement->getType());
+    CallInst *createHandleCall = cast<CallInst>(createHandleInstr.Instr);
+    createHandleCall->setCalledFunction(createHandleFn);
+    createHandleInstr.set_Resource(heapElement);
+}
+
+// Seeds the UAV register-space list with a raw-buffer entry when the list is
+// still empty; that entry is expected to land at index 0.
+void DxilPatchShaderRecordBindings::InitializeViewTable() {
+    // Nothing to seed once at least one UAV space has been recorded.
+    if (*pInputShaderInfo->pNumUAVSpaces != 0)
+    {
+        return;
+    }
+
+    // The Fallback Layer declares a bindless raw buffer that spans the entire descriptor heap,
+    // manually add it to the list of UAV register spaces used
+    ViewKey rawBufferKey = { (unsigned int)hlsl::DXIL::ResourceKind::RawBuffer, 0 };
+    unsigned int insertedAt = FindOrInsertViewIntoList(
+      rawBufferKey,
+      pInputShaderInfo->pUAVRegisterSpaceArray,
+      *pInputShaderInfo->pNumUAVSpaces,
+      FallbackLayerNumDescriptorHeapSpacesPerView);
+    (void)insertedAt;
+    assert(insertedAt == 0);
+}
+
+
+// Rewrites every CreateHandleForLib call in the entry point whose resource is
+// bound through the local root signature so that the binding is read from the
+// shader record instead:
+//  - Constants32Bit: each cbuffer load on the handle becomes a raw-buffer load
+//    from the shader table; the handle itself is erased at the very end.
+//  - DescriptorTable: the handle is redirected into the aliased descriptor
+//    heap at (table start index + offset of the range within the table).
+//  - CBV / SRV / UAV: the handle is redirected into the aliased descriptor
+//    heap at the index stored in the shader record.
+// Handles that cannot be resolved to a module resource, or that have no entry
+// in the root signature, are left untouched.
+void DxilPatchShaderRecordBindings::PatchShaderBindings(Module &M) {
+  DxilModule &DM = M.GetOrCreateDxilModule();
+  OP *HlslOP = DM.GetOP();
+
+  // Don't erase handle instructions until the very end because it throws off the iterator
+  std::vector<llvm::Instruction *> instructionsToRemove;
+  for (BasicBlock &block : EntryPointFunction->getBasicBlockList()) {
+    for (auto &instr : block.getInstList()) {
+      DxilInst_CreateHandleForLib createHandleForLib(&instr);
+      if (!createHandleForLib)
+        continue;
+
+      DXIL::ResourceClass resourceClass;
+      unsigned int registerSpace;
+      unsigned int registerIndex;
+      DXIL::ResourceKind kind;
+      llvm::Type *resType;
+      const bool resourceIsResolved = GetHandleInfo(M, createHandleForLib, registerIndex, registerSpace, kind, resourceClass, resType);
+
+      if (!resourceIsResolved) continue; // TODO: This shouldn't actually be happening?
+
+      ShaderRecordEntry shaderRecord = FindRootSignatureDescriptor(
+        *pRootSignatureDesc,
+        pInputShaderInfo->ShaderRecordIdentifierSizeInBytes,
+        resourceClass,
+        registerIndex,
+        registerSpace);
+
+      const bool IsBindingSpecifiedInLocalRootSignature = !shaderRecord.IsInvalid();
+      if (!IsBindingSpecifiedInLocalRootSignature)
+        continue;
+
+      // Lazily create the shader table / constants inputs on first use.
+      if (!DispatchRaysConstantsHandle) {
+        AddInputBinding(M);
+      }
+
+      switch (shaderRecord.ParameterType) {
+      case DxilRootParameterType::Constants32Bit:
+      {
+        // Snapshot the users first: ReplaceUsesOfWith erases each cbuffer
+        // load, which unlinks it from this handle's use list and would
+        // invalidate a live users() iterator. Only cbuffer loads are erased,
+        // and the loads built in this file pass the handle as one operand, so
+        // the snapshot is not expected to revisit an erased instruction.
+        SmallVector<User *, 8> handleUsers(instr.user_begin(), instr.user_end());
+        for (User *U : handleUsers) {
+          llvm::Instruction *cbufferLoadInstr = cast<CallInst>(U);
+          if (!IsCBufferLoad(cbufferLoadInstr)) {
+            ThrowFailure();
+            continue;
+          }
+
+          IRBuilder<> Builder(cbufferLoadInstr);
+
+          // offset = offset inside the cbuffer + offset of the constants in
+          // the record + offset of the record in the shader table
+          llvm::Value *cbufferOffsetInBytes = CreateCBufferLoadOffsetInBytes(M, Builder, cbufferLoadInstr);
+          llvm::Value *LocalOffsetToRootConstant = CreateOffsetToShaderRecord(M, Builder, shaderRecord.RecordOffsetInBytes, cbufferOffsetInBytes);
+          llvm::Value *GlobalOffsetToRootConstant = Builder.CreateAdd(LocalOffsetToRootConstant, BaseShaderRecordOffset);
+          llvm::Value *srvBufferLoad = CreateShaderRecordBufferLoad(M, Builder, GlobalOffsetToRootConstant, cbufferLoadInstr->getType());
+          ReplaceUsesOfWith(cbufferLoadInstr, srvBufferLoad);
+        }
+        instructionsToRemove.push_back(&instr);
+        break;
+      }
+      case DxilRootParameterType::DescriptorTable:
+      {
+        IRBuilder<> Builder(&instr);
+        llvm::Value *srvBufferLoad = LoadShaderRecordData(
+         M,
+         Builder,
+         BaseShaderRecordOffset,
+         shaderRecord.RecordOffsetInBytes);
+
+        llvm::Value *DescriptorTableEntryLo = Builder.CreateExtractValue(srvBufferLoad, 0, "DescriptorTableHandleLo");
+
+        // Locate SrvCbvUavDescriptorHeapStart inside the constants cbuffer:
+        // legacy cbuffer loads return one row of four 32-bit values.
+        unsigned int offsetToLoadInUints = offsetof(DispatchRaysConstants, SrvCbvUavDescriptorHeapStart) / sizeof(uint32_t);
+        unsigned int uintsPerRow = 4;
+        unsigned int rowToLoad = offsetToLoadInUints / uintsPerRow;
+        unsigned int extractValueOffset = offsetToLoadInUints % uintsPerRow;
+        llvm::Value *DescHeapConstants = CreateCBufferLoadLegacy(M, Builder, DispatchRaysConstantsHandle, rowToLoad);
+        llvm::Value *DescriptorHeapStartAddressLo = Builder.CreateExtractValue(DescHeapConstants, extractValueOffset, "DescriptorHeapStartHandleLo");
+
+        // TODO: Only the low 32 bits of both addresses are used here. Ignoring
+        // the high bits is an unsafe assumption, particularly given large
+        // descriptor sizes
+        llvm::Value *DescriptorTableOffsetInBytes = Builder.CreateSub(DescriptorTableEntryLo, DescriptorHeapStartAddressLo, "TableOffsetInBytes");
+
+        Constant *DescriptorSizeInBytes = HlslOP->GetU32Const(pInputShaderInfo->SrvCbvUavDescriptorSizeInBytes);
+        llvm::Value *DescriptorTableStartIndex = Builder.CreateExactUDiv(DescriptorTableOffsetInBytes, DescriptorSizeInBytes, "TableStartIndex");
+
+        Constant *RecordOffset = HlslOP->GetU32Const(shaderRecord.OffsetInDescriptors);
+        llvm::Value *BaseDescriptorIndex = Builder.CreateAdd(DescriptorTableStartIndex, RecordOffset, "BaseDescriptorIndex");
+
+        // TODO: Not supporting dynamic indexing yet, should be pulled from CreateHandleForLib
+        // If dynamic indexing is being used, add the apps index on top of the calculated index
+        llvm::Value *DynamicIndex = HlslOP->GetU32Const(0);
+
+        llvm::Value *DescriptorIndex = Builder.CreateAdd(BaseDescriptorIndex, DynamicIndex, "DescriptorIndex");
+        PatchCreateHandleToUseDescriptorIndex(
+            M,
+            Builder,
+            kind,
+            resourceClass,
+            resType,
+            DescriptorIndex,
+            createHandleForLib);
+        break;
+      }
+      case DxilRootParameterType::CBV:
+      case DxilRootParameterType::SRV:
+      case DxilRootParameterType::UAV: {
+        IRBuilder<> Builder(&instr);
+        llvm::Value *srvBufferLoad = LoadShaderRecordData(
+         M,
+         Builder,
+         BaseShaderRecordOffset,
+         shaderRecord.RecordOffsetInBytes);
+
+        // Element 1 of the loaded record entry is used as the heap index.
+        llvm::Value *DescriptorIndex = Builder.CreateExtractValue(
+            srvBufferLoad, 1, "DescriptorHeapIndex");
+
+        // TODO: Handle offset in bytes
+        // llvm::Value *OffsetInBytes = Builder.CreateExtractValue(
+        //     srvBufferLoad, 0, "OffsetInBytes");
+
+        PatchCreateHandleToUseDescriptorIndex(
+            M,
+            Builder,
+            kind,
+            resourceClass,
+            resType,
+            DescriptorIndex,
+            createHandleForLib);
+
+        break;
+      }
+      default:
+        ThrowFailure();
+        break;
+      }
+    }
+  }
+
+  for (auto instruction : instructionsToRemove) {
+    instruction->eraseFromParent();
+  }
+
+}
+
+// Whether a root parameter of type `parameterType` can bind a resource of
+// class `resourceClass`. Descriptor tables accept every class; root constants
+// and root CBVs require a cbuffer; root SRVs/UAVs require the matching class.
+// An unknown parameter type is a failure.
+bool IsParameterTypeCompatibleWithResourceClass(
+  DXIL::ResourceClass resourceClass,
+  DxilRootParameterType parameterType) {
+  if (parameterType == DxilRootParameterType::DescriptorTable)
+    return true;
+
+  if (parameterType == DxilRootParameterType::Constants32Bit ||
+      parameterType == DxilRootParameterType::CBV)
+    return resourceClass == DXIL::ResourceClass::CBuffer;
+
+  if (parameterType == DxilRootParameterType::SRV)
+    return resourceClass == DXIL::ResourceClass::SRV;
+
+  if (parameterType == DxilRootParameterType::UAV)
+    return resourceClass == DXIL::ResourceClass::UAV;
+
+  ThrowFailure();
+  return false;
+}
+
+// Maps a root parameter type onto DxilRootParameterType. Both sides are the
+// same enum here, so each known value is returned unchanged; any other value
+// asserts and yields (DxilRootParameterType)-1.
+DxilRootParameterType ConvertD3D12ParameterTypeToDxil(DxilRootParameterType parameter) {
+  const bool isKnownType =
+      parameter == DxilRootParameterType::Constants32Bit ||
+      parameter == DxilRootParameterType::DescriptorTable ||
+      parameter == DxilRootParameterType::CBV ||
+      parameter == DxilRootParameterType::SRV ||
+      parameter == DxilRootParameterType::UAV;
+  if (isKnownType)
+    return parameter;
+
+  assert(false);
+  return (DxilRootParameterType)-1;
+}
+
+// Maps a descriptor range type to the DXIL resource class it binds
+// (SRV -> SRV, UAV -> UAV, CBV -> CBuffer, Sampler -> Sampler). Any other
+// value asserts and yields (DXIL::ResourceClass)-1.
+DXIL::ResourceClass ConvertD3D12RangeTypeToDxil(DxilDescriptorRangeType rangeType) {
+  if (rangeType == DxilDescriptorRangeType::SRV)
+    return DXIL::ResourceClass::SRV;
+  if (rangeType == DxilDescriptorRangeType::UAV)
+    return DXIL::ResourceClass::UAV;
+  if (rangeType == DxilDescriptorRangeType::CBV)
+    return DXIL::ResourceClass::CBuffer;
+  if (rangeType == DxilDescriptorRangeType::Sampler)
+    return DXIL::ResourceClass::Sampler;
+
+  assert(false);
+  return (DXIL::ResourceClass) - 1;
+}
+
+// Alignment, in bytes, applied to a root parameter's slot in the shader
+// record: descriptor tables use the GPU descriptor handle size, root
+// constants use 4 bytes, root CBV/SRV/UAV use the GPU virtual address size.
+// Unknown parameter types yield UINT_MAX.
+unsigned int GetParameterTypeAlignment(DxilRootParameterType parameterType) {
+  if (parameterType == DxilRootParameterType::DescriptorTable)
+    return SizeofD3D12GpuDescriptorHandle;
+
+  if (parameterType == DxilRootParameterType::Constants32Bit)
+    return sizeof(uint32_t);
+
+  const bool isRootDescriptor =
+      parameterType == DxilRootParameterType::CBV ||
+      parameterType == DxilRootParameterType::SRV ||
+      parameterType == DxilRootParameterType::UAV;
+  if (isRootDescriptor)
+    return SizeofD3D12GpuVA;
+
+  return UINT_MAX;
+}
+
+// Walks the root parameters of `rootSignatureDescriptor` in order, tracking
+// the byte offset each parameter occupies in a shader record (starting right
+// after the shader identifier and aligned per parameter type), and returns
+// the entry for the parameter that binds (resourceClass, baseRegisterIndex,
+// registerSpace):
+//  - root constants / root descriptors: {type, record offset}
+//  - descriptor tables: {type, record offset, offset of the register within
+//    the table, in descriptors}
+// Returns ShaderRecordEntry::InvalidEntry() when nothing matches, or when the
+// register space is the fallback layer's own space.
+template <typename TD3D12_ROOT_SIGNATURE_DESC>
+ShaderRecordEntry FindRootSignatureDescriptorHelper(
+    const TD3D12_ROOT_SIGNATURE_DESC &rootSignatureDescriptor,
+    unsigned int ShaderRecordIdentifierSizeInBytes,
+    DXIL::ResourceClass resourceClass, unsigned int baseRegisterIndex,
+    unsigned int registerSpace) {
+  // Automatically fail if it's looking for a fallback binding as these never
+  // need to be patched
+  if (registerSpace == FallbackLayerRegisterSpace) {
+    return ShaderRecordEntry::InvalidEntry();
+  }
+
+  // Rounds `value` up to the next multiple of `alignment`.
+  const auto alignUp = [](unsigned int alignment, unsigned int value) {
+    return ((value + alignment - 1) / alignment) * alignment;
+  };
+
+  // A range offset of -1 (all bits set) means the range continues right after
+  // the previous one instead of starting at an explicit offset.
+  const unsigned int offsetAppend = static_cast<unsigned int>(-1);
+
+  unsigned int recordOffset = ShaderRecordIdentifierSizeInBytes;
+  for (unsigned int rootParamIndex = 0;
+       rootParamIndex < rootSignatureDescriptor.NumParameters;
+       rootParamIndex++) {
+    auto &rootParam = rootSignatureDescriptor.pParameters[rootParamIndex];
+    auto dxilParamType =
+        ConvertD3D12ParameterTypeToDxil(rootParam.ParameterType);
+
+    recordOffset = alignUp(GetParameterTypeAlignment(rootParam.ParameterType),
+                           recordOffset);
+
+    switch (rootParam.ParameterType) {
+    case DxilRootParameterType::Constants32Bit:
+      if (IsParameterTypeCompatibleWithResourceClass(resourceClass,
+                                                     dxilParamType) &&
+          baseRegisterIndex == rootParam.Constants.ShaderRegister &&
+          registerSpace == rootParam.Constants.RegisterSpace) {
+        return {dxilParamType, recordOffset};
+      }
+      recordOffset += rootParam.Constants.Num32BitValues * sizeof(uint32_t);
+      break;
+    case DxilRootParameterType::DescriptorTable: {
+      auto &descriptorTable = rootParam.DescriptorTable;
+
+      unsigned int rangeOffsetInDescriptors = 0;
+      for (unsigned int rangeIndex = 0;
+           rangeIndex < descriptorTable.NumDescriptorRanges; rangeIndex++) {
+        auto &range = descriptorTable.pDescriptorRanges[rangeIndex];
+        if (range.OffsetInDescriptorsFromTableStart != offsetAppend) {
+          rangeOffsetInDescriptors = range.OffsetInDescriptorsFromTableStart;
+        }
+
+        // Test membership as (index - base) < count rather than
+        // base + count > index: the sum wraps around when the range is
+        // unbounded (count == UINT_MAX), which made such ranges never match.
+        if (ConvertD3D12RangeTypeToDxil(range.RangeType) == resourceClass &&
+            range.RegisterSpace == registerSpace &&
+            range.BaseShaderRegister <= baseRegisterIndex &&
+            baseRegisterIndex - range.BaseShaderRegister <
+                range.NumDescriptors) {
+          rangeOffsetInDescriptors +=
+              baseRegisterIndex - range.BaseShaderRegister;
+          return {dxilParamType, recordOffset, rangeOffsetInDescriptors};
+        }
+
+        rangeOffsetInDescriptors += range.NumDescriptors;
+      }
+
+      // The whole table takes one GPU descriptor handle in the record.
+      recordOffset += SizeofD3D12GpuDescriptorHandle;
+      break;
+    }
+    case DxilRootParameterType::CBV:
+    case DxilRootParameterType::SRV:
+    case DxilRootParameterType::UAV:
+      if (IsParameterTypeCompatibleWithResourceClass(resourceClass,
+                                                     dxilParamType) &&
+          baseRegisterIndex == rootParam.Descriptor.ShaderRegister &&
+          registerSpace == rootParam.Descriptor.RegisterSpace) {
+        return {dxilParamType, recordOffset};
+      }
+
+      recordOffset += SizeofD3D12GpuVA;
+      break;
+    }
+  }
+  return ShaderRecordEntry::InvalidEntry();
+}
+
+// TODO: Consider pre-calculating this into a map
+// Dispatches the lookup to FindRootSignatureDescriptorHelper using the
+// descriptor layout that matches the root signature version (1.0 or 1.1).
+// Any other version is a failure and yields an invalid entry.
+ShaderRecordEntry DxilPatchShaderRecordBindings::FindRootSignatureDescriptor(
+  const DxilVersionedRootSignatureDesc &rootSignatureDescriptor,
+  unsigned int ShaderRecordIdentifierSizeInBytes,
+  DXIL::ResourceClass resourceClass,
+  unsigned int baseRegisterIndex,
+  unsigned int registerSpace) {
+  const DxilRootSignatureVersion version = rootSignatureDescriptor.Version;
+
+  if (version == DxilRootSignatureVersion::Version_1_0) {
+    return FindRootSignatureDescriptorHelper(rootSignatureDescriptor.Desc_1_0, ShaderRecordIdentifierSizeInBytes, resourceClass, baseRegisterIndex, registerSpace);
+  }
+  if (version == DxilRootSignatureVersion::Version_1_1) {
+    return FindRootSignatureDescriptorHelper(rootSignatureDescriptor.Desc_1_1, ShaderRecordIdentifierSizeInBytes, resourceClass, baseRegisterIndex, registerSpace);
+  }
+
+  ThrowFailure();
+  return ShaderRecordEntry::InvalidEntry();
+}
+
+
+
+

+ 75 - 0
lib/HLSL/DxilPatchShaderRecordBindingsShared.h

@@ -0,0 +1,75 @@
+#pragma once
+
+#define FallbackLayerRegisterSpace 214743647 // NOTE(review): possibly meant to be 2147483647 (INT_MAX); confirm against every user before changing
+
+// SRVs: registers of the per-shader-kind shader table raw buffers
+#define FallbackLayerHitGroupRecordByteAddressBufferRegister 0
+#define FallbackLayerMissShaderRecordByteAddressBufferRegister 1
+#define FallbackLayerRayGenShaderRecordByteAddressBufferRegister 2
+#define FallbackLayerCallableShaderRecordByteAddressBufferRegister 3
+
+// SRV & UAV
+#define FallbackLayerDescriptorHeapTable 0
+
+// There's a driver issue on some hardware that has issues
+// starting a bindless table on any register but 0, so
+// make sure each bindless table has its own register space
+#define FallbackLayerDescriptorHeapSpaceOffset 1
+#define FallbackLayerNumDescriptorHeapSpacesPerView 10
+
+// CBVs
+#define FallbackLayerDispatchConstantsRegister 0
+#define FallbackLayerAccelerationStructureList 1
+
+#ifndef HLSL
+struct ViewKey { // Key describing one kind of view in the SRV/UAV register-space lists
+  unsigned int ViewType; // A DXIL::ResourceKind value stored as unsigned int (see InitializeViewTable)
+  union
+  {
+    unsigned int StructuredStride; // When ViewType == StructuredBuffer
+    unsigned int SRVComponentType; // When ViewType != StructuredBuffer && ViewType != RawBuffer
+  };
+};
+
+struct ShaderInfo { // Per-shader inputs and in/out lists consumed by the shader record patching pass
+  const wchar_t *ExportName;
+  unsigned int SamplerDescriptorSizeInBytes;
+  unsigned int SrvCbvUavDescriptorSizeInBytes; // Divisor used to turn a byte offset into the descriptor heap into a descriptor index
+  unsigned int ShaderRecordIdentifierSizeInBytes; // Root arguments start at this byte offset within a shader record
+  const void *pRootSignatureDesc; // Untyped here; presumably a versioned root signature description - TODO confirm
+
+  ViewKey *pSRVRegisterSpaceArray; // Caller-owned list of SRV view keys
+  unsigned int *pNumSRVSpaces; // Number of entries in pSRVRegisterSpaceArray
+
+  ViewKey *pUAVRegisterSpaceArray; // Caller-owned list of UAV view keys; the pass may insert into it
+  unsigned int *pNumUAVSpaces; // Number of entries in pUAVRegisterSpaceArray
+};
+
+struct DispatchRaysConstants { // Layout of the "Constants" cbuffer; sizeof() of this struct sizes that cbuffer
+  uint32_t RayDispatchDimensionsWidth;
+  uint32_t RayDispatchDimensionsHeight;
+  uint32_t HitGroupShaderRecordStride;
+  uint32_t MissShaderRecordStride;
+
+  // 64-bit values; the patching pass locates SrvCbvUavDescriptorHeapStart via offsetof and reads its first 32-bit word
+  uint64_t SamplerDescriptorHeapStart;
+  uint64_t SrvCbvUavDescriptorHeapStart;
+};
+
+enum DescriptorRangeTypes { SRV = 0, CBV, UAV, Sampler, NumRangeTypes }; // NumRangeTypes is the count of range types, not a range type itself
+
+enum RootSignatureParameterOffset { // Presumably the root parameter slots of the fallback layer's own root signature - TODO confirm
+  HitGroupRecord = 0,
+  MissShaderRecord,
+  RayGenShaderRecord,
+  CallableShaderRecord,
+  DispatchConstants,
+  CbvSrvUavDescriptorHeapAliasedTables,
+  SamplerDescriptorHeapAliasedTables,
+  AccelerationStructuresList,
+#if ENABLE_UAV_LOG
+  DebugUAVLog, // Only present when ENABLE_UAV_LOG is set; shifts NumParameters by one
+#endif
+  NumParameters // Count of the enumerators above
+};
+#endif

+ 97 - 24
lib/HLSL/DxilPreparePasses.cpp

@@ -11,6 +11,7 @@
 
 #include "dxc/HLSL/DxilGenerationPass.h"
 #include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/HLOperations.h"
 #include "dxc/HLSL/DxilModule.h"
 #include "dxc/Support/Global.h"
 #include "dxc/HLSL/DxilTypeSystem.h"
@@ -34,6 +35,49 @@
 using namespace llvm;
 using namespace hlsl;
 
+namespace {
+class FailUndefResource : public ModulePass {
+public:
+  static char ID;
+
+  explicit FailUndefResource() : ModulePass(ID) {
+    initializeScalarizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override { return "Fail on undef resource use"; }
+
+  bool runOnModule(Module &M) override;
+};
+}
+
+char FailUndefResource::ID = 0;
+
+// Factory for the FailUndefResource pass; the caller takes ownership.
+ModulePass *llvm::createFailUndefResourcePass() {
+  return new FailUndefResource();
+}
+
+INITIALIZE_PASS(FailUndefResource, "fail-undef-resource", "Fail on undef resource use", false, false)
+
+// For every HLCreateHandle function in the module, takes the undef value of
+// its resource parameter type and reports a resource-mapping error for each
+// instruction using that undef. Always returns false (the module is not
+// changed).
+bool FailUndefResource::runOnModule(Module &M) {
+  // Undef resources may be removed on simplify due to the interpretation
+  // of undef that any value could be substituted for identical meaning.
+  // However, these likely indicate uninitialized locals being used in
+  // some code path, which we should catch and report.
+  for (auto &Func : M.functions()) {
+    if (GetHLOpcodeGroupByName(&Func) != HLOpcodeGroup::HLCreateHandle)
+      continue;
+
+    Type *ResourceTy = Func.getFunctionType()->getParamType(
+      HLOperandIndex::kCreateHandleResourceOpIdx);
+    UndefValue *UndefResource = UndefValue::get(ResourceTy);
+    for (auto UndefUser : UndefResource->users()) {
+      // Only report instruction users.
+      Instruction *UserInst = dyn_cast<Instruction>(UndefUser);
+      if (UserInst)
+        dxilutil::EmitResMappingError(UserInst);
+    }
+  }
+  return false;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 namespace {
 class SimplifyInst : public FunctionPass {
 public:
@@ -133,8 +177,31 @@ INITIALIZE_PASS(DxilDeadFunctionElimination, "dxil-dfe", "Remove all unused func
 
 namespace {
 
-Function *StripFunctionParameter(Function *F, DxilModule &DM,
+// Carries the fp32 denorm-mode string attribute from F over to NewFunc.
+// When F and NewFunc are the same function, all of its function-index
+// attributes are removed first, so only the denorm-mode attribute (if it was
+// present with a non-empty kind and value) remains.
+static void TransferEntryFunctionAttributes(Function *F, Function *NewFunc) {
+  AttributeSet originalAttrs = F->getAttributes();
+
+  // Capture the denorm-mode attribute before anything is removed.
+  StringRef denormKind, denormValue;
+  const bool hasDenormMode = originalAttrs.hasAttribute(AttributeSet::FunctionIndex, DXIL::kFP32DenormKindString);
+  if (hasDenormMode) {
+    Attribute denormAttr = originalAttrs.getAttribute(AttributeSet::FunctionIndex, DXIL::kFP32DenormKindString);
+    DXASSERT(denormAttr.isStringAttribute(), "otherwise we have wrong fp-denorm-mode attribute.");
+    denormKind = denormAttr.getKindAsString();
+    denormValue = denormAttr.getValueAsString();
+  }
+
+  // In-place case: strip the existing attributes from the function.
+  if (F == NewFunc) {
+    NewFunc->removeAttributes(AttributeSet::FunctionIndex, originalAttrs);
+  }
+
+  if (denormKind.empty() || denormValue.empty())
+    return;
+  NewFunc->addFnAttr(denormKind, denormValue);
+}
+
+static Function *StripFunctionParameter(Function *F, DxilModule &DM,
     DenseMap<const Function *, DISubprogram *> &FunctionDIs) {
+  if (F->arg_empty() && F->getReturnType()->isVoidTy()) {
+    // This will strip non-entry function attributes
+    TransferEntryFunctionAttributes(F, F);
+    return nullptr;
+  }
+
   Module &M = *DM.GetModule();
   Type *VoidTy = Type::getVoidTy(M.getContext());
   FunctionType *FT = FunctionType::get(VoidTy, false);
@@ -152,13 +219,7 @@ Function *StripFunctionParameter(Function *F, DxilModule &DM,
   // Splice the body of the old function right into the new function.
   NewFunc->getBasicBlockList().splice(NewFunc->begin(), F->getBasicBlockList());
 
-  // Keep necessary function attributes
-  AttributeSet attributeSet = F->getAttributes();
-  if (attributeSet.hasAttribute(AttributeSet::FunctionIndex, DXIL::kFP32DenormKindString)) {
-    Attribute attribute = attributeSet.getAttribute(AttributeSet::FunctionIndex, DXIL::kFP32DenormKindString);
-    DXASSERT(attribute.isStringAttribute(), "otherwise we have wrong fp-denorm-mode attribute.");
-    NewFunc->addFnAttr(attribute.getKindAsString(), attribute.getValueAsString());
-  }
+  TransferEntryFunctionAttributes(F, NewFunc);
 
   // Patch the pointer to LLVM function in debug info descriptor.
   auto DI = FunctionDIs.find(F);
@@ -172,8 +233,7 @@ Function *StripFunctionParameter(Function *F, DxilModule &DM,
   }
   NewFunc->takeName(F);
   if (DM.HasDxilFunctionProps(F)) {
-    DM.ReplaceDxilEntrySignature(F, NewFunc);
-    DM.ReplaceDxilFunctionProps(F, NewFunc);
+    DM.ReplaceDxilEntryProps(F, NewFunc);
   }
   DM.GetTypeSystem().EraseFunctionAnnotation(F);
   F->eraseFromParent();
@@ -270,12 +330,11 @@ public:
       // Strip parameters of entry function.
       StripEntryParameters(M, DM, IsLib);
 
-      // Skip shader flag for library.
-      if (!IsLib) {
-        DM.CollectShaderFlags(); // Update flags to reflect any changes.
-                                 // Update Validator Version
-        DM.UpgradeToMinValidatorVersion();
-      }
+      // Update flags to reflect any changes.
+      DM.CollectShaderFlagsForModule();
+
+      // Update Validator Version
+      DM.UpgradeToMinValidatorVersion();
 
       return true;
     }
@@ -360,31 +419,45 @@ private:
       if (Function *PatchConstantFunc = DM.GetPatchConstantFunction()) {
         PatchConstantFunc =
             StripFunctionParameter(PatchConstantFunc, DM, FunctionDIs);
-        if (PatchConstantFunc)
+        if (PatchConstantFunc) {
           DM.SetPatchConstantFunction(PatchConstantFunc);
+        }
       }
 
       if (Function *EntryFunc = DM.GetEntryFunction()) {
         StringRef Name = DM.GetEntryFunctionName();
         EntryFunc->setName(Name);
         EntryFunc = StripFunctionParameter(EntryFunc, DM, FunctionDIs);
-        if (EntryFunc)
+        if (EntryFunc) {
           DM.SetEntryFunction(EntryFunc);
+        }
       }
     } else {
       std::vector<Function *> entries;
+      // Handle when multiple hull shaders point to the same patch constant function
+      DenseMap<Function*,Function*> patchConstantUpdates;
       for (iplist<Function>::iterator F : M.getFunctionList()) {
-        if (DM.HasDxilFunctionProps(F)) {
-          entries.emplace_back(F);
+        if (DM.IsEntryThatUsesSignatures(F)) {
+          auto *FT = F->getFunctionType();
+          // Only do this when has parameters.
+          if (FT->getNumParams() > 0 || !FT->getReturnType()->isVoidTy())
+            entries.emplace_back(F);
         }
       }
       for (Function *entry : entries) {
         DxilFunctionProps &props = DM.GetDxilFunctionProps(entry);
         if (props.IsHS()) {
           // Strip patch constant function first.
-          Function *patchConstFunc = StripFunctionParameter(
-              props.ShaderProps.HS.patchConstantFunc, DM, FunctionDIs);
-          props.ShaderProps.HS.patchConstantFunc = patchConstFunc;
+          Function* patchConstFunc = props.ShaderProps.HS.patchConstantFunc;
+          auto it = patchConstantUpdates.find(patchConstFunc);
+          if (it == patchConstantUpdates.end()) {
+            patchConstFunc = patchConstantUpdates[patchConstFunc] =
+                StripFunctionParameter(patchConstFunc, DM, FunctionDIs);
+          } else {
+            patchConstFunc = it->second;
+          }
+          if (patchConstFunc)
+            DM.SetPatchConstantFunctionForHS(entry, patchConstFunc);
         }
         StripFunctionParameter(entry, DM, FunctionDIs);
       }
@@ -451,7 +524,7 @@ void DxilEmitMetadata::patchIsFrontfaceTy(Module &M) {
     return;
   unsigned ValMajor, ValMinor;
   DM.GetValidatorVersion(ValMajor, ValMinor);
-  bool bForceUint = ValMajor >= 1 && ValMinor >= 2;
+  bool bForceUint = ValMajor == 0 || (ValMajor >= 1 && ValMinor >= 2);
   if (pSM->IsPS()) {
     patchIsFrontface(DM.GetInputSignature(), bForceUint);
   } else if (pSM->IsGS()) {

+ 5 - 5
lib/HLSL/DxilPreserveAllOutputs.cpp

@@ -86,15 +86,15 @@ public:
   {
   }
 
-  void CreateAlloca(IRBuilder<> &builder) {
-    LLVMContext &context = builder.getContext();
+  void CreateAlloca(IRBuilder<> &allocaBuilder) {
+    LLVMContext &context = allocaBuilder.getContext();
     Type *elementType = m_OutputElement.GetCompType().GetLLVMType(context);
     Type *allocaType = nullptr;
     if (IsSingleElement())
       allocaType = elementType;
     else
       allocaType = ArrayType::get(elementType, NumElements());
-    m_Alloca = builder.CreateAlloca(allocaType, nullptr, m_OutputElement.GetName());
+    m_Alloca = allocaBuilder.CreateAlloca(allocaType, nullptr, m_OutputElement.GetName());
   }
 
   void StoreTemp(IRBuilder<> &builder, Value *row, Value *col, Value *value) const {
@@ -249,11 +249,11 @@ DxilPreserveAllOutputs::OutputMap DxilPreserveAllOutputs::generateOutputMap(cons
   return map;
 }
 
-void DxilPreserveAllOutputs::createTempAllocas(OutputMap &outputMap, IRBuilder<> &builder)
+void DxilPreserveAllOutputs::createTempAllocas(OutputMap &outputMap, IRBuilder<> &allocaBuilder)
 {
   for (auto &iter: outputMap) {
     OutputElement &output = iter.second;
-    output.CreateAlloca(builder);
+    output.CreateAlloca(allocaBuilder);
   }
 }
 

+ 4 - 0
lib/HLSL/DxilResource.cpp

@@ -141,6 +141,7 @@ unsigned DxilResource::GetNumCoords(Kind ResourceKind) {
       0, // CBuffer,
       0, // Sampler,
       1, // TBuffer,
+      0, // RaytracingAccelerationStructure,
   };
   static_assert(_countof(CoordSizeTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");
@@ -165,6 +166,7 @@ unsigned DxilResource::GetNumDimensions(Kind ResourceKind) {
       0, // CBuffer,
       0, // Sampler,
       1, // TBuffer,
+      0, // RaytracingAccelerationStructure,
   };
   static_assert(_countof(NumDimTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");
@@ -189,6 +191,7 @@ unsigned DxilResource::GetNumDimensionsForCalcLOD(Kind ResourceKind) {
       0, // CBuffer,
       0, // Sampler,
       1, // TBuffer,
+      0, // RaytracingAccelerationStructure,
   };
   static_assert(_countof(NumDimTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");
@@ -213,6 +216,7 @@ unsigned DxilResource::GetNumOffsets(Kind ResourceKind) {
       0, // CBuffer,
       0, // Sampler,
       1, // TBuffer,
+      0, // RaytracingAccelerationStructure,
   };
   static_assert(_countof(OffsetSizeTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");

+ 1 - 1
lib/HLSL/DxilResourceBase.cpp

@@ -84,7 +84,7 @@ static const char *s_ResourceDimNames[(unsigned)DxilResourceBase::Kind::NumEntri
         "invalid", "1d",        "2d",      "2dMS",      "3d",
         "cube",    "1darray",   "2darray", "2darrayMS", "cubearray",
         "buf",     "rawbuf",    "structbuf", "cbuffer", "sampler",
-        "tbuffer",
+        "tbuffer", "ras",
 };
 
 const char *DxilResourceBase::GetResDimName() const {

+ 380 - 0
lib/HLSL/DxilShaderFlags.cpp

@@ -0,0 +1,380 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilShaderFlags.cpp                                                       //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilContainer.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/DxilShaderFlags.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilResource.h"
+#include "dxc/Support/Global.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Casting.h"
+
+using namespace hlsl;
+using namespace llvm;
+
+// Default constructor: every flag starts cleared and the m_align0/m_align1
+// fields start at zero.
+ShaderFlags::ShaderFlags()
+    : m_bDisableOptimizations(false),
+      m_bDisableMathRefactoring(false),
+      m_bEnableDoublePrecision(false),
+      m_bForceEarlyDepthStencil(false),
+      m_bEnableRawAndStructuredBuffers(false),
+      m_bLowPrecisionPresent(false),
+      m_bEnableDoubleExtensions(false),
+      m_bEnableMSAD(false),
+      m_bAllResourcesBound(false),
+      m_bViewportAndRTArrayIndex(false),
+      m_bInnerCoverage(false),
+      m_bStencilRef(false),
+      m_bTiledResources(false),
+      m_bUAVLoadAdditionalFormats(false),
+      m_bLevel9ComparisonFiltering(false),
+      m_bCSRawAndStructuredViaShader4X(false),
+      m_b64UAVs(false),
+      m_UAVsAtEveryStage(false),
+      m_bROVS(false),
+      m_bWaveOps(false),
+      m_bInt64Ops(false),
+      m_bViewID(false),
+      m_bBarycentrics(false),
+      m_bUseNativeLowPrecision(false),
+      m_align0(0),
+      m_align1(0) {
+}
+
+// Builds the hlsl::ShaderFeatureInfo_* bit mask that corresponds to the flags
+// currently set on this object.
+uint64_t ShaderFlags::GetFeatureInfo() const {
+  uint64_t featureBits = 0;
+
+  if (m_bEnableDoublePrecision)
+    featureBits |= hlsl::ShaderFeatureInfo_Doubles;
+  // Low precision is reported as either min-precision or native, never both.
+  if (m_bLowPrecisionPresent && !m_bUseNativeLowPrecision)
+    featureBits |= hlsl::ShaderFeatureInfo_MinimumPrecision;
+  if (m_bLowPrecisionPresent && m_bUseNativeLowPrecision)
+    featureBits |= hlsl::ShaderFeatureInfo_NativeLowPrecision;
+  if (m_bEnableDoubleExtensions)
+    featureBits |= hlsl::ShaderFeatureInfo_11_1_DoubleExtensions;
+  if (m_bWaveOps)
+    featureBits |= hlsl::ShaderFeatureInfo_WaveOps;
+  if (m_bInt64Ops)
+    featureBits |= hlsl::ShaderFeatureInfo_Int64Ops;
+  if (m_bROVS)
+    featureBits |= hlsl::ShaderFeatureInfo_ROVs;
+  if (m_bViewportAndRTArrayIndex)
+    featureBits |= hlsl::ShaderFeatureInfo_ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer;
+  if (m_bInnerCoverage)
+    featureBits |= hlsl::ShaderFeatureInfo_InnerCoverage;
+  if (m_bStencilRef)
+    featureBits |= hlsl::ShaderFeatureInfo_StencilRef;
+  if (m_bTiledResources)
+    featureBits |= hlsl::ShaderFeatureInfo_TiledResources;
+  if (m_bEnableMSAD)
+    featureBits |= hlsl::ShaderFeatureInfo_11_1_ShaderExtensions;
+  if (m_bCSRawAndStructuredViaShader4X)
+    featureBits |= hlsl::ShaderFeatureInfo_ComputeShadersPlusRawAndStructuredBuffersViaShader4X;
+  if (m_UAVsAtEveryStage)
+    featureBits |= hlsl::ShaderFeatureInfo_UAVsAtEveryStage;
+  if (m_b64UAVs)
+    featureBits |= hlsl::ShaderFeatureInfo_64UAVs;
+  if (m_bLevel9ComparisonFiltering)
+    featureBits |= hlsl::ShaderFeatureInfo_LEVEL9ComparisonFiltering;
+  if (m_bUAVLoadAdditionalFormats)
+    featureBits |= hlsl::ShaderFeatureInfo_TypedUAVLoadAdditionalFormats;
+  if (m_bViewID)
+    featureBits |= hlsl::ShaderFeatureInfo_ViewID;
+  if (m_bBarycentrics)
+    featureBits |= hlsl::ShaderFeatureInfo_Barycentrics;
+
+  return featureBits;
+}
+
+// Returns the flag storage reinterpreted as a single 64-bit value.
+// NOTE(review): the value is read through a union member other than the one
+// that was written; confirm this punning is acceptable on every supported
+// compiler.
+uint64_t ShaderFlags::GetShaderFlagsRaw() const {
+  static_assert(sizeof(uint64_t) == sizeof(ShaderFlags),
+                "size must match to make sure no undefined bits when cast");
+  union FlagBits {
+    FlagBits(const ShaderFlags &source) {
+      asFlags = source;
+    }
+    ShaderFlags asFlags;
+    uint64_t  asRaw;
+  };
+  FlagBits bits(*this);
+  return bits.asRaw;
+}
+
+// Overwrites every flag on this object from a raw 64-bit value (the inverse of
+// GetShaderFlagsRaw).
+void ShaderFlags::SetShaderFlagsRaw(uint64_t data) {
+  union FlagBits {
+    FlagBits(uint64_t raw) {
+      asRaw = raw;
+    }
+    ShaderFlags asFlags;
+    uint64_t  asRaw;
+  };
+
+  FlagBits bits(data);
+  *this = bits.asFlags;
+}
+
+// Returns a raw mask with a bit set for each flag enabled below.
+uint64_t ShaderFlags::GetShaderFlagsRawForCollection() {
+  // This should be all the flags that can be set by
+  // DxilModule::CollectShaderFlags.
+  ShaderFlags collectable;
+  collectable.SetEnableDoublePrecision(true);
+  collectable.SetInt64Ops(true);
+  collectable.SetLowPrecisionPresent(true);
+  collectable.SetEnableDoubleExtensions(true);
+  collectable.SetWaveOps(true);
+  collectable.SetTiledResources(true);
+  collectable.SetEnableMSAD(true);
+  collectable.SetUAVLoadAdditionalFormats(true);
+  collectable.SetStencilRef(true);
+  collectable.SetInnerCoverage(true);
+  collectable.SetViewportAndRTArrayIndex(true);
+  collectable.Set64UAVs(true);
+  collectable.SetUAVsAtEveryStage(true);
+  collectable.SetEnableRawAndStructuredBuffers(true);
+  collectable.SetCSRawAndStructuredViaShader4X(true);
+  collectable.SetViewID(true);
+  collectable.SetBarycentrics(true);
+  const uint64_t rawMask = collectable.GetShaderFlagsRaw();
+  return rawMask;
+}
+
+// Builds the DXIL::k* global-flag mask from the flags set on this object.
+unsigned ShaderFlags::GetGlobalFlags() const {
+  unsigned globalBits = 0;
+  if (m_bDisableOptimizations)
+    globalBits |= DXIL::kDisableOptimizations;
+  if (m_bDisableMathRefactoring)
+    globalBits |= DXIL::kDisableMathRefactoring;
+  if (m_bEnableDoublePrecision)
+    globalBits |= DXIL::kEnableDoublePrecision;
+  if (m_bForceEarlyDepthStencil)
+    globalBits |= DXIL::kForceEarlyDepthStencil;
+  if (m_bEnableRawAndStructuredBuffers)
+    globalBits |= DXIL::kEnableRawAndStructuredBuffers;
+  // Min precision is only reported when native low precision is not in use.
+  if (m_bLowPrecisionPresent && !m_bUseNativeLowPrecision)
+    globalBits |= DXIL::kEnableMinPrecision;
+  if (m_bEnableDoubleExtensions)
+    globalBits |= DXIL::kEnableDoubleExtensions;
+  if (m_bEnableMSAD)
+    globalBits |= DXIL::kEnableMSAD;
+  if (m_bAllResourcesBound)
+    globalBits |= DXIL::kAllResourcesBound;
+  return globalBits;
+}
+
+// Given a CreateHandle call, returns an arbitrary ConstantInt rangeID, or
+// nullptr when the walk does not end at a constant.
+// Note: HLSL is currently assuming that rangeID is a constant value, but this
+// code is assuming that it can be either constant, phi node, or select
+// instruction. Only the first incoming value of a phi and the true value of a
+// select are followed.
+static ConstantInt *GetArbitraryConstantRangeID(CallInst *handleCall) {
+  Value *rangeID =
+      handleCall->getArgOperand(DXIL::OperandIndex::kCreateHandleResIDOpIdx);
+  // Phi/select chains can be cyclic (e.g. loop-carried values). Track every
+  // value already inspected so the walk is guaranteed to terminate.
+  SmallPtrSet<Value *, 8> visited;
+  while (visited.insert(rangeID).second) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(rangeID))
+      return CI;
+    if (PHINode *PN = dyn_cast<PHINode>(rangeID)) {
+      if (PN->getNumIncomingValues() == 0)
+        return nullptr; // Nothing to follow.
+      rangeID = PN->getIncomingValue(0);
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(rangeID)) {
+      rangeID = SI->getTrueValue();
+    } else {
+      return nullptr;
+    }
+  }
+  // A value was reached twice: the chain is cyclic and holds no constant.
+  return nullptr;
+}
+
+// Returns true when Ty holds at most one scalar component once arrays,
+// structs and vectors are unwrapped; returns false as soon as any level has
+// more than one element.
+static bool IsResourceSingleComponent(llvm::Type *Ty) {
+  if (llvm::ArrayType *arrType = llvm::dyn_cast<llvm::ArrayType>(Ty)) {
+    if (arrType->getArrayNumElements() > 1) {
+      return false;
+    }
+    return IsResourceSingleComponent(arrType->getArrayElementType());
+  } else if (llvm::StructType *structType =
+                 llvm::dyn_cast<llvm::StructType>(Ty)) {
+    const unsigned numFields = structType->getStructNumElements();
+    if (numFields > 1) {
+      return false;
+    }
+    if (numFields == 0) {
+      // No field to inspect; asking for element 0 would be out of range.
+      return true;
+    }
+    return IsResourceSingleComponent(structType->getStructElementType(0));
+  } else if (llvm::VectorType *vectorType =
+                 llvm::dyn_cast<llvm::VectorType>(Ty)) {
+    if (vectorType->getNumElements() > 1) {
+      return false;
+    }
+    return IsResourceSingleComponent(vectorType->getVectorElementType());
+  }
+  return true;
+}
+
+// Given a handle value, find an arbitrary call instruction that creates the
+// handle. Phi nodes are followed through their first incoming value and
+// selects through their true value. Returns nullptr when the walk ends at
+// anything else.
+static CallInst *FindCallToCreateHandle(Value *handleType) {
+  // Phi/select chains can be cyclic (e.g. loop-carried handles). Track every
+  // value already inspected so the walk is guaranteed to terminate.
+  SmallPtrSet<Value *, 8> visited;
+  Value *curVal = handleType;
+  while (visited.insert(curVal).second) {
+    if (CallInst *CI = dyn_cast<CallInst>(curVal))
+      return CI;
+    if (PHINode *PN = dyn_cast<PHINode>(curVal)) {
+      if (PN->getNumIncomingValues() == 0)
+        return nullptr; // Nothing to follow.
+      curVal = PN->getIncomingValue(0);
+    }
+    else if (SelectInst *SI = dyn_cast<SelectInst>(curVal)) {
+      curVal = SI->getTrueValue();
+    }
+    else {
+      return nullptr;
+    }
+  }
+  // A value was reached twice: the chain is cyclic and holds no call.
+  return nullptr;
+}
+
+ShaderFlags ShaderFlags::CollectShaderFlags(const Function *F, // F: function whose instructions are scanned
+                                           const hlsl::DxilModule *M) { // M: source of module options, validator version and UAV records
+  ShaderFlags flag; // Result: module-level options plus the features detected in F
+  // Module level options, copied straight from M.
+  flag.SetUseNativeLowPrecision(!M->GetUseMinPrecision());
+  flag.SetDisableOptimizations(M->GetDisableOptimization());
+  flag.SetAllResourcesBound(M->GetAllResourcesBound());
+
+  bool hasDouble = false;
+  // ddiv dfma drcp d2i d2u i2d u2d.
+  // fma has dxil op. Others should check IR instruction div/cast.
+  bool hasDoubleExtension = false;
+  bool has64Int = false;
+  bool has16 = false; // set for either half or i16 values
+  bool hasWaveOps = false;
+  bool hasCheckAccessFully = false;
+  bool hasMSAD = false;
+  bool hasInnerCoverage = false;
+  bool hasViewID = false;
+  bool hasMulticomponentUAVLoads = false;
+  // Try to maintain compatibility with a v1.0 validator if that's what we have.
+  uint32_t valMajor, valMinor;
+  M->GetValidatorVersion(valMajor, valMinor);
+  bool hasMulticomponentUAVLoadsBackCompat = valMajor == 1 && valMinor == 0; // true only for validator 1.0
+
+  Type *int16Ty = Type::getInt16Ty(F->getContext());
+  Type *int64Ty = Type::getInt64Ty(F->getContext());
+
+  for (const BasicBlock &BB : F->getBasicBlockList()) {
+    for (const Instruction &I : BB.getInstList()) {
+      // Skip calls to functions that are not DXIL ops.
+      if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+        if (!OP::IsDxilOpFunc(CI->getCalledFunction()))
+          continue;
+      }
+      Type *Ty = I.getType(); // start from the result type; operand types are folded in below
+      bool isDouble = Ty->isDoubleTy();
+      bool isHalf = Ty->isHalfTy();
+      bool isInt16 = Ty == int16Ty;
+      bool isInt64 = Ty == int64Ty;
+      if (isa<ExtractElementInst>(&I) ||
+        isa<InsertElementInst>(&I))
+        continue; // element insert/extract do not contribute to the type-based flags
+      for (Value *operand : I.operands()) {
+        Type *Ty = operand->getType(); // shadows the outer Ty on purpose: operand type
+        isDouble |= Ty->isDoubleTy();
+        isHalf |= Ty->isHalfTy();
+        isInt16 |= Ty == int16Ty;
+        isInt64 |= Ty == int64Ty;
+      }
+        if (isDouble) {
+          hasDouble = true;
+          switch (I.getOpcode()) { // double division and int<->double casts need the double extension
+          case Instruction::FDiv:
+          case Instruction::UIToFP:
+          case Instruction::SIToFP:
+          case Instruction::FPToUI:
+          case Instruction::FPToSI:
+            hasDoubleExtension = true;
+            break;
+          }
+        }
+
+      has16 |= isHalf;
+      has16 |= isInt16;
+      has64Int |= isInt64;
+      if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+        if (!OP::IsDxilOpFunc(CI->getCalledFunction()))
+          continue;
+        Value *opcodeArg = CI->getArgOperand(DXIL::OperandIndex::kOpcodeIdx);
+        ConstantInt *opcodeConst = dyn_cast<ConstantInt>(opcodeArg);
+        DXASSERT(opcodeConst, "DXIL opcode arg must be immediate");
+        unsigned opcode = opcodeConst->getLimitedValue();
+        DXASSERT(opcode < static_cast<unsigned>(DXIL::OpCode::NumOpCodes),
+          "invalid DXIL opcode");
+        DXIL::OpCode dxilOp = static_cast<DXIL::OpCode>(opcode);
+        if (hlsl::OP::IsDxilOpWave(dxilOp))
+          hasWaveOps = true;
+        switch (dxilOp) {
+        case DXIL::OpCode::CheckAccessFullyMapped:
+          hasCheckAccessFully = true; // later reported through SetTiledResources
+          break;
+        case DXIL::OpCode::Msad:
+          hasMSAD = true;
+          break;
+        case DXIL::OpCode::BufferLoad:
+        case DXIL::OpCode::TextureLoad: {
+          if (hasMulticomponentUAVLoads) continue; // already detected; moves on to the next instruction
+          // This is the old-style computation (overestimating requirements).
+          Value *resHandle = CI->getArgOperand(DXIL::OperandIndex::kBufferStoreHandleOpIdx); // NOTE(review): store-named index used for loads; presumably the handle sits at the same operand position - confirm
+          CallInst *handleCall = FindCallToCreateHandle(resHandle);
+          // Check if this is a library handle or general create handle
+          if (handleCall) {
+            ConstantInt *HandleOpCodeConst = cast<ConstantInt>(
+                handleCall->getArgOperand(DXIL::OperandIndex::kOpcodeIdx));
+            DXIL::OpCode handleOp = static_cast<DXIL::OpCode>(HandleOpCodeConst->getLimitedValue());
+            if (handleOp == DXIL::OpCode::CreateHandle) {
+              if (ConstantInt *resClassArg =
+                dyn_cast<ConstantInt>(handleCall->getArgOperand(
+                  DXIL::OperandIndex::kCreateHandleResClassOpIdx))) {
+                DXIL::ResourceClass resClass = static_cast<DXIL::ResourceClass>(
+                  resClassArg->getLimitedValue());
+                if (resClass == DXIL::ResourceClass::UAV) {
+                  // Validator 1.0 assumes that every UAV load is a multi-component load.
+                  if (hasMulticomponentUAVLoadsBackCompat) {
+                    hasMulticomponentUAVLoads = true;
+                    continue;
+                  }
+                  else {
+                    ConstantInt *rangeID = GetArbitraryConstantRangeID(handleCall);
+                    if (rangeID) {
+                      DxilResource resource = M->GetUAV(rangeID->getLimitedValue());
+                      if ((resource.IsTypedBuffer() ||
+                        resource.IsAnyTexture()) &&
+                        !IsResourceSingleComponent(resource.GetRetType())) {
+                        hasMulticomponentUAVLoads = true;
+                      }
+                    }
+                  }
+                }
+              }
+              else {
+                DXASSERT(false, "Resource class must be constant.");
+              }
+            }
+            else if (handleOp == DXIL::OpCode::CreateHandleForLib) {
+              // If library handle, find the DxilResource whose global symbol is the loaded pointer.
+              if (LoadInst *LI = dyn_cast<LoadInst>(handleCall->getArgOperand(
+                      DXIL::OperandIndex::
+                          kCreateHandleForLibResOpIdx))) {
+                Value *resType = LI->getOperand(0); // pointer operand of the load
+                for (auto &&res : M->GetUAVs()) {
+                  if (res->GetGlobalSymbol() == resType) {
+                    if ((res->IsTypedBuffer() || res->IsAnyTexture()) &&
+                        !IsResourceSingleComponent(res->GetRetType())) {
+                      hasMulticomponentUAVLoads = true;
+                    }
+                  }
+                }
+              }
+            }
+          }
+       } break;
+        case DXIL::OpCode::Fma:
+          hasDoubleExtension |= isDouble; // fma only counts when a double result or operand is involved
+          break;
+        case DXIL::OpCode::InnerCoverage:
+          hasInnerCoverage = true;
+          break;
+        case DXIL::OpCode::ViewID:
+          hasViewID = true;
+          break;
+        default:
+          // Normal opcodes: no flag to record.
+          break;
+        }
+      }
+    }
+  }
+    
+  flag.SetEnableDoublePrecision(hasDouble);
+  flag.SetInnerCoverage(hasInnerCoverage);
+  flag.SetInt64Ops(has64Int);
+  flag.SetLowPrecisionPresent(has16);
+  flag.SetEnableDoubleExtensions(hasDoubleExtension);
+  flag.SetWaveOps(hasWaveOps);
+  flag.SetTiledResources(hasCheckAccessFully);
+  flag.SetEnableMSAD(hasMSAD);
+  flag.SetUAVLoadAdditionalFormats(hasMulticomponentUAVLoads);
+  flag.SetViewID(hasViewID);
+
+  return flag;
+}
+
+// Merges another flag set into this one by OR-ing the raw 64-bit values.
+void ShaderFlags::CombineShaderFlags(const ShaderFlags &other) {
+  const uint64_t merged = GetShaderFlagsRaw() | other.GetShaderFlagsRaw();
+  SetShaderFlagsRaw(merged);
+}

+ 27 - 4
lib/HLSL/DxilShaderModel.cpp

@@ -59,6 +59,8 @@ bool ShaderModel::IsValidForDxil() const {
       case 2:
       case 3:
         return true;
+      case kOfflineMinor:
+        return m_Kind == Kind::Library;
       }
     }
     break;
@@ -66,6 +68,11 @@ bool ShaderModel::IsValidForDxil() const {
   return false;
 }
 
+bool ShaderModel::IsValidForModule() const {
+  // A ray tracing shader model should only be used on functions in a lib, so
+  // it is never accepted here even when it is otherwise valid.
+  if (!IsValid())
+    return false;
+  return !IsRay();
+}
+
 const ShaderModel *ShaderModel::Get(unsigned Idx) {
   DXASSERT_NOMSG(Idx < kNumShaderModels - 1);
   if (Idx < kNumShaderModels - 1)
@@ -134,6 +141,12 @@ const ShaderModel *ShaderModel::GetByName(const char *pszName) {
         break;
       }
       else return GetInvalid();
+    case 'x':
+      if (kind == Kind::Library && Major == 6) {
+        Minor = kOfflineMinor;
+        break;
+      }
+      else return GetInvalid();
     default:  return GetInvalid();
   }
   if (pszName[Idx++] != 0)
@@ -156,6 +169,7 @@ void ShaderModel::GetDxilVersion(unsigned &DxilMajor, unsigned &DxilMinor) const
     DxilMinor = 2;
     break;
   case 3:
+  case kOfflineMinor: // Always update this to highest dxil version
     DxilMinor = 3;
     break;
   default:
@@ -180,6 +194,10 @@ void ShaderModel::GetMinValidatorVersion(unsigned &ValMajor, unsigned &ValMinor)
   case 3:
     ValMinor = 3;
     break;
+  case kOfflineMinor:
+    ValMajor = 0;
+    ValMinor = 0;
+    break;
   default:
     DXASSERT(0, "IsValidForDxil() should have caught this.");
     break;
@@ -187,15 +205,17 @@ void ShaderModel::GetMinValidatorVersion(unsigned &ValMajor, unsigned &ValMinor)
 }
 
 static const char *ShaderModelKindNames[] = {
-    "ps", "vs", "gs", "hs", "ds", "cs", "lib", "invalid",
+    "ps", "vs", "gs", "hs", "ds", "cs", "lib",
+    "raygeneration", "intersection", "anyhit", "closesthit", "miss", "callable",
+    "invalid",
 };
 
-std::string ShaderModel::GetKindName() const {
+const char * ShaderModel::GetKindName() const {
   return GetKindName(m_Kind);
 }
 
-std::string ShaderModel::GetKindName(Kind kind) {
-  return std::string(ShaderModelKindNames[static_cast<unsigned int>(kind)]);
+const char * ShaderModel::GetKindName(Kind kind) {
+  return ShaderModelKindNames[static_cast<unsigned int>(kind)];
 }
 
 const ShaderModel *ShaderModel::GetInvalid() {
@@ -260,6 +280,9 @@ const ShaderModel ShaderModel::ms_ShaderModels[kNumShaderModels] = {
   SM(Kind::Library,  6, 2, "lib_6_2",  32, 32,  true,  true,  UINT_MAX),
   SM(Kind::Library,  6, 3, "lib_6_3",  32, 32,  true,  true,  UINT_MAX),
 
+  // lib_6_x is for offline linking only, and relaxes restrictions
+  SM(Kind::Library,  6, kOfflineMinor, "lib_6_x",  32, 32,  true,  true,  UINT_MAX),
+
   SM(Kind::Invalid,  0, 0, "invalid", 0,  0,   false, false, 0),
 };
 

+ 8 - 4
lib/HLSL/DxilSignature.cpp

@@ -24,17 +24,21 @@ namespace hlsl {
 // Signature methods.
 //
 DxilSignature::DxilSignature(DXIL::ShaderKind shaderKind,
-                             DXIL::SignatureKind sigKind, bool useMinPrecision)
+                             DXIL::SignatureKind sigKind,
+                             bool useMinPrecision)
     : m_sigPointKind(SigPoint::GetKind(shaderKind, sigKind,
                                        /*isPatchConstantFunction*/ false,
                                        /*isSpecialInput*/ false)),
       m_UseMinPrecision(useMinPrecision) {}
 
-DxilSignature::DxilSignature(DXIL::SigPointKind sigPointKind)
-: m_sigPointKind(sigPointKind) {}
+DxilSignature::DxilSignature(DXIL::SigPointKind sigPointKind,
+                             bool useMinPrecision)
+    : m_sigPointKind(sigPointKind),
+      m_UseMinPrecision(useMinPrecision) {}
 
 DxilSignature::DxilSignature(const DxilSignature &src)
-    : m_sigPointKind(src.m_sigPointKind) {
+    : m_sigPointKind(src.m_sigPointKind),
+      m_UseMinPrecision(src.m_UseMinPrecision) {
   const bool bSetID = false;
   for (auto &Elt : src.GetElements()) {
     std::unique_ptr<DxilSignatureElement> newElt = CreateElement();

+ 3 - 3
lib/HLSL/DxilTypeSystem.cpp

@@ -318,7 +318,7 @@ void DxilTypeSystem::CopyFunctionAnnotation(const llvm::Function *pDstFunction,
 
   // Copy the annotation.
   *dstAnnot = *annot;
-
+  dstAnnot->m_pFunction = pDstFunction;
   // Clone ret type annotation.
   CopyTypeAnnotation(pDstFunction->getReturnType(), src);
   // Clone param type annotations.
@@ -415,8 +415,8 @@ DXIL::SigPointKind SigPointFromInputQual(DxilParamInputQual Q, DXIL::ShaderKind
 bool DxilTypeSystem::UseMinPrecision() {
   if (m_LowPrecisionMode == DXIL::LowPrecisionMode::Undefined) {
     if (m_pModule->HasDxilModule()) {
-      m_LowPrecisionMode = m_pModule->GetDxilModule().m_ShaderFlags.GetUseNativeLowPrecision() ?
-        DXIL::LowPrecisionMode::UseNativeLowPrecision : DXIL::LowPrecisionMode::UseMinPrecision;
+      m_LowPrecisionMode = m_pModule->GetDxilModule().GetUseMinPrecision() ?
+        DXIL::LowPrecisionMode::UseMinPrecision : DXIL::LowPrecisionMode::UseNativeLowPrecision;
     }
     else if (m_pModule->HasHLModule()) {
       m_LowPrecisionMode = m_pModule->GetHLModule().GetHLOptions().bUseMinPrecision ?

+ 245 - 26
lib/HLSL/DxilUtil.cpp

@@ -14,6 +14,7 @@
 #include "dxc/HLSL/DxilTypeSystem.h"
 #include "dxc/HLSL/DxilUtil.h"
 #include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/HLModule.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
@@ -23,6 +24,10 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "dxc/Support/Global.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
 
 using namespace llvm;
 using namespace hlsl;
@@ -31,6 +36,9 @@ namespace hlsl {
 
 namespace dxilutil {
 
+const char ManglingPrefix[] = "\01?";
+const char EntryPrefix[] = "dx.entry.";
+
 Type *GetArrayEltTy(Type *Ty) {
   if (isa<PointerType>(Ty))
     Ty = Ty->getPointerElementType();
@@ -126,14 +134,72 @@ void PrintDiagnosticHandler(const llvm::DiagnosticInfo &DI, void *Context) {
   DI.print(*printer);
 }
 
+// Returns the plain function name. A name that does not start with
+// ManglingPrefix is returned unchanged; otherwise the two-character prefix is
+// dropped and the name is cut at the first '@'.
+StringRef DemangleFunctionName(StringRef name) {
+  const bool isMangled = name.startswith(ManglingPrefix);
+  if (isMangled) {
+    const size_t atPos = name.find_first_of("@");
+    DXASSERT(atPos != StringRef::npos, "else Name isn't mangled but has \01?");
+
+    return name.substr(2, atPos - 2);
+  }
+
+  // Name isn't mangled.
+  return name;
+}
+
+// Produces newName decorated the same way originalName is: a mangled name
+// keeps its prefix and everything from the first '@' onward, an entry name
+// keeps its EntryPrefix, and any other name is replaced outright.
+std::string ReplaceFunctionName(StringRef originalName, StringRef newName) {
+  if (originalName.startswith(ManglingPrefix)) {
+    StringRef mangledTail =
+        originalName.substr(originalName.find_first_of('@'));
+    return (Twine(ManglingPrefix) + newName + mangledTail).str();
+  }
+  if (originalName.startswith(EntryPrefix))
+    return (Twine(EntryPrefix) + newName).str();
+  return newName.str();
+}
+
+// From AsmWriter.cpp
+// PrintEscapedString - Write Name to Out one character at a time. Printable
+// characters other than backslash and double quote are written as-is; every
+// other character is written as a backslash followed by two hex digits.
+void PrintEscapedString(StringRef Name, raw_ostream &Out) {
+  for (char rawChar : Name) {
+    unsigned char C = rawChar;
+    const bool needsEscape = !isprint(C) || C == '\\' || C == '"';
+    if (needsEscape)
+      Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+    else
+      Out << C;
+  }
+}
+
+// Writes Name to Out with backslash escapes decoded. A backslash followed by
+// two hex digits becomes the byte with that value; a backslash followed by any
+// other character yields that character. A backslash that is the last
+// character of Name is written unchanged, and no read ever goes past the end
+// of Name.
+void PrintUnescapedString(StringRef Name, raw_ostream &Out) {
+  for (size_t i = 0, e = Name.size(); i != e; ++i) {
+    unsigned char C = Name[i];
+    // Only treat the backslash as an escape when a character follows it.
+    if (C == '\\' && i + 1 != e) {
+      C = Name[++i];
+      unsigned value = hexDigitValue(C);
+      if (value != -1U) {
+        C = (unsigned char)value;
+        // The second digit may be missing when the string ends early.
+        unsigned value2 = (i + 1 != e) ? hexDigitValue(Name[i + 1]) : -1U;
+        assert(value2 != -1U && "otherwise, not a two digit hex escape");
+        if (value2 != -1U) {
+          C = (C << 4) + (unsigned char)value2;
+          ++i;
+        }
+      } // else, the next character (in C) should be the escaped character
+    }
+    Out << C;
+  }
+}
+
 std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::MemoryBuffer *MB,
   llvm::LLVMContext &Ctx,
   std::string &DiagStr) {
   raw_string_ostream DiagStream(DiagStr);
   llvm::DiagnosticPrinterRawOStream DiagPrinter(DiagStream);
+  LLVMContext::DiagnosticHandlerTy OrigHandler = Ctx.getDiagnosticHandler();
+  void *OrigContext = Ctx.getDiagnosticContext();
   Ctx.setDiagnosticHandler(PrintDiagnosticHandler, &DiagPrinter, true);
   ErrorOr<std::unique_ptr<llvm::Module>> pModule(
     llvm::parseBitcodeFile(MB->getMemBufferRef(), Ctx));
+  Ctx.setDiagnosticHandler(OrigHandler, OrigContext);
   if (std::error_code ec = pModule.getError()) {
     return nullptr;
   }
@@ -148,33 +214,186 @@ std::unique_ptr<llvm::Module> LoadModuleFromBitcode(llvm::StringRef BC,
   return LoadModuleFromBitcode(pBitcodeBuf.get(), Ctx, DiagStr);
 }
 
-llvm::Instruction *SkipAllocas(llvm::Instruction *I) {
-  // Step past any allocas:
-  while (I && isa<AllocaInst>(I))
-    I = I->getNextNode();
-  return I;
-}
-llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I) {
-  Function *F = I->getParent()->getParent();
-  if (F)
-    return F->getEntryBlock().getFirstInsertionPt();
-  else // BB with no parent function
-    return I->getParent()->getFirstInsertionPt();
-}
-llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F) {
-  return F->getEntryBlock().getFirstInsertionPt();
-}
-llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I) {
-  return SkipAllocas(FindAllocaInsertionPt(I));
-}
-llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB) {
-  return SkipAllocas(
-    BB->getFirstInsertionPt());
-}
-llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Function* F) {
-  return SkipAllocas(
-    F->getEntryBlock().getFirstInsertionPt());
+// Reports Msg on the first instruction with a debug location that can be
+// reached from I by walking users. Only phi and select nodes are walked
+// through, and the walk gives up beyond a fixed depth so that a missing debug
+// location does not turn into an unbounded search.
+// Returns true if an error was emitted, false otherwise.
+static bool EmitErrorOnInstructionFollowPhiSelect(
+    Instruction *I, StringRef Msg, unsigned depth=0) {
+  if (depth > 4)
+    return false;
+
+  // Found an instruction that carries a location: report here and stop.
+  if (I->getDebugLoc().get()) {
+    EmitErrorOnInstruction(I, Msg);
+    return true;
+  }
+
+  // Anything other than phi/select ends this branch of the search.
+  if (!isa<PHINode>(I) && !isa<SelectInst>(I))
+    return false;
+
+  for (User *U : I->users()) {
+    Instruction *UserInst = dyn_cast<Instruction>(U);
+    if (UserInst &&
+        EmitErrorOnInstructionFollowPhiSelect(UserInst, Msg, depth + 1))
+      return true;
+  }
+  return false;
+}
+
+// Emits Msg as an error on I's LLVMContext. When I has a debug location the
+// message is prefixed with it. Otherwise, for phi/select, the users are
+// searched for a location; if none is found the message is emitted with a
+// hint to enable /Zi.
+void EmitErrorOnInstruction(Instruction *I, StringRef Msg) {
+  const DebugLoc &DL = I->getDebugLoc();
+  if (!DL.get()) {
+    const bool isPhiOrSelect = isa<PHINode>(I) || isa<SelectInst>(I);
+    if (isPhiOrSelect && EmitErrorOnInstructionFollowPhiSelect(I, Msg))
+      return;
+    I->getContext().emitError(Twine(Msg) + " Use /Zi for source location.");
+    return;
+  }
+
+  // Render the location into a string and prepend it to the message.
+  std::string locString;
+  raw_string_ostream os(locString);
+  DL.print(os);
+  I->getContext().emitError(os.str() + ": " + Twine(Msg));
+}
+
+const StringRef kResourceMapErrorMsg =
+    "local resource not guaranteed to map to unique global resource.";
+void EmitResMappingError(Instruction *Res) {
+  EmitErrorOnInstruction(Res, kResourceMapErrorMsg);
+}
+
+// Adds Inst to selectSet if it is a phi or select, then recurses into its
+// value operands so that every phi/select feeding it is collected as well.
+// Instructions of any other kind are ignored.
+void CollectSelect(llvm::Instruction *Inst,
+                   std::unordered_set<llvm::Instruction *> &selectSet) {
+  const bool isSelect = isa<SelectInst>(Inst);
+  // Only phi and select are of interest here.
+  if (!isSelect && !isa<PHINode>(Inst))
+    return;
+
+  // insert() reports whether Inst was newly added; stop if already visited.
+  if (!selectSet.insert(Inst).second)
+    return;
+
+  // For a select, operand 0 is the condition and is skipped.
+  const unsigned firstValueIdx = isSelect ? 1 : 0;
+  for (unsigned idx = firstValueIdx, end = Inst->getNumOperands(); idx < end;
+       ++idx) {
+    if (Instruction *OpInst = dyn_cast<Instruction>(Inst->getOperand(idx)))
+      CollectSelect(OpInst, selectSet);
+  }
+}
+
+// If operands [startOpIdx, numOperands) of SelInst are all the same value,
+// replaces every use of SelInst with that value, erases SelInst and returns
+// the value. Returns nullptr (leaving SelInst untouched) when the operands
+// differ or the range is empty.
+Value *MergeSelectOnSameValue(Instruction *SelInst, unsigned startOpIdx,
+                            unsigned numOperands) {
+  // Empty operand range: nothing to merge.
+  if (startOpIdx >= numOperands)
+    return nullptr;
+
+  Value *firstOp = SelInst->getOperand(startOpIdx);
+  for (unsigned idx = startOpIdx + 1; idx < numOperands; ++idx) {
+    if (SelInst->getOperand(idx) != firstOp)
+      return nullptr;
+  }
+
+  if (firstOp) {
+    SelInst->replaceAllUsesWith(firstOp);
+    SelInst->eraseFromParent();
+  }
+  return firstOp;
 }
 
+// Rewrites "Inst(select/phi)" into "select/phi(Inst...)": when operand
+// operandIdx of Inst is a select or phi and every other operand is a
+// constant, clones Inst once per incoming value and returns a new select/phi
+// over the clones. Inst itself is left in place; the caller decides what to
+// do with it. Returns nullptr when the pattern does not apply.
+Value *SelectOnOperation(llvm::Instruction *Inst, unsigned operandIdx) {
+  // Bail out unless every operand other than operandIdx is a constant.
+  for (unsigned i = 0; i < Inst->getNumOperands(); i++) {
+    if (i == operandIdx)
+      continue;
+    if (!isa<Constant>(Inst->getOperand(i)))
+      return nullptr;
+  }
+  Value *V = Inst->getOperand(operandIdx);
+  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+    // Clones are inserted right before the select they are derived from.
+    IRBuilder<> Builder(SI);
+    Instruction *trueClone = Inst->clone();
+    trueClone->setOperand(operandIdx, SI->getTrueValue());
+    Builder.Insert(trueClone);
+    Instruction *falseClone = Inst->clone();
+    falseClone->setOperand(operandIdx, SI->getFalseValue());
+    Builder.Insert(falseClone);
+    Value *newSel =
+        Builder.CreateSelect(SI->getCondition(), trueClone, falseClone);
+    return newSel;
+  }
+
+  if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+    Type *Ty = Inst->getType();
+    unsigned numOperands = Phi->getNumOperands();
+    IRBuilder<> Builder(Phi);
+    PHINode *newPhi = Builder.CreatePHI(Ty, numOperands);
+    for (unsigned i = 0; i < numOperands; i++) {
+      BasicBlock *b = Phi->getIncomingBlock(i);
+      Value *incoming = Phi->getIncomingValue(i);
+      Instruction *iClone = Inst->clone();
+      iClone->setOperand(operandIdx, incoming);
+      // Insert directly before the terminator of the incoming block.
+      // Using the terminator's previous node instead is unsafe: it is null
+      // when the terminator is the only instruction in the block, and when
+      // that previous node is the definition of `incoming` the clone would
+      // be placed before its own operand.
+      IRBuilder<> iBuilder(b->getTerminator());
+      iBuilder.Insert(iClone);
+      newPhi->addIncoming(iClone, b);
+    }
+    return newPhi;
+  }
+  return nullptr;
+}
+
+// Returns the first instruction at or after I that is not an alloca.
+// Returns nullptr if I is null or the instruction list ends first.
+llvm::Instruction *SkipAllocas(llvm::Instruction *I) {
+  for (; I != nullptr; I = I->getNextNode()) {
+    if (!isa<AllocaInst>(I))
+      break;
+  }
+  return I;
+}
+// Returns the first insertion point of the entry block of the function that
+// contains I. If I's block is not attached to a function, the first insertion
+// point of that block is returned instead.
+llvm::Instruction *FindAllocaInsertionPt(llvm::Instruction* I) {
+  BasicBlock *BB = I->getParent();
+  if (Function *F = BB->getParent())
+    return F->getEntryBlock().getFirstInsertionPt();
+  // BB with no parent function
+  return BB->getFirstInsertionPt();
+}
+llvm::Instruction *FindAllocaInsertionPt(llvm::Function* F) {
+  return F->getEntryBlock().getFirstInsertionPt();
+}
+llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Instruction* I) {
+  return SkipAllocas(FindAllocaInsertionPt(I));
+}
+llvm::Instruction *FirstNonAllocaInsertionPt(llvm::BasicBlock* BB) {
+  return SkipAllocas(
+    BB->getFirstInsertionPt());
+}
+llvm::Instruction *FirstNonAllocaInsertionPt(llvm::Function* F) {
+  return SkipAllocas(
+    F->getEntryBlock().getFirstInsertionPt());
+}
+
+// Returns true if Ty, after stripping pointer and then array wrappers, is a
+// struct that is itself an HLSL object type (named "dx.types.*" or accepted
+// by HLModule::IsHLSLObjectType) or that contains such a type in any of its
+// elements, searched recursively. Non-struct types yield false.
+bool ContainsHLSLObjectType(llvm::Type *Ty) {
+  // Strip pointer levels first, then array levels.
+  while (llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(Ty))
+    Ty = PT->getPointerElementType();
+  while (llvm::ArrayType *AT = llvm::dyn_cast<llvm::ArrayType>(Ty))
+    Ty = AT->getArrayElementType();
+
+  llvm::StructType *ST = llvm::dyn_cast<llvm::StructType>(Ty);
+  if (!ST)
+    return false;
+
+  if (ST->getName().startswith("dx.types."))
+    return true;
+  // TODO: How is this supposed to check for Input/OutputPatch types if
+  // these have already been eliminated in function arguments during CG?
+  if (HLModule::IsHLSLObjectType(Ty))
+    return true;
+
+  // User-defined struct: look inside each element.
+  for (llvm::Type *ElemTy : ST->elements()) {
+    if (ContainsHLSLObjectType(ElemTy))
+      return true;
+  }
+  return false;
+}
+
+
 }
 }

File diff suppressed because it is too large
+ 473 - 219
lib/HLSL/DxilValidation.cpp


File diff suppressed because it is too large
+ 332 - 211
lib/HLSL/HLMatrixLowerPass.cpp


+ 57 - 3
lib/HLSL/HLModule.cpp

@@ -125,6 +125,13 @@ const HLOptions &HLModule::GetHLOptions() const {
   return m_Options;
 }
 
+void HLModule::SetAutoBindingSpace(uint32_t Space) {
+  m_AutoBindingSpace = Space;
+}
+uint32_t HLModule::GetAutoBindingSpace() const {
+  return m_AutoBindingSpace;
+}
+
 Function *HLModule::GetEntryFunction() const {
   return m_pEntryFunc;
 }
@@ -310,8 +317,7 @@ RootSignatureHandle *HLModule::ReleaseRootSignature() {
   return m_RootSignature.release();
 }
 
-std::unordered_map<llvm::Function *, std::unique_ptr<DxilFunctionProps>> &&
-HLModule::ReleaseFunctionPropsMap() {
+DxilFunctionPropsMap &&HLModule::ReleaseFunctionPropsMap() {
   return std::move(m_DxilFunctionPropsMap);
 }
 
@@ -351,6 +357,35 @@ void HLModule::AddDxilFunctionProps(llvm::Function *F, std::unique_ptr<DxilFunct
   DXASSERT_NOMSG(info->shaderKind != DXIL::ShaderKind::Invalid);
   m_DxilFunctionPropsMap[F] = std::move(info);
 }
+// Points hullShaderFunc's HS properties at patchConstantFunc (which may be
+// null) and keeps m_PatchConstantFunctions in step: the previously recorded
+// patch constant function is removed from the set and the new one is added.
+// hullShaderFunc must already have function props and must be a hull shader.
+void HLModule::SetPatchConstantFunctionForHS(llvm::Function *hullShaderFunc, llvm::Function *patchConstantFunc) {
+  auto propIter = m_DxilFunctionPropsMap.find(hullShaderFunc);
+  DXASSERT(propIter != m_DxilFunctionPropsMap.end(), "else Hull Shader missing function props");
+  DxilFunctionProps &props = *(propIter->second);
+  DXASSERT(props.IsHS(), "else hullShaderFunc is not a Hull Shader");
+
+  // Work on the stored pointer directly so that old and new values share one
+  // code path.
+  auto &recorded = props.ShaderProps.HS.patchConstantFunc;
+  if (recorded)
+    m_PatchConstantFunctions.erase(recorded);
+  recorded = patchConstantFunc;
+  if (recorded)
+    m_PatchConstantFunctions.insert(recorded);
+}
+bool HLModule::IsGraphicsShader(llvm::Function *F) {
+  return HasDxilFunctionProps(F) && GetDxilFunctionProps(F).IsGraphics();
+}
+bool HLModule::IsPatchConstantShader(llvm::Function *F) {
+  return m_PatchConstantFunctions.count(F) != 0;
+}
+bool HLModule::IsComputeShader(llvm::Function *F) {
+  return HasDxilFunctionProps(F) && GetDxilFunctionProps(F).IsCS();
+}
+// Returns true if F has function props marking it as a graphics or compute
+// shader. A function without props counts only if it is registered as a
+// patch constant function.
+bool HLModule::IsEntryThatUsesSignatures(llvm::Function *F) {
+  auto propIter = m_DxilFunctionPropsMap.find(F);
+  // No props recorded: fall back to the patch constant function check.
+  if (propIter == m_DxilFunctionPropsMap.end())
+    return IsPatchConstantShader(F);
+
+  DxilFunctionProps &props = *(propIter->second);
+  if (props.IsGraphics())
+    return true;
+  return props.IsCS();
+}
 
 DxilFunctionAnnotation *HLModule::GetFunctionAnnotation(llvm::Function *F) {
   return m_pTypeSystem->GetFunctionAnnotation(F);
@@ -394,6 +429,14 @@ void HLModule::SetFloat32DenormMode(const DXIL::Float32DenormMode mode) {
   m_Float32DenormMode = mode;
 }
 
+DXIL::DefaultLinkage HLModule::GetDefaultLinkage() const {
+  return m_DefaultLinkage;
+}
+
+void HLModule::SetDefaultLinkage(const DXIL::DefaultLinkage linkage) {
+  m_DefaultLinkage = linkage;
+}
+
 static const StringRef kHLDxilFunctionPropertiesMDName           = "dx.fnprops";
 static const StringRef kHLDxilOptionsMDName                      = "dx.options";
 static const StringRef kHLDxilResourceTypeAnnotationMDName       = "dx.resource.type.annotation";
@@ -426,6 +469,7 @@ void HLModule::EmitHLMetadata() {
     NamedMDNode * options = m_pModule->getOrInsertNamedMetadata(kHLDxilOptionsMDName);
     uint32_t hlOptions = m_Options.GetHLOptionsRaw();
     options->addOperand(MDNode::get(m_Ctx, m_pMDHelper->Uint32ToConstMD(hlOptions)));
+    options->addOperand(MDNode::get(m_Ctx, m_pMDHelper->Uint32ToConstMD(GetAutoBindingSpace())));
 
     NamedMDNode * resTyAnnotations = m_pModule->getOrInsertNamedMetadata(kHLDxilResourceTypeAnnotationMDName);
     resTyAnnotations->addOperand(EmitResTyAnnotations());
@@ -466,7 +510,12 @@ void HLModule::LoadHLMetadata() {
       std::unique_ptr<hlsl::DxilFunctionProps> props =
           llvm::make_unique<hlsl::DxilFunctionProps>();
 
-      Function *F = m_pMDHelper->LoadDxilFunctionProps(pProps, props.get());
+      const Function *F = m_pMDHelper->LoadDxilFunctionProps(pProps, props.get());
+
+      if (props->IsHS() && props->ShaderProps.HS.patchConstantFunc) {
+        // Add patch constant function to m_PatchConstantFunctions
+        m_PatchConstantFunctions.insert(props->ShaderProps.HS.patchConstantFunc);
+      }
 
       m_DxilFunctionPropsMap[F] = std::move(props);
     }
@@ -474,6 +523,8 @@ void HLModule::LoadHLMetadata() {
     const NamedMDNode * options = m_pModule->getOrInsertNamedMetadata(kHLDxilOptionsMDName);
     const MDNode *MDOptions = options->getOperand(0);
     m_Options.SetHLOptionsRaw(DxilMDHelper::ConstMDToUint32(MDOptions->getOperand(0)));
+    if (options->getNumOperands() > 1)
+      SetAutoBindingSpace(DxilMDHelper::ConstMDToUint32(options->getOperand(1)->getOperand(0)));
     NamedMDNode * resTyAnnotations = m_pModule->getOrInsertNamedMetadata(kHLDxilResourceTypeAnnotationMDName);
     const MDNode *MDResTyAnnotations = resTyAnnotations->getOperand(0);
     if (MDResTyAnnotations->getNumOperands())
@@ -770,6 +821,9 @@ bool HLModule::IsHLSLObjectType(llvm::Type *Ty) {
     if (name.startswith("ConstantBuffer"))
       return true;
 
+    if (name == "RaytracingAccelerationStructure")
+      return true;
+
     name = name.ltrim("RasterizerOrdered");
     name = name.ltrim("RW");
     if (name == "ByteAddressBuffer")

+ 407 - 217
lib/HLSL/HLOperationLower.cpp

@@ -411,9 +411,14 @@ Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode, ArrayRef<Valu
     }
     return retVal;
   } else {
-    Value *retVal =
-        Builder.CreateCall(dxilFunc, args, hlslOP->GetOpCodeName(opcode));
-    return retVal;
+    if (!RetTy->isVoidTy()) {
+      Value *retVal =
+          Builder.CreateCall(dxilFunc, args, hlslOP->GetOpCodeName(opcode));
+      return retVal;
+    } else {
+      // Cannot add name to void.
+      return Builder.CreateCall(dxilFunc, args);
+    }
   }
 }
 // Generates a DXIL operation over an overloaded type (Ty), returning a
@@ -885,6 +890,19 @@ Value *TrivialNoArgOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   return dxilOp;
 }
 
+// Lowers an HL intrinsic that takes no arguments and returns a value: emits
+// the DXIL operation `opcode` before CI, passing only the opcode constant
+// and using CI's type as both the overload type and the return type.
+Value *TrivialNoArgWithRetOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                             HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+  Type *RetTy = CI->getType();
+
+  // The opcode id is the sole operand of the generated call.
+  Value *opArgs[] = {hlslOP->GetU32Const((unsigned)opcode)};
+  IRBuilder<> Builder(CI);
+  return TrivialDxilOperation(opcode, opArgs, RetTy, RetTy, hlslOP, Builder);
+}
+
 Value *TranslateGetRTSamplePos(CallInst *CI, IntrinsicOp IOP, OP::OpCode op,
                                HLOperationLowerHelper &helper,  HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
   hlsl::OP *hlslOP = &helper.hlslOP;
@@ -2217,8 +2235,8 @@ Value *TranslateGetDimensions(CallInst *CI, IntrinsicOp IOP, OP::OpCode op,
     // Set stride.
     Value *stridePtr = CI->getArgOperand(widthOpIdx + 1);
     const DataLayout &DL = helper.dataLayout;
-    Value *buf = CI->getArgOperand(HLOperandIndex::kHandleOpIdx);
-    Type *bufTy = buf->getType();
+    Value *handle = CI->getArgOperand(HLOperandIndex::kHandleOpIdx);
+    Type *bufTy = pObjHelper->GetResourceType(handle);
     Type *bufRetTy = bufTy->getStructElementType(0);
     unsigned stride = DL.getTypeAllocSize(bufRetTy);
     Builder.CreateStore(hlslOP->GetU32Const(stride), stridePtr);
@@ -3083,46 +3101,34 @@ void Make64bitResultForLoad(Type *EltTy, ArrayRef<Value *> resultElts32,
 }
 
 static Constant *GetRawBufferMaskForETy(Type *Ty, unsigned NumComponents, hlsl::OP *OP) {
-  Type *ETy = Ty->getScalarType();
-  bool is64 = ETy->isDoubleTy() || ETy == Type::getInt64Ty(ETy->getContext());
   unsigned mask = 0;
-  if (is64) {
-    switch (NumComponents) {
-    case 0:
-      break;
-    case 1:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
-      break;
-    case 2:
-      mask = DXIL::kCompMask_All;
-      break;
-    default:
-      DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
-    }
-  }
-  else {
-    switch (NumComponents) {
-    case 0:
-      break;
-    case 1:
-      mask = DXIL::kCompMask_X;
-      break;
-    case 2:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
-      break;
-    case 3:
-      mask = DXIL::kCompMask_X | DXIL::kCompMask_Y | DXIL::kCompMask_Z;
-      break;
-    case 4:
-      mask = DXIL::kCompMask_All;
-      break;
-    default:
-      DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
-    }
+
+  switch (NumComponents) {
+  case 0:
+    break;
+  case 1:
+    mask = DXIL::kCompMask_X;
+    break;
+  case 2:
+    mask = DXIL::kCompMask_X | DXIL::kCompMask_Y;
+    break;
+  case 3:
+    mask = DXIL::kCompMask_X | DXIL::kCompMask_Y | DXIL::kCompMask_Z;
+    break;
+  case 4:
+    mask = DXIL::kCompMask_All;
+    break;
+  default:
+    DXASSERT(false, "Cannot load more than 2 components for 64bit types.");
   }
   return OP->GetI8Const(mask);
 }
 
+void GenerateStructBufLd(Value *handle, Value *bufIdx, Value *offset,
+  Value *status, Type *EltTy,
+  MutableArrayRef<Value *> resultElts, hlsl::OP *OP,
+  IRBuilder<> &Builder, unsigned NumComponents, Constant *alignment);
+
 void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
                    IRBuilder<> &Builder, hlsl::OP *OP, const DataLayout &DL) {
 
@@ -3140,8 +3146,26 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
   Type *doubleTy = Builder.getDoubleTy();
   Type *EltTy = Ty->getScalarType();
   Constant *Alignment = OP->GetI32Const(OP->GetAllocSizeForType(EltTy));
+  unsigned numComponents = 1;
+  if (Ty->isVectorTy()) {
+    numComponents = Ty->getVectorNumElements();
+  }
+
+  if (RK == HLResource::Kind::StructuredBuffer) {
+    // Basic type case for StructuredBuffer::Load()
+    Value *ResultElts[4];
+    GenerateStructBufLd(helper.handle, helper.addr, OP->GetU32Const(0),
+      helper.status, EltTy, ResultElts, OP, Builder, numComponents, Alignment);
+    Value *retValNew = ScalarizeElements(Ty, ResultElts, Builder);
+    helper.retVal->replaceAllUsesWith(retValNew);
+    helper.retVal = retValNew;
+    return;
+  }
+
+  bool isTyped = opcode == OP::OpCode::TextureLoad ||
+                 RK == DxilResource::Kind::TypedBuffer;
   bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (is64) {
+  if (is64 && isTyped) {
     EltTy = i32Ty;
   }
 
@@ -3207,35 +3231,21 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
     // elementOffset, mask, alignment
     loadArgs.emplace_back(undefI);
     Type *rtnTy = helper.retVal->getType();
-    unsigned numComponents = 1;
-    if (VectorType *VTy = dyn_cast<VectorType>(rtnTy)) {
-      rtnTy = VTy->getElementType();
-      numComponents = VTy->getNumElements();
-    }
     loadArgs.emplace_back(GetRawBufferMaskForETy(rtnTy, numComponents, OP));
     loadArgs.emplace_back(Alignment);
   }
   else if (RK == DxilResource::Kind::TypedBuffer) {
     loadArgs.emplace_back(undefI);
   }
-  else if (RK == DxilResource::Kind::StructuredBuffer) {
-    // elementOffset, mask, alignment
-    loadArgs.emplace_back(
-      OP->GetU32Const(0)); // For case use built-in types in structure buffer.
-    loadArgs.emplace_back(OP->GetU8Const(0)); // When is this case hit?
-    loadArgs.emplace_back(Alignment);
-  }
+
   Value *ResRet =
       Builder.CreateCall(F, loadArgs, OP->GetOpCodeName(opcode));
 
   Value *retValNew = nullptr;
-  if (!is64) {
+  if (!is64 || !isTyped) {
     retValNew = ScalarizeResRet(Ty, ResRet, Builder);
   } else {
-    unsigned size = 1;
-    if (Ty->isVectorTy()) {
-      size = Ty->getVectorNumElements();
-    }
+    unsigned size = numComponents;
     DXASSERT(size <= 2, "typed buffer only allow 4 dwords");
     EltTy = Ty->getScalarType();
     Value *Elts[2];
@@ -3334,13 +3344,16 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     break;
   }
 
+  bool isTyped = opcode == OP::OpCode::TextureStore ||
+                 RK == DxilResource::Kind::TypedBuffer;
+
   Type *i32Ty = Builder.getInt32Ty();
   Type *i64Ty = Builder.getInt64Ty();
   Type *doubleTy = Builder.getDoubleTy();
   Type *EltTy = Ty->getScalarType();
   Constant *Alignment = OP->GetI32Const(OP->GetAllocSizeForType(EltTy));
   bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (is64) {
+  if (is64 && isTyped) {
     EltTy = i32Ty;
   }
 
@@ -3388,8 +3401,6 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   }
 
   // values
-  bool isTyped = opcode == OP::OpCode::TextureStore ||
-                 RK == DxilResource::Kind::TypedBuffer;
   uint8_t mask = 0;
   if (Ty->isVectorTy()) {
     unsigned vecSize = Ty->getVectorNumElements();
@@ -3424,7 +3435,7 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     }
   }
 
-  if (is64) {
+  if (is64 && isTyped) {
     unsigned size = 1;
     if (Ty->isVectorTy()) {
       size = Ty->getVectorNumElements();
@@ -4242,6 +4253,158 @@ Value *TranslateProcessTessFactors(CallInst *CI, IntrinsicOp IOP, OP::OpCode opc
 
 }
 
+// Ray Tracing.
+namespace {
+// Lowers the three-operand HL report-intersection call CI to the DXIL
+// operation `opcode`, overloaded on the type of the attribute operand. The
+// call is emitted before CI with operands (opcode, THit, HitKind, Attr).
+Value *TranslateReportIntersection(CallInst *CI, IntrinsicOp IOP,
+                                   OP::OpCode opcode,
+                                   HLOperationLowerHelper &helper,
+                                   HLObjectOperationLowerHelper *pObjHelper,
+                                   bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+
+  // Source operands of the HL call.
+  Value *hitT = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx);
+  Value *hitKind = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx);
+  Value *attributes = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx);
+  Value *opcodeArg = hlslOP->GetU32Const(static_cast<unsigned>(opcode));
+
+  // The DXIL function is overloaded on the attribute operand's type.
+  Function *dxilFunc = hlslOP->GetOpFunc(opcode, attributes->getType());
+
+  IRBuilder<> Builder(CI);
+  return Builder.CreateCall(dxilFunc,
+                            {opcodeArg, hitT, hitKind, attributes});
+}
+
+// Lowers the two-operand HL call-shader call CI to the DXIL operation
+// `opcode`, overloaded on the type of the parameter operand. The call is
+// emitted before CI with operands (opcode, ShaderIndex, Parameter).
+Value *TranslateCallShader(CallInst *CI, IntrinsicOp IOP,
+                                   OP::OpCode opcode,
+                                   HLOperationLowerHelper &helper,
+                                   HLObjectOperationLowerHelper *pObjHelper,
+                                   bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+
+  // Source operands of the HL call.
+  Value *shaderIdx = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx);
+  Value *param = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
+  Value *opcodeArg = hlslOP->GetU32Const(static_cast<unsigned>(opcode));
+
+  // The DXIL function is overloaded on the parameter operand's type.
+  Function *dxilFunc = hlslOP->GetOpFunc(opcode, param->getType());
+
+  IRBuilder<> Builder(CI);
+  return Builder.CreateCall(dxilFunc, {opcodeArg, shaderIdx, param});
+}
+
+// Lowers the HL TraceRay call CI to the DXIL operation `opcode`. The leading
+// arguments are forwarded unchanged, the RayDesc pointer is expanded into its
+// eight scalar components (Origin.xyz, TMin, Direction.xyz, TMax) and the
+// payload is passed last. The DXIL function is overloaded on the payload
+// operand's type.
+Value *TranslateTraceRay(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                         HLOperationLowerHelper &helper,
+                         HLObjectOperationLowerHelper *pObjHelper,
+                         bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+
+  Value *rayDesc = CI->getArgOperand(HLOperandIndex::kTraceRayRayDescOpIdx);
+  Value *payLoad = CI->getArgOperand(HLOperandIndex::kTraceRayPayLoadOpIdx);
+
+  Value *Args[DXIL::OperandIndex::kTraceRayNumOp];
+  Args[0] = hlslOP->GetU32Const(static_cast<unsigned>(opcode));
+  // Arguments that precede the RayDesc are passed through as they are.
+  for (unsigned i = 1; i < HLOperandIndex::kTraceRayRayDescOpIdx; i++) {
+    Args[i] = CI->getArgOperand(i);
+  }
+
+  IRBuilder<> Builder(CI);
+  // struct RayDesc
+  //{
+  //    float3 Origin;
+  //    float  TMin;
+  //    float3 Direction;
+  //    float  TMax;
+  //};
+  Value *zeroIdx = hlslOP->GetU32Const(0);
+  unsigned index = DXIL::OperandIndex::kTraceRayRayDescOpIdx;
+
+  // Loads field `fieldIdx` of the RayDesc struct.
+  auto loadField = [&](unsigned fieldIdx) -> Value * {
+    Value *fieldPtr =
+        Builder.CreateGEP(rayDesc, {zeroIdx, hlslOP->GetU32Const(fieldIdx)});
+    return Builder.CreateLoad(fieldPtr);
+  };
+  // Appends the three components of a float3 to Args.
+  auto appendVec3 = [&](Value *vec) {
+    for (uint64_t comp = 0; comp < 3; ++comp)
+      Args[index++] = Builder.CreateExtractElement(vec, comp);
+  };
+
+  appendVec3(loadField(0)); // Origin
+  Value *tmin = loadField(1);
+  Args[index++] = tmin;
+  appendVec3(loadField(2)); // Direction
+  Value *tmax = loadField(3);
+  Args[index++] = tmax;
+
+  Args[DXIL::OperandIndex::kTraceRayPayloadOpIdx] = payLoad;
+
+  Function *F = hlslOP->GetOpFunc(opcode, payLoad->getType());
+  return Builder.CreateCall(F, Args);
+}
+
+Value *TranslateNoArgVectorOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                         HLOperationLowerHelper &helper,
+                         HLObjectOperationLowerHelper *pObjHelper,
+                         bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+  VectorType *Ty = cast<VectorType>(CI->getType());
+  uint8_t vals[] = {0,1,2,3};
+  Constant *src = ConstantDataVector::get(CI->getContext(), vals);
+  Value *retVal = TrivialDxilOperation(opcode, {nullptr, src}, Ty, CI, hlslOP);
+  return retVal;
+}
+
+Value *TranslateNoArgMatrix3x4Operation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                         HLOperationLowerHelper &helper,
+                         HLObjectOperationLowerHelper *pObjHelper,
+                         bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+  VectorType *Ty = cast<VectorType>(CI->getType());
+  uint32_t rVals[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2};
+  Constant *rows = ConstantDataVector::get(CI->getContext(), rVals);
+  uint8_t cVals[] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
+  Constant *cols = ConstantDataVector::get(CI->getContext(), cVals);
+  Value *retVal =
+      TrivialDxilOperation(opcode, {nullptr, rows, cols}, Ty, CI, hlslOP);
+  return retVal;
+}
+
+Value *TranslateNoArgTransposedMatrix3x4Operation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                                                  HLOperationLowerHelper &helper,
+                                                  HLObjectOperationLowerHelper *pObjHelper,
+                                                  bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+  VectorType *Ty = cast<VectorType>(CI->getType());
+  uint32_t rVals[] = { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 };
+  Constant *rows = ConstantDataVector::get(CI->getContext(), rVals);
+  uint8_t cVals[] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 };
+  Constant *cols = ConstantDataVector::get(CI->getContext(), cVals);
+  Value *retVal =
+    TrivialDxilOperation(opcode, { nullptr, rows, cols }, Ty, CI, hlslOP);
+  return retVal;
+}
+
+// Lowers a no-argument, no-result HL intrinsic via TrivialNoArgOperation and
+// makes sure the resulting call sits immediately before the block's return,
+// so that output copies placed ahead of the return run before it.
+Value *TranslateNoArgNoReturnPreserveOutput(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+  HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
+  Instruction *pResult = cast<Instruction>(
+    TrivialNoArgOperation(CI, IOP, opcode, helper, pObjHelper, Translated));
+  // HL intrinsic must have had a return injected just after the call.
+  // SROA_Parameter_HLSL will copy from alloca to output just before each return.
+  // If anything ended up between the call and the return, move the call so
+  // that it comes after the copy and just before the return.
+  if (!isa<ReturnInst>(pResult->getNextNode())) {
+    ReturnInst *RetI = cast<ReturnInst>(pResult->getParent()->getTerminator());
+    pResult->removeFromParent();
+    pResult->insertBefore(RetI);
+  }
+  return pResult;
+}
+
+} // namespace
+
 // Lower table.
 namespace {
 
@@ -4275,13 +4438,17 @@ Value *StreamOutputLower(CallInst *CI, IntrinsicOp IOP, DXIL::OpCode opcode,
 
 // This table has to match IntrinsicOp orders
 IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] = {
+    {IntrinsicOp::IOP_AcceptHitAndEndSearch, TranslateNoArgNoReturnPreserveOutput, DXIL::OpCode::AcceptHitAndEndSearch},
     {IntrinsicOp::IOP_AddUint64,  TranslateAddUint64,  DXIL::OpCode::UAddc},
     {IntrinsicOp::IOP_AllMemoryBarrier, TrivialBarrier, DXIL::OpCode::Barrier},
     {IntrinsicOp::IOP_AllMemoryBarrierWithGroupSync, TrivialBarrier, DXIL::OpCode::Barrier},
+    {IntrinsicOp::IOP_CallShader, TranslateCallShader, DXIL::OpCode::CallShader},
     {IntrinsicOp::IOP_CheckAccessFullyMapped, TranslateCheckAccess, DXIL::OpCode::CheckAccessFullyMapped},
     {IntrinsicOp::IOP_D3DCOLORtoUBYTE4, TranslateD3DColorToUByte4, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_DeviceMemoryBarrier, TrivialBarrier, DXIL::OpCode::Barrier},
     {IntrinsicOp::IOP_DeviceMemoryBarrierWithGroupSync, TrivialBarrier, DXIL::OpCode::Barrier},
+    {IntrinsicOp::IOP_DispatchRaysDimensions, TranslateNoArgVectorOperation, DXIL::OpCode::DispatchRaysDimensions},
+    {IntrinsicOp::IOP_DispatchRaysIndex, TranslateNoArgVectorOperation, DXIL::OpCode::DispatchRaysIndex},
     {IntrinsicOp::IOP_EvaluateAttributeAtSample, TranslateEvalSample, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_EvaluateAttributeCentroid, TranslateEvalCentroid, DXIL::OpCode::EvalCentroid},
     {IntrinsicOp::IOP_EvaluateAttributeSnapped, TranslateEvalSnapped, DXIL::OpCode::NumOpCodes},
@@ -4290,6 +4457,10 @@ IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] =
     {IntrinsicOp::IOP_GetRenderTargetSamplePosition, TranslateGetRTSamplePos, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_GroupMemoryBarrier, TrivialBarrier, DXIL::OpCode::Barrier},
     {IntrinsicOp::IOP_GroupMemoryBarrierWithGroupSync, TrivialBarrier, DXIL::OpCode::Barrier},
+    {IntrinsicOp::IOP_HitKind, TrivialNoArgWithRetOperation, DXIL::OpCode::HitKind},
+    {IntrinsicOp::IOP_IgnoreHit, TranslateNoArgNoReturnPreserveOutput, DXIL::OpCode::IgnoreHit},
+    {IntrinsicOp::IOP_InstanceID, TrivialNoArgWithRetOperation, DXIL::OpCode::InstanceID},
+    {IntrinsicOp::IOP_InstanceIndex, TrivialNoArgWithRetOperation, DXIL::OpCode::InstanceIndex},
     {IntrinsicOp::IOP_InterlockedAdd, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedAnd, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedCompareExchange, TranslateIopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
@@ -4300,6 +4471,12 @@ IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] =
     {IntrinsicOp::IOP_InterlockedOr, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedXor, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_NonUniformResourceIndex, TranslateNonUniformResourceIndex, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_ObjectRayDirection, TranslateNoArgVectorOperation, DXIL::OpCode::ObjectRayDirection},
+    {IntrinsicOp::IOP_ObjectRayOrigin, TranslateNoArgVectorOperation, DXIL::OpCode::ObjectRayOrigin},
+    {IntrinsicOp::IOP_ObjectToWorld, TranslateNoArgMatrix3x4Operation, DXIL::OpCode::ObjectToWorld},
+    {IntrinsicOp::IOP_ObjectToWorld3x4, TranslateNoArgMatrix3x4Operation, DXIL::OpCode::ObjectToWorld},
+    {IntrinsicOp::IOP_ObjectToWorld4x3, TranslateNoArgTransposedMatrix3x4Operation, DXIL::OpCode::ObjectToWorld},
+    {IntrinsicOp::IOP_PrimitiveIndex, TrivialNoArgWithRetOperation, DXIL::OpCode::PrimitiveIndex},
     {IntrinsicOp::IOP_Process2DQuadTessFactorsAvg, TranslateProcessTessFactors, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_Process2DQuadTessFactorsMax, TranslateProcessTessFactors, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_Process2DQuadTessFactorsMin, TranslateProcessTessFactors, DXIL::OpCode::NumOpCodes},
@@ -4314,6 +4491,11 @@ IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] =
     {IntrinsicOp::IOP_QuadReadAcrossX, TranslateQuadReadAcross, DXIL::OpCode::QuadOp},
     {IntrinsicOp::IOP_QuadReadAcrossY, TranslateQuadReadAcross, DXIL::OpCode::QuadOp},
     {IntrinsicOp::IOP_QuadReadLaneAt,  TranslateQuadReadLaneAt, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_RayFlags, TrivialNoArgWithRetOperation, DXIL::OpCode::RayFlags},
+    {IntrinsicOp::IOP_RayTCurrent, TrivialNoArgWithRetOperation, DXIL::OpCode::RayTCurrent},
+    {IntrinsicOp::IOP_RayTMin, TrivialNoArgWithRetOperation, DXIL::OpCode::RayTMin},
+    {IntrinsicOp::IOP_ReportHit, TranslateReportIntersection, DXIL::OpCode::ReportHit},
+    {IntrinsicOp::IOP_TraceRay, TranslateTraceRay, DXIL::OpCode::TraceRay},
     {IntrinsicOp::IOP_WaveActiveAllEqual, TranslateWaveAllEqual, DXIL::OpCode::WaveActiveAllEqual},
     {IntrinsicOp::IOP_WaveActiveAllTrue, TranslateWaveA2B, DXIL::OpCode::WaveAllTrue},
     {IntrinsicOp::IOP_WaveActiveAnyTrue, TranslateWaveA2B, DXIL::OpCode::WaveAnyTrue},
@@ -4334,6 +4516,11 @@ IntrinsicLower gLowerTable[static_cast<unsigned>(IntrinsicOp::Num_Intrinsics)] =
     {IntrinsicOp::IOP_WavePrefixSum, TranslateWaveA2A, DXIL::OpCode::WavePrefixOp},
     {IntrinsicOp::IOP_WaveReadLaneAt, TranslateWaveReadLaneAt, DXIL::OpCode::WaveReadLaneAt},
     {IntrinsicOp::IOP_WaveReadLaneFirst, TranslateWaveReadLaneFirst, DXIL::OpCode::WaveReadLaneFirst},
+    {IntrinsicOp::IOP_WorldRayDirection, TranslateNoArgVectorOperation, DXIL::OpCode::WorldRayDirection},
+    {IntrinsicOp::IOP_WorldRayOrigin, TranslateNoArgVectorOperation, DXIL::OpCode::WorldRayOrigin},
+    {IntrinsicOp::IOP_WorldToObject, TranslateNoArgMatrix3x4Operation, DXIL::OpCode::WorldToObject},
+    {IntrinsicOp::IOP_WorldToObject3x4, TranslateNoArgMatrix3x4Operation, DXIL::OpCode::WorldToObject},
+    {IntrinsicOp::IOP_WorldToObject4x3, TranslateNoArgTransposedMatrix3x4Operation, DXIL::OpCode::WorldToObject},
     {IntrinsicOp::IOP_abort, EmptyLower, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_abs, TransalteAbs, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_acos, TrivialUnaryOperation, DXIL::OpCode::Acos},
@@ -5406,14 +5593,27 @@ void TranslateCBGepLegacy(GetElementPtrInst *GEP, Value *handle,
       // Indexing on vector.
       if (bImmIdx) {
         unsigned tempOffset = size * immIdx;
-        unsigned channelInc = tempOffset >> 2;
-        DXASSERT((channel + channelInc)<=4, "vector should not cross cb register");
-        channel += channelInc;
-        if (channel == 4) {
-          // Get to another row.
-          // Update index and channel.
-          channel = 0;
-          legacyIndex = Builder.CreateAdd(legacyIndex, Builder.getInt32(1));
+        if (size == 2) { // 16-bit types
+          unsigned channelInc = tempOffset >> 1;
+          DXASSERT((channel + channelInc) <= 8, "vector should not cross cb register (8x16bit)");
+          channel += channelInc;
+          if (channel == 8) {
+            // Get to another row.
+            // Update index and channel.
+            channel = 0;
+            legacyIndex = Builder.CreateAdd(legacyIndex, Builder.getInt32(1));
+          }
+        }
+        else {
+          unsigned channelInc = tempOffset >> 2;
+          DXASSERT((channel + channelInc) <= 4, "vector should not cross cb register (8x32bit)");
+          channel += channelInc;
+          if (channel == 4) {
+            // Get to another row.
+            // Update index and channel.
+            channel = 0;
+            legacyIndex = Builder.CreateAdd(legacyIndex, Builder.getInt32(1));
+          }
         }
       } else {
         Type *EltTy = GEPIt->getVectorElementType();
@@ -5551,57 +5751,23 @@ void GenerateStructBufLd(Value *handle, Value *bufIdx, Value *offset,
   DXASSERT(resultElts.size() <= 4,
            "buffer load cannot load more than 4 values");
 
+  Function *dxilF = OP->GetOpFunc(opcode, EltTy);
+  Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents, OP);
+  Value *Args[] = {OP->GetU32Const((unsigned)opcode),
+                   handle,
+                   bufIdx,
+                   offset,
+                   mask,
+                   alignment};
+  Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
 
-  Type *i64Ty = Builder.getInt64Ty();
-  Type *doubleTy = Builder.getDoubleTy();
-  bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-
-  if (!is64) {
-    Function *dxilF = OP->GetOpFunc(opcode, EltTy);
-    Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents, OP);
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode), handle, bufIdx, offset, mask, alignment};
-    Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-
-    for (unsigned i = 0; i < resultElts.size(); i++) {
-      resultElts[i] = Builder.CreateExtractValue(Ld, i);
-    }
-
-    // status
-    UpdateStatus(Ld, status, Builder, OP);
-    return;
-  } else {
-    // 64 bit.
-    Function *dxilF = OP->GetOpFunc(opcode, Builder.getInt32Ty());
-    Constant *mask = GetRawBufferMaskForETy(EltTy, NumComponents < 2 ? NumComponents : 2, OP);
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode), handle, bufIdx, offset, mask, alignment};
-    Value *Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-    Value *resultElts32[8];
-    unsigned size = resultElts.size();
-    unsigned eltBase = 0;
-    for (unsigned i = 0; i < size; i++) {
-      if (i == 2) {
-        // Update offset 4 by 4 bytes.
-        Args[DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx] =
-            Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-        // Update Mask
-        Args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
-          GetRawBufferMaskForETy(EltTy, NumComponents < 3 ? 0 : NumComponents - 2, OP);
-        Ld = Builder.CreateCall(dxilF, Args, OP::GetOpCodeName(opcode));
-        eltBase = 4;
-      }
-      unsigned resBase = 2 * i;
-      resultElts32[resBase] = Builder.CreateExtractValue(Ld, resBase - eltBase);
-      resultElts32[resBase + 1] =
-          Builder.CreateExtractValue(Ld, resBase + 1 - eltBase);
-    }
-
-    Make64bitResultForLoad(EltTy, resultElts32, size, resultElts, OP, Builder);
-
-    // status
-    UpdateStatus(Ld, status, Builder, OP);
-
-    return;
+  for (unsigned i = 0; i < resultElts.size(); i++) {
+    resultElts[i] = Builder.CreateExtractValue(Ld, i);
   }
+
+  // status
+  UpdateStatus(Ld, status, Builder, OP);
+  return;
 }
 
 void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
@@ -5609,85 +5775,19 @@ void GenerateStructBufSt(Value *handle, Value *bufIdx, Value *offset,
                          ArrayRef<Value *> vals, uint8_t mask, Constant *alignment) {
   OP::OpCode opcode = OP::OpCode::RawBufferStore;
   DXASSERT(vals.size() == 4, "buffer store need 4 values");
-  Type *i64Ty = Builder.getInt64Ty();
-  Type *doubleTy = Builder.getDoubleTy();
-  bool is64 = EltTy == i64Ty || EltTy == doubleTy;
-  if (!is64) {
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                     handle,
-                     bufIdx,
-                     offset,
-                     vals[0],
-                     vals[1],
-                     vals[2],
-                     vals[3],
-                     OP->GetU8Const(mask),
-                     alignment};
-    Function *dxilF = OP->GetOpFunc(opcode, EltTy);
-    Builder.CreateCall(dxilF, Args);
-  } else {
-    Type *i32Ty = Builder.getInt32Ty();
-    Function *dxilF = OP->GetOpFunc(opcode, i32Ty);
-
-    Value *undefI32 = UndefValue::get(i32Ty);
-    Value *vals32[8] = {undefI32, undefI32, undefI32, undefI32,
-                        undefI32, undefI32, undefI32, undefI32};
-
-    unsigned maskLo = 0;
-    unsigned maskHi = 0;
-    unsigned size = 0;
-    switch (mask) {
-    case 1:
-      maskLo = 3;
-      size = 1;
-      break;
-    case 3:
-      maskLo = 15;
-      size = 2;
-      break;
-    case 7:
-      maskLo = 15;
-      maskHi = 3;
-      size = 3;
-      break;
-    case 15:
-      maskLo = 15;
-      maskHi = 15;
-      size = 4;
-      break;
-    default:
-      DXASSERT(0, "invalid mask");
-    }
 
-    Split64bitValForStore(EltTy, vals, size, vals32, OP, Builder);
-
-    Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                     handle,
-                     bufIdx,
-                     offset,
-                     vals32[0],
-                     vals32[1],
-                     vals32[2],
-                     vals32[3],
-                     OP->GetU8Const(maskLo),
-                     alignment};
-    Builder.CreateCall(dxilF, Args);
-    if (maskHi) {
-      // Update offset 4 by 4 bytes.
-      offset = Builder.CreateAdd(offset, Builder.getInt32(4 * 4));
-      Value *Args[] = {OP->GetU32Const((unsigned)opcode),
-                       handle,
-                       bufIdx,
-                       offset,
-                       vals32[4],
-                       vals32[5],
-                       vals32[6],
-                       vals32[7],
-                       OP->GetU8Const(maskHi),
-                       alignment};
-      Builder.CreateCall(dxilF, Args);
-    }
-  }
+  Value *Args[] = {OP->GetU32Const((unsigned)opcode),
+                   handle,
+                   bufIdx,
+                   offset,
+                   vals[0],
+                   vals[1],
+                   vals[2],
+                   vals[3],
+                   OP->GetU8Const(mask),
+                   alignment};
+  Function *dxilF = OP->GetOpFunc(opcode, EltTy);
+  Builder.CreateCall(dxilF, Args);
 }
 
 Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
@@ -5696,7 +5796,8 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
                                bool colMajor, const DataLayout &DL) {
   unsigned col, row;
   Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
-  Constant* alignment = OP->GetI32Const(DL.getTypeAllocSize(EltTy));
+  unsigned  EltSize = DL.getTypeAllocSize(EltTy);
+  Constant* alignment = OP->GetI32Const(EltSize);
 
   Value *offset = baseOffset;
   if (baseOffset == nullptr)
@@ -5711,7 +5812,7 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     GenerateStructBufLd(handle, bufIdx, offset, status, EltTy, ResultElts, OP, Builder, 3, alignment);
     for (unsigned i = 0; i < rest; i++)
       elts[i] = ResultElts[i];
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * rest));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(EltSize * rest));
   }
 
   for (unsigned i = rest; i < matSize; i += 4) {
@@ -5723,7 +5824,7 @@ Value *TranslateStructBufMatLd(Type *matType, IRBuilder<> &Builder,
     elts[i + 3] = ResultElts[3];
 
     // Update offset by 4 elements (4 * EltSize bytes).
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * 4));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 
   return HLMatrixLower::BuildVector(EltTy, col * row, elts, Builder);
@@ -5734,7 +5835,8 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                              Value *val, bool colMajor, const DataLayout &DL) {
   unsigned col, row;
   Type *EltTy = HLMatrixLower::GetMatrixInfo(matType, col, row);
-  Constant *Alignment = OP->GetI32Const(DL.getTypeAllocSize(EltTy));
+  unsigned EltSize = DL.getTypeAllocSize(EltTy);
+  Constant *Alignment = OP->GetI32Const(EltSize);
   Value *offset = baseOffset;
   if (baseOffset == nullptr)
     offset = OP->GetU32Const(0);
@@ -5770,7 +5872,7 @@ void TranslateStructBufMatSt(Type *matType, IRBuilder<> &Builder, Value *handle,
                         {elts[i], elts[i + 1], elts[i + 2], elts[i + 3]}, mask,
                         Alignment);
     // Update offset by 4 elements (4 * EltSize bytes).
-    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * 4));
+    offset = Builder.CreateAdd(offset, OP->GetU32Const(4 * EltSize));
   }
 }
 
@@ -6029,7 +6131,6 @@ void TranslateStructBufSubscriptUser(Instruction *user, Value *handle,
       }
       userCall->eraseFromParent();
     } else if (group == HLOpcodeGroup::HLMatLoadStore)
-      // TODO: support 64 bit.
       TranslateStructBufMatLdSt(userCall, handle, OP, status, bufIdx,
                                 baseOffset, DL);
     else if (group == HLOpcodeGroup::HLSubscript) {
@@ -6507,8 +6608,8 @@ void TranslateHLSubscript(CallInst *CI, HLSubscriptOpcode opcode,
         TranslateStructBufSubscript(CI, handle, /*status*/ nullptr, hlslOP,
                                     helper.dataLayout);
         // Clear offset for typed buf.
-        for (auto User : handle->users()) {
-          CallInst *CI = cast<CallInst>(User);
+        for (auto User = handle->user_begin(); User != handle->user_end(); ) {
+          CallInst *CI = cast<CallInst>(*(User++));
           // Skip not lowered HL functions.
           if (hlsl::GetHLOpcodeGroupByName(CI->getCalledFunction()) != HLOpcodeGroup::NotHL)
             continue;
@@ -6593,6 +6694,36 @@ void TranslateSubscriptOperation(Function *F, HLOperationLowerHelper &helper,  H
   }
 }
 
+// Create BitCast if ptr, otherwise, create alloca of new type, write to bitcast of alloca, and return load from alloca
+// If bOrigAllocaTy is true: create alloca of old type instead, write to alloca, and return load from bitcast of alloca
+static Instruction *BitCastValueOrPtr(Value* V, Instruction *Insert, Type *Ty, bool bOrigAllocaTy = false, const Twine &Name = "") {
+  IRBuilder<> Builder(Insert);
+  if (Ty->isPointerTy()) {
+    // If pointer, we can bitcast directly
+    return cast<Instruction>(Builder.CreateBitCast(V, Ty, Name));
+  } else {
+    // If value, we have to alloca, store to bitcast ptr, and load
+    IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(Insert));
+    Type *allocaTy = bOrigAllocaTy ? V->getType() : Ty;
+    Type *otherTy = bOrigAllocaTy ? Ty : V->getType();
+    Instruction *allocaInst = AllocaBuilder.CreateAlloca(allocaTy);
+    Instruction *bitCast = cast<Instruction>(Builder.CreateBitCast(allocaInst, otherTy->getPointerTo()));
+    Builder.CreateStore(V, bOrigAllocaTy ? allocaInst : bitCast);
+    return Builder.CreateLoad(bOrigAllocaTy ? bitCast : allocaInst, Name);
+  }
+}
+
+static Instruction *CreateTransposeShuffle(IRBuilder<> &Builder, Value *vecVal, unsigned toRows, unsigned toCols) {
+  SmallVector<int, 16> castMask(toCols * toRows);
+  unsigned idx = 0;
+  for (unsigned r = 0; r < toRows; r++)
+    for (unsigned c = 0; c < toCols; c++)
+      castMask[idx++] = c * toRows + r;
+  return cast<Instruction>(
+    Builder.CreateShuffleVector(vecVal, vecVal, castMask));
+}
+
+
 void TranslateHLBuiltinOperation(Function *F, HLOperationLowerHelper &helper,
                                hlsl::HLOpcodeGroup group, HLObjectOperationLowerHelper *pObjHelper) {
   if (group == HLOpcodeGroup::HLIntrinsic) {
@@ -6622,14 +6753,78 @@ void TranslateHLBuiltinOperation(Function *F, HLOperationLowerHelper &helper,
       Type *PtrTy =
           F->getFunctionType()->getParamType(HLOperandIndex::kMatLoadPtrOpIdx);
 
-      if (PtrTy->getPointerAddressSpace() == DXIL::kTGSMAddrSpace ||
-          // TODO: use DeviceAddressSpace for SRV/UAV and CBufferAddressSpace
-          // for CBuffer.
-          PtrTy->getPointerAddressSpace() == DXIL::kDefaultAddrSpace) {
-        // Translate matrix into vector of array for share memory or local
+      if (PtrTy->getPointerAddressSpace() == DXIL::kTGSMAddrSpace) {
+        // Translate matrix into vector of array for shared memory
         // variable should be done in HLMatrixLowerPass.
         if (!F->user_empty())
           F->getContext().emitError("Fail to lower matrix load/store.");
+      } else if (PtrTy->getPointerAddressSpace() == DXIL::kDefaultAddrSpace) {
+        // Default address space may be function argument in lib target
+        if (!F->user_empty()) {
+          for (auto U = F->user_begin(); U != F->user_end();) {
+            Value *User = *(U++);
+            if (!isa<Instruction>(User))
+              continue;
+            // must be call inst
+            CallInst *CI = cast<CallInst>(User);
+            IRBuilder<> Builder(CI);
+            HLMatLoadStoreOpcode opcode = static_cast<HLMatLoadStoreOpcode>(hlsl::GetHLOpcode(CI));
+            switch (opcode) {
+            case HLMatLoadStoreOpcode::ColMatStore:
+            case HLMatLoadStoreOpcode::RowMatStore: {
+              Value *vecVal = CI->getArgOperand(HLOperandIndex::kMatStoreValOpIdx);
+              Value *matPtr = CI->getArgOperand(HLOperandIndex::kMatStoreDstPtrOpIdx);
+              Value *castPtr = Builder.CreateBitCast(matPtr, vecVal->getType()->getPointerTo());
+              Builder.CreateStore(vecVal, castPtr);
+              CI->eraseFromParent();
+            } break;
+            case HLMatLoadStoreOpcode::ColMatLoad:
+            case HLMatLoadStoreOpcode::RowMatLoad: {
+              Value *matPtr = CI->getArgOperand(HLOperandIndex::kMatLoadPtrOpIdx);
+              Value *castPtr = Builder.CreateBitCast(matPtr, CI->getType()->getPointerTo());
+              Value *vecVal = Builder.CreateLoad(castPtr);
+              CI->replaceAllUsesWith(vecVal);
+              CI->eraseFromParent();
+            } break;
+            }
+          }
+        }
+      }
+    } else if (group == HLOpcodeGroup::HLCast) {
+      // HLCast may be used on matrix value function argument in lib target
+      if (!F->user_empty()) {
+        for (auto U = F->user_begin(); U != F->user_end();) {
+          Value *User = *(U++);
+          if (!isa<Instruction>(User))
+            continue;
+          // must be call inst
+          CallInst *CI = cast<CallInst>(User);
+          IRBuilder<> Builder(CI);
+          HLCastOpcode opcode = static_cast<HLCastOpcode>(hlsl::GetHLOpcode(CI));
+          bool bTranspose = false;
+          bool bColDest = false;
+          switch (opcode) {
+          case HLCastOpcode::RowMatrixToColMatrix:
+            bColDest = true;
+          case HLCastOpcode::ColMatrixToRowMatrix:
+            bTranspose = true;
+          case HLCastOpcode::ColMatrixToVecCast:
+          case HLCastOpcode::RowMatrixToVecCast: {
+            Value *matVal = CI->getArgOperand(HLOperandIndex::kInitFirstArgOpIdx);
+            Value *vecVal = BitCastValueOrPtr(matVal, CI, CI->getType(),
+              /*bOrigAllocaTy*/false,
+              matVal->getName());
+            if (bTranspose) {
+              unsigned row, col;
+              HLMatrixLower::GetMatrixInfo(matVal->getType(), col, row);
+              if (bColDest) std::swap(row, col);
+              vecVal = CreateTransposeShuffle(Builder, vecVal, row, col);
+            }
+            CI->replaceAllUsesWith(vecVal);
+            CI->eraseFromParent();
+          } break;
+          }
+        }
       }
     } else if (group == HLOpcodeGroup::HLSubscript) {
       TranslateSubscriptOperation(F, helper, pObjHelper);
@@ -6667,7 +6862,6 @@ static void TranslateHLExtension(Function *F,
   }
 }
 
-
 namespace hlsl {
 
 void TranslateBuiltinOperations(
@@ -6683,11 +6877,11 @@ void TranslateBuiltinOperations(
 
   // generate dxil operation
   for (iplist<Function>::iterator F : M->getFunctionList()) {
+    if (F->user_empty())
+      continue;
     if (!F->isDeclaration()) {
       continue;
     }
-    if (F->user_empty())
-      continue;
     hlsl::HLOpcodeGroup group = hlsl::GetHLOpcodeGroup(F);
     if (group == HLOpcodeGroup::NotHL) {
       // Nothing to do.
@@ -6697,10 +6891,6 @@ void TranslateBuiltinOperations(
       TranslateHLExtension(F, extCodegenHelper, helper.hlslOP);
       continue;
     }
-    if (group == HLOpcodeGroup::HLCreateHandle) {
-      // Will lower in later pass.
-      continue;
-    }
     TranslateHLBuiltinOperation(F, helper, group, &objHelper);
   }
 }

+ 2 - 0
lib/HLSL/HLOperations.cpp

@@ -493,6 +493,8 @@ Function *GetOrCreateHLFunctionWithBody(Module &M, FunctionType *funcTy,
 
   SetHLFunctionAttribute(F, group, opcode);
 
+  F->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
+
   return F;
 }
 

+ 3 - 1
lib/HLSL/HLSignatureLower.cpp

@@ -19,6 +19,7 @@
 #include "dxc/HLSL/HLModule.h"
 #include "dxc/HLSL/HLMatrixLowerHelper.h"
 #include "dxc/HlslIntrinsicOp.h"
+#include "dxc/HLSL/DxilUtil.h"
 
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/DebugInfo.h"
@@ -537,6 +538,7 @@ Value *replaceLdWithLdInput(Function *loadInput, LoadInst *ldInst,
                             unsigned cols, MutableArrayRef<Value *> args,
                             bool bCast) {
   IRBuilder<> Builder(ldInst);
+  IRBuilder<> AllocaBuilder(dxilutil::FindAllocaInsertionPt(ldInst));
   Type *Ty = ldInst->getType();
   Type *EltTy = Ty->getScalarType();
   // Change i1 to i32 for load input.
@@ -577,7 +579,7 @@ Value *replaceLdWithLdInput(Function *loadInput, LoadInst *ldInst,
       // Vector indexing.
       // Load to array.
       ArrayType *AT = ArrayType::get(ldInst->getType(), cols);
-      Value *arrayVec = Builder.CreateAlloca(AT);
+      Value *arrayVec = AllocaBuilder.CreateAlloca(AT);
       Value *zeroIdx = Builder.getInt32(0);
 
       for (unsigned col = 0; col < cols; col++) {

+ 2 - 1
lib/LLVMBuild.txt

@@ -38,8 +38,9 @@ subdirectories =
  Target
  Transforms
  HLSL
+ DxrFallback
 
-; HLSL Change: remove LibDriver, LineEditor, add HLSL
+; HLSL Change: remove LibDriver, LineEditor, add HLSL, add DxrFallback
 
 [component_0]
 type = Group

+ 65 - 18
lib/Support/Windows/MSFileSystem.inc.cpp

@@ -53,33 +53,80 @@ namespace fs {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Per-thread MSFileSystem support.
 
-static DWORD g_FileSystemTls;
+namespace {
 
-error_code SetupPerThreadFileSystem() throw()
-{
-  assert(g_FileSystemTls == 0 && "otherwise this has already been initialized");
-  g_FileSystemTls = TlsAlloc();
-  if (g_FileSystemTls == TLS_OUT_OF_INDEXES)
-  {
-    g_FileSystemTls = 0;
-    return mapWindowsError(::GetLastError());
+template <typename _T>
+class ThreadLocalStorage {
+  DWORD m_Tls;
+  DWORD m_dwError;
+public:
+  ThreadLocalStorage() : m_Tls(TLS_OUT_OF_INDEXES), m_dwError(ERROR_NOT_READY) {}
+  DWORD Setup() {
+    if (m_Tls == TLS_OUT_OF_INDEXES) {
+      m_Tls = TlsAlloc();
+      m_dwError = (m_Tls == TLS_OUT_OF_INDEXES) ? ::GetLastError() : 0;
+    }
+    return m_dwError;
   }
-  return error_code();
+  void Cleanup() {
+    if (m_Tls != TLS_OUT_OF_INDEXES)
+      TlsFree(m_Tls);
+    m_Tls = TLS_OUT_OF_INDEXES;
+    m_dwError = ERROR_NOT_READY;
+  }
+  ~ThreadLocalStorage() { Cleanup(); }
+  _T GetValue() const {
+    if (m_Tls != TLS_OUT_OF_INDEXES)
+      return (_T)TlsGetValue(m_Tls);
+    else
+      return nullptr;
+  }
+  bool SetValue(_T value) {
+    if (m_Tls != TLS_OUT_OF_INDEXES) {
+      return TlsSetValue(m_Tls, (void*)value);
+    } else {
+      ::SetLastError(m_dwError);
+      return false;
+    }
+  }
+  // Retrieve error code if TlsAlloc() failed
+  DWORD GetError() const {
+    return m_dwError;
+  }
+  operator bool() const { return m_Tls != TLS_OUT_OF_INDEXES; }
+};
+
+static ThreadLocalStorage<MSFileSystemRef> g_PerThreadSystem;
+
 }
 
-void CleanupPerThreadFileSystem() throw()
-{
-  TlsFree(g_FileSystemTls);
-  g_FileSystemTls = 0;
+error_code GetFileSystemTlsStatus() throw() {
+  DWORD dwError = g_PerThreadSystem.GetError();
+  if (dwError)
+    return error_code(dwError, system_category());
+  else
+    return error_code();
 }
 
-MSFileSystemRef GetCurrentThreadFileSystem() throw()
-{
-  return (MSFileSystemRef)TlsGetValue(g_FileSystemTls);
+error_code SetupPerThreadFileSystem() throw() {
+  assert(!g_PerThreadSystem && g_PerThreadSystem.GetError() == ERROR_NOT_READY &&
+          "otherwise, PerThreadSystem already set up.");
+  if (g_PerThreadSystem.Setup())
+    return GetFileSystemTlsStatus();
+  return error_code();
+}
+void CleanupPerThreadFileSystem() throw() {
+  g_PerThreadSystem.Cleanup();
+}
+
+MSFileSystemRef GetCurrentThreadFileSystem() throw() {
+  assert(g_PerThreadSystem && "otherwise, TLS not initialized");
+  return g_PerThreadSystem.GetValue();
 }
 
 error_code SetCurrentThreadFileSystem(MSFileSystemRef value) throw()
 {
+  assert(g_PerThreadSystem && "otherwise, TLS not initialized");
   // For now, disallow reentrancy in APIs (i.e., replace the current instance with another one).
   if (value != nullptr)
   {
@@ -90,7 +137,7 @@ error_code SetCurrentThreadFileSystem(MSFileSystemRef value) throw()
     }
   }
 
-  if (!TlsSetValue(g_FileSystemTls, value))
+  if (!g_PerThreadSystem.SetValue(value))
   {
     return mapWindowsError(::GetLastError());
   }

+ 1 - 1
lib/Support/raw_ostream.cpp

@@ -488,7 +488,7 @@ raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) {
 
 // HLSL Change Starts - Add handling of numerical base IO manipulators.
 raw_ostream &raw_ostream::
-operator<<(std::ios_base &(*iomanip)(std::ios_base &)) {
+operator<<(std::ios_base &(__cdecl*iomanip)(std::ios_base &)) {
   if (iomanip == std::hex)
     writeBase = 16;
   else if (iomanip == std::oct)

+ 16 - 6
lib/Transforms/IPO/PassManagerBuilder.cpp

@@ -227,7 +227,6 @@ static void addHLSLPasses(bool HLSLHighLevel, unsigned OptLevel, hlsl::HLSLExten
                                              /*Promote*/ !NoOpt));
 
   MPM.add(createHLMatrixLowerPass());
-  MPM.add(createResourceToHandlePass());
   // DCE should after SROA to remove unused element.
   MPM.add(createDeadCodeEliminationPass());
   MPM.add(createGlobalDCEPass());
@@ -253,14 +252,24 @@ static void addHLSLPasses(bool HLSLHighLevel, unsigned OptLevel, hlsl::HLSLExten
     MPM.add(createLoopRotatePass());
     MPM.add(createLoopUnrollPass());
   }
+
+  if (!NoOpt) {
+    // Verify no undef resource path before simplify, since that can remove undef
+    // paths.  For NoOpt, resources are unpromoted here, so this will not work.
+    MPM.add(createFailUndefResourcePass());
+  }
   MPM.add(createSimplifyInstPass());
 
   MPM.add(createCFGSimplificationPass());
 
-  MPM.add(createDxilLegalizeResourceUsePass());
-  MPM.add(createDxilLegalizeStaticResourceUsePass());
+  MPM.add(createDxilPromoteLocalResources());
+  MPM.add(createDxilPromoteStaticResources());
+  // Verify no undef resource again after promotion
+  MPM.add(createFailUndefResourcePass());
+
   MPM.add(createDxilGenerationPass(NoOpt, ExtHelper));
   MPM.add(createDxilLoadMetadataPass()); // Ensure DxilModule is loaded for optimizations.
+
   // Propagate precise attribute.
   MPM.add(createDxilPrecisePropagatePass());
 
@@ -274,7 +283,6 @@ static void addHLSLPasses(bool HLSLHighLevel, unsigned OptLevel, hlsl::HLSLExten
   MPM.add(createCFGSimplificationPass());
 
   MPM.add(createDeadCodeEliminationPass());
-  MPM.add(createDxilTranslateRawBuffer());
 }
 // HLSL Change Ends
 
@@ -308,7 +316,8 @@ void PassManagerBuilder::populateModulePassManager(
     if (!HLSLHighLevel) {
       MPM.add(createDxilConvergentClearPass());
       MPM.add(createMultiDimArrayToOneDimArrayPass());
-      MPM.add(createDxilCondenseResourcesPass());
+      MPM.add(createDxilLowerCreateHandleForLibPass());
+      MPM.add(createDxilTranslateRawBuffer());
       MPM.add(createDxilLegalizeSampleOffsetPass());
       MPM.add(createDxilFinalizeModulePass());
       MPM.add(createComputeViewIdStatePass());
@@ -588,7 +597,8 @@ void PassManagerBuilder::populateModulePassManager(
   if (!HLSLHighLevel) {
     MPM.add(createDxilConvergentClearPass());
     MPM.add(createMultiDimArrayToOneDimArrayPass());
-    MPM.add(createDxilCondenseResourcesPass());
+    MPM.add(createDxilLowerCreateHandleForLibPass());
+    MPM.add(createDxilTranslateRawBuffer());
     MPM.add(createDeadCodeEliminationPass());
     if (DisableUnrollLoops)
       MPM.add(createDxilLegalizeSampleOffsetPass());

+ 2 - 2
lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp

@@ -521,7 +521,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
   if (auto *ST = dyn_cast<StructType>(T)) {
     // If the struct only have one element, we unpack.
     if (ST->getNumElements() == 1
-        && !hlsl::OP::IsDxilOpType(ST) // HLSL Change - avoid unpack dxil types.
+        && false // HLSL Change - avoid unpack dxil types.
         ) {
       LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
                                                ".unpack");
@@ -901,7 +901,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
   if (auto *ST = dyn_cast<StructType>(T)) {
     // If the struct only have one element, we unpack.
     if (ST->getNumElements() == 1
-        && !hlsl::OP::IsDxilOpType(ST) // HLSL Change - avoid unpack dxil types.
+        && false // HLSL Change - avoid unpack dxil types.
         ) {
       V = IC.Builder->CreateExtractValue(V, 0);
       combineStoreToNewValue(IC, SI, V);

+ 12 - 0
lib/Transforms/Scalar/GVN.cpp

@@ -955,6 +955,8 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
                                           Value *WritePtr,
                                           uint64_t WriteSizeInBits,
                                           const DataLayout &DL) {
+#if 0   // HLSL Change: Don't support bitcasting to different sizes.
+
   // If the loaded or stored value is a first class array or struct, don't try
   // to transform them.  We need to be able to bitcast to integer.
   if (LoadTy->isStructTy() || LoadTy->isArrayTy())
@@ -1022,12 +1024,15 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
   // Okay, we can do this transformation.  Return the number of bytes into the
   // store that the load is.
   return LoadOffset-StoreOffset;
+#endif  // HLSL Change: Don't support bitcasting to different sizes.
+  return -1;
 }
 
 /// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering store.
 static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
                                           StoreInst *DepSI) {
+#if 0   // HLSL Change: Don't support bitcasting to different sizes.
   // Cannot handle reading from store of first-class aggregate yet.
   if (DepSI->getValueOperand()->getType()->isStructTy() ||
       DepSI->getValueOperand()->getType()->isArrayTy())
@@ -1038,6 +1043,8 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
   return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
                                         StorePtr, StoreSize, DL);
+#endif  // HLSL Change: Don't support bitcasting to different sizes.
+  return -1;
 }
 
 /// This function is called when we have a
@@ -1045,6 +1052,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 /// the other load can feed into the second load.
 static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
                                          LoadInst *DepLI, const DataLayout &DL){
+#if 0   // HLSL Change: Don't support bitcasting to different sizes.
   // Cannot handle reading from store of first-class aggregate yet.
   if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
     return -1;
@@ -1066,6 +1074,8 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
   if (Size == 0) return -1;
 
   return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
+#endif
+  return -1;
 }
 
 
@@ -1073,6 +1083,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
 static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
                                             MemIntrinsic *MI,
                                             const DataLayout &DL) {
+#if 0   // HLSL Change: Don't support bitcasting to different sizes.
   // If the mem operation is a non-constant size, we can't handle it.
   ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
   if (!SizeCst) return -1;
@@ -1113,6 +1124,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   if (ConstantFoldLoadFromConstPtr(Src, DL))
     return Offset;
+#endif
   return -1;
 }
 

+ 7 - 7
lib/Transforms/Scalar/Reg2MemHLSL.cpp

@@ -59,15 +59,15 @@ namespace {
       return nullptr;
     }
 
-    IRBuilder<> Builder(P);
+    IRBuilder<> AllocaBuilder(P);
     if (!AllocaPoint) {
       Function *F = P->getParent()->getParent();
       AllocaPoint = F->getEntryBlock().begin();
     }
-    Builder.SetInsertPoint(AllocaPoint);
+    AllocaBuilder.SetInsertPoint(AllocaPoint);
 
     // Create a stack slot to hold the value.
-    AllocaInst *Slot = Builder.CreateAlloca(P->getType(), nullptr, P->getName() + ".reg2mem");
+    AllocaInst *Slot = AllocaBuilder.CreateAlloca(P->getType(), nullptr, P->getName() + ".reg2mem");
 
     // Insert a load in place of the PHI and replace all uses.
     BasicBlock::iterator InsertPt = P;
@@ -123,23 +123,23 @@ namespace {
       return nullptr;
     }
 
-    IRBuilder<> Builder(&I);
+    IRBuilder<> AllocaBuilder(&I);
     if (!AllocaPoint) {
       Function *F = I.getParent()->getParent();
       AllocaPoint = F->getEntryBlock().begin();
     }
-    Builder.SetInsertPoint(AllocaPoint);
+    AllocaBuilder.SetInsertPoint(AllocaPoint);
 
     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
       // Create a stack slot to hold the value.
-      AllocaInst *Slot = Builder.CreateAlloca(AI->getAllocatedType(), nullptr, I.getName() + ".reg2mem");
+      AllocaInst *Slot = AllocaBuilder.CreateAlloca(AI->getAllocatedType(), nullptr, I.getName() + ".reg2mem");
 	  I.replaceAllUsesWith(Slot);
 	  I.eraseFromParent();
 	  return Slot;
     }
 
     // Create a stack slot to hold the value.
-    AllocaInst *Slot = Builder.CreateAlloca(I.getType(), nullptr, I.getName() + ".reg2mem");;
+    AllocaInst *Slot = AllocaBuilder.CreateAlloca(I.getType(), nullptr, I.getName() + ".reg2mem");;
 
     // Change all of the users of the instruction to read from the stack slot.
     while (!I.use_empty()) {

+ 2 - 0
lib/Transforms/Scalar/SROA.cpp

@@ -56,6 +56,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "dxc/HLSL/HLModule.h"  // HLSL Change - not sroa resource type.
 
 #if __cplusplus >= 201103L && !defined(NDEBUG)
 // We only use this for a debug check in C++11
@@ -4309,6 +4310,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 
   // Skip alloca forms that this analysis can't handle.
   if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+      hlsl::HLModule::IsHLSLObjectType(AI.getAllocatedType()) || // HLSL Change - not sroa resource type.
       DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
     return false;
 

File diff suppressed because it is too large
+ 298 - 532
lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp


+ 1 - 1
lib/Transforms/Utils/InlineFunction.cpp

@@ -1432,7 +1432,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   if (PHI) {
     auto &DL = Caller->getParent()->getDataLayout();
     if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr,
-                                       &IFI.ACT->getAssumptionCache(*Caller))) {
+          IFI.ACT ? &IFI.ACT->getAssumptionCache(*Caller) : nullptr)) { // HLSL Change: fix nullptr dereference
       PHI->replaceAllUsesWith(V);
       PHI->eraseFromParent();
     }

+ 6 - 0
lib/Transforms/Utils/Local.cpp

@@ -44,6 +44,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+
+#include "dxc/HLSL/DxilMetadataHelper.h" // HLSL Change - combine dxil metadata.
 using namespace llvm;
 
 #define DEBUG_TYPE "local"
@@ -1323,6 +1325,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign
         break;
     }
   }
+
+  // HLSL Change Begin - combine dxil metadata.
+  hlsl::DxilMDHelper::combineDxilMetadata(K, J);
+  // HLSL Change End.
 }
 
 unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,

+ 8 - 0
tools/clang/include/clang/AST/HlslTypes.h

@@ -310,6 +310,9 @@ void AddRecordTypeWithHandle(
   _Outptr_  clang::CXXRecordDecl** typeDecl, 
   _In_z_    const char* typeName);
 
+void AddRayFlags(clang::ASTContext& context);
+void AddHitKinds(clang::ASTContext& context);
+
 /// <summary>Adds the implementation for std::is_equal.</summary>
 void AddStdIsEqualImplementation(clang::ASTContext& context, clang::Sema& sema);
 
@@ -363,6 +366,9 @@ ConvertHLSLVecMatTypeToExtVectorType(const clang::ASTContext &,
 bool IsHLSLVecMatType(clang::QualType);
 bool IsHLSLVecType(clang::QualType type);
 bool IsHLSLMatType(clang::QualType type);
+clang::QualType GetElementTypeOrType(clang::QualType type);
+bool HasHLSLMatOrientation(clang::QualType type, bool *pIsRowMajor);
+bool HasHLSLUNormSNorm(clang::QualType type, bool *pIsSNorm);
 bool IsHLSLInputPatchType(clang::QualType type);
 bool IsHLSLOutputPatchType(clang::QualType type);
 bool IsHLSLPointStreamType(clang::QualType type);
@@ -370,6 +376,8 @@ bool IsHLSLLineStreamType(clang::QualType type);
 bool IsHLSLTriangleStreamType(clang::QualType type);
 bool IsHLSLStreamOutputType(clang::QualType type);
 bool IsHLSLResourceType(clang::QualType type);
+bool IsHLSLNumeric(clang::QualType type);
+bool IsHLSLNumericUserDefinedType(clang::QualType type);
 clang::QualType GetHLSLResourceResultType(clang::QualType type);
 bool IsIncompleteHLSLResourceArrayType(clang::ASTContext& context, clang::QualType type);
 clang::QualType GetHLSLInputPatchElementType(clang::QualType type);

+ 2 - 1
tools/clang/include/clang/AST/PrettyPrinter.h

@@ -41,7 +41,8 @@ struct PrintingPolicy {
       ConstantArraySizeAsWritten(false), AnonymousTagLocations(true),
       SuppressStrongLifetime(false), SuppressLifetimeQualifiers(false),
       Bool(LO.Bool), TerseOutput(false), PolishForDeclaration(false),
-      Half(LO.Half), MSWChar(LO.MicrosoftExt && !LO.WChar),
+      Half(LO.HLSL || LO.Half), // HLSL Change - always print 'half' for HLSL
+      MSWChar(LO.MicrosoftExt && !LO.WChar),
       IncludeNewlines(true) { }
 
   /// \brief What language we're printing.

Some files were not shown because too many files changed in this diff