
Implement Shader Model 6.6 (#3293)

This is the work of many contributors from Microsoft and our partners.
Thank you all!

Add support for 64-bit atomics, compute shader derivatives, dynamic
resources (from heap), 8-bit packed operations, and Wave Size.
All of these require compiling for 6_6 targets, with just a few
exceptions. Each of these features includes unit tests and execution
tests.

64-bit atomics add 64-bit variants of all Interlocked* intrinsic
operations. This involves changing some of the code that matches
intrinsic overloads to call instructions. It also adds a few float
intrinsics for interlocked compare and exchange ops, which are
available for all shader models 6.0 and up.
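
As a rough illustration (a minimal sketch, not taken from this change's
tests; buffer and variable names are placeholders), a kernel compiled
with a 6_6 target such as -T cs_6_6 can use the new 64-bit overloads
directly:

    RWStructuredBuffer<uint64_t> bufU64;   // placeholder 64-bit UAV

    [numthreads(64, 1, 1)]
    void main(uint tid : SV_DispatchThreadID)
    {
        uint64_t one = 1;
        uint64_t original, previous;
        // 64-bit variant of the existing Interlocked* intrinsics
        InterlockedAdd(bufU64[tid], one, original);
        // 64-bit compare-exchange against the value read back above
        InterlockedCompareExchange(bufU64[tid], original, one, previous);
    }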

Compute shader derivative support makes dd[x|y], CalculateLevelOfDetail,
and the Sample operations that implicitly compute derivatives available
in compute shaders. QuadRead operations have been allowed in compute
since 6.0, and tests for them are added here.
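
For example (a hedged sketch with placeholder resources, not one of the
tests added here), a cs_6_6 compute shader can now call derivative and
LOD operations directly, provided the thread group maps cleanly onto
quads:

    Texture2D<float4>   tex;     // placeholder SRV
    SamplerState        samp;    // placeholder sampler
    RWTexture2D<float4> outTex;  // placeholder UAV

    [numthreads(8, 8, 1)]        // 2D group so derivative quads are well defined
    void main(uint2 tid : SV_DispatchThreadID)
    {
        float2 uv  = tid / 64.0;                           // arbitrary UVs for the sketch
        float  lod = tex.CalculateLevelOfDetail(samp, uv); // now legal in compute
        float4 c   = tex.Sample(samp, uv);                 // implicit-derivative Sample
        outTex[tid] = c + ddx(uv.x) + lod;                 // ddx/ddy also allowed
    }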

Dynamic resources introduce global arrays representing the resource and
sampler heaps, which can be indexed without requiring root signature
representations. This involves a new way of creating and annotating
resource handles.
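
At the HLSL level these arrays surface as the ResourceDescriptorHeap
and SamplerDescriptorHeap builtins. A minimal sketch (heap indices are
placeholders, and the root signature must opt in via the
CBVSRVUAVHeapDirectlyIndexed / SamplerHeapDirectlyIndexed flags added
below):

    [numthreads(64, 1, 1)]
    void main(uint tid : SV_DispatchThreadID)
    {
        // Index straight into the bound descriptor heaps; no root signature
        // ranges are declared for these resources.
        Texture2D<float4> tex  = ResourceDescriptorHeap[0];
        SamplerState      samp = SamplerDescriptorHeap[0];
        RWBuffer<float4>  outp = ResourceDescriptorHeap[1];
        outp[tid] = tex.SampleLevel(samp, float2(0.5, 0.5), 0);
    }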

8-bit packed operations introduce a set of intrinsics to pack and
unpack 8-bit values into and out of new 32-bit unsigned types that can
be trivially converted to and from uints.
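
A minimal sketch of the new types and intrinsics (values and the output
buffer are placeholders chosen only for illustration):

    RWStructuredBuffer<uint> outBuf;   // placeholder output

    [numthreads(1, 1, 1)]
    void main()
    {
        int4 v = int4(-1, 2, 300, -400);
        // Clamp each component to [-128, 127] and pack into 4 x 8 bits.
        int8_t4_packed pk = pack_clamp_s8(v);
        // Unpack back into a signed 32-bit vector.
        int4 roundTrip = unpack_s8s32(pk);
        // The packed types convert trivially to and from uint.
        outBuf[0] = (uint)pk + (uint)roundTrip.z;
    }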

WaveSize introduces a shader attribute that declares the wave size the
shader depends on. If the runtime's wave size differs, trying to create
a pipeline with this shader will fail.
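
A minimal sketch of the attribute (the value must be a power of 2 in
the [4, 128] range enforced by the validation changes below):

    [WaveSize(32)]           // this shader only supports a wave size of 32
    [numthreads(64, 1, 1)]
    void main(uint tid : SV_DispatchThreadID)
    {
        // ... wave-size-dependent work ...
    }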
Greg Roth, 4 years ago
Parent commit: d574d27c1f
100 files changed, 5756 additions and 866 deletions
  1. docs/DXIL.rst (+8 -2)
  2. include/dxc/DXIL/DxilConstants.h (+57 -18)
  3. include/dxc/DXIL/DxilFunctionProps.h (+3 -0)
  4. include/dxc/DXIL/DxilInstructions.h (+120 -27)
  5. include/dxc/DXIL/DxilMetadataHelper.h (+1 -0)
  6. include/dxc/DXIL/DxilModule.h (+4 -0)
  7. include/dxc/DXIL/DxilOperations.h (+7 -2)
  8. include/dxc/DXIL/DxilResourceBinding.h (+54 -0)
  9. include/dxc/DXIL/DxilResourceProperties.h (+49 -33)
  10. include/dxc/DXIL/DxilShaderFlags.h (+16 -2)
  11. include/dxc/DXIL/DxilUtil.h (+1 -0)
  12. include/dxc/DxilRootSignature/DxilRootSignature.h (+3 -1)
  13. include/dxc/HLSL/DxilValidation.h (+2 -0)
  14. include/dxc/HLSL/HLOperations.h (+5 -5)
  15. include/dxc/HlslIntrinsicOp.h (+28 -0)
  16. include/dxc/Test/DxcTestUtils.h (+4 -0)
  17. include/dxc/dxcapi.internal.h (+8 -1)
  18. lib/DXIL/CMakeLists.txt (+1 -0)
  19. lib/DXIL/DxilCompType.cpp (+15 -1)
  20. lib/DXIL/DxilMetadataHelper.cpp (+12 -0)
  21. lib/DXIL/DxilModule.cpp (+18 -1)
  22. lib/DXIL/DxilOperations.cpp (+107 -22)
  23. lib/DXIL/DxilResource.cpp (+11 -11)
  24. lib/DXIL/DxilResourceBase.cpp (+1 -2)
  25. lib/DXIL/DxilResourceBinding.cpp (+109 -0)
  26. lib/DXIL/DxilResourceProperties.cpp (+59 -81)
  27. lib/DXIL/DxilShaderFlags.cpp (+64 -20)
  28. lib/DXIL/DxilSignatureElement.cpp (+11 -2)
  29. lib/DXIL/DxilUtil.cpp (+22 -0)
  30. lib/DxilContainer/DxilContainerAssembler.cpp (+34 -27)
  31. lib/HLSL/ComputeViewIdStateBuilder.cpp (+2 -0)
  32. lib/HLSL/DxilCondenseResources.cpp (+154 -54)
  33. lib/HLSL/DxilContainerReflection.cpp (+1 -2)
  34. lib/HLSL/DxilGenerationPass.cpp (+1 -5)
  35. lib/HLSL/DxilValidation.cpp (+90 -42)
  36. lib/HLSL/HLModule.cpp (+19 -24)
  37. lib/HLSL/HLOperationLower.cpp (+266 -57)
  38. lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp (+4 -4)
  39. tools/clang/include/clang/AST/ASTContext.h (+1 -0)
  40. tools/clang/include/clang/AST/BuiltinTypes.def (+6 -0)
  41. tools/clang/include/clang/AST/HlslTypes.h (+7 -2)
  42. tools/clang/include/clang/Basic/Attr.td (+6 -0)
  43. tools/clang/include/clang/Basic/Specifiers.h (+2 -0)
  44. tools/clang/include/clang/Sema/DeclSpec.h (+2 -0)
  45. tools/clang/lib/AST/ASTContext.cpp (+7 -1)
  46. tools/clang/lib/AST/ASTContextHLSL.cpp (+52 -4)
  47. tools/clang/lib/AST/ItaniumMangle.cpp (+2 -0)
  48. tools/clang/lib/AST/MicrosoftMangle.cpp (+7 -1)
  49. tools/clang/lib/AST/StmtPrinter.cpp (+2 -0)
  50. tools/clang/lib/AST/Type.cpp (+2 -0)
  51. tools/clang/lib/AST/TypeLoc.cpp (+2 -0)
  52. tools/clang/lib/CodeGen/CGDebugInfo.cpp (+3 -1)
  53. tools/clang/lib/CodeGen/CGExpr.cpp (+4 -0)
  54. tools/clang/lib/CodeGen/CGHLSLMS.cpp (+116 -62)
  55. tools/clang/lib/CodeGen/CGHLSLMSFinishCodeGen.cpp (+297 -160)
  56. tools/clang/lib/CodeGen/CGHLSLMSHelper.h (+16 -3)
  57. tools/clang/lib/CodeGen/CodeGenTypes.cpp (+2 -0)
  58. tools/clang/lib/CodeGen/ItaniumCXXABI.cpp (+2 -0)
  59. tools/clang/lib/Index/USRGeneration.cpp (+5 -1)
  60. tools/clang/lib/Parse/HLSLRootSignature.cpp (+10 -2)
  61. tools/clang/lib/Parse/HLSLRootSignature.h (+2 -0)
  62. tools/clang/lib/Parse/ParseDecl.cpp (+1 -0)
  63. tools/clang/lib/SPIRV/SpirvEmitter.cpp (+2 -0)
  64. tools/clang/lib/Sema/DeclSpec.cpp (+4 -0)
  65. tools/clang/lib/Sema/SemaHLSL.cpp (+137 -12)
  66. tools/clang/lib/Sema/SemaTemplateVariadic.cpp (+2 -0)
  67. tools/clang/lib/Sema/SemaType.cpp (+2 -0)
  68. tools/clang/lib/Sema/gen_intrin_main_tables_15.h (+256 -116)
  69. tools/clang/test/CodeGenHLSL/attributes_wavesize.hlsl (+19 -0)
  70. tools/clang/test/HLSL/ShaderOpArith.xml (+1257 -1)
  71. tools/clang/test/HLSLFileCheck/hlsl/control_flow/attributes/unroll/2d_array.hlsl (+2 -0)
  72. tools/clang/test/HLSLFileCheck/hlsl/control_flow/attributes/unroll/extern.hlsl (+1 -0)
  73. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic.hlsl (+13 -8)
  74. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpstr_i64_and_i32.hlsl (+193 -0)
  75. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpstr_method_i64_and_i32.hlsl (+116 -0)
  76. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpxchg_i64_and_i32.hlsl (+215 -0)
  77. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpxchg_method_i64_and_i32.hlsl (+167 -0)
  78. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_float.hlsl (+114 -0)
  79. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_float_errors.hlsl (+83 -0)
  80. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_method_float.hlsl (+105 -0)
  81. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_overload.hlsl (+74 -32)
  82. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_structuredbuf_i64.hlsl (+74 -0)
  83. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_xchg_i64_and_i32.hlsl (+185 -0)
  84. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_xchg_method_i64_and_i32.hlsl (+132 -0)
  85. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_float.hlsl (+91 -0)
  86. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_i64.hlsl (+84 -0)
  87. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_i64_and_i32.hlsl (+128 -0)
  88. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_method_float.hlsl (+46 -0)
  89. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_method_i64_and_i32.hlsl (+107 -0)
  90. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/annotateHandle.hlsl (+5 -2)
  91. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap.hlsl (+3 -3)
  92. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap2.hlsl (+9 -9)
  93. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap3.hlsl (+16 -0)
  94. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap4.hlsl (+21 -0)
  95. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_16.hlsl (+28 -0)
  96. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_32.hlsl (+28 -0)
  97. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_error.hlsl (+22 -0)
  98. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_mix.hlsl (+49 -0)
  99. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_uint.hlsl (+15 -0)
  100. tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_packed_type_arg.hlsl (+24 -0)

+ 8 - 2
docs/DXIL.rst

@@ -581,6 +581,7 @@ Compute shader has the following tag-value properties.
 Tag	                  Value                    Description
 ===================== ======================== =============================================
 kDxilNumThreadsTag(4) MD list: (i32, i32, i32) Number of threads (X,Y,Z) for compute shader.
+kDxilWaveSizeTag      MD list: (i32)           Wave size the shader is compatible with (optional).
 ===================== ======================== =============================================
 
 Shader Parameters and Signatures
@@ -2311,8 +2312,11 @@ ID  Name                                                  Description
 213 GeometryIndex                                         The autogenerated index of the current geometry in the bottom-level structure
 214 RayQuery_CandidateInstanceContributionToHitGroupIndex returns candidate hit InstanceContributionToHitGroupIndex
 215 RayQuery_CommittedInstanceContributionToHitGroupIndex returns committed hit InstanceContributionToHitGroupIndex
-216 CreateHandleFromHeap                                  create resource handle from heap
-217 AnnotateHandle                                        annotate handle with resource properties
+216 AnnotateHandle                                        annotate handle with resource properties
+217 CreateHandleFromBinding                               create resource handle from binding
+218 CreateHandleFromHeap                                  create resource handle from heap
+219 Unpack4x8                                             unpacks 4 8-bit signed or unsigned values into int32 or int16 vector
+220 Pack4x8                                               packs vector of 4 signed or unsigned values into a packed datatype, drops or clamps unused bits
 === ===================================================== =======================================================================================================================================================================================================================
 
 
@@ -3184,6 +3188,8 @@ SM.TRIOUTPUTPRIMITIVEMISMATCH             Hull Shader declared with Tri Domain m
 SM.UNDEFINEDOUTPUT                        Not all elements of output %0 were written.
 SM.VALIDDOMAIN                            Invalid Tessellator Domain specified. Must be isoline, tri or quad.
 SM.VIEWIDNEEDSSLOT                        ViewID requires compatible space in pixel shader input signature
+SM.WAVESIZENEEDSDXIL16PLUS                WaveSize is valid only for DXIL version 1.6 and higher.
+SM.WAVESIZEVALUE                          Declared WaveSize %0 outside valid range [%1..%2], or not a power of 2.
 SM.ZEROHSINPUTCONTROLPOINTWITHINPUT       When HS input control point count is 0, no input signature should exist.
 TYPES.DEFINED                             Type must be defined based on DXIL primitives
 TYPES.I8                                  I8 can only be used as immediate value for intrinsic.

+ 57 - 18
include/dxc/DXIL/DxilConstants.h

@@ -109,6 +109,8 @@ namespace DXIL {
   const unsigned kMaxMSPSigRows = 32;
   const unsigned kMaxMSTotalSigRows = 32;
   const unsigned kMaxMSSMSize = 1024 * 28;
+  const unsigned kMinWaveSize = 4;
+  const unsigned kMaxWaveSize = 128;
 
   const float kMaxMipLodBias = 15.99f;
   const float kMinMipLodBias = -16.0f;
@@ -120,6 +122,7 @@ namespace DXIL {
     I1, I16, U16, I32, U32, I64, U64,
     F16, F32, F64,
     SNormF16, UNormF16, SNormF32, UNormF32, SNormF64, UNormF64,
+    PackedS8x32, PackedU8x32,
     LastEntry };
 
   // Must match D3D_INTERPOLATION_MODE
@@ -330,8 +333,6 @@ namespace DXIL {
     RTAccelerationStructure,
     FeedbackTexture2D,
     FeedbackTexture2DArray,
-    StructuredBufferWithCounter,
-    SamplerComparison,
     NumEntries,
   };
 
@@ -341,8 +342,7 @@ namespace DXIL {
   }
 
   inline bool IsStructuredBuffer(DXIL::ResourceKind ResourceKind) {
-    return ResourceKind == DXIL::ResourceKind::StructuredBuffer ||
-           ResourceKind == DXIL::ResourceKind::StructuredBufferWithCounter;
+    return ResourceKind == DXIL::ResourceKind::StructuredBuffer;
   }
 
   inline bool IsTypedBuffer(DXIL::ResourceKind ResourceKind) {
@@ -366,6 +366,11 @@ namespace DXIL {
            ResourceKind == DXIL::ResourceKind::FeedbackTexture2DArray;
   }
 
+  inline bool IsValidWaveSizeValue(unsigned size) {
+    // must be power of 2 between 4 and 128
+    return size >= kMinWaveSize && size <= kMaxWaveSize && (size & (size - 1)) == 0;
+  }
+
   // TODO: change opcodes.
   /* <py::lines('OPCODE-ENUM')>hctdb_instrhelp.get_enum_decl("OpCode")</py>*/
   // OPCODE-ENUM:BEGIN
@@ -415,6 +420,13 @@ namespace DXIL {
     ThreadId = 93, // reads the thread ID
     ThreadIdInGroup = 95, // reads the thread ID within the group (SV_GroupThreadID)
   
+    // Derivatives
+    CalculateLOD = 81, // calculates the level of detail
+    DerivCoarseX = 83, // computes the rate of change per stamp in x direction.
+    DerivCoarseY = 84, // computes the rate of change per stamp in y direction.
+    DerivFineX = 85, // computes the rate of change per pixel in x direction.
+    DerivFineY = 86, // computes the rate of change per pixel in y direction.
+  
     // Domain and hull shader
     LoadOutputControlPoint = 103, // LoadOutputControlPoint
     LoadPatchConstant = 104, // LoadPatchConstant
@@ -446,8 +458,9 @@ namespace DXIL {
     GSInstanceID = 100, // GSInstanceID
   
     // Get handle from heap
-    AnnotateHandle = 217, // annotate handle with resource properties
-    CreateHandleFromHeap = 216, // create resource handle from heap
+    AnnotateHandle = 216, // annotate handle with resource properties
+    CreateHandleFromBinding = 217, // create resource handle from binding
+    CreateHandleFromHeap = 218, // create resource handle from heap
   
     // Graphics shader
     ViewID = 138, // returns the view index
@@ -520,14 +533,12 @@ namespace DXIL {
     // Other
     CycleCounterLegacy = 109, // CycleCounterLegacy
   
+    // Packing intrinsics
+    Pack4x8 = 220, // packs vector of 4 signed or unsigned values into a packed datatype, drops or clamps unused bits
+  
     // Pixel shader
     AttributeAtVertex = 137, // returns the values of the attributes at the vertex.
-    CalculateLOD = 81, // calculates the level of detail
     Coverage = 91, // returns the coverage mask input in a pixel shader
-    DerivCoarseX = 83, // computes the rate of change per stamp in x direction.
-    DerivCoarseY = 84, // computes the rate of change per stamp in y direction.
-    DerivFineX = 85, // computes the rate of change per pixel in x direction.
-    DerivFineY = 86, // computes the rate of change per pixel in y direction.
     Discard = 82, // discard the current pixel
     EvalCentroid = 89, // evaluates an input attribute at pixel center
     EvalSampleIndex = 88, // evaluates an input attribute at a sample location
@@ -674,6 +685,9 @@ namespace DXIL {
     // Unary uint
     FirstbitHi = 33, // Returns the location of the first set bit starting from the highest order bit and working downward.
   
+    // Unpacking intrinsics
+    Unpack4x8 = 219, // unpacks 4 8-bit signed or unsigned values into int32 or int16 vector
+  
     // Wave
     WaveActiveAllEqual = 115, // returns 1 if all the lanes have the same value
     WaveActiveBallot = 116, // returns a struct with a bit set for each lane where the condition is true
@@ -699,9 +713,9 @@ namespace DXIL {
     NumOpCodes_Dxil_1_3 = 162,
     NumOpCodes_Dxil_1_4 = 165,
     NumOpCodes_Dxil_1_5 = 216,
-    NumOpCodes_Dxil_1_6 = 218,
+    NumOpCodes_Dxil_1_6 = 221,
   
-    NumOpCodes = 218 // exclusive last value of enumeration
+    NumOpCodes = 221 // exclusive last value of enumeration
   };
   // OPCODE-ENUM:END
 
@@ -739,6 +753,10 @@ namespace DXIL {
     ThreadId,
     ThreadIdInGroup,
   
+    // Derivatives
+    CalculateLOD,
+    Unary,
+  
     // Domain and hull shader
     LoadOutputControlPoint,
     LoadPatchConstant,
@@ -770,6 +788,7 @@ namespace DXIL {
   
     // Get handle from heap
     AnnotateHandle,
+    CreateHandleFromBinding,
     CreateHandleFromHeap,
   
     // Graphics shader
@@ -818,9 +837,11 @@ namespace DXIL {
     // Other
     CycleCounterLegacy,
   
+    // Packing intrinsics
+    Pack4x8,
+  
     // Pixel shader
     AttributeAtVertex,
-    CalculateLOD,
     Coverage,
     Discard,
     EvalCentroid,
@@ -828,7 +849,6 @@ namespace DXIL {
     EvalSnapped,
     InnerCoverage,
     SampleIndex,
-    Unary,
   
     // Quad Wave Ops
     QuadOp,
@@ -928,6 +948,9 @@ namespace DXIL {
     // Unary int
     UnaryBits,
   
+    // Unpacking intrinsics
+    Unpack4x8,
+  
     // Wave
     WaveActiveAllEqual,
     WaveActiveBallot,
@@ -952,9 +975,9 @@ namespace DXIL {
     NumOpClasses_Dxil_1_3 = 118,
     NumOpClasses_Dxil_1_4 = 120,
     NumOpClasses_Dxil_1_5 = 143,
-    NumOpClasses_Dxil_1_6 = 145,
+    NumOpClasses_Dxil_1_6 = 148,
   
-    NumOpClasses = 145 // exclusive last value of enumeration
+    NumOpClasses = 148 // exclusive last value of enumeration
   };
   // OPCODECLASS-ENUM:END
 
@@ -1062,6 +1085,7 @@ namespace DXIL {
     const unsigned kTextureSampleClampOpIdx = 10;
 
     // AtomicBinOp.
+    const unsigned kAtomicBinOpHandleOpIdx = 1;
     const unsigned kAtomicBinOpCoord0OpIdx = 3;
     const unsigned kAtomicBinOpCoord1OpIdx = 4;
     const unsigned kAtomicBinOpCoord2OpIdx = 5;
@@ -1325,6 +1349,18 @@ namespace DXIL {
     SkipProceduralPrimitives = 0x200,
   };
 
+  // Packing/unpacking intrinsics
+  enum class UnpackMode : uint8_t {
+    Unsigned = 0,   // not sign extended
+    Signed = 1,     // sign extended
+  };
+
+  enum class PackMode : uint8_t {
+    Trunc = 0,      // Pack low bits, drop the rest
+    UClamp = 1,     // Unsigned clamp - [0, 255] for 8-bits
+    SClamp = 2,     // Signed clamp - [-128, 127] for 8-bits
+  };
+
   // Corresponds to HIT_KIND_* in HLSL
   enum class HitKind : uint8_t {
     None = 0x00,
@@ -1371,8 +1407,11 @@ namespace DXIL {
   const uint64_t ShaderFeatureInfo_ShadingRate = 0x80000;
   const uint64_t ShaderFeatureInfo_Raytracing_Tier_1_1 = 0x100000;
   const uint64_t ShaderFeatureInfo_SamplerFeedback = 0x200000;
+  const uint64_t ShaderFeatureInfo_AtomicInt64OnTypedResource = 0x400000;
+  const uint64_t ShaderFeatureInfo_AtomicInt64OnGroupShared = 0x800000;
+  const uint64_t ShaderFeatureInfo_DerivativesInMeshAndAmpShaders = 0x1000000;
 
-  const unsigned ShaderFeatureInfoCount = 22;
+  const unsigned ShaderFeatureInfoCount = 25;
 
   // DxilSubobjectType must match D3D12_STATE_SUBOBJECT_TYPE, with
   // certain values reserved, since they cannot be used from Dxil.

+ 3 - 0
include/dxc/DXIL/DxilFunctionProps.h

@@ -82,6 +82,9 @@ struct DxilFunctionProps {
     } AS;
   } ShaderProps;
   DXIL::ShaderKind shaderKind;
+  // WaveSize is currently allowed only on compute shaders, but could be supported on other shader types in the future
+  unsigned waveSize; 
+
   // TODO: Should we have an unmangled name here for ray tracing shaders?
   bool IsPS() const     { return shaderKind == DXIL::ShaderKind::Pixel; }
   bool IsVS() const     { return shaderKind == DXIL::ShaderKind::Vertex; }

+ 120 - 27
include/dxc/DXIL/DxilInstructions.h

@@ -6988,6 +6988,67 @@ struct DxilInst_RayQuery_CommittedInstanceContributionToHitGroupIndex {
   void set_rayQueryHandle(llvm::Value *val) { Instr->setOperand(1, val); }
 };
 
+/// This instruction annotate handle with resource properties
+struct DxilInst_AnnotateHandle {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_AnnotateHandle(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::AnnotateHandle);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_res = 1,
+    arg_props = 2,
+  };
+  // Accessors
+  llvm::Value *get_res() const { return Instr->getOperand(1); }
+  void set_res(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_props() const { return Instr->getOperand(2); }
+  void set_props(llvm::Value *val) { Instr->setOperand(2, val); }
+};
+
+/// This instruction create resource handle from binding
+struct DxilInst_CreateHandleFromBinding {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_CreateHandleFromBinding(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::CreateHandleFromBinding);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (4 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_bind = 1,
+    arg_index = 2,
+    arg_nonUniformIndex = 3,
+  };
+  // Accessors
+  llvm::Value *get_bind() const { return Instr->getOperand(1); }
+  void set_bind(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_index() const { return Instr->getOperand(2); }
+  void set_index(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_nonUniformIndex() const { return Instr->getOperand(3); }
+  void set_nonUniformIndex(llvm::Value *val) { Instr->setOperand(3, val); }
+  bool get_nonUniformIndex_val() const { return (bool)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(3))->getZExtValue()); }
+  void set_nonUniformIndex_val(bool val) { Instr->setOperand(3, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 1), llvm::APInt(1, (uint64_t)val))); }
+};
+
 /// This instruction create resource handle from heap
 struct DxilInst_CreateHandleFromHeap {
   llvm::Instruction *Instr;
@@ -6999,7 +7060,7 @@ struct DxilInst_CreateHandleFromHeap {
   // Validation support
   bool isAllowed() const { return true; }
   bool isArgumentListValid() const {
-    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    if (4 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
     return true;
   }
   // Metadata
@@ -7007,53 +7068,85 @@ struct DxilInst_CreateHandleFromHeap {
   // Operand indexes
   enum OperandIdx {
     arg_index = 1,
-    arg_nonUniformIndex = 2,
+    arg_samplerHeap = 2,
+    arg_nonUniformIndex = 3,
   };
   // Accessors
   llvm::Value *get_index() const { return Instr->getOperand(1); }
   void set_index(llvm::Value *val) { Instr->setOperand(1, val); }
-  llvm::Value *get_nonUniformIndex() const { return Instr->getOperand(2); }
-  void set_nonUniformIndex(llvm::Value *val) { Instr->setOperand(2, val); }
-  bool get_nonUniformIndex_val() const { return (bool)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(2))->getZExtValue()); }
-  void set_nonUniformIndex_val(bool val) { Instr->setOperand(2, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 1), llvm::APInt(1, (uint64_t)val))); }
+  llvm::Value *get_samplerHeap() const { return Instr->getOperand(2); }
+  void set_samplerHeap(llvm::Value *val) { Instr->setOperand(2, val); }
+  bool get_samplerHeap_val() const { return (bool)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(2))->getZExtValue()); }
+  void set_samplerHeap_val(bool val) { Instr->setOperand(2, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 1), llvm::APInt(1, (uint64_t)val))); }
+  llvm::Value *get_nonUniformIndex() const { return Instr->getOperand(3); }
+  void set_nonUniformIndex(llvm::Value *val) { Instr->setOperand(3, val); }
+  bool get_nonUniformIndex_val() const { return (bool)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(3))->getZExtValue()); }
+  void set_nonUniformIndex_val(bool val) { Instr->setOperand(3, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 1), llvm::APInt(1, (uint64_t)val))); }
 };
 
-/// This instruction annotate handle with resource properties
-struct DxilInst_AnnotateHandle {
+/// This instruction unpacks 4 8-bit signed or unsigned values into int32 or int16 vector
+struct DxilInst_Unpack4x8 {
   llvm::Instruction *Instr;
   // Construction and identification
-  DxilInst_AnnotateHandle(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  DxilInst_Unpack4x8(llvm::Instruction *pInstr) : Instr(pInstr) {}
   operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::AnnotateHandle);
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::Unpack4x8);
   }
   // Validation support
   bool isAllowed() const { return true; }
   bool isArgumentListValid() const {
-    if (5 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
     return true;
   }
   // Metadata
   bool requiresUniformInputs() const { return false; }
   // Operand indexes
   enum OperandIdx {
-    arg_res = 1,
-    arg_resourceClass = 2,
-    arg_resourceKind = 3,
-    arg_props = 4,
+    arg_unpackMode = 1,
+    arg_pk = 2,
   };
   // Accessors
-  llvm::Value *get_res() const { return Instr->getOperand(1); }
-  void set_res(llvm::Value *val) { Instr->setOperand(1, val); }
-  llvm::Value *get_resourceClass() const { return Instr->getOperand(2); }
-  void set_resourceClass(llvm::Value *val) { Instr->setOperand(2, val); }
-  int8_t get_resourceClass_val() const { return (int8_t)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(2))->getZExtValue()); }
-  void set_resourceClass_val(int8_t val) { Instr->setOperand(2, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 8), llvm::APInt(8, (uint64_t)val))); }
-  llvm::Value *get_resourceKind() const { return Instr->getOperand(3); }
-  void set_resourceKind(llvm::Value *val) { Instr->setOperand(3, val); }
-  int8_t get_resourceKind_val() const { return (int8_t)(llvm::dyn_cast<llvm::ConstantInt>(Instr->getOperand(3))->getZExtValue()); }
-  void set_resourceKind_val(int8_t val) { Instr->setOperand(3, llvm::Constant::getIntegerValue(llvm::IntegerType::get(Instr->getContext(), 8), llvm::APInt(8, (uint64_t)val))); }
-  llvm::Value *get_props() const { return Instr->getOperand(4); }
-  void set_props(llvm::Value *val) { Instr->setOperand(4, val); }
+  llvm::Value *get_unpackMode() const { return Instr->getOperand(1); }
+  void set_unpackMode(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_pk() const { return Instr->getOperand(2); }
+  void set_pk(llvm::Value *val) { Instr->setOperand(2, val); }
+};
+
+/// This instruction packs vector of 4 signed or unsigned values into a packed datatype, drops or clamps unused bits
+struct DxilInst_Pack4x8 {
+  llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_Pack4x8(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::Pack4x8);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (6 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes
+  enum OperandIdx {
+    arg_packMode = 1,
+    arg_x = 2,
+    arg_y = 3,
+    arg_z = 4,
+    arg_w = 5,
+  };
+  // Accessors
+  llvm::Value *get_packMode() const { return Instr->getOperand(1); }
+  void set_packMode(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_x() const { return Instr->getOperand(2); }
+  void set_x(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_y() const { return Instr->getOperand(3); }
+  void set_y(llvm::Value *val) { Instr->setOperand(3, val); }
+  llvm::Value *get_z() const { return Instr->getOperand(4); }
+  void set_z(llvm::Value *val) { Instr->setOperand(4, val); }
+  llvm::Value *get_w() const { return Instr->getOperand(5); }
+  void set_w(llvm::Value *val) { Instr->setOperand(5, val); }
 };
 // INSTR-HELPER:END
 } // namespace hlsl

+ 1 - 0
include/dxc/DXIL/DxilMetadataHelper.h

@@ -260,6 +260,7 @@ public:
   static const unsigned kDxilShaderKindTag      = 8;
   static const unsigned kDxilMSStateTag         = 9;
   static const unsigned kDxilASStateTag         = 10;
+  static const unsigned kDxilWaveSizeTag        = 11;
 
   // GSState.
   static const unsigned kDxilGSStateNumFields               = 5;

+ 4 - 0
include/dxc/DXIL/DxilModule.h

@@ -239,6 +239,10 @@ public:
   void SetNumThreads(unsigned x, unsigned y, unsigned z);
   unsigned GetNumThreads(unsigned idx) const;
 
+  // Compute shader
+  void SetWaveSize(unsigned size);
+  unsigned GetWaveSize() const;
+
   // Geometry shader.
   DXIL::InputPrimitive GetInputPrimitive() const;
   void SetInputPrimitive(DXIL::InputPrimitive IP);

+ 7 - 2
include/dxc/DXIL/DxilOperations.h

@@ -50,15 +50,18 @@ public:
   llvm::LLVMContext &GetCtx() { return m_Ctx; }
   llvm::Type *GetHandleType() const;
   llvm::Type *GetResourcePropertiesType() const;
+  llvm::Type *GetResourceBindingType() const;
   llvm::Type *GetDimensionsType() const;
   llvm::Type *GetSamplePosType() const;
   llvm::Type *GetBinaryWithCarryType() const;
   llvm::Type *GetBinaryWithTwoOutputsType() const;
   llvm::Type *GetSplitDoubleType() const;
-  llvm::Type *GetInt4Type() const;
+  llvm::Type *GetFourI32Type() const;
+  llvm::Type *GetFourI16Type() const;
 
   llvm::Type *GetResRetType(llvm::Type *pOverloadType);
   llvm::Type *GetCBufferRetType(llvm::Type *pOverloadType);
+  llvm::Type *GetVectorType(unsigned numElements, llvm::Type *pOverloadType);
   bool IsResRetType(llvm::Type *Ty);
 
   // Try to get the opcode class for a function.
@@ -122,12 +125,14 @@ private:
 
   llvm::Type *m_pHandleType;
   llvm::Type *m_pResourcePropertiesType;
+  llvm::Type *m_pResourceBindingType;
   llvm::Type *m_pDimensionsType;
   llvm::Type *m_pSamplePosType;
   llvm::Type *m_pBinaryWithCarryType;
   llvm::Type *m_pBinaryWithTwoOutputsType;
   llvm::Type *m_pSplitDoubleType;
-  llvm::Type *m_pInt4Type;
+  llvm::Type *m_pFourI32Type;
+  llvm::Type *m_pFourI16Type;
 
   DXIL::LowPrecisionMode m_LowPrecisionMode;
 

+ 54 - 0
include/dxc/DXIL/DxilResourceBinding.h

@@ -0,0 +1,54 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilResourceBinding.h                                                     //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Representation properties for DXIL resource binding.                      //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "DxilConstants.h"
+
+namespace llvm {
+class Constant;
+class Type;
+}
+
+namespace hlsl {
+
+struct DxilResourceBinding {
+  uint32_t rangeLowerBound;
+  uint32_t rangeUpperBound;
+  uint32_t spaceID;
+  uint8_t resourceClass;
+  uint8_t Reserved1;
+  uint8_t Reserved2;
+  uint8_t Reserved3;
+  bool operator==(const DxilResourceBinding &);
+  bool operator!=(const DxilResourceBinding &);
+};
+
+static_assert(sizeof(DxilResourceBinding) == 4 * sizeof(uint32_t),
+              "update shader model and functions read/write "
+              "DxilResourceBinding when size is changed");
+
+class ShaderModel;
+class DxilResourceBase;
+struct DxilInst_CreateHandleFromBinding;
+
+namespace resource_helper {
+llvm::Constant *getAsConstant(const DxilResourceBinding &, llvm::Type *Ty,
+                              const ShaderModel &);
+DxilResourceBinding loadBindingFromConstant(const llvm::Constant &C);
+DxilResourceBinding
+loadBindingFromCreateHandleFromBinding(DxilInst_CreateHandleFromBinding &createHandle, llvm::Type *Ty,
+                       const ShaderModel &);
+DxilResourceBinding loadBindingFromResourceBase(DxilResourceBase *);
+
+} // namespace resource_helper
+
+} // namespace hlsl

+ 49 - 33
include/dxc/DXIL/DxilResourceProperties.h

@@ -1,6 +1,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 //                                                                           //
-// DxilHandleAnnotation.h                                                    //
+// DxilResourceProperties.h                                                  //
 // Copyright (C) Microsoft Corporation. All rights reserved.                 //
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
@@ -21,43 +21,61 @@ class Type;
 namespace hlsl {
 
 struct DxilResourceProperties {
-  DXIL::ResourceClass Class;
-  DXIL::ResourceKind  Kind;
-  static constexpr unsigned kSampleCountUndefined = 0x7;
-
-  struct DxilTyped {
-    DXIL::ComponentType CompType : 5; // TypedBuffer/Image.
-    uint32_t SingleComponent : 1;     // Return type is single component.
-    // 2^SampleCountPow2 for Sample count of Texture2DMS.
-    uint32_t SampleCountPow2 : 3;
-    uint32_t Reserved : 23;
+  struct TypedProps {
+    uint8_t CompType;  // TypedBuffer/Image component type.
+    uint8_t CompCount; // Number of components known to shader.
+    uint8_t Reserved2;
+    uint8_t Reserved3;
   };
 
-  union {
-    DxilTyped Typed;
-    uint32_t ElementStride; // in bytes for StructurizedBuffer.
-    DXIL::SamplerFeedbackType SamplerFeedbackType; // FeedbackTexture2D.
-    uint32_t SizeInBytes; // Cbuffer instance size in bytes.
-    uint32_t RawDword0;
-  };
+  struct BasicProps {
+    // BYTE 0
+    uint8_t ResourceKind; // DXIL::ResourceKind
+
+    // BYTE 1
+    // Alignment of SRV/UAV base in 2^n. 0 is unknown/worst-case.
+    uint8_t BaseAlignLog2 : 4;
+    uint8_t IsUAV : 1;
+    uint8_t IsROV : 1;
+    uint8_t IsGloballyCoherent : 1;
+
+    // Depending on ResourceKind, this indicates:
+    //  Sampler: SamplerKind::Comparison
+    //  StructuredBuffer: HasCounter
+    //  Other: must be 0
+    uint8_t SamplerCmpOrHasCounter : 1;
 
-  struct DxilUAV {
-    uint32_t bROV : 1;              // UAV
-    uint32_t bGloballyCoherent : 1; // UAV
-    uint32_t Reserved : 30;
+    // BYTE 2
+    uint8_t Reserved2;
+
+    // BYTE 3
+    uint8_t Reserved3;
   };
 
   union {
-    DxilUAV UAV;
+    BasicProps  Basic;
+    uint32_t RawDword0;
+  };
+  // DWORD
+  union {
+    TypedProps Typed;
+    uint32_t StructStrideInBytes; // in bytes for StructuredBuffer.
+    DXIL::SamplerFeedbackType SamplerFeedbackType; // FeedbackTexture2D.
+    uint32_t CBufferSizeInBytes; // Cbuffer used size in bytes.
     uint32_t RawDword1;
   };
-
-  bool operator==(const DxilResourceProperties &);
-  bool operator!=(const DxilResourceProperties &);
-  unsigned getSampleCount();
+  DxilResourceProperties();
+  DXIL::ResourceClass getResourceClass() const;
+  DXIL::ResourceKind  getResourceKind() const;
+  DXIL::ComponentType getCompType() const;
+  void setResourceKind(DXIL::ResourceKind RK);
+  bool isUAV() const;
+  bool operator==(const DxilResourceProperties &) const;
+  bool operator!=(const DxilResourceProperties &) const;
+  bool isValid() const;
 };
 
-static_assert(sizeof(DxilResourceProperties) == 4 * sizeof(uint32_t),
+static_assert(sizeof(DxilResourceProperties) == 2 * sizeof(uint32_t),
               "update shader model and functions read/write "
               "DxilResourceProperties when size is changed");
 
@@ -68,13 +86,11 @@ struct DxilInst_AnnotateHandle;
 namespace resource_helper {
 llvm::Constant *getAsConstant(const DxilResourceProperties &, llvm::Type *Ty,
                               const ShaderModel &);
-DxilResourceProperties loadFromConstant(const llvm::Constant &C,
-                                        DXIL::ResourceClass RC,
-                                        DXIL::ResourceKind RK);
+DxilResourceProperties loadPropsFromConstant(const llvm::Constant &C);
 DxilResourceProperties
-loadFromAnnotateHandle(DxilInst_AnnotateHandle &annotateHandle, llvm::Type *Ty,
+loadPropsFromAnnotateHandle(DxilInst_AnnotateHandle &annotateHandle, llvm::Type *Ty,
                        const ShaderModel &);
-DxilResourceProperties loadFromResourceBase(DxilResourceBase *);
+DxilResourceProperties loadPropsFromResourceBase(DxilResourceBase *);
 
 } // namespace resource_helper
 

+ 16 - 2
include/dxc/DXIL/DxilShaderFlags.h

@@ -114,6 +114,15 @@ namespace hlsl {
     void SetSamplerFeedback(bool flag) { m_bSamplerFeedback = flag; }
     bool GetSamplerFeedback() const { return m_bSamplerFeedback; }
 
+    void SetAtomicInt64OnTypedResource(bool flag) { m_bAtomicInt64OnTypedResource = flag; }
+    bool GetAtomicInt64OnTypedResource() const { return m_bAtomicInt64OnTypedResource; }
+
+    void SetAtomicInt64OnGroupShared(bool flag) { m_bAtomicInt64OnGroupShared = flag; }
+    bool GetAtomicInt64OnGroupShared() const { return m_bAtomicInt64OnGroupShared; }
+
+    void SetDerivativesInMeshAndAmpShaders(bool flag) { m_bDerivativesInMeshAndAmpShaders = flag; }
+    bool GetDerivativesInMeshAndAmpShaders() { return m_bDerivativesInMeshAndAmpShaders; }
+
   private:
     unsigned m_bDisableOptimizations :1;   // D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION
     unsigned m_bDisableMathRefactoring :1; //~D3D10_SB_GLOBAL_FLAG_REFACTORING_ALLOWED
@@ -152,10 +161,15 @@ namespace hlsl {
     unsigned m_bRaytracingTier1_1 : 1; // SHADER_FEATURE_RAYTRACING_TIER_1_1
     unsigned m_bSamplerFeedback : 1; // SHADER_FEATURE_SAMPLER_FEEDBACK
 
-    unsigned m_align0 : 5;        // align to 32 bit.
+    unsigned m_bAtomicInt64OnTypedResource : 1; // SHADER_FEATURE_ATOMIC_INT64_ON_TYPED_RESOURCE
+    unsigned m_bAtomicInt64OnGroupShared : 1;//SHADER_FEATURE_ATOMIC_INT64_ON_GROUP_SHARED
+
+    unsigned m_bDerivativesInMeshAndAmpShaders : 1; //SHADER_FEATURE_DERIVATIVES_IN_MESH_AND_AMPLIFICATION_SHADERS
+
+    unsigned m_align0 : 2;        // align to 32 bit.
     uint32_t m_align1;            // align to 64 bit.
   };
 
 
 
-}
+}

+ 1 - 0
include/dxc/DXIL/DxilUtil.h

@@ -133,6 +133,7 @@ namespace dxilutil {
   bool IsHLSLRayQueryType(llvm::Type *Ty);
   bool IsHLSLResourceDescType(llvm::Type *Ty);
   bool IsResourceSingleComponent(llvm::Type *Ty);
+  uint8_t GetResourceComponentCount(llvm::Type *Ty);
   bool IsSplat(llvm::ConstantDataVector *cdv);
 
   llvm::Type* StripArrayTypes(llvm::Type *Ty, llvm::SmallVectorImpl<unsigned> *OuterToInnerLengths = nullptr);

+ 3 - 1
include/dxc/DxilRootSignature/DxilRootSignature.h

@@ -109,8 +109,10 @@ enum class DxilRootSignatureFlags : uint32_t {
   LocalRootSignature = 0x80,
   DenyAmplificationShaderRootAccess = 0x100,
   DenyMeshShaderRootAccess = 0x200,
+  CBVSRVUAVHeapDirectlyIndexed = 0x400,
+  SamplerHeapDirectlyIndexed = 0x800,
   AllowLowTierReservedHwCbLimit = 0x80000000,
-  ValidFlags = 0x800003ff
+  ValidFlags = 0x80000fff
 };
 enum class DxilRootParameterType {
   DescriptorTable = 0,

+ 2 - 0
include/dxc/HLSL/DxilValidation.h

@@ -265,6 +265,8 @@ enum class ValidationRule : unsigned {
   SmUndefinedOutput, // Not all elements of output %0 were written.
   SmValidDomain, // Invalid Tessellator Domain specified. Must be isoline, tri or quad.
   SmViewIDNeedsSlot, // ViewID requires compatible space in pixel shader input signature
+  SmWaveSizeNeedsDxil16Plus, // WaveSize is valid only for DXIL version 1.6 and higher.
+  SmWaveSizeValue, // Declared WaveSize %0 outside valid range [%1..%2], or not a power of 2.
   SmZeroHSInputControlPointWithInput, // When HS input control point count is 0, no input signature should exist.
 
   // Type system

+ 5 - 5
include/dxc/HLSL/HLOperations.h

@@ -193,6 +193,9 @@ const unsigned kInterlockedDestOpIndex = 1;
 const unsigned kInterlockedValueOpIndex = 2;
 const unsigned kInterlockedOriginalValueOpIndex = 3;
 
+// Interlocked method
+const unsigned kInterlockedMethodValueOpIndex = 3;
+
 // InterlockedCompareExchange.
 const unsigned kInterlockedCmpDestOpIndex = 1;
 const unsigned kInterlockedCmpCompareValueOpIndex = 2;
@@ -357,11 +360,8 @@ const unsigned kCreateHandleIndexOpIdx = 2; // Only for array of cbuffer.
 
 // AnnotateHandle.
 const unsigned kAnnotateHandleHandleOpIdx = 1;
-const unsigned kAnnotateHandleResourceClassOpIdx = 2;
-const unsigned kAnnotateHandleResourceKindOpIdx = 3;
-const unsigned kAnnotateHandleResourcePropertiesOpIdx = 4;
-const unsigned kAnnotateHandleResourceTypeOpIdx = 5;
-
+const unsigned kAnnotateHandleResourcePropertiesOpIdx = 2;
+const unsigned kAnnotateHandleResourceTypeOpIdx = 3;
 
 // TraceRay.
 const unsigned kTraceRayRayDescOpIdx = 7;

+ 28 - 0
include/dxc/HlslIntrinsicOp.h

@@ -51,7 +51,9 @@ import hctdb_instrhelp
   IOP_InterlockedAdd,
   IOP_InterlockedAnd,
   IOP_InterlockedCompareExchange,
+  IOP_InterlockedCompareExchangeFloatBitwise,
   IOP_InterlockedCompareStore,
+  IOP_InterlockedCompareStoreFloatBitwise,
   IOP_InterlockedExchange,
   IOP_InterlockedMax,
   IOP_InterlockedMin,
@@ -182,6 +184,10 @@ import hctdb_instrhelp
   IOP_msad4,
   IOP_mul,
   IOP_normalize,
+  IOP_pack_clamp_s8,
+  IOP_pack_clamp_u8,
+  IOP_pack_s8,
+  IOP_pack_u8,
   IOP_pow,
   IOP_printf,
   IOP_radians,
@@ -227,6 +233,10 @@ import hctdb_instrhelp
 #ifdef ENABLE_SPIRV_CODEGEN
   IOP_VkReadClock,
 #endif // ENABLE_SPIRV_CODEGEN
+  IOP_unpack_s8s16,
+  IOP_unpack_s8s32,
+  IOP_unpack_u8u16,
+  IOP_unpack_u8u32,
   MOP_Append,
   MOP_RestartStrip,
   MOP_CalculateLevelOfDetail,
@@ -254,14 +264,26 @@ import hctdb_instrhelp
   MOP_Load3,
   MOP_Load4,
   MOP_InterlockedAdd,
+  MOP_InterlockedAdd64,
   MOP_InterlockedAnd,
+  MOP_InterlockedAnd64,
   MOP_InterlockedCompareExchange,
+  MOP_InterlockedCompareExchange64,
+  MOP_InterlockedCompareExchangeFloatBitwise,
   MOP_InterlockedCompareStore,
+  MOP_InterlockedCompareStore64,
+  MOP_InterlockedCompareStoreFloatBitwise,
   MOP_InterlockedExchange,
+  MOP_InterlockedExchange64,
+  MOP_InterlockedExchangeFloat,
   MOP_InterlockedMax,
+  MOP_InterlockedMax64,
   MOP_InterlockedMin,
+  MOP_InterlockedMin64,
   MOP_InterlockedOr,
+  MOP_InterlockedOr64,
   MOP_InterlockedXor,
+  MOP_InterlockedXor64,
   MOP_Store,
   MOP_Store2,
   MOP_Store3,
@@ -368,7 +390,9 @@ import hctdb_instrhelp
   case IntrinsicOp::IOP_mul:
   case IntrinsicOp::IOP_sign:
   case IntrinsicOp::MOP_InterlockedMax:
+  case IntrinsicOp::MOP_InterlockedMax64:
   case IntrinsicOp::MOP_InterlockedMin:
+  case IntrinsicOp::MOP_InterlockedMin64:
 // HLSL-HAS-UNSIGNED-INTRINSICS:END
     return true;
   default:
@@ -422,8 +446,12 @@ import hctdb_instrhelp
     return static_cast<unsigned>(IntrinsicOp::IOP_usign);
   case IntrinsicOp::MOP_InterlockedMax:
     return static_cast<unsigned>(IntrinsicOp::MOP_InterlockedUMax);
+  case IntrinsicOp::MOP_InterlockedMax64:
+    return static_cast<unsigned>(IntrinsicOp::MOP_InterlockedUMax);
   case IntrinsicOp::MOP_InterlockedMin:
     return static_cast<unsigned>(IntrinsicOp::MOP_InterlockedUMin);
+  case IntrinsicOp::MOP_InterlockedMin64:
+    return static_cast<unsigned>(IntrinsicOp::MOP_InterlockedUMin);
 // HLSL-GET-UNSIGNED-INTRINSICS:END
   default:
     return static_cast<unsigned>(opcode);

+ 4 - 0
include/dxc/Test/DxcTestUtils.h

@@ -18,6 +18,7 @@
 #include "dxc/Support/dxcapi.use.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringMap.h"
 
 namespace hlsl {
 namespace options {
@@ -46,6 +47,9 @@ public:
   /// checks that some error message does not occur, for example.
   bool AllowEmptyInput;
 
+  /// VariableTable - This holds all the current filecheck variables.
+  llvm::StringMap<std::string> VariableTable;
+
   /// String to read in place of standard input.
   std::string InputForStdin;
   /// Output stream.

+ 8 - 1
include/dxc/dxcapi.internal.h

@@ -91,7 +91,14 @@ enum LEGAL_INTRINSIC_COMPTYPES {
   LICOMPTYPE_TEXTURE2DARRAY = 34,
   LICOMPTYPE_RESOURCE = 35,
   LICOMPTYPE_INT32_ONLY = 36,
-  LICOMPTYPE_COUNT = 37
+  LICOMPTYPE_INT64_ONLY = 37,
+  LICOMPTYPE_ANY_INT64 = 38,
+  LICOMPTYPE_FLOAT32_ONLY = 39,
+  LICOMPTYPE_INT8_4PACKED = 40,
+  LICOMPTYPE_UINT8_4PACKED = 41,
+  LICOMPTYPE_ANY_INT16_OR_32 = 42,
+  LICOMPTYPE_SINT16_OR_32_ONLY = 43,
+  LICOMPTYPE_COUNT = 44
 };
 
 static const BYTE IA_SPECIAL_BASE = 0xf0;

+ 1 - 0
lib/DXIL/CMakeLists.txt

@@ -10,6 +10,7 @@ add_llvm_library(LLVMDXIL
   DxilOperations.cpp
   DxilResource.cpp
   DxilResourceBase.cpp
+  DxilResourceBinding.cpp
   DxilResourceProperties.cpp
   DxilSampler.cpp
   DxilSemantic.cpp

+ 15 - 1
lib/DXIL/DxilCompType.cpp

@@ -59,6 +59,8 @@ uint8_t CompType::GetSizeInBits() const {
   case Kind::I32:
   case Kind::U32:
   case Kind::F32:
+  case Kind::PackedS8x32:
+  case Kind::PackedU8x32:
     return 32;
   case Kind::I64:
   case Kind::U64:
@@ -157,7 +159,8 @@ bool CompType::IsSIntTy() const {
 }
 
 bool CompType::IsUIntTy() const {
-  return m_Kind == Kind::U16 || m_Kind == Kind::U32 || m_Kind == Kind::U64;
+  return m_Kind == Kind::U16 || m_Kind == Kind::U32 || m_Kind == Kind::U64 ||
+    m_Kind == Kind::PackedS8x32 || m_Kind == Kind::PackedU8x32;
 }
 
 bool CompType::IsBoolTy() const {
@@ -202,6 +205,8 @@ CompType CompType::GetBaseCompType() const {
   switch (m_Kind) {
   case Kind::I1:        return CompType(Kind::I1);
   case Kind::I16:       __fallthrough;
+  case Kind::PackedS8x32: __fallthrough;
+  case Kind::PackedU8x32: __fallthrough;
   case Kind::I32:       return CompType(Kind::I32);
   case Kind::I64:       return CompType(Kind::I64);
   case Kind::U16:       __fallthrough;
@@ -231,6 +236,8 @@ bool CompType::HasMinPrec() const {
   case Kind::UNormF16:
     return true;
   case Kind::I1:
+  case Kind::PackedS8x32:
+  case Kind::PackedU8x32:
   case Kind::I32:
   case Kind::U32:
   case Kind::I64:
@@ -253,6 +260,8 @@ Type *CompType::GetLLVMType(LLVMContext &Ctx) const {
   case Kind::I1:        return (Type*)Type::getInt1Ty(Ctx);
   case Kind::I16:
   case Kind::U16:       return (Type*)Type::getInt16Ty(Ctx);
+  case Kind::PackedS8x32:
+  case Kind::PackedU8x32:
   case Kind::I32:
   case Kind::U32:       return (Type*)Type::getInt32Ty(Ctx);
   case Kind::I64:
@@ -277,6 +286,8 @@ PointerType *CompType::GetLLVMPtrType(LLVMContext &Ctx, const unsigned AddrSpace
   case Kind::I1:        return Type::getInt1PtrTy  (Ctx, AddrSpace);
   case Kind::I16:
   case Kind::U16:       return Type::getInt16PtrTy (Ctx, AddrSpace);
+  case Kind::PackedS8x32:
+  case Kind::PackedU8x32:
   case Kind::I32:
   case Kind::U32:       return Type::getInt32PtrTy (Ctx, AddrSpace);
   case Kind::I64:
@@ -319,6 +330,7 @@ static const char *s_TypeKindNames[(unsigned)CompType::Kind::LastEntry] = {
   "i1", "i16", "u16", "i32", "u32", "i64", "u64",
   "f16", "f32", "f64",
   "snorm_f16", "unorm_f16", "snorm_f32", "unorm_f32", "snorm_f64", "unorm_f64",
+  "p32i8", "p32u8",
 };
 
 const char *CompType::GetName() const {
@@ -330,6 +342,7 @@ static const char *s_TypeKindHLSLNames[(unsigned)CompType::Kind::LastEntry] = {
   "bool", "int16_t", "uint16_t", "int", "uint", "int64_t", "uint64_t",
   "half", "float", "double",
   "snorm_half", "unorm_half", "snorm_float", "unorm_float", "snorm_double", "unorm_double",
+  "int8_t_packed", "uint8_t_packed",
 };
 
 static const char *s_TypeKindHLSLNamesMinPrecision[(unsigned)CompType::Kind::LastEntry] = {
@@ -337,6 +350,7 @@ static const char *s_TypeKindHLSLNamesMinPrecision[(unsigned)CompType::Kind::Las
   "bool", "min16i", "min16ui", "int", "uint", "int64_t", "uint64_t",
   "min16float", "float", "double",
   "snorm_min16f", "unorm_min16f", "snorm_float", "unorm_float", "snorm_double", "unorm_double",
+  "int8_t_packed", "uint8_t_packed",
 };
 
 const char *CompType::GetHLSLName(bool MinPrecision) const {

+ 12 - 0
lib/DXIL/DxilMetadataHelper.cpp

@@ -1232,6 +1232,13 @@ MDTuple *DxilMDHelper::EmitDxilEntryProperties(uint64_t rawShaderFlag,
     NumThreadVals.emplace_back(Uint32ToConstMD(CS.numThreads[1]));
     NumThreadVals.emplace_back(Uint32ToConstMD(CS.numThreads[2]));
     MDVals.emplace_back(MDNode::get(m_Ctx, NumThreadVals));
+
+    if (props.waveSize != 0) {
+      MDVals.emplace_back(Uint32ToConstMD(DxilMDHelper::kDxilWaveSizeTag));
+      vector<Metadata *> WaveSizeVal;
+      WaveSizeVal.emplace_back(Uint32ToConstMD(props.waveSize));
+      MDVals.emplace_back(MDNode::get(m_Ctx, WaveSizeVal));
+    }
   } break;
   // Geometry shader.
   case DXIL::ShaderKind::Geometry: {
@@ -1433,6 +1440,11 @@ void DxilMDHelper::LoadDxilEntryProperties(const MDOperand &MDO,
       auto &AS = props.ShaderProps.AS;
       LoadDxilASState(MDO, AS.numThreads, AS.payloadSizeInBytes);
     } break;
+    case DxilMDHelper::kDxilWaveSizeTag: {
+      DXASSERT(props.IsCS(), "else invalid shader kind");
+      MDNode *pNode = cast<MDNode>(MDO.get());
+      props.waveSize = ConstMDToUint32(pNode->getOperand(0));
+    } break;
     default:
       DXASSERT(false, "Unknown extended shader properties tag");
       m_bExtraMetadata = true;

+ 18 - 1
lib/DXIL/DxilModule.cpp

@@ -330,7 +330,6 @@ void DxilModule::CollectShaderFlagsForModule(ShaderFlags &Flags) {
     switch (UAV->GetKind()) {
     case DXIL::ResourceKind::RawBuffer:
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
       hasRawAndStructuredBuffer = true;
       break;
     default:
@@ -409,6 +408,24 @@ unsigned DxilModule::GetNumThreads(unsigned idx) const {
   return numThreads[idx];
 }
 
+void DxilModule::SetWaveSize(unsigned size) {
+  DXASSERT(m_DxilEntryPropsMap.size() == 1 && m_pSM->IsCS(),
+    "only works for CS profile");
+  DxilFunctionProps &props = m_DxilEntryPropsMap.begin()->second->props;
+  DXASSERT_NOMSG(m_pSM->GetKind() == props.shaderKind);
+  props.waveSize = size;
+}
+
+unsigned DxilModule::GetWaveSize() const {
+  DXASSERT(m_DxilEntryPropsMap.size() == 1 && m_pSM->IsCS(),
+    "only works for CS profiles");
+  if (!m_pSM->IsCS())
+    return 0;
+  const DxilFunctionProps &props = m_DxilEntryPropsMap.begin()->second->props;
+  DXASSERT_NOMSG(m_pSM->GetKind() == props.shaderKind);
+  return props.waveSize;
+}
+
 DXIL::InputPrimitive DxilModule::GetInputPrimitive() const {
   if (!m_pSM->IsGS())
     return DXIL::InputPrimitive::Undefined;

+ 107 - 22
lib/DXIL/DxilOperations.cpp

@@ -170,17 +170,23 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
   {  OC::RenderTargetGetSampleCount, "RenderTargetGetSampleCount", OCC::RenderTargetGetSampleCount, "renderTargetGetSampleCount", {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
 
   // Synchronization                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
-  {  OC::AtomicBinOp,             "AtomicBinOp",              OCC::AtomicBinOp,              "atomicBinOp",               { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::None,     },
-  {  OC::AtomicCompareExchange,   "AtomicCompareExchange",    OCC::AtomicCompareExchange,    "atomicCompareExchange",     { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::None,     },
+  {  OC::AtomicBinOp,             "AtomicBinOp",              OCC::AtomicBinOp,              "atomicBinOp",               { false, false, false, false, false, false, false,  true,  true, false, false}, Attribute::None,     },
+  {  OC::AtomicCompareExchange,   "AtomicCompareExchange",    OCC::AtomicCompareExchange,    "atomicCompareExchange",     { false, false, false, false, false, false, false,  true,  true, false, false}, Attribute::None,     },
   {  OC::Barrier,                 "Barrier",                  OCC::Barrier,                  "barrier",                   {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::NoDuplicate, },
 
-  // Pixel shader                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  // Derivatives                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
   {  OC::CalculateLOD,            "CalculateLOD",             OCC::CalculateLOD,             "calculateLOD",              { false, false,  true, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
+
+  // Pixel shader                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
   {  OC::Discard,                 "Discard",                  OCC::Discard,                  "discard",                   {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::None,     },
+
+  // Derivatives                                                                                                             void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
   {  OC::DerivCoarseX,            "DerivCoarseX",             OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
   {  OC::DerivCoarseY,            "DerivCoarseY",             OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
   {  OC::DerivFineX,              "DerivFineX",               OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
   {  OC::DerivFineY,              "DerivFineY",               OCC::Unary,                    "unary",                     { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Pixel shader                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
   {  OC::EvalSnapped,             "EvalSnapped",              OCC::EvalSnapped,              "evalSnapped",               { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
   {  OC::EvalSampleIndex,         "EvalSampleIndex",          OCC::EvalSampleIndex,          "evalSampleIndex",           { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
   {  OC::EvalCentroid,            "EvalCentroid",             OCC::EvalCentroid,             "evalCentroid",              { false,  true,  true, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
@@ -386,8 +392,15 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
   {  OC::RayQuery_CommittedInstanceContributionToHitGroupIndex, "RayQuery_CommittedInstanceContributionToHitGroupIndex", OCC::RayQuery_StateScalar,     "rayQuery_StateScalar",      { false, false, false, false, false, false, false,  true, false, false, false}, Attribute::ReadOnly, },
 
   // Get handle from heap                                                                                                    void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
-  {  OC::CreateHandleFromHeap,    "CreateHandleFromHeap",     OCC::CreateHandleFromHeap,     "createHandleFromHeap",      {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadOnly, },
   {  OC::AnnotateHandle,          "AnnotateHandle",           OCC::AnnotateHandle,           "annotateHandle",            {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::CreateHandleFromBinding, "CreateHandleFromBinding",  OCC::CreateHandleFromBinding,  "createHandleFromBinding",   {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+  {  OC::CreateHandleFromHeap,    "CreateHandleFromHeap",     OCC::CreateHandleFromHeap,     "createHandleFromHeap",      {  true, false, false, false, false, false, false, false, false, false, false}, Attribute::ReadNone, },
+
+  // Unpacking intrinsics                                                                                                    void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Unpack4x8,               "Unpack4x8",                OCC::Unpack4x8,                "unpack4x8",                 { false, false, false, false, false, false,  true,  true, false, false, false}, Attribute::ReadNone, },
+
+  // Packing intrinsics                                                                                                      void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64,   udt,   obj ,  function attribute
+  {  OC::Pack4x8,                 "Pack4x8",                  OCC::Pack4x8,                  "pack4x8",                   { false, false, false, false, false, false,  true,  true, false, false, false}, Attribute::ReadNone, },
 };
 // OPCODE-OLOADS:END
 
@@ -673,7 +686,7 @@ void OP::GetMinShaderModelAndMask(OpCode C, bool bWithTranslation,
   // Instructions: Sample=60, SampleBias=61, SampleCmp=64, CalculateLOD=81,
   // DerivCoarseX=83, DerivCoarseY=84, DerivFineX=85, DerivFineY=86
   if ((60 <= op && op <= 61) || op == 64 || op == 81 || (83 <= op && op <= 86)) {
-    mask = SFLAG(Library) | SFLAG(Pixel);
+    mask = SFLAG(Library) | SFLAG(Pixel) | SFLAG(Compute) | SFLAG(Amplification) | SFLAG(Mesh);
     return;
   }
   // Instructions: RenderTargetGetSamplePosition=76,
@@ -825,8 +838,9 @@ void OP::GetMinShaderModelAndMask(OpCode C, bool bWithTranslation,
     mask = SFLAG(Mesh);
     return;
   }
-  // Instructions: CreateHandleFromHeap=216, AnnotateHandle=217
-  if ((216 <= op && op <= 217)) {
+  // Instructions: AnnotateHandle=216, CreateHandleFromBinding=217,
+  // CreateHandleFromHeap=218, Unpack4x8=219, Pack4x8=220
+  if ((216 <= op && op <= 220)) {
     major = 6;  minor = 6;
     return;
   }
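
As a quick sanity sketch of the range check above: any of the new 6.6-only opcodes (216-220) should report a 6.6 minimum. The full parameter list of GetMinShaderModelAndMask is abbreviated in this hunk, so the trailing output parameters below are assumed.

    // Sketch only; output-parameter names are assumptions.
    unsigned major = 0, minor = 0, mask = 0;
    hlsl::OP::GetMinShaderModelAndMask(hlsl::DXIL::OpCode::Pack4x8,
                                       /*bWithTranslation*/ false,
                                       major, minor, mask);
    assert(major == 6 && minor == 6);  // Pack4x8 (220) is gated to SM 6.6
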
@@ -840,6 +854,22 @@ void OP::GetMinShaderModelAndMask(const llvm::CallInst *CI, bool bWithTranslatio
   OpCode opcode = OP::GetDxilOpFuncCallInst(CI);
   GetMinShaderModelAndMask(opcode, bWithTranslation, major, minor, mask);
 
+  unsigned op = (unsigned)opcode;
+  // These ops must not advertise support for CS, AS, or MS here;
+  // doing so would claim they are guaranteed to work in those stages
+  // at the lowest shader model this function returns.  For CS,
+  // SM 6.6 is required, and for AS/MS an optional feature is
+  // required as well.  Advertising them would also break
+  // compatibility with existing validators.  A different mechanism
+  // is needed before these can be supported in functions for
+  // runtime linking.
+  // Instructions: Sample=60, SampleBias=61, SampleCmp=64, CalculateLOD=81,
+  // DerivCoarseX=83, DerivCoarseY=84, DerivFineX=85, DerivFineY=86
+  if ((60 <= op && op <= 61) || op == 64 || op == 81 || (83 <= op && op <= 86)) {
+    mask &= ~(SFLAG(Compute) | SFLAG(Amplification) | SFLAG(Mesh));
+    return;
+  }
+
   if (DXIL::CompareVersions(valMajor, valMinor, 1, 5) < 0) {
     // validator 1.4 didn't exclude wave ops in mask
     if (IsDxilOpWave(opcode))
@@ -862,6 +892,15 @@ void OP::GetMinShaderModelAndMask(const llvm::CallInst *CI, bool bWithTranslatio
     }
     return;
   }
+
+  // 64-bit integer atomic ops require 6.6
+  else if (opcode == DXIL::OpCode::AtomicBinOp || opcode == DXIL::OpCode::AtomicCompareExchange) {
+    Type *pOverloadType = GetOverloadType(opcode, CI->getCalledFunction());
+    if (pOverloadType->isIntegerTy(64)) {
+      major = 6;
+      minor = 6;
+    }
+  }
 }
 #undef SFLAG
 
@@ -894,6 +933,12 @@ OP::OP(LLVMContext &Ctx, Module *pModule)
       m_Ctx, {Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx)},
       "dx.types.ResourceProperties", pModule);
 
+  m_pResourceBindingType =
+      GetOrCreateStructType(m_Ctx,
+                            {Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx),
+                             Type::getInt32Ty(m_Ctx), Type::getInt8Ty(m_Ctx)},
+                            "dx.types.ResBind", pModule);
+
   Type *DimsType[4] = { Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx) };
   m_pDimensionsType = GetOrCreateStructType(m_Ctx, DimsType, "dx.types.Dimensions", pModule);
 
@@ -909,8 +954,11 @@ OP::OP(LLVMContext &Ctx, Module *pModule)
   Type *SplitDoubleTypes[2] = { Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx) }; // Lo, Hi.
   m_pSplitDoubleType = GetOrCreateStructType(m_Ctx, SplitDoubleTypes, "dx.types.splitdouble", pModule);
 
-  Type *Int4Types[4] = { Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx) }; // HiHi, HiLo, LoHi, LoLo
-  m_pInt4Type = GetOrCreateStructType(m_Ctx, Int4Types, "dx.types.fouri32", pModule);
+  Type *FourI32Types[4] = { Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx) }; // HiHi, HiLo, LoHi, LoLo
+  m_pFourI32Type = GetOrCreateStructType(m_Ctx, FourI32Types, "dx.types.fouri32", pModule);
+
+  Type *FourI16Types[4] = { Type::getInt16Ty(m_Ctx), Type::getInt16Ty(m_Ctx), Type::getInt16Ty(m_Ctx), Type::getInt16Ty(m_Ctx) }; // HiHi, HiLo, LoHi, LoLo
+  m_pFourI16Type = GetOrCreateStructType(m_Ctx, FourI16Types, "dx.types.fouri16", pModule);
 
   // When loading a module into an existing context where types are merged,
   // type names may change.  When this happens, any intrinsics overloaded on
@@ -996,10 +1044,12 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   Type *p2I32 = GetBinaryWithTwoOutputsType();
   Type *pF64 = Type::getDoubleTy(m_Ctx);
   Type *pSDT = GetSplitDoubleType();  // Split double type.
-  Type *pI4S = GetInt4Type(); // 4 i32s in a struct.
+  Type *p4I32 = GetFourI32Type(); // 4 i32s in a struct.
+  
   Type *udt = pOverloadType;
   Type *obj = pOverloadType;
   Type *resProperty = GetResourcePropertiesType();
+  Type *resBind = GetResourceBindingType();
 
   std::string funcName;
   ConstructOverloadName(pOverloadType, opCode, funcName);
@@ -1014,6 +1064,7 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
 #define A(_x) ArgTypes.emplace_back(_x)
 #define RRT(_y) A(GetResRetType(_y))
 #define CBRT(_y) A(GetCBufferRetType(_y))
+#define VEC4(_y) A(GetVectorType(4,_y))
 
 /* <py::lines('OPCODE-OLOAD-FUNCS')>hctdb_instrhelp.get_oloads_funcs()</py>*/
   switch (opCode) {            // return     opCode
@@ -1145,17 +1196,23 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::RenderTargetGetSampleCount:A(pI32);     A(pI32); break;
 
     // Synchronization
-  case OpCode::AtomicBinOp:            A(pI32);     A(pI32); A(pRes); A(pI32); A(pI32); A(pI32); A(pI32); A(pI32); break;
-  case OpCode::AtomicCompareExchange:  A(pI32);     A(pI32); A(pRes); A(pI32); A(pI32); A(pI32); A(pI32); A(pI32); break;
+  case OpCode::AtomicBinOp:            A(pETy);     A(pI32); A(pRes); A(pI32); A(pI32); A(pI32); A(pI32); A(pETy); break;
+  case OpCode::AtomicCompareExchange:  A(pETy);     A(pI32); A(pRes); A(pI32); A(pI32); A(pI32); A(pETy); A(pETy); break;
   case OpCode::Barrier:                A(pV);       A(pI32); A(pI32); break;
 
-    // Pixel shader
+    // Derivatives
   case OpCode::CalculateLOD:           A(pF32);     A(pI32); A(pRes); A(pRes); A(pF32); A(pF32); A(pF32); A(pI1);  break;
+
+    // Pixel shader
   case OpCode::Discard:                A(pV);       A(pI32); A(pI1);  break;
+
+    // Derivatives
   case OpCode::DerivCoarseX:           A(pETy);     A(pI32); A(pETy); break;
   case OpCode::DerivCoarseY:           A(pETy);     A(pI32); A(pETy); break;
   case OpCode::DerivFineX:             A(pETy);     A(pI32); A(pETy); break;
   case OpCode::DerivFineY:             A(pETy);     A(pI32); A(pETy); break;
+
+    // Pixel shader
   case OpCode::EvalSnapped:            A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  A(pI32); A(pI32); break;
   case OpCode::EvalSampleIndex:        A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  A(pI32); break;
   case OpCode::EvalCentroid:           A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  break;
@@ -1203,7 +1260,7 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::WaveAnyTrue:            A(pI1);      A(pI32); A(pI1);  break;
   case OpCode::WaveAllTrue:            A(pI1);      A(pI32); A(pI1);  break;
   case OpCode::WaveActiveAllEqual:     A(pI1);      A(pI32); A(pETy); break;
-  case OpCode::WaveActiveBallot:       A(pI4S);     A(pI32); A(pI1);  break;
+  case OpCode::WaveActiveBallot:       A(p4I32);    A(pI32); A(pI1);  break;
   case OpCode::WaveReadLaneAt:         A(pETy);     A(pI32); A(pETy); A(pI32); break;
   case OpCode::WaveReadLaneFirst:      A(pETy);     A(pI32); A(pETy); break;
   case OpCode::WaveActiveOp:           A(pETy);     A(pI32); A(pETy); A(pI8);  A(pI8);  break;
@@ -1296,7 +1353,7 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::Dot4AddU8Packed:        A(pI32);     A(pI32); A(pI32); A(pI32); A(pI32); break;
 
     // Wave
-  case OpCode::WaveMatch:              A(pI4S);     A(pI32); A(pETy); break;
+  case OpCode::WaveMatch:              A(p4I32);    A(pI32); A(pETy); break;
   case OpCode::WaveMultiPrefixOp:      A(pETy);     A(pI32); A(pETy); A(pI32); A(pI32); A(pI32); A(pI32); A(pI8);  A(pI8);  break;
   case OpCode::WaveMultiPrefixBitCount:A(pI32);     A(pI32); A(pI1);  A(pI32); A(pI32); A(pI32); A(pI32); break;
 
@@ -1361,8 +1418,15 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::RayQuery_CommittedInstanceContributionToHitGroupIndex:A(pI32);     A(pI32); A(pI32); break;
 
     // Get handle from heap
-  case OpCode::CreateHandleFromHeap:   A(pRes);     A(pI32); A(pI32); A(pI1);  break;
-  case OpCode::AnnotateHandle:         A(pRes);     A(pI32); A(pRes); A(pI8);  A(pI8);  A(resProperty);break;
+  case OpCode::AnnotateHandle:         A(pRes);     A(pI32); A(pRes); A(resProperty);break;
+  case OpCode::CreateHandleFromBinding:A(pRes);     A(pI32); A(resBind);A(pI32); A(pI1);  break;
+  case OpCode::CreateHandleFromHeap:   A(pRes);     A(pI32); A(pI32); A(pI1);  A(pI1);  break;
+
+    // Unpacking intrinsics
+  case OpCode::Unpack4x8:              VEC4(pETy);  A(pI32); A(pI8);  A(pI32); break;
+
+    // Packing intrinsics
+  case OpCode::Pack4x8:                A(pI32);     A(pI32); A(pI8);  A(pETy); A(pETy); A(pETy); A(pETy); break;
   // OPCODE-OLOAD-FUNCS:END
   default: DXASSERT(false, "otherwise unhandled case"); break;
   }
@@ -1447,6 +1511,7 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   // OPCODE-OLOAD-TYPES:BEGIN
   case OpCode::TempRegStore:
   case OpCode::CallShader:
+  case OpCode::Pack4x8:
     DXASSERT_NOMSG(FT->getNumParams() > 2);
     return FT->getParamType(2);
   case OpCode::MinPrecXRegStore:
@@ -1531,12 +1596,11 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::RayQuery_Abort:
   case OpCode::RayQuery_CommitNonOpaqueTriangleHit:
   case OpCode::RayQuery_CommitProceduralPrimitiveHit:
-  case OpCode::CreateHandleFromHeap:
   case OpCode::AnnotateHandle:
+  case OpCode::CreateHandleFromBinding:
+  case OpCode::CreateHandleFromHeap:
     return Type::getVoidTy(Ctx);
   case OpCode::CheckAccessFullyMapped:
-  case OpCode::AtomicBinOp:
-  case OpCode::AtomicCompareExchange:
   case OpCode::SampleIndex:
   case OpCode::Coverage:
   case OpCode::InnerCoverage:
@@ -1618,6 +1682,7 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::TextureGather:
   case OpCode::TextureGatherCmp:
   case OpCode::RawBufferLoad:
+  case OpCode::Unpack4x8:
   {
     StructType *ST = cast<StructType>(Ty);
     return ST->getElementType(0);
@@ -1635,6 +1700,10 @@ Type *OP::GetResourcePropertiesType() const {
   return m_pResourcePropertiesType;
 }
 
+Type *OP::GetResourceBindingType() const {
+  return m_pResourceBindingType;
+}
+
 Type *OP::GetDimensionsType() const
 {
   return m_pDimensionsType;
@@ -1657,8 +1726,12 @@ Type *OP::GetSplitDoubleType() const {
   return m_pSplitDoubleType;
 }
 
-Type *OP::GetInt4Type() const {
-  return m_pInt4Type;
+Type *OP::GetFourI32Type() const {
+  return m_pFourI32Type;
+}
+
+Type *OP::GetFourI16Type() const {
+  return m_pFourI16Type;
 }
 
 bool OP::IsResRetType(llvm::Type *Ty) {
@@ -1710,6 +1783,18 @@ Type *OP::GetCBufferRetType(Type *pOverloadType) {
   return m_pCBufferRetType[TypeSlot];
 }
 
+Type *OP::GetVectorType(unsigned numElements, Type *pOverloadType) {
+  if (numElements == 4) {
+    if (pOverloadType == Type::getInt32Ty(pOverloadType->getContext())) {
+      return m_pFourI32Type;
+    }
+    else if (pOverloadType == Type::getInt16Ty(pOverloadType->getContext())) {
+      return m_pFourI16Type;
+    }
+  }
+  DXASSERT(false, "unexpected overload type");
+  return nullptr;
+}
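
The new four-element struct types back the VEC4 return of the packing opcodes; as a sketch (Ctx and hlslOP are assumed to be an llvm::LLVMContext and this module's hlsl::OP), the i16 overload of Unpack4x8 returns the new dx.types.fouri16 struct:

    llvm::Function *UnpackFn = hlslOP->GetOpFunc(
        hlsl::DXIL::OpCode::Unpack4x8, llvm::Type::getInt16Ty(Ctx));
    auto *RetTy = llvm::cast<llvm::StructType>(UnpackFn->getReturnType());
    // Name may carry a suffix when types are merged across contexts.
    assert(RetTy->getName().startswith("dx.types.fouri16"));
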
 
 //------------------------------------------------------------------------------
 //

+ 11 - 11
lib/DXIL/DxilResource.cpp

@@ -36,7 +36,16 @@ CompType DxilResource::GetCompType() const {
 }
 
 void DxilResource::SetCompType(const CompType CT) {
-  m_CompType = CT;
+  // Translate packed types to u32
+  switch(CT.GetKind()) {
+    case CompType::Kind::PackedS8x32:
+    case CompType::Kind::PackedU8x32:
+      m_CompType = CompType::getU32();
+      break;
+    default:
+      m_CompType = CT;
+      break;
+  }
 }
 
 Type *DxilResource::GetRetType() const {
@@ -121,8 +130,7 @@ bool DxilResource::IsAnyTexture(Kind ResourceKind) {
 }
 
 bool DxilResource::IsStructuredBuffer() const {
-  return GetKind() == Kind::StructuredBuffer ||
-         GetKind() == Kind::StructuredBufferWithCounter;
+  return GetKind() == Kind::StructuredBuffer;
 }
 
 bool DxilResource::IsTypedBuffer() const {
@@ -162,8 +170,6 @@ unsigned DxilResource::GetNumCoords(Kind ResourceKind) {
       0, // RaytracingAccelerationStructure,
       2, // FeedbackTexture2D,
       3, // FeedbackTexture2DArray,
-      2, // StructureBufferWithCounter,
-      0, // SamplerComparation,
   };
   static_assert(_countof(CoordSizeTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");
@@ -191,8 +197,6 @@ unsigned DxilResource::GetNumDimensions(Kind ResourceKind) {
       0, // RaytracingAccelerationStructure,
       2, // FeedbackTexture2D,
       2, // FeedbackTexture2DArray,
-      2, // StructureBufferWithCounter,
-      0, // SamplerComparation,
   };
   static_assert(_countof(NumDimTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");
@@ -220,8 +224,6 @@ unsigned DxilResource::GetNumDimensionsForCalcLOD(Kind ResourceKind) {
       0, // RaytracingAccelerationStructure,
       2, // FeedbackTexture2D,
       2, // FeedbackTexture2DArray,
-      2, // StructureBufferWithCounter,
-      0, // SamplerComparation,
   };
   static_assert(_countof(NumDimTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");
@@ -249,8 +251,6 @@ unsigned DxilResource::GetNumOffsets(Kind ResourceKind) {
       0, // RaytracingAccelerationStructure,
       2, // FeedbackTexture2D,
       2, // FeedbackTexture2DArray,
-      0, // StructureBufferWithCounter,
-      0, // SamplerComparation,
   };
   static_assert(_countof(OffsetSizeTab) == (unsigned)Kind::NumEntries, "check helper array size");
   DXASSERT(ResourceKind > Kind::Invalid && ResourceKind < Kind::NumEntries, "otherwise the caller passed wrong resource type");

+ 1 - 2
lib/DXIL/DxilResourceBase.cpp

@@ -90,7 +90,7 @@ static const char *s_ResourceDimNames[] = {
         "invalid", "1d",        "2d",      "2dMS",      "3d",
         "cube",    "1darray",   "2darray", "2darrayMS", "cubearray",
         "buf",     "rawbuf",    "structbuf", "cbuffer", "sampler",
-        "tbuffer", "ras", "fbtex2d", "fbtex2darray", "structbufwithcounter", "samplercomparison",
+        "tbuffer", "ras", "fbtex2d", "fbtex2darray",
 };
 static_assert(_countof(s_ResourceDimNames) == (unsigned)DxilResourceBase::Kind::NumEntries,
   "Resource dim names array must be updated when new resource kind enums are added.");
@@ -104,7 +104,6 @@ static const char *s_ResourceKindNames[] = {
         "TextureCube", "Texture1DArray",   "Texture2DArray",   "Texture2DMSArray", "TextureCubeArray",
         "TypedBuffer", "RawBuffer",        "StructuredBuffer", "CBuffer",          "Sampler",
         "TBuffer",     "RTAccelerationStructure", "FeedbackTexture2D", "FeedbackTexture2DArray",
-        "StructuredBufferWithCounter", "SamplerComparison",
 };
 static_assert(_countof(s_ResourceKindNames) == (unsigned)DxilResourceBase::Kind::NumEntries,
   "Resource kind names array must be updated when new resource kind enums are added.");

+ 109 - 0
lib/DXIL/DxilResourceBinding.cpp

@@ -0,0 +1,109 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilResourceBinding.cpp                                                   //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/DXIL/DxilResourceBinding.h"
+#include "llvm/IR/Constant.h"
+#include "dxc/DXIL/DxilShaderModel.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Constants.h"
+#include "dxc/DXIL/DxilResourceBase.h"
+#include "dxc/DXIL/DxilResource.h"
+#include "dxc/DXIL/DxilCBuffer.h"
+#include "dxc/DXIL/DxilSampler.h"
+#include "dxc/DXIL/DxilOperations.h"
+#include "dxc/DXIL/DxilInstructions.h"
+#include "dxc/DXIL/DxilUtil.h"
+
+using namespace llvm;
+
+namespace hlsl {
+
+bool DxilResourceBinding::operator==(const DxilResourceBinding &B) {
+  return rangeLowerBound == B.rangeLowerBound &&
+         rangeUpperBound == B.rangeUpperBound && spaceID == B.spaceID &&
+         resourceClass == B.resourceClass;
+}
+
+bool DxilResourceBinding::operator!=(const DxilResourceBinding &B) {
+  return !(*this == B) ;
+}
+
+namespace resource_helper {
+
+// The constant is a struct with int fields.
+// ShaderModel 6.6 has 4 fields.
+llvm::Constant *getAsConstant(const DxilResourceBinding &B, llvm::Type *Ty,
+                              const ShaderModel &) {
+  StructType *ST = cast<StructType>(Ty);
+  switch (ST->getNumElements()) {
+  case 4: {
+    Constant *RawDwords[] = {
+        ConstantInt::get(ST->getElementType(0), B.rangeLowerBound),
+        ConstantInt::get(ST->getElementType(1), B.rangeUpperBound),
+        ConstantInt::get(ST->getElementType(2), B.spaceID),
+        ConstantInt::get(ST->getElementType(3), B.resourceClass)};
+    return ConstantStruct::get(ST, RawDwords);
+  } break;
+  default:
+    return nullptr;
+    break;
+  }
+  return nullptr;
+}
+DxilResourceBinding loadBindingFromConstant(const llvm::Constant &C) {
+  DxilResourceBinding B;
+
+  // Ty should match C.getType().
+  Type *Ty = C.getType();
+  StructType *ST = cast<StructType>(Ty);
+  switch (ST->getNumElements()) {
+  case 4: {
+    if (isa<ConstantAggregateZero>(&C)) {
+      B.rangeLowerBound = 0;
+      B.rangeUpperBound = 0;
+      B.spaceID = 0;
+      B.resourceClass = 0;
+    } else {
+      const ConstantStruct *CS = cast<ConstantStruct>(&C);
+      const Constant *rangeLowerBound = CS->getOperand(0);
+      const Constant *rangeUpperBound = CS->getOperand(1);
+      const Constant *spaceID = CS->getOperand(2);
+      const Constant *resourceClass = CS->getOperand(3);
+      B.rangeLowerBound = cast<ConstantInt>(rangeLowerBound)->getLimitedValue();
+      B.rangeUpperBound = cast<ConstantInt>(rangeUpperBound)->getLimitedValue();
+      B.spaceID = cast<ConstantInt>(spaceID)->getLimitedValue();
+      B.resourceClass = cast<ConstantInt>(resourceClass)->getLimitedValue();
+    }
+  } break;
+  default:
+    B.resourceClass = (uint8_t)DXIL::ResourceClass::Invalid;
+    break;
+  }
+  return B;
+}
+DxilResourceBinding loadBindingFromCreateHandleFromBinding(
+    DxilInst_CreateHandleFromBinding &createHandle, llvm::Type *Ty,
+    const ShaderModel &) {
+  Constant *B = cast<Constant>(createHandle.get_bind());
+  return loadBindingFromConstant(*B);
+}
+DxilResourceBinding loadBindingFromResourceBase(DxilResourceBase *Res) {
+  DxilResourceBinding B = {};
+  B.resourceClass = (uint8_t)DXIL::ResourceClass::Invalid;
+  if (!Res)
+    return B;
+  B.rangeLowerBound = Res->GetLowerBound();
+  B.rangeUpperBound = Res->GetUpperBound();
+  B.spaceID = Res->GetSpaceID();
+  B.resourceClass = (uint8_t)Res->GetClass();
+  return B;
+}
+
+} // namespace resource_helper
+} // namespace hlsl
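
A minimal usage sketch for these helpers, mirroring how the lowering pass in DxilCondenseResources.cpp below consumes them (Res, hlslOP, and SM are assumed to be a DxilResourceBase*, the module's hlsl::OP, and its ShaderModel):

    hlsl::DxilResourceBinding B =
        hlsl::resource_helper::loadBindingFromResourceBase(Res);
    llvm::Constant *BindV = hlsl::resource_helper::getAsConstant(
        B, hlslOP->GetResourceBindingType(), *SM);
    // BindV packs {rangeLowerBound, rangeUpperBound, spaceID, resourceClass}
    // and becomes the binding operand of dx.op.createHandleFromBinding.
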

+ 59 - 81
lib/DXIL/DxilResourceProperties.cpp

@@ -24,33 +24,52 @@ using namespace llvm;
 
 namespace hlsl {
 
-bool DxilResourceProperties::operator==(const DxilResourceProperties &RP) {
-  return Class == RP.Class && Kind == RP.Kind && RawDword0 == RP.RawDword0 &&
-         RawDword1 == RP.RawDword1;
+DxilResourceProperties::DxilResourceProperties() {
+  RawDword0 = 0;
+  RawDword1 = 0;
+  Basic.ResourceKind = (uint8_t)DXIL::ResourceKind::Invalid;
+}
+bool DxilResourceProperties::isUAV() const { return Basic.IsUAV; }
+bool DxilResourceProperties::isValid() const {
+  return getResourceKind() != DXIL::ResourceKind::Invalid;
 }
 
-bool DxilResourceProperties::operator!=(const DxilResourceProperties &RP) {
-  return !(*this == RP) ;
+DXIL::ResourceClass DxilResourceProperties::getResourceClass() const {
+  switch (static_cast<DXIL::ResourceKind>(Basic.ResourceKind)) {
+  default:
+    return Basic.IsUAV ? DXIL::ResourceClass::UAV : DXIL::ResourceClass::SRV;
+  case DXIL::ResourceKind::CBuffer:
+    return DXIL::ResourceClass::CBuffer;
+  case DXIL::ResourceKind::Sampler:
+    return DXIL::ResourceClass::Sampler;
+  case DXIL::ResourceKind::Invalid:
+    return DXIL::ResourceClass::Invalid;
+  }
 }
 
-unsigned DxilResourceProperties::getSampleCount() {
-  assert(DXIL::IsTyped(Kind));
-  const unsigned SampleCountTable[] = {
-    1,  // 0
-    2,  // 1
-    4,  // 2
-    8,  // 3
-    16, // 4
-    32, // 5
-    0,  // 6
-    0,  // kSampleCountUndefined.
-  };
-  return SampleCountTable[Typed.SampleCountPow2];
+DXIL::ResourceKind DxilResourceProperties::getResourceKind() const {
+  return static_cast<DXIL::ResourceKind>(Basic.ResourceKind);
+}
+
+void DxilResourceProperties::setResourceKind(DXIL::ResourceKind RK) {
+  Basic.ResourceKind = (uint8_t)RK;
+}
+
+DXIL::ComponentType DxilResourceProperties::getCompType() const {
+  return static_cast<DXIL::ComponentType>(Typed.CompType);
+}
+
+bool DxilResourceProperties::operator==(const DxilResourceProperties &RP) const {
+  return RawDword0 == RP.RawDword0 &&
+         RawDword1 == RP.RawDword1;
+}
+
+bool DxilResourceProperties::operator!=(const DxilResourceProperties &RP) const {
+  return !(*this == RP) ;
 }
 
 namespace resource_helper {
-// Resource Class and Resource Kind is used as seperate parameter, other fileds
-// are saved in constant.
+
 // The constant is a struct with int32 fields.
 // ShaderModel 6.6 has 2 fields.
 Constant *getAsConstant(const DxilResourceProperties &RP, Type *Ty,
@@ -70,12 +89,9 @@ Constant *getAsConstant(const DxilResourceProperties &RP, Type *Ty,
   return nullptr;
 }
 
-DxilResourceProperties loadFromConstant(const Constant &C,
-                                        DXIL::ResourceClass RC,
-                                        DXIL::ResourceKind RK) {
+DxilResourceProperties loadPropsFromConstant(const Constant &C) {
   DxilResourceProperties RP;
-  RP.Class = RC;
-  RP.Kind = RK;
+
   // Ty Should match C.getType().
   Type *Ty = C.getType();
   StructType *ST = cast<StructType>(Ty);
@@ -93,31 +109,25 @@ DxilResourceProperties loadFromConstant(const Constant &C,
     }
   } break;
   default:
-    RP.Class = DXIL::ResourceClass::Invalid;
     break;
   }
   return RP;
 }
 
 DxilResourceProperties
-loadFromAnnotateHandle(DxilInst_AnnotateHandle &annotateHandle, llvm::Type *Ty,
-                       const ShaderModel &SM) {
+loadPropsFromAnnotateHandle(DxilInst_AnnotateHandle &annotateHandle,
+                            llvm::Type *Ty, const ShaderModel &SM) {
   Constant *ResProp = cast<Constant>(annotateHandle.get_props());
-  return loadFromConstant(
-      *ResProp, (DXIL::ResourceClass)annotateHandle.get_resourceClass_val(),
-      (DXIL::ResourceKind)annotateHandle.get_resourceKind_val());
+  return loadPropsFromConstant(*ResProp);
 }
 
-DxilResourceProperties loadFromResourceBase(DxilResourceBase *Res) {
+DxilResourceProperties loadPropsFromResourceBase(DxilResourceBase *Res) {
 
   DxilResourceProperties RP;
-  RP.Class = DXIL::ResourceClass::Invalid;
   if (!Res) {
     return RP;
   }
 
-  RP.RawDword0 = 0;
-  RP.RawDword1 = 0;
 
   auto SetResProperties = [&RP](DxilResource &Res) {
     switch (Res.GetKind()) {
@@ -131,35 +141,10 @@ DxilResourceProperties loadFromResourceBase(DxilResourceBase *Res) {
 
       break;
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
-      RP.ElementStride = Res.GetElementStride();
+      RP.StructStrideInBytes = Res.GetElementStride();
       break;
     case DXIL::ResourceKind::Texture2DMS:
     case DXIL::ResourceKind::Texture2DMSArray:
-      switch (Res.GetSampleCount()) {
-      default:
-        RP.Typed.SampleCountPow2 =
-            DxilResourceProperties::kSampleCountUndefined;
-        break;
-      case 1:
-        RP.Typed.SampleCountPow2 = 0;
-        break;
-      case 2:
-        RP.Typed.SampleCountPow2 = 1;
-        break;
-      case 4:
-        RP.Typed.SampleCountPow2 = 2;
-        break;
-      case 8:
-        RP.Typed.SampleCountPow2 = 3;
-        break;
-      case 16:
-        RP.Typed.SampleCountPow2 = 4;
-        break;
-      case 32:
-        RP.Typed.SampleCountPow2 = 5;
-        break;
-      }
       break;
     case DXIL::ResourceKind::TypedBuffer:
     case DXIL::ResourceKind::Texture1D:
@@ -170,10 +155,8 @@ DxilResourceProperties loadFromResourceBase(DxilResourceBase *Res) {
     case DXIL::ResourceKind::TextureCubeArray:
     case DXIL::ResourceKind::Texture3D:
       Type *Ty = Res.GetRetType();
-      RP.Typed.SingleComponent = dxilutil::IsResourceSingleComponent(Ty);
-      RP.Typed.CompType = Res.GetCompType().GetKind();
-      RP.Typed.SampleCountPow2 =
-          DxilResourceProperties::kSampleCountUndefined;
+      RP.Typed.CompCount = dxilutil::GetResourceComponentCount(Ty);
+      RP.Typed.CompType = (uint8_t)Res.GetCompType().GetKind();
       break;
     }
   };
@@ -181,35 +164,30 @@ DxilResourceProperties loadFromResourceBase(DxilResourceBase *Res) {
   switch (Res->GetClass()) { case DXIL::ResourceClass::Invalid: return RP;
   case DXIL::ResourceClass::SRV: {
     DxilResource *SRV = (DxilResource*)(Res);
-    RP.Kind = Res->GetKind();
-    RP.Class = Res->GetClass();
+    RP.Basic.ResourceKind = (uint8_t)Res->GetKind();
     SetResProperties(*SRV);
   } break;
   case DXIL::ResourceClass::UAV: {
     DxilResource *UAV = (DxilResource *)(Res);
-    RP.Kind = Res->GetKind();
-    RP.Class = Res->GetClass();
-    RP.UAV.bGloballyCoherent = UAV->IsGloballyCoherent();
-    if (UAV->HasCounter()) {
-      RP.Kind = DXIL::ResourceKind::StructuredBufferWithCounter;
-    }
-    RP.UAV.bROV = UAV->IsROV();
+    RP.Basic.IsUAV = true;
+    RP.Basic.ResourceKind = (uint8_t)Res->GetKind();
+    RP.Basic.IsGloballyCoherent = UAV->IsGloballyCoherent();
+    RP.Basic.SamplerCmpOrHasCounter = UAV->HasCounter();
+    RP.Basic.IsROV = UAV->IsROV();
     SetResProperties(*UAV);
   } break;
   case DXIL::ResourceClass::Sampler: {
-    RP.Class = DXIL::ResourceClass::Sampler;
-    RP.Kind = Res->GetKind();
+    RP.Basic.ResourceKind = (uint8_t)Res->GetKind();
     DxilSampler *Sampler = (DxilSampler*)Res;
     if (Sampler->GetSamplerKind() == DXIL::SamplerKind::Comparison)
-      RP.Kind = DXIL::ResourceKind::SamplerComparison;
+      RP.Basic.SamplerCmpOrHasCounter = true;
     else if (Sampler->GetSamplerKind() == DXIL::SamplerKind::Invalid)
-      RP.Kind = DXIL::ResourceKind::Invalid;
+      RP.Basic.ResourceKind = (uint8_t)DXIL::ResourceKind::Invalid;
   } break;
   case DXIL::ResourceClass::CBuffer: {
-    RP.Class = DXIL::ResourceClass::CBuffer;
-    RP.Kind = Res->GetKind();
+    RP.Basic.ResourceKind = (uint8_t)Res->GetKind();
     DxilCBuffer *CB = (DxilCBuffer *)Res;
-    RP.SizeInBytes = CB->GetSize();
+    RP.CBufferSizeInBytes = CB->GetSize();
   } break;
   }
   return RP;

+ 64 - 20
lib/DXIL/DxilShaderFlags.cpp

@@ -52,6 +52,9 @@ ShaderFlags::ShaderFlags():
 , m_bShadingRate(false)
 , m_bRaytracingTier1_1(false)
 , m_bSamplerFeedback(false)
+, m_bAtomicInt64OnTypedResource(false)
+, m_bAtomicInt64OnGroupShared(false)
+, m_bDerivativesInMeshAndAmpShaders(false)
 , m_align0(0)
 , m_align1(0)
 {
@@ -104,6 +107,9 @@ uint64_t ShaderFlags::GetFeatureInfo() const {
   Flags |= m_bShadingRate ? hlsl::DXIL::ShaderFeatureInfo_ShadingRate : 0;
   Flags |= m_bRaytracingTier1_1 ? hlsl::DXIL::ShaderFeatureInfo_Raytracing_Tier_1_1 : 0;
   Flags |= m_bSamplerFeedback ? hlsl::DXIL::ShaderFeatureInfo_SamplerFeedback : 0;
+  Flags |= m_bAtomicInt64OnTypedResource ? hlsl::DXIL::ShaderFeatureInfo_AtomicInt64OnTypedResource : 0;
+  Flags |= m_bAtomicInt64OnGroupShared ? hlsl::DXIL::ShaderFeatureInfo_AtomicInt64OnGroupShared : 0;
+  Flags |= m_bDerivativesInMeshAndAmpShaders ? hlsl::DXIL::ShaderFeatureInfo_DerivativesInMeshAndAmpShaders : 0;
 
   return Flags;
 }
@@ -158,6 +164,9 @@ uint64_t ShaderFlags::GetShaderFlagsRawForCollection() {
   Flags.SetShadingRate(true);
   Flags.SetRaytracingTier1_1(true);
   Flags.SetSamplerFeedback(true);
+  Flags.SetAtomicInt64OnTypedResource(true);
+  Flags.SetAtomicInt64OnGroupShared(true);
+  Flags.SetDerivativesInMeshAndAmpShaders(true);
   return Flags.GetShaderFlagsRaw();
 }
 
@@ -217,9 +226,7 @@ static CallInst *FindCallToCreateHandle(Value *handleType) {
 
 DxilResourceProperties GetResourcePropertyFromHandleCall(const hlsl::DxilModule *M, CallInst *handleCall) {
 
-  DxilResourceProperties RP = {};
-  RP.Class = DXIL::ResourceClass::Invalid;
-  RP.Kind = DXIL::ResourceKind::Invalid;
+  DxilResourceProperties RP;
 
   ConstantInt *HandleOpCodeConst = cast<ConstantInt>(
       handleCall->getArgOperand(DXIL::OperandIndex::kOpcodeIdx));
@@ -237,7 +244,7 @@ DxilResourceProperties GetResourcePropertyFromHandleCall(const hlsl::DxilModule
           resource = M->GetUAV(rangeID->getLimitedValue());
         else if (resClass == DXIL::ResourceClass::SRV)
           resource = M->GetSRV(rangeID->getLimitedValue());
-        RP = resource_helper::loadFromResourceBase(&resource);
+        RP = resource_helper::loadPropsFromResourceBase(&resource);
       }
     }
   }
@@ -248,7 +255,7 @@ DxilResourceProperties GetResourcePropertyFromHandleCall(const hlsl::DxilModule
       Value *resType = LI->getOperand(0);
       for (auto &&res : M->GetUAVs()) {
         if (res->GetGlobalSymbol() == resType) {
-          RP = resource_helper::loadFromResourceBase(res.get());
+          RP = resource_helper::loadPropsFromResourceBase(res.get());
         }
       }
     }
@@ -256,7 +263,7 @@ DxilResourceProperties GetResourcePropertyFromHandleCall(const hlsl::DxilModule
     DxilInst_AnnotateHandle annotateHandle(cast<Instruction>(handleCall));
     Type *ResPropTy = M->GetOP()->GetResourcePropertiesType();
 
-    RP = resource_helper::loadFromAnnotateHandle(annotateHandle, ResPropTy, *M->GetShaderModel());
+    RP = resource_helper::loadPropsFromAnnotateHandle(annotateHandle, ResPropTy, *M->GetShaderModel());
   }
 
   return RP;
@@ -288,6 +295,9 @@ ShaderFlags ShaderFlags::CollectShaderFlags(const Function *F,
   bool hasShadingRate = false;
   bool hasSamplerFeedback = false;
   bool hasRaytracingTier1_1 = false;
+  bool hasAtomicInt64OnTypedResource = false;
+  bool hasAtomicInt64OnGroupShared = false;
+  bool hasDerivativesInMeshAndAmpShaders = false;
 
   // Try to maintain compatibility with a v1.0 validator if that's what we have.
   uint32_t valMajor, valMinor;
@@ -320,22 +330,30 @@ ShaderFlags ShaderFlags::CollectShaderFlags(const Function *F,
         isInt16 |= Ty == int16Ty;
         isInt64 |= Ty == int64Ty;
       }
-        if (isDouble) {
-          hasDouble = true;
-          switch (I.getOpcode()) {
-          case Instruction::FDiv:
-          case Instruction::UIToFP:
-          case Instruction::SIToFP:
-          case Instruction::FPToUI:
-          case Instruction::FPToSI:
-            hasDoubleExtension = true;
-            break;
-          }
+      if (isDouble) {
+        hasDouble = true;
+        switch (I.getOpcode()) {
+        case Instruction::FDiv:
+        case Instruction::UIToFP:
+        case Instruction::SIToFP:
+        case Instruction::FPToUI:
+        case Instruction::FPToSI:
+          hasDoubleExtension = true;
+          break;
+        }
+      }
+      if (isInt64) {
+        has64Int = true;
+        switch (I.getOpcode()) {
+        case Instruction::AtomicCmpXchg:
+        case Instruction::AtomicRMW:
+          hasAtomicInt64OnGroupShared = true;
+          break;
         }
+      }
 
       has16 |= isHalf;
       has16 |= isInt16;
-      has64Int |= isInt64;
       if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
         if (!OP::IsDxilOpFunc(CI->getCalledFunction()))
           continue;
@@ -364,13 +382,14 @@ ShaderFlags ShaderFlags::CollectShaderFlags(const Function *F,
           // Check if this is a library handle or general create handle
           if (handleCall) {
             DxilResourceProperties RP = GetResourcePropertyFromHandleCall(M, handleCall);
-            if (RP.Class == DXIL::ResourceClass::UAV) {
+            if (RP.isUAV()) {
               // Validator 1.0 assumes that all UAV loads are multi-component loads.
               if (hasMulticomponentUAVLoadsBackCompat) {
                 hasMulticomponentUAVLoads = true;
                 continue;
               } else {
-                if (DXIL::IsTyped(RP.Kind) && !RP.Typed.SingleComponent)
+                if (DXIL::IsTyped(RP.getResourceKind()) &&
+                    RP.Typed.CompCount > 1)
                   hasMulticomponentUAVLoads = true;
               }
             }
@@ -389,6 +408,28 @@ ShaderFlags ShaderFlags::CollectShaderFlags(const Function *F,
         case DXIL::OpCode::GeometryIndex:
           hasRaytracingTier1_1 = true;
           break;
+        case DXIL::OpCode::AtomicBinOp:
+        case DXIL::OpCode::AtomicCompareExchange:
+          if (isInt64) {
+            Value *resHandle = CI->getArgOperand(DXIL::OperandIndex::kAtomicBinOpHandleOpIdx);
+            CallInst *handleCall = FindCallToCreateHandle(resHandle);
+            DxilResourceProperties RP = GetResourcePropertyFromHandleCall(M, handleCall);
+            if (DXIL::IsTyped(RP.getResourceKind()))
+                hasAtomicInt64OnTypedResource = true;
+          }
+          break;
+        case DXIL::OpCode::DerivFineX:
+        case DXIL::OpCode::DerivFineY:
+        case DXIL::OpCode::DerivCoarseX:
+        case DXIL::OpCode::DerivCoarseY:
+        case DXIL::OpCode::CalculateLOD:
+        case DXIL::OpCode::Sample:
+        case DXIL::OpCode::SampleBias:
+        case DXIL::OpCode::SampleCmp: {
+          const ShaderModel *pSM = M->GetShaderModel();
+          if (pSM->IsAS() || pSM->IsMS())
+            hasDerivativesInMeshAndAmpShaders = true;
+        } break;
         default:
           // Normal opcodes.
           break;
@@ -492,6 +533,9 @@ ShaderFlags ShaderFlags::CollectShaderFlags(const Function *F,
   flag.SetShadingRate(hasShadingRate);
   flag.SetSamplerFeedback(hasSamplerFeedback);
   flag.SetRaytracingTier1_1(hasRaytracingTier1_1);
+  flag.SetAtomicInt64OnTypedResource(hasAtomicInt64OnTypedResource);
+  flag.SetAtomicInt64OnGroupShared(hasAtomicInt64OnGroupShared);
+  flag.SetDerivativesInMeshAndAmpShaders(hasDerivativesInMeshAndAmpShaders);
 
   return flag;
 }
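
The three new collection flags surface as DXIL feature-info bits; a sketch of testing them after collection (flags is assumed to be the ShaderFlags value returned by CollectShaderFlags):

    uint64_t Features = flags.GetFeatureInfo();
    bool needs64BitTypedAtomics =
        (Features & hlsl::DXIL::ShaderFeatureInfo_AtomicInt64OnTypedResource) != 0;
    bool needsMeshAmpDerivatives =
        (Features & hlsl::DXIL::ShaderFeatureInfo_DerivativesInMeshAndAmpShaders) != 0;
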

+ 11 - 2
lib/DXIL/DxilSignatureElement.cpp

@@ -58,7 +58,7 @@ void DxilSignatureElement::Initialize(llvm::StringRef Name, const CompType &Elem
     m_SemanticStartIndex = IndexVector[0];
   // Find semantic in the table.
   m_pSemantic = Semantic::GetByName(m_SemanticName, m_sigPointKind);
-  m_CompType = ElementType;
+  SetCompType(ElementType);
   m_InterpMode = InterpMode;
   m_SemanticIndex = IndexVector;
   m_Rows = Rows;
@@ -222,7 +222,16 @@ void DxilSignatureElement::AppendSemanticIndex(unsigned SemIdx) {
 }
 
 void DxilSignatureElement::SetCompType(CompType CT) {
-  m_CompType = CT;
+  // Translate packed types to u32
+  switch(CT.GetKind()) {
+    case CompType::Kind::PackedS8x32:
+    case CompType::Kind::PackedU8x32:
+      m_CompType = CompType::getU32();
+      break;
+    default:
+      m_CompType = CT;
+      break;
+  }
 }
 
 uint8_t DxilSignatureElement::GetColsAsMask() const {

+ 22 - 0
lib/DXIL/DxilUtil.cpp

@@ -607,6 +607,25 @@ bool IsResourceSingleComponent(Type *Ty) {
   return true;
 }
 
+uint8_t GetResourceComponentCount(llvm::Type *Ty) {
+  if (llvm::ArrayType *arrType = llvm::dyn_cast<llvm::ArrayType>(Ty)) {
+    return arrType->getArrayNumElements() *
+           GetResourceComponentCount(arrType->getArrayElementType());
+  } else if (llvm::StructType *structType =
+                 llvm::dyn_cast<llvm::StructType>(Ty)) {
+    uint32_t Count = 0;
+    for (Type *EltTy : structType->elements())  {
+      Count += GetResourceComponentCount(EltTy);
+    }
+    DXASSERT(Count <= 4, "Component count out of bounds.");
+    return Count;
+  } else if (llvm::VectorType *vectorType =
+                 llvm::dyn_cast<llvm::VectorType>(Ty)) {
+    return vectorType->getNumElements();
+  }
+  return 1;
+}
+
 bool IsHLSLResourceType(llvm::Type *Ty) {
   if (llvm::StructType *ST = dyn_cast<llvm::StructType>(Ty)) {
     if (!ST->hasName())
@@ -720,6 +739,9 @@ bool IsHLSLResourceDescType(llvm::Type *Ty) {
     // TODO: don't check names.
     if (name == ("struct..Resource"))
       return true;
+
+    if (name == "struct..Sampler")
+      return true;
   }
   return false;
 }
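
A quick sanity sketch for the new component-count helper above (Ctx is an assumed llvm::LLVMContext): vectors report their element count and scalars report one, which is what loadPropsFromResourceBase stores into Typed.CompCount.

    llvm::Type *F32 = llvm::Type::getFloatTy(Ctx);
    llvm::Type *Float4 = llvm::VectorType::get(F32, 4);
    assert(hlsl::dxilutil::GetResourceComponentCount(Float4) == 4);
    assert(hlsl::dxilutil::GetResourceComponentCount(F32) == 1);
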

+ 34 - 27
lib/DxilContainer/DxilContainerAssembler.cpp

@@ -558,7 +558,7 @@ public:
     }
     DXASSERT_NOMSG(m_PSVBuffer.size() == m_PSVBufferSize);
 
-    // Set DxilRuntimInfo
+    // Set DxilRuntimeInfo
     PSVRuntimeInfo0* pInfo = m_PSV.GetPSVRuntimeInfo0();
     PSVRuntimeInfo1* pInfo1 = m_PSV.GetPSVRuntimeInfo1();
     const ShaderModel* SM = m_Module.GetShaderModel();
@@ -645,34 +645,41 @@ public:
         }
         break;
       }
-    case ShaderModel::Kind::Compute:
-    case ShaderModel::Kind::Library:
-    case ShaderModel::Kind::Invalid:
-      // Compute, Library, and Invalid not relevant to PSVRuntimeInfo0
-      break;
-    case ShaderModel::Kind::Mesh: {
-      pInfo->MS.MaxOutputVertices = (UINT)m_Module.GetMaxOutputVertices();
-      pInfo->MS.MaxOutputPrimitives = (UINT)m_Module.GetMaxOutputPrimitives();
-      pInfo1->MS1.MeshOutputTopology = (UINT)m_Module.GetMeshOutputTopology();
-      Module *mod = m_Module.GetModule();
-      const DataLayout &DL = mod->getDataLayout();
-      unsigned totalByteSize = 0;
-      for (GlobalVariable &GV : mod->globals()) {
-        PointerType *gvPtrType = cast<PointerType>(GV.getType());
-        if (gvPtrType->getAddressSpace() == hlsl::DXIL::kTGSMAddrSpace) {
-          Type *gvType = gvPtrType->getPointerElementType();
-          unsigned byteSize = DL.getTypeAllocSize(gvType);
-          totalByteSize += byteSize;
+      case ShaderModel::Kind::Compute: {
+        UINT waveSize = (UINT)m_Module.GetWaveSize();
+        if (waveSize != 0) {
+          pInfo->MinimumExpectedWaveLaneCount = waveSize;
+          pInfo->MaximumExpectedWaveLaneCount = waveSize;
         }
+        break;
+      }
+      case ShaderModel::Kind::Library:
+      case ShaderModel::Kind::Invalid:
+        // Library and Invalid not relevant to PSVRuntimeInfo0
+        break;
+      case ShaderModel::Kind::Mesh: {
+        pInfo->MS.MaxOutputVertices = (UINT)m_Module.GetMaxOutputVertices();
+        pInfo->MS.MaxOutputPrimitives = (UINT)m_Module.GetMaxOutputPrimitives();
+        pInfo1->MS1.MeshOutputTopology = (UINT)m_Module.GetMeshOutputTopology();
+        Module *mod = m_Module.GetModule();
+        const DataLayout &DL = mod->getDataLayout();
+        unsigned totalByteSize = 0;
+        for (GlobalVariable &GV : mod->globals()) {
+          PointerType *gvPtrType = cast<PointerType>(GV.getType());
+          if (gvPtrType->getAddressSpace() == hlsl::DXIL::kTGSMAddrSpace) {
+            Type *gvType = gvPtrType->getPointerElementType();
+            unsigned byteSize = DL.getTypeAllocSize(gvType);
+            totalByteSize += byteSize;
+          }
+        }
+        pInfo->MS.GroupSharedBytesUsed = totalByteSize;
+        pInfo->MS.PayloadSizeInBytes = m_Module.GetPayloadSizeInBytes();
+        break;
+      }
+      case ShaderModel::Kind::Amplification: {
+        pInfo->AS.PayloadSizeInBytes = m_Module.GetPayloadSizeInBytes();
+        break;
       }
-      pInfo->MS.GroupSharedBytesUsed = totalByteSize;
-      pInfo->MS.PayloadSizeInBytes = m_Module.GetPayloadSizeInBytes();
-      break;
-    }
-    case ShaderModel::Kind::Amplification: {
-      pInfo->AS.PayloadSizeInBytes = m_Module.GetPayloadSizeInBytes();
-      break;
-    }
     }
 
     // Set resource binding information
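
With the new Compute case above, a [WaveSize(N)] entry point collapses both PSV expected-lane-count fields to the declared size; a small sketch of that invariant, using the same names as the surrounding method (m_Module and pInfo assumed in scope):

    UINT declared = (UINT)m_Module.GetWaveSize();  // 0 when no WaveSize attribute
    if (declared != 0) {
      assert(pInfo->MinimumExpectedWaveLaneCount == declared &&
             pInfo->MaximumExpectedWaveLaneCount == declared);
    }
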

+ 2 - 0
lib/HLSL/ComputeViewIdStateBuilder.cpp

@@ -688,6 +688,8 @@ void DxilViewIdStateBuilder::CollectReachingDeclsRec(Value *pValue, ValueSetType
     CollectReachingDeclsRec(cast<ConstantExpr>(pValue)->getOperand(0), ReachingDecls, Visited);
   } else if (AddrSpaceCastInst *pCI = dyn_cast<AddrSpaceCastInst>(pValue)) {
     CollectReachingDeclsRec(pCI->getOperand(0), ReachingDecls, Visited);
+  } else if (BitCastInst *pCI = dyn_cast<BitCastInst>(pValue)) {
+    CollectReachingDeclsRec(pCI->getOperand(0), ReachingDecls, Visited);
   } else if (dyn_cast<AllocaInst>(pValue)) {
     ReachingDecls.emplace(pValue);
   } else if (PHINode *phi = dyn_cast<PHINode>(pValue)) {

+ 154 - 54
lib/HLSL/DxilCondenseResources.cpp

@@ -16,6 +16,7 @@
 #include "dxc/Support/Global.h"
 #include "dxc/DXIL/DxilTypeSystem.h"
 #include "dxc/DXIL/DxilInstructions.h"
+#include "dxc/DXIL/DxilResourceBinding.h"
 #include "dxc/HLSL/DxilSpanAllocator.h"
 #include "dxc/HLSL/HLMatrixType.h"
 #include "dxc/DXIL/DxilUtil.h"
@@ -33,6 +34,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <memory>
@@ -656,6 +658,9 @@ public:
     FailOnPoisonResources();
 
     bool bChanged = false;
+    if (DM.GetShaderModel()->IsSM66Plus())
+      bChanged = PatchDynamicTBuffers(DM);
+
     unsigned numResources = DM.GetCBuffers().size() + DM.GetUAVs().size() +
                             DM.GetSRVs().size() + DM.GetSamplers().size();
 
@@ -665,7 +670,9 @@ public:
     // Switch tbuffers to SRVs, as they have been treated as cbuffers up to this
     // point.
     if (DM.GetCBuffers().size())
-      bChanged = PatchTBuffers(DM) || bChanged;
+      bChanged |= PatchTBuffers(DM);
+
+
 
     // Gather reserved resource registers while we still have
     // unused resources that might have explicit register assignments.
@@ -740,8 +747,9 @@ private:
   void GenerateDxilResourceHandles();
   void UpdateStructTypeForLegacyLayout();
   // Switch CBuffer for SRV for TBuffers.
+  bool PatchDynamicTBuffers(DxilModule &DM);
   bool PatchTBuffers(DxilModule &DM);
-  void PatchTBufferUse(Value *V, DxilModule &DM);
+  void PatchTBufferUse(Value *V, DxilModule &DM, DenseSet<Value *> &patchedSet);
   void UpdateCBufferUsage();
 };
 
@@ -2015,14 +2023,39 @@ void ReplaceResourceUserWithHandle(
     };
 
     // Search all users for update counter
-    for (User *U : handle->users()) {
-      if (IsDxilOp(U, hlsl::OP::OpCode::BufferUpdateCounter)) {
-        res.SetHasCounter(true);
+    bool updateAnnotateHandle = false;
+    if (!res.HasCounter()) {
+      for (User *U : handle->users()) {
+        if (IsDxilOp(U, hlsl::OP::OpCode::BufferUpdateCounter)) {
+          res.SetHasCounter(true);
+          break;
+        }
+        else if (IsDxilOp(U, hlsl::OP::OpCode::AnnotateHandle)) {
+          for (User *UU : U->users()) {
+            if (IsDxilOp(UU, hlsl::OP::OpCode::BufferUpdateCounter)) {
+              res.SetHasCounter(true);
+              updateAnnotateHandle = true;
+              break;
+            }
+          }
+          if (updateAnnotateHandle)
+            break;
+        }
       }
-      else if (IsDxilOp(U, hlsl::OP::OpCode::AnnotateHandle)) {
-        for (User *UU : U->users()) {
-          if (IsDxilOp(UU, hlsl::OP::OpCode::BufferUpdateCounter))
-            res.SetHasCounter(true);
+      if (updateAnnotateHandle) {
+        // Update resource props with counter flag
+        DxilResourceProperties RP =
+          resource_helper::loadPropsFromResourceBase(&res);
+        // Requires the ShaderModel to reconstruct the resource property constant.
+        const ShaderModel *pSM = load->getParent()->getParent()->getParent()
+                                    ->GetDxilModule().GetShaderModel();
+        for (User *U : handle->users()) {
+          DxilInst_AnnotateHandle annotateHandle(cast<Instruction>(U));
+          if (annotateHandle) {
+            annotateHandle.set_props(
+              resource_helper::getAsConstant(
+                RP, annotateHandle.get_props()->getType(), *pSM));
+          }
         }
       }
     }
@@ -2031,13 +2064,48 @@ void ReplaceResourceUserWithHandle(
   load->eraseFromParent();
 }
 
+Value *flattenGepIdx(GEPOperator *GEP) {
+  Value *idx = nullptr;
+  if (GEP->getNumIndices() == 2) {
+    // one dim array of resource
+    idx = (GEP->idx_begin() + 1)->get();
+  } else {
+    gep_type_iterator GEPIt = gep_type_begin(GEP), E = gep_type_end(GEP);
+    // Must be instruction for multi dim array.
+    std::unique_ptr<IRBuilder<>> Builder;
+    if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP)) {
+      Builder = llvm::make_unique<IRBuilder<>>(GEPInst);
+    } else {
+      Builder = llvm::make_unique<IRBuilder<>>(GEP->getContext());
+    }
+    for (; GEPIt != E; ++GEPIt) {
+      if (GEPIt->isArrayTy()) {
+        unsigned arraySize = GEPIt->getArrayNumElements();
+        Value *tmpIdx = GEPIt.getOperand();
+        if (idx == nullptr)
+          idx = tmpIdx;
+        else {
+          idx = Builder->CreateMul(idx, Builder->getInt32(arraySize));
+          idx = Builder->CreateAdd(idx, tmpIdx);
+        }
+      }
+    }
+  }
+  return idx;
+}
+
 } // namespace
 void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
     DxilResourceBase &res) {
   OP *hlslOP = m_DM->GetOP();
+  // Generate createHandleFromBinding for sm66 and later.
+  bool bCreateFromBinding = m_DM->GetShaderModel()->IsSM66Plus();
+  OP::OpCode createOp = bCreateFromBinding ? OP::OpCode::CreateHandleFromBinding
+                                           : OP::OpCode::CreateHandle;
   Function *createHandle = hlslOP->GetOpFunc(
-      OP::OpCode::CreateHandle, llvm::Type::getVoidTy(m_DM->GetCtx()));
-  Value *opArg = hlslOP->GetU32Const((unsigned)OP::OpCode::CreateHandle);
+      createOp, llvm::Type::getVoidTy(m_DM->GetCtx()));
+  Value *opArg = hlslOP->GetU32Const((unsigned)createOp);
+
   bool isViewResource = res.GetClass() == DXIL::ResourceClass::SRV ||
                         res.GetClass() == DXIL::ResourceClass::UAV;
   bool isROV = isViewResource && static_cast<DxilResource &>(res).IsROV();
@@ -2079,6 +2147,26 @@ void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
   Value *createHandleArgs[] = {opArg, resClassArg, resIDArg, resLowerBound,
                                isUniformRes};
 
+  DxilResourceBinding binding = resource_helper::loadBindingFromResourceBase(&res);
+  Value *bindingV = resource_helper::getAsConstant(
+      binding, hlslOP->GetResourceBindingType(), *m_DM->GetShaderModel());
+
+  Value *createHandleFromBindingArgs[] = {opArg, bindingV, resLowerBound, isUniformRes};
+
+  MutableArrayRef<Value *> Args(bCreateFromBinding ? createHandleFromBindingArgs
+                                                   : createHandleArgs,
+                                bCreateFromBinding ? 4 : 5);
+
+  const unsigned resIdxOpIdx = bCreateFromBinding
+                                   ? DxilInst_CreateHandleFromBinding::arg_index
+                                   : DxilInst_CreateHandle::arg_index;
+  const unsigned nonUniformOpIdx = bCreateFromBinding
+                                   ? DxilInst_CreateHandleFromBinding::arg_nonUniformIndex
+                                   : DxilInst_CreateHandle::arg_nonUniformIndex;
+
+
+
+
   for (iplist<Function>::iterator F : pM->getFunctionList()) {
     if (!F->isDeclaration()) {
       if (!isResArray) {
@@ -2088,7 +2176,7 @@ void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
           // Builder.SetCurrentDebugLocation(DL);
         }
         handleMapOnFunction[F] =
-            Builder.CreateCall(createHandle, createHandleArgs, handleName);
+            Builder.CreateCall(createHandle, Args, handleName);
       }
     }
   }
@@ -2105,36 +2193,11 @@ void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
       Value *handle = handleMapOnFunction[userF];
       ReplaceResourceUserWithHandle(static_cast<DxilResource &>(res), ldInst, handle);
     } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(user)) {
-      Value *idx = nullptr;
-      if (GEP->getNumIndices() == 2) {
-        // one dim array of resource
-        idx = (GEP->idx_begin() + 1)->get();
-      } else {
-        gep_type_iterator GEPIt = gep_type_begin(GEP), E = gep_type_end(GEP);
-        // Must be instruction for multi dim array.
-        std::unique_ptr<IRBuilder<> > Builder;
-        if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(GEP)) {
-          Builder = llvm::make_unique<IRBuilder<> >(GEPInst);
-        } else {
-          Builder = llvm::make_unique<IRBuilder<> >(GV->getContext());
-        }
-        for (; GEPIt != E; ++GEPIt) {
-          if (GEPIt->isArrayTy()) {
-            unsigned arraySize = GEPIt->getArrayNumElements();
-            Value * tmpIdx = GEPIt.getOperand();
-            if (idx == nullptr)
-              idx = tmpIdx;
-            else {
-              idx = Builder->CreateMul(idx, Builder->getInt32(arraySize));
-              idx = Builder->CreateAdd(idx, tmpIdx);
-            }
-          }
-        }
-      }
+      Value *idx = flattenGepIdx(GEP);
 
-      createHandleArgs[DXIL::OperandIndex::kCreateHandleResIndexOpIdx] = idx;
+      Args[resIdxOpIdx] = idx;
 
-      createHandleArgs[DXIL::OperandIndex::kCreateHandleIsUniformOpIdx] =
+      Args[nonUniformOpIdx] =
           isUniformRes;
 
       Value *handle = nullptr;
@@ -2142,14 +2205,13 @@ void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
         IRBuilder<> Builder = IRBuilder<>(GEPInst);
         if (DxilMDHelper::IsMarkedNonUniform(GEPInst)) {
           // Mark nonUniform.
-          createHandleArgs[DXIL::OperandIndex::kCreateHandleIsUniformOpIdx] =
+          Args[nonUniformOpIdx] =
               hlslOP->GetI1Const(1);
           // Clear nonUniform on GEP.
           GEPInst->setMetadata(DxilMDHelper::kDxilNonUniformAttributeMDName, nullptr);
         }
-        createHandleArgs[DXIL::OperandIndex::kCreateHandleResIndexOpIdx] =
-            Builder.CreateAdd(idx, resLowerBound);
-        handle = Builder.CreateCall(createHandle, createHandleArgs, handleName);
+        Args[resIdxOpIdx] = Builder.CreateAdd(idx, resLowerBound);
+        handle = Builder.CreateCall(createHandle, Args, handleName);
       }
 
       for (auto GEPU = GEP->user_begin(), GEPE = GEP->user_end();
@@ -2160,10 +2222,9 @@ void DxilLowerCreateHandleForLib::TranslateDxilResourceUses(
           ReplaceResourceUserWithHandle(static_cast<DxilResource &>(res), ldInst, handle);
         } else {
           IRBuilder<> Builder = IRBuilder<>(ldInst);
-          createHandleArgs[DXIL::OperandIndex::kCreateHandleResIndexOpIdx] =
-              Builder.CreateAdd(idx, resLowerBound);
+          Args[resIdxOpIdx] = Builder.CreateAdd(idx, resLowerBound);
           Value *localHandle =
-              Builder.CreateCall(createHandle, createHandleArgs, handleName);
+              Builder.CreateCall(createHandle, Args, handleName);
           ReplaceResourceUserWithHandle(static_cast<DxilResource &>(res), ldInst, localHandle);
         }
       }
@@ -2245,7 +2306,11 @@ void InitTBuffer(const DxilCBuffer *pSource, DxilResource *pDest) {
   pDest->SetHandle(pSource->GetHandle());
 }
 
-void PatchTBufferLoad(CallInst *handle, DxilModule &DM) {
+void PatchTBufferLoad(CallInst *handle, DxilModule &DM,
+                      DenseSet<Value *> &patchedSet) {
+  if (patchedSet.count(handle))
+    return;
+  patchedSet.insert(handle);
   hlsl::OP *hlslOP = DM.GetOP();
   llvm::LLVMContext &Ctx = DM.GetCtx();
   Type *doubleTy = Type::getDoubleTy(Ctx);
@@ -2342,8 +2407,9 @@ void PatchTBufferLoad(CallInst *handle, DxilModule &DM) {
       DXASSERT(false, "otherwise CBufferLoad used for tbuffer rather than "
                       "CBufferLoadLegacy");
     } else if (opcode == DXIL::OpCode::AnnotateHandle) {
-      DxilInst_AnnotateHandle annotateHandle(I);
-      PatchTBufferLoad(cast<CallInst>(annotateHandle.get_res()), DM);
+      PatchTBufferLoad(cast<CallInst>(I), DM,
+                       patchedSet);
+      continue;
     } else {
       DXASSERT(false, "otherwise unexpected user of CreateHandle value");
     }
@@ -2353,16 +2419,49 @@ void PatchTBufferLoad(CallInst *handle, DxilModule &DM) {
 
 } // namespace
 
-void DxilLowerCreateHandleForLib::PatchTBufferUse(Value *V, DxilModule &DM) {
+void DxilLowerCreateHandleForLib::PatchTBufferUse(
+    Value *V, DxilModule &DM, DenseSet<Value *> &patchedSet) {
   for (User *U : V->users()) {
     if (CallInst *CI = dyn_cast<CallInst>(U)) {
       // Patch dxil call.
       if (hlsl::OP::IsDxilOpFuncCallInst(CI))
-        PatchTBufferLoad(CI, DM);
+        PatchTBufferLoad(CI, DM, patchedSet);
     } else {
-      PatchTBufferUse(U, DM);
+      PatchTBufferUse(U, DM, patchedSet);
+    }
+  }
+}
+
+bool DxilLowerCreateHandleForLib::PatchDynamicTBuffers(DxilModule &DM) {
+  hlsl::OP *hlslOP = DM.GetOP();
+  Function *AnnotHandleFn = hlslOP->GetOpFunc(DXIL::OpCode::AnnotateHandle,
+                                              Type::getVoidTy(DM.GetCtx()));
+  if (AnnotHandleFn->user_empty()) {
+    AnnotHandleFn->eraseFromParent();
+    return false;
+  }
+  bool bUpdated = false;
+  for (User *U : AnnotHandleFn->users()) {
+    CallInst *CI = cast<CallInst>(U);
+    DxilInst_AnnotateHandle annot(CI);
+    DxilResourceProperties RP = resource_helper::loadPropsFromAnnotateHandle(
+        annot, hlslOP->GetResourcePropertiesType(), *DM.GetShaderModel());
+
+    if (RP.getResourceKind() != DXIL::ResourceKind::TBuffer)
+      continue;
+    // Skip handles from createHandleForLib; those are taken care of in PatchTBuffers.
+    if (CallInst *HdlCI = dyn_cast<CallInst>(annot.get_res())) {
+      if (hlslOP->IsDxilOpFuncCallInst(HdlCI)) {
+        if (hlslOP->GetDxilOpFuncCallInst(HdlCI) == DXIL::OpCode::CreateHandleForLib)
+          continue;
+      }
     }
+
+    DenseSet<Value *> patchedSet;
+    PatchTBufferLoad(CI, DM, patchedSet);
+    bUpdated = true;
   }
+  return bUpdated;
 }
 
 bool DxilLowerCreateHandleForLib::PatchTBuffers(DxilModule &DM) {
@@ -2380,7 +2479,8 @@ bool DxilLowerCreateHandleForLib::PatchTBuffers(DxilModule &DM) {
       GlobalVariable *GV = dyn_cast<GlobalVariable>(CB->GetGlobalSymbol());
       if (GV == nullptr)
         continue;
-      PatchTBufferUse(GV, DM);
+      DenseSet<Value*> patchedSet;
+      PatchTBufferUse(GV, DM, patchedSet);
       // Set global symbol for cbuffer to an unused value so it can be removed
       // in RemoveUnusedResourceSymbols.
       Type *Ty = GV->getType()->getElementType();

+ 1 - 2
lib/HLSL/DxilContainerReflection.cpp

@@ -1493,8 +1493,7 @@ static D3D_SHADER_INPUT_TYPE ResourceToShaderInputType(DxilResourceBase *RB) {
     return D3D_SIT_SAMPLER;
   case DxilResource::Kind::RawBuffer:
     return isUAV ? D3D_SIT_UAV_RWBYTEADDRESS : D3D_SIT_BYTEADDRESS;
-  case DxilResource::Kind::StructuredBuffer:
-  case DxilResource::Kind::StructuredBufferWithCounter: {
+  case DxilResource::Kind::StructuredBuffer: {
     if (!isUAV) return D3D_SIT_STRUCTURED;
     // TODO: D3D_SIT_UAV_CONSUME_STRUCTURED, D3D_SIT_UAV_APPEND_STRUCTURED?
     if (R->HasCounter()) return D3D_SIT_UAV_RWSTRUCTURED_WITH_COUNTER;

+ 1 - 5
lib/HLSL/DxilGenerationPass.cpp

@@ -368,10 +368,6 @@ void TranslateHLAnnotateHandle(
     CallInst *CI = cast<CallInst>(user);
     Value *handle =
         CI->getArgOperand(HLOperandIndex::kAnnotateHandleHandleOpIdx);
-    Value *RC =
-        CI->getArgOperand(HLOperandIndex::kAnnotateHandleResourceClassOpIdx);
-    Value *RK =
-        CI->getArgOperand(HLOperandIndex::kAnnotateHandleResourceKindOpIdx);
     Value *RP = CI->getArgOperand(
         HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx);
     Type *ResTy =
@@ -382,7 +378,7 @@ void TranslateHLAnnotateHandle(
     Function *annotateHandle =
         hlslOP.GetOpFunc(DXIL::OpCode::AnnotateHandle, Builder.getVoidTy());
     CallInst *newHandle =
-        Builder.CreateCall(annotateHandle, {opArg, handle, RC, RK, RP});
+        Builder.CreateCall(annotateHandle, {opArg, handle, RP});
     HandleToResTypeMap[newHandle] = ResTy;
     CI->replaceAllUsesWith(newHandle);
     CI->eraseFromParent();

+ 90 - 42
lib/HLSL/DxilValidation.cpp

@@ -218,6 +218,8 @@ const char *hlsl::GetValidationRuleText(ValidationRule value) {
     case hlsl::ValidationRule::SmThreadGroupChannelRange: return "Declared Thread Group %0 size %1 outside valid range [%2..%3].";
     case hlsl::ValidationRule::SmMaxTheadGroup: return "Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1.";
     case hlsl::ValidationRule::SmMaxTGSMSize: return "Total Thread Group Shared Memory storage is %0, exceeded %1.";
+    case hlsl::ValidationRule::SmWaveSizeValue: return "Declared WaveSize %0 outside valid range [%1..%2], or not a power of 2.";
+    case hlsl::ValidationRule::SmWaveSizeNeedsDxil16Plus: return "WaveSize is valid only for DXIL version 1.6 and higher.";
     case hlsl::ValidationRule::SmROVOnlyInPS: return "RasterizerOrdered objects are only allowed in 5.0+ pixel shaders.";
     case hlsl::ValidationRule::SmTessFactorForDomain: return "Required TessFactor for domain not found declared anywhere in Patch Constant data.";
     case hlsl::ValidationRule::SmTessFactorSizeMatchDomain: return "TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.";
@@ -458,13 +460,13 @@ struct ValidationContext {
   void PropagateResMap(Value *V, DxilResourceBase *Res) {
     auto it = ResPropMap.find(V);
     if (it != ResPropMap.end()) {
-      DxilResourceProperties RP = resource_helper::loadFromResourceBase(Res);
+      DxilResourceProperties RP = resource_helper::loadPropsFromResourceBase(Res);
       DxilResourceProperties itRP = it->second;
       if (itRP != RP) {
         EmitResourceError(Res, ValidationRule::InstrResourceMapToSingleEntry);
       }
     } else {
-      DxilResourceProperties RP = resource_helper::loadFromResourceBase(Res);
+      DxilResourceProperties RP = resource_helper::loadPropsFromResourceBase(Res);
       ResPropMap[V] = RP;
       for (User *U : V->users()) {
         if (GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
@@ -474,7 +476,7 @@ struct ValidationContext {
           DxilInst_CreateHandleForLib hdl(CI);
           if (hdl) {
             DxilResourceProperties RP =
-                resource_helper::loadFromResourceBase(Res);
+                resource_helper::loadPropsFromResourceBase(Res);
             ResPropMap[CI] = RP;
           }
         } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
@@ -587,7 +589,7 @@ struct ValidationContext {
             }
           }
           HandleResIndexMap[CI] = rangeId;
-          DxilResourceProperties RP = resource_helper::loadFromResourceBase(Res);
+          DxilResourceProperties RP = resource_helper::loadPropsFromResourceBase(Res);
           ResPropMap[CI] = RP;
         }
       }
@@ -603,21 +605,13 @@ struct ValidationContext {
       for (User *U : F->users()) {
         CallInst *CI = cast<CallInst>(U);
         DxilInst_AnnotateHandle hdl(CI);
-        // Validate Class/RangeID/Index.
-        Value *resClass = hdl.get_resourceClass();
-        if (!isa<ConstantInt>(resClass)) {
-          EmitInstrError(CI, ValidationRule::InstrOpConstRange);
-          continue;
-        }
-
-        Value *resKind = hdl.get_resourceKind();
-        if (!isa<ConstantInt>(resKind)) {
+        DxilResourceProperties RP =
+            resource_helper::loadPropsFromAnnotateHandle(hdl, ResPropTy, SM);
+        if (RP.getResourceKind() == DXIL::ResourceKind::Invalid) {
           EmitInstrError(CI, ValidationRule::InstrOpConstRange);
           continue;
         }
 
-        DxilResourceProperties RP =
-            resource_helper::loadFromAnnotateHandle(hdl, ResPropTy, SM);
         ResPropMap[CI] = RP;
       }
     }
@@ -865,7 +859,7 @@ static bool ValidateOpcodeInProfile(DXIL::OpCode opcode,
   // Instructions: Sample=60, SampleBias=61, SampleCmp=64, CalculateLOD=81,
   // DerivCoarseX=83, DerivCoarseY=84, DerivFineX=85, DerivFineY=86
   if ((60 <= op && op <= 61) || op == 64 || op == 81 || (83 <= op && op <= 86))
-    return (SK == DXIL::ShaderKind::Library || SK == DXIL::ShaderKind::Pixel);
+    return (SK == DXIL::ShaderKind::Library || SK == DXIL::ShaderKind::Pixel || SK == DXIL::ShaderKind::Compute || SK == DXIL::ShaderKind::Amplification || SK == DXIL::ShaderKind::Mesh);
   // Instructions: RenderTargetGetSamplePosition=76,
   // RenderTargetGetSampleCount=77, Discard=82, EvalSnapped=87,
   // EvalSampleIndex=88, EvalCentroid=89, SampleIndex=90, Coverage=91,
@@ -969,8 +963,9 @@ static bool ValidateOpcodeInProfile(DXIL::OpCode opcode,
   if ((168 <= op && op <= 172))
     return (major > 6 || (major == 6 && minor >= 5))
         && (SK == DXIL::ShaderKind::Mesh);
-  // Instructions: CreateHandleFromHeap=216, AnnotateHandle=217
-  if ((216 <= op && op <= 217))
+  // Instructions: AnnotateHandle=216, CreateHandleFromBinding=217,
+  // CreateHandleFromHeap=218, Unpack4x8=219, Pack4x8=220
+  if ((216 <= op && op <= 220))
     return (major > 6 || (major == 6 && minor >= 6));
   return true;
   // VALOPCODESM-TEXT:END
@@ -1047,12 +1042,11 @@ static DxilResourceProperties GetResourceFromHandle(Value *Handle,
     else
       ValCtx.EmitError(ValidationRule::InstrHandleNotFromCreateHandle);
     DxilResourceProperties RP;
-    RP.Class = DXIL::ResourceClass::Invalid;
     return RP;
   }
 
   DxilResourceProperties RP = ValCtx.GetResourceFromVal(Handle);
-  if (RP.Class == DXIL::ResourceClass::Invalid) {
+  if (RP.getResourceClass() == DXIL::ResourceClass::Invalid) {
     ValCtx.EmitInstrError(cast<CallInst>(Handle),
                           ValidationRule::InstrHandleNotFromCreateHandle);
   }
@@ -1064,13 +1058,13 @@ static DXIL::SamplerKind GetSamplerKind(Value *samplerHandle,
                                         ValidationContext &ValCtx) {
   DxilResourceProperties RP = GetResourceFromHandle(samplerHandle, ValCtx);
 
-  if (RP.Class != DXIL::ResourceClass::Sampler) {
+  if (RP.getResourceClass() != DXIL::ResourceClass::Sampler) {
     // must be sampler.
     return DXIL::SamplerKind::Invalid;
   }
-  if (RP.Kind == DXIL::ResourceKind::SamplerComparison)
+  if (RP.Basic.SamplerCmpOrHasCounter)
     return DXIL::SamplerKind::Comparison;
-  else if (RP.Kind == DXIL::ResourceKind::Invalid)
+  else if (RP.getResourceKind() == DXIL::ResourceKind::Invalid)
     return DXIL::SamplerKind::Invalid;
   else
     return DXIL::SamplerKind::Default;
@@ -1083,7 +1077,7 @@ static DXIL::ResourceKind GetResourceKindAndCompTy(Value *handle, DXIL::Componen
   // TODO: validate ROV is used only in PS.
 
   DxilResourceProperties RP = GetResourceFromHandle(handle, ValCtx);
-  ResClass = RP.Class;
+  ResClass = RP.getResourceClass();
 
   switch (ResClass) {
   case DXIL::ResourceClass::SRV:
@@ -1097,12 +1091,12 @@ static DXIL::ResourceKind GetResourceKindAndCompTy(Value *handle, DXIL::Componen
     // Emit invalid res class
     return DXIL::ResourceKind::Invalid;
   }
-  if (!DXIL::IsStructuredBuffer(RP.Kind))
-    CompTy = RP.Typed.CompType;
+  if (!DXIL::IsStructuredBuffer(RP.getResourceKind()))
+    CompTy = static_cast<DXIL::ComponentType>(RP.Typed.CompType);
   else
     CompTy = DXIL::ComponentType::Invalid;
 
-  return RP.Kind;
+  return RP.getResourceKind();
 }
 
 DxilFieldAnnotation *GetFieldAnnotation(Type *Ty,
@@ -1142,7 +1136,6 @@ DxilResourceProperties ValidationContext::GetResourceFromVal(Value *resVal) {
   }
   else {
     DxilResourceProperties RP;
-    RP.Class = DXIL::ResourceClass::Invalid;
     return RP;
   }
 }
@@ -1268,6 +1261,16 @@ static void ValidateResourceOffset(CallInst *CI, DXIL::ResourceKind resKind,
   }
 }
 
+// Validate derivative and derivative-dependent ops in CS/MS/AS.
+static void ValidateDerivativeOp(CallInst *CI, ValidationContext &ValCtx) {
+
+  const ShaderModel *pSM = ValCtx.DxilMod.GetShaderModel();
+  if (pSM && (pSM->IsMS() || pSM->IsAS() || pSM->IsCS()) && !pSM->IsSM66Plus())
+    ValCtx.EmitInstrFormatError(CI, ValidationRule::SmOpcodeInInvalidFunction,
+                                {"Derivatives in CS/MS/AS", "Shader Model 6.6+"});
+}
+
+
 static void ValidateSampleInst(CallInst *CI, Value *srvHandle, Value *samplerHandle,
                                ArrayRef<Value *> coords,
                                ArrayRef<Value *> offsets,
@@ -1399,13 +1402,13 @@ static unsigned StoreValueToMask(ArrayRef<Value *> vals) {
 static int GetCBufSize(Value *cbHandle, ValidationContext &ValCtx) {
   DxilResourceProperties RP = GetResourceFromHandle(cbHandle, ValCtx);
 
-  if (RP.Class != DXIL::ResourceClass::CBuffer) {
+  if (RP.getResourceClass() != DXIL::ResourceClass::CBuffer) {
     ValCtx.EmitInstrError(cast<CallInst>(cbHandle),
                           ValidationRule::InstrCBufferClassForCBufferHandle);
     return -1;
   }
 
-  return RP.SizeInBytes;
+  return RP.CBufferSizeInBytes;
 }
 
 static unsigned GetNumVertices(DXIL::InputPrimitive inputPrimitive) {
@@ -1769,6 +1772,7 @@ static void ValidateImmOperandForMathDxilOp(CallInst *CI, DXIL::OpCode opcode,
         ValCtx.EmitInstrError(CI, ValidationRule::InstrNoIndefiniteDsxy);
       }
     }
+    ValidateDerivativeOp(CI, ValCtx);
   } break;
   default:
     break;
@@ -1868,7 +1872,6 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
     case DXIL::ResourceKind::TextureCubeArray:
       break;
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
     case DXIL::ResourceKind::RawBuffer:
     case DXIL::ResourceKind::TypedBuffer:
     case DXIL::ResourceKind::TBuffer: {
@@ -1930,6 +1933,7 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
       break;
     }
 
+    ValidateDerivativeOp(CI, ValCtx);
   } break;
   case DXIL::OpCode::TextureGather: {
     DxilInst_TextureGather gather(CI);
@@ -1955,6 +1959,7 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
          sample.get_coord3()},
         {sample.get_offset0(), sample.get_offset1(), sample.get_offset2()},
         /*IsSampleC*/ false, ValCtx);
+    ValidateDerivativeOp(CI, ValCtx);
   } break;
   case DXIL::OpCode::SampleCmp: {
     DxilInst_SampleCmp sample(CI);
@@ -1964,6 +1969,7 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
          sample.get_coord3()},
         {sample.get_offset0(), sample.get_offset1(), sample.get_offset2()},
         /*IsSampleC*/ true, ValCtx);
+    ValidateDerivativeOp(CI, ValCtx);
   } break;
   case DXIL::OpCode::SampleCmpLevelZero: {
     // sampler must be comparison mode.
@@ -1995,6 +2001,7 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
          sample.get_coord3()},
         {sample.get_offset0(), sample.get_offset1(), sample.get_offset2()},
         /*IsSampleC*/ false, ValCtx);
+    ValidateDerivativeOp(CI, ValCtx);
   } break;
   case DXIL::OpCode::SampleGrad: {
     DxilInst_SampleGrad sample(CI);
@@ -2066,7 +2073,6 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
       }
       break;
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
       if (isa<UndefValue>(offset)) {
         ValCtx.EmitInstrError(CI,
                               ValidationRule::InstrCoordinateCountForStructBuf);
@@ -2135,7 +2141,6 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
       }
       break;
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
       if (isa<UndefValue>(offset)) {
         ValCtx.EmitInstrError(CI,
                               ValidationRule::InstrCoordinateCountForStructBuf);
@@ -2250,7 +2255,6 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
       }
       break;
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
       if (isa<UndefValue>(offset)) {
         ValCtx.EmitInstrError(CI,
                               ValidationRule::InstrCoordinateCountForStructBuf);
@@ -2305,7 +2309,6 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
       }
       break;
     case DXIL::ResourceKind::StructuredBuffer:
-    case DXIL::ResourceKind::StructuredBufferWithCounter:
       if (isa<UndefValue>(offset)) {
         ValCtx.EmitInstrError(CI,
                               ValidationRule::InstrCoordinateCountForStructBuf);
@@ -2321,11 +2324,11 @@ static void ValidateResourceDxilOp(CallInst *CI, DXIL::OpCode opcode,
     DxilInst_TraceRay traceRay(CI);
     Value *hdl = traceRay.get_AccelerationStructure();
     DxilResourceProperties RP = ValCtx.GetResourceFromVal(hdl);
-    if (RP.Class == DXIL::ResourceClass::Invalid) {
+    if (RP.getResourceClass() == DXIL::ResourceClass::Invalid) {
       ValCtx.EmitInstrError(CI, ValidationRule::InstrResourceKindForTraceRay);
       return;
     }
-    if (RP.Kind != DXIL::ResourceKind::RTAccelerationStructure) {
+    if (RP.getResourceKind() != DXIL::ResourceKind::RTAccelerationStructure) {
       ValCtx.EmitInstrError(CI, ValidationRule::InstrResourceKindForTraceRay);
     }
   } break;
@@ -2414,16 +2417,16 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
     Value *handle = updateCounter.get_uav();
     DxilResourceProperties RP = ValCtx.GetResourceFromVal(handle);
 
-    if (RP.Class != DXIL::ResourceClass::UAV) {
+    if (!RP.isUAV()) {
       ValCtx.EmitInstrError(CI,
                                ValidationRule::InstrBufferUpdateCounterOnUAV);
     }
 
-    if (!DXIL::IsStructuredBuffer(RP.Kind)) {
+    if (!DXIL::IsStructuredBuffer(RP.getResourceKind())) {
       ValCtx.EmitInstrError(CI, ValidationRule::SmCounterOnlyOnStructBuf);
     }
 
-    if (RP.Kind != DXIL::ResourceKind::StructuredBufferWithCounter) {
+    if (!RP.Basic.SamplerCmpOrHasCounter) {
       ValCtx.EmitInstrError(
           CI, ValidationRule::InstrBufferUpdateCounterOnResHasCounter);
     }
@@ -2495,6 +2498,13 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI,
                                   {"CreateHandleForLib", "Library"});
     }
     break;
+  case DXIL::OpCode::AtomicBinOp:
+  case DXIL::OpCode::AtomicCompareExchange: {
+    Type *pOverloadType = OP::GetOverloadType(opcode, CI->getCalledFunction());
+    if ((pOverloadType->isIntegerTy(64)) && !pSM->IsSM66Plus())
+      ValCtx.EmitInstrFormatError(CI, ValidationRule::SmOpcodeInInvalidFunction,
+                                  {"64-bit atomic operations", "Shader Model 6.6+"});
+  } break;
   default:
     // TODO: make sure every opcode is checked.
     // Skip opcodes don't need special check.
@@ -2512,7 +2522,19 @@ static bool IsDxilFunction(llvm::Function *F) {
   return OP::IsDxilOpFunc(F);
 }
 
+static bool IsLifetimeIntrinsic(llvm::Function *F) {
+  return (F->isIntrinsic() &&
+          (F->getIntrinsicID() == Intrinsic::lifetime_start ||
+           F->getIntrinsicID() == Intrinsic::lifetime_end));
+}
+
 static void ValidateExternalFunction(Function *F, ValidationContext &ValCtx) {
+  if (DXIL::CompareVersions(ValCtx.m_DxilMajor, ValCtx.m_DxilMinor, 1, 6) >= 0 &&
+      IsLifetimeIntrinsic(F)) {
+    // TODO: validate lifetime intrinsic users
+    return;
+  }
+
   if (!IsDxilFunction(F) && !ValCtx.isLibProfile) {
     ValCtx.EmitFnFormatError(F, ValidationRule::DeclDxilFnExtern, {F->getName()});
     return;
@@ -2639,7 +2661,9 @@ static bool IsDxilBuiltinStructType(StructType *ST, hlsl::OP *hlslOP) {
     return true;
   if (ST == hlslOP->GetBinaryWithTwoOutputsType())
     return true;
-  if (ST == hlslOP->GetInt4Type())
+  if (ST == hlslOP->GetFourI32Type())
+    return true;
+  if (ST == hlslOP->GetFourI16Type())
     return true;
   if (ST == hlslOP->GetDimensionsType())
     return true;
@@ -3461,6 +3485,15 @@ static void ValidateFunctionBody(Function *F, ValidationContext &ValCtx) {
           ValCtx.EmitInstrError(Cast, ValidationRule::InstrMinPrecisonBitCast);
         }
       } break;
+      case Instruction::AtomicCmpXchg:
+      case Instruction::AtomicRMW: {
+        Type *T = cast<PointerType>(I.getOperand(AtomicRMWInst::getPointerOperandIndex())->getType())->getElementType();
+        const ShaderModel *pSM = ValCtx.DxilMod.GetShaderModel();
+        if ((T->isIntegerTy(64)) && !pSM->IsSM66Plus())
+          ValCtx.EmitInstrFormatError(&I, ValidationRule::SmOpcodeInInvalidFunction,
+                                      {"64-bit atomic operations", "Shader Model 6.6+"});
+      } break;
+
       }
 
       if (PointerType *PT = dyn_cast<PointerType>(I.getType())) {
@@ -3926,7 +3959,6 @@ static void ValidateResource(hlsl::DxilResource &res,
   case DXIL::ResourceKind::TypedBuffer:
   case DXIL::ResourceKind::TBuffer:
   case DXIL::ResourceKind::StructuredBuffer:
-  case DXIL::ResourceKind::StructuredBufferWithCounter:
   case DXIL::ResourceKind::Texture1D:
   case DXIL::ResourceKind::Texture1DArray:
   case DXIL::ResourceKind::Texture2D:
@@ -4234,6 +4266,9 @@ static void ValidateSignatureElement(DxilSignatureElement &SE,
   switch (compKind) {
   case CompType::Kind::U64: compWidth = 64; compInt = true; break;
   case CompType::Kind::I64: compWidth = 64; compInt = true; break;
+  // These should be translated for signatures:
+  //case CompType::Kind::PackedS8x32:
+  //case CompType::Kind::PackedU8x32:
   case CompType::Kind::U32: compWidth = 32; compInt = true; break;
   case CompType::Kind::I32: compWidth = 32; compInt = true; break;
   case CompType::Kind::U16: compWidth = 16; compInt = true; break;
@@ -5118,6 +5153,19 @@ static void ValidateEntryProps(ValidationContext &ValCtx,
   const DxilFunctionProps &props = entryProps.props;
   DXIL::ShaderKind ShaderType = props.shaderKind;
 
+  // Validate wave size (currently allowed only on CS, but may be supported on other shader types in the future).
+  if (props.waveSize != 0) {
+    if (DXIL::CompareVersions(ValCtx.m_DxilMajor, ValCtx.m_DxilMinor, 1, 6) < 0) {
+      ValCtx.EmitFnFormatError(F, ValidationRule::SmWaveSizeNeedsDxil16Plus, {});
+    }
+    if (!DXIL::IsValidWaveSizeValue(props.waveSize)) {
+      ValCtx.EmitFnFormatError(F, ValidationRule::SmWaveSizeValue,
+        {std::to_string(props.waveSize),
+         std::to_string(DXIL::kMinWaveSize),
+         std::to_string(DXIL::kMaxWaveSize) });
+    }
+  }
+
   if (ShaderType == DXIL::ShaderKind::Compute) {
     const auto &CS = props.ShaderProps.CS;
     unsigned x = CS.numThreads[0];
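
For reference, a minimal sketch of the check DXIL::IsValidWaveSizeValue is assumed to perform here, based on the SmWaveSizeValue message above ("outside valid range [%1..%2], or not a power of 2"). The concrete bounds are illustrative assumptions, not values taken from this diff:

// Assumed semantics of DXIL::IsValidWaveSizeValue: a power of two within
// [kMinWaveSize..kMaxWaveSize]. The bounds used here (4 and 128) are
// illustrative placeholders only.
static bool IsValidWaveSizeValueSketch(unsigned waveSize,
                                       unsigned minSize = 4,
                                       unsigned maxSize = 128) {
  bool isPow2 = waveSize != 0 && (waveSize & (waveSize - 1)) == 0;
  return isPow2 && waveSize >= minSize && waveSize <= maxSize;
}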

+ 19 - 24
lib/HLSL/HLModule.cpp

@@ -698,8 +698,8 @@ void HLModule::LoadDxilSamplerFromMDNode(llvm::MDNode *MD, DxilSampler &S) {
 DxilResourceBase *
 HLModule::AddResourceWithGlobalVariableAndProps(llvm::Constant *GV,
                                                  DxilResourceProperties &RP) {
-  DxilResource::Class RC = RP.Class;
-
+  DxilResource::Class RC = RP.getResourceClass();
+  DxilResource::Kind RK = RP.getResourceKind();
   unsigned rangeSize = 1;
   Type *Ty = GV->getType()->getPointerElementType();
   if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
@@ -708,11 +708,11 @@ HLModule::AddResourceWithGlobalVariableAndProps(llvm::Constant *GV,
   switch (RC) {
   case DxilResource::Class::Sampler: {
     std::unique_ptr<DxilSampler> S = llvm::make_unique<DxilSampler>();
-    if (RP.Kind == DXIL::ResourceKind::SamplerComparison)
+    if (RP.Basic.SamplerCmpOrHasCounter)
       S->SetSamplerKind(DxilSampler::SamplerKind::Comparison);
     else
       S->SetSamplerKind(DxilSampler::SamplerKind::Default);
-    S->SetKind(RP.Kind);
+    S->SetKind(RK);
     S->SetGlobalSymbol(GV);
     S->SetGlobalName(GV->getName());
     S->SetRangeSize(rangeSize);
@@ -721,16 +721,13 @@ HLModule::AddResourceWithGlobalVariableAndProps(llvm::Constant *GV,
   } break;
   case DxilResource::Class::SRV: {
     std::unique_ptr<HLResource> Res = llvm::make_unique<HLResource>();
-    if (DXIL::IsTyped(RP.Kind)) {
+    if (DXIL::IsTyped(RP.getResourceKind())) {
       Res->SetCompType(RP.Typed.CompType);
-      if (RP.Kind == DXIL::ResourceKind::Texture2DMS ||
-          RP.Kind == DXIL::ResourceKind::Texture2DMSArray)
-        Res->SetSampleCount(RP.getSampleCount());
-    } else if (DXIL::IsStructuredBuffer(RP.Kind)) {
-      Res->SetElementStride(RP.ElementStride);
+    } else if (DXIL::IsStructuredBuffer(RK)) {
+      Res->SetElementStride(RP.StructStrideInBytes);
     }
     Res->SetRW(false);
-    Res->SetKind(RP.Kind);
+    Res->SetKind(RK);
     Res->SetGlobalSymbol(GV);
     Res->SetGlobalName(GV->getName());
     Res->SetRangeSize(rangeSize);
@@ -739,19 +736,17 @@ HLModule::AddResourceWithGlobalVariableAndProps(llvm::Constant *GV,
   } break;
   case DxilResource::Class::UAV: {
     std::unique_ptr<HLResource> Res = llvm::make_unique<HLResource>();
-    if (DXIL::IsTyped(RP.Kind)) {
+    if (DXIL::IsTyped(RK)) {
       Res->SetCompType(RP.Typed.CompType);
-      Res->SetSampleCount(RP.getSampleCount());
-    } else if (DXIL::IsStructuredBuffer(RP.Kind)) {
-      Res->SetElementStride(RP.ElementStride);
+    } else if (DXIL::IsStructuredBuffer(RK)) {
+      Res->SetElementStride(RP.StructStrideInBytes);
     }
 
     Res->SetRW(true);
-    Res->SetROV(RP.UAV.bROV);
-    Res->SetGloballyCoherent(RP.UAV.bGloballyCoherent);
-    if (RP.Kind == DXIL::ResourceKind::StructuredBufferWithCounter)
-      Res->SetHasCounter(true);
-    Res->SetKind(RP.Kind);
+    Res->SetROV(RP.Basic.IsROV);
+    Res->SetGloballyCoherent(RP.Basic.IsGloballyCoherent);
+    Res->SetHasCounter(RP.Basic.SamplerCmpOrHasCounter);
+    Res->SetKind(RK);
     Res->SetGlobalSymbol(GV);
     Res->SetGlobalName(GV->getName());
     Res->SetRangeSize(rangeSize);
@@ -791,10 +786,10 @@ DXIL::ResourceClass GetRCFromType(StructType *ST, Module &M) {
     if (Ty != ST)
       continue;
     CallInst *CI = cast<CallInst>(F.user_back());
-    return (DXIL::ResourceClass)cast<ConstantInt>(
-               CI->getArgOperand(
-                   HLOperandIndex::kAnnotateHandleResourceClassOpIdx))
-        ->getLimitedValue();
+    Constant *Props = cast<Constant>(CI->getArgOperand(
+        HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx));
+    DxilResourceProperties RP = resource_helper::loadPropsFromConstant(*Props);
+    return RP.getResourceClass();
   }
   return DXIL::ResourceClass::Invalid;
 }

+ 266 - 57
lib/HLSL/HLOperationLower.cpp

@@ -43,6 +43,7 @@ struct HLOperationLowerHelper {
   Type *voidTy;
   Type *f32Ty;
   Type *i32Ty;
+  Type *i16Ty;
   llvm::Type *i1Ty;
   Type *i8Ty;
   DxilTypeSystem &dxilTypeSys;
@@ -61,6 +62,7 @@ HLOperationLowerHelper::HLOperationLowerHelper(HLModule &HLM)
   voidTy = Type::getVoidTy(Ctx);
   f32Ty = Type::getFloatTy(Ctx);
   i32Ty = Type::getInt32Ty(Ctx);
+  i16Ty = Type::getInt16Ty(Ctx);
   i1Ty = Type::getInt1Ty(Ctx);
   i8Ty = Type::getInt8Ty(Ctx);
   Function *EntryFunc = HLM.GetEntryFunction();
@@ -114,12 +116,17 @@ public:
     Value *counterHandle =
         CIHandle->getArgOperand(HLOperandIndex::kAnnotateHandleHandleOpIdx);
     // Change kind into StructuredBufferWithCounter.
+    Constant *Props = cast<Constant>(CIHandle->getArgOperand(
+        HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx));
+    DxilResourceProperties RP = resource_helper::loadPropsFromConstant(*Props);
+    RP.Basic.SamplerCmpOrHasCounter = true;
 
     CIHandle->setArgOperand(
-        HLOperandIndex::kAnnotateHandleResourceKindOpIdx,
-        ConstantInt::get(
-            i8Ty,
-            (unsigned)DXIL::ResourceKind::StructuredBufferWithCounter));
+        HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx,
+        resource_helper::getAsConstant(
+            RP,
+            HLM.GetOP()->GetResourcePropertiesType(),
+            *HLM.GetShaderModel()));
 
     DXIL::ResourceClass RC = GetRC(handle);
     DXASSERT_LOCALVAR(RC, RC == DXIL::ResourceClass::UAV,
@@ -167,20 +174,10 @@ public:
   }
 
   DxilResourceProperties GetResPropsFromAnnotateHandle(CallInst *Anno) {
-    DXIL::ResourceClass RC =
-        (DXIL::ResourceClass)cast<ConstantInt>(
-            Anno->getArgOperand(
-                HLOperandIndex::kAnnotateHandleResourceClassOpIdx))
-            ->getLimitedValue();
-    DXIL::ResourceKind RK =
-        (DXIL::ResourceKind)cast<ConstantInt>(
-            Anno->getArgOperand(
-                HLOperandIndex::kAnnotateHandleResourceKindOpIdx))
-            ->getLimitedValue();
     Constant *Props = cast<Constant>(Anno->getArgOperand(
         HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx));
-    DxilResourceProperties RP = resource_helper::loadFromConstant(
-        *Props, RC, RK);
+    DxilResourceProperties RP = resource_helper::loadPropsFromConstant(
+        *Props);
     return RP;
   }
 
@@ -197,16 +194,15 @@ private:
       hlsl::HLOpcodeGroup group =
           hlsl::GetHLOpcodeGroupByName(CI->getCalledFunction());
       if (group == HLOpcodeGroup::HLAnnotateHandle) {
-        ConstantInt *RC = cast<ConstantInt>(CI->getArgOperand(
-            HLOperandIndex::kAnnotateHandleResourceClassOpIdx));
-        ConstantInt *RK = cast<ConstantInt>(CI->getArgOperand(
-            HLOperandIndex::kAnnotateHandleResourceKindOpIdx));
+        Constant *Props = cast<Constant>(CI->getArgOperand(
+            HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx));
+        DxilResourceProperties RP =
+            resource_helper::loadPropsFromConstant(*Props);
         Type *ResTy =
             CI->getArgOperand(HLOperandIndex::kAnnotateHandleResourceTypeOpIdx)
                 ->getType();
 
-        ResAttribute Attrib = {(DXIL::ResourceClass)RC->getLimitedValue(),
-                               (DXIL::ResourceKind)RK->getLimitedValue(),
+        ResAttribute Attrib = {RP.getResourceClass(), RP.getResourceKind(),
                                ResTy};
 
         HandleMetaMap[Handle] = Attrib;
@@ -326,7 +322,7 @@ private:
 
     Type *Ty = CbPtr->getResultElementType();
     // Not support resource array in cbuffer.
-    unsigned ResBinding = HLM.GetBindingForResourceInCB(CbPtr, CbGV, RP.Class);
+    unsigned ResBinding = HLM.GetBindingForResourceInCB(CbPtr, CbGV, RP.getResourceClass());
     return CreateResourceGV(Ty, Name, RP, ResBinding);
   }
 
@@ -3429,7 +3425,6 @@ ResLoadHelper::ResLoadHelper(CallInst *CI, DxilResource::Kind RK,
   switch (RK) {
   case DxilResource::Kind::RawBuffer:
   case DxilResource::Kind::StructuredBuffer:
-  case DxilResource::Kind::StructuredBufferWithCounter:
     opcode = OP::OpCode::RawBufferLoad;
     break;
   case DxilResource::Kind::TypedBuffer:
@@ -3805,7 +3800,6 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   switch (RK) {
   case DxilResource::Kind::RawBuffer:
   case DxilResource::Kind::StructuredBuffer:
-  case DxilResource::Kind::StructuredBufferWithCounter:
     opcode = OP::OpCode::RawBufferStore;
     break;
   case DxilResource::Kind::TypedBuffer:
@@ -3980,9 +3974,9 @@ Value *TranslateResourceStore(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
 namespace {
 // Atomic intrinsics.
 struct AtomicHelper {
-  AtomicHelper(CallInst *CI, OP::OpCode op, Value *h);
+  AtomicHelper(CallInst *CI, OP::OpCode op, Value *h, Type *opType=nullptr);
   AtomicHelper(CallInst *CI, OP::OpCode op, Value *h, Value *bufIdx,
-               Value *baseOffset);
+               Value *baseOffset, Type *opType=nullptr);
   OP::OpCode opcode;
   Value *handle;
   Value *addr;
@@ -3990,11 +3984,13 @@ struct AtomicHelper {
   Value *value;
   Value *originalValue;
   Value *compareValue;
+  Type *operationType;
 };
 
 // For MOP version of Interlocked*.
-AtomicHelper::AtomicHelper(CallInst *CI, OP::OpCode op, Value *h)
-    : opcode(op), handle(h), offset(nullptr), originalValue(nullptr) {
+AtomicHelper::AtomicHelper(CallInst *CI, OP::OpCode op, Value *h, Type *opType)
+    : opcode(op), handle(h), offset(nullptr), originalValue(nullptr),
+      operationType(opType) {
   addr = CI->getArgOperand(HLOperandIndex::kObjectInterlockedDestOpIndex);
   if (op == OP::OpCode::AtomicCompareExchange) {
     compareValue = CI->getArgOperand(
@@ -4012,12 +4008,15 @@ AtomicHelper::AtomicHelper(CallInst *CI, OP::OpCode op, Value *h)
       originalValue = CI->getArgOperand(
           HLOperandIndex::kObjectInterlockedOriginalValueOpIndex);
   }
+  if (nullptr == operationType)
+    operationType = value->getType();
 }
 // For IOP version of Interlocked*.
 AtomicHelper::AtomicHelper(CallInst *CI, OP::OpCode op, Value *h, Value *bufIdx,
-                           Value *baseOffset)
+                           Value *baseOffset, Type *opType)
     : opcode(op), handle(h), addr(bufIdx),
-      offset(baseOffset), originalValue(nullptr) {
+      offset(baseOffset), originalValue(nullptr),
+      operationType(opType) {
   if (op == OP::OpCode::AtomicCompareExchange) {
     compareValue =
         CI->getArgOperand(HLOperandIndex::kInterlockedCmpCompareValueOpIndex);
@@ -4033,6 +4032,8 @@ AtomicHelper::AtomicHelper(CallInst *CI, OP::OpCode op, Value *h, Value *bufIdx,
       originalValue =
           CI->getArgOperand(HLOperandIndex::kInterlockedOriginalValueOpIndex);
   }
+  if (nullptr == operationType)
+    operationType = value->getType();
 }
 
 void TranslateAtomicBinaryOperation(AtomicHelper &helper,
@@ -4041,13 +4042,18 @@ void TranslateAtomicBinaryOperation(AtomicHelper &helper,
   Value *handle = helper.handle;
   Value *addr = helper.addr;
   Value *val = helper.value;
-  Type *Ty = val->getType();
+  Type *Ty = helper.operationType;
+  Type *valTy = val->getType();
 
   Value *undefI = UndefValue::get(Type::getInt32Ty(Ty->getContext()));
 
   Function *dxilAtomic = hlslOP->GetOpFunc(helper.opcode, Ty->getScalarType());
   Value *opArg = hlslOP->GetU32Const(static_cast<unsigned>(helper.opcode));
   Value *atomicOpArg = hlslOP->GetU32Const(static_cast<unsigned>(atomicOp));
+
+  if (Ty != valTy)
+    val = Builder.CreateBitCast(val, Ty);
+
   Value *args[] = {opArg,  handle, atomicOpArg,
                    undefI, undefI, undefI, // coordinates
                    val};
@@ -4071,6 +4077,8 @@ void TranslateAtomicBinaryOperation(AtomicHelper &helper,
   Value *origVal =
       Builder.CreateCall(dxilAtomic, args, hlslOP->GetAtomicOpName(atomicOp));
   if (helper.originalValue) {
+    if (Ty != valTy)
+      origVal = Builder.CreateBitCast(origVal, valTy);
     Builder.CreateStore(origVal, helper.originalValue);
   }
 }
@@ -4084,27 +4092,37 @@ Value *TranslateMopAtomicBinaryOperation(CallInst *CI, IntrinsicOp IOP,
   IRBuilder<> Builder(CI);
 
   switch (IOP) {
-  case IntrinsicOp::MOP_InterlockedAdd: {
+  case IntrinsicOp::MOP_InterlockedAdd:
+  case IntrinsicOp::MOP_InterlockedAdd64: {
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::Add, Builder,
                                    hlslOP);
   } break;
-  case IntrinsicOp::MOP_InterlockedAnd: {
+  case IntrinsicOp::MOP_InterlockedAnd:
+  case IntrinsicOp::MOP_InterlockedAnd64: {
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::And, Builder,
                                    hlslOP);
   } break;
-  case IntrinsicOp::MOP_InterlockedExchange: {
+  case IntrinsicOp::MOP_InterlockedExchange:
+  case IntrinsicOp::MOP_InterlockedExchange64: {
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::Exchange,
                                    Builder, hlslOP);
   } break;
-  case IntrinsicOp::MOP_InterlockedMax: {
+  case IntrinsicOp::MOP_InterlockedExchangeFloat: {
+    AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle, Type::getInt32Ty(CI->getContext()));
+    TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::Exchange,
+                                   Builder, hlslOP);
+  } break;
+  case IntrinsicOp::MOP_InterlockedMax:
+  case IntrinsicOp::MOP_InterlockedMax64: {
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::IMax, Builder,
                                    hlslOP);
   } break;
-  case IntrinsicOp::MOP_InterlockedMin: {
+  case IntrinsicOp::MOP_InterlockedMin:
+  case IntrinsicOp::MOP_InterlockedMin64: {
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::IMin, Builder,
                                    hlslOP);
@@ -4119,14 +4137,16 @@ Value *TranslateMopAtomicBinaryOperation(CallInst *CI, IntrinsicOp IOP,
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::UMin, Builder,
                                    hlslOP);
   } break;
-  case IntrinsicOp::MOP_InterlockedOr: {
+  case IntrinsicOp::MOP_InterlockedOr:
+  case IntrinsicOp::MOP_InterlockedOr64: {
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::Or, Builder,
                                    hlslOP);
   } break;
-  case IntrinsicOp::MOP_InterlockedXor: {
-  default:
-    DXASSERT(IOP == IntrinsicOp::MOP_InterlockedXor,
+  case IntrinsicOp::MOP_InterlockedXor:
+  case IntrinsicOp::MOP_InterlockedXor64:
+  default: {
+    DXASSERT(IOP == IntrinsicOp::MOP_InterlockedXor || IOP == IntrinsicOp::MOP_InterlockedXor64,
              "invalid MOP atomic intrinsic");
     AtomicHelper helper(CI, DXIL::OpCode::AtomicBinOp, handle);
     TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::Xor, Builder,
@@ -4143,12 +4163,20 @@ void TranslateAtomicCmpXChg(AtomicHelper &helper, IRBuilder<> &Builder,
   Value *val = helper.value;
   Value *cmpVal = helper.compareValue;
 
-  Type *Ty = val->getType();
+  Type *Ty = helper.operationType;
+  Type *valTy = val->getType();
 
   Value *undefI = UndefValue::get(Type::getInt32Ty(Ty->getContext()));
 
   Function *dxilAtomic = hlslOP->GetOpFunc(helper.opcode, Ty->getScalarType());
   Value *opArg = hlslOP->GetU32Const(static_cast<unsigned>(helper.opcode));
+
+  if (Ty != valTy) {
+    val = Builder.CreateBitCast(val, Ty);
+    if (cmpVal)
+      cmpVal = Builder.CreateBitCast(cmpVal, Ty);
+  }
+
   Value *args[] = {opArg,  handle, undefI, undefI, undefI, // coordinates
                    cmpVal, val};
 
@@ -4170,6 +4198,8 @@ void TranslateAtomicCmpXChg(AtomicHelper &helper, IRBuilder<> &Builder,
 
   Value *origVal = Builder.CreateCall(dxilAtomic, args);
   if (helper.originalValue) {
+    if (Ty != valTy)
+      origVal = Builder.CreateBitCast(origVal, valTy);
     Builder.CreateStore(origVal, helper.originalValue);
   }
 }
@@ -4181,13 +4211,22 @@ Value *TranslateMopAtomicCmpXChg(CallInst *CI, IntrinsicOp IOP,
 
   Value *handle = CI->getArgOperand(HLOperandIndex::kHandleOpIdx);
   IRBuilder<> Builder(CI);
-  AtomicHelper atomicHelper(CI, OP::OpCode::AtomicCompareExchange, handle);
+  Type *opType = nullptr;
+  if (IOP == IntrinsicOp::MOP_InterlockedCompareStoreFloatBitwise ||
+      IOP == IntrinsicOp::MOP_InterlockedCompareExchangeFloatBitwise)
+    opType = Type::getInt32Ty(CI->getContext());
+  AtomicHelper atomicHelper(CI, OP::OpCode::AtomicCompareExchange, handle, opType);
   TranslateAtomicCmpXChg(atomicHelper, Builder, hlslOP);
   return nullptr;
 }
 
 void TranslateSharedMemAtomicBinOp(CallInst *CI, IntrinsicOp IOP, Value *addr) {
   AtomicRMWInst::BinOp Op;
+  IRBuilder<> Builder(CI);
+  Value *val = CI->getArgOperand(HLOperandIndex::kInterlockedValueOpIndex);
+  PointerType *ptrType = dyn_cast<PointerType>(
+               CI->getArgOperand(HLOperandIndex::kInterlockedDestOpIndex)->getType());
+  bool needCast = ptrType && ptrType->getElementType()->isFloatTy();
   switch (IOP) {
   case IntrinsicOp::IOP_InterlockedAdd:
     Op = AtomicRMWInst::BinOp::Add;
@@ -4196,6 +4235,10 @@ void TranslateSharedMemAtomicBinOp(CallInst *CI, IntrinsicOp IOP, Value *addr) {
     Op = AtomicRMWInst::BinOp::And;
     break;
   case IntrinsicOp::IOP_InterlockedExchange:
+    if (needCast) {
+      val = Builder.CreateBitCast(val, Type::getInt32Ty(CI->getContext()));
+      addr = Builder.CreateBitCast(addr, Type::getInt32PtrTy(CI->getContext(), DXIL::kTGSMAddrSpace));
+    }
     Op = AtomicRMWInst::BinOp::Xchg;
     break;
   case IntrinsicOp::IOP_InterlockedMax:
@@ -4220,16 +4263,16 @@ void TranslateSharedMemAtomicBinOp(CallInst *CI, IntrinsicOp IOP, Value *addr) {
     break;
   }
 
-  Value *val = CI->getArgOperand(HLOperandIndex::kInterlockedValueOpIndex);
-
-  IRBuilder<> Builder(CI);
   Value *Result = Builder.CreateAtomicRMW(
       Op, addr, val, AtomicOrdering::SequentiallyConsistent);
   if (CI->getNumArgOperands() >
-      HLOperandIndex::kInterlockedOriginalValueOpIndex)
+      HLOperandIndex::kInterlockedOriginalValueOpIndex) {
+    if (needCast)
+      Result = Builder.CreateBitCast(Result, Type::getFloatTy(CI->getContext()));
     Builder.CreateStore(
         Result,
         CI->getArgOperand(HLOperandIndex::kInterlockedOriginalValueOpIndex));
+  }
 }
 
 static Value* SkipAddrSpaceCast(Value* Ptr) {
@@ -4266,6 +4309,17 @@ void TranslateSharedMemAtomicCmpXChg(CallInst *CI, Value *addr) {
   Value *cmpVal =
       CI->getArgOperand(HLOperandIndex::kInterlockedCmpCompareValueOpIndex);
   IRBuilder<> Builder(CI);
+
+  PointerType *ptrType = dyn_cast<PointerType>(
+               CI->getArgOperand(HLOperandIndex::kInterlockedDestOpIndex)->getType());
+  bool needCast = false;
+  if (ptrType && ptrType->getElementType()->isFloatTy()) {
+    needCast = true;
+    val = Builder.CreateBitCast(val, Type::getInt32Ty(CI->getContext()));
+    cmpVal = Builder.CreateBitCast(cmpVal, Type::getInt32Ty(CI->getContext()));
+    addr = Builder.CreateBitCast(addr, Type::getInt32PtrTy(CI->getContext(), DXIL::kTGSMAddrSpace));
+  }
+
   Value *Result = Builder.CreateAtomicCmpXchg(
       addr, cmpVal, val, AtomicOrdering::SequentiallyConsistent,
       AtomicOrdering::SequentiallyConsistent);
@@ -4273,6 +4327,8 @@ void TranslateSharedMemAtomicCmpXChg(CallInst *CI, Value *addr) {
   if (CI->getNumArgOperands() >
       HLOperandIndex::kInterlockedCmpOriginalValueOpIndex) {
     Value *originVal = Builder.CreateExtractValue(Result, 0);
+    if (needCast)
+      originVal = Builder.CreateBitCast(originVal, Type::getFloatTy(CI->getContext()));
     Builder.CreateStore(
         originVal,
         CI->getArgOperand(HLOperandIndex::kInterlockedCmpOriginalValueOpIndex));
@@ -5083,13 +5139,13 @@ Value *TranslateDot4AddPacked(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   Value *src0 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx);
   DXASSERT(
       !src0->getType()->isVectorTy() && src0->getType()->isIntegerTy(32),
-      "otherwise, unexpected vector support in high level intrinsic tempalte");
+      "otherwise, unexpected vector support in high level intrinsic template");
   Value *src1 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx);
   DXASSERT(src0->getType() == src1->getType(), "otherwise, mismatched argument types");
   Value *accArg = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx);
   Type *accTy = accArg->getType();
   DXASSERT(!accTy->isVectorTy() && accTy->isIntegerTy(32),
-    "otherwise, unexpected vector support in high level intrinsic tempalte");
+    "otherwise, unexpected vector support in high level intrinsic template");
   IRBuilder<> Builder(CI);
 
   Function *dxilFunc = hlslOP->GetOpFunc(opcode, accTy);
@@ -5097,6 +5153,99 @@ Value *TranslateDot4AddPacked(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   return Builder.CreateCall(dxilFunc, { opArg, accArg, src0, src1 });
 }
 
+Value *TranslatePack(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                     HLOperationLowerHelper &helper, 
+                     HLObjectOperationLowerHelper *pObjHelper,
+                     bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+
+  Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
+  Type *valTy = val->getType();
+  Type *eltTy = valTy->getScalarType();
+
+  DXASSERT(valTy->isVectorTy() && valTy->getVectorNumElements() == 4 && eltTy->isIntegerTy() &&
+    (eltTy->getIntegerBitWidth() == 32 || eltTy->getIntegerBitWidth() == 16),
+    "otherwise, unexpected input dimension or component type");
+
+  DXIL::PackMode packMode = DXIL::PackMode::Trunc;
+  switch (IOP) {
+    case hlsl::IntrinsicOp::IOP_pack_clamp_s8: 
+      packMode = DXIL::PackMode::SClamp;
+      break;
+    case hlsl::IntrinsicOp::IOP_pack_clamp_u8:
+      packMode = DXIL::PackMode::UClamp;
+      break;
+    case hlsl::IntrinsicOp::IOP_pack_s8:
+    case hlsl::IntrinsicOp::IOP_pack_u8:
+      packMode = DXIL::PackMode::Trunc;
+      break;
+    default:
+      DXASSERT(false, "unexpected opcode");
+      break;
+  }
+
+  IRBuilder<> Builder(CI);
+  Function *dxilFunc = hlslOP->GetOpFunc(opcode, eltTy);
+  Constant *opArg = hlslOP->GetU32Const((unsigned)opcode);
+  Constant *packModeArg = hlslOP->GetU8Const((unsigned)packMode);
+
+  Value *elt0 = Builder.CreateExtractElement(val, (uint64_t)0);
+  Value *elt1 = Builder.CreateExtractElement(val, (uint64_t)1);
+  Value *elt2 = Builder.CreateExtractElement(val, (uint64_t)2);
+  Value *elt3 = Builder.CreateExtractElement(val, (uint64_t)3);
+  return Builder.CreateCall(dxilFunc, { opArg, packModeArg, elt0, elt1, elt2, elt3 });
+}
+
+Value *TranslateUnpack(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
+                     HLOperationLowerHelper &helper, 
+                     HLObjectOperationLowerHelper *pObjHelper,
+                     bool &Translated) {
+  hlsl::OP *hlslOP = &helper.hlslOP;
+
+  Value *packedVal = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
+  DXASSERT(!packedVal->getType()->isVectorTy() && packedVal->getType()->isIntegerTy(32),
+    "otherwise, unexpected vector support in high level intrinsic template");
+
+  Type *overloadType = nullptr;
+  DXIL::UnpackMode unpackMode = DXIL::UnpackMode::Unsigned;
+  switch (IOP) {
+    case hlsl::IntrinsicOp::IOP_unpack_s8s32:
+      unpackMode = DXIL::UnpackMode::Signed;
+      overloadType = helper.i32Ty;
+      break;
+    case hlsl::IntrinsicOp::IOP_unpack_u8u32:
+      unpackMode = DXIL::UnpackMode::Unsigned;
+      overloadType = helper.i32Ty;
+      break;
+    case hlsl::IntrinsicOp::IOP_unpack_s8s16:
+      unpackMode = DXIL::UnpackMode::Signed;
+      overloadType = helper.i16Ty;
+      break;
+    case hlsl::IntrinsicOp::IOP_unpack_u8u16:
+      unpackMode = DXIL::UnpackMode::Unsigned;
+      overloadType = helper.i16Ty;
+      break;
+    default:
+      DXASSERT(false, "unexpected opcode");
+      break;
+  }
+  
+  IRBuilder<> Builder(CI);
+  Function *dxilFunc = hlslOP->GetOpFunc(opcode, overloadType);
+  Constant *opArg = hlslOP->GetU32Const((unsigned)opcode);
+  Constant *unpackModeArg = hlslOP->GetU8Const((unsigned)unpackMode);
+  Value *Res = Builder.CreateCall(dxilFunc, { opArg, unpackModeArg , packedVal });
+
+  // Convert the final aggregate into a vector to make the types match
+  const unsigned vecSize = 4;
+  Value *ResVec = UndefValue::get(CI->getType());
+  for (unsigned i = 0; i < vecSize; ++i) {
+    Value *Elt = Builder.CreateExtractValue(Res, i);
+    ResVec = Builder.CreateInsertElement(ResVec, Elt, i);
+  }
+  return ResVec;
+}
+
 } // namespace
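
To make the three pack modes referenced in TranslatePack concrete, here is a per-lane sketch of their assumed numeric behavior (uses <cstdint>; the clamp bounds follow from the mode names and the 8-bit packing this feature introduces, and are assumptions rather than values taken from this diff):

// Assumed per-lane behavior of DXIL::PackMode when packing a 32/16-bit
// integer into one byte of the packed result (sketch only).
static uint8_t PackLaneSketch(int32_t v, DXIL::PackMode mode) {
  switch (mode) {
  case DXIL::PackMode::SClamp: // pack_clamp_s8: saturate to the signed 8-bit range
    if (v < -128) v = -128;
    if (v > 127)  v = 127;
    return (uint8_t)v;
  case DXIL::PackMode::UClamp: // pack_clamp_u8: saturate to the unsigned 8-bit range
    if (v < 0)    v = 0;
    if (v > 255)  v = 255;
    return (uint8_t)v;
  case DXIL::PackMode::Trunc:  // pack_s8 / pack_u8: keep the low 8 bits
  default:
    return (uint8_t)(v & 0xFF);
  }
}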
 
 // Resource Handle.
@@ -5111,7 +5260,8 @@ Value *TranslateGetHandleFromHeap(CallInst *CI, IntrinsicOp IOP,
   IRBuilder<> Builder(CI);
   Value *opArg = ConstantInt::get(helper.i32Ty, (unsigned)opcode);
   return Builder.CreateCall(
-      dxilFunc, {opArg, CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx),
+      dxilFunc, {opArg, CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx),
+                 CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx),
                  // TODO: update nonUniformIndex later.
                  Builder.getInt1(false)});
 }
@@ -5182,7 +5332,9 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP_InterlockedAdd, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedAnd, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedCompareExchange, TranslateIopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_InterlockedCompareExchangeFloatBitwise, TranslateIopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedCompareStore, TranslateIopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_InterlockedCompareStoreFloatBitwise, TranslateIopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedExchange, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedMax, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_InterlockedMin, TranslateIopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
@@ -5313,6 +5465,10 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP_msad4, TranslateMSad4, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_mul, TranslateMul, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_normalize, TranslateNormalize, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::IOP_pack_clamp_s8, TranslatePack, DXIL::OpCode::Pack4x8 },
+    {IntrinsicOp::IOP_pack_clamp_u8, TranslatePack, DXIL::OpCode::Pack4x8 },
+    {IntrinsicOp::IOP_pack_s8, TranslatePack, DXIL::OpCode::Pack4x8 },
+    {IntrinsicOp::IOP_pack_u8, TranslatePack, DXIL::OpCode::Pack4x8 },
     {IntrinsicOp::IOP_pow, TranslatePow, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_printf, TranslatePrintf, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_radians, TranslateRadians, DXIL::OpCode::NumOpCodes},
@@ -5355,10 +5511,13 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP_texCUBEproj, EmptyLower, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_transpose, EmptyLower, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_trunc, TrivialUnaryOperation, DXIL::OpCode::Round_z},
-#ifdef ENABLE_SPIRV_CODEGEN
+    #ifdef ENABLE_SPIRV_CODEGEN
     {IntrinsicOp::IOP_VkReadClock, UnsupportedVulkanIntrinsic, DXIL::OpCode::NumOpCodes},
 #endif // ENABLE_SPIRV_CODEGEN
-
+    {IntrinsicOp::IOP_unpack_s8s16, TranslateUnpack, DXIL::OpCode::Unpack4x8},
+    {IntrinsicOp::IOP_unpack_s8s32, TranslateUnpack, DXIL::OpCode::Unpack4x8},
+    {IntrinsicOp::IOP_unpack_u8u16, TranslateUnpack, DXIL::OpCode::Unpack4x8},
+    {IntrinsicOp::IOP_unpack_u8u32, TranslateUnpack, DXIL::OpCode::Unpack4x8},
     {IntrinsicOp::MOP_Append, StreamOutputLower, DXIL::OpCode::EmitStream},
     {IntrinsicOp::MOP_RestartStrip, StreamOutputLower, DXIL::OpCode::CutStream},
     {IntrinsicOp::MOP_CalculateLevelOfDetail, TranslateCalculateLOD, DXIL::OpCode::NumOpCodes},
@@ -5386,14 +5545,26 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::MOP_Load3, TranslateResourceLoad, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_Load4, TranslateResourceLoad, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedAdd, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedAdd64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedAnd, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedAnd64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedCompareExchange, TranslateMopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedCompareExchange64, TranslateMopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedCompareExchangeFloatBitwise, TranslateMopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedCompareStore, TranslateMopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedCompareStore64, TranslateMopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedCompareStoreFloatBitwise, TranslateMopAtomicCmpXChg, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedExchange, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedExchange64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedExchangeFloat, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedMax, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedMax64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedMin, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedMin64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedOr, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedOr64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_InterlockedXor, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
+    {IntrinsicOp::MOP_InterlockedXor64, TranslateMopAtomicBinaryOperation, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_Store, TranslateResourceStore, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_Store2, TranslateResourceStore, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::MOP_Store3, TranslateResourceStore, DXIL::OpCode::NumOpCodes},
@@ -5732,6 +5903,10 @@ void TranslateCBAddressUser(Instruction *user, Value *handle, Value *baseOffset,
     // Resource inside cbuffer is lowered after GenerateDxilOperations.
     if (dxilutil::IsHLSLObjectType(Ty)) {
       CallInst *CI = cast<CallInst>(handle);
+      // CI should be annotate handle.
+      // Need createHandle here.
+      if (GetHLOpcodeGroup(CI->getCalledFunction()) == HLOpcodeGroup::HLAnnotateHandle)
+        CI = cast<CallInst>(CI->getArgOperand(HLOperandIndex::kAnnotateHandleHandleOpIdx));
       GlobalVariable *CbGV = cast<GlobalVariable>(
           CI->getArgOperand(HLOperandIndex::kCreateHandleResourceOpIdx));
       TranslateResourceInCB(ldInst, pObjHelper, CbGV);
@@ -6224,6 +6399,11 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
     // Resource inside cbuffer is lowered after GenerateDxilOperations.
     if (dxilutil::IsHLSLObjectType(Ty)) {
       CallInst *CI = cast<CallInst>(handle);
+      // CI should be annotate handle.
+      // Need createHandle here.
+      if (GetHLOpcodeGroup(CI->getCalledFunction()) == HLOpcodeGroup::HLAnnotateHandle)
+        CI = cast<CallInst>(CI->getArgOperand(HLOperandIndex::kAnnotateHandleHandleOpIdx));
+
       GlobalVariable *CbGV = cast<GlobalVariable>(
           CI->getArgOperand(HLOperandIndex::kCreateHandleResourceOpIdx));
       TranslateResourceInCB(ldInst, pObjHelper, CbGV);
@@ -6943,8 +7123,13 @@ void TranslateStructBufSubscriptUser(
                                        Builder, OP);
       } break;
       case IntrinsicOp::IOP_InterlockedExchange: {
+        Type *opType = nullptr;
+        PointerType *ptrType = dyn_cast<PointerType>(
+                      userCall->getArgOperand(HLOperandIndex::kInterlockedDestOpIndex)->getType());
+        if (ptrType && ptrType->getElementType()->isFloatTy())
+          opType = Type::getInt32Ty(userCall->getContext());
         AtomicHelper helper(userCall, DXIL::OpCode::AtomicBinOp, handle, bufIdx,
-                            baseOffset);
+                            baseOffset, opType);
         TranslateAtomicBinaryOperation(helper, DXIL::AtomicBinOpCode::Exchange,
                                        Builder, OP);
       } break;
@@ -6990,6 +7175,13 @@ void TranslateStructBufSubscriptUser(
                             handle, bufIdx, baseOffset);
         TranslateAtomicCmpXChg(helper, Builder, OP);
       } break;
+      case IntrinsicOp::IOP_InterlockedCompareStoreFloatBitwise:
+      case IntrinsicOp::IOP_InterlockedCompareExchangeFloatBitwise: {
+        Type *i32Ty = Type::getInt32Ty(userCall->getContext());
+        AtomicHelper helper(userCall, DXIL::OpCode::AtomicCompareExchange,
+                            handle, bufIdx, baseOffset, i32Ty);
+        TranslateAtomicCmpXChg(helper, Builder, OP);
+      } break;
       default:
         DXASSERT(0, "invalid opcode");
         break;
@@ -7303,7 +7495,9 @@ void TranslateDefaultSubscript(CallInst *CI, HLOperationLowerHelper &helper,  HL
         case IntrinsicOp::IOP_InterlockedOr:
         case IntrinsicOp::IOP_InterlockedXor:
         case IntrinsicOp::IOP_InterlockedCompareStore:
-        case IntrinsicOp::IOP_InterlockedCompareExchange: {
+        case IntrinsicOp::IOP_InterlockedCompareExchange:
+        case IntrinsicOp::IOP_InterlockedCompareStoreFloatBitwise:
+        case IntrinsicOp::IOP_InterlockedCompareExchangeFloatBitwise: {
           // Invalid operations.
           Translated = false;
           dxilutil::EmitErrorOnInstruction(
@@ -7340,9 +7534,11 @@ void TranslateDefaultSubscript(CallInst *CI, HLOperationLowerHelper &helper,  HL
           case IntrinsicOp::IOP_InterlockedOr:
           case IntrinsicOp::IOP_InterlockedXor:
           case IntrinsicOp::IOP_InterlockedCompareStore:
-          case IntrinsicOp::IOP_InterlockedCompareExchange: {
+          case IntrinsicOp::IOP_InterlockedCompareExchange:
+          case IntrinsicOp::IOP_InterlockedCompareStoreFloatBitwise:
+          case IntrinsicOp::IOP_InterlockedCompareExchangeFloatBitwise: {
             dxilutil::EmitErrorOnInstruction(
-                userCall, "Atomic operation targets must be groupshared on UAV.");
+                userCall, "Atomic operation targets must be groupshared or UAV.");
             return;
           } break;
           default:
@@ -7368,8 +7564,13 @@ void TranslateDefaultSubscript(CallInst *CI, HLOperationLowerHelper &helper,  HL
         } break;
         case IntrinsicOp::IOP_InterlockedExchange: {
           ResLoadHelper helper(CI, RK, RC, handle, IntrinsicOp::IOP_InterlockedExchange);
+          Type *opType = nullptr;
+          PointerType *ptrType = dyn_cast<PointerType>(
+                       userCall->getArgOperand(HLOperandIndex::kInterlockedDestOpIndex)->getType());
+          if (ptrType && ptrType->getElementType()->isFloatTy())
+            opType = Type::getInt32Ty(userCall->getContext());
           AtomicHelper atomHelper(userCall, DXIL::OpCode::AtomicBinOp, handle,
-                                  helper.addr, /*offset*/ nullptr);
+                                  helper.addr, /*offset*/ nullptr, opType);
           TranslateAtomicBinaryOperation(
               atomHelper, DXIL::AtomicBinOpCode::Exchange, Builder, hlslOP);
         } break;
@@ -7422,6 +7623,14 @@ void TranslateDefaultSubscript(CallInst *CI, HLOperationLowerHelper &helper,  HL
                                   handle, helper.addr, /*offset*/ nullptr);
           TranslateAtomicCmpXChg(atomHelper, Builder, hlslOP);
         } break;
+        case IntrinsicOp::IOP_InterlockedCompareStoreFloatBitwise:
+        case IntrinsicOp::IOP_InterlockedCompareExchangeFloatBitwise: {
+          Type *i32Ty = Type::getInt32Ty(userCall->getContext());
+          ResLoadHelper helper(CI, RK, RC, handle, IntrinsicOp::IOP_InterlockedCompareExchange);
+          AtomicHelper atomHelper(userCall, DXIL::OpCode::AtomicCompareExchange,
+                                  handle, helper.addr, /*offset*/ nullptr, i32Ty);
+          TranslateAtomicCmpXChg(atomHelper, Builder, hlslOP);
+        } break;
         default:
           DXASSERT(0, "invalid opcode");
           break;
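
The float InterlockedExchange path above uses the same retyping: when the destination pointer is float-typed, opType is forced to i32 and the exchange operates on the bit pattern. A hedged standalone sketch of that behavior:

    // Sketch only: exchange the 32-bit pattern, reinterpret the old bits back.
    #include <atomic>
    #include <cstdint>
    #include <cstring>

    static float ExchangeFloatByBitsSketch(std::atomic<uint32_t> &dest,
                                           float value) {
      uint32_t bits;
      std::memcpy(&bits, &value, sizeof bits);
      uint32_t oldBits = dest.exchange(bits);  // atomic swap of the raw bits
      float old;
      std::memcpy(&old, &oldBits, sizeof old);
      return old;
    }
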

+ 4 - 4
lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp

@@ -3508,10 +3508,10 @@ static bool isReadOnlyPtr(CallInst *PtrCI) {
       hlsl::HLOpcodeGroup group =
           hlsl::GetHLOpcodeGroup(handleCI->getCalledFunction());
       if (group == HLOpcodeGroup::HLAnnotateHandle) {
-        ConstantInt *RCVal = cast<ConstantInt>(handleCI->getArgOperand(
-            HLOperandIndex::kAnnotateHandleResourceClassOpIdx));
-        DXIL::ResourceClass RC = (DXIL::ResourceClass)RCVal->getLimitedValue();
-        if (RC == DXIL::ResourceClass::SRV) {
+        Constant *Props = cast<Constant>(handleCI->getArgOperand(
+                             HLOperandIndex::kAnnotateHandleResourcePropertiesOpIdx));
+        DxilResourceProperties RP = resource_helper::loadPropsFromConstant(*Props);
+        if (RP.getResourceClass() == DXIL::ResourceClass::SRV) {
           // Ptr from SRV is readonly.
           return true;
         }
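
A small illustrative stand-in for the new flow (types invented for the sketch; the real DxilResourceProperties layout differs): the read-only decision is now derived from the properties constant attached to the annotated handle rather than a separate resource-class immediate.

    // Invented stand-in types; only the shape of the check is meaningful.
    enum class ResourceClassSketch { SRV, UAV, CBuffer, Sampler, Invalid };

    struct ResourcePropsSketch {
      ResourceClassSketch cls = ResourceClassSketch::Invalid;
      ResourceClassSketch getResourceClass() const { return cls; }
    };

    // Pointers derived from an SRV handle can be treated as read-only.
    static bool IsReadOnlyPtrSketch(const ResourcePropsSketch &RP) {
      return RP.getResourceClass() == ResourceClassSketch::SRV;
    }
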

+ 1 - 0
tools/clang/include/clang/AST/ASTContext.h

@@ -843,6 +843,7 @@ public:
   CanQualType LitIntTy, LitFloatTy;
   CanQualType HalfFloatTy, Min16FloatTy, Min16IntTy, Min16UIntTy;
   CanQualType HLSLStringTy;
+  CanQualType Int8_4PackedTy, UInt8_4PackedTy;
 
   // HLSL Changes end
 

+ 6 - 0
tools/clang/include/clang/AST/BuiltinTypes.def

@@ -96,6 +96,12 @@ UNSIGNED_TYPE(ULongLong, UnsignedLongLongTy)
 // '__uint128_t'
 UNSIGNED_TYPE(UInt128, UnsignedInt128Ty)
 
+// 'int8_t4_packed'
+UNSIGNED_TYPE(Int8_4Packed, Int8_4PackedTy)
+
+// 'uint8_t4_packed'
+UNSIGNED_TYPE(UInt8_4Packed, UInt8_4PackedTy)
+
 //===- Signed Types -------------------------------------------------------===//
 
 // 'char' for targets where it's signed

+ 7 - 2
tools/clang/include/clang/AST/HlslTypes.h

@@ -76,12 +76,14 @@ enum HLSLScalarType {
   HLSLScalarType_float16,
   HLSLScalarType_float32,
   HLSLScalarType_float64,
+  HLSLScalarType_int8_4packed,
+  HLSLScalarType_uint8_4packed
 };
 
 HLSLScalarType MakeUnsigned(HLSLScalarType T);
 
 static const HLSLScalarType HLSLScalarType_minvalid = HLSLScalarType_bool;
-static const HLSLScalarType HLSLScalarType_max = HLSLScalarType_float64;
+static const HLSLScalarType HLSLScalarType_max = HLSLScalarType_uint8_4packed;
 static const size_t HLSLScalarTypeCount = static_cast<size_t>(HLSLScalarType_max) + 1;
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -324,7 +326,10 @@ clang::CXXRecordDecl* DeclareUIntTemplatedTypeWithHandle(
   clang::ASTContext& context, llvm::StringRef typeName, llvm::StringRef templateParamName);
 clang::CXXRecordDecl *DeclareConstantBufferViewType(clang::ASTContext& context, bool bTBuf);
 clang::CXXRecordDecl* DeclareRayQueryType(clang::ASTContext& context);
-clang::CXXRecordDecl *DeclareResourceType(clang::ASTContext &context);
+clang::CXXRecordDecl *DeclareResourceType(clang::ASTContext &context,
+                                          bool bSampler);
+clang::VarDecl *DeclareBuiltinGlobal(llvm::StringRef name, clang::QualType Ty,
+                                     clang::ASTContext &context);
 
 /// <summary>Create a function template declaration for the specified method.</summary>
 /// <param name="context">AST context in which to work.</param>
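
The new scalar kinds rely on a simple contract between the enum and the name table, enforced by the static_assert shown below in ASTContextHLSL.cpp; a condensed sketch of that contract (names abbreviated for the example):

    // Sketch: appending enum values means bumping the *_max alias and the name
    // table together; the count is derived from *_max.
    #include <cstddef>

    enum ScalarSketch { Sk_bool, Sk_float64, Sk_int8_4packed, Sk_uint8_4packed };
    static const ScalarSketch Sk_max = Sk_uint8_4packed;
    static const size_t SkCount = static_cast<size_t>(Sk_max) + 1;

    static const char *SkNames[] = {"bool", "float64_t",
                                    "int8_t4_packed", "uint8_t4_packed"};
    static_assert(SkCount == sizeof(SkNames) / sizeof(SkNames[0]),
                  "otherwise scalar constants are not aligned");
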

+ 6 - 0
tools/clang/include/clang/Basic/Attr.td

@@ -901,6 +901,12 @@ def HLSLWaveSensitive : InheritableAttr {
   let Documentation = [Undocumented];
 }
 
+def HLSLWaveSize : InheritableAttr {
+  let Spellings = [CXX11<"", "wavesize", 2017>];
+  let Args = [IntArgument<"Size">];
+  let Documentation = [Undocumented];
+}
+
 // HLSL Change Ends
 
 // SPIRV Change Starts

+ 2 - 0
tools/clang/include/clang/Basic/Specifiers.h

@@ -52,6 +52,8 @@ namespace clang {
     TST_min16uint,
     TST_min10float,
     TST_min12int,
+    TST_int8_4packed,
+    TST_uint8_4packed,
     // HLSL Changes end
     TST_half,         // OpenCL half, ARM NEON __fp16
     TST_halffloat, // HLSL Change

+ 2 - 0
tools/clang/include/clang/Sema/DeclSpec.h

@@ -296,6 +296,8 @@ public:
   static const TST TST_min16uint = clang::TST_min16uint;
   static const TST TST_min10float = clang::TST_min10float;
   static const TST TST_min12int = clang::TST_min12int;
+  static const TST TST_int8_4packed = clang::TST_int8_4packed;
+  static const TST TST_uint8_4packed = clang::TST_uint8_4packed;
   // HLSL Change Ends
   static const TST TST_half = clang::TST_half;
   static const TST TST_float = clang::TST_float;

+ 7 - 1
tools/clang/lib/AST/ASTContext.cpp

@@ -1103,7 +1103,9 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target) {
     InitBuiltinType(HalfFloatTy, BuiltinType::HalfFloat);
     InitBuiltinType(LitIntTy, BuiltinType::LitInt);
     InitBuiltinType(LitFloatTy, BuiltinType::LitFloat);
-    
+    InitBuiltinType(Int8_4PackedTy, BuiltinType::Int8_4Packed);
+    InitBuiltinType(UInt8_4PackedTy, BuiltinType::UInt8_4Packed);
+
     HLSLStringTy = this->getPointerType(CharTy);
 
     hlsl::InitializeASTContextForHLSL(*this); // Previously in constructor, guarded by !DelayInitialization
@@ -1633,6 +1635,8 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
       break;
     case BuiltinType::UInt:
     case BuiltinType::Int:
+    case BuiltinType::Int8_4Packed:  // HLSL Change
+    case BuiltinType::UInt8_4Packed: // HLSL Change
       Width = Target->getIntWidth();
       Align = Target->getIntAlign();
       break;
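
As a hedged illustration of what this TypeInfo change encodes: the packed types occupy a single 32-bit unsigned lane, with the four 8-bit values recovered by shifting and masking (all names below are invented for the sketch).

    // Sketch only: a uint-sized value holding four 8-bit lanes.
    #include <cstdint>

    using uint8_t4_packed_sketch = uint32_t;

    constexpr uint8_t GetLane(uint8_t4_packed_sketch p, unsigned i) {
      return static_cast<uint8_t>(p >> (8 * i));           // extract lane i
    }
    constexpr uint8_t4_packed_sketch SetLane(uint8_t4_packed_sketch p,
                                             unsigned i, uint8_t v) {
      return (p & ~(0xFFu << (8 * i))) | (uint32_t(v) << (8 * i));
    }

    static_assert(sizeof(uint8_t4_packed_sketch) == 4, "sized like uint");
    static_assert(GetLane(SetLane(0u, 2, 0xAB), 2) == 0xAB, "lane round-trips");
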
@@ -5505,6 +5509,8 @@ static char getObjCEncodingForPrimitiveKind(const ASTContext *C,
     case BuiltinType::HalfFloat:
     case BuiltinType::LitInt:
     case BuiltinType::LitFloat:
+    case BuiltinType::Int8_4Packed:
+    case BuiltinType::UInt8_4Packed:
       llvm_unreachable("@encoding HLSL primitive type");
     // HLSL Change Ends
     }

+ 52 - 4
tools/clang/lib/AST/ASTContextHLSL.cpp

@@ -25,6 +25,7 @@
 #include "dxc/Support/Global.h"
 #include "dxc/HLSL/HLOperations.h"
 #include "dxc/DXIL/DxilSemantic.h"
+#include "dxc/HlslIntrinsicOp.h"
 
 using namespace clang;
 using namespace hlsl;
@@ -69,7 +70,9 @@ const char* HLSLScalarTypeNames[] = {
   "uint64_t",
   "float16_t",
   "float32_t",
-  "float64_t"
+  "float64_t",
+  "int8_t4_packed",
+  "uint8_t4_packed"
 };
 
 static_assert(HLSLScalarTypeCount == _countof(HLSLScalarTypeNames), "otherwise scalar constants are not aligned");
@@ -171,6 +174,20 @@ static HLSLScalarType FindScalarTypeByName(const char *typeName, const size_t ty
         }
       }
       break;
+    case 14: // int8_t4_packed
+      if (typeName[0] == 'i' && typeName[1] == 'n') {
+        if (strncmp(typeName, "int8_t4_packed", 14))
+          break;
+        return HLSLScalarType_int8_4packed;
+      }
+      break;
+    case 15: // uint8_t4_packed
+      if (typeName[0] == 'u' && typeName[1] == 'i') {
+        if (strncmp(typeName, "uint8_t4_packed", 15))
+          break;
+        return HLSLScalarType_uint8_4packed;
+      }
+      break;
     default:
       break;
   }
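
A self-contained rendition of the lookup scheme used above, reduced to the two new names: dispatch on the string length, peek at the leading characters to reject quickly, then confirm with strncmp.

    #include <cstddef>
    #include <cstring>

    enum class PackedScalarSketch { Unknown, Int8_4Packed, UInt8_4Packed };

    static PackedScalarSketch FindPackedScalarByName(const char *name,
                                                     size_t len) {
      switch (len) {
      case 14: // int8_t4_packed
        if (name[0] == 'i' && name[1] == 'n' &&
            std::strncmp(name, "int8_t4_packed", 14) == 0)
          return PackedScalarSketch::Int8_4Packed;
        break;
      case 15: // uint8_t4_packed
        if (name[0] == 'u' && name[1] == 'i' &&
            std::strncmp(name, "uint8_t4_packed", 15) == 0)
          return PackedScalarSketch::UInt8_4Packed;
        break;
      default:
        break;
      }
      return PackedScalarSketch::Unknown;
    }
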
@@ -872,16 +889,47 @@ CXXRecordDecl* hlsl::DeclareRayQueryType(ASTContext& context) {
   return typeDeclBuilder.completeDefinition();
 }
 
-CXXRecordDecl* hlsl::DeclareResourceType(ASTContext& context) {
+CXXRecordDecl* hlsl::DeclareResourceType(ASTContext& context, bool bSampler) {
   // struct ResourceDescriptor { uint8 desc; }
+  StringRef Name = bSampler ? ".Sampler" : ".Resource";
   BuiltinTypeDeclBuilder typeDeclBuilder(context.getTranslationUnitDecl(),
-                                         ".Resource",
+                                         Name,
                                          TagDecl::TagKind::TTK_Struct);
   typeDeclBuilder.startDefinition();
 
   typeDeclBuilder.addField("h", GetHLSLObjectHandleType(context));
 
-  return typeDeclBuilder.completeDefinition();
+  CXXRecordDecl *recordDecl = typeDeclBuilder.completeDefinition();
+
+  QualType indexType = context.UnsignedIntTy;
+  QualType resultType = context.getRecordType(recordDecl);
+  resultType = context.getConstType(resultType);
+
+  CXXMethodDecl *functionDecl = CreateObjectFunctionDeclarationWithParams(
+      context, recordDecl, resultType, ArrayRef<QualType>(indexType),
+      ArrayRef<StringRef>(StringRef("index")),
+      context.DeclarationNames.getCXXOperatorName(OO_Subscript), true);
+  // Mark function as createResourceFromHeap intrinsic.
+  functionDecl->addAttr(HLSLIntrinsicAttr::CreateImplicit(
+      context, "op", "",
+      static_cast<int>(hlsl::IntrinsicOp::IOP_CreateResourceFromHeap)));
+  return recordDecl;
+}
+
+VarDecl *hlsl::DeclareBuiltinGlobal(llvm::StringRef name, clang::QualType Ty,
+                              clang::ASTContext &context) {
+  IdentifierInfo &II = context.Idents.get(name);
+
+  auto *curDeclCtx = context.getTranslationUnitDecl();
+
+  VarDecl *varDecl = VarDecl::Create(context, curDeclCtx,
+                         SourceLocation(), SourceLocation(), &II, Ty,
+                         context.getTrivialTypeSourceInfo(Ty),
+                         StorageClass::SC_Extern);
+  // Mark implicit to avoid printing it when rewriting.
+  varDecl->setImplicit();
+  curDeclCtx->addDecl(varDecl);
+  return varDecl;
 }
 
 bool hlsl::IsIntrinsicOp(const clang::FunctionDecl *FD) {

+ 2 - 0
tools/clang/lib/AST/ItaniumMangle.cpp

@@ -2048,6 +2048,8 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
   case BuiltinType::Min16Int: Out << "min16_int"; break;
   case BuiltinType::Min16UInt: Out << "min16_uint"; break;
   case BuiltinType::HalfFloat: Out << "half_float"; break;
+  case BuiltinType::Int8_4Packed: Out << "int8_t4_packed"; break;
+  case BuiltinType::UInt8_4Packed: Out << "uint8_t4_packed"; break;
     // HLSL Change ends
   }
 }

+ 7 - 1
tools/clang/lib/AST/MicrosoftMangle.cpp

@@ -1615,7 +1615,13 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
   case BuiltinType::Half:
     Out << "$f16@";
     break;
-  // HLSL Change Ends
+  case BuiltinType::Int8_4Packed:
+    Out << "$i8_4pk@";
+    break;
+  case BuiltinType::UInt8_4Packed:
+    Out << "$ui8_4pk@";
+    break;
+    // HLSL Change Ends
   }
 }
 

+ 2 - 0
tools/clang/lib/AST/StmtPrinter.cpp

@@ -1144,6 +1144,8 @@ void StmtPrinter::VisitIntegerLiteral(IntegerLiteral *Node) {
   case BuiltinType::ULongLong: OS << "ULL"; break;
   case BuiltinType::Int128:    OS << "i128"; break;
   case BuiltinType::UInt128:   OS << "Ui128"; break;
+  case BuiltinType::Int8_4Packed:  OS << "i8_4pk"; break; // HLSL Change
+  case BuiltinType::UInt8_4Packed: OS << "Ui8_4pk"; break; // HLSL Change
   }
 }
 

+ 2 - 0
tools/clang/lib/AST/Type.cpp

@@ -2546,6 +2546,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
   case Min12Int:          return "min12int";
   case LitFloat:          return "literal float";
   case LitInt:            return "literal int";
+  case Int8_4Packed:      return "int8_t4_packed";
+  case UInt8_4Packed:     return "uint8_t4_packed";
   // HLSL Change Ends
   }
   

+ 2 - 0
tools/clang/lib/AST/TypeLoc.cpp

@@ -329,6 +329,8 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const {
   case BuiltinType::Min12Int:
   case BuiltinType::LitFloat:
   case BuiltinType::LitInt:
+  case BuiltinType::Int8_4Packed:
+  case BuiltinType::UInt8_4Packed:
   // HLSL Change Ends
     llvm_unreachable("Builtin type needs extra local data!");
     // Fall through, if the impossible happens.

+ 3 - 1
tools/clang/lib/CodeGen/CGDebugInfo.cpp

@@ -483,7 +483,9 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
     Encoding = llvm::dwarf::DW_ATE_UTF;
     break;
   case BuiltinType::UShort:
-  case BuiltinType::Min16UInt: // HLSL Change
+  case BuiltinType::Min16UInt:      // HLSL Change
+  case BuiltinType::Int8_4Packed:   // HLSL Change
+  case BuiltinType::UInt8_4Packed:  // HLSL Change
   case BuiltinType::UInt:
   case BuiltinType::UInt128:
   case BuiltinType::ULong:

+ 4 - 0
tools/clang/lib/CodeGen/CGExpr.cpp

@@ -3492,7 +3492,11 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) {
 
     // bitcast to target type
     llvm::Type *ResultType = ConvertType(ToType);
+    // Make sure to generate an Inst, not an Operator, to make lowering easy.
+    bool originAllowFolding = Builder.AllowFolding;
+    Builder.AllowFolding = false;
     llvm::Value *bitcast = Builder.CreateBitCast(This, ResultType);
+    Builder.AllowFolding = originAllowFolding;
     return MakeAddrLValue(bitcast, ToType);
   }
   case CK_HLSLDerivedToBase: {
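
The save/restore of Builder.AllowFolding above forces a real bitcast instruction instead of a folded constant operator. A hedged sketch of the same idiom wrapped in RAII so the flag is restored on every path (BuilderT stands in for the HLSL-patched IRBuilder and is an assumption of the sketch):

    // Sketch: scope-bound "folding off" guard.
    template <typename BuilderT> class FoldingOffGuard {
      BuilderT &B;
      bool Saved;
    public:
      explicit FoldingOffGuard(BuilderT &Builder)
          : B(Builder), Saved(Builder.AllowFolding) {
        B.AllowFolding = false;  // emit instructions, not constant operators
      }
      ~FoldingOffGuard() { B.AllowFolding = Saved; }
    };

    // Usage sketch:
    //   FoldingOffGuard<decltype(Builder)> guard(Builder);
    //   llvm::Value *bitcast = Builder.CreateBitCast(This, ResultType);
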

+ 116 - 62
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -89,7 +89,9 @@ private:
   // Map from value to resource properties.
   // This only collects object variables (global/local/parameter), not object fields inside a struct.
   // Object fields inside a struct are saved by TypeAnnotation.
-  DenseMap<Value *, DxilResourceProperties> valToResPropertiesMap;
+  // Returns true if the value was added to the object properties map.
+  bool AddValToPropertyMap(Value *V, QualType Ty);
+  CGHLSLMSHelper::DxilObjectProperties objectProperties;
 
   bool  m_bDebugInfo;
   bool  m_bIsLib;
@@ -573,6 +575,14 @@ static CompType::Kind BuiltinTyToCompTy(const BuiltinType *BTy, bool bSNorm,
   CompType::Kind kind = CompType::Kind::Invalid;
 
   switch (BTy->getKind()) {
+  // HLSL Changes begin
+  case BuiltinType::Int8_4Packed:
+    kind = CompType::Kind::PackedS8x32;
+    break;
+  case BuiltinType::UInt8_4Packed:
+    kind = CompType::Kind::PackedU8x32;
+    break;
+  // HLSL Changes end
   case BuiltinType::UInt:
     kind = CompType::Kind::U32;
     break;
@@ -709,7 +719,24 @@ QualType GetArrayEltType(ASTContext &Context, QualType Ty) {
     Ty = ArrayTy->getElementType();
   return Ty;
 }
+bool IsTextureBufferViewName(StringRef keyword) {
+  return keyword == "TextureBuffer";
+}
 
+bool IsTextureBufferView(clang::QualType Ty, clang::ASTContext &context) {
+  Ty = Ty.getCanonicalType();
+  if (const clang::ArrayType *arrayType = context.getAsArrayType(Ty)) {
+    return IsTextureBufferView(arrayType->getElementType(), context);
+  } else if (const RecordType *RT = Ty->getAsStructureType()) {
+    return IsTextureBufferViewName(RT->getDecl()->getName());
+  } else if (const RecordType *RT = Ty->getAs<RecordType>()) {
+    if (const ClassTemplateSpecializationDecl *templateDecl =
+            dyn_cast<ClassTemplateSpecializationDecl>(RT->getDecl())) {
+      return IsTextureBufferViewName(templateDecl->getName());
+    }
+  }
+  return false;
+}
 } // namespace
 
 DxilResourceProperties CGMSHLSLRuntime::BuildResourceProperty(QualType resTy) {
@@ -717,44 +744,59 @@ DxilResourceProperties CGMSHLSLRuntime::BuildResourceProperty(QualType resTy) {
   const RecordType *RT = resTy->getAs<RecordType>();
   DxilResourceProperties RP;
   if (!RT) {
-    RP.Class = DXIL::ResourceClass::Invalid;
     return RP;
   }
   RecordDecl *RD = RT->getDecl();
   SourceLocation loc = RD->getLocation();
 
   hlsl::DxilResourceBase::Class resClass = TypeToClass(resTy);
-  RP.Class = resClass;
   if (resClass == DXIL::ResourceClass::Invalid)
     return RP;
 
   llvm::Type *Ty = CGM.getTypes().ConvertType(resTy);
+
   switch (resClass) {
   case DXIL::ResourceClass::UAV: {
     DxilResource UAV;
     // TODO: save globalcoherent to variable in EmitHLSLBuiltinCallExpr.
     SetUAVSRV(loc, resClass, &UAV, resTy);
     UAV.SetGlobalSymbol(UndefValue::get(Ty->getPointerTo()));
-    RP = resource_helper::loadFromResourceBase(&UAV);
+    RP = resource_helper::loadPropsFromResourceBase(&UAV);
   } break;
   case DXIL::ResourceClass::SRV: {
     DxilResource SRV;
     SetUAVSRV(loc, resClass, &SRV, resTy);
     SRV.SetGlobalSymbol(UndefValue::get(Ty->getPointerTo()));
-    RP = resource_helper::loadFromResourceBase(&SRV);
+    RP = resource_helper::loadPropsFromResourceBase(&SRV);
   } break;
   case DXIL::ResourceClass::Sampler: {
     DxilSampler::SamplerKind kind = KeywordToSamplerKind(RD->getName());
     DxilSampler Sampler;
     Sampler.SetSamplerKind(kind);
-    RP = resource_helper::loadFromResourceBase(&Sampler);
-  }
+    RP = resource_helper::loadPropsFromResourceBase(&Sampler);
+  } break;
+  case DXIL::ResourceClass::CBuffer: {
+    DxilCBuffer CB;
+    CB.SetGlobalSymbol(UndefValue::get(Ty->getPointerTo()));
+    if (IsTextureBufferView(resTy, CGM.getContext()))
+      CB.SetKind(DXIL::ResourceKind::TBuffer);
+    DxilTypeSystem &typeSys = m_pHLModule->GetTypeSystem();
+    unsigned arrayEltSize = 0;
+    QualType ResultTy = hlsl::GetHLSLResourceResultType(resTy);
+    unsigned Size = AddTypeAnnotation(ResultTy, typeSys, arrayEltSize);
+    CB.SetSize(Size);
+    RP = resource_helper::loadPropsFromResourceBase(&CB);
+  } break;
   default:
     break;
   }
   return RP;
 }
 
+bool CGMSHLSLRuntime::AddValToPropertyMap(Value *V, QualType Ty) {
+  return objectProperties.AddResource(V, BuildResourceProperty(Ty));
+}
+
 void CGMSHLSLRuntime::ConstructFieldAttributedAnnotation(
     DxilFieldAnnotation &fieldAnnotation, QualType fieldTy,
     bool bDefaultRowMajor) {
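
AddValToPropertyMap funnels every object-typed value through one helper; a small invented stand-in for the DxilObjectProperties container it fills (the real class lives in CGHLSLMSHelper and its interface may differ):

    #include <unordered_map>

    struct PropsSketch {
      int kind = -1;                            // -1 plays the role of Invalid
      bool isValid() const { return kind >= 0; }
    };

    struct ObjectPropertiesSketch {
      std::unordered_map<const void *, PropsSketch> resMap;
      // Returns true only when a valid property set was recorded.
      bool AddResource(const void *V, const PropsSketch &RP) {
        if (!RP.isValid())
          return false;
        resMap[V] = RP;
        return true;
      }
      PropsSketch GetResource(const void *V) const {
        auto it = resMap.find(V);
        return it == resMap.end() ? PropsSketch{} : it->second;
      }
    };
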
@@ -1470,10 +1512,10 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
       funcProps->ShaderProps.MS.outputTopology = topology;
     }
     else if (isEntry && !SM->IsHS() && !SM->IsMS()) {
-      unsigned DiagID =
-          Diags.getCustomDiagID(DiagnosticsEngine::Warning,
-                                "attribute outputtopology only valid for HS and MS.");
-      Diags.Report(Attr->getLocation(), DiagID);
+    unsigned DiagID =
+      Diags.getCustomDiagID(DiagnosticsEngine::Warning,
+        "attribute outputtopology only valid for HS and MS.");
+    Diags.Report(Attr->getLocation(), DiagID);
     }
   }
 
@@ -1483,15 +1525,15 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
   }
 
   if (const HLSLMaxTessFactorAttr *Attr =
-          FD->getAttr<HLSLMaxTessFactorAttr>()) {
+    FD->getAttr<HLSLMaxTessFactorAttr>()) {
     if (isHS) {
       // TODO: change getFactor to return float.
       llvm::APInt intV(32, Attr->getFactor());
       funcProps->ShaderProps.HS.maxTessFactor = intV.bitsToFloat();
     } else if (isEntry && !SM->IsHS()) {
       unsigned DiagID =
-          Diags.getCustomDiagID(DiagnosticsEngine::Error,
-                                "attribute maxtessfactor only valid for HS.");
+        Diags.getCustomDiagID(DiagnosticsEngine::Error,
+          "attribute maxtessfactor only valid for HS.");
       Diags.Report(Attr->getLocation(), DiagID);
       return;
     }
@@ -1501,8 +1543,8 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
   if (const HLSLDomainAttr *Attr = FD->getAttr<HLSLDomainAttr>()) {
     if (isEntry && !SM->IsHS() && !SM->IsDS()) {
       unsigned DiagID =
-          Diags.getCustomDiagID(DiagnosticsEngine::Error,
-                                "attribute domain only valid for HS or DS.");
+        Diags.getCustomDiagID(DiagnosticsEngine::Error,
+          "attribute domain only valid for HS or DS.");
       Diags.Report(Attr->getLocation(), DiagID);
       return;
     }
@@ -1522,7 +1564,7 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
   if (const HLSLClipPlanesAttr *Attr = FD->getAttr<HLSLClipPlanesAttr>()) {
     if (isEntry && !SM->IsVS()) {
       unsigned DiagID = Diags.getCustomDiagID(
-          DiagnosticsEngine::Error, "attribute clipplane only valid for VS.");
+        DiagnosticsEngine::Error, "attribute clipplane only valid for VS.");
       Diags.Report(Attr->getLocation(), DiagID);
       return;
     }
@@ -1535,11 +1577,11 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
 
   // Pixel shader.
   if (const HLSLEarlyDepthStencilAttr *Attr =
-          FD->getAttr<HLSLEarlyDepthStencilAttr>()) {
+    FD->getAttr<HLSLEarlyDepthStencilAttr>()) {
     if (isEntry && !SM->IsPS()) {
       unsigned DiagID = Diags.getCustomDiagID(
-          DiagnosticsEngine::Error,
-          "attribute earlydepthstencil only valid for PS.");
+        DiagnosticsEngine::Error,
+        "attribute earlydepthstencil only valid for PS.");
       Diags.Report(Attr->getLocation(), DiagID);
       return;
     }
@@ -1549,6 +1591,39 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
     funcProps->shaderKind = DXIL::ShaderKind::Pixel;
   }
 
+  if (const HLSLWaveSizeAttr *Attr = FD->getAttr<HLSLWaveSizeAttr>()) {
+    if (!m_pHLModule->GetShaderModel()->IsSM66Plus()) {
+      unsigned DiagID = Diags.getCustomDiagID(
+        DiagnosticsEngine::Error,
+        "attribute WaveSize only valid for shader model 6.6 and higher.");
+      Diags.Report(Attr->getLocation(), DiagID);
+      return;
+    }
+    if (!isCS) {
+      unsigned DiagID = Diags.getCustomDiagID(
+        DiagnosticsEngine::Error,
+        "attribute WaveSize only valid for CS.");
+      Diags.Report(Attr->getLocation(), DiagID);
+      return;
+    }
+    if (!isEntry) {
+      unsigned DiagID = Diags.getCustomDiagID(
+        DiagnosticsEngine::Error,
+        "attribute WaveSize only valid on entry point function.");
+      Diags.Report(Attr->getLocation(), DiagID);
+      return;
+    }
+    // validate that it is a power of 2 between 4 and 128
+    unsigned waveSize = Attr->getSize();
+    if (!DXIL::IsValidWaveSizeValue(waveSize)) {
+      unsigned DiagID = Diags.getCustomDiagID(
+        DiagnosticsEngine::Error,
+        "WaveSize value must be between %0 and %1 and a power of 2.");
+      Diags.Report(Attr->getLocation(), DiagID) << DXIL::kMinWaveSize << DXIL::kMaxWaveSize;
+    }
+    funcProps->waveSize = Attr->getSize();
+  }
+
   const unsigned profileAttributes = isCS + isHS + isDS + isGS + isVS + isPS + isRay + isMS + isAS;
 
   // TODO: check this in front-end and report error.
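
A plausible shape for the DXIL::IsValidWaveSizeValue check used above, inferred from the comment and the diagnostic text (the real definition lives in DxilConstants.h and may differ): within [kMinWaveSize, kMaxWaveSize] and a power of two.

    #include <cstdint>

    // Bounds assumed from the "power of 2 between 4 and 128" comment above.
    constexpr unsigned kMinWaveSizeSketch = 4;
    constexpr unsigned kMaxWaveSizeSketch = 128;

    constexpr bool IsValidWaveSizeValueSketch(unsigned size) {
      return size >= kMinWaveSizeSketch && size <= kMaxWaveSizeSketch &&
             (size & (size - 1)) == 0;  // power of two
    }

    static_assert(IsValidWaveSizeValueSketch(32), "32 is accepted");
    static_assert(!IsValidWaveSizeValueSketch(48), "48 is not a power of two");
    static_assert(!IsValidWaveSizeValueSketch(256), "256 exceeds the maximum");
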
@@ -1612,9 +1687,7 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
     // SRet.
     pRetTyAnnotation = &FuncAnnotation->GetParameterAnnotation(ArgNo++);
     // Save resource properties for parameters.
-    DxilResourceProperties RP = BuildResourceProperty(retTy);
-    if (RP.Class != DXIL::ResourceClass::Invalid)
-      valToResPropertiesMap[ArgIt] = RP;
+    AddValToPropertyMap(ArgIt, retTy);
     ++ArgIt;
   } else {
     pRetTyAnnotation = &FuncAnnotation->GetRetTypeAnnotation();
@@ -1658,10 +1731,8 @@ void CGMSHLSLRuntime::AddHLSLFunctionInfo(Function *F, const FunctionDecl *FD) {
     const ParmVarDecl *parmDecl = FD->getParamDecl(ParmIdx);
 
     QualType fieldTy = parmDecl->getType();
-    // Save resource properties for parameters.
-    DxilResourceProperties RP = BuildResourceProperty(fieldTy);
-    if (RP.Class != DXIL::ResourceClass::Invalid)
-      valToResPropertiesMap[ArgIt] = RP;
+    // Save object properties for parameters.
+    AddValToPropertyMap(ArgIt, fieldTy);
 
     // if parameter type is a typedef, try to desugar it first.
     if (isa<TypedefType>(fieldTy.getTypePtr()))
@@ -2303,10 +2374,8 @@ void CGMSHLSLRuntime::AddControlFlowHint(CodeGenFunction &CGF, const Stmt &S,
 
 void CGMSHLSLRuntime::MarkRetTemp(CodeGenFunction &CGF, Value *V,
                                  QualType QualTy) {
-  // Save resource properties for ret temp.
-  DxilResourceProperties RP = BuildResourceProperty(QualTy);
-  if (RP.Class != DXIL::ResourceClass::Invalid)
-    valToResPropertiesMap[V] = RP;
+  // Save object properties for ret temp.
+  AddValToPropertyMap(V, QualTy);
 }
 
 void CGMSHLSLRuntime::FinishAutoVar(CodeGenFunction &CGF, const VarDecl &D,
@@ -2319,10 +2388,8 @@ void CGMSHLSLRuntime::FinishAutoVar(CodeGenFunction &CGF, const VarDecl &D,
   DxilTypeSystem &typeSys = m_pHLModule->GetTypeSystem();
   unsigned arrayEltSize = 0;
   AddTypeAnnotation(D.getType(), typeSys, arrayEltSize);
-  // Save resource properties for local variables.
-  DxilResourceProperties RP = BuildResourceProperty(D.getType());
-  if (RP.Class != DXIL::ResourceClass::Invalid)
-    valToResPropertiesMap[V] = RP;
+  // Save object properties for local variables.
+  AddValToPropertyMap(V, D.getType());
 }
 
 hlsl::InterpolationMode CGMSHLSLRuntime::GetInterpMode(const Decl *decl,
@@ -2377,6 +2444,10 @@ hlsl::CompType CGMSHLSLRuntime::GetCompType(const BuiltinType *BT) {
   case BuiltinType::Short:
     ElementType = hlsl::CompType::getI16();
     break;
+    // HLSL Changes begin
+  case BuiltinType::Int8_4Packed:
+  case BuiltinType::UInt8_4Packed:
+    // HLSL Changes end
   case BuiltinType::UInt:
     ElementType = hlsl::CompType::getU32();
     break;
@@ -2404,9 +2475,7 @@ void CGMSHLSLRuntime::addResource(Decl *D) {
     // Save resource properties for global variables.
     if (resClass != DXIL::ResourceClass::Invalid) {
       GlobalVariable *GV = cast<GlobalVariable>(CGM.GetAddrOfGlobalVar(VD));
-      DxilResourceProperties RP = BuildResourceProperty(VD->getType());
-      if (RP.Class != DXIL::ResourceClass::Invalid)
-        valToResPropertiesMap[GV] = RP;
+      AddValToPropertyMap(GV, VD->getType());
     }
     // skip decl has init which is resource.
     if (VD->hasInit() && resClass != DXIL::ResourceClass::Invalid)
@@ -3127,9 +3196,7 @@ void CGMSHLSLRuntime::AddConstant(VarDecl *constDecl, HLCBuffer &CB) {
 
   auto &regBindings = constantRegBindingMap[constVal];
   // Save resource properties for cbuffer variables.
-  DxilResourceProperties RP = BuildResourceProperty(constDecl->getType());
-  if (RP.Class != DXIL::ResourceClass::Invalid)
-    valToResPropertiesMap[constVal] = RP;
+  AddValToPropertyMap(constVal, constDecl->getType());
 
   bool isGlobalCB = CB.GetID() == globalCBIndex;
   uint32_t offset = 0;
@@ -3231,24 +3298,6 @@ unique_ptr<HLCBuffer> CreateHLCBuf(NamedDecl *D, bool bIsView, bool bIsTBuf) {
   return CB;
 }
 
-bool IsTextureBufferViewName(StringRef keyword) {
-  return keyword == "TextureBuffer";
-}
-
-bool IsTextureBufferView(clang::QualType Ty, clang::ASTContext &context) {
-  Ty = Ty.getCanonicalType();
-  if (const clang::ArrayType *arrayType = context.getAsArrayType(Ty)) {
-    return IsTextureBufferView(arrayType->getElementType(), context);
-  } else if (const RecordType *RT = Ty->getAsStructureType()) {
-    return IsTextureBufferViewName(RT->getDecl()->getName());
-  } else if (const RecordType *RT = Ty->getAs<RecordType>()) {
-    if (const ClassTemplateSpecializationDecl *templateDecl =
-            dyn_cast<ClassTemplateSpecializationDecl>(RT->getDecl())) {
-      return IsTextureBufferViewName(templateDecl->getName());
-    }
-  }
-  return false;
-}
 } // namespace
 
 uint32_t CGMSHLSLRuntime::AddCBuffer(HLSLBufferDecl *D) {
@@ -3339,7 +3388,7 @@ void CGMSHLSLRuntime::FinishCodeGen() {
   llvm::Module &M = TheModule;
   // Do this before CloneShaderEntry and TranslateRayQueryConstructor to avoid
   // update valToResPropertiesMap for cloned inst.
-  FinishIntrinsics(HLM, m_IntrinsicMap, valToResPropertiesMap);
+  FinishIntrinsics(HLM, m_IntrinsicMap, objectProperties);
   bool bWaveEnabledStage = m_pHLModule->GetShaderModel()->IsPS() ||
                            m_pHLModule->GetShaderModel()->IsCS() ||
                            m_pHLModule->GetShaderModel()->IsLib();
@@ -5263,9 +5312,14 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF,
       return;
     }
   } else if (dxilutil::IsHLSLResourceDescType(SrcPtrTy) &&
-             dxilutil::IsHLSLResourceType(DestPtrTy)) {
-    // Cast resource desc to resource.
+             (dxilutil::IsHLSLResourceType(DestPtrTy) ||
+              GetResourceClassForType(CGM.getContext(), DestTy) ==
+                  DXIL::ResourceClass::CBuffer)) {
+    // Cast resource desc to resource.
+    // Make sure to generate an Inst to help lowering.
+    bool originAllowFolding = CGF.Builder.AllowFolding;
+    CGF.Builder.AllowFolding = false;
     Value *CastPtr = CGF.Builder.CreatePointerCast(SrcPtr, DestPtr->getType());
+    CGF.Builder.AllowFolding = originAllowFolding;
     // Load resource.
     Value *V = CGF.Builder.CreateLoad(CastPtr);
     // Store to resource ptr.

+ 297 - 160
tools/clang/lib/CodeGen/CGHLSLMSFinishCodeGen.cpp

@@ -9,41 +9,41 @@
 //                                                                           //
 ///////////////////////////////////////////////////////////////////////////////
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DxilValueCache.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DxilValueCache.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/IR/CFG.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 #include "CodeGenModule.h"
-#include "clang/Frontend/CodeGenOptions.h"
 #include "clang/Basic/LangOptions.h"
+#include "clang/Frontend/CodeGenOptions.h"
 #include "clang/Parse/ParseHLSL.h" // root sig would be in Parser if part of lang
 
-#include "dxc/HLSL/HLModule.h"
-#include "dxc/HLSL/HLSLExtensionsCodegenHelper.h"
+#include "dxc/DXIL/DxilConstants.h"
 #include "dxc/DXIL/DxilOperations.h"
-#include "dxc/HlslIntrinsicOp.h"
-#include "dxc/DXIL/DxilUtil.h"
-#include "dxc/HLSL/DxilExportMap.h"
 #include "dxc/DXIL/DxilResourceProperties.h"
 #include "dxc/DXIL/DxilTypeSystem.h"
-#include "dxc/DXIL/DxilConstants.h"
+#include "dxc/DXIL/DxilUtil.h"
 #include "dxc/DxilRootSignature/DxilRootSignature.h"
+#include "dxc/HLSL/DxilExportMap.h"
 #include "dxc/HLSL/DxilGenerationPass.h"
 #include "dxc/HLSL/HLMatrixType.h"
+#include "dxc/HLSL/HLModule.h"
+#include "dxc/HLSL/HLSLExtensionsCodegenHelper.h"
+#include "dxc/HlslIntrinsicOp.h"
 
-#include <vector>
-#include <memory>
 #include <fenv.h>
+#include <memory>
+#include <vector>
 
 #include "CGHLSLMSHelper.h"
 
@@ -51,7 +51,6 @@ using namespace llvm;
 using namespace hlsl;
 using namespace CGHLSLMSHelper;
 
-
 namespace {
 
 Value *CreateHandleFromResPtr(Value *ResPtr, HLModule &HLM,
@@ -66,7 +65,7 @@ Value *CreateHandleFromResPtr(Value *ResPtr, HLModule &HLM,
   return Handle;
 }
 
-Value *CreateAnnotateHandle(HLModule &HLM, Value *Handle,
+CallInst *CreateAnnotateHandle(HLModule &HLM, Value *Handle,
                             DxilResourceProperties &RP, llvm::Type *ResTy,
                             IRBuilder<> &Builder) {
   Constant *RPConstant = resource_helper::getAsConstant(
@@ -74,10 +73,110 @@ Value *CreateAnnotateHandle(HLModule &HLM, Value *Handle,
   return HLM.EmitHLOperationCall(
       Builder, HLOpcodeGroup::HLAnnotateHandle,
       (unsigned)HLOpcodeGroup::HLAnnotateHandle, Handle->getType(),
-      {Handle, Builder.getInt8((uint8_t)RP.Class),
-       Builder.getInt8((uint8_t)RP.Kind), RPConstant, UndefValue::get(ResTy)},
-      *HLM.GetModule());
+      {Handle, RPConstant, UndefValue::get(ResTy)}, *HLM.GetModule());
+}
+
+// Lower CBV bitcast use to handle use.
+// Leave the load/store.
+void LowerDynamicCBVUseToHandle(
+    HLModule &HLM,
+    DxilObjectProperties &objectProperties) {
+  Type *HandleTy = HLM.GetOP()->GetHandleType();
+  Module &M = *HLM.GetModule();
+  // Collect BitCast use of CBV.
+  SmallVector<std::pair<BitCastInst *, DxilResourceProperties>, 4> BitCasts;
+  for (auto it : objectProperties.resMap) {
+    DxilResourceProperties RP = it.second;
+    if (RP.getResourceKind() != DXIL::ResourceKind::CBuffer &&
+        RP.getResourceKind() != DXIL::ResourceKind::TBuffer)
+      continue;
+    Value *V = it.first;
+    // Skip external globals.
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+      if (GV->getLinkage() != GlobalValue::LinkageTypes::InternalLinkage)
+        continue;
+    }
+    for (auto UserIt = V->user_begin(); UserIt != V->user_end();) {
+      User *U = *(UserIt++);
+      if (U->user_empty())
+        continue;
+      if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+        BitCasts.emplace_back(std::make_pair(BCI, RP));
+        continue;
+      }
+      DXASSERT((!isa<BitCastOperator>(U) || U->user_empty()),
+               "all BitCast should be BitCastInst");
+    }
+  }
+
+  for (auto it : BitCasts) {
+    BitCastInst *BCI = it.first;
+    DxilResourceProperties RP = it.second;
+    IRBuilder<> B(BCI);
+    B.AllowFolding = false;
+    Value *ObjV = BCI->getOperand(0);
+    Value *Handle = CreateHandleFromResPtr(ObjV, HLM, HandleTy, B);
+    Type *ResTy = ObjV->getType()->getPointerElementType();
+    Handle = CreateAnnotateHandle(HLM, Handle, RP, ResTy, B);
+    // Create cb subscript.
+    llvm::Type *opcodeTy = B.getInt32Ty();
+    llvm::Type *idxTy = opcodeTy;
+    Constant *zeroIdx = ConstantInt::get(opcodeTy, 0);
+
+    Type *cbTy = BCI->getType();
+    llvm::FunctionType *SubscriptFuncTy =
+        llvm::FunctionType::get(cbTy, {opcodeTy, HandleTy, idxTy}, false);
+
+    Function *subscriptFunc =
+        GetOrCreateHLFunction(M, SubscriptFuncTy, HLOpcodeGroup::HLSubscript,
+                              (unsigned)HLSubscriptOpcode::CBufferSubscript);
+    Constant *opArg = ConstantInt::get(
+        opcodeTy, (unsigned)HLSubscriptOpcode::CBufferSubscript);
+    Value *args[] = {opArg, Handle, zeroIdx};
+
+    Instruction *cbSubscript =
+        cast<Instruction>(B.CreateCall(subscriptFunc, {args}));
+    BCI->replaceAllUsesWith(cbSubscript);
+    BCI->eraseFromParent();
+  }
+}
+
+bool IsHLSLSamplerDescType(llvm::Type *Ty) {
+  if (llvm::StructType *ST = dyn_cast<llvm::StructType>(Ty)) {
+    if (!ST->hasName())
+      return false;
+    StringRef name = ST->getName();
+
+    if (name == "struct..Sampler")
+      return true;
+  }
+  return false;
+}
+
+#ifndef NDEBUG
+static bool ConsumePrefix(StringRef &Str, StringRef Prefix) {
+  if (!Str.startswith(Prefix)) return false;
+  Str = Str.substr(Prefix.size());
+  return true;
+}
+
+bool IsHLSLBufferViewType(llvm::Type *Ty) {
+  if (llvm::StructType *ST = dyn_cast<llvm::StructType>(Ty)) {
+    if (!ST->hasName())
+      return false;
+
+    StringRef name = ST->getName();
+    if (!(ConsumePrefix(name, "class.") ||
+          ConsumePrefix(name, "struct.")))
+      return false;
+
+    if (name.startswith("ConstantBuffer<") ||
+        name.startswith("TextureBuffer<"))
+      return true;
+  }
+  return false;
 }
+#endif
 
 void LowerGetResourceFromHeap(
     HLModule &HLM, std::vector<std::pair<Function *, unsigned>> &intrinsicMap) {
@@ -97,13 +196,17 @@ void LowerGetResourceFromHeap(
       continue;
     for (auto uit = F->user_begin(); uit != F->user_end();) {
       CallInst *CI = cast<CallInst>(*(uit++));
-      Instruction *ResPtr = cast<Instruction>(CI->getArgOperand(0));
-      Value *Index = CI->getArgOperand(1);
+      // Arg 0 is this pointer.
+      unsigned ArgIdx = 1;
+      Instruction *ResPtr = cast<Instruction>(CI->getArgOperand(ArgIdx));
+      Value *Index = CI->getArgOperand(ArgIdx+1);
       IRBuilder<> Builder(CI);
       // Make a handle from GetResFromHeap.
-      Value *Handle =
-          HLM.EmitHLOperationCall(Builder, HLOpcodeGroup::HLIntrinsic,
-                                  GetResFromHeapOp, HandleTy, {Index}, M);
+      Value *IsSampler = Builder.getInt1(
+          IsHLSLSamplerDescType(ResPtr->getType()->getPointerElementType()));
+      Value *Handle = HLM.EmitHLOperationCall(
+          Builder, HLOpcodeGroup::HLIntrinsic, GetResFromHeapOp, HandleTy,
+          {Index, IsSampler}, M);
 
       // Find the handle ptr for res ptr.
       auto it = ResourcePtrToHandlePtrMap.find(ResPtr);
@@ -131,7 +234,9 @@ void LowerGetResourceFromHeap(
       User *U = *(uit++);
       BitCastInst *BCI = cast<BitCastInst>(U);
       DXASSERT(
-          dxilutil::IsHLSLResourceType(BCI->getType()->getPointerElementType()),
+          dxilutil::IsHLSLResourceType(
+              BCI->getType()->getPointerElementType()) ||
+              IsHLSLBufferViewType(BCI->getType()->getPointerElementType()),
           "illegal cast of resource ptr");
       for (auto cuit = BCI->user_begin(); cuit != BCI->user_end();) {
         LoadInst *LI = cast<LoadInst>(*(cuit++));
@@ -150,7 +255,6 @@ void LowerGetResourceFromHeap(
   }
 }
 
-
 void ReplaceBoolVectorSubscript(CallInst *CI) {
   Value *Ptr = CI->getArgOperand(0);
   Value *Idx = CI->getArgOperand(1);
@@ -216,8 +320,8 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
       unsigned counterOpcode =
           bAppend ? (unsigned)IntrinsicOp::MOP_IncrementCounter
                   : (unsigned)IntrinsicOp::MOP_DecrementCounter;
-      Function *incCounterFunc =
-          GetOrCreateHLFunction(M, IncCounterFuncTy, group, counterOpcode, attribs);
+      Function *incCounterFunc = GetOrCreateHLFunction(
+          M, IncCounterFuncTy, group, counterOpcode, attribs);
 
       llvm::Type *idxTy = counterTy;
       llvm::Type *valTy =
@@ -245,9 +349,9 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
       llvm::FunctionType *SubscriptFuncTy = llvm::FunctionType::get(
           subscriptTy, {opcodeTy, handleTy, idxTy}, false);
 
-      Function *subscriptFunc =
-          GetOrCreateHLFunction(M, SubscriptFuncTy, HLOpcodeGroup::HLSubscript,
-                                (unsigned)HLSubscriptOpcode::DefaultSubscript, attribs);
+      Function *subscriptFunc = GetOrCreateHLFunction(
+          M, SubscriptFuncTy, HLOpcodeGroup::HLSubscript,
+          (unsigned)HLSubscriptOpcode::DefaultSubscript, attribs);
 
       BasicBlock *BB =
           BasicBlock::Create(opFunc->getContext(), "Entry", opFunc);
@@ -306,8 +410,10 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
           llvm::FunctionType::get(valTy, {opcodeTy, valTy}, false);
       unsigned sinOp = static_cast<unsigned>(IntrinsicOp::IOP_sin);
       unsigned cosOp = static_cast<unsigned>(IntrinsicOp::IOP_cos);
-      Function *sinFunc = GetOrCreateHLFunction(M, sinFuncTy, group, sinOp, attribs);
-      Function *cosFunc = GetOrCreateHLFunction(M, sinFuncTy, group, cosOp, attribs);
+      Function *sinFunc =
+          GetOrCreateHLFunction(M, sinFuncTy, group, sinOp, attribs);
+      Function *cosFunc =
+          GetOrCreateHLFunction(M, sinFuncTy, group, cosOp, attribs);
 
       BasicBlock *BB =
           BasicBlock::Create(opFunc->getContext(), "Entry", opFunc);
@@ -336,8 +442,8 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
   } else if (group == HLOpcodeGroup::HLExtIntrinsic) {
     llvm::StringRef fnName = F->getName();
     llvm::StringRef groupName = GetHLOpcodeGroupNameByAttr(F);
-    opFunc =
-      GetOrCreateHLFunction(M, funcTy, group, &groupName, &fnName, opcode, attribs);
+    opFunc = GetOrCreateHLFunction(M, funcTy, group, &groupName, &fnName,
+                                   opcode, attribs);
   } else {
     opFunc = GetOrCreateHLFunction(M, funcTy, group, opcode, attribs);
   }
@@ -347,82 +453,76 @@ Function *CreateOpFunction(llvm::Module &M, Function *F,
 
 DxilResourceProperties GetResourcePropsFromIntrinsicObjectArg(
     Value *arg, HLModule &HLM, DxilTypeSystem &typeSys,
-    DenseMap<Value *, DxilResourceProperties> &valToResPropertiesMap) {
-  DxilResourceProperties RP;
-  RP.Class = DXIL::ResourceClass::Invalid;
-
-  auto RPIt = valToResPropertiesMap.find(arg);
-  if (RPIt != valToResPropertiesMap.end()) {
-    RP = RPIt->second;
-  } else {
-    // Must be GEP.
-    GEPOperator *GEP = cast<GEPOperator>(arg);
-    // Find RP from GEP.
-    Value *Ptr = GEP->getPointerOperand();
-    // When Ptr is array of resource, check if it is another GEP.
-    while (
-        dxilutil::IsHLSLResourceType(dxilutil::GetArrayEltTy(Ptr->getType()))) {
-      if (GEPOperator *ParentGEP = dyn_cast<GEPOperator>(Ptr)) {
-        GEP = ParentGEP;
-        Ptr = GEP->getPointerOperand();
-      } else {
-        break;
-      }
+    DxilObjectProperties &objectProperties) {
+  DxilResourceProperties RP = objectProperties.GetResource(arg);
+  if (RP.isValid())
+    return RP;
+
+  // Must be GEP.
+  GEPOperator *GEP = cast<GEPOperator>(arg);
+  // Find RP from GEP.
+  Value *Ptr = GEP->getPointerOperand();
+  // When Ptr is array of resource, check if it is another GEP.
+  while (
+      dxilutil::IsHLSLResourceType(dxilutil::GetArrayEltTy(Ptr->getType()))) {
+    if (GEPOperator *ParentGEP = dyn_cast<GEPOperator>(Ptr)) {
+      GEP = ParentGEP;
+      Ptr = GEP->getPointerOperand();
+    } else {
+      break;
     }
+  }
 
-    RPIt = valToResPropertiesMap.find(Ptr);
-    // When ptr is array of resource, ptr could be in
-    // valToResPropertiesMap.
-    if (RPIt != valToResPropertiesMap.end()) {
-      RP = RPIt->second;
-    } else {
-      DxilStructAnnotation *Anno = nullptr;
-
-      for (auto gepIt = gep_type_begin(GEP), E = gep_type_end(GEP); gepIt != E;
-           ++gepIt) {
-
-        if (StructType *ST = dyn_cast<StructType>(*gepIt)) {
-          Anno = typeSys.GetStructAnnotation(ST);
-          DXASSERT(Anno, "missing type annotation");
-
-          unsigned Index =
-              cast<ConstantInt>(gepIt.getOperand())->getLimitedValue();
-
-          DxilFieldAnnotation &fieldAnno = Anno->GetFieldAnnotation(Index);
-          if (fieldAnno.HasResourceAttribute()) {
-            MDNode *resAttrib = fieldAnno.GetResourceAttribute();
-            DxilResourceBase R(DXIL::ResourceClass::Invalid);
-            HLM.LoadDxilResourceBaseFromMDNode(resAttrib, R);
-            switch (R.GetClass()) {
-            case DXIL::ResourceClass::SRV:
-            case DXIL::ResourceClass::UAV: {
-              DxilResource Res;
-              HLM.LoadDxilResourceFromMDNode(resAttrib, Res);
-              RP = resource_helper::loadFromResourceBase(&Res);
-            } break;
-            case DXIL::ResourceClass::Sampler: {
-              DxilSampler Sampler;
-              HLM.LoadDxilSamplerFromMDNode(resAttrib, Sampler);
-              RP = resource_helper::loadFromResourceBase(&Sampler);
-            } break;
-            default:
-              DXASSERT(0, "invalid resource attribute in filed annotation");
-              break;
-            }
-            break;
-          }
+  // When ptr is array of resource, ptr could be in
+  // objectProperties.
+  RP = objectProperties.GetResource(Ptr);
+  if (RP.isValid())
+    return RP;
+
+  DxilStructAnnotation *Anno = nullptr;
+
+  for (auto gepIt = gep_type_begin(GEP), E = gep_type_end(GEP); gepIt != E;
+        ++gepIt) {
+
+    if (StructType *ST = dyn_cast<StructType>(*gepIt)) {
+      Anno = typeSys.GetStructAnnotation(ST);
+      DXASSERT(Anno, "missing type annotation");
+
+      unsigned Index =
+          cast<ConstantInt>(gepIt.getOperand())->getLimitedValue();
+
+      DxilFieldAnnotation &fieldAnno = Anno->GetFieldAnnotation(Index);
+      if (fieldAnno.HasResourceAttribute()) {
+        MDNode *resAttrib = fieldAnno.GetResourceAttribute();
+        DxilResourceBase R(DXIL::ResourceClass::Invalid);
+        HLM.LoadDxilResourceBaseFromMDNode(resAttrib, R);
+        switch (R.GetClass()) {
+        case DXIL::ResourceClass::SRV:
+        case DXIL::ResourceClass::UAV: {
+          DxilResource Res;
+          HLM.LoadDxilResourceFromMDNode(resAttrib, Res);
+          RP = resource_helper::loadPropsFromResourceBase(&Res);
+        } break;
+        case DXIL::ResourceClass::Sampler: {
+          DxilSampler Sampler;
+          HLM.LoadDxilSamplerFromMDNode(resAttrib, Sampler);
+          RP = resource_helper::loadPropsFromResourceBase(&Sampler);
+        } break;
+        default:
+          DXASSERT(0, "invalid resource attribute in filed annotation");
+          break;
         }
+        break;
       }
     }
   }
-  DXASSERT(RP.Class != DXIL::ResourceClass::Invalid,
-           "invalid resource properties");
+  DXASSERT(RP.isValid(), "invalid resource properties");
   return RP;
 }
 
 void AddOpcodeParamForIntrinsic(
     HLModule &HLM, Function *F, unsigned opcode, llvm::Type *HandleTy,
-    DenseMap<Value *, DxilResourceProperties> &valToResPropertiesMap) {
+    DxilObjectProperties &objectProperties) {
   llvm::Module &M = *HLM.GetModule();
   llvm::FunctionType *oldFuncTy = F->getFunctionType();
 
@@ -550,7 +650,7 @@ void AddOpcodeParamForIntrinsic(
       objVal = objGEP->getPointerOperand();
 
       DxilResourceProperties RP = GetResourcePropsFromIntrinsicObjectArg(
-          objVal, HLM, typeSys, valToResPropertiesMap);
+          objVal, HLM, typeSys, objectProperties);
 
       if (IndexList.size() > 1)
         objVal = Builder.CreateInBoundsGEP(objVal, IndexList);
@@ -590,7 +690,7 @@ void AddOpcodeParamForIntrinsic(
         if (dxilutil::IsHLSLResourceType(Ty)) {
 
           DxilResourceProperties RP = GetResourcePropsFromIntrinsicObjectArg(
-              arg, HLM, typeSys, valToResPropertiesMap);
+              arg, HLM, typeSys, objectProperties);
           // Use object type directly, not by pointer.
           // This will make sure temp object variable only used by ld/st.
           if (GEPOperator *argGEP = dyn_cast<GEPOperator>(arg)) {
@@ -634,7 +734,7 @@ void AddOpcodeParamForIntrinsic(
 
 void AddOpcodeParamForIntrinsics(
     HLModule &HLM, std::vector<std::pair<Function *, unsigned>> &intrinsicMap,
-    DenseMap<Value *, DxilResourceProperties> &valToResPropertiesMap) {
+    DxilObjectProperties &objectProperties) {
   llvm::Type *HandleTy = HLM.GetOP()->GetHandleType();
   for (auto mapIter : intrinsicMap) {
     Function *F = mapIter.first;
@@ -645,11 +745,11 @@ void AddOpcodeParamForIntrinsics(
     }
 
     unsigned opcode = mapIter.second;
-    AddOpcodeParamForIntrinsic(HLM, F, opcode, HandleTy, valToResPropertiesMap);
+    AddOpcodeParamForIntrinsic(HLM, F, opcode, HandleTy, objectProperties);
   }
 }
 
-}
+} // namespace
 
 namespace {
 
@@ -1000,7 +1100,7 @@ void ReplaceConstStaticGlobals(
     }
   }
 }
-}
+} // namespace CGHLSLMSHelper
 
 namespace {
 
@@ -1401,7 +1501,8 @@ void SimpleTransformForHLDXIRInst(Instruction *I, SmallInstSet &deadInsts) {
 
 namespace CGHLSLMSHelper {
 
-Value *TryEvalIntrinsic(CallInst *CI, IntrinsicOp intriOp, unsigned hlslVersion) {
+Value *TryEvalIntrinsic(CallInst *CI, IntrinsicOp intriOp,
+                        unsigned hlslVersion) {
   switch (intriOp) {
   case IntrinsicOp::IOP_tan: {
     return EvalUnaryIntrinsic(CI, tanf, tan);
@@ -1528,13 +1629,14 @@ Value *TryEvalIntrinsic(CallInst *CI, IntrinsicOp intriOp, unsigned hlslVersion)
     return EvalUnaryIntrinsic(CI, floorf, floor);
   } break;
   case IntrinsicOp::IOP_round: {
-    // round intrinsic could exhibit different behaviour for constant and runtime evaluations.
-    // E.g., for round(0.5): constant evaluation results in 1 (away from zero rounding), 
-    // while runtime evaluation results in 0 (nearest even rounding).
-    // 
-    // For back compat, DXC still preserves the above behavior for language versions 2016 or below.
-    // However, for newer language versions, DXC now always use nearest even for round() intrinsic in all
-    // cases.
+    // round intrinsic could exhibit different behaviour for constant and
+    // runtime evaluations. E.g., for round(0.5): constant evaluation results in
+    // 1 (away from zero rounding), while runtime evaluation results in 0
+    // (nearest even rounding).
+    //
+    // For back compat, DXC still preserves the above behavior for language
+    // versions 2016 or below. However, for newer language versions, DXC now
+    // always uses nearest even for the round() intrinsic in all cases.
     if (hlslVersion <= 2016) {
       return EvalUnaryIntrinsic(CI, roundf, round);
     } else {
@@ -1639,8 +1741,8 @@ bool RetrieveLastElementType(Type *Ty, Type *&EltTy) {
 // Offset still needs to be aligned based on type since this
 // is the legacy cbuffer global path.
 unsigned AlignCBufferOffset(unsigned offset, unsigned size, llvm::Type *Ty,
-                            bool bRowMajor,
-                            bool bMinPrecMode, bool &bCurRowIsMinPrec) {
+                            bool bRowMajor, bool bMinPrecMode,
+                            bool &bCurRowIsMinPrec) {
   DXASSERT(!(offset & 1), "otherwise we have an invalid offset.");
   bool bNeedNewRow = Ty->isArrayTy();
   // In min-precision mode, a new row is needed when
@@ -1651,7 +1753,8 @@ unsigned AlignCBufferOffset(unsigned offset, unsigned size, llvm::Type *Ty,
       if (HLMatrixType mat = HLMatrixType::dyn_cast(Ty)) {
         bNeedNewRow |= !bRowMajor && mat.getNumColumns() > 1;
         bNeedNewRow |= bRowMajor && mat.getNumRows() > 1;
-        bMinPrec = bMinPrecMode && mat.getElementType(false)->getScalarSizeInBits() < 32;
+        bMinPrec = bMinPrecMode &&
+                   mat.getElementType(false)->getScalarSizeInBits() < 32;
       } else {
         bNeedNewRow = true;
         if (bMinPrecMode) {
@@ -1678,11 +1781,10 @@ unsigned AlignCBufferOffset(unsigned offset, unsigned size, llvm::Type *Ty,
                                    bNeedNewRow);
 }
 
-unsigned
-AllocateDxilConstantBuffer(HLCBuffer &CB,
-                           std::unordered_map<Constant *, DxilFieldAnnotation>
-                               &constVarAnnotationMap,
-                           bool bMinPrecMode) {
+unsigned AllocateDxilConstantBuffer(
+    HLCBuffer &CB,
+    std::unordered_map<Constant *, DxilFieldAnnotation> &constVarAnnotationMap,
+    bool bMinPrecMode) {
   unsigned offset = 0;
 
   // Scan user allocated constants first.
@@ -1710,7 +1812,8 @@ AllocateDxilConstantBuffer(HLCBuffer &CB,
                                MatrixOrientation::RowMajor
                          : false;
     // Align offset.
-    offset = AlignCBufferOffset(offset, size, Ty, bRowMajor, bMinPrecMode, bCurRowIsMinPrec);
+    offset = AlignCBufferOffset(offset, size, Ty, bRowMajor, bMinPrecMode,
+                                bCurRowIsMinPrec);
     if (C->GetLowerBound() == UINT_MAX) {
       C->SetLowerBound(offset);
     }
@@ -1719,14 +1822,13 @@ AllocateDxilConstantBuffer(HLCBuffer &CB,
   return offset;
 }
 
-
 void AllocateDxilConstantBuffers(
     HLModule &HLM, std::unordered_map<Constant *, DxilFieldAnnotation>
                        &constVarAnnotationMap) {
   for (unsigned i = 0; i < HLM.GetCBuffers().size(); i++) {
     HLCBuffer &CB = *static_cast<HLCBuffer *>(&(HLM.GetCBuffer(i)));
-    unsigned size = AllocateDxilConstantBuffer(CB, constVarAnnotationMap,
-      HLM.GetHLOptions().bUseMinPrecision);
+    unsigned size = AllocateDxilConstantBuffer(
+        CB, constVarAnnotationMap, HLM.GetHLOptions().bUseMinPrecision);
     CB.SetSize(size);
   }
 }
@@ -1772,7 +1874,6 @@ void ReplaceUseInFunction(Value *V, Value *NewV, Function *F,
   }
 }
 
-
 void MarkUsedFunctionForConst(Value *V,
                               std::unordered_set<Function *> &usedFunc) {
   for (auto U = V->user_begin(); U != V->user_end();) {
@@ -1908,9 +2009,12 @@ bool CreateCBufferVariable(HLCBuffer &CB, HLModule &HLM, llvm::Type *HandleTy) {
     IRBuilder<> Builder(F.getEntryBlock().getFirstInsertionPt());
 
     // create HL subscript to make all the use of cbuffer start from it.
-    HandleArgs[HLOperandIndex::kCreateHandleResourceOpIdx-1] = cbGV;
+    HandleArgs[HLOperandIndex::kCreateHandleResourceOpIdx - 1] = cbGV;
     CallInst *Handle = HLM.EmitHLOperationCall(
         Builder, HLOpcodeGroup::HLCreateHandle, 0, HandleTy, HandleArgs, M);
+    CallInst *OrigHandle = Handle;
+    DxilResourceProperties RP = resource_helper::loadPropsFromResourceBase(&CB);
+    Handle = CreateAnnotateHandle(HLM, Handle, RP, cbGV->getType()->getElementType(), Builder);
 
     args[HLOperandIndex::kSubscriptObjectOpIdx] = Handle;
     Instruction *cbSubscript =
@@ -1974,13 +2078,16 @@ bool CreateCBufferVariable(HLCBuffer &CB, HLModule &HLM, llvm::Type *HandleTy) {
             idxList.push_back(GI.getOperand());
           }
 
-          HandleArgs[HLOperandIndex::kCreateHandleIndexOpIdx-1] = arrayIdx;
+          HandleArgs[HLOperandIndex::kCreateHandleIndexOpIdx - 1] = arrayIdx;
           CallInst *Handle =
 
               HLM.EmitHLOperationCall(*instBuilder,
                                       HLOpcodeGroup::HLCreateHandle, 0,
                                       HandleTy, HandleArgs, M);
 
+          DxilResourceProperties RP = resource_helper::loadPropsFromResourceBase(&CB);
+          Handle = CreateAnnotateHandle(HLM, Handle, RP, cbGV->getType()->getElementType(), *instBuilder);
+
           args[HLOperandIndex::kSubscriptObjectOpIdx] = Handle;
           args[HLOperandIndex::kSubscriptIndexOpIdx] = arrayIdx;
 
@@ -1998,6 +2105,7 @@ bool CreateCBufferVariable(HLCBuffer &CB, HLModule &HLM, llvm::Type *HandleTy) {
     if (cbSubscript->user_empty()) {
       cbSubscript->eraseFromParent();
       Handle->eraseFromParent();
+      OrigHandle->eraseFromParent();
     } else {
       // merge GEP use for cbSubscript.
       HLModule::MergeGepUse(cbSubscript);
@@ -2037,7 +2145,6 @@ void ConstructCBufferAnnotation(
   }
 }
 
-
 void ConstructCBuffer(
     HLModule &HLM, llvm::Type *CBufferType,
     std::unordered_map<Constant *, DxilFieldAnnotation> &AnnotationMap) {
@@ -2067,7 +2174,7 @@ void ConstructCBuffer(
     CB.GetConstants().clear();
   }
 }
-}
+} // namespace
 
 namespace CGHLSLMSHelper {
 
@@ -2147,7 +2254,7 @@ void TranslateRayQueryConstructor(HLModule &HLM) {
     pConstructorFunc->eraseFromParent();
   }
 }
-}
+} // namespace CGHLSLMSHelper
 
 namespace {
 
@@ -2198,7 +2305,6 @@ bool BuildImmInit(Function *Ctor) {
   return true;
 }
 
-
 } // namespace
 
 namespace CGHLSLMSHelper {
@@ -2247,9 +2353,9 @@ void ProcessCtorFunctions(llvm::Module &M, StringRef globalName,
   }
 }
 
-void FinishCBuffer(
-    HLModule &HLM, llvm::Type *CBufferType,
-    std::unordered_map<Constant *, DxilFieldAnnotation> &constVarAnnotationMap) {
+void FinishCBuffer(HLModule &HLM, llvm::Type *CBufferType,
+                   std::unordered_map<Constant *, DxilFieldAnnotation>
+                       &constVarAnnotationMap) {
   // Allocate constant buffers.
   AllocateDxilConstantBuffers(HLM, constVarAnnotationMap);
   // TODO: create temp variable for constant which has store use.
@@ -2297,7 +2403,6 @@ void AddRegBindingsForResourceInConstantBuffer(
   }
 }
 
-
 // extension codegen.
 void ExtensionCodeGen(HLModule &HLM, clang::CodeGen::CodeGenModule &CGM) {
   // Add semantic defines for extensions if any are available.
@@ -2369,8 +2474,8 @@ void ReportDisallowedTypeInExportParam(clang::CodeGen ::CodeGenModule &CGM,
 
 namespace CGHLSLMSHelper {
 void FinishClipPlane(HLModule &HLM, std::vector<Function *> &clipPlaneFuncList,
-                    std::unordered_map<Value *, DebugLoc> &debugInfoMap,
-                    clang::CodeGen::CodeGenModule &CGM) {
+                     std::unordered_map<Value *, DebugLoc> &debugInfoMap,
+                     clang::CodeGen::CodeGenModule &CGM) {
   bool bDebugInfo = CGM.getCodeGenOpts().getDebugInfo() ==
                     clang::CodeGenOptions::FullDebugInfo;
   Module &M = *HLM.GetModule();
@@ -2398,7 +2503,7 @@ void FinishClipPlane(HLModule &HLM, std::vector<Function *> &clipPlaneFuncList,
     }
   }
 }
-} // namespace
+} // namespace CGHLSLMSHelper
 
 namespace {
 void LowerExportFunctions(HLModule &HLM, clang::CodeGen::CodeGenModule &CGM,
@@ -2635,24 +2740,27 @@ void FinishEntries(
     }
   }
 }
-} // namespace
+} // namespace CGHLSLMSHelper
 
 namespace CGHLSLMSHelper {
 void FinishIntrinsics(
     HLModule &HLM, std::vector<std::pair<Function *, unsigned>> &intrinsicMap,
-    DenseMap<Value *, DxilResourceProperties> &valToResPropertiesMap) {
+    DxilObjectProperties &objectProperties) {
   // Lower getResourceHeap before AddOpcodeParamForIntrinsics to skip automatic
   // lower for getResourceFromHeap.
   LowerGetResourceFromHeap(HLM, intrinsicMap);
+  // Lower bitcast use of CBV into cbSubscript.
+  LowerDynamicCBVUseToHandle(HLM, objectProperties);
   // translate opcode into parameter for intrinsic functions
   // Do this before CloneShaderEntry and TranslateRayQueryConstructor to avoid
   // update valToResPropertiesMap for cloned inst.
-  AddOpcodeParamForIntrinsics(HLM, intrinsicMap, valToResPropertiesMap);
+  AddOpcodeParamForIntrinsics(HLM, intrinsicMap, objectProperties);
 }
 
 // Add the dx.break temporary intrinsic and create Call Instructions
 // to it for each branch that requires the artificial conditional.
-void AddDxBreak(Module &M, const SmallVector<llvm::BranchInst*, 16> &DxBreaks) {
+void AddDxBreak(Module &M,
+                const SmallVector<llvm::BranchInst *, 16> &DxBreaks) {
   if (DxBreaks.empty())
     return;
 
@@ -2662,7 +2770,8 @@ void AddDxBreak(Module &M, const SmallVector<llvm::BranchInst*, 16> &DxBreaks) {
   for (Function &F : M.functions()) {
     HLOpcodeGroup opgroup = hlsl::GetHLOpcodeGroup(&F);
     if (F.isDeclaration() && IsHLWaveSensitive(&F) &&
-        (opgroup == HLOpcodeGroup::HLIntrinsic || opgroup == HLOpcodeGroup::HLExtIntrinsic)) {
+        (opgroup == HLOpcodeGroup::HLIntrinsic ||
+         opgroup == HLOpcodeGroup::HLExtIntrinsic)) {
       for (User *U : F.users()) {
         CallInst *CI = cast<CallInst>(U);
         WaveUsers.insert(CI->getParent()->getParent());
@@ -2675,25 +2784,29 @@ void AddDxBreak(Module &M, const SmallVector<llvm::BranchInst*, 16> &DxBreaks) {
     return;
 
   // Create the dx.break function
-  FunctionType *FT = llvm::FunctionType::get(llvm::Type::getInt1Ty(M.getContext()), false);
-  Function *func = cast<llvm::Function>(M.getOrInsertFunction(DXIL::kDxBreakFuncName, FT));
+  FunctionType *FT =
+      llvm::FunctionType::get(llvm::Type::getInt1Ty(M.getContext()), false);
+  Function *func =
+      cast<llvm::Function>(M.getOrInsertFunction(DXIL::kDxBreakFuncName, FT));
   func->addFnAttr(Attribute::AttrKind::NoUnwind);
 
-  // For all break branches recorded previously, if the function they are in makes
-  // any use of a wave op, it may need to be artificially conditional. Make it so now.
-  // The CleanupDxBreak pass will remove those that aren't needed when more is known.
-  for(llvm::BranchInst *BI : DxBreaks) {
+  // For all break branches recorded previously, if the function they are in
+  // makes any use of a wave op, it may need to be artificially conditional.
+  // Make it so now. The CleanupDxBreak pass will remove those that aren't
+  // needed when more is known.
+  for (llvm::BranchInst *BI : DxBreaks) {
     if (WaveUsers.count(BI->getParent()->getParent())) {
       CallInst *Call = CallInst::Create(FT, func, ArrayRef<Value *>(), "", BI);
       BI->setCondition(Call);
       if (!BI->getMetadata(DXIL::kDxBreakMDName)) {
-        BI->setMetadata(DXIL::kDxBreakMDName, llvm::MDNode::get(BI->getContext(), {}));
+        BI->setMetadata(DXIL::kDxBreakMDName,
+                        llvm::MDNode::get(BI->getContext(), {}));
       }
     }
   }
 }
 
-}
+} // namespace CGHLSLMSHelper
 
 namespace CGHLSLMSHelper {
 
@@ -2846,7 +2959,8 @@ void InitRetValue(BasicBlock *exitBB) {
     Value *Init = UndefValue::get(Ty);
     if (Ty->isAggregateType()) {
      // TODO: support aggregate types and out parameters.
-      // Skip it here will cause undef on phi which the incoming path should never hit.
+      // Skipping it here will cause undef on the phi, which the incoming path
+      // should never hit.
     } else {
       B.CreateStore(Init, RetVAlloc);
     }
@@ -3036,11 +3150,15 @@ void StructurizeMultiRet(Module &M, clang::CodeGen::CodeGenModule &CGM,
                          bool bWaveEnabledStage,
                          SmallVector<BranchInst *, 16> &DxBreaks) {
   if (CGM.getCodeGenOpts().HLSLExtensionsCodegen) {
-    if (!CGM.getCodeGenOpts().HLSLExtensionsCodegen->IsOptionEnabled("structurize-returns"))
+    if (!CGM.getCodeGenOpts().HLSLExtensionsCodegen->IsOptionEnabled(
+            "structurize-returns"))
       return;
   } else {
-    if (!CGM.getCodeGenOpts().HLSLOptimizationToggles.count("structurize-returns") ||
-        !CGM.getCodeGenOpts().HLSLOptimizationToggles.find("structurize-returns")->second)
+    if (!CGM.getCodeGenOpts().HLSLOptimizationToggles.count(
+            "structurize-returns") ||
+        !CGM.getCodeGenOpts()
+             .HLSLOptimizationToggles.find("structurize-returns")
+             ->second)
       return;
   }
 
@@ -3053,4 +3171,23 @@ void StructurizeMultiRet(Module &M, clang::CodeGen::CodeGenModule &CGM,
     StructurizeMultiRetFunction(&F, it->second, bWaveEnabledStage, DxBreaks);
   }
 }
+
+bool DxilObjectProperties::AddResource(llvm::Value *V, const hlsl::DxilResourceProperties &RP) {
+  if (RP.isValid()) {
+    DXASSERT(!GetResource(V).isValid() || GetResource(V) == RP, "otherwise, property conflict");
+    resMap[V] = RP;
+    return true;
+  }
+  return false;
+}
+bool DxilObjectProperties::IsResource(llvm::Value *V) {
+  return resMap.count(V) != 0;
+}
+hlsl::DxilResourceProperties DxilObjectProperties::GetResource(llvm::Value *V) {
+  auto it = resMap.find(V);
+  if (it != resMap.end())
+    return it->second;
+  return DxilResourceProperties();
+}
+
 } // namespace CGHLSLMSHelper

+ 16 - 3
tools/clang/lib/CodeGen/CGHLSLMSHelper.h

@@ -5,6 +5,7 @@
 #include "clang/Basic/SourceLocation.h"
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/MapVector.h"
 
 #include "dxc/DXIL/DxilCBuffer.h"
 
@@ -140,6 +141,18 @@ private:
   llvm::SmallVector<Scope, 16> scopes;
 };
 
+// Map from value to resource properties.
+// This only collects object variables (global/local/parameter), not object
+// fields inside structs; those are saved via TypeAnnotation.
+struct DxilObjectProperties {
+  bool AddResource(llvm::Value *V, const hlsl::DxilResourceProperties &RP);
+  bool IsResource(llvm::Value *V);
+  hlsl::DxilResourceProperties GetResource(llvm::Value *V);
+
+  // MapVector for deterministic iteration order.
+  llvm::MapVector<llvm::Value *, hlsl::DxilResourceProperties> resMap;
+};
+
 // Align cbuffer offset in legacy mode (16 bytes per row).
 unsigned AlignBufferOffsetInLegacy(unsigned offset, unsigned size,
                                    unsigned scalarSizeInBytes,
@@ -157,9 +170,9 @@ void FinishEntries(hlsl::HLModule &HLM, const EntryFunctionInfo &Entry,
                        &patchConstantFunctionPropsMap);
 
 void FinishIntrinsics(
-    hlsl::HLModule &HLM, std::vector<std::pair<llvm::Function *, unsigned>> &intrinsicMap,
-    llvm::DenseMap<llvm::Value *, hlsl::DxilResourceProperties>
-        &valToResPropertiesMap);
+    hlsl::HLModule &HLM,
+    std::vector<std::pair<llvm::Function *, unsigned>> &intrinsicMap,
+    DxilObjectProperties &valToResPropertiesMap);
 
 void AddDxBreak(llvm::Module &M, const llvm::SmallVector<llvm::BranchInst*, 16> &DxBreaks);
 

+ 2 - 0
tools/clang/lib/CodeGen/CodeGenTypes.cpp

@@ -426,6 +426,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     case BuiltinType::Min16Int:
     case BuiltinType::Min16UInt:
     case BuiltinType::LitInt:
+    case BuiltinType::Int8_4Packed:
+    case BuiltinType::UInt8_4Packed:
     // HLSL Change Ends
     case BuiltinType::Char16:
     case BuiltinType::Char32:

+ 2 - 0
tools/clang/lib/CodeGen/ItaniumCXXABI.cpp

@@ -2376,6 +2376,8 @@ static bool TypeInfoIsInStandardLibrary(const BuiltinType *Ty) {
     case BuiltinType::Min16Float:
     case BuiltinType::HalfFloat:
     case BuiltinType::LitFloat:
+    case BuiltinType::Int8_4Packed:
+    case BuiltinType::UInt8_4Packed:
       llvm_unreachable("FIXME: HLSL types are unsupported!");
       break;
   }

+ 5 - 1
tools/clang/lib/Index/USRGeneration.cpp

@@ -615,7 +615,11 @@ void USRGenerator::VisitType(QualType T) {
           c = '?'; break;
         case BuiltinType::LitInt:
           c = '?'; break;
-        // HLSL Change Ends
+        case BuiltinType::Int8_4Packed:
+          c = '?'; break;
+        case BuiltinType::UInt8_4Packed:
+          c = '?'; break;
+          // HLSL Change Ends
         case BuiltinType::NullPtr:
           c = 'n'; break;
 #define BUILTIN_TYPE(Id, SingletonId)

+ 10 - 2
tools/clang/lib/Parse/HLSLRootSignature.cpp

@@ -276,7 +276,8 @@ void RootSignatureTokenizer::ReadNextToken(uint32_t BufferIdx)
               KW(COMPARISON_GREATER) ||
               KW(COMPARISON_NOT_EQUAL) ||
               KW(COMPARISON_GREATER_EQUAL) ||
-              KW(COMPARISON_ALWAYS);
+              KW(COMPARISON_ALWAYS) ||
+              KW(CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED);
         break;
 
     case 'D':
@@ -359,13 +360,14 @@ void RootSignatureTokenizer::ReadNextToken(uint32_t BufferIdx)
 
     case 'S':
         bKW = KW(space) || KW(Sampler) || KW(StaticSampler) || KW(SRV) ||
+              KW(CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED) || KW(SAMPLER_HEAP_DIRECTLY_INDEXED) ||
               KW(SHADER_VISIBILITY_ALL)      ||  KW(SHADER_VISIBILITY_VERTEX) || 
               KW(SHADER_VISIBILITY_HULL)     || KW(SHADER_VISIBILITY_DOMAIN)  ||
               KW(SHADER_VISIBILITY_GEOMETRY) || KW(SHADER_VISIBILITY_PIXEL) ||
               KW(SHADER_VISIBILITY_AMPLIFICATION) || KW(SHADER_VISIBILITY_MESH) ||
               KW(STATIC_BORDER_COLOR_TRANSPARENT_BLACK) ||
               KW(STATIC_BORDER_COLOR_OPAQUE_BLACK) ||
-              KW(STATIC_BORDER_COLOR_OPAQUE_WHITE);
+              KW(STATIC_BORDER_COLOR_OPAQUE_WHITE) || KW(SAMPLER_HEAP_DIRECTLY_INDEXED);
         break;
 
     case 'T':
@@ -743,6 +745,12 @@ HRESULT RootSignatureParser::ParseRootSignatureFlags(DxilRootSignatureFlags & Fl
                   IFC(Error(ERR_RS_LOCAL_FLAG_ON_GLOBAL, "LOCAL_ROOT_SIGNATURE flag used in global root signature"));
                 Flags |= DxilRootSignatureFlags::LocalRootSignature;
                 break;
+            case TokenType::CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED:
+                Flags |= DxilRootSignatureFlags::CBVSRVUAVHeapDirectlyIndexed;
+                break;
+            case TokenType::SAMPLER_HEAP_DIRECTLY_INDEXED:
+                Flags |= DxilRootSignatureFlags::SamplerHeapDirectlyIndexed;
+                break;
             default:
                 IFC(Error(ERR_RS_UNEXPECTED_TOKEN, "Expected a root signature flag value, found: '%s'", Token.GetStr()));
             }
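
For context, a minimal sketch of a root signature string that opts into the two flags tokenized and parsed above; the root signature macro name, entry point, and thread-group size are illustrative, not taken from this change:

    #define MainRS "RootFlags(CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED | SAMPLER_HEAP_DIRECTLY_INDEXED)"

    // Illustrative cs_6_6 entry point; the flags let ResourceDescriptorHeap /
    // SamplerDescriptorHeap be indexed without any descriptor-table bindings.
    [RootSignature(MainRS)]
    [numthreads(1, 1, 1)]
    void main() {}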

+ 2 - 0
tools/clang/lib/Parse/HLSLRootSignature.h

@@ -84,6 +84,8 @@ public:
             DENY_MESH_SHADER_ROOT_ACCESS,
             ALLOW_STREAM_OUTPUT,
             LOCAL_ROOT_SIGNATURE,
+            CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED,
+            SAMPLER_HEAP_DIRECTLY_INDEXED,
 
             // Filter
             filter,

+ 1 - 0
tools/clang/lib/Parse/ParseDecl.cpp

@@ -742,6 +742,7 @@ void Parser::ParseGNUAttributeArgs(IdentifierInfo *AttrName,
     case AttributeList::AT_HLSLPatchConstantFunc:
     case AttributeList::AT_HLSLMaxVertexCount:
     case AttributeList::AT_HLSLUnroll:
+    case AttributeList::AT_HLSLWaveSize:
     case AttributeList::AT_NoInline:
     // The following are not accepted in [attribute(param)] syntax:
     //case AttributeList::AT_HLSLCentroid:

+ 2 - 0
tools/clang/lib/SPIRV/SpirvEmitter.cpp

@@ -2769,6 +2769,8 @@ SpirvInstruction *SpirvEmitter::processFlatConversion(
         case BuiltinType::LongLong:
         case BuiltinType::ULong:
         case BuiltinType::ULongLong:
+        case BuiltinType::Int8_4Packed:
+        case BuiltinType::UInt8_4Packed:
           return castToInt(initInstr, initType, ty, srcLoc);
         // Target type is a float variant.
         case BuiltinType::Double:

+ 4 - 0
tools/clang/lib/Sema/DeclSpec.cpp

@@ -316,6 +316,8 @@ bool Declarator::isDeclarationOfFunction() const {
     case TST_min16uint:
     case TST_min10float:
     case TST_min12int:
+    case TST_int8_4packed:
+    case TST_uint8_4packed:
     // HLSL Change Ends
       return false;
 
@@ -469,6 +471,8 @@ const char *DeclSpec::getSpecifierName(DeclSpec::TST T,
   case DeclSpec::TST_min10float:  return "min10float";
   case DeclSpec::TST_min12int:    return "min12int";
   case DeclSpec::TST_halffloat:
+  case DeclSpec::TST_int8_4packed:  return "int8_t4_packed";
+  case DeclSpec::TST_uint8_4packed: return "uint8_t4_packed";
     // HLSL Change Ends
   case DeclSpec::TST_half:        return "half";
   case DeclSpec::TST_float:       return "float";

+ 137 - 12
tools/clang/lib/Sema/SemaHLSL.cpp

@@ -65,6 +65,8 @@ enum ArBasicKind {
   AR_BASIC_MIN12INT,
   AR_BASIC_MIN16INT,
   AR_BASIC_MIN16UINT,
+  AR_BASIC_INT8_4PACKED,
+  AR_BASIC_UINT8_4PACKED,
   AR_BASIC_ENUM,
 
   AR_BASIC_COUNT,
@@ -207,8 +209,9 @@ enum ArBasicKind {
   // RayQuery
   AR_OBJECT_RAY_QUERY,
 
-  // Resource
-  AR_OBJECT_RESOURCE,
+  // Heap Resource
+  AR_OBJECT_HEAP_RESOURCE,
+  AR_OBJECT_HEAP_SAMPLER,
   AR_BASIC_MAXIMUM_COUNT
 };
 
@@ -362,6 +365,8 @@ const UINT g_uBasicKindProps[] =
   BPROP_PRIMITIVE | BPROP_NUMERIC | BPROP_INTEGER | BPROP_BITS12 | BPROP_MIN_PRECISION,   // AR_BASIC_MIN12INT
   BPROP_PRIMITIVE | BPROP_NUMERIC | BPROP_INTEGER | BPROP_BITS16 | BPROP_MIN_PRECISION,   // AR_BASIC_MIN16INT
   BPROP_PRIMITIVE | BPROP_NUMERIC | BPROP_INTEGER | BPROP_UNSIGNED | BPROP_BITS16 | BPROP_MIN_PRECISION,  // AR_BASIC_MIN16UINT
+  BPROP_PRIMITIVE | BPROP_NUMERIC | BPROP_INTEGER | BPROP_UNSIGNED | BPROP_BITS32,// AR_BASIC_INT8_4PACKED
+  BPROP_PRIMITIVE | BPROP_NUMERIC | BPROP_INTEGER | BPROP_UNSIGNED | BPROP_BITS32,// AR_BASIC_UINT8_4PACKED
 
   BPROP_ENUM | BPROP_NUMERIC | BPROP_INTEGER, // AR_BASIC_ENUM
   BPROP_OTHER,  // AR_BASIC_COUNT
@@ -492,7 +497,8 @@ const UINT g_uBasicKindProps[] =
   0,      //AR_OBJECT_RAYTRACING_PIPELINE_CONFIG1,
 
   0,      //AR_OBJECT_RAY_QUERY,
-  0,      //AR_OBJECT_RESOURCE,
+  0,      //AR_OBJECT_HEAP_RESOURCE,
+  0,      //AR_OBJECT_HEAP_SAMPLER,
   // AR_BASIC_MAXIMUM_COUNT
 };
 
@@ -1122,7 +1128,7 @@ static const ArBasicKind g_Texture2DArrayCT[] =
   AR_BASIC_UNKNOWN
 };
 
-static const ArBasicKind g_ResourceCT[] = {AR_OBJECT_RESOURCE,
+static const ArBasicKind g_ResourceCT[] = {AR_OBJECT_HEAP_RESOURCE,
                                            AR_BASIC_UNKNOWN};
 
 static const ArBasicKind g_RayDescCT[] =
@@ -1209,6 +1215,65 @@ static const ArBasicKind g_Int32OnlyCT[] =
   AR_BASIC_UNKNOWN
 };
 
+static const ArBasicKind g_Float32OnlyCT[] =
+{
+  AR_BASIC_FLOAT32,
+  AR_BASIC_LITERAL_FLOAT,
+  AR_BASIC_NOCAST,
+  AR_BASIC_UNKNOWN
+};
+
+static const ArBasicKind g_Int64OnlyCT[] =
+{
+  AR_BASIC_UINT64,
+  AR_BASIC_INT64,
+  AR_BASIC_LITERAL_INT,
+  AR_BASIC_NOCAST,
+  AR_BASIC_UNKNOWN
+};
+
+static const ArBasicKind g_AnyInt64CT[] =
+{
+  AR_BASIC_INT64,
+  AR_BASIC_UINT64,
+  AR_BASIC_LITERAL_INT,
+  AR_BASIC_UNKNOWN
+};
+
+static const ArBasicKind g_Int8_4PackedCT[] = 
+{
+  AR_BASIC_INT8_4PACKED,
+  AR_BASIC_UINT32,
+  AR_BASIC_LITERAL_INT,
+  AR_BASIC_UNKNOWN
+};
+
+static const ArBasicKind g_UInt8_4PackedCT[] =
+{
+  AR_BASIC_UINT8_4PACKED,
+  AR_BASIC_UINT32,
+  AR_BASIC_LITERAL_INT,
+  AR_BASIC_UNKNOWN
+};
+
+static const ArBasicKind g_AnyInt16Or32CT[] = {
+  AR_BASIC_INT32,
+  AR_BASIC_UINT32,
+  AR_BASIC_INT16,
+  AR_BASIC_UINT16,
+  AR_BASIC_LITERAL_INT,
+  AR_BASIC_UNKNOWN
+};
+
+static const ArBasicKind g_SInt16Or32OnlyCT[] =
+{
+  AR_BASIC_INT32,
+  AR_BASIC_INT16,
+  AR_BASIC_LITERAL_INT,
+  AR_BASIC_NOCAST,
+  AR_BASIC_UNKNOWN
+};
+
 // Basic kinds, indexed by a LEGAL_INTRINSIC_COMPTYPES value.
 const ArBasicKind* g_LegalIntrinsicCompTypes[] =
 {
@@ -1249,6 +1314,13 @@ const ArBasicKind* g_LegalIntrinsicCompTypes[] =
   g_Texture2DArrayCT,   // LICOMPTYPE_TEXTURE2DARRAY
   g_ResourceCT,         // LICOMPTYPE_RESOURCE
   g_Int32OnlyCT,        // LICOMPTYPE_INT32_ONLY
+  g_Int64OnlyCT,        // LICOMPTYPE_INT64_ONLY
+  g_AnyInt64CT,         // LICOMPTYPE_ANY_INT64
+  g_Float32OnlyCT,      // LICOMPTYPE_FLOAT32_ONLY
+  g_Int8_4PackedCT,     // LICOMPTYPE_INT8_4PACKED
+  g_UInt8_4PackedCT,    // LICOMPTYPE_UINT8_4PACKED
+  g_AnyInt16Or32CT,     // LICOMPTYPE_ANY_INT16_OR_32
+  g_SInt16Or32OnlyCT,   // LICOMPTYPE_SINT16_OR_32_ONLY
 };
 static_assert(ARRAYSIZE(g_LegalIntrinsicCompTypes) == LICOMPTYPE_COUNT,
   "Intrinsic comp type table must be updated when new enumerants are added.");
@@ -1343,7 +1415,8 @@ const ArBasicKind g_ArBasicKindsAsTypes[] =
   AR_OBJECT_RAYTRACING_PIPELINE_CONFIG1,
 
   AR_OBJECT_RAY_QUERY,
-  AR_OBJECT_RESOURCE,
+  AR_OBJECT_HEAP_RESOURCE,
+  AR_OBJECT_HEAP_SAMPLER,
 };
 
 // Count of template arguments for basic kind of objects that look like templates (one or more type arguments).
@@ -1432,7 +1505,8 @@ const uint8_t g_ArBasicKindsTemplateCount[] =
   0, // AR_OBJECT_RAYTRACING_PIPELINE_CONFIG1,
 
   1, // AR_OBJECT_RAY_QUERY,
-  0, // AR_OBJECT_RESOURCE,
+  0, // AR_OBJECT_HEAP_RESOURCE,
+  0, // AR_OBJECT_HEAP_SAMPLER,
 };
 
 C_ASSERT(_countof(g_ArBasicKindsAsTypes) == _countof(g_ArBasicKindsTemplateCount));
@@ -1531,7 +1605,8 @@ const SubscriptOperatorRecord g_ArBasicKindsSubscripts[] =
   { 0, MipsFalse, SampleFalse },  // AR_OBJECT_RAYTRACING_PIPELINE_CONFIG1,
 
   { 0, MipsFalse, SampleFalse },  // AR_OBJECT_RAY_QUERY,
-  { 0, MipsFalse, SampleFalse },  // AR_OBJECT_RESOURCE,
+  { 0, MipsFalse, SampleFalse },  // AR_OBJECT_HEAP_RESOURCE,
+  { 0, MipsFalse, SampleFalse },  // AR_OBJECT_HEAP_SAMPLER,
 };
 
 C_ASSERT(_countof(g_ArBasicKindsAsTypes) == _countof(g_ArBasicKindsSubscripts));
@@ -1545,6 +1620,7 @@ const char* g_ArBasicTypeNames[] =
   "int", "uint", "long", "ulong",
   "min10float", "min16float",
   "min12int", "min16int", "min16uint",
+  "int8_t4_packed", "uint8_t4_packed",
   "enum",
 
   "<count>",
@@ -1651,7 +1727,8 @@ const char* g_ArBasicTypeNames[] =
   "RaytracingPipelineConfig1",
 
   "RayQuery",
-  "Resource",
+  "HEAP_Resource",
+  "HEAP_Sampler",
 };
 
 C_ASSERT(_countof(g_ArBasicTypeNames) == AR_BASIC_MAXIMUM_COUNT);
@@ -1747,6 +1824,8 @@ static bool IsAtomicOperation(IntrinsicOp op) {
   case IntrinsicOp::IOP_InterlockedAnd:
   case IntrinsicOp::IOP_InterlockedCompareExchange:
   case IntrinsicOp::IOP_InterlockedCompareStore:
+  case IntrinsicOp::IOP_InterlockedCompareExchangeFloatBitwise:
+  case IntrinsicOp::IOP_InterlockedCompareStoreFloatBitwise:
   case IntrinsicOp::IOP_InterlockedExchange:
   case IntrinsicOp::IOP_InterlockedMax:
   case IntrinsicOp::IOP_InterlockedMin:
@@ -1761,6 +1840,18 @@ static bool IsAtomicOperation(IntrinsicOp op) {
   case IntrinsicOp::MOP_InterlockedMin:
   case IntrinsicOp::MOP_InterlockedOr:
   case IntrinsicOp::MOP_InterlockedXor:
+  case IntrinsicOp::MOP_InterlockedAdd64:
+  case IntrinsicOp::MOP_InterlockedAnd64:
+  case IntrinsicOp::MOP_InterlockedCompareExchange64:
+  case IntrinsicOp::MOP_InterlockedCompareStore64:
+  case IntrinsicOp::MOP_InterlockedExchange64:
+  case IntrinsicOp::MOP_InterlockedMax64:
+  case IntrinsicOp::MOP_InterlockedMin64:
+  case IntrinsicOp::MOP_InterlockedOr64:
+  case IntrinsicOp::MOP_InterlockedXor64:
+  case IntrinsicOp::MOP_InterlockedExchangeFloat:
+  case IntrinsicOp::MOP_InterlockedCompareExchangeFloatBitwise:
+  case IntrinsicOp::MOP_InterlockedCompareStoreFloatBitwise:
     return true;
   default:
     return false;
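
A hedged HLSL sketch of the atomic overloads these new IntrinsicOp entries correspond to; resource bindings, values, and the thread-group size are illustrative, and a 6.6 target is assumed for the 64-bit overloads:

    RWStructuredBuffer<uint64_t> g_counts : register(u0); // 64-bit atomic destination
    RWStructuredBuffer<float>    g_floats : register(u1); // float bitwise compare-store destination

    [numthreads(64, 1, 1)]
    void main(uint gi : SV_GroupIndex) {
      uint64_t original;
      InterlockedAdd(g_counts[0], uint64_t(1), original); // 64-bit overload of InterlockedAdd
      InterlockedMax(g_counts[1], uint64_t(gi));          // 64-bit overload of InterlockedMax
      // Bitwise compare-and-store on a float destination, matching
      // IOP_InterlockedCompareStoreFloatBitwise added above.
      InterlockedCompareStoreFloatBitwise(g_floats[0], 0.0f, 1.5f);
    }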
@@ -3468,8 +3559,17 @@ private:
         recordDecl = DeclareConstantBufferViewType(*m_context, /*bTBuf*/true);
       } else if (kind == AR_OBJECT_RAY_QUERY) {
         recordDecl = DeclareRayQueryType(*m_context);
-      } else if (kind == AR_OBJECT_RESOURCE) {
-        recordDecl = DeclareResourceType(*m_context);
+      } else if (kind == AR_OBJECT_HEAP_RESOURCE) {
+        recordDecl = DeclareResourceType(*m_context, /*bSampler*/false);
+        // create Resource ResourceDescriptorHeap;
+        DeclareBuiltinGlobal("ResourceDescriptorHeap",
+                             m_context->getRecordType(recordDecl), *m_context);
+      } else if (kind == AR_OBJECT_HEAP_SAMPLER) {
+        recordDecl = DeclareResourceType(*m_context, /*bSampler*/true);
+        // create Sampler SamplerDescriptorHeap;
+        DeclareBuiltinGlobal("SamplerDescriptorHeap",
+                             m_context->getRecordType(recordDecl), *m_context);
+
       }
       else if (kind == AR_OBJECT_FEEDBACKTEXTURE2D) {
         recordDecl = DeclareUIntTemplatedTypeWithHandle(*m_context, "FeedbackTexture2D", "kind");
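
These built-in globals back the dynamic-resource syntax; a hedged usage sketch follows, in which the heap indices, output binding, and sampled texture are illustrative (a 6.6 target and the heap-directly-indexed root signature flags are assumed):

    RWStructuredBuffer<float4> g_out : register(u0);

    [numthreads(8, 8, 1)]
    void main(uint2 id : SV_DispatchThreadID) {
      // Index the heaps directly instead of declaring bound resources.
      Texture2D<float4> tex  = ResourceDescriptorHeap[0]; // flat conversion from heap element
      SamplerState      samp = SamplerDescriptorHeap[0];
      g_out[id.y * 8 + id.x] = tex.SampleLevel(samp, float2(0.5, 0.5), 0);
    }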
@@ -4020,6 +4120,8 @@ public:
       case BuiltinType::Min10Float: return AR_BASIC_MIN10FLOAT;
       case BuiltinType::LitFloat: return AR_BASIC_LITERAL_FLOAT;
       case BuiltinType::LitInt: return AR_BASIC_LITERAL_INT;
+      case BuiltinType::Int8_4Packed: return AR_BASIC_INT8_4PACKED;
+      case BuiltinType::UInt8_4Packed: return AR_BASIC_UINT8_4PACKED;
       default:
         // Only builtin types that have basickind equivalents.
         break;
@@ -4100,6 +4202,8 @@ public:
     case AR_BASIC_MIN12INT:       return HLSLScalarType_int_min12;
     case AR_BASIC_MIN16INT:       return HLSLScalarType_int_min16;
     case AR_BASIC_MIN16UINT:      return HLSLScalarType_uint_min16;
+    case AR_BASIC_INT8_4PACKED:   return HLSLScalarType_int8_4packed;
+    case AR_BASIC_UINT8_4PACKED:  return HLSLScalarType_uint8_4packed;
 
     case AR_BASIC_INT64:          return HLSLScalarType_int64;
     case AR_BASIC_UINT64:         return HLSLScalarType_uint64;
@@ -4135,6 +4239,8 @@ public:
     case AR_BASIC_MIN12INT:       return m_scalarTypes[HLSLScalarType_int_min12];
     case AR_BASIC_MIN16INT:       return m_scalarTypes[HLSLScalarType_int_min16];
     case AR_BASIC_MIN16UINT:      return m_scalarTypes[HLSLScalarType_uint_min16];
+    case AR_BASIC_INT8_4PACKED:   return m_scalarTypes[HLSLScalarType_int8_4packed];
+    case AR_BASIC_UINT8_4PACKED:  return m_scalarTypes[HLSLScalarType_uint8_4packed];
     case AR_BASIC_ENUM:           return m_context->IntTy;
     case AR_BASIC_ENUM_CLASS:     return m_context->IntTy;
 
@@ -4159,7 +4265,8 @@ public:
     case AR_OBJECT_SAMPLER:
     case AR_OBJECT_SAMPLERCOMPARISON:
 
-    case AR_OBJECT_RESOURCE:
+    case AR_OBJECT_HEAP_RESOURCE:
+    case AR_OBJECT_HEAP_SAMPLER:
 
     case AR_OBJECT_BUFFER:
 
@@ -5240,6 +5347,8 @@ void HLSLExternalSource::AddBaseTypes()
   m_baseTypes[HLSLScalarType_int_min12] = m_context->Min12IntTy;
   m_baseTypes[HLSLScalarType_int_min16] = m_context->Min16IntTy;
   m_baseTypes[HLSLScalarType_uint_min16] = m_context->Min16UIntTy;
+  m_baseTypes[HLSLScalarType_int8_4packed] = m_context->Int8_4PackedTy;
+  m_baseTypes[HLSLScalarType_uint8_4packed] = m_context->UInt8_4PackedTy;
   m_baseTypes[HLSLScalarType_float_lit] = m_context->LitFloatTy;
   m_baseTypes[HLSLScalarType_int_lit] = m_context->LitIntTy;
   m_baseTypes[HLSLScalarType_int16] = m_context->ShortTy;
@@ -8654,7 +8763,9 @@ bool HLSLExternalSource::CanConvert(
   }
 
   // Cast from Resource to Object types.
-  if (SourceInfo.EltKind == AR_OBJECT_RESOURCE) {
+  if (SourceInfo.EltKind == AR_OBJECT_HEAP_RESOURCE ||
+      SourceInfo.EltKind == AR_OBJECT_HEAP_SAMPLER) {
+    // TODO: skip things like PointStream.
     if (TargetInfo.ShapeKind == AR_TOBJ_OBJECT) {
       Second = ICK_Flat_Conversion;
       goto lSuccess;
@@ -11511,6 +11622,10 @@ void hlsl::HandleDeclAttributeForHLSL(Sema &S, Decl *D, const AttributeList &A,
   case AttributeList::AT_HLSLWaveSensitive:
     declAttr = ::new (S.Context) HLSLWaveSensitiveAttr(A.getRange(), S.Context, A.getAttributeSpellingListIndex());
     break;
+  case AttributeList::AT_HLSLWaveSize:
+    declAttr = ::new (S.Context) HLSLWaveSizeAttr(A.getRange(), S.Context,
+      ValidateAttributeIntArg(S, A), A.getAttributeSpellingListIndex());
+    break;
   default:
     Handled = false;
     break;  // SPIRV Change: was return;
@@ -12823,6 +12938,15 @@ void hlsl::CustomPrintHLSLAttr(const clang::Attr *A, llvm::raw_ostream &Out, con
     break;
   }
   
+  case clang::attr::HLSLWaveSize:
+  {
+    Attr * noconst = const_cast<Attr*>(A);
+    HLSLWaveSizeAttr *ACast = static_cast<HLSLWaveSizeAttr*>(noconst);
+    Indent(Indentation, Out);
+    Out << "[wavesize(" << ACast->getSize() << ")]\n";
+    break;
+  }
+
   // Variable modifiers
   case clang::attr::HLSLGroupShared:
     Out << "groupshared ";
@@ -12949,6 +13073,7 @@ bool hlsl::IsHLSLAttr(clang::attr::Kind AttrKind) {
   case clang::attr::NoInline:
   case clang::attr::HLSLExport:
   case clang::attr::HLSLWaveSensitive:
+  case clang::attr::HLSLWaveSize:
   case clang::attr::VKBinding:
   case clang::attr::VKBuiltIn:
   case clang::attr::VKConstantId:

+ 2 - 0
tools/clang/lib/Sema/SemaTemplateVariadic.cpp

@@ -747,6 +747,8 @@ bool Sema::containsUnexpandedParameterPacks(Declarator &D) {
   case TST_min16uint:
   case TST_min10float:
   case TST_min12int:
+  case TST_int8_4packed:
+  case TST_uint8_4packed:
   // HLSL Change End
     break;
   }

+ 2 - 0
tools/clang/lib/Sema/SemaType.cpp

@@ -1340,6 +1340,8 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
   case DeclSpec::TST_min16uint: Result = Context.Min16UIntTy; break;
   case DeclSpec::TST_min10float: Result = Context.Min10FloatTy; break;
   case DeclSpec::TST_min12int: Result = Context.Min12IntTy; break;
+  case DeclSpec::TST_int8_4packed: Result = Context.Int8_4PackedTy; break;
+  case DeclSpec::TST_uint8_4packed: Result = Context.UInt8_4PackedTy; break;
     // HLSL Change Ends
   case DeclSpec::TST_double:
     if (DS.getTypeSpecWidth() == DeclSpec::TSW_long)
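
The two packed type specifiers map to 32-bit unsigned storage; a hedged sketch of typical use with the SM 6.6 pack/unpack intrinsics (intrinsic spellings per the 6.6 spec, not shown in this diff; the buffer binding is illustrative):

    RWByteAddressBuffer g_out : register(u0);

    [numthreads(1, 1, 1)]
    void main() {
      int4 v = int4(-1, 2, -3, 4);
      int8_t4_packed p = pack_s8(v);   // four signed 8-bit lanes in one 32-bit value
      int4 w = unpack_s8s32(p);        // sign-extend back to 32-bit lanes
      g_out.Store(0, (uint)p);         // store the raw 32-bit packed value
      g_out.Store(4, asuint(w.x));
    }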

File diff suppressed because it is too large
+ 256 - 116
tools/clang/lib/Sema/gen_intrin_main_tables_15.h


+ 19 - 0
tools/clang/test/CodeGenHLSL/attributes_wavesize.hlsl

@@ -0,0 +1,19 @@
+// RUN: %dxc -E main -T cs_6_6 %s | FileCheck %s
+// RUN: %dxc -E main -T cs_6_6 %s -D WAVESIZE=13 | FileCheck %s -check-prefixes=CHECK-ERR
+// RUN: %dxc -E main -T cs_6_6 %s -D WAVESIZE=2  | FileCheck %s -check-prefixes=CHECK-ERR
+
+// CHECK: @main, !"main", null, null, [[PROPS:![0-9]+]]}
+// CHECK: [[PROPS]] = !{i32 4, [[NT:![0-9]+]], i32 11, [[WS:![0-9]+]]}
+// CHECK: [[NT]] = !{i32 1, i32 1, i32 8}
+// CHECK: [[WS]] = !{i32 32}
+
+// CHECK-ERR: error: WaveSize value must be between 4 and 128 and a power of 2
+
+#ifndef WAVESIZE
+#define WAVESIZE 32
+#endif
+
+[wavesize(WAVESIZE)]
+[numthreads(1,1,8)]
+void main() {
+}

+ 1257 - 1
tools/clang/test/HLSL/ShaderOpArith.xml

@@ -87,6 +87,414 @@
     ]]>
     </Shader>
   </ShaderOp>
+
+  <ShaderOp Name="Derivatives" PS="PS" VS="VS" CS="CS" AS="AS" MS="MS" TopologyType="TRIANGLE">
+    <RootSignature>
+      RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT),
+      DescriptorTable(SRV(t0,numDescriptors=1), UAV(u0), UAV(u1), UAV(u2)),
+      StaticSampler(s0, addressU = TEXTURE_ADDRESS_WRAP, addressV = TEXTURE_ADDRESS_WRAP, filter = FILTER_MIN_MAG_LINEAR_MIP_POINT)
+    </RootSignature>
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f, 0.0f }, { 0.0f, 0.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { 1.0f, -1.0f, 0.0f }, { 1.0f, 1.0f } }
+    </Resource>
+    <Resource Name="T0" Dimension="Texture2D" Width="4" Height="4" InitialResourceState="COPY_DEST" Init="FromBytes" Format="R32_FLOAT">
+      {.125f, .25f, .5f, 1.0f},
+      {2.0f, 4.0f, 16.0f, 32.0f},
+      {32.0f, 64.0f, 128.0f, 256.0f},
+      {256.0f, 512.0f, 1024.0f, 2048.0f}
+    </Resource>
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="32" Height="32" Format="R32G32B32A32_FLOAT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" ReadBack="true" />
+    <Resource Name="U0" Dimension="BUFFER" Width="16384"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U1" Dimension="BUFFER" Width="16384"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U2" Dimension="BUFFER" Width="16384"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+
+    <RootValues>
+      <RootValue HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='T0' Kind='SRV' ResName='T0' />
+      <Descriptor Name='U0' Kind='UAV' ResName='U0'
+                  NumElements="1024" StructureByteStride="16" />
+      <Descriptor Name='U1' Kind='UAV' ResName='U1'
+                  NumElements="1024" StructureByteStride="16" />
+      <Descriptor Name='U2' Kind='UAV' ResName='U2'
+                  NumElements="1024" StructureByteStride="16" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+      <InputElement SemanticName="TEXCOORD" Format="R32G32_FLOAT" AlignedByteOffset="12" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="CS" Target="cs_6_6" EntryPoint="CSMain" Text="@PS"/>
+    <Shader Name="AS" Target="as_6_6" EntryPoint="ASMain" Text="@PS"/>
+    <Shader Name="MS" Target="ms_6_6" EntryPoint="MSMain" Text="@PS"/>
+    <Shader Name="VS" Target="vs_6_0" EntryPoint="VSMain" Text="@PS"/>
+    <Shader Name="PS" Target="ps_6_0" EntryPoint="PSMain">
+      <![CDATA[
+        struct PSInput {
+          float4 position : SV_POSITION;
+          float2 uv : TEXCOORD;
+        };
+        Texture2D<float> g_tex : register(t0);
+        RWStructuredBuffer<float4> g_bufMain : register(u0);
+        RWStructuredBuffer<float4> g_bufMesh : register(u1);
+        RWStructuredBuffer<float4> g_bufAmp : register(u2);
+
+        float4 DerivTest(int2 uv) {
+          int3 offset = int3(uv%4, 0);
+          float val = g_tex.Load(offset);
+          return float4(ddx_fine(val), ddy_fine(val), ddx_coarse(val), ddy_coarse(val));
+        }
+
+        // Map group index to 4x4 UV texcoord block
+        int2 ConvertGroupIdx(uint groupIdx) {
+          return int2(((groupIdx&0x4)>>1) + (groupIdx&01), ((groupIdx&0x8)>>2) + ((groupIdx&02)>>1));
+        }
+
+        // Convert group index into uv texcoords and return derivatives test result
+        float4 DerivTest(uint groupIdx) {
+          return DerivTest(ConvertGroupIdx(groupIdx));
+        }
+
+        PSInput VSMain(float3 position : POSITION, float2 uv : TEXCOORD) {
+          PSInput result;
+          result.position = float4(position, 1.0);
+          result.uv = uv;
+          return result;
+        }
+
+        struct Payload {
+          uint nothing;
+        };
+
+        static float4 g_Verts[6] = {
+          { -1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f, -1.0f, 0.0f, 1.0f }};
+
+        static float2 g_UV[6] = {
+          { 0.0f, 0.0f },
+          { 1.0f, 0.0f },
+          { 0.0f, 1.0f },
+
+          { 0.0f, 1.0f },
+          { 1.0f, 0.0f },
+          { 1.0f, 1.0f }};
+
+        [NumThreads(MESHDISPATCHX, MESHDISPATCHY, MESHDISPATCHZ)]
+        void ASMain(uint ix : SV_GroupIndex) {
+          Payload payload;
+          g_bufAmp[ix] = DerivTest(ix);
+          payload.nothing = 0;
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(MESHDISPATCHX, MESHDISPATCHY, MESHDISPATCHZ)]
+        [OutputTopology("triangle")]
+        void MSMain(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            g_bufMesh[ix] = DerivTest(ix);
+        }
+        float4 PSMain(PSInput input) : SV_TARGET {
+          // Convert from texcoords into a groupIndex equivalent
+          int width = DISPATCHX;
+          int height = DISPATCHY;
+          int2 uv = int2(input.uv.x*width, input.uv.y*height);
+          uint ix = ((uv.y/4)*(width/4))*16 + (uv.x/4)*16 + (((uv.x & 0x2) << 1) | (uv.x & 0x1) | ((uv.y & 0x2) << 2) | ((uv.y & 0x1) << 1));
+
+          float4 res = DerivTest(ix);
+          g_bufMain[ix] = res;
+          return res;
+        }
+
+        [NumThreads(DISPATCHX, DISPATCHY, DISPATCHZ)]
+        void CSMain(uint ix : SV_GroupIndex) {
+          g_bufMain[ix] = DerivTest(ix);
+        }
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="QuadRead" PS="PS" VS="VS" CS="CS" AS="AS" MS="MS" TopologyType="TRIANGLE">
+    <RootSignature>
+      RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT),
+      DescriptorTable(UAV(u0), UAV(u1), UAV(u2))
+    </RootSignature>
+    <Resource Name="U0" Dimension="BUFFER" Width="16384"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U1" Dimension="BUFFER" Width="16384"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U2" Dimension="BUFFER" Width="16384"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+
+    <RootValues>
+      <RootValue HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='U0' Kind='UAV' ResName='U0'
+                  NumElements="1024" StructureByteStride="16" />
+      <Descriptor Name='U1' Kind='UAV' ResName='U1'
+                  NumElements="1024" StructureByteStride="16" />
+      <Descriptor Name='U2' Kind='UAV' ResName='U2'
+                  NumElements="1024" StructureByteStride="16" />
+    </DescriptorHeap>
+
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+      <InputElement SemanticName="TEXCOORD" Format="R32G32_FLOAT" AlignedByteOffset="12" />
+    </InputElements>
+    <Shader Name="CS" Target="cs_6_0" EntryPoint="CSMain" Text="@PS"/>
+    <Shader Name="AS" Target="as_6_6" EntryPoint="ASMain" Text="@PS"/>
+    <Shader Name="MS" Target="ms_6_6" EntryPoint="MSMain" Text="@PS"/>
+    <Shader Name="PS" Target="ps_6_0" EntryPoint="PSMain">
+      <![CDATA[
+        struct PSInput {
+          float4 position : SV_POSITION;
+        };
+        RWStructuredBuffer<int4> g_bufMain : register(u0);
+        RWStructuredBuffer<int4> g_bufMesh : register(u1);
+        RWStructuredBuffer<int4> g_bufAmp : register(u2);
+
+        uint4 QuadReadTest(uint ix) {
+          return int4(QuadReadLaneAt(ix, ix & 0x3), QuadReadAcrossX(ix),
+                      QuadReadAcrossY(ix), QuadReadAcrossDiagonal(ix));
+        }
+
+        struct Payload {
+          uint nothing;
+        };
+
+        [NumThreads(MESHDISPATCHX, MESHDISPATCHY, MESHDISPATCHZ)]
+        void ASMain(uint ix : SV_GroupIndex) {
+          Payload payload;
+          g_bufAmp[ix] = QuadReadTest(ix);
+          payload.nothing = 0;
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        static float4 g_Verts[6] = {
+          { -1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f, -1.0f, 0.0f, 1.0f }};
+
+        [NumThreads(MESHDISPATCHX, MESHDISPATCHY, MESHDISPATCHZ)]
+        [OutputTopology("triangle")]
+        void MSMain(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            g_bufMesh[ix] = QuadReadTest(ix);
+        }
+
+        void PSMain(PSInput input) {
+          return;
+        }
+
+        [NumThreads(DISPATCHX, DISPATCHY, DISPATCHZ)]
+        void CSMain(uint ix : SV_GroupIndex) {
+          g_bufMain[ix] = QuadReadTest(ix);
+        }
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="ComputeSample" PS="PS" VS="VS" CS="CS" AS="AS" MS="MS" TopologyType="TRIANGLE">
+    <RootSignature>
+      RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT),
+      DescriptorTable(SRV(t0,numDescriptors=1), UAV(u0), UAV(u1), UAV(u2)),
+      StaticSampler(s0, addressU = TEXTURE_ADDRESS_WRAP, addressV = TEXTURE_ADDRESS_WRAP, filter = FILTER_MIN_MAG_LINEAR_MIP_POINT)
+    </RootSignature>
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f, 0.0f }, { 0.0f, 0.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { 1.0f, -1.0f, 0.0f }, { 1.0f, 1.0f } }
+    </Resource>
+    <Resource Name="T0" Dimension="Texture2D" Width="64" Height="64" MipLevels="7" InitialResourceState="COPY_DEST" Init="ByName" Format="R32_FLOAT" />
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="8" Height="8" Format="R32G32B32A32_FLOAT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" />
+    <Resource Name="U0" Dimension="BUFFER" Width="1920"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U1" Dimension="BUFFER" Width="1920"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U2" Dimension="BUFFER" Width="1920"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+
+    <RootValues>
+      <RootValue HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name='T0' Kind='SRV' ResName='T0' />
+      <Descriptor Name='U0' Kind='UAV' ResName='U0'
+                  NumElements="64" StructureByteStride="16" />
+      <Descriptor Name='U1' Kind='UAV' ResName='U1'
+                  NumElements="64" StructureByteStride="16" />
+      <Descriptor Name='U2' Kind='UAV' ResName='U2'
+                  NumElements="64" StructureByteStride="16" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+      <InputElement SemanticName="TEXCOORD" Format="R32G32_FLOAT" AlignedByteOffset="12" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="CS" Target="cs_6_6" EntryPoint="CSMain" Text="@PS"/>
+    <Shader Name="AS" Target="as_6_6" EntryPoint="ASMain" Text="@PS"/>
+    <Shader Name="MS" Target="ms_6_6" EntryPoint="MSMain" Text="@PS"/>
+    <Shader Name="VS" Target="vs_6_0" EntryPoint="VSMain" Text="@PS"/>
+    <Shader Name="PS" Target="ps_6_0" EntryPoint="PSMain">
+      <![CDATA[
+        struct PSInput {
+          float4 position : SV_POSITION;
+          float2 uv : TEXCOORD;
+        };
+
+        Texture2D<float> g_tex : register(t0);
+        RWStructuredBuffer<float4> g_bufMain : register(u0);
+        RWStructuredBuffer<float4> g_bufMesh : register(u1);
+        RWStructuredBuffer<float4> g_bufAmp : register(u2);
+
+        PSInput VSMain(float3 position : POSITION, float2 uv : TEXCOORD) {
+          PSInput result;
+          result.position = float4(position, 1.0);
+          result.uv = uv;
+          return result;
+        }
+
+        SamplerState g_samp : register(s0);
+
+        uint4 DerivTest(uint ix, float left, float right, float top, float bot) {
+          uint iy = ix>>1;
+          return uint4(g_tex.CalculateLevelOfDetail(g_samp, float2(left, 0.5)) * (~ix&1) +
+                       g_tex.CalculateLevelOfDetail(g_samp, float2(right, 0.5)) * (ix&1),
+                       g_tex.Sample(g_samp, float2(left, 0.5)) * (~ix&1) +
+                       g_tex.Sample(g_samp, float2(right, 0.5)) * (ix&1),
+                       g_tex.CalculateLevelOfDetail(g_samp, float2(0.5, top)) * (~iy&1) +
+                       g_tex.CalculateLevelOfDetail(g_samp, float2(0.5, bot)) * (iy&1),
+                       g_tex.Sample(g_samp, float2(0.5, top)) * (~iy&1) +
+                       g_tex.Sample(g_samp, float2(0.5, bot)) * (iy&1));
+        }
+
+        // To avoid conditionals, two samples are performed: one for left, one for right.
+        // They are stepped on or off depending on the index's low bits.
+        uint4 DerivTest(uint ix) {
+          uint iy = ix>>1;
+          return DerivTest(ix, ((ix^1)/64.0)*(ix&1), (ix/64.0)*(ix&1),
+                               ((ix^2)/64.0)*(iy&1), (ix/64.0)*(iy&1));
+        }
+
+        static float4 g_Verts[6] = {
+          { -1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f, -1.0f, 0.0f, 1.0f }};
+
+        static float2 g_UV[6] = {
+          { 0.0f, 0.0f },
+          { 1.0f, 0.0f },
+          { 0.0f, 1.0f },
+
+          { 0.0f, 1.0f },
+          { 1.0f, 0.0f },
+          { 1.0f, 1.0f }};
+
+        struct Payload {
+          uint nothing;
+        };
+
+        [NumThreads(8, 8, 1)]
+        void ASMain(uint ix : SV_GroupIndex) {
+          Payload payload;
+          g_bufAmp[ix] = DerivTest(ix);
+          payload.nothing = 0;
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(8, 8, 1)]
+        [OutputTopology("triangle")]
+        void MSMain(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            g_bufMesh[ix] = DerivTest(ix);
+        }
+
+        float4 PSMain(PSInput input) : SV_TARGET {
+          int ix = int(input.uv.y * 8) * 8 + int(input.uv.x * 8);
+          // Contort the linear index into quad order by rotating relevant middle bits
+          ix = (ix&~0xE)|((ix&0x8)>>2)|((ix&0x6)<<1);
+          g_bufMain[ix] = DerivTest(ix);
+          return 1;
+        }
+        [NumThreads(8, 8, 1)]
+        void CSMain(uint ix : SV_GroupIndex) {
+          g_bufMain[ix] = DerivTest(ix);
+        }
+
+      ]]>
+    </Shader>
+  </ShaderOp>
   <ShaderOp Name="OOB" PS="PS" VS="VS">
     <RootSignature>RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT), CBV(b0), DescriptorTable(SRV(t0,numDescriptors=2))</RootSignature>
     <Resource Name="CB0" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" TransitionTo="VERTEX_AND_CONSTANT_BUFFER">
@@ -880,7 +1288,855 @@
     </Shader>
   </ShaderOp>
 
-  <!--
+  <ShaderOp Name="WaveSizeTest" CS="CS">
+      <RootSignature>RootFlags(0), UAV(u0)</RootSignature>
+      <Resource Name="UAVBuffer0" Dimension="BUFFER" Width="512" InitialResourceState="COPY_DEST" Init="ByName" Flags="ALLOW_UNORDERED_ACCESS" TransitionTo="UNORDERED_ACCESS" ReadBack="true" Format="R32_TYPELESS" />
+      <RootValues>
+          <RootValue Index="0" ResName="UAVBuffer0" />
+      </RootValues>
+      <Shader Name="CS" Target="cs_6_6">
+          <![CDATA[// Shader source code will be set at runtime]]>
+      </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="PackUnpackOp" CS="CS" DispatchX="1" DispatchY="1">
+    <RootSignature>RootFlags(0), UAV(u0), UAV(u1), UAV(u2)</RootSignature>
+    <Resource Name="g_bufIn" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="false" />
+    <Resource Name="g_bufOutPacked" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <Resource Name="g_bufOutPackedUnpacked" Dimension="BUFFER" Width="1024" Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST" TransitionTo="UNORDERED_ACCESS" Init="ByName" ReadBack="true" />
+    <RootValues>
+      <RootValue Index="0" ResName="g_bufIn" />
+      <RootValue Index="1" ResName="g_bufOutPacked" />
+      <RootValue Index="2" ResName="g_bufOutPackedUnpacked" />
+    </RootValues>
+    <Shader Name="CS" Target="cs_6_0">
+      <![CDATA[
+      void main(uint GI : SV_GroupIndex) {};
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+  <!-- For explanations of the atomics tests, see comments in and around VerifyAtomicResults in ExecutionTest.cpp -->
+  <ShaderOp Name="Atomics" PS="PS" VS="VS" CS="CS" AS="AS" MS="MS" TopologyType="TRIANGLE">
+    <RootSignature>
+      RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT),
+      DescriptorTable(UAV(u0), UAV(u1), UAV(u2), UAV(u3), UAV(u4), UAV(u5), UAV(u6), UAV(u7), UAV(u8), UAV(u9), UAV(u10), UAV(u11), UAV(u12), UAV(u13), UAV(u14), UAV(u15), UAV(u16), UAV(u17)),
+      StaticSampler(s0, addressU = TEXTURE_ADDRESS_WRAP, addressV = TEXTURE_ADDRESS_WRAP, filter = FILTER_MIN_MAG_LINEAR_MIP_POINT)
+    </RootSignature>
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f, 0.0f }, { 0.0f, 0.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { 1.0f, -1.0f, 0.0f }, { 1.0f, 1.0f } }
+    </Resource>
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="64" Height="64" Format="R32G32B32A32_FLOAT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" />
+    <!-- Raw buffers -->
+    <Resource Name="U0" Dimension="BUFFER" Width="576"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true" >
+      {
+      0, 0, 0, 0, 0, 0, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, 99999999I, 99999999I, 0I, 0I, 99999999I, 99999999I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, -1I, -1I, 0I, 0I, -1I, -1I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0I, 0, 0, 0, 0,
+      }
+    </Resource>
+    <Resource Name="U1" Dimension="BUFFER" Width="9216"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U2" Dimension="BUFFER" Width="256"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true">
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U3" Dimension="BUFFER" Width="1024"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <!-- 32-bit typed resources -->
+    <Resource Name="U4" Dimension="BUFFER" Width="256" Format="R32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true" >
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U5" Dimension="BUFFER" Width="256" Format="R32_SINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true">
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U6" Dimension="BUFFER" Width="1024" Format="R32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U7" Dimension="TEXTURE1D" Width="16" Format="R32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true" >
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U8" Dimension="TEXTURE1D" Width="16" Format="R32_SINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true">
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U9" Dimension="TEXTURE1D" Width="128" Format="R32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <!-- groupshared output buffers -->
+    <Resource Name="U10" Dimension="BUFFER" Width="256"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U11" Dimension="BUFFER" Width="1024"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <!-- 64-bit typed resources -->
+    <Resource Name="U12" Dimension="BUFFER" Width="256" Format="R32G32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true" >
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U13" Dimension="BUFFER" Width="256" Format="R32G32_SINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true">
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U14" Dimension="BUFFER" Width="1024" Format="R32G32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U15" Dimension="TEXTURE1D" Width="16" Format="R32G32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true" >
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U16" Dimension="TEXTURE1D" Width="16" Format="R32G32_SINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="FromBytes" ReadBack="true">
+      { 0I, 0I, 99999999I, 99999999I, 0I, 0I, -1I, -1I, 0I, 0I, 0I, 0I, 42I, 42I, 42I, 42I }
+    </Resource>
+    <Resource Name="U17" Dimension="TEXTURE1D" Width="128" Format="R32G32_UINT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <RootValues>
+      <RootValue HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <!-- Raw buffers -->
+      <Descriptor Name="U0" Kind="UAV" ResName="U0"
+                  NumElements="8" StructureByteStride="72" />
+      <Descriptor Name="U1" Kind="UAV" ResName="U1"
+                  NumElements="128" StructureByteStride="72" />
+      <Descriptor Name="U2" Kind="UAV" ResName="U2"
+                  NumElements="16" StructureByteStride="8" />
+      <Descriptor Name="U3" Kind="UAV" ResName="U3"
+                  NumElements="128" StructureByteStride="8" />
+      <!-- 32-bit typed resources -->
+      <Descriptor Name="U4" Kind="UAV" ResName="U4" Dimension="BUFFER"
+                  NumElements="16"  Format="R32_UINT" />
+      <Descriptor Name="U5" Kind="UAV" ResName="U5" Dimension="BUFFER"
+                  NumElements="16"  Format="R32_UINT" />
+      <Descriptor Name="U6" Kind="UAV" ResName="U6" Dimension="BUFFER"
+                  NumElements="128"  Format="R32_UINT" />
+      <Descriptor Name="U7" Kind="UAV" ResName="U7" Dimension="TEXTURE1D"
+                  NumElements="16"  Format="R32_UINT" />
+      <Descriptor Name="U8" Kind="UAV" ResName="U8" Dimension="TEXTURE1D"
+                  NumElements="16"  Format="R32_UINT" />
+      <Descriptor Name="U9" Kind="UAV" ResName="U9" Dimension="TEXTURE1D"
+                  NumElements="128"  Format="R32_UINT" />
+      <!-- groupshared output buffers -->
+      <Descriptor Name="U10" Kind="UAV" ResName="U10" Dimension="BUFFER"
+                  NumElements="8" Format="R32G32_UINT" />
+      <Descriptor Name="U11" Kind="UAV" ResName="U11" Dimension="BUFFER"
+                  NumElements="64" Format="R32G32_UINT" />
+      <!-- 64-bit typed resources -->
+      <Descriptor Name="U12" Kind="UAV" ResName="U12" Dimension="BUFFER"
+                  NumElements="16"  Format="R32G32_UINT" />
+      <Descriptor Name="U13" Kind="UAV" ResName="U13" Dimension="BUFFER"
+                  NumElements="16"  Format="R32G32_UINT" />
+      <Descriptor Name="U14" Kind="UAV" ResName="U14" Dimension="BUFFER"
+                  NumElements="128"  Format="R32G32_UINT" />
+      <Descriptor Name="U15" Kind="UAV" ResName="U15" Dimension="TEXTURE1D"
+                  NumElements="16"  Format="R32G32_UINT" />
+      <Descriptor Name="U16" Kind="UAV" ResName="U16" Dimension="TEXTURE1D"
+                  NumElements="16"  Format="R32G32_UINT" />
+      <Descriptor Name="U17" Kind="UAV" ResName="U17" Dimension="TEXTURE1D"
+                  NumElements="128"  Format="R32G32_UINT" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+      <InputElement SemanticName="TEXCOORD" Format="R32G32_FLOAT" AlignedByteOffset="12" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="PS64" Target="ps_6_6" EntryPoint="PSMain64" Text="@CS"/>
+    <Shader Name="AS64" Target="as_6_6" EntryPoint="ASMain64" Text="@CS"/>
+    <Shader Name="MS64" Target="ms_6_6" EntryPoint="MSMain64" Text="@CS"/>
+    <Shader Name="VS64" Target="vs_6_6" EntryPoint="VSMain64" Text="@CS"/>
+    <Shader Name="CS64" Target="cs_6_6" EntryPoint="CSMain64" Text="@CS"/>
+    <Shader Name="PSTY64" Target="ps_6_6" EntryPoint="PSMainTyped64" Text="@CS"/>
+    <Shader Name="ASTY64" Target="as_6_6" EntryPoint="ASMainTyped64" Text="@CS"/>
+    <Shader Name="MSTY64" Target="ms_6_6" EntryPoint="MSMainTyped64" Text="@CS"/>
+    <Shader Name="VSTY64" Target="vs_6_6" EntryPoint="VSMainTyped64" Text="@CS"/>
+    <Shader Name="CSTY64" Target="cs_6_6" EntryPoint="CSMainTyped64" Text="@CS"/>
+    <Shader Name="ASSH64" Target="as_6_6" EntryPoint="ASMainShared64" Text="@CS"/>
+    <Shader Name="MSSH64" Target="ms_6_6" EntryPoint="MSMainShared64" Text="@CS"/>
+    <Shader Name="CSSH64" Target="cs_6_6" EntryPoint="CSMainShared64" Text="@CS"/>
+    <Shader Name="AS"   Target="as_6_5" EntryPoint="ASMain"   Text="@CS"/>
+    <Shader Name="MS"   Target="ms_6_5" EntryPoint="MSMain"   Text="@CS"/>
+    <Shader Name="VS"   Target="vs_6_0" EntryPoint="VSMain"   Text="@CS"/>
+    <Shader Name="PS"   Target="ps_6_0" EntryPoint="PSMain"   Text="@CS"/>
+    <Shader Name="CS"   Target="cs_6_0" EntryPoint="CSMain">
+      <![CDATA[
+        struct PSInput {
+          float4 position : SV_POSITION;
+          float2 uv : TEXCOORD;
+        };
+        struct AtomicStuff {
+          float2 prepad[3];
+          uint uintEl[4];
+          int4  sintEl;
+          struct useless {
+            uint3 unused;
+          } postpad;
+          float last;
+        };
+        struct Atomic64Stuff {
+          float2 prepad[3];
+          uint64_t uintEl[2];
+          int64_t2  sintEl;
+          struct useless {
+            uint3 unused;
+          } postpad;
+          float last;
+        };
+        RWStructuredBuffer<AtomicStuff> g_structBuf : register(u0);
+        RWStructuredBuffer<AtomicStuff> g_strXchgBuf : register(u1);
+
+        RWByteAddressBuffer g_rawBuf : register(u2);
+        RWByteAddressBuffer g_rawXchgBuf : register(u3);
+
+        RWBuffer<uint> g_uintBuf : register(u4);
+        RWBuffer<int> g_sintBuf : register(u5);
+        RWBuffer<int> g_xchgBuf : register(u6);
+
+        RWTexture1D<uint> g_utexBuf : register(u7);
+        RWTexture1D<int> g_stexBuf : register(u8);
+        RWTexture1D<int> g_xtexBuf : register(u9);
+
+        RWBuffer<uint2> g_shareBuf : register(u10);
+        RWBuffer<uint2> g_shareXchgBuf : register(u11);
+
+        groupshared uint g_uintShare[12];
+        groupshared int g_sintShare[6];
+        groupshared uint g_xchgShare[128];
+
+        RWStructuredBuffer<Atomic64Stuff> g_struct64Buf : register(u0);
+        RWStructuredBuffer<Atomic64Stuff> g_strXchg64Buf : register(u1);
+
+        RWByteAddressBuffer g_raw64Buf : register(u2);
+        RWByteAddressBuffer g_rawXchg64Buf : register(u3);
+
+        RWBuffer<uint64_t> g_uint64Buf : register(u12);
+        RWBuffer<int64_t> g_sint64Buf : register(u13);
+        RWBuffer<int64_t> g_xchg64Buf : register(u14);
+
+        RWTexture1D<uint64_t> g_utex64Buf : register(u15);
+        RWTexture1D<int64_t> g_stex64Buf : register(u16);
+        RWTexture1D<int64_t> g_xtex64Buf : register(u17);
+
+        RWBuffer<uint64_t> g_share64Buf : register(u10);
+        RWBuffer<uint64_t> g_shareXchg64Buf : register(u11);
+
+        groupshared uint64_t g_uint64Share[6];
+        groupshared int64_t g_sint64Share[3];
+        groupshared uint64_t g_xchg64Share[64];
+
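+        // Helper call forms (a descriptive note on the macros below): VEC_* index an element of a
+        // typed buffer, texture, or groupshared array; *STRUCT_* target a field of a structured-buffer
+        // element; *RAW_* use the RWByteAddressBuffer method syntax with an explicit byte offset.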
+        #define VEC_CALL(op, uav, ix, val) op(uav[ix*stride], val);
+
+        #define USTRUCT_CALL(op, uav, ix, val) op(uav[ix].uintEl[stride], val);
+        #define SSTRUCT_CALL(op, uav, ix, val) op(uav[ix].sintEl.z, val);
+        #define SSTRUCT64_CALL(op, uav, ix, val) op(uav[ix].sintEl.y, val);
+
+        #define URAW_CALL(op, uav, ix, val) uav.op(8*ix, val);
+        #define SRAW_CALL(op, uav, ix, val) uav.op(8*(5+ix), val); // signed values go at the end; raw buffers don't need a separate buffer
+
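+        // OP_TEST exercises the non-exchange atomics (Add, signed/unsigned Min/Max, And, Or, Xor),
+        // each at a distinct location so the results can be verified independently.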
+        #define OP_TEST(ucall, scall, uuav, suav) \
+          ucall(InterlockedAdd, uuav, 0, addVal); \
+          scall(InterlockedMin, suav, 1, sminMaxVal); \
+          scall(InterlockedMax, suav, 2, sminMaxVal); \
+          ucall(InterlockedMin, uuav, 1, uminMaxVal); \
+          ucall(InterlockedMax, uuav, 2, uminMaxVal); \
+          ucall(InterlockedAnd, uuav, 3, ~value); \
+          ucall(InterlockedOr,  uuav, 4, value); \
+          ucall(InterlockedXor, uuav, 5, xorVal);
+
+        #define VEC_CALL3(op, uav, ix, cmp, val) op(uav[(ix)*stride], cmp, val)
+        #define VEC_CALL4(op, uav, ix, cmp, val, o) op(uav[(ix)*stride], cmp, val, o)
+
+        #define STRUCT_CALL3(op, uav, ix, cmp, val) op(uav[ix].uintEl[stride], cmp, val)
+        #define STRUCT_CALL4(op, uav, ix, cmp, val, o) op(uav[ix].uintEl[stride], cmp, val, o)
+
+        #define RAW_CALL3(op, uav, ix, cmp, val) uav.op(8*(ix), cmp, val)
+        #define RAW_CALL4(op, uav, ix, cmp, val, o) uav.op(8*(ix), cmp, val, o)
+
+        // The first of four to match wins the initial compare-store; the winner then performs the last two exchanges
+        #define XCHG_TEST(call3, call4, uav) \
+          call3(InterlockedCompareStore,    uav, (ix/3)%64, 0,           xchgVal - 2); \
+          call4(InterlockedCompareExchange, uav, (ix/3)%64, xchgVal - 2, xchgVal - 1, output); \
+          if (output == xchgVal - 2) { call3(InterlockedExchange, uav, (ix/3)%64, xchgVal, output);}
+
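+        // Runs the 32-bit atomic suite against structured, raw, typed buffer, and texture UAVs.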
+        void AtomicTest(uint ix, uint bitSize) {
+          uint stride = 2;
+          uint value = (ix) | ((ix) << (bitSize/2));
+          uint addVal = ix; // 32 bits isn't enough room to duplicate the upper and lower halves
+          uint uminMaxVal = ~value*(~value&1) + value*(value&1);
+          int sminMaxVal = ~value*(~value&1) + value*(value&1);
+          uint xorVal = 1 << (ix%(bitSize-1));
+          // make higher bits differ while lower bits match
+          uint xchgVal = (ix << (bitSize/2)) | ((ix/3)%64);
+          uint output = 0;
+
+          // structured
+          OP_TEST(USTRUCT_CALL, SSTRUCT_CALL, g_structBuf, g_structBuf)
+          XCHG_TEST(STRUCT_CALL3, STRUCT_CALL4, g_strXchgBuf)
+
+          // raw
+          OP_TEST(URAW_CALL, SRAW_CALL, g_rawBuf, g_rawBuf)
+          XCHG_TEST(RAW_CALL3, RAW_CALL4, g_rawXchgBuf)
+
+          // typed buffer
+          OP_TEST(VEC_CALL, VEC_CALL, g_uintBuf, g_sintBuf)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xchgBuf)
+
+          // texture
+          OP_TEST(VEC_CALL, VEC_CALL, g_utexBuf, g_stexBuf)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xtexBuf)
+        }
+
+        void AtomicRaw64Test(uint ix, uint64_t bitSize) {
+          uint64_t lix = ix;
+          uint stride = 1;
+          uint64_t value = (lix) | ((lix) << (bitSize/2));
+          uint64_t addVal = value;
+          uint64_t uminMaxVal = ~value*(~value&1) + value*(value&1);
+          int64_t sminMaxVal = ~value*(~value&1) + value*(value&1);
+          uint64_t xorVal = 1ULL << (lix%(bitSize-1));
+          // make higher bits differ while lower bits match
+          uint64_t xchgVal = (lix << (bitSize/2)) | ((lix/3)%64);
+          uint64_t output = 0;
+
+          OP_TEST(USTRUCT_CALL, SSTRUCT64_CALL, g_struct64Buf, g_struct64Buf)
+          XCHG_TEST(STRUCT_CALL3, STRUCT_CALL4, g_strXchg64Buf)
+
+          // ByteAddressBuffer 64-bit atomics are a special case: they use distinct *64 methods, so the calls are inlined here
+          URAW_CALL(InterlockedAdd64, g_raw64Buf, 0, addVal);
+          SRAW_CALL(InterlockedMin64, g_raw64Buf, 1, sminMaxVal);
+          SRAW_CALL(InterlockedMax64, g_raw64Buf, 2, sminMaxVal);
+          URAW_CALL(InterlockedMin64, g_raw64Buf, 1, uminMaxVal);
+          URAW_CALL(InterlockedMax64, g_raw64Buf, 2, uminMaxVal);
+          URAW_CALL(InterlockedAnd64, g_raw64Buf, 3, ~value);
+          URAW_CALL(InterlockedOr64,  g_raw64Buf, 4, value);
+          URAW_CALL(InterlockedXor64, g_raw64Buf, 5, xorVal);
+
+          RAW_CALL3(InterlockedCompareStore64,    g_rawXchg64Buf, (ix/3)%64, 0,           xchgVal - 2);
+          RAW_CALL4(InterlockedCompareExchange64, g_rawXchg64Buf, (ix/3)%64, xchgVal - 2, xchgVal - 1, output);
+          if (output == xchgVal - 2) { RAW_CALL3(InterlockedExchange64, g_rawXchg64Buf, (ix/3)%64, xchgVal, output);}
+        }
+
+        void AtomicTyped64Test(uint ix, uint64_t bitSize) {
+          uint64_t lix = ix;
+          uint stride = 1;
+          uint64_t value = (lix) | ((lix) << (bitSize/2));
+          uint64_t addVal = value;
+          uint64_t uminMaxVal = ~value*(~value&1) + value*(value&1);
+          int64_t sminMaxVal = ~value*(~value&1) + value*(value&1);
+          uint64_t xorVal = 1ULL << (lix%(bitSize-1));
+          // make higher bits differ while lower bits match
+          uint64_t xchgVal = (lix << (bitSize/2)) | ((lix/3)%64);
+          uint64_t output = 0;
+
+          OP_TEST(VEC_CALL, VEC_CALL, g_uint64Buf, g_sint64Buf)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xchg64Buf)
+
+          OP_TEST(VEC_CALL, VEC_CALL, g_utex64Buf, g_stex64Buf)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xtex64Buf)
+        }
+
+        void AtomicGroupSharedTest(uint ix, uint bitSize) {
+          uint stride = 1;
+          uint value = (ix) | ((ix) << (bitSize/2));
+          uint addVal = ix; // 32 bits isn't enough room to duplicate the upper and lower halves
+          uint uminMaxVal = ~value*(~value&1) + value*(value&1);
+          int sminMaxVal = ~value*(~value&1) + value*(value&1);
+          uint xorVal = 1 << (ix%(bitSize-1));
+          uint xchgVal = (ix << (bitSize/2)) | ((ix/3)%64);
+          uint output = 0;
+
+          uint uIx = ix%(6*stride);
+          uint sIx = ix%(3*stride);
+
+          // Zero-init shared memory
+          g_uintShare[uIx] = 0;
+          g_sintShare[sIx] = 0;
+          g_xchgShare[ix%64] = 0;
+
+          GroupMemoryBarrierWithGroupSync();
+
+          InterlockedCompareStore(g_uintShare[stride], 0, 99999999);
+          InterlockedCompareStore(g_uintShare[3*stride], 0, -1);
+          InterlockedCompareStore(g_sintShare[stride], 0, 99999999);
+
+          OP_TEST(VEC_CALL, VEC_CALL, g_uintShare, g_sintShare)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xchgShare)
+
+          GroupMemoryBarrierWithGroupSync();
+
+          g_shareBuf[uIx].x = g_uintShare[uIx];
+          g_shareBuf[6 + sIx].x = g_sintShare[sIx + 1];
+
+          g_shareXchgBuf[(ix/3)%64].x = g_xchgShare[(ix/3)%64];
+        }
+
+        void AtomicGroupShared64Test(uint ix, uint64_t bitSize) {
+          uint64_t lix = ix;
+          uint stride = 1;
+          uint64_t value = (lix) | ((lix) << (bitSize/2));
+          uint64_t addVal = value;
+          uint64_t uminMaxVal = ~value*(~value&1) + value*(value&1);
+          int64_t sminMaxVal = ~value*(~value&1) + value*(value&1);
+          uint64_t xorVal = 1ULL << (lix%(bitSize-1));
+          uint64_t xchgVal = (lix << (bitSize/2)) | ((lix/3)%64);
+          uint64_t output = 0;
+
+          uint uIx = ix%(6*stride);
+          uint sIx = ix%(3*stride);
+
+          // Zero-init shared memory
+          g_uint64Share[uIx] = 0;
+          g_sint64Share[sIx] = 0;
+          g_xchg64Share[ix%64] = 0;
+
+          GroupMemoryBarrierWithGroupSync();
+
+          InterlockedCompareStore(g_uint64Share[stride], 0, 99999999ULL | (99999999ULL << (bitSize/2)));
+          InterlockedCompareStore(g_uint64Share[3*stride], 0, ~0ULL);
+          InterlockedCompareStore(g_sint64Share[stride], 0, 99999999ULL | (99999999ULL << (bitSize/2)));
+
+          OP_TEST(VEC_CALL, VEC_CALL, g_uint64Share, g_sint64Share)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xchg64Share)
+
+          GroupMemoryBarrierWithGroupSync();
+
+          g_share64Buf[uIx] = g_uint64Share[uIx];
+          g_share64Buf[sIx + 6] = g_sint64Share[sIx + 1];
+
+          g_shareXchg64Buf[(ix/3)%64] = g_xchg64Share[(ix/3)%64];
+
+        }
+
+        struct Payload {
+          uint nothing;
+        };
+
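+        // Full-screen quad (two triangles) consumed only by the mesh shader entry points.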
+        static float4 g_Verts[6] = {
+          { -1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f, -1.0f, 0.0f, 1.0f }};
+
+        static float2 g_UV[6] = {
+          { 0.0f, 0.0f },
+          { 1.0f, 0.0f },
+          { 0.0f, 1.0f },
+
+          { 0.0f, 1.0f },
+          { 1.0f, 0.0f },
+          { 1.0f, 1.0f }};
+
+        [NumThreads(8, 8, 2)]
+        void ASMain(uint ix : SV_GroupIndex) {
+          Payload payload;
+          payload.nothing = 0;
+          AtomicTest(64*64 + 8*8 + ix, 32);
+          AtomicGroupSharedTest(64*64 + 8*8 + ix, 32);
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(8, 8, 2)]
+        [OutputTopology("triangle")]
+        void MSMain(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            AtomicTest(64*64 + ix, 32);
+            AtomicGroupSharedTest(64*64 + ix, 32);
+        }
+
+        PSInput VSMain(float3 position : POSITION, float2 uv : TEXCOORD, uint ix : SV_VertexID) {
+          PSInput result;
+          result.position = float4(position, 1.0);
+          result.uv = uv;
+          AtomicTest(64*64 + ix, 32);
+          return result;
+        }
+
+        float4 PSMain(PSInput input) : SV_TARGET {
+          uint ix = uint(input.uv.y*64)*64 + input.uv.x*64;
+          AtomicTest(ix, 32);
+          return 1;
+        }
+
+        [NumThreads(32, 32, 1)]
+        void CSMain(uint ix : SV_GroupIndex) {
+          AtomicTest(ix, 32);
+          AtomicGroupSharedTest(ix, 32);
+        }
+
+        [NumThreads(8, 8, 2)]
+        void ASMain64(uint ix : SV_GroupIndex) {
+          Payload payload;
+          payload.nothing = 0;
+          AtomicRaw64Test(64*64 + 8*8 + ix, 64);
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(8, 8, 2)]
+        [OutputTopology("triangle")]
+        void MSMain64(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            AtomicRaw64Test(64*64 + ix, 64);
+        }
+
+        PSInput VSMain64(float3 position : POSITION, float2 uv : TEXCOORD, uint ix : SV_VertexID) {
+          PSInput result;
+          result.position = float4(position, 1.0);
+          result.uv = uv;
+          AtomicRaw64Test(64*64 + ix, 64);
+          return result;
+        }
+
+        float4 PSMain64(PSInput input) : SV_TARGET {
+          uint ix = uint(input.uv.y*64)*64 + input.uv.x*64;
+          AtomicRaw64Test(ix, 64);
+          return 1;
+        }
+
+        [NumThreads(32, 32, 1)]
+        void CSMain64(uint ix : SV_GroupIndex) {
+          AtomicRaw64Test(ix, 64);
+        }
+
+        [NumThreads(8, 8, 2)]
+        void ASMainTyped64(uint ix : SV_GroupIndex) {
+          Payload payload;
+          payload.nothing = 0;
+          AtomicTyped64Test(64*64 + 8*8 + ix, 64);
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(8, 8, 2)]
+        [OutputTopology("triangle")]
+        void MSMainTyped64(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            AtomicTyped64Test(64*64 + ix, 64);
+        }
+
+        PSInput VSMainTyped64(float3 position : POSITION, float2 uv : TEXCOORD, uint ix : SV_VertexID) {
+          PSInput result;
+          result.position = float4(position, 1.0);
+          result.uv = uv;
+          AtomicTyped64Test(64*64 + ix, 64);
+          return result;
+        }
+
+        float4 PSMainTyped64(PSInput input) : SV_TARGET {
+          uint ix = uint(input.uv.y*64)*64 + input.uv.x*64;
+          AtomicTyped64Test(ix, 64);
+          return 1;
+        }
+
+        [NumThreads(32, 32, 1)]
+        void CSMainTyped64(uint ix : SV_GroupIndex) {
+          AtomicTyped64Test(ix, 64);
+        }
+
+        [NumThreads(8, 8, 2)]
+        void ASMainShared64(uint ix : SV_GroupIndex) {
+          Payload payload;
+          payload.nothing = 0;
+          AtomicGroupShared64Test(64*64 + 8*8 + ix, 64);
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(8, 8, 2)]
+        [OutputTopology("triangle")]
+        void MSMainShared64(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            AtomicGroupShared64Test(64*64 + ix, 64);
+        }
+
+        [NumThreads(32, 32, 1)]
+        void CSMainShared64(uint ix : SV_GroupIndex) {
+          AtomicGroupShared64Test(ix, 64);
+        }
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+  <ShaderOp Name="FloatAtomics" PS="PS" VS="VS" CS="CS" AS="AS" MS="MS" TopologyType="TRIANGLE">
+    <RootSignature>
+      RootFlags(ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT),
+      DescriptorTable(UAV(u0), UAV(u1), UAV(u2), UAV(u3), UAV(u4)),
+      StaticSampler(s0, addressU = TEXTURE_ADDRESS_WRAP, addressV = TEXTURE_ADDRESS_WRAP, filter = FILTER_MIN_MAG_LINEAR_MIP_POINT)
+    </RootSignature>
+    <Resource Name="VBuffer" Dimension="BUFFER" InitialResourceState="COPY_DEST" Init="FromBytes" Topology="TRIANGLELIST">
+      { { -1.0f, 1.0f, 0.0f }, { 0.0f, 0.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+
+      { { -1.0f, -1.0f, 0.0f }, { 0.0f, 1.0f } },
+      { { 1.0f, 1.0f, 0.0f }, { 1.0f, 0.0f } },
+      { { 1.0f, -1.0f, 0.0f }, { 1.0f, 1.0f } }
+    </Resource>
+    <Resource Name="RTarget" Dimension="TEXTURE2D" Width="64" Height="64" Format="R32G32B32A32_FLOAT" Flags="ALLOW_RENDER_TARGET" InitialResourceState="COPY_DEST" />
+    <Resource Name="U0" Dimension="BUFFER" Width="2816"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U1" Dimension="BUFFER" Width="256"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U2" Dimension="BUFFER" Width="256"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U3" Dimension="TEXTURE1D" Width="64" Format="R32_FLOAT"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <Resource Name="U4" Dimension="BUFFER" Width="256"
+              Flags="ALLOW_UNORDERED_ACCESS" InitialResourceState="COPY_DEST"
+              Init="Zero" ReadBack="true" />
+    <RootValues>
+      <RootValue HeapName="ResHeap" />
+    </RootValues>
+    <DescriptorHeap Name="ResHeap" Type="CBV_SRV_UAV">
+      <Descriptor Name="U0" Kind="UAV" ResName="U0"
+                  NumElements="64" StructureByteStride="44" />
+      <Descriptor Name="U1" Kind="UAV" ResName="U1"
+                  NumElements="64" StructureByteStride="4" />
+      <Descriptor Name="U2" Kind="UAV" ResName="U2"
+                  NumElements="64" StructureByteStride="4" />
+      <Descriptor Name="U3" Kind="UAV" ResName="U3" Dimension="TEXTURE1D"
+                  NumElements="64" StructureByteStride="4" />
+      <Descriptor Name="U4" Kind="UAV" ResName="U4"
+                  NumElements="64" StructureByteStride="4" />
+    </DescriptorHeap>
+    <DescriptorHeap Name="RtvHeap" NumDescriptors="1" Type="RTV">
+      <Descriptor Name="RTarget" Kind="RTV"/>
+    </DescriptorHeap>
+
+    <InputElements>
+      <InputElement SemanticName="POSITION" Format="R32G32B32_FLOAT" AlignedByteOffset="0" />
+      <InputElement SemanticName="TEXCOORD" Format="R32G32_FLOAT" AlignedByteOffset="12" />
+    </InputElements>
+    <RenderTargets>
+      <RenderTarget Name="RTarget"/>
+    </RenderTargets>
+    <Shader Name="AS"   Target="as_6_5" EntryPoint="ASMain"   Text="@CS"/>
+    <Shader Name="MS"   Target="ms_6_5" EntryPoint="MSMain"   Text="@CS"/>
+    <Shader Name="VS"   Target="vs_6_0" EntryPoint="VSMain"   Text="@CS"/>
+    <Shader Name="PS"   Target="ps_6_0" EntryPoint="PSMain"   Text="@CS"/>
+    <Shader Name="CS"   Target="cs_6_0" EntryPoint="CSMain">
+      <![CDATA[
+        struct PSInput {
+          float4 position : SV_POSITION;
+          float2 uv : TEXCOORD;
+        };
+        struct AtomicStuff {
+          float2 prepad[3];
+          float fltEl[2];
+          struct useless {
+            uint3 unused;
+          } postpad;
+        };
+
+        RWStructuredBuffer<AtomicStuff> g_strXchgBuf : register(u0);
+        RWByteAddressBuffer g_rawXchgBuf : register(u1);
+        RWBuffer<float> g_xchgBuf : register(u2);
+        RWTexture1D<float> g_xtexBuf : register(u3);
+        RWBuffer<float> g_shareXchgBuf : register(u4);
+
+        groupshared float g_xchgShare[1024];
+
+        #define VEC_CALL3(op, uav, ix, cmp, val) op(uav[(ix)], cmp, val)
+        #define VEC_CALL4(op, uav, ix, cmp, val, o) op(uav[(ix)], cmp, val, o)
+
+        #define STRUCT_CALL3(op, uav, ix, cmp, val) op(uav[ix].fltEl[1], cmp, val)
+        #define STRUCT_CALL4(op, uav, ix, cmp, val, o) op(uav[ix].fltEl[1], cmp, val, o)
+
+        #define RAW_CALL3(op, uav, ix, cmp, val) uav.op(4*(ix), cmp, val)
+        #define RAW_CALL4(op, uav, ix, cmp, val, o) uav.op(4*(ix), cmp, val, o)
+
+        // The first of four to match wins the initial compare-store; the winner then performs the last two exchanges
+        #define XCHG_TEST(call3, call4, uav) \
+          call3(InterlockedCompareStoreFloatBitwise,    uav, (ix/3)%63 + 1, 0,           xchgVal - 2); \
+          call4(InterlockedCompareExchangeFloatBitwise, uav, (ix/3)%63 + 1, xchgVal - 2, xchgVal - 1, output); \
+          if (output == xchgVal - 2) { call3(InterlockedExchange, uav, (ix/3)%63 + 1, xchgVal, output);}
+
+        void AtomicTest(uint ix) {
+          float xchgVal = ix;
+          float output = 0;
+
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xchgBuf)
+          XCHG_TEST(STRUCT_CALL3, STRUCT_CALL4, g_strXchgBuf)
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xtexBuf)
+
+          // Special case for ByteAddressBuffers
+          RAW_CALL3(InterlockedCompareStoreFloatBitwise,    g_rawXchgBuf, (ix/3)%63 + 1, 0,           xchgVal - 2);
+          RAW_CALL4(InterlockedCompareExchangeFloatBitwise, g_rawXchgBuf, (ix/3)%63 + 1, xchgVal - 2, xchgVal - 1, output);
+          if (output == xchgVal - 2) { RAW_CALL3(InterlockedExchangeFloat, g_rawXchgBuf, (ix/3)%63 + 1, xchgVal, output);}
+
+          // Check NaN corner case
+          InterlockedCompareExchangeFloatBitwise(g_xchgBuf[0], 0, sqrt(-1), output);
+          if (output == 0.0) InterlockedCompareStoreFloatBitwise(g_xchgBuf[0], sqrt(-1), 0.123);
+
+          InterlockedCompareExchangeFloatBitwise(g_strXchgBuf[0].fltEl[1], 0, sqrt(-1), output);
+          if (output == 0.0) InterlockedCompareStoreFloatBitwise(g_strXchgBuf[0].fltEl[1], sqrt(-1), 0.123);
+
+          g_rawXchgBuf.InterlockedCompareExchangeFloatBitwise(0, 0, sqrt(-1), output);
+          if (output == 0.0) g_rawXchgBuf.InterlockedCompareStoreFloatBitwise(0, sqrt(-1), 0.123);
+
+          InterlockedCompareExchangeFloatBitwise(g_xtexBuf[0], 0, sqrt(-1), output);
+          if (output == 0.0) InterlockedCompareStoreFloatBitwise(g_xtexBuf[0], sqrt(-1), 0.123);
+        }
+
+        void AtomicGroupSharedTest(uint ix) {
+          float xchgVal = ix;
+          float output = 0;
+
+          g_xchgShare[ix%64] = 0;
+          GroupMemoryBarrierWithGroupSync();
+
+          XCHG_TEST(VEC_CALL3, VEC_CALL4, g_xchgShare)
+
+          InterlockedCompareExchangeFloatBitwise(g_xchgShare[0], 0, sqrt(-1), output);
+          if (output == 0.0) InterlockedCompareStoreFloatBitwise(g_xchgShare[0], sqrt(-1), 0.123);
+
+          GroupMemoryBarrierWithGroupSync();
+
+          g_shareXchgBuf[ix%64] = g_xchgShare[ix%64];
+        }
+
+        struct Payload {
+          uint nothing;
+        };
+
+        static float4 g_Verts[6] = {
+          { -1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+
+          { -1.0f, -1.0f, 0.0f, 1.0f },
+          {  1.0f,  1.0f, 0.0f, 1.0f },
+          {  1.0f, -1.0f, 0.0f, 1.0f }};
+
+        static float2 g_UV[6] = {
+          { 0.0f, 0.0f },
+          { 1.0f, 0.0f },
+          { 0.0f, 1.0f },
+
+          { 0.0f, 1.0f },
+          { 1.0f, 0.0f },
+          { 1.0f, 1.0f }};
+
+        [NumThreads(8, 8, 2)]
+        void ASMain(uint ix : SV_GroupIndex) {
+          Payload payload;
+          payload.nothing = 0;
+          AtomicTest(ix);
+          AtomicGroupSharedTest(ix);
+          DispatchMesh(1, 1, 1, payload);
+        }
+
+        [NumThreads(8, 8, 2)]
+        [OutputTopology("triangle")]
+        void MSMain(
+          uint ix : SV_GroupIndex,
+          in payload Payload payload,
+          out vertices PSInput verts[6],
+          out indices uint3 tris[2]) {
+            SetMeshOutputCounts(6, 2);
+            verts[ix].position = g_Verts[ix];
+            verts[ix].uv = g_UV[ix];
+            if (ix % 3)
+              tris[ix / 3] = uint3(ix, ix + 1, ix + 2);
+            AtomicTest(ix);
+            AtomicGroupSharedTest(ix);
+        }
+
+        PSInput VSMain(float3 position : POSITION, float2 uv : TEXCOORD, uint ix : SV_VertexID) {
+          PSInput result;
+          result.position = float4(position, 1.0);
+          result.uv = uv;
+          AtomicTest(64*64 + ix);
+          return result;
+        }
+
+        float4 PSMain(PSInput input) : SV_TARGET {
+          uint ix = uint(input.uv.y*64)*64 + input.uv.x*64;
+          AtomicTest(ix);
+          return 1;
+        }
+
+        [NumThreads(32, 32, 1)]
+        void CSMain(uint ix : SV_GroupIndex) {
+          AtomicTest(ix);
+          AtomicGroupSharedTest(ix);
+        }
+      ]]>
+    </Shader>
+  </ShaderOp>
+
+    <!--
   TODO: Dynamically index into tables
   -->
 </ShaderOpSet>

+ 2 - 0
tools/clang/test/HLSLFileCheck/hlsl/control_flow/attributes/unroll/2d_array.hlsl

@@ -1,4 +1,6 @@
 // RUN: %dxc -Od -E main -T ps_6_0 %s | FileCheck %s
+// RUN: %dxc -Od -E main -T ps_6_6 %s | FileCheck %s
+
 // CHECK: call i32 @dx.op.bufferUpdateCounter
 // CHECK: call i32 @dx.op.bufferUpdateCounter
 // CHECK: call i32 @dx.op.bufferUpdateCounter

+ 1 - 0
tools/clang/test/HLSLFileCheck/hlsl/control_flow/attributes/unroll/extern.hlsl

@@ -1,4 +1,5 @@
 // RUN: %dxc -T lib_6_3 %s | FileCheck %s
+// RUN: %dxc -T lib_6_6 %s | FileCheck %s
 
 // Global array with external linkage does not need constant indexing.
 // Check that the block is not included in the unroll and only happens

+ 13 - 8
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic.hlsl

@@ -1,4 +1,7 @@
-// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -DTYPE=uint -T cs_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -DTYPE=int -T cs_6_0 %s | FileCheck %s
+// RUN: %dxc -E main -DTYPE=uint64_t -T cs_6_6 %s | FileCheck %s
+// RUN: %dxc -E main -DTYPE=int64_t -T cs_6_6 %s | FileCheck %s
 
 // CHECK: atomicrmw add
 // CHECK: atomicrmw add
@@ -20,19 +23,21 @@
 
 RWByteAddressBuffer rawBuf0 : register( u0 );
 
+#define _TOTUPLE(type) type##2
+#define TOTUPLE(type) _TOTUPLE(type)
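+// Two-level macro so TYPE is expanded before ## pastes the '2' suffix (e.g. uint -> uint2).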
+
 struct Foo
 {
   float2 a;
   float3 b;
-  uint   u;
-  int2 c[4];
-  int d[4];
+  TYPE   u;
+  TOTUPLE(TYPE) c[4];
+  TYPE d[4];
 };
 RWStructuredBuffer<Foo> structBuf1 : register( u1 );
-RWTexture2D<uint> rwTex2: register( u2 );
-
+RWTexture2D<TYPE> rwTex2: register( u2 );
 
-groupshared uint shareMem[256];
+groupshared TYPE shareMem[256];
 
 [numthreads( 8, 8, 1 )]
 void main( uint GI : SV_GroupIndex, uint3 DTid : SV_DispatchThreadID )
@@ -40,7 +45,7 @@ void main( uint GI : SV_GroupIndex, uint3 DTid : SV_DispatchThreadID )
     shareMem[GI] = 0;
 
     GroupMemoryBarrierWithGroupSync();
-    uint v;
+    TYPE v;
 
     InterlockedAdd( shareMem[DTid.x], 1 );
     InterlockedAdd( shareMem[DTid.x], 1, v );

+ 193 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpstr_i64_and_i32.hlsl

@@ -0,0 +1,193 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=CHECK
+
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the first arg determines the overload and that the remaining args may be of any type
+
+#ifdef MEMTYPE
+MEMTYPE<uint>     resU;
+MEMTYPE<int>      resI;
+MEMTYPE<uint64_t> resU64;
+MEMTYPE<int64_t>  resI64;
+#else
+groupshared uint     resU[256];
+groupshared int      resI[256];
+groupshared uint64_t resU64[256];
+groupshared int64_t  resI64[256];
+#endif
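+// Without -DMEMTYPE, the first RUN line takes the groupshared path above and is checked with the GSCHECK prefix.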
+
+// TYCHECK: Note: shader requires additional functionality:
+// TYCHECK: 64-bit Atomics on Typed Resources
+// GSCHECK: Note: shader requires additional functionality:
+// GSCHECK: 64-bit Atomics on Group Shared
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  resU[a] = a;
+  resI[a] = a;
+  resU64[a] = a;
+  resI64[a] = a;
+
+  uint uv = b - c;
+  uint uv2 = b + c;
+  int iv = b / c;
+  int iv2 = b * c;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+
+  // test some basic examples
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareStore( resU[a], uv, uv2);
+  InterlockedCompareStore( resI[a], iv, iv2 );
+  InterlockedCompareStore( resU64[a], luv, luv2);
+  InterlockedCompareStore( resI64[a], liv, liv2);
+
+  // test some signed and unsigned variables
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareStore( resU[a], iv, iv2 );
+  InterlockedCompareStore( resU[a], iv, uv2 );
+  InterlockedCompareStore( resU[a], uv, iv2 );
+  InterlockedCompareStore( resI[a], uv, uv2 );
+  InterlockedCompareStore( resI[a], uv, iv2 );
+  InterlockedCompareStore( resI[a], iv, uv2 );
+
+  // test some literals with 32-bit resources
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareStore( resU[a], 1.0, 2.0 );
+  InterlockedCompareStore( resU[a], iv, 2.0 );
+  InterlockedCompareStore( resU[a], 1.0, iv2 );
+  InterlockedCompareStore( resI[a], 1.0, 2.0 );
+  InterlockedCompareStore( resI[a], 1.0, iv2 );
+  InterlockedCompareStore( resI[a], iv, 2.0 );
+
+  // test some basic 64-bit variables
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareStore( resU64[a], liv, liv2 );
+  InterlockedCompareStore( resU64[a], liv, luv2 );
+  InterlockedCompareStore( resU64[a], luv, liv2 );
+  InterlockedCompareStore( resI64[a], luv, luv2 );
+  InterlockedCompareStore( resI64[a], luv, liv2 );
+  InterlockedCompareStore( resI64[a], liv, luv2 );
+
+  // test some literals with 64-bit resources
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  InterlockedCompareStore( resU64[a], 1.0, 2.0 );
+  InterlockedCompareStore( resU64[a], iv, 2.0 );
+  InterlockedCompareStore( resU64[a], 1.0, iv2 );
+  InterlockedCompareStore( resI64[a], 1.0, 2.0 );
+  InterlockedCompareStore( resI64[a], 1.0, iv2 );
+  InterlockedCompareStore( resI64[a], iv, 2.0 );
+
+  // test some mixed 32 and 64-bit variables with 32-bit resources
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareStore( resU[a], luv, luv2 );
+  InterlockedCompareStore( resU[a], luv, uv2 );
+  InterlockedCompareStore( resU[a], uv, luv2 );
+  InterlockedCompareStore( resI[a], liv, liv2 );
+  InterlockedCompareStore( resI[a], liv, iv2 );
+  InterlockedCompareStore( resI[a], iv, liv2 );
+
+  // test some mixed 32 and 64-bit variables with 64-bit resources
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareStore( resU64[a], uv, uv2 );
+  InterlockedCompareStore( resU64[a], uv, luv2 );
+  InterlockedCompareStore( resU64[a], luv, uv2 );
+  InterlockedCompareStore( resI64[a], iv, iv2 );
+  InterlockedCompareStore( resI64[a], iv, liv2 );
+  InterlockedCompareStore( resI64[a], liv, iv2 );
+}

+ 116 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpstr_method_i64_and_i32.hlsl

@@ -0,0 +1,116 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the second and third args determine the overload and that the remaining args may be of any type.
+// When either of these is not a 64-bit integer, fall back to the old 32-bit overload with its casts.
+
+RWByteAddressBuffer res;
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  uint uv = b - c;
+  uint uv2 = b + c;
+  int iv = b / c;
+  int iv2 = b * c;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+  uint ix = 0;
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareStore( ix++, iv, iv2 );
+  res.InterlockedCompareStore( ix++, iv, uv2 );
+  res.InterlockedCompareStore( ix++, uv, iv2 );
+  res.InterlockedCompareStore( ix++, uv, uv2 );
+
+  // test some literals with 32-bit overloads
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareStore( ix++, 1.0, 2.0 );
+  res.InterlockedCompareStore( ix++, iv, 2.0 );
+  res.InterlockedCompareStore( ix++, 1.0, iv2 );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareStore64( ix++, liv, liv2 );
+  res.InterlockedCompareStore64( ix++, liv, luv2 );
+  res.InterlockedCompareStore64( ix++, luv, liv2 );
+  res.InterlockedCompareStore64( ix++, luv, luv2 );
+
+  // Test some literals with 64-bit overloads
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  res.InterlockedCompareStore64( ix++, 1.0, 2.0 );
+  res.InterlockedCompareStore64( ix++, liv, 2.0 );
+  res.InterlockedCompareStore64( ix++, 1.0, luv2 );
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareStore( ix++, luv, luv2 );
+  res.InterlockedCompareStore( ix++, luv, uv2 );
+  res.InterlockedCompareStore( ix++, uv, luv2 );
+  res.InterlockedCompareStore( ix++, liv, liv2 );
+  res.InterlockedCompareStore( ix++, liv, iv2 );
+  res.InterlockedCompareStore( ix++, iv, liv2 );
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareStore( ix++, uv, uv2 );
+  res.InterlockedCompareStore( ix++, uv, luv2 );
+  res.InterlockedCompareStore( ix++, luv, uv2 );
+  res.InterlockedCompareStore( ix++, iv, iv2 );
+  res.InterlockedCompareStore( ix++, iv, liv2 );
+  res.InterlockedCompareStore( ix++, liv, iv2 );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareStore64( ix++, luv, luv2 );
+  res.InterlockedCompareStore64( ix++, luv, uv2 );
+  res.InterlockedCompareStore64( ix++, uv, luv2 );
+  res.InterlockedCompareStore64( ix++, liv, liv2 );
+  res.InterlockedCompareStore64( ix++, liv, iv2 );
+  res.InterlockedCompareStore64( ix++, iv, liv2 );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareStore64( ix++, uv, uv2 );
+  res.InterlockedCompareStore64( ix++, iv, iv2 );
+  res.InterlockedCompareStore64( ix++, iv, liv2 );
+}

+ 215 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpxchg_i64_and_i32.hlsl

@@ -0,0 +1,215 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=CHECK
+
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the first arg determines the overload and that the remaining args may be of any type
+
+#ifdef MEMTYPE
+MEMTYPE<uint>     resU;
+MEMTYPE<int>      resI;
+MEMTYPE<uint64_t> resU64;
+MEMTYPE<int64_t>  resI64;
+#else
+groupshared uint     resU[256];
+groupshared int      resI[256];
+groupshared uint64_t resU64[256];
+groupshared int64_t  resI64[256];
+#endif
+
+// TYCHECK: Note: shader requires additional functionality:
+// TYCHECK: 64-bit Atomics on Typed Resources
+// GSCHECK: Note: shader requires additional functionality:
+// GSCHECK: 64-bit Atomics on Group Shared
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  resU[a] = a;
+  resI[a] = a;
+  resU64[a] = a;
+  resI64[a] = a;
+
+  uint uv = b - c;
+  uint uv2 = b + c;
+  uint ouv = 0;
+  int iv = b / c;
+  int iv2 = b * c;
+  int oiv = 0;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  uint64_t oluv = 0;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+  int64_t oliv = 0;
+
+  // Test basic examples
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareExchange( resU[a], uv, uv2, ouv );
+  InterlockedCompareExchange( resI[a], iv, iv2, oiv );
+  InterlockedCompareExchange( resU64[a], luv, luv2, ouv );
+  InterlockedCompareExchange( resI64[a], liv, liv2, oiv );
+
+  // Test signed and unsigned
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareExchange( resU[a], iv, iv2, oiv );
+  InterlockedCompareExchange( resU[a], iv, uv2, ouv );
+  InterlockedCompareExchange( resU[a], uv, iv2, oiv );
+  InterlockedCompareExchange( resI[a], uv, uv2, ouv );
+  InterlockedCompareExchange( resI[a], uv, iv2, oiv );
+  InterlockedCompareExchange( resI[a], iv, uv2, ouv );
+
+  // Test literals with 32 bit resources
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareExchange( resU[a], 1.0, 2.0, oiv );
+  InterlockedCompareExchange( resU[a], iv, 2.0, ouv );
+  InterlockedCompareExchange( resU[a], 1.0, iv2, oiv );
+  InterlockedCompareExchange( resI[a], 1.0, 2.0, ouv );
+  InterlockedCompareExchange( resI[a], 1.0, iv2, oiv );
+  InterlockedCompareExchange( resI[a], iv, 2.0, ouv );
+
+  // Test basic 64-bit variables
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareExchange( resU64[a], liv, liv2, oliv );
+  InterlockedCompareExchange( resU64[a], liv, luv2, oluv );
+  InterlockedCompareExchange( resU64[a], luv, liv2, oliv );
+  InterlockedCompareExchange( resI64[a], luv, luv2, oluv );
+  InterlockedCompareExchange( resI64[a], luv, liv2, oliv );
+  InterlockedCompareExchange( resI64[a], liv, luv2, oluv );
+
+  // Test some literals with 64-bit resources
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareExchange( resU64[a], 1.0, 2.0, oliv );
+  InterlockedCompareExchange( resU64[a], liv, 2.0, oluv );
+  InterlockedCompareExchange( resU64[a], 1.0, liv2, oliv );
+  InterlockedCompareExchange( resI64[a], 1.0, 2.0, oluv );
+  InterlockedCompareExchange( resI64[a], 1.0, liv2, oliv );
+  InterlockedCompareExchange( resI64[a], liv, 2.0, oluv );
+
+  // test some mixed 32 and 64-bit variables with 32-bit resources
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareExchange( resU[a], luv, luv2, ouv );
+  InterlockedCompareExchange( resU[a], luv, uv2, ouv );
+  InterlockedCompareExchange( resU[a], uv, luv2, ouv );
+  InterlockedCompareExchange( resU[a], uv, uv2, oluv );
+  InterlockedCompareExchange( resI[a], liv, liv2, oiv );
+  InterlockedCompareExchange( resI[a], liv, iv2, oiv );
+  InterlockedCompareExchange( resI[a], iv, liv2, oiv );
+  InterlockedCompareExchange( resI[a], iv, iv2, oliv );
+
+  // Test some mixed 32 and 64-bit variables with 64-bit resources
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // GSCHECK: cmpxchg i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedCompareExchange( resU64[a], uv, uv2, oluv );
+  InterlockedCompareExchange( resU64[a], uv, luv2, oluv );
+  InterlockedCompareExchange( resU64[a], luv, uv2, oluv );
+  InterlockedCompareExchange( resU64[a], luv, luv2, ouv );
+  InterlockedCompareExchange( resI64[a], iv, iv2, oliv );
+  InterlockedCompareExchange( resI64[a], iv, liv2, oliv );
+  InterlockedCompareExchange( resI64[a], liv, iv2, oliv );
+  InterlockedCompareExchange( resI64[a], liv, liv2, oiv );
+}

+ 167 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_cmpxchg_method_i64_and_i32.hlsl

@@ -0,0 +1,167 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+
+// Verify that the second and third args determine the overload and that the remaining args may be of any type.
+// When either of these is not a 64-bit integer, fall back to the old 32-bit overload with its casts.
+
+RWByteAddressBuffer res;
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  uint uv = b - c;
+  uint uv2 = b + c;
+  uint ouv = 0;
+  int iv = b / c;
+  int iv2 = b * c;
+  int oiv = 0;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  uint64_t oluv = 0;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+  int64_t oliv = 0;
+  uint ix = 0;
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareExchange( ix++, uv, uv2, ouv );
+  res.InterlockedCompareExchange( ix++, iv, iv2, oiv );
+  res.InterlockedCompareExchange64( ix++, luv, luv2, ouv );
+  res.InterlockedCompareExchange64( ix++, liv, liv2, oiv );
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareExchange( ix++, iv, iv2, oiv );
+  res.InterlockedCompareExchange( ix++, iv, uv2, ouv );
+  res.InterlockedCompareExchange( ix++, uv, iv2, oiv );
+  res.InterlockedCompareExchange( ix++, uv, uv2, ouv );
+  res.InterlockedCompareExchange( ix++, uv, iv2, oiv );
+  res.InterlockedCompareExchange( ix++, iv, uv2, ouv );
+
+  // test some literals with 32-bit overloads
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareExchange( ix++, 1.0, 2.0, oiv );
+  res.InterlockedCompareExchange( ix++, 1.0, uv2, ouv );
+  res.InterlockedCompareExchange( ix++, uv, 2.0, oiv );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareExchange64( ix++, liv, liv2, oliv );
+  res.InterlockedCompareExchange64( ix++, liv, luv2, oluv );
+  res.InterlockedCompareExchange64( ix++, luv, liv2, oliv );
+  res.InterlockedCompareExchange64( ix++, luv, luv2, oluv );
+  res.InterlockedCompareExchange64( ix++, luv, liv2, oliv );
+  res.InterlockedCompareExchange64( ix++, liv, luv2, oluv );
+
+  // Test some literals with 64-bit overloads
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareExchange64( ix++, 1.0, 2.0, oliv );
+  res.InterlockedCompareExchange64( ix++, liv, 2.0, oluv );
+  res.InterlockedCompareExchange64( ix++, 1.0, liv2, oliv );
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareExchange( ix++, luv, luv2, ouv );
+  res.InterlockedCompareExchange( ix++, luv, uv2, ouv );
+  res.InterlockedCompareExchange( ix++, uv, luv2, ouv );
+  res.InterlockedCompareExchange( ix++, uv, uv2, oluv );
+  res.InterlockedCompareExchange( ix++, liv, liv2, oiv );
+  res.InterlockedCompareExchange( ix++, liv, iv2, oiv );
+  res.InterlockedCompareExchange( ix++, iv, liv2, oiv );
+  res.InterlockedCompareExchange( ix++, iv, iv2, oliv );
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareExchange( ix++, uv, uv2, oluv );
+  res.InterlockedCompareExchange( ix++, uv, luv2, oluv );
+  res.InterlockedCompareExchange( ix++, luv, uv2, oluv );
+  res.InterlockedCompareExchange( ix++, luv, luv2, ouv );
+  res.InterlockedCompareExchange( ix++, iv, iv2, oliv );
+  res.InterlockedCompareExchange( ix++, iv, liv2, oliv );
+  res.InterlockedCompareExchange( ix++, liv, iv2, oliv );
+  res.InterlockedCompareExchange( ix++, liv, liv2, oiv );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareExchange64( ix++, luv, luv2, ouv );
+  res.InterlockedCompareExchange64( ix++, luv, uv2, ouv );
+  res.InterlockedCompareExchange64( ix++, uv, luv2, ouv );
+  res.InterlockedCompareExchange64( ix++, uv, uv2, oluv );
+  res.InterlockedCompareExchange64( ix++, liv, liv2, oiv );
+  res.InterlockedCompareExchange64( ix++, liv, iv2, oiv );
+  res.InterlockedCompareExchange64( ix++, iv, liv2, oiv );
+  res.InterlockedCompareExchange64( ix++, iv, iv2, oliv );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedCompareExchange64( ix++, uv, uv2, oluv );
+  res.InterlockedCompareExchange64( ix++, uv, luv2, oluv );
+  res.InterlockedCompareExchange64( ix++, luv, uv2, oluv );
+  res.InterlockedCompareExchange64( ix++, luv, luv2, ouv );
+  res.InterlockedCompareExchange64( ix++, iv, iv2, oliv );
+  res.InterlockedCompareExchange64( ix++, iv, liv2, oliv );
+  res.InterlockedCompareExchange64( ix++, liv, iv2, oliv );
+  res.InterlockedCompareExchange64( ix++, liv, liv2, oiv );
+}
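A minimal sketch of the rule the RWByteAddressBuffer compare-exchange tests above exercise (not part of the diff; the buffer and function names are hypothetical): the method itself fixes the DXIL overload, and mixed or literal arguments are simply converted to it.

RWByteAddressBuffer buf;                        // hypothetical declaration

void sketch(uint off, uint64_t v64, uint v32)
{
  uint orig32;
  uint64_t orig64;
  // Lowers to @dx.op.atomicCompareExchange.i32 even with 64-bit arguments.
  buf.InterlockedCompareExchange(off, v64, v32, orig32);
  // Lowers to @dx.op.atomicCompareExchange.i64 and is only valid at SM 6.6+.
  buf.InterlockedCompareExchange64(off, v32, v64, orig64);
}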

+ 114 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_float.hlsl

@@ -0,0 +1,114 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=CHECK
+
+#ifdef MEMTYPE
+MEMTYPE<float>     resF;
+MEMTYPE<int>       resI;
+MEMTYPE<uint64_t>  resI64;
+#else
+groupshared float    resF[256];
+groupshared int      resI[256];
+groupshared int64_t  resI64[256];
+#endif
+
+
+float4 main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  float fv = b - c;
+  float fv2 = b + c;
+  float ofv = 0;
+  int iv = b / c;
+  int iv2 = b * c;
+  int oiv = 0;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t lv = bb * cc;
+  uint64_t lv2 = bb / cc;
+  uint64_t olv = 0;
+
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  InterlockedExchange( resF[a], fv, ofv);
+  InterlockedExchange( resI[a], iv, iv2 );
+  InterlockedExchange( resI64[a], lv, lv2);
+
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  InterlockedExchange( resF[a], iv, iv2 );
+  InterlockedExchange( resF[a], fv, iv2 );
+  InterlockedExchange( resF[a], iv, fv2 );
+  InterlockedExchange( resI[a], fv, fv2 );
+  InterlockedExchange( resI64[a], fv, fv2 );
+
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareStoreFloatBitwise( resF[a], fv, fv2);
+
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareStoreFloatBitwise( resF[a], iv, iv2 );
+  InterlockedCompareStoreFloatBitwise( resF[a], fv, iv2 );
+  InterlockedCompareStoreFloatBitwise( resF[a], iv, fv2 );
+
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareExchangeFloatBitwise( resF[a], fv, fv2, ofv);
+
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // GSCHECK: cmpxchg i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  InterlockedCompareExchangeFloatBitwise( resF[a], iv, iv2, ofv );
+  InterlockedCompareExchangeFloatBitwise( resF[a], fv, iv2, ofv );
+  InterlockedCompareExchangeFloatBitwise( resF[a], iv, fv2, ofv );
+
+  // Test literals
+  // GSCHECK: atomicrmw xchg i32 addrspace(3)* {{%?[0-9]*}}, i32 1065353216
+  // GSCHECK: atomicrmw xchg i32 addrspace(3)* {{%?[0-9]*}}, i32 1073741824
+  // CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216)
+  // CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1073741824)
+  InterlockedExchange( resF[a], 1.0, ofv );
+  InterlockedExchange( resF[a], 2, oiv );
+
+
+  // GSCHECK: cmpxchg i32 addrspace(3)* {{%?[0-9]*}}, i32 1065353216, i32 1073741824
+  // GSCHECK: cmpxchg i32 addrspace(3)* {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 1073741824
+  // GSCHECK: cmpxchg i32 addrspace(3)* {{%?[0-9]*}}, i32 1065353216, i32 {{%?[0-9]*}}
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 1073741824)
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[0-9]*}}, i32 1073741824)
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 {{%?[0-9]*}})
+  InterlockedCompareStoreFloatBitwise( resF[a], 1.0, 2.0 );
+  InterlockedCompareStoreFloatBitwise( resF[a], iv, 2 );
+  InterlockedCompareStoreFloatBitwise( resF[a], 1, fv2 );
+
+  // GSCHECK: cmpxchg i32 addrspace(3)* {{%?[0-9]*}}, i32 1065353216, i32 1073741824
+  // GSCHECK: cmpxchg i32 addrspace(3)* {{%?[0-9]*}}, i32 1065353216, i32 {{%?[0-9]*}}
+  // GSCHECK: cmpxchg i32 addrspace(3)* {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 1073741824
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 1073741824)
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 {{%?[0-9]*}})
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[0-9]*}}, i32 1073741824)
+  InterlockedCompareExchangeFloatBitwise( resF[a], 1.0, 2.0, ofv );
+  InterlockedCompareExchangeFloatBitwise( resF[a], 1.0, iv2, oiv );
+  InterlockedCompareExchangeFloatBitwise( resF[a], iv2, 2.0, ofv );
+
+  return ofv;
+}
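For reference, the integer immediates in the CHECK lines of atomic_float.hlsl above are just the bit patterns of the float literals; a small sketch (hypothetical function, not from the diff):

uint float_bits()
{
  // asuint(1.0f) == 1065353216 (0x3F800000) and asuint(2.0f) == 1073741824 (0x40000000),
  // which is why the float atomics above lower to the 32-bit integer atomic ops on these values.
  return asuint(1.0f) ^ asuint(2.0f);
}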

+ 83 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_float_errors.hlsl

@@ -0,0 +1,83 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+// Verify that a non-float first arg is rejected for the float bitwise atomics, whatever the other args are
+
+groupshared int      resGI[256];
+groupshared uint64_t resGI64[256];
+RWBuffer<int>      resBI;
+RWBuffer<uint64_t> resBI64;
+
+RWByteAddressBuffer Rres;
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  resGI[a] = a;
+  resGI64[a] = a;
+  resBI[a] = a;
+  resBI64[a] = a;
+
+  float fv = b - c;
+  float fv2 = b + c;
+  float ofv = 0;
+  int iv = b / c;
+  int iv2 = b * c;
+  int oiv = 0;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t lv = bb * cc;
+  uint64_t lv2 = bb / cc;
+  uint64_t olv = 0;
+
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float &' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'unsigned long long' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'uint64_t' to 'float' for 1st argument
+  InterlockedCompareStoreFloatBitwise( resBI[a], iv, iv2 );
+  InterlockedCompareStoreFloatBitwise( resBI64[a], lv, lv2);
+  InterlockedCompareStoreFloatBitwise( resGI[a], iv, iv2 );
+  InterlockedCompareStoreFloatBitwise( resGI64[a], lv, lv2);
+
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'unsigned long long' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareStoreFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'uint64_t' to 'float' for 1st argument
+  InterlockedCompareStoreFloatBitwise( resBI[a], fv, fv2 );
+  InterlockedCompareStoreFloatBitwise( resBI64[a], fv, fv2 );
+  InterlockedCompareStoreFloatBitwise( resGI[a], fv, fv2 );
+  InterlockedCompareStoreFloatBitwise( resGI64[a], fv, fv2 );
+
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float &' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'unsigned long long' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'uint64_t' to 'float' for 1st argument
+  InterlockedCompareExchangeFloatBitwise( resBI[a], iv, iv2, oiv );
+  InterlockedCompareExchangeFloatBitwise( resBI64[a], lv, lv2, olv);
+  InterlockedCompareExchangeFloatBitwise( resGI[a], iv, iv2, oiv );
+  InterlockedCompareExchangeFloatBitwise( resGI64[a], lv, lv2, olv);
+
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'unsigned long long' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'int' to 'float' for 1st argument
+  // CHECK: error: no matching function for call to 'InterlockedCompareExchangeFloatBitwise'
+  // CHECK: note: candidate function not viable: no known conversion from 'uint64_t' to 'float' for 1st argument
+  InterlockedCompareExchangeFloatBitwise( resBI[a], fv, fv2, ofv );
+  InterlockedCompareExchangeFloatBitwise( resBI64[a], fv, fv2, ofv );
+  InterlockedCompareExchangeFloatBitwise( resGI[a], fv, fv2, ofv );
+  InterlockedCompareExchangeFloatBitwise( resGI64[a], fv, fv2, ofv );
+
+}
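The rule the diagnostics in atomic_float_errors.hlsl pin down, as a short sketch (declarations are hypothetical): the first argument of the *FloatBitwise intrinsics must be a float destination, and integer or 64-bit destinations are rejected regardless of the remaining arguments.

RWBuffer<float> okDest;    // hypothetical
RWBuffer<int>   badDest;   // hypothetical

void sketch(uint i, float cmp, float val)
{
  float orig;
  InterlockedCompareStoreFloatBitwise(okDest[i], cmp, val);            // accepted at SM 6.6
  InterlockedCompareExchangeFloatBitwise(okDest[i], cmp, val, orig);   // accepted at SM 6.6
  // InterlockedCompareStoreFloatBitwise(badDest[i], cmp, val);        // error: 1st argument must be float
}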

+ 105 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_method_float.hlsl

@@ -0,0 +1,105 @@
+// RUN: %dxc -T ps_6_0 %s | FileCheck %s -check-prefix=CHECK
+
+// Test the float exchange and bitwise compare atomic methods on RWByteAddressBuffer
+
+RWByteAddressBuffer res;
+
+float4 main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  int iv = b - c;
+  int iv2 = b + c;
+  float fv = b/c;
+  float fv2 = b*c;
+  float ofv = 0;
+  int oiv = 0;
+  uint ix = 0;
+
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedExchangeFloat( ix++, fv, ofv );
+  res.InterlockedExchangeFloat( ix++, iv, oiv );
+  res.InterlockedExchangeFloat( ix++, iv2, ofv );
+  res.InterlockedExchangeFloat( ix++, fv2, oiv );
+
+  fv = ofv*0.1;
+  fv2 = ofv*2;
+  iv = oiv*0.1;
+  iv2 = oiv*2;
+  int iv3 = oiv*3;
+  int iv4 = oiv*4;
+  float fv3 = ofv*3;
+  float fv4 = ofv*4;
+
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareStoreFloatBitwise( ix++, fv, fv2 );
+  res.InterlockedCompareStoreFloatBitwise( ix++, iv, iv2 );
+  res.InterlockedCompareStoreFloatBitwise( ix++, iv3, fv4 );
+  res.InterlockedCompareStoreFloatBitwise( ix++, fv3, iv4 );
+
+  fv = ofv/6;
+  fv2 = ofv/2;
+  iv = oiv/6;
+  iv2 = oiv/2;
+  iv3 = oiv/3;
+  iv4 = oiv/4;
+  fv3 = ofv/3;
+  fv4 = ofv/4;
+
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  // CHECK: bitcast float
+  // CHECK: bitcast float
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32
+  res.InterlockedCompareExchangeFloatBitwise( ix++, fv, fv2, ofv );
+  res.InterlockedCompareExchangeFloatBitwise( ix++, iv, iv2, oiv );
+  res.InterlockedCompareExchangeFloatBitwise( ix++, iv3, fv4, ofv );
+  res.InterlockedCompareExchangeFloatBitwise( ix++, fv3, iv4, oiv );
+
+  // Test literals
+  // CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216)
+  // CHECK: call i32 @dx.op.atomicBinOp.i32(i32 78, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1073741824)
+  res.InterlockedExchangeFloat( ix++, 1.0, ofv );
+  res.InterlockedExchangeFloat( ix++, 2, oiv );
+
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 1073741824)
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[0-9]*}}, i32 1073741824)
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 {{%?[0-9]*}})
+  res.InterlockedCompareStoreFloatBitwise( ix++, 1.0, 2.0 );
+  res.InterlockedCompareStoreFloatBitwise( ix++, iv, 2 );
+  res.InterlockedCompareStoreFloatBitwise( ix++, 1, fv4 );
+
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 1073741824)
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 1065353216, i32 {{%?[0-9]*}})
+  // CHECK: call i32 @dx.op.atomicCompareExchange.i32(i32 79, %dx.types.Handle {{%?[A-Za-z0-9_]*}}, i32 {{%?[0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[A-Za-z0-9]*}}, i32 {{%?[0-9]*}}, i32 1073741824)
+  res.InterlockedCompareExchangeFloatBitwise( ix++, 1.0, 2.0, ofv );
+  res.InterlockedCompareExchangeFloatBitwise( ix++, 1.0, iv2, oiv );
+  res.InterlockedCompareExchangeFloatBitwise( ix++, iv3, 2.0, ofv );
+  
+
+  return ofv;
+}
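A short sketch of why the RUN line in atomic_method_float.hlsl can target ps_6_0 (names are hypothetical): the float methods reuse the existing 32-bit atomic DXIL ops on the value's bit pattern, so no new opcode is required.

RWByteAddressBuffer buf;   // hypothetical

void sketch(uint off, float v, float cmp)
{
  float orig;
  buf.InterlockedExchangeFloat(off, v, orig);                     // bitcast + @dx.op.atomicBinOp.i32
  buf.InterlockedCompareExchangeFloatBitwise(off, cmp, v, orig);  // bitcast + @dx.op.atomicCompareExchange.i32
}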

+ 74 - 32
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_overload.hlsl

@@ -3,12 +3,15 @@
 // RUN: %dxc -no-warnings -T vs_6_2 -DTYPE=int16_t -enable-16bit-types  %s | %FileCheck %s -check-prefixes=INTFAIL,FLTFAIL
 // RUN: %dxc -no-warnings -T vs_6_2 -DTYPE=uint16_t -enable-16bit-types  %s | %FileCheck %s -check-prefixes=INTFAIL,FLTFAIL
 // RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=bool  %s | %FileCheck %s -check-prefixes=INTFAIL,FLTFAIL
-// RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=int64_t  %s | %FileCheck %s -check-prefixes=INTFAIL,FLTFAIL
-// RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=uint64_t  %s | %FileCheck %s -check-prefixes=INTFAIL,FLTFAIL
+// RUN: %dxilver 1.6 | %dxc -no-warnings -T vs_6_5 -DTYPE=int64_t  %s | %FileCheck %s -check-prefix=VALFAIL
+// RUN: %dxilver 1.6 | %dxc -no-warnings -T vs_6_5 -DTYPE=uint64_t  %s | %FileCheck %s -check-prefix=VALFAIL
+
 
 // RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=float  %s | %FileCheck %s -check-prefixes=INTFAIL,
 // RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=half  %s | %FileCheck %s -check-prefixes=INTFAIL
 
+// RUN: %dxc -no-warnings -T vs_6_6 -DTYPE=int64_t  %s | %FileCheck %s -check-prefixes=INTCHK
+// RUN: %dxc -no-warnings -T vs_6_6 -DTYPE=uint64_t  %s | %FileCheck %s -check-prefixes=INTCHK
 // RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=int  %s | %FileCheck %s -check-prefixes=INTCHK
 // RUN: %dxc -no-warnings -T vs_6_0 -DTYPE=uint  %s | %FileCheck %s -check-prefixes=INTCHK
 
@@ -34,10 +37,15 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedAdd'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw add i32
-  // INTCHK: atomicrmw add i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw add i{{[63][24]}}
+  // INTCHK: atomicrmw add i{{[63][24]}}
   InterlockedAdd(rw_res[0], val);
   InterlockedAdd(rw_res[0], val, orig);
   InterlockedAdd(gs_res, val);
@@ -53,10 +61,15 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedMin'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw {{u?}}min i32
-  // INTCHK: atomicrmw {{u?}}min i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw {{u?}}min i{{[63][24]}}
+  // INTCHK: atomicrmw {{u?}}min i{{[63][24]}}
   InterlockedMin(rw_res[0], val);
   InterlockedMin(rw_res[0], val, orig);
   InterlockedMin(gs_res, val);
@@ -72,10 +85,15 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedMax'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw {{u?}}max i32
-  // INTCHK: atomicrmw {{u?}}max i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw {{u?}}max i{{[63][24]}}
+  // INTCHK: atomicrmw {{u?}}max i{{[63][24]}}
   InterlockedMax(rw_res[0], val);
   InterlockedMax(rw_res[0], val, orig);
   InterlockedMax(gs_res, val);
@@ -91,10 +109,15 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedAnd'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw and i32
-  // INTCHK: atomicrmw and i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw and i{{[63][24]}}
+  // INTCHK: atomicrmw and i{{[63][24]}}
   InterlockedAnd(rw_res[0], val);
   InterlockedAnd(rw_res[0], val, orig);
   InterlockedAnd(gs_res, val);
@@ -110,10 +133,15 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedOr'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw or i32
-  // INTCHK: atomicrmw or i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw or i{{[63][24]}}
+  // INTCHK: atomicrmw or i{{[63][24]}}
   InterlockedOr(rw_res[0], val);
   InterlockedOr(rw_res[0], val, orig);
   InterlockedOr(gs_res, val);
@@ -129,10 +157,15 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedXor'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw xor i32
-  // INTCHK: atomicrmw xor i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw xor i{{[63][24]}}
+  // INTCHK: atomicrmw xor i{{[63][24]}}
   InterlockedXor(rw_res[0], val);
   InterlockedXor(rw_res[0], val, orig);
   InterlockedXor(gs_res, val);
@@ -144,8 +177,11 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedCompareStore'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicCompareExchange.i32
-  // INTCHK: cmpxchg i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicCompareExchange.i{{[63][24]}}
+  // INTCHK: cmpxchg i{{[63][24]}}
   InterlockedCompareStore(rw_res[0], comp, val);
   InterlockedCompareStore(gs_res, comp, val);
 
@@ -155,8 +191,11 @@ float main() :OUT{
   // FLTFAIL: error: no matching function for call to 'InterlockedExchange'
   // FLTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicBinOp.i32
-  // INTCHK: atomicrmw xchg i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicBinOp.i{{[63][24]}}
+  // INTCHK: atomicrmw xchg i{{[63][24]}}
   InterlockedExchange(rw_res[0], val, orig);
   InterlockedExchange(gs_res, val, orig);
 
@@ -166,8 +205,11 @@ float main() :OUT{
   // INTFAIL: error: no matching function for call to 'InterlockedCompareExchange'
   // INTFAIL: note: candidate function not viable: no known conversion from
 
-  // INTCHK: call i32 @dx.op.atomicCompareExchange.i32
-  // INTCHK: cmpxchg i32
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // VALFAIL: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+
+  // INTCHK: call i{{[63][24]}} @dx.op.atomicCompareExchange.i{{[63][24]}}
+  // INTCHK: cmpxchg i{{[63][24]}}
   InterlockedCompareExchange(rw_res[0], comp, val, orig);
   InterlockedCompareExchange(gs_res, comp, val, orig);
 

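Note on the updated checks in atomic_overload.hlsl: the shared INTCHK prefix now covers both the 32-bit and 64-bit runs, so the element width is matched with the regex i{{[63][24]}}, which accepts either i32 or i64 in the DXIL call as well as in the atomicrmw/cmpxchg form.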
+ 74 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_structuredbuf_i64.hlsl

@@ -0,0 +1,74 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+// Test structured buffers with more complicated struct members and 64-bit atomics
+
+// A simple structure with 64-bit integer in the middle of two other members
+struct simple {
+  bool thisVariableIsFalse;
+  uint64_t i;
+  float3x1 longEnding[4];
+};
+
+struct complex {
+  double4 d;
+  simple s;
+  int64_t i;
+  simple ss[3];
+  float2 theEnd;
+};
+
+RWStructuredBuffer<simple> simpBuf;
+RWStructuredBuffer<simple[3]> simpArrBuf;
+RWStructuredBuffer<complex> cplxBuf;
+RWStructuredBuffer<complex[3]> cplxArrBuf;
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  int64_t liv = a + b;
+  int64_t liv2 = 0, liv3 = 0;
+
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  InterlockedAdd( simpBuf[a].i, liv );
+  InterlockedAdd( simpArrBuf[a][b].i, liv );
+  InterlockedAdd( cplxBuf[a].i, liv );
+  InterlockedAdd( cplxBuf[a].s.i, liv );
+  InterlockedAdd( cplxBuf[a].ss[b].i, liv );
+
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  InterlockedExchange( simpBuf[a].i, liv, liv2 );
+  InterlockedExchange( simpArrBuf[a][b].i, liv2, liv );
+  InterlockedExchange( cplxBuf[a].i, liv, liv2 );
+  InterlockedExchange( cplxBuf[a].s.i, liv2, liv );
+  InterlockedExchange( cplxBuf[a].ss[b].i, liv, liv2 );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  InterlockedCompareStore( simpBuf[a].i, liv, liv2 );
+  InterlockedCompareStore( simpArrBuf[a][b].i, liv2, liv );
+  InterlockedCompareStore( cplxBuf[a].i, liv, liv2 );
+  InterlockedCompareStore( cplxBuf[a].s.i, liv2, liv );
+  InterlockedCompareStore( cplxBuf[a].ss[b].i, liv, liv2 );
+
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  // CHECK: call i64 @dx.op.atomicCompareExchange.i64
+  InterlockedCompareExchange( simpBuf[a].i, liv, liv2, liv3 );
+  InterlockedCompareExchange( simpArrBuf[a][b].i, liv2, liv3, liv );
+  InterlockedCompareExchange( cplxBuf[a].i, liv3, liv2, liv );
+  InterlockedCompareExchange( cplxBuf[a].s.i, liv2, liv, liv3 );
+  InterlockedCompareExchange( cplxBuf[a].ss[b].i, liv2, liv3, liv );
+
+}
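A reduced sketch of what atomic_structuredbuf_i64.hlsl relies on (struct and names are hypothetical): the atomic overload is chosen from the type of the addressed member alone, however deeply it is nested in the structured buffer's record.

struct Inner { uint flag; uint64_t counter; };   // hypothetical
RWStructuredBuffer<Inner> buf;                   // hypothetical

void bump(uint i, uint64_t v)
{
  InterlockedAdd(buf[i].counter, v);   // -> @dx.op.atomicBinOp.i64 at SM 6.6
}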

+ 185 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_xchg_i64_and_i32.hlsl

@@ -0,0 +1,185 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -T ps_6_6 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=CHECK
+
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -DMEMTYPE=RWBuffer %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -DMEMTYPE=RWStructuredBuffer %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the first arg determines the overload and the others can be what they will
+
+#ifdef MEMTYPE
+MEMTYPE<uint>     resU;
+MEMTYPE<int>      resI;
+MEMTYPE<uint64_t> resU64;
+MEMTYPE<int64_t>  resI64;
+#else
+groupshared uint     resU[256];
+groupshared int      resI[256];
+groupshared uint64_t resU64[256];
+groupshared int64_t  resI64[256];
+#endif
+
+// TYCHECK: Note: shader requires additional functionality:
+// TYCHECK: 64-bit Atomics on Typed Resources
+// GSCHECK: Note: shader requires additional functionality:
+// GSCHECK: 64-bit Atomics on Group Shared
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  resU[a] = a;
+  resI[a] = a;
+  resU64[a] = a;
+  resI64[a] = a;
+
+  uint uv = b - c;
+  uint uv2 = b + c;
+  int iv = b / c;
+  int iv2 = b * c;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+
+  // Test basic examples
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedExchange( resU[a], uv, uv2);
+  InterlockedExchange( resI[a], iv, iv2 );
+  InterlockedExchange( resU64[a], luv, luv2);
+  InterlockedExchange( resI64[a], liv, liv2);
+
+  // Test signed and unsigned
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  InterlockedExchange( resU[a], iv, iv2 );
+  InterlockedExchange( resU[a], iv, uv2 );
+  InterlockedExchange( resU[a], uv, iv2 );
+  InterlockedExchange( resI[a], uv, uv2 );
+  InterlockedExchange( resI[a], uv, iv2 );
+  InterlockedExchange( resI[a], iv, uv2 );
+
+  // Test literals with 32 bit resources
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  InterlockedExchange( resU[a], 1.0, iv2 );
+  InterlockedExchange( resU[a], 2.0, uv2 );
+  InterlockedExchange( resI[a], 2.0, iv2 );
+  InterlockedExchange( resI[a], 1.0, uv2 );
+
+  // Test basic 64-bit variables
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  InterlockedExchange( resU64[a], liv, liv2 );
+  InterlockedExchange( resU64[a], liv, luv2 );
+  InterlockedExchange( resU64[a], luv, liv2 );
+  InterlockedExchange( resI64[a], luv, luv2 );
+  InterlockedExchange( resI64[a], luv, liv2 );
+  InterlockedExchange( resI64[a], liv, luv2 );
+
+  // Test some literals with 64-bit resources
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  InterlockedExchange( resU64[a], 1.0, liv2 );
+  InterlockedExchange( resU64[a], 2.0, luv2 );
+  InterlockedExchange( resI64[a], 2.5, luv2 );
+  InterlockedExchange( resI64[a], 1.5, liv2 );
+
+  // test some mixed 32 and 64-bit variables with 32-bit resources
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // GSCHECK: atomicrmw xchg i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  InterlockedExchange( resU[a], luv, luv2 );
+  InterlockedExchange( resU[a], luv, uv2 );
+  InterlockedExchange( resU[a], uv, luv2 );
+  InterlockedExchange( resI[a], liv, liv2 );
+  InterlockedExchange( resI[a], liv, iv2 );
+  InterlockedExchange( resI[a], iv, liv2 );
+
+  // Test some mixed 32 and 64-bit variables with 64-bit resources
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // GSCHECK: atomicrmw xchg i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  InterlockedExchange( resU64[a], uv, uv2 );
+  InterlockedExchange( resU64[a], uv, luv2 );
+  InterlockedExchange( resU64[a], luv, uv2 );
+  InterlockedExchange( resI64[a], iv, iv2 );
+  InterlockedExchange( resI64[a], iv, liv2 );
+  InterlockedExchange( resI64[a], liv, iv2 );
+}
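A minimal sketch of the two flag notes checked in atomic_xchg_i64_and_i32.hlsl (declarations are hypothetical): the destination's memory class decides which "requires additional functionality" flag is reported, and its element width decides the overload regardless of the value arguments.

groupshared uint64_t gsCounter;     // hypothetical; use reports "64-bit Atomics on Group Shared"
RWBuffer<uint64_t>   typedCounter;  // hypothetical; use reports "64-bit Atomics on Typed Resources"

void sketch(uint i, uint v32)
{
  uint64_t orig;
  InterlockedExchange(gsCounter, v32, orig);        // i64 overload: the destination is 64-bit
  InterlockedExchange(typedCounter[i], v32, orig);  // i64 overload, same rule
}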

+ 132 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomic_xchg_method_i64_and_i32.hlsl

@@ -0,0 +1,132 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the method name (InterlockedExchange vs InterlockedExchange64) determines the overload and the args can be what they will
+
+RWByteAddressBuffer res;
+
+void main( uint a : A, uint b: B, uint c :C) : SV_Target
+{
+  uint uv = b - c;
+  uint uv2 = b + c;
+  int iv = b / c;
+  int iv2 = b * c;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+  uint ix = 0;
+
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedExchange( ix++, iv, iv2 );
+  res.InterlockedExchange( ix++, iv, uv2 );
+  res.InterlockedExchange( ix++, uv, iv2 );
+  res.InterlockedExchange( ix++, uv, uv2 );
+  res.InterlockedExchange( ix++, uv, iv2 );
+  res.InterlockedExchange( ix++, iv, uv2 );
+
+  // Test some literals with 32-bit overloads
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedExchange( ix++, 1.0, iv2 );
+  res.InterlockedExchange( ix++, 2.0, uv2 );
+
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedExchange64( ix++, liv, liv2 );
+  res.InterlockedExchange64( ix++, liv, luv2 );
+  res.InterlockedExchange64( ix++, luv, liv2 );
+  res.InterlockedExchange64( ix++, luv, luv2 );
+  res.InterlockedExchange64( ix++, luv, liv2 );
+  res.InterlockedExchange64( ix++, liv, luv2 );
+
+  // Test some literals with 64-bit overloads
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedExchange64( ix++, 1.0, liv2 );
+  res.InterlockedExchange64( ix++, 2.0, luv2 );
+
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedExchange( ix++, luv, luv2 );
+  res.InterlockedExchange( ix++, luv, uv2 );
+  res.InterlockedExchange( ix++, uv, luv2 );
+  res.InterlockedExchange( ix++, liv, liv2 );
+  res.InterlockedExchange( ix++, liv, iv2 );
+  res.InterlockedExchange( ix++, iv, liv2 );
+
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedExchange( ix++, uv, uv2 );
+  res.InterlockedExchange( ix++, uv, luv2 );
+  res.InterlockedExchange( ix++, luv, uv2 );
+  res.InterlockedExchange( ix++, iv, iv2 );
+  res.InterlockedExchange( ix++, iv, liv2 );
+  res.InterlockedExchange( ix++, liv, iv2 );
+
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedExchange64( ix++, luv, luv2 );
+  res.InterlockedExchange64( ix++, luv, uv2 );
+  res.InterlockedExchange64( ix++, uv, luv2 );
+  res.InterlockedExchange64( ix++, liv, liv2 );
+  res.InterlockedExchange64( ix++, liv, iv2 );
+  res.InterlockedExchange64( ix++, iv, liv2 );
+
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.InterlockedExchange64( ix++, uv, uv2 );
+  res.InterlockedExchange64( ix++, uv, luv2 );
+  res.InterlockedExchange64( ix++, luv, uv2 );
+  res.InterlockedExchange64( ix++, iv, iv2 );
+  res.InterlockedExchange64( ix++, iv, liv2 );
+  res.InterlockedExchange64( ix++, liv, iv2 );
+}
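Because RWByteAddressBuffer has no element type, the 64-bit path tested in atomic_xchg_method_i64_and_i32.hlsl is opted into by method name rather than by the destination; a small sketch with hypothetical names:

RWByteAddressBuffer buf;   // hypothetical

void sketch(uint off, uint64_t v)
{
  uint o32;
  uint64_t o64;
  buf.InterlockedExchange(off, v, o32);    // i32 op regardless of the argument types
  buf.InterlockedExchange64(off, v, o64);  // i64 op, valid only at SM 6.6+
}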

+ 91 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_float.hlsl

@@ -0,0 +1,91 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+groupshared float   resG[256];
+RWBuffer<float>     resB;
+RWStructuredBuffer<float> resS;
+
+void main( float a : A, int b: B, float c :C) : SV_Target
+{
+  // Test some disallowed atomic binop intrinsics with floats as both args
+
+  // CHECK: error: no matching function for call to 'InterlockedAdd'
+  // CHECK: error: no matching function for call to 'InterlockedMin'
+  // CHECK: error: no matching function for call to 'InterlockedMax'
+  // CHECK: error: no matching function for call to 'InterlockedAnd'
+  // CHECK: error: no matching function for call to 'InterlockedOr'
+  // CHECK: error: no matching function for call to 'InterlockedXor'
+  InterlockedAdd(resG[0], a);
+  InterlockedMin(resG[0], a);
+  InterlockedMax(resG[0], a);
+  InterlockedAnd(resG[0], a);
+  InterlockedOr(resG[0], a);
+  InterlockedXor(resG[0], a);
+
+  // CHECK: error: no matching function for call to 'InterlockedAdd'
+  // CHECK: error: no matching function for call to 'InterlockedMin'
+  // CHECK: error: no matching function for call to 'InterlockedMax'
+  // CHECK: error: no matching function for call to 'InterlockedAnd'
+  // CHECK: error: no matching function for call to 'InterlockedOr'
+  // CHECK: error: no matching function for call to 'InterlockedXor'
+  InterlockedAdd(resB[0], a);
+  InterlockedMin(resB[0], a);
+  InterlockedMax(resB[0], a);
+  InterlockedAnd(resB[0], a);
+  InterlockedOr(resB[0], a);
+  InterlockedXor(resB[0], a);
+
+  // CHECK: error: no matching function for call to 'InterlockedAdd'
+  // CHECK: error: no matching function for call to 'InterlockedMin'
+  // CHECK: error: no matching function for call to 'InterlockedMax'
+  // CHECK: error: no matching function for call to 'InterlockedAnd'
+  // CHECK: error: no matching function for call to 'InterlockedOr'
+  // CHECK: error: no matching function for call to 'InterlockedXor'
+  InterlockedAdd(resS[0], a);
+  InterlockedMin(resS[0], a);
+  InterlockedMax(resS[0], a);
+  InterlockedAnd(resS[0], a);
+  InterlockedOr(resS[0], a);
+  InterlockedXor(resS[0], a);
+
+  // Try the same with an integer second arg to make sure they still fail
+
+  // CHECK: error: no matching function for call to 'InterlockedAdd'
+  // CHECK: error: no matching function for call to 'InterlockedMin'
+  // CHECK: error: no matching function for call to 'InterlockedMax'
+  // CHECK: error: no matching function for call to 'InterlockedAnd'
+  // CHECK: error: no matching function for call to 'InterlockedOr'
+  // CHECK: error: no matching function for call to 'InterlockedXor'
+  InterlockedAdd(resG[0], b);
+  InterlockedMin(resG[0], b);
+  InterlockedMax(resG[0], b);
+  InterlockedAnd(resG[0], b);
+  InterlockedOr(resG[0], b);
+  InterlockedXor(resG[0], b);
+
+  // CHECK: error: no matching function for call to 'InterlockedAdd'
+  // CHECK: error: no matching function for call to 'InterlockedMin'
+  // CHECK: error: no matching function for call to 'InterlockedMax'
+  // CHECK: error: no matching function for call to 'InterlockedAnd'
+  // CHECK: error: no matching function for call to 'InterlockedOr'
+  // CHECK: error: no matching function for call to 'InterlockedXor'
+  InterlockedAdd(resB[0], b);
+  InterlockedMin(resB[0], b);
+  InterlockedMax(resB[0], b);
+  InterlockedAnd(resB[0], b);
+  InterlockedOr(resB[0], b);
+  InterlockedXor(resB[0], b);
+
+  // CHECK: error: no matching function for call to 'InterlockedAdd'
+  // CHECK: error: no matching function for call to 'InterlockedMin'
+  // CHECK: error: no matching function for call to 'InterlockedMax'
+  // CHECK: error: no matching function for call to 'InterlockedAnd'
+  // CHECK: error: no matching function for call to 'InterlockedOr'
+  // CHECK: error: no matching function for call to 'InterlockedXor'
+  InterlockedAdd(resS[0], b);
+  InterlockedMin(resS[0], b);
+  InterlockedMax(resS[0], b);
+  InterlockedAnd(resS[0], b);
+  InterlockedOr(resS[0], b);
+  InterlockedXor(resS[0], b);
+}
+
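For contrast with the rejections in atomicop_float.hlsl, the float forms that do exist are the exchange and bitwise-compare ones; a small sketch (hypothetical names):

RWBuffer<float> resFloat;   // hypothetical

void sketch(uint i, float v, float cmp)
{
  float orig;
  InterlockedExchange(resFloat[i], v, orig);                          // accepted at SM 6.6
  InterlockedCompareExchangeFloatBitwise(resFloat[i], cmp, v, orig);  // accepted at SM 6.6
  // InterlockedAdd(resFloat[i], v);                                  // rejected: no float overload
}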

+ 84 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_i64.hlsl

@@ -0,0 +1,84 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+// A test to verify that 64-bit atomic binary operation intrinsics select the right variant
+
+groupshared int64_t gs[256];
+RWBuffer<int64_t> tb;
+RWStructuredBuffer<int64_t> sb;
+RWByteAddressBuffer rb;
+
+groupshared uint64_t ugs[256];
+RWBuffer<uint64_t> utb;
+RWStructuredBuffer<uint64_t> usb;
+
+void main( uint a : A, uint b: B) : SV_Target
+{
+  uint64_t luv = a * b;
+  int64_t liv = a + b;
+  uint ix = 0;
+
+  // GSCHECK: atomicrmw add i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 0
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 0
+  InterlockedAdd( gs[a], liv );
+  InterlockedAdd( tb[a], liv );
+  InterlockedAdd( sb[a], liv );
+  rb.InterlockedAdd( ix++, liv );
+
+  // GSCHECK: atomicrmw and i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 1
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 1
+  InterlockedAnd( gs[a], liv );
+  InterlockedAnd( tb[a], liv );
+  InterlockedAnd( sb[a], liv );
+  rb.InterlockedAnd( ix++, liv );
+
+  // GSCHECK: atomicrmw or i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 2
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 2
+  InterlockedOr( gs[a], liv );
+  InterlockedOr( tb[a], liv );
+  InterlockedOr( sb[a], liv );
+  rb.InterlockedOr( ix++, liv );
+
+  // GSCHECK: atomicrmw xor i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 3
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 3
+  InterlockedXor( gs[a], liv );
+  InterlockedXor( tb[a], liv );
+  InterlockedXor( sb[a], liv );
+  rb.InterlockedXor( ix++, liv );
+
+  // GSCHECK: atomicrmw min i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 4
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 4
+  InterlockedMin( gs[a], liv );
+  InterlockedMin( tb[a], liv );
+  InterlockedMin( sb[a], liv );
+  rb.InterlockedMin( ix++, liv );
+
+  // GSCHECK: atomicrmw max i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 5
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 5
+  InterlockedMax( gs[a], liv );
+  InterlockedMax( tb[a], liv );
+  InterlockedMax( sb[a], liv );
+  rb.InterlockedMax( ix++, liv );
+
+  // GSCHECK: atomicrmw umin i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 6
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 6
+  InterlockedMin( ugs[a], luv );
+  InterlockedMin( utb[a], luv );
+  InterlockedMin( usb[a], luv );
+  rb.InterlockedMin( ix++, luv );
+
+  // GSCHECK: atomicrmw umax i64 
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 7
+  // CHECK: call i64 @dx.op.atomicBinOp.i64(i32 78, %dx.types.Handle %{{[0-9]*}}, i32 7
+  InterlockedMax( ugs[a], luv );
+  InterlockedMax( utb[a], luv );
+  InterlockedMax( usb[a], luv );
+  rb.InterlockedMax( ix++, luv );
+
+}
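The immediate that follows the handle in the CHECK lines of atomicop_i64.hlsl is the DXIL atomic binop code being pinned: 0 add, 1 and, 2 or, 3 xor, 4 signed min, 5 signed max, 6 unsigned min, 7 unsigned max; the signed/unsigned split follows the destination's element type.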

+ 128 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_i64_and_i32.hlsl

@@ -0,0 +1,128 @@
+// RUN: %dxc -DINTRIN=InterlockedAdd -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -DINTRIN=InterlockedMin -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -DINTRIN=InterlockedMax -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -DINTRIN=InterlockedAnd -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -DINTRIN=InterlockedOr -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+// RUN: %dxc -DINTRIN=InterlockedXor -T ps_6_6 %s | FileCheck %s -check-prefix=GSCHECK
+
+// RUN: %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedAdd -T ps_6_6 %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedMin -T ps_6_6 %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedMax -T ps_6_6 %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedAnd -T ps_6_6 %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedOr -T ps_6_6 %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+// RUN: %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedXor -T ps_6_6 %s | FileCheck %s -check-prefixes=CHECK,TYCHECK
+
+// RUN: %dxc -DMEMTYPE=RWStructuredBuffer -DINTRIN=InterlockedAdd -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+// RUN: %dxc -DMEMTYPE=RWStructuredBuffer -DINTRIN=InterlockedMin -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+// RUN: %dxc -DMEMTYPE=RWStructuredBuffer -DINTRIN=InterlockedMax -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+// RUN: %dxc -DMEMTYPE=RWStructuredBuffer -DINTRIN=InterlockedAnd -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+// RUN: %dxc -DMEMTYPE=RWStructuredBuffer -DINTRIN=InterlockedOr -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+// RUN: %dxc -DMEMTYPE=RWStructuredBuffer -DINTRIN=InterlockedXor -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK
+
+// RUN: %dxilver 1.6 | %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedAdd -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedMin -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedMax -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedAnd -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedOr -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DMEMTYPE=RWBuffer -DINTRIN=InterlockedXor -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedAdd -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedMin -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedMax -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedAnd -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedOr -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedXor -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the first arg determines the overload and the remaining args can be of any type
+
+#ifdef MEMTYPE
+MEMTYPE<uint>     resU;
+MEMTYPE<int>      resI;
+MEMTYPE<uint64_t> resU64;
+MEMTYPE<int64_t>  resI64;
+#else
+groupshared uint     resU[256];
+groupshared int      resI[256];
+groupshared uint64_t resU64[256];
+groupshared int64_t  resI64[256];
+#endif
+
+// TYCHECK: Note: shader requires additional functionality:
+// TYCHECK: 64-bit Atomics on Typed Resources
+// GSCHECK: Note: shader requires additional functionality:
+// GSCHECK: 64-bit Atomics on Group Shared
+
+void main(uint a : A, uint b : B, uint c : C) : SV_Target
+{
+  resU[a] = a;
+  resI[a] = a;
+  resU64[a] = a;
+  resI64[a] = a;
+
+  uint uv = b - c;
+  int iv = b / c;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  int64_t liv = bb + cc;
+
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  INTRIN( resU[a], uv );
+  INTRIN( resI[a], iv );
+  INTRIN( resU64[a], luv );
+  INTRIN( resI64[a], liv );
+
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  INTRIN( resU[a], iv );
+  INTRIN( resI[a], uv );
+  INTRIN( resU64[a], liv );
+  INTRIN( resI64[a], luv );
+
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  INTRIN( resU[a], luv );
+  INTRIN( resI[a], liv );
+  INTRIN( resU64[a], uv );
+  INTRIN( resI64[a], iv );
+
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i32
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // GSCHECK: atomicrmw {{[a-z]*}} i64
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // CHECK: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  INTRIN( resU[a], 1.0 );
+  INTRIN( resI[a], 2.0 );
+  INTRIN( resU64[a], 3.0 );
+  INTRIN( resI64[a], 4.0 );
+}
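To make the rule these tests exercise concrete, here is a minimal usage sketch (illustrative only, not part of this change set; the names and the compute entry point are assumptions): the type of the destination argument alone selects the 64-bit DXIL overload, even when the value operand is 32-bit.

// Sketch: compile with -T cs_6_6. The groupshared destination is 64-bit, so the
// intrinsic lowers to a 64-bit atomic even though 'small' is a 32-bit uint.
groupshared uint64_t gsCounter;        // hypothetical name

[numthreads(64, 1, 1)]
void csmain(uint tid : SV_GroupIndex) {
  uint small = tid;                    // 32-bit value operand
  InterlockedAdd(gsCounter, small);    // emitted as an i64 atomicrmw add
}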

+ 46 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_method_float.hlsl

@@ -0,0 +1,46 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+RWByteAddressBuffer res;
+
+int main(float a : A) : SV_Target
+{
+  // Test atomic binop methods with a float value argument. No f32 variants of
+  // these methods exist; because the destination is a raw buffer, only the
+  // provided value could determine the overload, and since implicit casts are
+  // allowed for the existing methods, these lower to the i32 variants.
+  // Make sure no f32 variants are generated.
+
+  uint ix = 0;
+  int b;
+
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedAdd(ix, a);
+  res.InterlockedMin(ix, a);
+  res.InterlockedMax(ix, a);
+  res.InterlockedAnd(ix, a);
+  res.InterlockedOr(ix, a);
+  res.InterlockedXor(ix, a);
+
+  // Try the same with an integer second arg to make sure they still fail
+
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  // CHECK: call i32 @dx.op.atomicBinOp.i32
+  res.InterlockedAdd(ix, a, b);
+  res.InterlockedMin(ix, a, b);
+  res.InterlockedMax(ix, a, b);
+  res.InterlockedAnd(ix, a, b);
+  res.InterlockedOr(ix, a, b);
+  res.InterlockedXor(ix, a, b);
+
+  // CHECK-NOT: dx.op.atomicBinOp.f32
+  return b;
+}
+
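Restating the comments above as a small sketch (illustrative only; the buffer name is an assumption): RWByteAddressBuffer addresses raw memory, so a float value argument cannot select a floating-point overload; it is implicitly converted and the existing method lowers to the i32 variant. Getting a 64-bit operation on this type goes through the explicit *64 methods exercised by the next test.

// Sketch: compile with -T ps_6_6 or later.
RWByteAddressBuffer buf;               // hypothetical binding

void example(uint offset, float f) {
  buf.InterlockedAdd(offset, f);       // float is converted; lowers to dx.op.atomicBinOp.i32
}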

+ 107 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/atomic/atomicop_method_i64_and_i32.hlsl

@@ -0,0 +1,107 @@
+// RUN: %dxc -DINTRIN=InterlockedAdd -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK32
+// RUN: %dxc -DINTRIN=InterlockedMin -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK32
+// RUN: %dxc -DINTRIN=InterlockedMax -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK32
+// RUN: %dxc -DINTRIN=InterlockedAnd -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK32
+// RUN: %dxc -DINTRIN=InterlockedOr -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK32
+// RUN: %dxc -DINTRIN=InterlockedXor -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK32
+
+// RUN: %dxc -DINTRIN=InterlockedAdd64 -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK64
+// RUN: %dxc -DINTRIN=InterlockedMin64 -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK64
+// RUN: %dxc -DINTRIN=InterlockedMax64 -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK64
+// RUN: %dxc -DINTRIN=InterlockedAnd64 -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK64
+// RUN: %dxc -DINTRIN=InterlockedOr64 -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK64
+// RUN: %dxc -DINTRIN=InterlockedXor64 -T ps_6_6 %s | FileCheck %s -check-prefix=CHECK64
+
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedAdd64 -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedMin64 -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedMax64 -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedAnd64 -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedOr64 -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+// RUN: %dxilver 1.6 | %dxc -DINTRIN=InterlockedXor64 -T ps_6_5 %s | FileCheck %s -check-prefix=ERRCHECK
+
+// Verify that the method variant (with or without the 64 suffix) determines the overload and the value args can be of any type
+
+RWByteAddressBuffer res;
+
+void main(uint a : A, uint b : B, uint c : C) : SV_Target
+{
+  uint uv = b - c;
+  uint uv2 = b + c;
+  int iv = b / c;
+  int iv2 = b * c;
+  uint64_t bb = b;
+  uint64_t cc = c;
+  uint64_t luv = bb * cc;
+  uint64_t luv2 = bb / cc;
+  int64_t liv = bb + cc;
+  int64_t liv2 = bb - cc;
+  uint ix = 0;
+
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.INTRIN( ix++, uv, uv2 );
+  res.INTRIN( ix++, iv, iv2 );
+  res.INTRIN( ix++, luv, luv2 );
+  res.INTRIN( ix++, liv, liv2 );
+
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.INTRIN( ix++, iv, uv2 );
+  res.INTRIN( ix++, uv, iv2 );
+  res.INTRIN( ix++, liv, luv2 );
+  res.INTRIN( ix++, luv, liv2 );
+
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.INTRIN( ix++, uv, luv2 );
+  res.INTRIN( ix++, iv, liv2 );
+  res.INTRIN( ix++, luv, uv2 );
+  res.INTRIN( ix++, liv, iv2 );
+
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK32: call i32 @dx.op.atomicBinOp.i32
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // CHECK64: call i64 @dx.op.atomicBinOp.i64
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  // ERRCHECK: error: opcode '64-bit atomic operations' should only be used in 'Shader Model 6.6+'
+  res.INTRIN( ix++, 1.0, luv2 );
+  res.INTRIN( ix++, 2.0, liv2 );
+  res.INTRIN( ix++, 3.0, uv2 );
+  res.INTRIN( ix++, 4.0, iv2 );
+}
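A usage sketch of the explicitly sized methods this test drives (illustrative only, assuming an SM 6.6 target; the buffer name and helper function are assumptions), including the trailing out parameter that receives the value held before the operation:

// Sketch: 64-bit add on a raw buffer, returning the original value.
RWByteAddressBuffer counters;          // hypothetical binding

uint64_t bump(uint offset, uint64_t delta) {
  uint64_t original;
  counters.InterlockedAdd64(offset, delta, original);  // dx.op.atomicBinOp.i64
  return original;                     // value stored at 'offset' before the add
}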

+ 5 - 2
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/annotateHandle.hlsl

@@ -1,9 +1,12 @@
 // RUN: %dxc -T ps_6_6 %s | %FileCheck %s
 
+// Make sure createHandleFromBinding is generated for SM 6.6.
+// CHECK:call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind zeroinitializer, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+// CHECK:call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 3 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
 // Make sure sampler and texture get correct annotateHandle.
 
-// CHECK-DAG:call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle {{.*}}, i8 0, i8 2, %dx.types.ResourceProperties { i32 457, i32 0 }) ; AnnotateHandle(res,resourceClass,resourceKind,props)  resource: Texture2D<F32>
-// CHECK-DAG:call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle {{.*}}, i8 3, i8 14, %dx.types.ResourceProperties zeroinitializer) ; AnnotateHandle(res,resourceClass,resourceKind,props)  resource: SamplerState
+// CHECK-DAG:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 2, i32 1033 })  ; AnnotateHandle(res,props)  resource: Texture2D<F32>
+// CHECK-DAG:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 14, i32 0 })  ; AnnotateHandle(res,props)  resource: SamplerState
 
 SamplerState samplers : register(s0);
 SamplerState foo() { return samplers; }

+ 3 - 3
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap.hlsl

@@ -1,9 +1,9 @@
 // RUN: %dxc -T ps_6_6 %s | %FileCheck %s
-// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 216
-// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle %{{.*}}, i8 0, i8 10, %dx.types.ResourceProperties { i32 489, i32 0 }) ; AnnotateHandle(res,resourceClass,resourceKind,props)  resource: TypedBuffer<F32>
+// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 %{{.*}}, i1 false, i1 false)  ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
+// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 10, i32 265 })  ; AnnotateHandle(res,props)  resource: TypedBuffer<F32>
 
 uint ID;
 float main(uint i:I): SV_Target {
-  Buffer<float> buf = CreateResourceFromHeap(ID);
+  Buffer<float> buf = ResourceDescriptorHeap[ID];
   return buf[i];
 }

+ 9 - 9
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap2.hlsl

@@ -1,12 +1,12 @@
 // RUN: %dxc -T ps_6_6 %s | %FileCheck %s
 
 // Make sure snorm/unorm and globallycoherent works.
-// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 216
-// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 216
-// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 216
-// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle %{{.*}}, i8 1, i8 10, %dx.types.ResourceProperties { i32 494, i32 0 }) ; AnnotateHandle(res,resourceClass,resourceKind,props)  resource: RWTypedBuffer<UNormF32>
-// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle %{{.*}}, i8 1, i8 10, %dx.types.ResourceProperties { i32 493, i32 2 }) ; AnnotateHandle(res,resourceClass,resourceKind,props)  resource: globallycoherent RWTypedBuffer<SNormF32>
-// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle %{{.*}}, i8 1, i8 10, %dx.types.ResourceProperties { i32 493, i32 2 }) ; AnnotateHandle(res,resourceClass,resourceKind,props)  resource: globallycoherent RWTypedBuffer<SNormF32>
+// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218
+// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218
+// CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218
+// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 4106, i32 270 })  ; AnnotateHandle(res,props)  resource: RWTypedBuffer<UNormF32>
+// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 20490, i32 269 })  ; AnnotateHandle(res,props)  resource: globallycoherent RWTypedBuffer<SNormF32>
+// CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 20490, i32 269 })  ; AnnotateHandle(res,props)  resource: globallycoherent RWTypedBuffer<SNormF32>
 
 
 struct S {
@@ -17,8 +17,8 @@ globallycoherent RWBuffer<snorm float> buf1[2];
 uint ID;
 float main(uint i:I): SV_Target {
   S s;
-  s.buf = CreateResourceFromHeap(ID);
-  s.buf1[0] = CreateResourceFromHeap(ID+1);
-  s.buf1[1] = CreateResourceFromHeap(ID+2);
+  s.buf = ResourceDescriptorHeap[ID];
+  s.buf1[0] = ResourceDescriptorHeap[ID+1];
+  s.buf1[1] = ResourceDescriptorHeap[ID+2];
   return s.buf[i] + s.buf1[0][i] + s.buf1[1][i];
 }

+ 16 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap3.hlsl

@@ -0,0 +1,16 @@
+// RUN: %dxc -T ps_6_6 %s | %FileCheck %s
+
+//CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 0, i1 false, i1 false)  ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
+//CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 0, i1 true, i1 false)  ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
+//CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 2, i32 1033 })  ; AnnotateHandle(res,props)  resource: Texture2D<F32>
+//CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 14, i32 0 })  ; AnnotateHandle(res,props)  resource: SamplerState
+
+
+[RootSignature("RootFlags(CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED|SAMPLER_HEAP_DIRECTLY_INDEXED)")]
+float4 main(float2 c:C) : SV_Target {
+
+  Texture2D t = ResourceDescriptorHeap[0];
+  SamplerState s = SamplerDescriptorHeap[0];
+  return t.Sample(s, c);
+
+}
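A combined sketch (illustrative only; the cbuffer value names are assumptions) showing that, with the same root flags and a 6.6 target, the heap index need not be a literal; any uint, such as a constant buffer value like ID in the earlier createFromHeap tests, works:

// Sketch: dynamic indices into both descriptor heaps.
uint TexIdx;                           // hypothetical cbuffer values
uint SampIdx;

[RootSignature("RootFlags(CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED|SAMPLER_HEAP_DIRECTLY_INDEXED)")]
float4 main(float2 c : C) : SV_Target {
  Texture2D    t = ResourceDescriptorHeap[TexIdx];
  SamplerState s = SamplerDescriptorHeap[SampIdx];
  return t.Sample(s, c);
}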

+ 21 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/createHandleFromHeap/createFromHeap4.hlsl

@@ -0,0 +1,21 @@
+// RUN: %dxc -T ps_6_6 %s | %FileCheck %s
+
+//CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 0, i1 false, i1 false)  ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
+//CHECK:call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 1, i1 false, i1 false)  ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
+//CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 13, i32 4 })  ; AnnotateHandle(res,props)  resource: CBuffer
+//CHECK:call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %{{.*}}, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
+//CHECK:call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %{{.*}}, %dx.types.ResourceProperties { i32 15, i32 4 })  ; AnnotateHandle(res,props)  resource: TBuffer
+//CHECK:call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle %{{.*}}, i32 0, i32 undef)  ; BufferLoad(srv,index,wot)
+struct A {
+  float a;
+};
+
+static ConstantBuffer<A> C = ResourceDescriptorHeap[0];
+
+float4 main(float2 c:C) : SV_Target {
+
+  TextureBuffer<A> T = ResourceDescriptorHeap[1];
+
+  return C.a * T.a;
+
+}

+ 28 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_16.hlsl

@@ -0,0 +1,28 @@
+// RUN: %dxc -T ps_6_6 -enable-16bit-types  %s | FileCheck %s
+
+// CHECK: call i32 @dx.op.pack4x8.i16(i32 220, i8 0, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK: call i32 @dx.op.pack4x8.i16(i32 220, i8 2, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK-NOT: trunc
+// CHECK-NOT: sext
+// CHECK: call %dx.types.fouri16 @dx.op.unpack4x8.i16(i32 219, i8 1, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+// CHECK: call %dx.types.fouri16 @dx.op.unpack4x8.i16(i32 219, i8 1, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+// CHECK-NOT: trunc
+// CHECK-NOT: sext
+// CHECK: call i32 @dx.op.pack4x8.i16(i32 220, i8 0, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK: call i32 @dx.op.pack4x8.i16(i32 220, i8 1, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}}, i16 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK-NOT: trunc
+// CHECK-NOT: sext
+// CHECK: call %dx.types.fouri16 @dx.op.unpack4x8.i16(i32 219, i8 0, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+// CHECK: call %dx.types.fouri16 @dx.op.unpack4x8.i16(i32 219, i8 0, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+
+int16_t4 main(int16_t4 input1 : Inputs1, int16_t4 input2 : Inputs2) : SV_Target {
+  int8_t4_packed ps1 = pack_s8(input1);
+  int8_t4_packed ps2 = pack_clamp_s8(input1);
+  int16_t4 up1_out = unpack_s8s16(ps1) + unpack_s8s16(ps2);
+
+  uint8_t4_packed pu1 = pack_u8(input2);
+  uint8_t4_packed pu2 = pack_clamp_u8(input2);
+  uint16_t4 up2_out = unpack_u8u16(pu1) + unpack_u8u16(pu2);
+
+  return up1_out + up2_out;
+}

+ 28 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_32.hlsl

@@ -0,0 +1,28 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+// CHECK: call i32 @dx.op.pack4x8.i32(i32 220, i8 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK: call i32 @dx.op.pack4x8.i32(i32 220, i8 2, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK-NOT: trunc
+// CHECK-NOT: sext
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 1, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 1, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+// CHECK-NOT: trunc
+// CHECK-NOT: sext
+// CHECK: call i32 @dx.op.pack4x8.i32(i32 220, i8 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK: call i32 @dx.op.pack4x8.i32(i32 220, i8 1, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 %{{[0-9]+}})  ; Pack4x8(packMode,x,y,z,w)
+// CHECK-NOT: trunc
+// CHECK-NOT: sext
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 0, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 0, i32 %{{[0-9]+}})  ; Unpack4x8(unpackMode,pk)
+
+int4 main(int4 input1 : Inputs1, int4 input2 : Inputs2) : SV_Target {
+  int8_t4_packed ps1 = pack_s8(input1);
+  int8_t4_packed ps2 = pack_clamp_s8(input1);
+  int4 up1_out = unpack_s8s32(ps1) + unpack_s8s32(ps2);
+
+  uint8_t4_packed pu1 = pack_u8(input2);
+  uint8_t4_packed pu2 = pack_clamp_u8(input2);
+  uint4 up2_out = unpack_u8u32(pu1) + unpack_u8u32(pu2);
+
+  return up1_out + up2_out;
+}

+ 22 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_error.hlsl

@@ -0,0 +1,22 @@
+// RUN: %dxilver 1.6 | %dxc -T ps_6_5 -enable-16bit-types  %s | FileCheck %s
+
+// CHECK: Opcode Pack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Pack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Unpack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Unpack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Unpack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Unpack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Pack4x8 not valid in shader model ps_6_5
+// CHECK: Opcode Pack4x8 not valid in shader model ps_6_5
+
+int16_t4 main(int4 input1 : Inputs1, int16_t4 input2 : Inputs2) : SV_Target {
+  int8_t4_packed ps1 = pack_s8(input1);
+  int8_t4_packed ps2 = pack_clamp_s8(input1);
+  int16_t4 up1_out = unpack_s8s16(ps1) + unpack_s8s16(ps2);
+
+  uint8_t4_packed pu1 = pack_u8(input2);
+  uint8_t4_packed pu2 = pack_clamp_u8(input2);
+  uint16_t4 up2_out = unpack_u8u16(pu1) + unpack_u8u16(pu2);
+
+  return up1_out + up2_out;
+}

+ 24 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_mix.hlsl

@@ -0,0 +1,24 @@
+// RUN: %dxc -T ps_6_6 -enable-16bit-types %s  | FileCheck %s
+
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 1,
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 0,
+// CHECK: call %dx.types.fouri16 @dx.op.unpack4x8.i16(i32 219, i8 1,
+// CHECK: call %dx.types.fouri16 @dx.op.unpack4x8.i16(i32 219, i8 0,
+// CHECK: call i32 @dx.op.pack4x8.i32(i32 220, i8 2,
+// CHECK: call i32 @dx.op.pack4x8.i32(i32 220, i8 1,
+// CHECK: call i32 @dx.op.pack4x8.i16(i32 220, i8 2,
+// CHECK: call i32 @dx.op.pack4x8.i16(i32 220, i8 1,
+
+int4 main(int8_t4_packed input1 : Inputs1, uint8_t4_packed input2 : Inputs2) : SV_Target {
+  int4 i41 = unpack_s8s32(input1);
+  int4 i42 = unpack_u8u32(input2);
+  int16_t4 i43 = unpack_s8s16(input1);
+  uint16_t4 i44 = unpack_u8u16(input2);
+
+  int8_t4_packed p1 = pack_clamp_s8(i41);
+  uint8_t4_packed p2 = pack_clamp_u8(i42);
+  int8_t4_packed p3 = pack_clamp_s8(i43);
+  uint8_t4_packed p4 = pack_clamp_u8(i44);
+
+  return p1 & p2 & p3 & p4;
+}

+ 15 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_pack_unpack_uint.hlsl

@@ -0,0 +1,15 @@
+// RUN: %dxc -T ps_6_6 -enable-16bit-types  %s | FileCheck %s
+
+//CHECK: error: no matching function for call to 'pack_clamp_s8'
+//CHECK: note: candidate function not viable: no known conversion from 'vector<uint, 4>' to 'vector<int, 4>' for 1st argument
+//CHECK: error: no matching function for call to 'pack_clamp_u8'
+//CHECK: note: candidate function not viable: no known conversion from 'vector<uint16_t, 4>' to 'vector<int, 4>' for 1st argument
+
+int main(uint4 input1 : Inputs1, uint16_t4 input2 : Inputs2) : SV_Target {
+  int8_t4_packed ps1 = pack_s8(input1);
+  int8_t4_packed ps2 = pack_clamp_s8(input1);
+  uint8_t4_packed pu1 = pack_u8(input2);
+  uint8_t4_packed pu2 = pack_clamp_u8(input2);
+
+  return ps1 + ps2 + pu1 + pu2;
+}

+ 24 - 0
tools/clang/test/HLSLFileCheck/hlsl/intrinsics/pack/sm_6_6_packed_type_arg.hlsl

@@ -0,0 +1,24 @@
+// RUN: %dxc -T ps_6_6 %s | FileCheck %s
+
+// CHECK: call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)
+// CHECK: call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 1,
+// CHECK: call %dx.types.fouri32 @dx.op.unpack4x8.i32(i32 219, i8 0,
+// CHECK: add i32 %{{[0-9]+}}, 5
+
+int foo(uint a) {
+  return 1;
+}
+
+int foo(int8_t4_packed a) {
+  return 2;
+}
+
+int foo(uint8_t4_packed a) {
+  return 3;
+}
+
+int main(int8_t4_packed input1 : Inputs1, uint8_t4_packed input2 : Inputs2) : SV_Target {
+  int4 o = unpack_s8s32(input1) + unpack_u8u32(input2);
+  return o.x + foo(input1) + foo(input2);
+}
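The overloads above work because int8_t4_packed and uint8_t4_packed are distinct types; the companion property is that they convert to and from plain uint. A small round-trip sketch (illustrative only; the function name is an assumption):

// Sketch: reinterpret a uint as a packed value, unpack, repack, convert back.
uint roundtrip(uint raw) {
  uint8_t4_packed p = (uint8_t4_packed)raw;  // trivially converted from uint
  uint4 lanes = unpack_u8u32(p);             // expand to four 32-bit lanes
  return (uint)pack_u8(lanes);               // repack and convert back to uint
}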

Some files were not shown because too many files changed in this diff