Kaynağa Gözat

Merge branch 'master' into user/texr/rt-merge-rebase

Tex Riddell 7 yıl önce
ebeveyn
işleme
f8e1af0417
61 değiştirilmiş dosya ile 3139 ekleme ve 361 silme
  1. 61 0
      docs/DXIL.rst
  2. 98 24
      docs/SPIR-V.rst
  3. 1 1
      external/SPIRV-Headers
  4. 1 1
      external/SPIRV-Tools
  5. 1 1
      external/googletest
  6. 1 1
      external/re2
  7. 3 2
      include/dxc/HLSL/DxilConstants.h
  8. 9 0
      lib/HLSL/HLOperationLower.cpp
  9. 42 10
      lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
  10. 148 0
      tools/clang/include/clang/SPIRV/InstBuilder.h
  11. 5 1
      tools/clang/include/clang/SPIRV/ModuleBuilder.h
  12. 16 3
      tools/clang/lib/CodeGen/CGExprAgg.cpp
  13. 50 0
      tools/clang/lib/CodeGen/CGHLSLMS.cpp
  14. 88 20
      tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
  15. 24 8
      tools/clang/lib/SPIRV/DeclResultIdMapper.h
  16. 1031 1
      tools/clang/lib/SPIRV/InstBuilderAuto.cpp
  17. 21 3
      tools/clang/lib/SPIRV/ModuleBuilder.cpp
  18. 203 55
      tools/clang/lib/SPIRV/SPIRVEmitter.cpp
  19. 21 5
      tools/clang/lib/SPIRV/SPIRVEmitter.h
  20. 10 1
      tools/clang/lib/SPIRV/SpirvEvalInfo.h
  21. 274 77
      tools/clang/lib/SPIRV/TypeTranslator.cpp
  22. 24 2
      tools/clang/lib/SPIRV/TypeTranslator.h
  23. 27 0
      tools/clang/test/CodeGenHLSL/quick-test/constant_cast.hlsl
  24. 28 0
      tools/clang/test/CodeGenHLSL/quick-test/flat_addrspacecast.hlsl
  25. 4 4
      tools/clang/test/CodeGenSPIRV/bezier.domain.hlsl2spv
  26. 4 4
      tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv
  27. 0 2
      tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.form.hlsl
  28. 26 0
      tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.type.hlsl
  29. 165 0
      tools/clang/test/CodeGenSPIRV/cast.bitwidth.hlsl
  30. 50 0
      tools/clang/test/CodeGenSPIRV/cast.flat-conversion.literal-initializer.hlsl
  31. 1 1
      tools/clang/test/CodeGenSPIRV/constant.scalar.16bit.enabled.hlsl
  32. 7 1
      tools/clang/test/CodeGenSPIRV/cs.groupshared.hlsl
  33. 34 0
      tools/clang/test/CodeGenSPIRV/op.cbuffer.access.majorness.hlsl
  34. 19 0
      tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-count.hlsl
  35. 19 0
      tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-index.hlsl
  36. 30 0
      tools/clang/test/CodeGenSPIRV/sm6.wave-read-lane-first.hlsl
  37. 27 0
      tools/clang/test/CodeGenSPIRV/sm6.wave.builtin.no-dup.hlsl
  38. 1 1
      tools/clang/test/CodeGenSPIRV/spirv.interface.hs.hlsl
  39. 36 0
      tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std140.hlsl
  40. 39 0
      tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std430.hlsl
  41. 8 5
      tools/clang/test/CodeGenSPIRV/vk.layout.asbuffer.std430.hlsl
  42. 30 26
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.nested.std140.hlsl
  43. 14 11
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.std140.hlsl
  44. 20 0
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpc.hlsl
  45. 31 0
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpr.hlsl
  46. 8 4
      tools/clang/test/CodeGenSPIRV/vk.layout.csbuffer.std430.hlsl
  47. 20 0
      tools/clang/test/CodeGenSPIRV/vk.layout.non-fp-matrix.error.hlsl
  48. 14 3
      tools/clang/test/CodeGenSPIRV/vk.layout.push-constant.std430.hlsl
  49. 34 26
      tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.nested.std430.hlsl
  50. 18 11
      tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.std430.hlsl
  51. 7 3
      tools/clang/test/CodeGenSPIRV/vk.layout.tbuffer.std430.hlsl
  52. 7 3
      tools/clang/test/CodeGenSPIRV/vk.layout.texture-buffer.std430.hlsl
  53. 46 0
      tools/clang/test/CodeGenSPIRV/vk.location.composite.hlsl
  54. 0 0
      tools/clang/test/vk.cloption.invert-y.vs.hlsl
  55. 8 3
      tools/clang/tools/dxcompiler/dxcontainerbuilder.cpp
  56. 83 7
      tools/clang/unittests/HLSL/CompilerTest.cpp
  57. 20 20
      tools/clang/unittests/HLSL/ShaderOpArithTable.xml
  58. 44 2
      tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp
  59. 59 0
      utils/hct/hctdb_inst_docs.txt
  60. 10 6
      utils/hct/hctdb_test.py
  61. 9 2
      utils/hct/hctgettaef.py

+ 61 - 0
docs/DXIL.rst

@@ -1980,6 +1980,67 @@ ExtractValue  extracts from aggregate
 ============= ======================================================================= =================
 
 
+FAdd
+~~~~
+
+%des = fadd float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that "fp32-denorm-mode"="preserve".
+For "fp32-denorm-mode"="ftz" mode, denorm inputs should be treated as corresponding signed zero, and any resulting denorm is also flushed to zero.
+
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+|src0\\src1| -inf     | -F     | -denorm  | -0 | +0 | +denorm   |    +F  | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -inf     | -inf     |   -inf | -inf     |-inf|-inf| -inf      |   -inf | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -F       | -inf     |   -F   | -F       |src0|src0| -F        |   +/-F | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -denorm  | -inf     |   -F   |-F/denorm |src0|src0| +/-denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +denorm  | -inf     |   -F   |+/-denorm |src0|src0| +F/denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +F       | -inf     |  +/-F  | +F       |src0|src0| +F        |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +inf     | NaN      |   +inf | +inf     |+inf|+inf| +inf      |   +inf | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| NaN      | NaN      |   NaN  | NaN      |NaN |NaN | NaN       |   NaN  | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+
+FDiv
+~~~~
+
+%dest = fdiv float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that fast math flag is not used and "fp32-denorm-mode"="preserve".
+When "fp32-denorm-mode"="ftz", denorm inputs should be interpreted as corresponding signed zero, and any resulting denorm is also flushed to zero.
+When fast math is enabled, an implementation may use the reciprocal form: src0*(1/src1).  This may result in evaluating src0*(+/-)INF from src0*(1/(+/-)denorm).  This may produce NaN in some cases or (+/-)INF in others.
+
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| src0\\src1| -inf     | -F     |  -1   | -denorm | -0 | +0 | +denorm |  +1   |    +F  | +inf | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -inf      | NaN      |   +inf | +inf  | +inf    |+inf|-inf| -inf    |  -inf |   -inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -F        | +0       |   +F   | -src0 | +F      |+inf|-inf| -F      |  src0 |   -F   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -denorm   | +0       | +denorm| -src0 | +F      |+inf|-inf| -F      |  src0 |-denorm | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -0        | +0       |   +0   | +0    | 0       |NaN |NaN | 0       |  -0   |   -0   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +0        | -0       |   -0   | -0    | 0       |NaN |NaN | 0       |  +0   |   +0   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +denorm   | -0       | -denorm| -src0 | -F      |-inf|+inf| +F      |  src0 |+denorm | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +F        | -0       |   -F   | -src0 | -F      |-inf|+inf| +F      |  src0 |   +F   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +inf      | NaN      |   -inf | -inf  | -inf    |-inf|+inf| +inf    |  +inf |   +inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| NaN       | NaN      |   NaN  | NaN   | NaN     |NaN |NaN | NaN     |  NaN  |   NaN  | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+
 .. INSTR-RST:END
 
 Operations via external functions

+ 98 - 24
docs/SPIR-V.rst

@@ -286,6 +286,58 @@ interface variables:
   main([[vk::location(N)]] float4 input: A) : B
   { ... }
 
+Legalization, optimization, validation
+--------------------------------------
+
+After initial translation of the HLSL source code, SPIR-V CodeGen will further
+conduct legalization (if needed), optimization (if requested), and validation
+(if not turned off). All these three stages are outsourced to `SPIRV-Tools <https://github.com/KhronosGroup/SPIRV-Tools>`_.
+Here are the options controlling these stages:
+
+* ``-fcgl``: turn off legalization and optimization
+* ``-Od``: turn off optimization
+* ``-Vd``: turn off validation
+
+Legalization
+~~~~~~~~~~~~
+
+HLSL is a fairly permissive language considering the flexibility it provides for
+manipulating resource objects. The developer can create local copies, pass
+them around as function parameters and return values, as long as after certain
+transformations (function inlining, constant evaluation and propagating, dead
+code elimination, etc.), the compiler can remove all temporary copies and
+pinpoint all uses to unique global resource objects.
+
+Resulting from the above property of HLSL, if we translate into SPIR-V for
+Vulkan literally from the input HLSL source code, we will sometimes generate
+illegal SPIR-V. Certain transformations are needed to legalize the literally
+translated SPIR-V. Performing such transformations at the frontend AST level
+is cumbersome or impossible (e.g., function inlining). They are better
+conducted at the SPIR-V level. Therefore, legalization is delegated to SPIRV-Tools.
+
+Specifically, we need to legalize the following HLSL source code patterns:
+
+* Using resource types in struct types
+* Creating aliases of global resource objects
+* Control flows involving the above cases
+
+Legalization transformations will not run unless the above patterns are
+encountered in the source code.
+
+Optimization
+~~~~~~~~~~~~
+
+Optimization is also delegated to SPIRV-Tools. Right now there is no difference
+between optimization levels greater than zero; they will all invoke the same
+optimization recipe. This may change in the future.
+
+Validation
+~~~~~~~~~~
+
+Validation is turned on by default as the last stage of SPIR-V CodeGen. Failing
+validation, which indicates there is a CodeGen bug, will trigger a fatal error.
+Please file an issue if you see that.
+
 HLSL Types
 ==========
 
@@ -307,7 +359,7 @@ type instructions:
``uint``/``dword``/``uint32_t``                        ``OpTypeInt 32 0``
 ``uint16_t``                   ``-enable-16bit-types`` ``OpTypeInt 16 0`` ``Int16``
 ``half``                                               ``OpTypeFloat 32``
-``half``/``float16_t``         ``-enable-16bit-types`` ``OpTypeFloat 16`` ``Float16`` ``SPV_AMD_gpu_shader_half_float``
+``half``/``float16_t``         ``-enable-16bit-types`` ``OpTypeFloat 16``             ``SPV_AMD_gpu_shader_half_float``
 ``float``/``float32_t``                                ``OpTypeFloat 32``
 ``snorm float``                                        ``OpTypeFloat 32``
 ``unorm float``                                        ``OpTypeFloat 32``
@@ -340,8 +392,8 @@ https://github.com/Microsoft/DirectXShaderCompiler/wiki/16-Bit-Scalar-Types
 ``min16int``                           ``OpTypeInt 32 1`` ``RelaxedPrecision``
 ``min12int``                           ``OpTypeInt 32 1`` ``RelaxedPrecision``
 ``min16uint``                          ``OpTypeInt 32 0`` ``RelaxedPrecision``
-``min16float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                      ``Float16``  ``SPV_AMD_gpu_shader_half_float``
-``min10float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                      ``Float16``  ``SPV_AMD_gpu_shader_half_float``
+``min16float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                                   ``SPV_AMD_gpu_shader_half_float``
+``min10float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                                   ``SPV_AMD_gpu_shader_half_float``
 ``min16int``   ``-enable-16bit-types`` ``OpTypeInt 16 1``                      ``Int16``
 ``min12int``   ``-enable-16bit-types`` ``OpTypeInt 16 1``                      ``Int16``
 ``min16uint``  ``-enable-16bit-types`` ``OpTypeInt 16 0``                      ``Int16``
@@ -458,26 +510,28 @@ Textures
 `Texture types <https://msdn.microsoft.com/en-us/library/windows/desktop/bb509700(v=vs.85).aspx>`_
 are translated into SPIR-V ``OpTypeImage``, with parameters:
 
-======================= ========== ===== ======= == ======= ================ =================
-HLSL Texture Type           Dim    Depth Arrayed MS Sampled  Image Format       Capability
-======================= ========== ===== ======= == ======= ================ =================
-``Texture1D``           ``1D``      0       0    0    1     ``Unknown``
-``Texture2D``           ``2D``      0       0    0    1     ``Unknown``
-``Texture3D``           ``3D``      0       0    0    1     ``Unknown``
-``TextureCube``         ``Cube``    0       0    0    1     ``Unknown``
-``Texture1DArray``      ``1D``      0       1    0    1     ``Unknown``
-``Texture2DArray``      ``2D``      0       1    0    1     ``Unknown``
-``Texture2DMS``         ``2D``      0       0    1    1     ``Unknown``
-``Texture2DMSArray``    ``2D``      0       1    1    1     ``Unknown``      ``ImageMSArray``
-``TextureCubeArray``    ``3D``      0       1    0    1     ``Unknown``
-``Buffer<T>``           ``Buffer``  0       0    0    1     Depends on ``T`` ``SampledBuffer``
-``RWBuffer<T>``         ``Buffer``  0       0    0    2     Depends on ``T`` ``SampledBuffer``
-``RWTexture1D<T>``      ``1D``      0       0    0    2     Depends on ``T``
-``RWTexture2D<T>``      ``2D``      0       0    0    2     Depends on ``T``
-``RWTexture3D<T>``      ``3D``      0       0    0    2     Depends on ``T``
-``RWTexture1DArray<T>`` ``1D``      0       1    0    2     Depends on ``T``
-``RWTexture2DArray<T>`` ``2D``      0       1    0    2     Depends on ``T``
-======================= ========== ===== ======= == ======= ================ =================
+======================= ==================== ===== =================== ========== ===== ======= == ======= ================ =================
+       HLSL                   Vulkan                                        SPIR-V
+----------------------- -------------------------- ------------------------------------------------------------------------------------------
+     Texture Type         Descriptor Type    RO/RW    Storage Class        Dim    Depth Arrayed MS Sampled   Image Format      Capability
+======================= ==================== ===== =================== ========== ===== ======= == ======= ================ =================
+``Texture1D``           Sampled Image         RO   ``UniformConstant`` ``1D``      0       0    0    1     ``Unknown``
+``Texture2D``           Sampled Image         RO   ``UniformConstant`` ``2D``      0       0    0    1     ``Unknown``
+``Texture3D``           Sampled Image         RO   ``UniformConstant`` ``3D``      0       0    0    1     ``Unknown``
+``TextureCube``         Sampled Image         RO   ``UniformConstant`` ``Cube``    0       0    0    1     ``Unknown``
+``Texture1DArray``      Sampled Image         RO   ``UniformConstant`` ``1D``      0       1    0    1     ``Unknown``
+``Texture2DArray``      Sampled Image         RO   ``UniformConstant`` ``2D``      0       1    0    1     ``Unknown``
+``Texture2DMS``         Sampled Image         RO   ``UniformConstant`` ``2D``      0       0    1    1     ``Unknown``
+``Texture2DMSArray``    Sampled Image         RO   ``UniformConstant`` ``2D``      0       1    1    1     ``Unknown``      ``ImageMSArray``
+``TextureCubeArray``    Sampled Image         RO   ``UniformConstant`` ``3D``      0       1    0    1     ``Unknown``
+``Buffer<T>``           Uniform Texel Buffer  RO   ``UniformConstant`` ``Buffer``  0       0    0    1     Depends on ``T`` ``SampledBuffer``
+``RWBuffer<T>``         Storage Texel Buffer  RW   ``UniformConstant`` ``Buffer``  0       0    0    2     Depends on ``T`` ``SampledBuffer``
+``RWTexture1D<T>``      Storage Image         RW   ``UniformConstant`` ``1D``      0       0    0    2     Depends on ``T``
+``RWTexture2D<T>``      Storage Image         RW   ``UniformConstant`` ``2D``      0       0    0    2     Depends on ``T``
+``RWTexture3D<T>``      Storage Image         RW   ``UniformConstant`` ``3D``      0       0    0    2     Depends on ``T``
+``RWTexture1DArray<T>`` Storage Image         RW   ``UniformConstant`` ``1D``      0       1    0    2     Depends on ``T``
+``RWTexture2DArray<T>`` Storage Image         RW   ``UniformConstant`` ``2D``      0       1    0    2     Depends on ``T``
+======================= ==================== ===== =================== ========== ===== ======= == ======= ================ =================
 
 The meanings of the headers in the above table is explained in ``OpTypeImage``
 of the SPIR-V spec.
@@ -771,6 +825,8 @@ placed in the ``Uniform`` or ``UniformConstant`` storage class.
 
   - Global variables with ``groupshared`` modifier will be placed in the
     ``Workgroup`` storage class.
+  - Note that this modifier overrules ``static``; if both ``groupshared`` and
+    ``static`` are applied to a variable, ``static`` will be ignored.
 
- ``uniform``
 
@@ -2257,7 +2313,6 @@ element is the height, and the third is the elements.
 The ``OpImageQuerySize`` instruction is used to get a uint3. The first element is the width, the second
 element is the height, and the third element is the depth.
 
-
 HLSL Shader Stages
 ==================
 
@@ -2424,6 +2479,25 @@ behind ``T`` will be flushed before SPIR-V ``OpEmitVertex`` instruction is
 generated. ``.RestartStrip()`` method calls will be translated into the SPIR-V
 ``OpEndPrimitive`` instruction.
 
+Shader Model 6.0 Wave Intrinsics
+================================
+
+Shader Model 6.0 introduces a set of wave operations, which are translated
+according to the following table:
+
+====================== ============================= =========================
+      Intrinsic               SPIR-V BuiltIn                Extension
+====================== ============================= =========================
+``WaveGetLaneCount()`` ``SubgroupSize``              ``SPV_KHR_shader_ballot``
+``WaveGetLaneIndex()`` ``SubgroupLocalInvocationId`` ``SPV_KHR_shader_ballot``
+====================== ============================= =========================
+
+======================= ================================ =========================
+      Intrinsic               SPIR-V Instruction                Extension
+======================= ================================ =========================
+``WaveReadLaneFirst()`` ``OpSubgroupFirstInvocationKHR`` ``SPV_KHR_shader_ballot``
+======================= ================================ =========================
+
 Vulkan Command-line Options
 ===========================
 

+ 1 - 1
external/SPIRV-Headers

@@ -1 +1 @@
-Subproject commit e0282aa7d54631502b4af567a85d3b6565fd5464
+Subproject commit 02ffc719aa9f9c1dce5ce05743fb1afe6cbf17ea

+ 1 - 1
external/SPIRV-Tools

@@ -1 +1 @@
-Subproject commit 6c75050136a2657dac4501ca16d447852fc69e5f
+Subproject commit 03b8a3fe540e72794646195fe261a679203c13ac

+ 1 - 1
external/googletest

@@ -1 +1 @@
-Subproject commit a5014476f0c49c966e4ac602469cddefc7ed486d
+Subproject commit 703b4a85a21e394252560a89cc856b384b48c286

+ 1 - 1
external/re2

@@ -1 +1 @@
-Subproject commit 715f0dcaafbeda5d9fef58194d9ce256f0317ecf
+Subproject commit c1ed8543f1b703ce200212bb5629ba69a2f9b63a

+ 3 - 2
include/dxc/HLSL/DxilConstants.h

@@ -56,8 +56,9 @@ namespace DXIL {
   const unsigned kMaxStructBufferStride = 2048;
   const unsigned kMaxHSOutputControlPointsTotalScalars = 3968;
   const unsigned kMaxHSOutputPatchConstantTotalScalars = 32*4;
-  const unsigned kMaxOutputTotalScalars = 32*4;
-  const unsigned kMaxInputTotalScalars = 32*4;
+  const unsigned kMaxSignatureTotalVectors = 32;
+  const unsigned kMaxOutputTotalScalars = kMaxSignatureTotalVectors * 4;
+  const unsigned kMaxInputTotalScalars = kMaxSignatureTotalVectors * 4;
   const unsigned kMaxClipOrCullDistanceElementCount = 2;
   const unsigned kMaxClipOrCullDistanceCount = 2 * 4;
   const unsigned kMaxGSOutputVertexCount = 1024;

+ 9 - 0
lib/HLSL/HLOperationLower.cpp

@@ -5409,6 +5409,15 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
 
     ldInst->replaceAllUsesWith(newLd);
     ldInst->eraseFromParent();
+  } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(user)) {
+    for (auto it = BCI->user_begin(); it != BCI->user_end(); ) {
+      Instruction *I = cast<Instruction>(*it++);
+      TranslateCBAddressUserLegacy(I,
+                                   handle, legacyIdx, channelOffset, hlslOP,
+                                   prevFieldAnnotation, dxilTypeSys,
+                                   DL, pObjHelper);
+    }
+    BCI->eraseFromParent();
   } else {
     // Must be GEP here
     GetElementPtrInst *GEP = cast<GetElementPtrInst>(user);

+ 42 - 10
lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp

@@ -105,6 +105,7 @@ private:
 
   void RewriteForConstExpr(ConstantExpr *user, IRBuilder<> &Builder);
   void RewriteForGEP(GEPOperator *GEP, IRBuilder<> &Builder);
+  void RewriteForAddrSpaceCast(ConstantExpr *user, IRBuilder<> &Builder);
   void RewriteForLoad(LoadInst *loadInst);
   void RewriteForStore(StoreInst *storeInst);
   void RewriteMemIntrin(MemIntrinsic *MI, Value *OldV);
@@ -3158,6 +3159,22 @@ void SROA_Helper::RewriteCall(CallInst *CI) {
   }
 }
 
+/// RewriteForAddrSpaceCast - Rewrite the AddrSpaceCast which is ConstantExpr.
+void SROA_Helper::RewriteForAddrSpaceCast(ConstantExpr *CE,
+                                          IRBuilder<> &Builder) {
+  SmallVector<Value *, 8> NewCasts;
+  // create new AddrSpaceCast.
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    Value *NewGEP = Builder.CreateAddrSpaceCast(
+        NewElts[i],
+        PointerType::get(NewElts[i]->getType()->getPointerElementType(),
+                         CE->getType()->getPointerAddressSpace()));
+    NewCasts.emplace_back(NewGEP);
+  }
+  SROA_Helper helper(CE, NewCasts, DeadInsts);
+  helper.RewriteForScalarRepl(CE, Builder);
+}
+
 /// RewriteForConstExpr - Rewrite the GEP which is ConstantExpr.
 void SROA_Helper::RewriteForConstExpr(ConstantExpr *CE, IRBuilder<> &Builder) {
   if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
@@ -3167,17 +3184,26 @@ void SROA_Helper::RewriteForConstExpr(ConstantExpr *CE, IRBuilder<> &Builder) {
       return;
     }
   }
+  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+    if (OldVal == CE->getOperand(0)) {
+      // Flatten AddrSpaceCast.
+      RewriteForAddrSpaceCast(CE, Builder);
+      return;
+    }
+  }
   // Skip unused CE. 
   if (CE->use_empty())
     return;
 
-  Instruction *constInst = CE->getAsInstruction();
-  Builder.Insert(constInst);
-  // Replace CE with constInst.
   for (Value::use_iterator UI = CE->use_begin(), E = CE->use_end(); UI != E;) {
     Use &TheUse = *UI++;
-    if (isa<Instruction>(TheUse.getUser()))
-      TheUse.set(constInst);
+    if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+      IRBuilder<> tmpBuilder(I);
+      // Replace CE with constInst.
+      Instruction *tmpInst = CE->getAsInstruction();
+      tmpBuilder.Insert(tmpInst);
+      TheUse.set(tmpInst);
+    }
     else {
       RewriteForConstExpr(cast<ConstantExpr>(TheUse.getUser()), Builder);
     }
@@ -3788,17 +3814,23 @@ static void ReplaceUnboundedArrayUses(Value *V, Value *Src, IRBuilder<> &Builder
 }
 
 static void ReplaceMemcpy(Value *V, Value *Src, MemCpyInst *MC) {
+  Type *TyV = V->getType()->getPointerElementType();
+  Type *TySrc = Src->getType()->getPointerElementType();
   if (Constant *C = dyn_cast<Constant>(V)) {
-    if (isa<Constant>(Src)) {
-      V->replaceAllUsesWith(Src);
+    if (TyV == TySrc) {
+      if (isa<Constant>(Src)) {
+        V->replaceAllUsesWith(Src);
+      } else {
+        // Replace Constant with a non-Constant.
+        IRBuilder<> Builder(MC);
+        ReplaceConstantWithInst(C, Src, Builder);
+      }
     } else {
-      // Replace Constant with a non-Constant.
       IRBuilder<> Builder(MC);
+      Src = Builder.CreateBitCast(Src, V->getType());
       ReplaceConstantWithInst(C, Src, Builder);
     }
   } else {
-    Type* TyV = V->getType()->getPointerElementType();
-    Type* TySrc = Src->getType()->getPointerElementType();
     if (TyV == TySrc) {
       if (V != Src)
         V->replaceAllUsesWith(Src);

+ 148 - 0
tools/clang/include/clang/SPIRV/InstBuilder.h

@@ -806,6 +806,149 @@ public:
   InstBuilder &opModuleProcessed(std::string process);
   InstBuilder &opExecutionModeId(uint32_t entry_point, spv::ExecutionMode mode);
   InstBuilder &opDecorateId(uint32_t target, spv::Decoration decoration);
+  InstBuilder &opGroupNonUniformElect(uint32_t result_type, uint32_t result_id,
+                                      uint32_t execution);
+  InstBuilder &opGroupNonUniformAll(uint32_t result_type, uint32_t result_id,
+                                    uint32_t execution, uint32_t predicate);
+  InstBuilder &opGroupNonUniformAny(uint32_t result_type, uint32_t result_id,
+                                    uint32_t execution, uint32_t predicate);
+  InstBuilder &opGroupNonUniformAllEqual(uint32_t result_type,
+                                         uint32_t result_id, uint32_t execution,
+                                         uint32_t value);
+  InstBuilder &opGroupNonUniformBroadcast(uint32_t result_type,
+                                          uint32_t result_id,
+                                          uint32_t execution, uint32_t value,
+                                          uint32_t id);
+  InstBuilder &opGroupNonUniformBroadcastFirst(uint32_t result_type,
+                                               uint32_t result_id,
+                                               uint32_t execution,
+                                               uint32_t value);
+  InstBuilder &opGroupNonUniformBallot(uint32_t result_type, uint32_t result_id,
+                                       uint32_t execution, uint32_t predicate);
+  InstBuilder &opGroupNonUniformInverseBallot(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value);
+  InstBuilder &opGroupNonUniformBallotBitExtract(uint32_t result_type,
+                                                 uint32_t result_id,
+                                                 uint32_t execution,
+                                                 uint32_t value,
+                                                 uint32_t index);
+  InstBuilder &opGroupNonUniformBallotBitCount(uint32_t result_type,
+                                               uint32_t result_id,
+                                               uint32_t execution,
+                                               spv::GroupOperation operation,
+                                               uint32_t value);
+  InstBuilder &opGroupNonUniformBallotFindLSB(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value);
+  InstBuilder &opGroupNonUniformBallotFindMSB(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value);
+  InstBuilder &opGroupNonUniformShuffle(uint32_t result_type,
+                                        uint32_t result_id, uint32_t execution,
+                                        uint32_t value, uint32_t id);
+  InstBuilder &opGroupNonUniformShuffleXor(uint32_t result_type,
+                                           uint32_t result_id,
+                                           uint32_t execution, uint32_t value,
+                                           uint32_t mask);
+  InstBuilder &opGroupNonUniformShuffleUp(uint32_t result_type,
+                                          uint32_t result_id,
+                                          uint32_t execution, uint32_t value,
+                                          uint32_t delta);
+  InstBuilder &opGroupNonUniformShuffleDown(uint32_t result_type,
+                                            uint32_t result_id,
+                                            uint32_t execution, uint32_t value,
+                                            uint32_t delta);
+  InstBuilder &opGroupNonUniformIAdd(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFAdd(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformIMul(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFMul(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformSMin(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformUMin(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFMin(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformSMax(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformUMax(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFMax(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformBitwiseAnd(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformBitwiseOr(uint32_t result_type, uint32_t result_id,
+                             uint32_t execution, spv::GroupOperation operation,
+                             uint32_t value,
+                             llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformBitwiseXor(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformLogicalAnd(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformLogicalOr(uint32_t result_type, uint32_t result_id,
+                             uint32_t execution, spv::GroupOperation operation,
+                             uint32_t value,
+                             llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformLogicalXor(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformQuadBroadcast(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value, uint32_t index);
+  InstBuilder &opGroupNonUniformQuadSwap(uint32_t result_type,
+                                         uint32_t result_id, uint32_t execution,
+                                         uint32_t value, uint32_t direction);
   InstBuilder &opSubgroupBallotKHR(uint32_t result_type, uint32_t result_id,
                                    uint32_t predicate);
   InstBuilder &opSubgroupFirstInvocationKHR(uint32_t result_type,
@@ -876,6 +1019,11 @@ public:
   InstBuilder &opSubgroupImageBlockWriteINTEL(uint32_t image,
                                               uint32_t coordinate,
                                               uint32_t data);
+  InstBuilder &opDecorateStringGOOGLE(uint32_t target,
+                                      spv::Decoration decoration);
+  InstBuilder &opMemberDecorateStringGOOGLE(uint32_t struct_type,
+                                            uint32_t member,
+                                            spv::Decoration decoration);
 
   // All-in-one methods for creating unary and binary operations.
   InstBuilder &unaryOp(spv::Op op, uint32_t result_type, uint32_t result_id,

+ 5 - 1
tools/clang/include/clang/SPIRV/ModuleBuilder.h

@@ -303,6 +303,9 @@ public:
   /// \brief Creates an OpEndPrimitive instruction.
   void createEndPrimitive();
 
+  /// \brief Creates an OpSubgroupFirstInvocationKHR instruciton.
+  uint32_t createSubgroupFirstInvocation(uint32_t resultType, uint32_t value);
+
   // === SPIR-V Module Structure ===
 
   inline void requireCapability(spv::Capability);
@@ -384,7 +387,8 @@ public:
   uint32_t getFloat32Type();
   uint32_t getFloat64Type();
   uint32_t getVecType(uint32_t elemType, uint32_t elemCount);
-  uint32_t getMatType(QualType elemType, uint32_t colType, uint32_t colCount);
+  uint32_t getMatType(QualType elemType, uint32_t colType, uint32_t colCount,
+                      Type::DecorationSet decorations = {});
   uint32_t getPointerType(uint32_t pointeeType, spv::StorageClass);
   uint32_t getStructType(llvm::ArrayRef<uint32_t> fieldTypes,
                          llvm::StringRef structName = "",

+ 16 - 3
tools/clang/lib/CodeGen/CGExprAgg.cpp

@@ -726,9 +726,22 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
       Expr *Src = E->getSubExpr();
       switch (CGF.getEvaluationKind(Ty)) {
       case TEK_Aggregate: {
-        LValue LV = CGF.EmitAggExprToLValue(Src);
-        CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionAggregateCopy(
-            CGF, LV.getAddress(), Src->getType(), DestPtr, E->getType());
+        if (CastExpr *SrcCast = dyn_cast<CastExpr>(Src)) {
+          if (SrcCast->getCastKind() == CK_LValueToRValue) {
+            // Skip the lval to rval cast to reach decl.
+            Src = SrcCast->getSubExpr();
+          }
+        }
+        // Just use decl if possible to skip useless copy.
+        if (DeclRefExpr *SrcDecl = dyn_cast<DeclRefExpr>(Src)) {
+          LValue LV = CGF.EmitLValue(SrcDecl);
+          CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionAggregateCopy(
+              CGF, LV.getAddress(), Src->getType(), DestPtr, E->getType());
+        } else {
+          LValue LV = CGF.EmitAggExprToLValue(Src);
+          CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionAggregateCopy(
+              CGF, LV.getAddress(), Src->getType(), DestPtr, E->getType());
+        }
       } break;
       case TEK_Scalar: {
         llvm::Value *SrcVal = CGF.EmitScalarExpr(Src);

+ 50 - 0
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -2559,6 +2559,11 @@ void CGMSHLSLRuntime::AddConstant(VarDecl *constDecl, HLCBuffer &CB) {
     // For static inside cbuffer, take as global static.
     // Don't add to cbuffer.
     CGM.EmitGlobal(constDecl);
+    // Add type annotation for static global types.
+    // May need it when cast from cbuf.
+    DxilTypeSystem &dxilTypeSys = m_pHLModule->GetTypeSystem();
+    unsigned arraySize = 0;
+    AddTypeAnnotation(constDecl->getType(), dxilTypeSys, arraySize);
     return;
   }
   // Search defined structure for resource objects and fail
@@ -6144,6 +6149,43 @@ void CGMSHLSLRuntime::EmitHLSLAggregateCopy(CodeGenFunction &CGF, llvm::Value *S
     SmallVector<Value *, 4> idxList;
     EmitHLSLAggregateCopy(CGF, SrcPtr, DestPtr, idxList, Ty, Ty, SrcPtr->getType());
 }
+// To memcpy, need element type match.
+// For struct type, the layout should match in cbuffer layout.
+// struct { float2 x; float3 y; } will not match struct { float3 x; float2 y; }.
+// struct { float2 x; float3 y; } will not match array of float.
+static bool IsTypeMatchForMemcpy(llvm::Type *SrcTy, llvm::Type *DestTy) {
+  llvm::Type *SrcEltTy = dxilutil::GetArrayEltTy(SrcTy);
+  llvm::Type *DestEltTy = dxilutil::GetArrayEltTy(DestTy);
+  if (SrcEltTy == DestEltTy)
+    return true;
+
+  llvm::StructType *SrcST = dyn_cast<llvm::StructType>(SrcEltTy);
+  llvm::StructType *DestST = dyn_cast<llvm::StructType>(DestEltTy);
+  if (SrcST && DestST) {
+    // Only allow identical struct.
+    return SrcST->isLayoutIdentical(DestST);
+  } else if (!SrcST && !DestST) {
+    // For basic type, if one is array, one is not array, layout is different.
+    // If both array, type mismatch. If both basic, copy should be fine.
+    // So all return false.
+    return false;
+  } else {
+    // One struct, one basic type.
+    // Make sure all struct element match the basic type and basic type is
+    // vector4.
+    llvm::StructType *ST = SrcST ? SrcST : DestST;
+    llvm::Type *Ty = SrcST ? DestEltTy : SrcEltTy;
+    if (!Ty->isVectorTy())
+      return false;
+    if (Ty->getVectorNumElements() != 4)
+      return false;
+    for (llvm::Type *EltTy : ST->elements()) {
+      if (EltTy != Ty)
+        return false;
+    }
+    return true;
+  }
+}
 
 void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF, llvm::Value *SrcPtr,
     clang::QualType SrcTy,
@@ -6162,6 +6204,14 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF,
     unsigned sizeDest = TheModule.getDataLayout().getTypeAllocSize(DestPtrTy);
     CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, std::max(sizeSrc, sizeDest), 1);
     return;
+  } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(DestPtr)) {
+    if (GV->isInternalLinkage(GV->getLinkage()) &&
+        IsTypeMatchForMemcpy(SrcPtrTy, DestPtrTy)) {
+      unsigned sizeSrc = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);
+      unsigned sizeDest = TheModule.getDataLayout().getTypeAllocSize(DestPtrTy);
+      CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, std::min(sizeSrc, sizeDest), 1);
+      return;
+    }
   }
 
   // It is possiable to implement EmitHLSLAggregateCopy, EmitHLSLAggregateStore

+ 88 - 20
tools/clang/lib/SPIRV/DeclResultIdMapper.cpp

@@ -304,15 +304,13 @@ SpirvEvalInfo DeclResultIdMapper::getDeclEvalInfo(const ValueDecl *decl,
           cast<VarDecl>(decl)->getType(),
           // We need to set decorateLayout here to avoid creating SPIR-V
           // instructions for the current type without decorations.
-          info->info.getLayoutRule(), info->isRowMajor);
+          info->info.getLayoutRule(), info->info.isRowMajor());
 
       const uint32_t elemId = theBuilder.createAccessChain(
           theBuilder.getPointerType(varType, info->info.getStorageClass()),
           info->info, {theBuilder.getConstantInt32(info->indexInCTBuffer)});
 
-      return SpirvEvalInfo(elemId)
-          .setStorageClass(info->info.getStorageClass())
-          .setLayoutRule(info->info.getLayoutRule());
+      return info->info.substResultId(elemId);
     } else {
       return *info;
     }
@@ -383,8 +381,8 @@ uint32_t DeclResultIdMapper::createFileVar(const VarDecl *var,
 uint32_t DeclResultIdMapper::createExternVar(const VarDecl *var) {
   auto storageClass = spv::StorageClass::UniformConstant;
   auto rule = LayoutRule::Void;
-  bool isMatType = false;     // Whether this var is of matrix type
-  bool isACRWSBuffer = false; // Whether its {Append|Consume|RW}StructuredBuffer
+  bool isMatType = false;     // Whether is matrix that needs struct wrap
+  bool isACRWSBuffer = false; // Whether is {Append|Consume|RW}StructuredBuffer
 
   if (var->getAttr<HLSLGroupSharedAttr>()) {
     // For CS groupshared variables
@@ -432,11 +430,18 @@ uint32_t DeclResultIdMapper::createExternVar(const VarDecl *var) {
   astDecls[var] =
       SpirvEvalInfo(id).setStorageClass(storageClass).setLayoutRule(rule);
   if (isMatType) {
+    astDecls[var].info.setRowMajor(
+        typeTranslator.isRowMajorMatrix(var->getType(), var));
+
     // We have wrapped the stand-alone matrix inside a struct. Mark it as
     // needing an extra index to access.
     astDecls[var].indexInCTBuffer = 0;
   }
 
+  // Variables in Workgroup do not need descriptor decorations.
+  if (storageClass == spv::StorageClass::Workgroup)
+    return id;
+
   const auto *regAttr = getResourceBinding(var);
   const auto *bindingAttr = var->getAttr<VKBindingAttr>();
   const auto *counterBindingAttr = var->getAttr<VKCounterBindingAttr>();
@@ -573,12 +578,13 @@ uint32_t DeclResultIdMapper::createCTBuffer(const HLSLBufferDecl *decl) {
     const auto *varDecl = cast<VarDecl>(subDecl);
     const bool isRowMajor =
         typeTranslator.isRowMajorMatrix(varDecl->getType(), varDecl);
-    astDecls[varDecl] = {SpirvEvalInfo(bufferVar)
-                             .setStorageClass(spv::StorageClass::Uniform)
-                             .setLayoutRule(decl->isCBuffer()
-                                                ? LayoutRule::GLSLStd140
-                                                : LayoutRule::GLSLStd430),
-                         index++, isRowMajor};
+    astDecls[varDecl] =
+        SpirvEvalInfo(bufferVar)
+            .setStorageClass(spv::StorageClass::Uniform)
+            .setLayoutRule(decl->isCBuffer() ? LayoutRule::GLSLStd140
+                                             : LayoutRule::GLSLStd430)
+            .setRowMajor(isRowMajor);
+    astDecls[varDecl].indexInCTBuffer = index++;
   }
   resourceVars.emplace_back(
       bufferVar, ResourceVar::Category::Other, getResourceBinding(decl),
@@ -793,12 +799,19 @@ public:
   /// Uses the given location.
   void useLoc(uint32_t loc) { usedLocs.set(loc); }
 
-  /// Uses the next available location.
-  uint32_t useNextLoc() {
+  /// Uses the next |count| available location.
+  int useNextLocs(uint32_t count) {
     while (usedLocs[nextLoc])
       nextLoc++;
-    usedLocs.set(nextLoc);
-    return nextLoc++;
+
+    int toUse = nextLoc;
+
+    for (uint32_t i = 0; i < count; ++i) {
+      assert(!usedLocs[nextLoc]);
+      usedLocs.set(nextLoc++);
+    }
+
+    return toUse;
   }
 
   /// Returns true if the given location number is already used.
@@ -976,7 +989,8 @@ bool DeclResultIdMapper::finalizeStageIOLocations(bool forInput) {
   }
 
   for (const auto *var : vars)
-    theBuilder.decorateLocation(var->getSpirvId(), locSet.useNextLoc());
+    theBuilder.decorateLocation(var->getSpirvId(),
+                                locSet.useNextLocs(var->getLocationCount()));
 
   return true;
 }
@@ -1257,9 +1271,11 @@ bool DeclResultIdMapper::createStageVars(const hlsl::SigPoint *sigPoint,
       typeId = theBuilder.getArrayType(typeId,
                                        theBuilder.getConstantUint32(arraySize));
 
-    StageVar stageVar(sigPoint, semanticToUse->str, semanticToUse->semantic,
-                      semanticToUse->name, semanticToUse->index, builtinAttr,
-                      typeId);
+    StageVar stageVar(
+        sigPoint, semanticToUse->str, semanticToUse->semantic,
+        semanticToUse->name, semanticToUse->index, builtinAttr, typeId,
+        // For HS/DS/GS, we have already stripped the outmost arrayness on type.
+        typeTranslator.getLocationCount(type));
     const auto name = namePrefix.str() + "." + stageVar.getSemanticStr();
     const uint32_t varId =
         createSpirvStageVar(&stageVar, decl, name, semanticToUse->loc);
@@ -1673,6 +1689,58 @@ void DeclResultIdMapper::decoratePSInterpolationMode(const NamedDecl *decl,
   }
 }
 
+uint32_t DeclResultIdMapper::getBuiltinVar(spv::BuiltIn builtIn) {
+  // Guarantee uniqueness
+  switch (builtIn) {
+  case spv::BuiltIn::SubgroupSize:
+    if (laneCountBuiltinId)
+      return laneCountBuiltinId;
+    break;
+  case spv::BuiltIn::SubgroupLocalInvocationId:
+    if (laneIndexBuiltinId)
+      return laneIndexBuiltinId;
+    break;
+  default:
+    // Only allow the two cases we know about
+    assert(false && "unsupported builtin case");
+    return 0;
+  }
+
+  // Both of them require the SPV_KHR_shader_ballot extension.
+  theBuilder.addExtension("SPV_KHR_shader_ballot");
+  theBuilder.requireCapability(spv::Capability::SubgroupBallotKHR);
+
+  uint32_t type = theBuilder.getUint32Type();
+
+  // Create a dummy StageVar for this builtin variable
+  const uint32_t varId =
+      theBuilder.addStageBuiltinVar(type, spv::StorageClass::Input, builtIn);
+
+  const hlsl::SigPoint *sigPoint =
+      hlsl::SigPoint::GetSigPoint(hlsl::SigPointFromInputQual(
+          hlsl::DxilParamInputQual::In, shaderModel.GetKind(),
+          /*isPatchConstant=*/false));
+
+  StageVar stageVar(sigPoint, /*semaStr=*/"", hlsl::Semantic::GetInvalid(),
+                    /*semaName=*/"", /*semaIndex=*/0, /*builtinAttr=*/nullptr,
+                    type, /*locCount=*/0);
+
+  stageVar.setIsSpirvBuiltin();
+  stageVar.setSpirvId(varId);
+  stageVars.push_back(stageVar);
+
+  switch (builtIn) {
+  case spv::BuiltIn::SubgroupSize:
+    laneCountBuiltinId = varId;
+    break;
+  case spv::BuiltIn::SubgroupLocalInvocationId:
+    laneIndexBuiltinId = varId;
+    break;
+  }
+
+  return varId;
+}
+
 uint32_t DeclResultIdMapper::createSpirvStageVar(StageVar *stageVar,
                                                  const NamedDecl *decl,
                                                  const llvm::StringRef name,

+ 24 - 8
tools/clang/lib/SPIRV/DeclResultIdMapper.h

@@ -38,11 +38,13 @@ public:
   inline StageVar(const hlsl::SigPoint *sig, llvm::StringRef semaStr,
                   const hlsl::Semantic *sema, llvm::StringRef semaName,
                   uint32_t semaIndex, const VKBuiltInAttr *builtin,
-                  uint32_t type)
+
+                  uint32_t type, uint32_t locCount)
       : sigPoint(sig), semanticStr(semaStr), semantic(sema),
         semanticName(semaName), semanticIndex(semaIndex), builtinAttr(builtin),
         typeId(type), valueId(0), isBuiltin(false),
-        storageClass(spv::StorageClass::Max), location(nullptr) {
+        storageClass(spv::StorageClass::Max), location(nullptr),
+        locationCount(locCount) {
     isBuiltin = builtinAttr != nullptr;
   }
 
@@ -68,6 +70,8 @@ public:
   const VKLocationAttr *getLocationAttr() const { return location; }
   void setLocationAttr(const VKLocationAttr *loc) { location = loc; }
 
+  uint32_t getLocationCount() const { return locationCount; }
+
 private:
   /// HLSL SigPoint. It uniquely identifies each set of parameters that may be
   /// input or output for each entry point.
@@ -92,6 +96,8 @@ private:
   spv::StorageClass storageClass;
   /// Location assignment if input/output variable.
   const VKLocationAttr *location;
+  /// How many locations this stage variable takes.
+  uint32_t locationCount;
 };
 
 class ResourceVar {
@@ -255,6 +261,9 @@ public:
                             ModuleBuilder &builder,
                             const EmitSPIRVOptions &spirvOptions);
 
+  /// \brief Returns the <result-id> for a SPIR-V builtin variable.
+  uint32_t getBuiltinVar(spv::BuiltIn builtIn);
+
   /// \brief Creates the stage output variables by parsing the semantics
   /// attached to the given function's parameter or return value and returns
   /// true on success. SPIR-V instructions will also be generated to update the
@@ -347,8 +356,8 @@ private:
     /// Default constructor to satisfy DenseMap
     DeclSpirvInfo() : info(0), indexInCTBuffer(-1) {}
 
-    DeclSpirvInfo(const SpirvEvalInfo &info_, int index = -1, bool row = false)
-        : info(info_), indexInCTBuffer(index), isRowMajor(row) {}
+    DeclSpirvInfo(const SpirvEvalInfo &info_, int index = -1)
+        : info(info_), indexInCTBuffer(index) {}
 
     /// Implicit conversion to SpirvEvalInfo.
     operator SpirvEvalInfo() const { return info; }
@@ -357,8 +366,6 @@ private:
     /// Value >= 0 means that this decl is a VarDecl inside a cbuffer/tbuffer
     /// and this is the index; value < 0 means this is just a standalone decl.
     int indexInCTBuffer;
-    /// Whether this decl should be row major.
-    bool isRowMajor;
   };
 
   /// \brief Returns the SPIR-V information for the given decl.
@@ -559,7 +566,7 @@ private:
   /// the children of this decl, and the children of this decl will be using
   /// the semantic in inheritSemantic, with index increasing sequentially.
   bool createStageVars(const hlsl::SigPoint *sigPoint, const NamedDecl *decl,
-                       bool asInput, QualType type, uint32_t arraySize,
+                       bool asInput, QualType asType, uint32_t arraySize,
                        const llvm::StringRef namePrefix,
                        llvm::Optional<uint32_t> invocationId, uint32_t *value,
                        bool noWriteBack, SemanticInfo *inheritSemantic);
@@ -648,6 +655,15 @@ private:
   /// to the <type-id>
   llvm::DenseMap<const DeclContext *, uint32_t> ctBufferPCTypeIds;
 
+  /// <result-id> for the SPIR-V builtin variables accessed by
+  /// WaveGetLaneCount() and WaveGetLaneIndex().
+  ///
+  /// These are the only two cases that SPIR-V builtin variables are accessed
+  /// using HLSL intrinsic function calls. All other builtin variables are
+  /// accessed using stage IO variables.
+  uint32_t laneCountBuiltinId;
+  uint32_t laneIndexBuiltinId;
+
   /// Whether the translated SPIR-V binary needs legalization.
   ///
   /// The following cases will require legalization:
@@ -718,7 +734,7 @@ DeclResultIdMapper::DeclResultIdMapper(const hlsl::ShaderModel &model,
     : shaderModel(model), theBuilder(builder), spirvOptions(options),
       astContext(context), diags(context.getDiagnostics()),
       typeTranslator(context, builder, diags, options), entryFunctionId(0),
-      needsLegalization(false),
+      laneCountBuiltinId(0), laneIndexBuiltinId(0), needsLegalization(false),
       glPerVertex(model, context, builder, typeTranslator, options.invertY) {}
 
 bool DeclResultIdMapper::decorateStageIOLocations() {

Dosya farkı çok büyük olduğundan ihmal edildi
+ 1031 - 1
tools/clang/lib/SPIRV/InstBuilderAuto.cpp


+ 21 - 3
tools/clang/lib/SPIRV/ModuleBuilder.cpp

@@ -705,6 +705,18 @@ void ModuleBuilder::createEndPrimitive() {
   insertPoint->appendInstruction(std::move(constructSite));
 }
 
+uint32_t ModuleBuilder::createSubgroupFirstInvocation(uint32_t resultType,
+                                                      uint32_t value) {
+  assert(insertPoint && "null insert point");
+  addExtension("SPV_KHR_shader_ballot");
+  requireCapability(spv::Capability::SubgroupBallotKHR);
+
+  uint32_t resultId = theContext.takeNextId();
+  instBuilder.opSubgroupFirstInvocationKHR(resultType, resultId, value).x();
+  insertPoint->appendInstruction(std::move(constructSite));
+  return resultId;
+}
+
 void ModuleBuilder::addExecutionMode(uint32_t entryPointId,
                                      spv::ExecutionMode em,
                                      llvm::ArrayRef<uint32_t> params) {
@@ -835,12 +847,17 @@ IMPL_GET_PRIMITIVE_TYPE(Float32)
 
 #undef IMPL_GET_PRIMITIVE_TYPE
 
+// Note: At the moment, Float16 capability should not be added for Vulkan 1.0.
+// It is not a required capability, and adding the SPV_AMD_gpu_half_float does
+// not enable this capability. Any driver that supports float16 in Vulkan 1.0
+// should accept this extension.
 #define IMPL_GET_PRIMITIVE_TYPE_WITH_CAPABILITY(ty, cap)                       \
                                                                                \
   uint32_t ModuleBuilder::get##ty##Type() {                                    \
-    requireCapability(spv::Capability::cap);                                   \
     if (spv::Capability::cap == spv::Capability::Float16)                      \
       theModule.addExtension("SPV_AMD_gpu_shader_half_float");                 \
+    else                                                                       \
+      requireCapability(spv::Capability::cap);                                 \
     const Type *type = Type::get##ty(theContext);                              \
     const uint32_t typeId = theContext.getResultIdForType(type);               \
     theModule.addType(type, typeId);                                           \
@@ -881,7 +898,8 @@ uint32_t ModuleBuilder::getVecType(uint32_t elemType, uint32_t elemCount) {
 }
 
 uint32_t ModuleBuilder::getMatType(QualType elemType, uint32_t colType,
-                                   uint32_t colCount) {
+                                   uint32_t colCount,
+                                   Type::DecorationSet decorations) {
   // NOTE: According to Item "Data rules" of SPIR-V Spec 2.16.1 "Universal
   // Validation Rules":
   //   Matrix types can only be parameterized with floating-point types.
@@ -889,7 +907,7 @@ uint32_t ModuleBuilder::getMatType(QualType elemType, uint32_t colType,
   // So we need special handling of non-fp matrices. We emulate non-fp
   // matrices as an array of vectors.
   if (!elemType->isFloatingType())
-    return getArrayType(colType, getConstantUint32(colCount));
+    return getArrayType(colType, getConstantUint32(colCount), decorations);
 
   const Type *type = Type::getMatrix(theContext, colType, colCount);
   const uint32_t typeId = theContext.getResultIdForType(type);

+ 203 - 55
tools/clang/lib/SPIRV/SPIRVEmitter.cpp

@@ -159,7 +159,12 @@ const Expr *isStructuredBufferLoad(const Expr *expr, const Expr **index) {
 /// Returns true if the given VarDecl will be translated into a SPIR-V variable
 /// not in the Private or Function storage class.
 inline bool isExternalVar(const VarDecl *var) {
-  return var->isExternallyVisible() && !var->isStaticDataMember();
+  // Class static variables should be put in the Private storage class.
+  // groupshared variables are allowed to be declared as "static". But we still
+  // need to put them in the Workgroup storage class. That is, when seeing
+  // "static groupshared", ignore "static".
+  return var->isExternallyVisible() ? !var->isStaticDataMember()
+                                    : var->getAttr<HLSLGroupSharedAttr>();
 }
 
 /// Returns the referenced variable's DeclContext if the given expr is
@@ -778,8 +783,8 @@ SpirvEvalInfo SPIRVEmitter::loadIfGLValue(const Expr *expr,
   if (const auto *declContext = isConstantTextureBufferDeclRef(expr)) {
     valType = declIdMapper.getCTBufferPushConstantTypeId(declContext);
   } else {
-    valType =
-        typeTranslator.translateType(expr->getType(), info.getLayoutRule());
+    valType = typeTranslator.translateType(
+        expr->getType(), info.getLayoutRule(), info.isRowMajor());
   }
   return info.setResultId(theBuilder.createLoad(valType, info)).setRValue();
 }
@@ -1055,6 +1060,14 @@ void SPIRVEmitter::doHLSLBufferDecl(const HLSLBufferDecl *bufferDecl) {
       for (const auto *annotation : varMember->getUnusualAnnotations())
         if (const auto *packing = dyn_cast<hlsl::ConstantPacking>(annotation))
           emitWarning("packoffset ignored since not supported", packing->Loc);
+
+      // We cannot handle external initialization of column-major matrices now.
+      if (typeTranslator.isOrContainsNonFpColMajorMatrix(varMember->getType(),
+                                                         varMember)) {
+        emitError("externally initialized non-floating-point column-major "
+                  "matrices not supported yet",
+                  varMember->getLocation());
+      }
     }
   }
   if (!validateVKAttributes(bufferDecl))
@@ -1084,6 +1097,14 @@ void SPIRVEmitter::doVarDecl(const VarDecl *decl) {
   if (!validateVKAttributes(decl))
     return;
 
+  // We cannot handle external initialization of column-major matrices now.
+  if (isExternalVar(decl) &&
+      typeTranslator.isOrContainsNonFpColMajorMatrix(decl->getType(), decl)) {
+    emitError("externally initialized non-floating-point column-major "
+              "matrices not supported yet",
+              decl->getLocation());
+  }
+
   if (decl->hasAttr<VKConstantIdAttr>()) {
     // This is a VarDecl for specialization constant.
     createSpecConstant(decl);
@@ -1724,7 +1745,6 @@ void SPIRVEmitter::doSwitchStmt(const SwitchStmt *switchStmt,
 
 SpirvEvalInfo
 SPIRVEmitter::doArraySubscriptExpr(const ArraySubscriptExpr *expr) {
-
   llvm::SmallVector<uint32_t, 4> indices;
   auto info = loadIfAliasVarRef(collectArrayStructIndices(expr, &indices));
 
@@ -1757,7 +1777,8 @@ SpirvEvalInfo SPIRVEmitter::doBinaryOperator(const BinaryOperator *expr) {
   }
 
   return processBinaryOp(expr->getLHS(), expr->getRHS(), opcode,
-                         expr->getType(), expr->getSourceRange());
+                         expr->getLHS()->getType(), expr->getType(),
+                         expr->getSourceRange());
 }
 
 SpirvEvalInfo SPIRVEmitter::doCallExpr(const CallExpr *callExpr) {
@@ -2154,13 +2175,22 @@ SpirvEvalInfo SPIRVEmitter::doCastExpr(const CastExpr *expr) {
       return SpirvEvalInfo(subExprId).setRValue().setConstant();
     }
 
-    // Try to evaluate 'literal float' as float rather than double.
+    TypeTranslator::LiteralTypeHint hint(typeTranslator);
+    // Try to evaluate float literals as float rather than double.
     if (const auto *floatLiteral = dyn_cast<FloatingLiteral>(subExpr)) {
       subExprId = tryToEvaluateAsFloat32(floatLiteral->getValue());
       if (subExprId)
         evalType = astContext.FloatTy;
     }
-    // Try to evaluate 'literal int' as 32-bit int rather than 64-bit int.
+    // Evaluate 'literal float' initializer type as float rather than double.
+    // TODO: This could result in rounding error if the initializer is a
+    // non-literal expression that requires larger than 32 bits and has the
+    // 'literal float' type.
+    else if (subExprType->isSpecificBuiltinType(BuiltinType::LitFloat)) {
+      evalType = astContext.FloatTy;
+      hint.setHint(astContext.FloatTy);
+    }
+    // Try to evaluate integer literals as 32-bit int rather than 64-bit int.
     else if (const auto *intLiteral = dyn_cast<IntegerLiteral>(subExpr)) {
       const bool isSigned = subExprType->isSignedIntegerType();
       subExprId = tryToEvaluateAsInt32(intLiteral->getValue(), isSigned);
@@ -2229,15 +2259,18 @@ uint32_t SPIRVEmitter::processFlatConversion(const QualType type,
         case BuiltinType::Bool:
           return castToBool(initId, initType, ty);
         // Target type is an integer variant.
-        // TODO: Add long and ulong.
         case BuiltinType::Int:
         case BuiltinType::Short:
         case BuiltinType::Min12Int:
         case BuiltinType::UShort:
         case BuiltinType::UInt:
+        case BuiltinType::Long:
+        case BuiltinType::LongLong:
+        case BuiltinType::ULong:
+        case BuiltinType::ULongLong:
           return castToInt(initId, initType, ty, srcLoc);
         // Target type is a float variant.
-        // TODO: Add double.
+        case BuiltinType::Double:
         case BuiltinType::Float:
         case BuiltinType::Half:
         case BuiltinType::Min10Float:
@@ -2340,8 +2373,9 @@ SPIRVEmitter::doCompoundAssignOperator(const CompoundAssignOperator *expr) {
   const auto *lhs = expr->getLHS();
 
   SpirvEvalInfo lhsPtr = 0;
-  const auto result = processBinaryOp(lhs, rhs, opcode, expr->getType(),
-                                      expr->getSourceRange(), &lhsPtr);
+  const auto result =
+      processBinaryOp(lhs, rhs, opcode, expr->getComputationLHSType(),
+                      expr->getType(), expr->getSourceRange(), &lhsPtr);
   return processAssignment(lhs, result, true, lhsPtr);
 }
 
@@ -4506,9 +4540,36 @@ SpirvEvalInfo SPIRVEmitter::processAssignment(const Expr *lhs,
 void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
                               const SpirvEvalInfo &rhsVal,
                               const QualType lhsValType) {
+
+  // Lambda for cases where we want to store per each array element.
+  const auto storeValueForEachArrayElement = [this, &lhsPtr,
+                                              &rhsVal](uint32_t arraySize,
+                                                       QualType arrayElemType) {
+    for (uint32_t i = 0; i < arraySize; ++i) {
+      const auto subRhsValType =
+          typeTranslator.translateType(arrayElemType, rhsVal.getLayoutRule());
+      const auto subRhsVal =
+          theBuilder.createCompositeExtract(subRhsValType, rhsVal, {i});
+      const auto subLhsPtrType = theBuilder.getPointerType(
+          typeTranslator.translateType(arrayElemType, lhsPtr.getLayoutRule()),
+          lhsPtr.getStorageClass());
+      const auto subLhsPtr = theBuilder.createAccessChain(
+          subLhsPtrType, lhsPtr, {theBuilder.getConstantUint32(i)});
+
+      storeValue(lhsPtr.substResultId(subLhsPtr),
+                 rhsVal.substResultId(subRhsVal), arrayElemType);
+    }
+  };
+
+  QualType matElemType = {};
+  uint32_t numRows = 0, numCols = 0;
+  const bool lhsIsMat =
+      typeTranslator.isMxNMatrix(lhsValType, &matElemType, &numRows, &numCols);
+  const bool lhsIsFloatMat = lhsIsMat && matElemType->isFloatingType();
+  const bool lhsIsNonFpMat = lhsIsMat && !matElemType->isFloatingType();
+
   if (typeTranslator.isScalarType(lhsValType) ||
-      typeTranslator.isVectorType(lhsValType) ||
-      typeTranslator.isMxNMatrix(lhsValType)) {
+      typeTranslator.isVectorType(lhsValType) || lhsIsFloatMat) {
     theBuilder.createStore(lhsPtr, rhsVal);
   } else if (TypeTranslator::isOpaqueType(lhsValType)) {
     // Resource types are represented using RecordType in the AST.
@@ -4545,16 +4606,24 @@ void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
     // Note: this check should happen after those setting needsLegalization.
     // TODO: is this optimization always correct?
     theBuilder.createStore(lhsPtr, rhsVal);
+  } else if (lhsIsNonFpMat) {
+    // Note: This check should happen before the RecordType check.
+    // Non-fp matrices are represented as arrays of vectors in SPIR-V.
+    // Each array element is a vector. Get the QualType for the vector.
+    const auto elemType = astContext.getExtVectorType(matElemType, numCols);
+    storeValueForEachArrayElement(numRows, elemType);
   } else if (const auto *recordType = lhsValType->getAs<RecordType>()) {
     uint32_t index = 0;
     for (const auto *field : recordType->getDecl()->fields()) {
+      bool isRowMajor =
+          typeTranslator.isRowMajorMatrix(field->getType(), field);
       const auto subRhsValType = typeTranslator.translateType(
-          field->getType(), rhsVal.getLayoutRule());
+          field->getType(), rhsVal.getLayoutRule(), isRowMajor);
       const auto subRhsVal =
           theBuilder.createCompositeExtract(subRhsValType, rhsVal, {index});
       const auto subLhsPtrType = theBuilder.getPointerType(
-          typeTranslator.translateType(field->getType(),
-                                       lhsPtr.getLayoutRule()),
+          typeTranslator.translateType(field->getType(), lhsPtr.getLayoutRule(),
+                                       isRowMajor),
           lhsPtr.getStorageClass());
       const auto subLhsPtr = theBuilder.createAccessChain(
           subLhsPtrType, lhsPtr, {theBuilder.getConstantUint32(index)});
@@ -4569,21 +4638,7 @@ void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
     // TODO: handle extra large array size?
     const auto size =
         static_cast<uint32_t>(arrayType->getSize().getZExtValue());
-
-    for (uint32_t i = 0; i < size; ++i) {
-      const auto subRhsValType =
-          typeTranslator.translateType(elemType, rhsVal.getLayoutRule());
-      const auto subRhsVal =
-          theBuilder.createCompositeExtract(subRhsValType, rhsVal, {i});
-      const auto subLhsPtrType = theBuilder.getPointerType(
-          typeTranslator.translateType(elemType, lhsPtr.getLayoutRule()),
-          lhsPtr.getStorageClass());
-      const auto subLhsPtr = theBuilder.createAccessChain(
-          subLhsPtrType, lhsPtr, {theBuilder.getConstantUint32(i)});
-
-      storeValue(lhsPtr.substResultId(subLhsPtr),
-                 rhsVal.substResultId(subRhsVal), elemType);
-    }
+    storeValueForEachArrayElement(size, elemType);
   } else {
     emitError("storing value of type %0 unimplemented", {}) << lhsValType;
   }
@@ -4591,22 +4646,24 @@ void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
 
 SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
                                             const BinaryOperatorKind opcode,
+                                            const QualType computationType,
                                             const QualType resultType,
                                             SourceRange sourceRange,
                                             SpirvEvalInfo *lhsInfo,
                                             const spv::Op mandateGenOpcode) {
-  const uint32_t resultTypeId = typeTranslator.translateType(resultType);
+  const QualType lhsType = lhs->getType();
+  const QualType rhsType = rhs->getType();
 
   // Binary logical operations (such as ==, !=, etc) that return a boolean type
   // may get a literal (e.g. 0, 1, etc.) as lhs or rhs args. Since only
   // non-zero-ness of these literals matter, they can be translated as 32-bits.
   TypeTranslator::LiteralTypeHint hint(typeTranslator);
   if (resultType->isBooleanType()) {
-    if (lhs->getType()->isSpecificBuiltinType(BuiltinType::LitInt) ||
-        rhs->getType()->isSpecificBuiltinType(BuiltinType::LitInt))
+    if (lhsType->isSpecificBuiltinType(BuiltinType::LitInt) ||
+        rhsType->isSpecificBuiltinType(BuiltinType::LitInt))
       hint.setHint(astContext.IntTy);
-    if (lhs->getType()->isSpecificBuiltinType(BuiltinType::LitFloat) ||
-        rhs->getType()->isSpecificBuiltinType(BuiltinType::LitFloat))
+    if (lhsType->isSpecificBuiltinType(BuiltinType::LitFloat) ||
+        rhsType->isSpecificBuiltinType(BuiltinType::LitFloat))
       hint.setHint(astContext.FloatTy);
   }
 
@@ -4614,7 +4671,7 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
   // onto each element vector iff the operands are not degenerated matrices
   // and we don't have a matrix specific SPIR-V instruction for the operation.
   if (!isSpirvMatrixOp(mandateGenOpcode) &&
-      TypeTranslator::isMxNMatrix(lhs->getType())) {
+      TypeTranslator::isMxNMatrix(lhsType)) {
     return processMatrixBinaryOp(lhs, rhs, opcode, sourceRange);
   }
 
@@ -4626,11 +4683,8 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
     return doExpr(rhs);
   }
 
-  const spv::Op spvOp = (mandateGenOpcode == spv::Op::Max)
-                            ? translateOp(opcode, lhs->getType())
-                            : mandateGenOpcode;
-
   SpirvEvalInfo rhsVal = 0, lhsPtr = 0, lhsVal = 0;
+
   if (BinaryOperator::isCompoundAssignmentOp(opcode)) {
     // Evalute rhs before lhs
     rhsVal = loadIfGLValue(rhs);
@@ -4640,6 +4694,12 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
     if (!lhsPtr.isRValue() && !isVectorShuffle(lhs)) {
       lhsVal = loadIfGLValue(lhs, lhsPtr);
     }
+    // For compound assignments, the AST does not insert the proper implicit
+    // cast when lhs and rhs have different types, so we need to manually cast
+    // lhs to the computation type.
+    if (computationType != lhsType)
+      lhsVal.setResultId(
+          castToType(lhsVal, lhsType, computationType, lhs->getExprLoc()));
   } else {
     // Evalute lhs before rhs
     lhsPtr = doExpr(lhs);
@@ -4650,6 +4710,10 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
   if (lhsInfo)
     *lhsInfo = lhsPtr;
 
+  const spv::Op spvOp = (mandateGenOpcode == spv::Op::Max)
+                            ? translateOp(opcode, computationType)
+                            : mandateGenOpcode;
+
   switch (opcode) {
   case BO_Add:
   case BO_Sub:
@@ -4679,19 +4743,32 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
   case BO_XorAssign:
   case BO_ShlAssign:
   case BO_ShrAssign: {
+
     // To evaluate this expression as an OpSpecConstantOp, we need to make sure
     // both operands are constant and at least one of them is a spec constant.
     if (lhsVal.isConstant() && rhsVal.isConstant() &&
         (lhsVal.isSpecConstant() || rhsVal.isSpecConstant()) &&
         isAcceptedSpecConstantBinaryOp(spvOp)) {
       const auto valId = theBuilder.createSpecConstantBinaryOp(
-          spvOp, resultTypeId, lhsVal, rhsVal);
+          spvOp, typeTranslator.translateType(resultType), lhsVal, rhsVal);
       return SpirvEvalInfo(valId).setRValue().setSpecConstant();
     }
 
     // Normal binary operation
-    const auto valId =
-        theBuilder.createBinaryOp(spvOp, resultTypeId, lhsVal, rhsVal);
+    uint32_t valId = 0;
+    if (BinaryOperator::isCompoundAssignmentOp(opcode)) {
+      valId = theBuilder.createBinaryOp(
+          spvOp, typeTranslator.translateType(computationType), lhsVal, rhsVal);
+      // For compound assignments, the AST does not insert the proper implicit
+      // cast when lhs and rhs have different types, so we need to manually
+      // cast the result back to lhs' type.
+      if (computationType != lhsType)
+        valId = castToType(valId, computationType, lhsType, lhs->getExprLoc());
+    } else {
+      valId = theBuilder.createBinaryOp(
+          spvOp, typeTranslator.translateType(resultType), lhsVal, rhsVal);
+    }
+
     auto result = SpirvEvalInfo(valId).setRValue();
     if (lhsVal.isRelaxedPrecision() || rhsVal.isRelaxedPrecision())
       result.setRelaxedPrecision();
@@ -4977,12 +5054,12 @@ SPIRVEmitter::tryToGenFloatVectorScale(const BinaryOperator *expr) {
         if (isa<CompoundAssignOperator>(expr)) {
           SpirvEvalInfo lhsPtr = 0;
           const auto result = processBinaryOp(
-              lhs, cast->getSubExpr(), expr->getOpcode(), vecType, range,
-              &lhsPtr, spv::Op::OpVectorTimesScalar);
+              lhs, cast->getSubExpr(), expr->getOpcode(), vecType, vecType,
+              range, &lhsPtr, spv::Op::OpVectorTimesScalar);
           return processAssignment(lhs, result, true, lhsPtr);
         } else {
           return processBinaryOp(lhs, cast->getSubExpr(), expr->getOpcode(),
-                                 vecType, range, nullptr,
+                                 vecType, vecType, range, nullptr,
                                  spv::Op::OpVectorTimesScalar);
         }
       }
@@ -4998,7 +5075,7 @@ SPIRVEmitter::tryToGenFloatVectorScale(const BinaryOperator *expr) {
         // OpVectorTimesScalar requires the first operand to be a vector and
         // the second to be a scalar.
         return processBinaryOp(rhs, cast->getSubExpr(), expr->getOpcode(),
-                               vecType, range, nullptr,
+                               vecType, vecType, range, nullptr,
                                spv::Op::OpVectorTimesScalar);
       }
     }
@@ -5043,11 +5120,11 @@ SPIRVEmitter::tryToGenFloatMatrixScale(const BinaryOperator *expr) {
           SpirvEvalInfo lhsPtr = 0;
           const auto result =
               processBinaryOp(lhs, cast->getSubExpr(), expr->getOpcode(),
-                              matType, range, &lhsPtr, opcode);
+                              matType, matType, range, &lhsPtr, opcode);
           return processAssignment(lhs, result, true, lhsPtr);
         } else {
           return processBinaryOp(lhs, cast->getSubExpr(), expr->getOpcode(),
-                                 matType, range, nullptr, opcode);
+                                 matType, matType, range, nullptr, opcode);
         }
       }
     }
@@ -5063,7 +5140,7 @@ SPIRVEmitter::tryToGenFloatMatrixScale(const BinaryOperator *expr) {
         // OpMatrixTimesScalar requires the first operand to be a matrix and
         // the second to be a scalar.
         return processBinaryOp(rhs, cast->getSubExpr(), expr->getOpcode(),
-                               matType, range, nullptr, opcode);
+                               matType, matType, range, nullptr, opcode);
       }
     }
   }
@@ -5557,7 +5634,7 @@ uint32_t SPIRVEmitter::castToBool(const uint32_t fromVal, QualType fromType,
   return theBuilder.createBinaryOp(spvOp, boolType, fromVal, zeroVal);
 }
 
-uint32_t SPIRVEmitter::castToInt(const uint32_t fromVal, QualType fromType,
+uint32_t SPIRVEmitter::castToInt(uint32_t fromVal, QualType fromType,
                                  QualType toIntType, SourceLocation srcLoc) {
   if (TypeTranslator::isSameScalarOrVecType(fromType, toIntType))
     return fromVal;
@@ -5571,11 +5648,18 @@ uint32_t SPIRVEmitter::castToInt(const uint32_t fromVal, QualType fromType,
   }
 
   if (isSintOrVecOfSintType(fromType) || isUintOrVecOfUintType(fromType)) {
-    // TODO: handle different bitwidths
+    // First convert the source to the bitwidth of the destination if necessary.
+    uint32_t convertedType = 0;
+    fromVal = convertBitwidth(fromVal, fromType, toIntType, &convertedType);
+    // If bitwidth conversion was the only thing we needed to do, we're done.
+    if (convertedType == typeTranslator.translateType(toIntType))
+      return fromVal;
     return theBuilder.createUnaryOp(spv::Op::OpBitcast, intType, fromVal);
   }
 
   if (isFloatOrVecOfFloatType(fromType)) {
+    // First convert the source to the bitwidth of the destination if necessary.
+    fromVal = convertBitwidth(fromVal, fromType, toIntType);
     if (isSintOrVecOfSintType(toIntType)) {
       return theBuilder.createUnaryOp(spv::Op::OpConvertFToS, intType, fromVal);
     } else if (isUintOrVecOfUintType(toIntType)) {
@@ -5619,7 +5703,41 @@ uint32_t SPIRVEmitter::castToInt(const uint32_t fromVal, QualType fromType,
   return 0;
 }
 
-uint32_t SPIRVEmitter::castToFloat(const uint32_t fromVal, QualType fromType,
+// Converts 'fromVal' (of scalar/vector type 'fromType') to the element
+// bitwidth of 'toType', keeping the scalar kind (float/sint/uint) of the
+// source. Returns 'fromVal' unchanged when no conversion is needed. If
+// 'resultType' is non-null, the SPIR-V <type-id> of the returned value is
+// written to it (only on the equal-bitwidth and converted paths; it is left
+// untouched on the literal-type early return below).
+uint32_t SPIRVEmitter::convertBitwidth(uint32_t fromVal, QualType fromType,
+                                       QualType toType, uint32_t *resultType) {
+  // At the moment, we will not make bitwidth conversions for literal int and
+  // literal float types because they always indicate 64-bit and do not
+  // represent what SPIR-V was actually resolved to.
+  // TODO: If the evaluated type is added to SpirvEvalInfo, change 'fromVal' to
+  // SpirvEvalInfo and use it to handle literal types more accurately.
+  if (fromType->isSpecificBuiltinType(BuiltinType::LitFloat) ||
+      fromType->isSpecificBuiltinType(BuiltinType::LitInt))
+    return fromVal;
+
+  // No-op when the source already has the destination's bitwidth.
+  const auto fromBitwidth = typeTranslator.getElementSpirvBitwidth(fromType);
+  const auto toBitwidth = typeTranslator.getElementSpirvBitwidth(toType);
+  if (fromBitwidth == toBitwidth) {
+    if (resultType)
+      *resultType = typeTranslator.translateType(fromType);
+    return fromVal;
+  }
+
+  // We want the 'fromType' with the 'toBitwidth'.
+  const uint32_t targetTypeId =
+      typeTranslator.getTypeWithCustomBitwidth(fromType, toBitwidth);
+  if (resultType)
+    *resultType = targetTypeId;
+
+  // Pick the SPIR-V width-conversion opcode by the scalar kind of the source:
+  // OpFConvert for floats, OpSConvert/OpUConvert for signed/unsigned ints.
+  if (isFloatOrVecOfFloatType(fromType))
+    return theBuilder.createUnaryOp(spv::Op::OpFConvert, targetTypeId, fromVal);
+  if (isSintOrVecOfSintType(fromType))
+    return theBuilder.createUnaryOp(spv::Op::OpSConvert, targetTypeId, fromVal);
+  if (isUintOrVecOfUintType(fromType))
+    return theBuilder.createUnaryOp(spv::Op::OpUConvert, targetTypeId, fromVal);
+  llvm_unreachable("invalid type passed to convertBitwidth");
+}
+
+uint32_t SPIRVEmitter::castToFloat(uint32_t fromVal, QualType fromType,
                                    QualType toFloatType,
                                    SourceLocation srcLoc) {
   if (TypeTranslator::isSameScalarOrVecType(fromType, toFloatType))
@@ -5634,15 +5752,20 @@ uint32_t SPIRVEmitter::castToFloat(const uint32_t fromVal, QualType fromType,
   }
 
   if (isSintOrVecOfSintType(fromType)) {
+    // First convert the source to the bitwidth of the destination if necessary.
+    fromVal = convertBitwidth(fromVal, fromType, toFloatType);
     return theBuilder.createUnaryOp(spv::Op::OpConvertSToF, floatType, fromVal);
   }
 
   if (isUintOrVecOfUintType(fromType)) {
+    // First convert the source to the bitwidth of the destination if necessary.
+    fromVal = convertBitwidth(fromVal, fromType, toFloatType);
     return theBuilder.createUnaryOp(spv::Op::OpConvertUToF, floatType, fromVal);
   }
 
   if (isFloatOrVecOfFloatType(fromType)) {
-    return theBuilder.createUnaryOp(spv::Op::OpFConvert, floatType, fromVal);
+    // This is the case of float to float conversion with different bitwidths.
+    return convertBitwidth(fromVal, fromType, toFloatType);
   }
 
   // Casting matrix types
@@ -5895,6 +6018,31 @@ SpirvEvalInfo SPIRVEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
   case hlsl::IntrinsicOp::IOP_f32tof16:
     retVal = processIntrinsicF32ToF16(callExpr);
     break;
+  case hlsl::IntrinsicOp::IOP_WaveGetLaneCount: {
+    const uint32_t retType =
+        typeTranslator.translateType(callExpr->getCallReturnType(astContext));
+    const uint32_t varId =
+        declIdMapper.getBuiltinVar(spv::BuiltIn::SubgroupSize);
+    retVal = theBuilder.createLoad(retType, varId);
+  } break;
+  case hlsl::IntrinsicOp::IOP_WaveGetLaneIndex: {
+    const uint32_t retType =
+        typeTranslator.translateType(callExpr->getCallReturnType(astContext));
+    const uint32_t varId =
+        declIdMapper.getBuiltinVar(spv::BuiltIn::SubgroupLocalInvocationId);
+    retVal = theBuilder.createLoad(retType, varId);
+  } break;
+  case hlsl::IntrinsicOp::IOP_WaveReadLaneFirst: {
+    const auto retType = callExpr->getCallReturnType(astContext);
+    if (!retType->isScalarType()) {
+      emitError("vector overloads of WaveReadLaneFirst unimplemented",
+                callExpr->getExprLoc());
+      return 0;
+    }
+    const uint32_t retTypeId = typeTranslator.translateType(retType);
+    retVal = theBuilder.createSubgroupFirstInvocation(
+        retTypeId, doExpr(callExpr->getArg(0)));
+  } break;
   case hlsl::IntrinsicOp::IOP_abort:
   case hlsl::IntrinsicOp::IOP_GetRenderTargetSampleCount:
   case hlsl::IntrinsicOp::IOP_GetRenderTargetSamplePosition: {

+ 21 - 5
tools/clang/lib/SPIRV/SPIRVEmitter.h

@@ -148,12 +148,20 @@ private:
                   QualType lhsValType);
 
   /// Generates the necessary instructions for conducting the given binary
-  /// operation on lhs and rhs. If lhsResultId is not nullptr, the evaluated
-  /// pointer from lhs during the process will be written into it. If
-  /// mandateGenOpcode is not spv::Op::Max, it will used as the SPIR-V opcode
-  /// instead of deducing from Clang frontend opcode.
+  /// operation on lhs and rhs.
+  ///
+  /// computationType is the type for LHS and RHS when doing computation, while
+  /// resultType is the type of the whole binary operation. They can be
+  /// different for compound assignments like <some-int-value> *=
+  /// <some-float-value>, where computationType is float and resultType is int.
+  ///
+  /// If lhsResultId is not nullptr, the evaluated pointer from lhs during the
+  /// process will be written into it. If mandateGenOpcode is not spv::Op::Max,
+  /// it will used as the SPIR-V opcode instead of deducing from Clang frontend
+  /// opcode.
   SpirvEvalInfo processBinaryOp(const Expr *lhs, const Expr *rhs,
-                                BinaryOperatorKind opcode, QualType resultType,
+                                BinaryOperatorKind opcode,
+                                QualType computationType, QualType resultType,
                                 SourceRange, SpirvEvalInfo *lhsInfo = nullptr,
                                 spv::Op mandateGenOpcode = spv::Op::Max);
 
@@ -283,6 +291,14 @@ private:
   bool validateVKAttributes(const NamedDecl *decl);
 
 private:
+  /// Converts the given value from the bitwidth of 'fromType' to the bitwidth
+  /// of 'toType'. If the two have the same bitwidth, returns the value itself.
+  /// If resultType is not nullptr, the resulting value's type will be written
+  /// to resultType. Panics if the given types are not scalar or vector of
+  /// float/integer type.
+  uint32_t convertBitwidth(uint32_t value, QualType fromType, QualType toType,
+                           uint32_t *resultType = nullptr);
+
   /// Processes the given expr, casts the result into the given bool (vector)
   /// type and returns the <result-id> of the casted value.
   uint32_t castToBool(uint32_t value, QualType fromType, QualType toType);

+ 10 - 1
tools/clang/lib/SPIRV/SpirvEvalInfo.h

@@ -100,6 +100,9 @@ public:
   inline SpirvEvalInfo &setRelaxedPrecision();
   bool isRelaxedPrecision() const { return isRelaxedPrecision_; }
 
+  inline SpirvEvalInfo &setRowMajor(bool);
+  bool isRowMajor() const { return isRowMajor_; }
+
 private:
   uint32_t resultId;
   /// Indicates whether this evaluation result contains alias variables
@@ -119,13 +122,14 @@ private:
   bool isConstant_;
   bool isSpecConstant_;
   bool isRelaxedPrecision_;
+  bool isRowMajor_;
 };
 
 SpirvEvalInfo::SpirvEvalInfo(uint32_t id)
     : resultId(id), containsAlias(false),
       storageClass(spv::StorageClass::Function), layoutRule(LayoutRule::Void),
       isRValue_(false), isConstant_(false), isSpecConstant_(false),
-      isRelaxedPrecision_(false) {}
+      isRelaxedPrecision_(false), isRowMajor_(false) {}
 
 SpirvEvalInfo &SpirvEvalInfo::setResultId(uint32_t id) {
   resultId = id;
@@ -174,6 +178,11 @@ SpirvEvalInfo &SpirvEvalInfo::setRelaxedPrecision() {
   return *this;
 }
 
+// Records whether the evaluated value is a row-major matrix (or contains
+// one); returns *this to allow call chaining, matching the other setters.
+SpirvEvalInfo &SpirvEvalInfo::setRowMajor(bool rm) {
+  isRowMajor_ = rm;
+  return *this;
+}
+
 } // end namespace spirv
 } // end namespace clang
 

+ 274 - 77
tools/clang/lib/SPIRV/TypeTranslator.cpp

@@ -203,11 +203,219 @@ void TypeTranslator::popIntendedLiteralType() {
   intendedLiteralTypes.pop();
 }
 
+// Returns the number of Vulkan interface locations the given stage input or
+// output type occupies, recursing through vectors, matrices, typedefs,
+// references, pointers, and constant arrays. Emits an error and returns 0 for
+// unsupported types.
+uint32_t TypeTranslator::getLocationCount(QualType type) {
+  // See Vulkan spec 14.1.4. Location Assignment for the complete set of rules.
+
+  const auto canonicalType = type.getCanonicalType();
+  if (canonicalType != type)
+    return getLocationCount(canonicalType);
+
+  // Inputs and outputs of the following types consume a single interface
+  // location:
+  // * 16-bit scalar and vector types, and
+  // * 32-bit scalar and vector types, and
+  // * 64-bit scalar and 2-component vector types.
+
+  // 64-bit three- and four- component vectors consume two consecutive
+  // locations.
+
+  // Primitive types
+  if (isScalarType(type))
+    return 1;
+
+  // Vector types
+  {
+    QualType elemType = {};
+    uint32_t elemCount = {};
+    if (isVectorType(type, &elemType, &elemCount)) {
+      const auto *builtinType = elemType->getAs<BuiltinType>();
+      // Only 64-bit vectors with 3 or 4 components need two locations; all
+      // other element kinds fall through the switch and consume one.
+      switch (builtinType->getKind()) {
+      case BuiltinType::Double:
+      case BuiltinType::LongLong:
+      case BuiltinType::ULongLong:
+        if (elemCount >= 3)
+          return 2;
+      }
+      return 1;
+    }
+  }
+
+  // If the declared input or output is an n * m 16- , 32- or 64- bit matrix,
+  // it will be assigned multiple locations starting with the location
+  // specified. The number of locations assigned for each matrix will be the
+  // same as for an n-element array of m-component vectors.
+
+  // Matrix types
+  {
+    QualType elemType = {};
+    uint32_t rowCount = 0, colCount = 0;
+    if (isMxNMatrix(type, &elemType, &rowCount, &colCount))
+      return getLocationCount(astContext.getExtVectorType(elemType, colCount)) *
+             rowCount;
+  }
+
+  // Typedefs
+  if (const auto *typedefType = type->getAs<TypedefType>())
+    return getLocationCount(typedefType->desugar());
+
+  // Reference types
+  if (const auto *refType = type->getAs<ReferenceType>())
+    return getLocationCount(refType->getPointeeType());
+
+  // Pointer types
+  if (const auto *ptrType = type->getAs<PointerType>())
+    return getLocationCount(ptrType->getPointeeType());
+
+  // If a declared input or output is an array of size n and each element takes
+  // m locations, it will be assigned m * n consecutive locations starting with
+  // the location specified.
+
+  // Array types
+  if (const auto *arrayType = astContext.getAsConstantArrayType(type))
+    return getLocationCount(arrayType->getElementType()) *
+           static_cast<uint32_t>(arrayType->getSize().getZExtValue());
+
+  // Struct type
+  // NOTE(review): 'structType' is only used for the type check; depending on
+  // build flags this may trigger an unused-variable warning — confirm.
+  if (const auto *structType = type->getAs<RecordType>()) {
+    assert(false && "all structs should already be flattened");
+    return 0;
+  }
+
+  emitError(
+      "calculating number of occupied locations for type %0 unimplemented")
+      << type;
+  return 0;
+}
+
+// Returns the SPIR-V <type-id> that has the same shape and scalar kind
+// (float / signed int / unsigned int, scalar or vector) as 'type' but with
+// the requested element 'bitwidth' (16, 32, or 64). Panics on bool or any
+// non-int/non-float type, and on unsupported bitwidths.
+uint32_t TypeTranslator::getTypeWithCustomBitwidth(QualType type,
+                                                   uint32_t bitwidth) {
+  // Cases where the given type is a vector of float/int: rebuild the vector
+  // around the element type converted to the requested bitwidth.
+  {
+    QualType elemType = {};
+    uint32_t elemCount = 0;
+    const bool isVec = isVectorType(type, &elemType, &elemCount);
+    if (isVec) {
+      return theBuilder.getVecType(
+          getTypeWithCustomBitwidth(elemType, bitwidth), elemCount);
+    }
+  }
+
+  // Scalar cases.
+  assert(!type->isBooleanType());
+  assert(type->isIntegerType() || type->isFloatingType());
+  if (type->isFloatingType()) {
+    switch (bitwidth) {
+    case 16:
+      return theBuilder.getFloat16Type();
+    case 32:
+      return theBuilder.getFloat32Type();
+    case 64:
+      return theBuilder.getFloat64Type();
+    }
+  }
+  if (type->isSignedIntegerType()) {
+    switch (bitwidth) {
+    case 16:
+      return theBuilder.getInt16Type();
+    case 32:
+      return theBuilder.getInt32Type();
+    case 64:
+      return theBuilder.getInt64Type();
+    }
+  }
+  if (type->isUnsignedIntegerType()) {
+    switch (bitwidth) {
+    case 16:
+      return theBuilder.getUint16Type();
+    case 32:
+      return theBuilder.getUint32Type();
+    case 64:
+      return theBuilder.getUint64Type();
+    }
+  }
+  llvm_unreachable(
+      "invalid type or bitwidth passed to getTypeWithCustomBitwidth");
+}
+
+// Returns the bitwidth (16, 32, or 64) that the element of the given scalar
+// or vector type resolves to in SPIR-V. Literal int/float types are resolved
+// via the intended-literal-type hint stack when a hint is present. Panics on
+// any other type.
+uint32_t TypeTranslator::getElementSpirvBitwidth(QualType type) {
+  const auto canonicalType = type.getCanonicalType();
+  if (canonicalType != type)
+    return getElementSpirvBitwidth(canonicalType);
+
+  // Vector types: the bitwidth is that of the element type.
+  {
+    QualType elemType = {};
+    if (isVectorType(type, &elemType))
+      return getElementSpirvBitwidth(elemType);
+  }
+
+  // Scalar types
+  // NOTE(review): 'isScalar' is only used in the assert; this may trigger an
+  // unused-variable warning in release builds — confirm.
+  QualType ty = {};
+  const bool isScalar = isScalarType(type, &ty);
+  assert(isScalar);
+  if (const auto *builtinType = ty->getAs<BuiltinType>()) {
+    switch (builtinType->getKind()) {
+    case BuiltinType::Int:
+    case BuiltinType::UInt:
+    case BuiltinType::Float:
+      return 32;
+    case BuiltinType::Double:
+    case BuiltinType::LongLong:
+    case BuiltinType::ULongLong:
+      return 64;
+    // min16int (short), ushort, min12int, half, and min10float are treated as
+    // 16-bit if '-enable-16bit-types' option is enabled. They are treated as
+    // 32-bit otherwise.
+    case BuiltinType::Short:
+    case BuiltinType::UShort:
+    case BuiltinType::Min12Int:
+    case BuiltinType::Half:
+    case BuiltinType::Min10Float: {
+      if (spirvOptions.enable16BitTypes)
+        return 16;
+      else
+        return 32;
+    }
+    case BuiltinType::LitFloat: {
+      // First try to see if there are any hints about how this literal type
+      // is going to be used. If so, use the hint.
+      if (getIntendedLiteralType(ty) != ty) {
+        return getElementSpirvBitwidth(getIntendedLiteralType(ty));
+      }
+
+      // No hint: fall back on the AST float semantics, clamped to 32/64.
+      const auto &semantics = astContext.getFloatTypeSemantics(type);
+      const auto bitwidth = llvm::APFloat::getSizeInBits(semantics);
+      if (bitwidth <= 32)
+        return 32;
+      else
+        return 64;
+    }
+    case BuiltinType::LitInt: {
+      // First try to see if there are any hints about how this literal type
+      // is going to be used. If so, use the hint.
+      if (getIntendedLiteralType(ty) != ty) {
+        return getElementSpirvBitwidth(getIntendedLiteralType(ty));
+      }
+
+      const auto bitwidth = astContext.getIntWidth(type);
+      // All integer variants with bitwidth larger than 32 are represented
+      // as 64-bit int in SPIR-V.
+      // All integer variants with bitwidth of 32 or less are represented as
+      // 32-bit int in SPIR-V.
+      return bitwidth > 32 ? 64 : 32;
+    }
+    }
+  }
+  llvm_unreachable("invalid type passed to getElementSpirvBitwidth");
+}
+
 uint32_t TypeTranslator::translateType(QualType type, LayoutRule rule,
                                        bool isRowMajor) {
   // We can only apply row_major to matrices or arrays of matrices.
+  // isRowMajor will be ignored for scalar and vector types.
   if (isRowMajor)
-    assert(isMxNMatrix(type) || type->isArrayType());
+    assert(type->isScalarType() || type->isArrayType() ||
+           hlsl::IsHLSLVecMatType(type));
 
   // Try to translate the canonical type first
   const auto canonicalType = type.getCanonicalType();
@@ -224,80 +432,30 @@ uint32_t TypeTranslator::translateType(QualType type, LayoutRule rule,
           return theBuilder.getVoidType();
         case BuiltinType::Bool:
           return theBuilder.getBoolType();
+        // All the ints
         case BuiltinType::Int:
-          return theBuilder.getInt32Type();
         case BuiltinType::UInt:
-          return theBuilder.getUint32Type();
-        case BuiltinType::Float:
-          return theBuilder.getFloat32Type();
-        case BuiltinType::Double:
-          return theBuilder.getFloat64Type();
+        case BuiltinType::Short:
+        case BuiltinType::Min12Int:
+        case BuiltinType::UShort:
         case BuiltinType::LongLong:
-          return theBuilder.getInt64Type();
         case BuiltinType::ULongLong:
-          return theBuilder.getUint64Type();
-        // min16int (short), and min12int are treated as 16-bit Int if
-        // '-enable-16bit-types' option is enabled. They are treated as 32-bit
-        // Int otherwise.
-        case BuiltinType::Short:
-        case BuiltinType::Min12Int: {
-          if (spirvOptions.enable16BitTypes)
-            return theBuilder.getInt16Type();
-          else
-            return theBuilder.getInt32Type();
-        }
-        // min16uint (ushort) is treated as 16-bit Uint if '-enable-16bit-types'
-        // option is enabled. It is treated as 32-bit Uint otherwise.
-        case BuiltinType::UShort: {
-          if (spirvOptions.enable16BitTypes)
-            return theBuilder.getUint16Type();
-          else
-            return theBuilder.getUint32Type();
-        }
-        // min16float (half), and min10float are all translated to
-        // 32-bit float in SPIR-V.
-        // min16float (half), and min10float are treated as 16-bit float if
-        // '-enable-16bit-types' option is enabled. They are treated as 32-bit
-        // float otherwise.
+        // All the floats
+        case BuiltinType::Float:
+        case BuiltinType::Double:
         case BuiltinType::Half:
         case BuiltinType::Min10Float: {
-          if (spirvOptions.enable16BitTypes)
-            return theBuilder.getFloat16Type();
-          else
-            return theBuilder.getFloat32Type();
+          const auto bitwidth = getElementSpirvBitwidth(ty);
+          return getTypeWithCustomBitwidth(ty, bitwidth);
         }
+        // Literal types. First try to resolve them using hints.
+        case BuiltinType::LitInt:
         case BuiltinType::LitFloat: {
-          // First try to see if there are any hints about how this literal type
-          // is going to be used. If so, use the hint.
-          if (getIntendedLiteralType(ty) != ty) {
-            return translateType(getIntendedLiteralType(ty));
-          }
-
-          const auto &semantics = astContext.getFloatTypeSemantics(type);
-          const auto bitwidth = llvm::APFloat::getSizeInBits(semantics);
-          if (bitwidth <= 32)
-            return theBuilder.getFloat32Type();
-          else
-            return theBuilder.getFloat64Type();
-        }
-        case BuiltinType::LitInt: {
-          // First try to see if there are any hints about how this literal type
-          // is going to be used. If so, use the hint.
           if (getIntendedLiteralType(ty) != ty) {
             return translateType(getIntendedLiteralType(ty));
           }
-
-          const auto bitwidth = astContext.getIntWidth(type);
-          // All integer variants with bitwidth larger than 32 are represented
-          // as 64-bit int in SPIR-V.
-          // All integer variants with bitwidth of 32 or less are represented as
-          // 32-bit int in SPIR-V.
-          if (type->isSignedIntegerType())
-            return bitwidth > 32 ? theBuilder.getInt64Type()
-                                 : theBuilder.getInt32Type();
-          else
-            return bitwidth > 32 ? theBuilder.getUint64Type()
-                                 : theBuilder.getUint32Type();
+          const auto bitwidth = getElementSpirvBitwidth(ty);
+          return getTypeWithCustomBitwidth(ty, bitwidth);
         }
         default:
           emitError("primitive type %0 unimplemented")
@@ -345,20 +503,22 @@ uint32_t TypeTranslator::translateType(QualType type, LayoutRule rule,
     QualType elemType = {};
     uint32_t rowCount = 0, colCount = 0;
     if (isMxNMatrix(type, &elemType, &rowCount, &colCount)) {
-
-      // We cannot handle external initialization of column-major matrices now.
-      if (!elemType->isFloatingType() && rule != LayoutRule::Void &&
-          !isRowMajor) {
-        emitError(
-            "externally initialized column-major matrices not supported yet");
-        return 0;
-      }
-
       // HLSL matrices are row major, while SPIR-V matrices are column major.
       // We are mapping what HLSL semantically mean a row into a column here.
       const uint32_t vecType =
           theBuilder.getVecType(translateType(elemType), colCount);
-      return theBuilder.getMatType(elemType, vecType, rowCount);
+
+      // If the matrix element type is not float, it is represented as an array
+      // of vectors, and should therefore have the ArrayStride decoration.
+      llvm::SmallVector<const Decoration *, 4> decorations;
+      if (!elemType->isFloatingType() && rule != LayoutRule::Void) {
+        uint32_t stride = 0;
+        (void)getAlignmentAndSize(type, rule, isRowMajor, &stride);
+        decorations.push_back(
+            Decoration::getArrayStride(*theBuilder.getSPIRVContext(), stride));
+      }
+
+      return theBuilder.getMatType(elemType, vecType, rowCount, decorations);
     }
   }
 
@@ -746,6 +906,35 @@ bool TypeTranslator::isMxNMatrix(QualType type, QualType *elemType,
   return true;
 }
 
+bool TypeTranslator::isOrContainsNonFpColMajorMatrix(QualType type,
+                                                     const Decl *decl) const {
+  const auto isColMajorDecl = [this](const Decl *decl) {
+    return decl->hasAttr<HLSLColumnMajorAttr>() ||
+           !decl->hasAttr<HLSLRowMajorAttr>() && !spirvOptions.defaultRowMajor;
+  };
+
+  QualType elemType = {};
+  if (isMxNMatrix(type, &elemType) && !elemType->isFloatingType()) {
+    return isColMajorDecl(decl);
+  }
+
+  if (const auto *arrayType = astContext.getAsConstantArrayType(type)) {
+    if (isMxNMatrix(arrayType->getElementType(), &elemType) &&
+        !elemType->isFloatingType())
+      return isColMajorDecl(decl);
+  }
+
+  if (const auto *structType = type->getAs<RecordType>()) {
+    const auto *decl = structType->getDecl();
+    for (const auto *field : decl->fields()) {
+      if (isOrContainsNonFpColMajorMatrix(field->getType(), field))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 bool TypeTranslator::isRowMajorMatrix(QualType type, const Decl *decl) const {
   if (!isMxNMatrix(type) && !type->isArrayType())
     return false;
@@ -907,7 +1096,12 @@ TypeTranslator::getLayoutDecorations(const DeclContext *decl, LayoutRule rule) {
       // MatrixStride on the field. So skip possible arrays here.
       fieldType = arrayType->getElementType();
     }
-    if (isMxNMatrix(fieldType)) {
+
+    // Non-floating point matrices are represented as arrays of vectors, and
+    // therefore ColMajor and RowMajor decorations should not be applied to
+    // them.
+    QualType elemType = {};
+    if (isMxNMatrix(fieldType, &elemType) && elemType->isFloatingType()) {
       memberAlignment = memberSize = stride = 0;
       std::tie(memberAlignment, memberSize) =
           getAlignmentAndSize(fieldType, rule, isRowMajor, &stride);
@@ -1172,8 +1366,7 @@ TypeTranslator::getAlignmentAndSize(QualType type, LayoutRule rule,
   //
   // 8. If the member is an array of S row-major matrices with C columns and R
   //    rows, the matrix is stored identically to a row of S X R row vectors
-  //    with C
-  //    components each, according to rule (4).
+  //    with C components each, according to rule (4).
   //
   // 9. If the member is a structure, the base alignment of the structure is N,
   //    where N is the largest base alignment value of any of its members, and
@@ -1207,6 +1400,10 @@ TypeTranslator::getAlignmentAndSize(QualType type, LayoutRule rule,
         case BuiltinType::UInt:
         case BuiltinType::Float:
           return {4, 4};
+        case BuiltinType::Double:
+        case BuiltinType::LongLong:
+        case BuiltinType::ULongLong:
+          return {8, 8};
         default:
           emitError("primitive type %0 unimplemented")
               << builtinType->getTypeClassName();

+ 24 - 2
tools/clang/lib/SPIRV/TypeTranslator.h

@@ -125,6 +125,20 @@ public:
   /// \brief Returns true if the given type is SubpassInputMS.
   static bool isSubpassInputMS(QualType);
 
+  /// \brief Evaluates the given type at the given bitwidth and returns the
+  /// result-id for it. Panics if the given type is not a scalar or vector of
+  /// float or integer type. For example: if QualType of an int4 and bitwidth of
+  /// 64 is passed in, the result-id of a SPIR-V vector of size 4 of signed
+  /// 64-bit integers is returned.
+  /// Acceptable bitwidths are 16, 32, and 64.
+  uint32_t getTypeWithCustomBitwidth(QualType type, uint32_t bitwidth);
+
+  /// \brief Returns the realized bitwidth of the given type when represented in
+  /// SPIR-V. Panics if the given type is not a scalar or vector of float or
+  /// integer. In case of vectors, it returns the realized SPIR-V bitwidth of
+  /// the vector elements.
+  uint32_t getElementSpirvBitwidth(QualType type);
+
   /// \brief Returns true if the given type will be translated into a SPIR-V
   /// scalar type. This includes normal scalar types, vectors of size 1, and
   /// 1x1 matrices. If scalarType is not nullptr, writes the scalar type to
@@ -164,10 +178,15 @@ public:
                           uint32_t *rowCount = nullptr,
                           uint32_t *colCount = nullptr);
 
-  /// \broef returns true if type is a matrix and matrix is row major
-  /// If decl is not nullptr, is is checked for attributes specifying majorness
+  /// \brief Returns true if type is a matrix and matrix is row major
+  /// If decl is not nullptr, it is checked for attributes specifying majorness.
   bool isRowMajorMatrix(QualType type, const Decl *decl = nullptr) const;
 
+  /// \brief Returns true if the decl type is a non-floating-point matrix and
+  /// the matrix is column major, or if it is an array/struct containing such
+  /// matrices.
+  bool isOrContainsNonFpColMajorMatrix(QualType type, const Decl *decl) const;
+
   /// \brief Returns true if the two types are the same scalar or vector type,
   /// regardless of constness and literalness.
   static bool isSameScalarOrVecType(QualType type1, QualType type2);
@@ -221,6 +240,9 @@ public:
   llvm::SmallVector<const Decoration *, 4>
   getLayoutDecorations(const DeclContext *decl, LayoutRule rule);
 
+  /// \brief Returns how many sequential locations are consumed by a given type.
+  uint32_t getLocationCount(QualType type);
+
 private:
   /// \brief Wrapper method to create an error message and report it
   /// in the diagnostic engine associated with this consumer.

+ 27 - 0
tools/clang/test/CodeGenHLSL/quick-test/constant_cast.hlsl

@@ -0,0 +1,27 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+
+
+// Make sure no store is generated.
+// CHECK-NOT:store {{.*}},
+
+struct ST
+{
+    float4 a;
+    float4 b;
+    float4 c;
+};
+
+
+cbuffer cbModelSkinningConstants : register ( b4 )
+{
+    float4 v[ 2 * 256 * 3 ];
+
+    static const float4 v2d[ 512 ] [ 3 ] = v ;
+    static const ST vst[ 512 ] = v;
+} ;
+
+
+float4 main(int i:I) : SV_Target {
+  return v2d[i][1] + vst[i].b;
+}

+ 28 - 0
tools/clang/test/CodeGenHLSL/quick-test/flat_addrspacecast.hlsl

@@ -0,0 +1,28 @@
+// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
+
+// Make sure we generate an addrspacecast.
+// CHECK: addrspacecast (float addrspace(3)*
+
+struct ST
+{
+	float3 a; // center
+	float3 b; // half extents
+
+        void func(float3 x, float3 y)
+	{
+		a = x + y;
+		b = x * y;
+	}
+};
+
+groupshared ST myST;
+StructuredBuffer<ST> buf0;
+float3 a;
+float3 b;
+RWBuffer<float3> buf1;
+[numthreads(8,8,1)]
+void main() {
+  myST = buf0[0];
+  myST.func(a, b);
+  buf1[0] = myST.b;
+}

+ 4 - 4
tools/clang/test/CodeGenSPIRV/bezier.domain.hlsl2spv

@@ -107,10 +107,10 @@ DS_OUTPUT BezierEvalDS( HS_CONSTANT_DATA_OUTPUT input,
 // OpDecorate %gl_TessCoord Patch
 // OpDecorate %in_var_BEZIERPOS Location 0
 // OpDecorate %in_var_TANGENT Location 1
-// OpDecorate %in_var_TANUCORNER Location 2
-// OpDecorate %in_var_TANVCORNER Location 3
-// OpDecorate %in_var_TANWEIGHTS Location 4
-// OpDecorate %in_var_TEXCOORD Location 5
+// OpDecorate %in_var_TANUCORNER Location 5
+// OpDecorate %in_var_TANVCORNER Location 9
+// OpDecorate %in_var_TANWEIGHTS Location 13
+// OpDecorate %in_var_TEXCOORD Location 14
 // OpDecorate %out_var_NORMAL Location 0
 // OpDecorate %out_var_TEXCOORD Location 1
 // OpDecorate %out_var_TANGENT Location 2

+ 4 - 4
tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv

@@ -129,10 +129,10 @@ BEZIER_CONTROL_POINT SubDToBezierHS(InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POIN
 // OpDecorate %in_var_TANGENT Location 2
 // OpDecorate %out_var_BEZIERPOS Location 0
 // OpDecorate %out_var_TANGENT Location 1
-// OpDecorate %out_var_TANUCORNER Location 2
-// OpDecorate %out_var_TANVCORNER Location 3
-// OpDecorate %out_var_TANWEIGHTS Location 4
-// OpDecorate %out_var_TEXCOORD Location 5
+// OpDecorate %out_var_TANUCORNER Location 5
+// OpDecorate %out_var_TANVCORNER Location 9
+// OpDecorate %out_var_TANWEIGHTS Location 13
+// OpDecorate %out_var_TEXCOORD Location 14
 // %void = OpTypeVoid
 // %3 = OpTypeFunction %void
 // %float = OpTypeFloat 32

+ 0 - 2
tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.hlsl → tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.form.hlsl

@@ -1,8 +1,6 @@
 // Run: %dxc -T vs_6_0 -E main
 
 void main() {
-// CHECK-LABEL: %bb_entry = OpLabel
-
     float4 a;
     float s;
 

+ 26 - 0
tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.type.hlsl

@@ -0,0 +1,26 @@
+// Run: %dxc -T vs_6_0 -E main
+
+void main() {
+    uint uVal;
+    bool bVal;
+
+    float fVal;
+    int iVal;
+
+    // No conversion of lhs
+// CHECK:      [[b_bool:%\d+]] = OpLoad %bool %bVal
+// CHECK-NEXT: [[b_uint:%\d+]] = OpSelect %uint [[b_bool]] %uint_1 %uint_0
+// CHECK-NEXT: [[u_uint:%\d+]] = OpLoad %uint %uVal
+// CHECK-NEXT:    [[add:%\d+]] = OpIAdd %uint [[u_uint]] [[b_uint]]
+// CHECK-NEXT:                   OpStore %uVal [[add]]
+    uVal += bVal;
+
+    // Convert lhs to the type of rhs, do computation, and then convert back
+// CHECK:        [[f_float:%\d+]] = OpLoad %float %fVal
+// CHECK-NEXT:     [[i_int:%\d+]] = OpLoad %int %iVal
+// CHECK-NEXT:   [[i_float:%\d+]] = OpConvertSToF %float [[i_int]]
+// CHECK-NEXT: [[mul_float:%\d+]] = OpFMul %float [[i_float]] [[f_float]]
+// CHECK-NEXT:   [[mul_int:%\d+]] = OpConvertFToS %int [[mul_float]]
+// CHECK-NEXT:                      OpStore %iVal [[mul_int]]
+    iVal *= fVal;
+}

+ 165 - 0
tools/clang/test/CodeGenSPIRV/cast.bitwidth.hlsl

@@ -0,0 +1,165 @@
+// Run: %dxc -T ps_6_2 -E main -enable-16bit-types
+
+void main() {
+
+  // 32-bit uint to various 64-bit types.
+  uint a;
+// CHECK:            [[a:%\d+]] = OpLoad %uint %a
+// CHECK-NEXT: [[a_ulong:%\d+]] = OpUConvert %ulong [[a]]
+// CHECK-NEXT:                    OpStore %b [[a_ulong]]
+  uint64_t b = a;
+// CHECK:            [[a:%\d+]] = OpLoad %uint %a
+// CHECK-NEXT: [[a_ulong:%\d+]] = OpUConvert %ulong [[a]]
+// CHECK-NEXT:[[a_double:%\d+]] = OpConvertUToF %double [[a_ulong]]
+// CHECK-NEXT:                    OpStore %c [[a_double]]
+  double   c = a;
+// CHECK:            [[a:%\d+]] = OpLoad %uint %a
+// CHECK-NEXT: [[a_ulong:%\d+]] = OpUConvert %ulong [[a]]
+// CHECK-NEXT:  [[a_long:%\d+]] = OpBitcast %long [[a_ulong]]
+// CHECK-NEXT:                    OpStore %d [[a_long]]
+  int64_t  d = a;
+
+
+  // 32-bit int to various 64-bit types.
+  int aa;
+// CHECK:            [[aa:%\d+]] = OpLoad %int %aa
+// CHECK-NEXT:  [[aa_long:%\d+]] = OpSConvert %long [[aa]]
+// CHECK-NEXT: [[aa_ulong:%\d+]] = OpBitcast %ulong [[aa_long]]
+// CHECK-NEXT:                     OpStore %bb [[aa_ulong]]
+  uint64_t bb = aa;
+// CHECK:             [[aa:%\d+]] = OpLoad %int %aa
+// CHECK-NEXT:   [[aa_long:%\d+]] = OpSConvert %long [[aa]]
+// CHECK-NEXT: [[aa_double:%\d+]] = OpConvertSToF %double [[aa_long]]
+// CHECK-NEXT:                      OpStore %cc [[aa_double]]
+  double   cc = aa;
+// CHECK:           [[aa:%\d+]] = OpLoad %int %aa
+// CHECK-NEXT: [[aa_long:%\d+]] = OpSConvert %long [[aa]]
+// CHECK-NEXT:                    OpStore %dd [[aa_long]]
+  int64_t  dd = aa;
+
+
+  // 32-bit float to various 64-bit types.
+  float aaa;
+// CHECK:             [[aaa:%\d+]] = OpLoad %float %aaa
+// CHECK-NEXT: [[aaa_double:%\d+]] = OpFConvert %double [[aaa]]
+// CHECK-NEXT:  [[aaa_ulong:%\d+]] = OpConvertFToU %ulong [[aaa_double]]
+// CHECK-NEXT:                       OpStore %bbb [[aaa_ulong]]
+  uint64_t bbb = aaa;
+// CHECK:             [[aaa:%\d+]] = OpLoad %float %aaa
+// CHECK-NEXT: [[aaa_double:%\d+]] = OpFConvert %double [[aaa]]
+// CHECK-NEXT:                       OpStore %ccc [[aaa_double]]
+  double   ccc = aaa;
+// CHECK:             [[aaa:%\d+]] = OpLoad %float %aaa
+// CHECK-NEXT: [[aaa_double:%\d+]] = OpFConvert %double [[aaa]]
+// CHECK-NEXT:   [[aaa_long:%\d+]] = OpConvertFToS %long [[aaa_double]]
+// CHECK-NEXT:                       OpStore %ddd [[aaa_long]]
+  int64_t  ddd = aaa;
+
+
+  // 64-bit uint to various 32-bit types.
+  uint64_t e;
+// CHECK:      [[e64:%\d+]] = OpLoad %ulong %e
+// CHECK-NEXT: [[e32:%\d+]] = OpUConvert %uint [[e64]]
+// CHECK-NEXT:                OpStore %f [[e32]]
+  uint  f = e;
+// CHECK:          [[e64:%\d+]] = OpLoad %ulong %e
+// CHECK-NEXT:     [[e32:%\d+]] = OpUConvert %uint [[e64]]
+// CHECK-NEXT: [[e_float:%\d+]] = OpConvertUToF %float [[e32]]
+// CHECK-NEXT:                    OpStore %g [[e_float]]
+  float g = e;
+// CHECK:        [[e64:%\d+]] = OpLoad %ulong %e
+// CHECK-NEXT:   [[e32:%\d+]] = OpUConvert %uint [[e64]]
+// CHECK-NEXT: [[e_int:%\d+]] = OpBitcast %int [[e32]]
+// CHECK-NEXT:                  OpStore %h [[e_int]]
+  int   h = e;
+
+
+  // 64-bit int to various 32-bit types.
+  int64_t ee;
+// CHECK:           [[e:%\d+]] = OpLoad %long %ee
+// CHECK-NEXT:  [[e_int:%\d+]] = OpSConvert %int [[e]]
+// CHECK-NEXT: [[e_uint:%\d+]] = OpBitcast %uint [[e_int]]
+// CHECK-NEXT:                   OpStore %ff [[e_uint]]
+  uint  ff = ee;
+// CHECK:            [[e:%\d+]] = OpLoad %long %ee
+// CHECK-NEXT:   [[e_int:%\d+]] = OpSConvert %int [[e]]
+// CHECK-NEXT: [[e_float:%\d+]] = OpConvertSToF %float [[e_int]]
+// CHECK-NEXT:                    OpStore %gg [[e_float]]
+  float gg = ee;
+// CHECK:          [[e:%\d+]] = OpLoad %long %ee
+// CHECK-NEXT: [[e_int:%\d+]] = OpSConvert %int [[e]]
+// CHECK-NEXT:                  OpStore %hh [[e_int]]
+  int   hh = ee;
+
+
+  // 64-bit float to various 32-bit types.
+  double eee;
+// CHECK:         [[e64:%\d+]] = OpLoad %double %eee
+// CHECK-NEXT:    [[e32:%\d+]] = OpFConvert %float [[e64]]
+// CHECK-NEXT: [[e_uint:%\d+]] = OpConvertFToU %uint [[e32]]
+// CHECK-NEXT:                   OpStore %fff [[e_uint]]
+  uint  fff = eee;
+// CHECK:              [[e:%\d+]] = OpLoad %double %eee
+// CHECK-NEXT:   [[e_float:%\d+]] = OpFConvert %float [[e]]
+// CHECK-NEXT:                      OpStore %ggg [[e_float]]
+  float ggg = eee;
+// CHECK:            [[e:%\d+]] = OpLoad %double %eee
+// CHECK-NEXT: [[e_float:%\d+]] = OpFConvert %float [[e]]
+// CHECK-NEXT:   [[e_int:%\d+]] = OpConvertFToS %int [[e_float]]
+// CHECK-NEXT:                    OpStore %hhh [[e_int]]
+  int   hhh = eee;
+
+
+  // Vector case: 64-bit float to various 32-bit types.
+  double2 i;
+// CHECK:      [[i_double:%\d+]] = OpLoad %v2double %i
+// CHECK-NEXT:  [[i_float:%\d+]] = OpFConvert %v2float [[i_double]]
+// CHECK-NEXT:   [[i_uint:%\d+]] = OpConvertFToU %v2uint [[i_float]]
+// CHECK-NEXT:                     OpStore %j [[i_uint]]
+  uint2   j = i;
+// CHECK:      [[i_double:%\d+]] = OpLoad %v2double %i
+// CHECK-NEXT:  [[i_float:%\d+]] = OpFConvert %v2float [[i_double]]
+// CHECK-NEXT:    [[i_int:%\d+]] = OpConvertFToS %v2int [[i_float]]
+// CHECK-NEXT:                     OpStore %k [[i_int]]
+  int2    k = i;
+// CHECK:      [[i_double:%\d+]] = OpLoad %v2double %i
+// CHECK-NEXT:  [[i_float:%\d+]] = OpFConvert %v2float [[i_double]]
+// CHECK-NEXT:                     OpStore %l [[i_float]]
+  float2  l = i;
+
+
+  // 16-bit uint to various 32-bit types.
+  uint16_t m;
+// CHECK:      [[m_ushort:%\d+]] = OpLoad %ushort %m
+// CHECK-NEXT:   [[m_uint:%\d+]] = OpUConvert %uint [[m_ushort]]
+// CHECK-NEXT:                     OpStore %n [[m_uint]]
+  uint  n = m;
+// CHECK:      [[m_ushort:%\d+]] = OpLoad %ushort %m
+// CHECK-NEXT:   [[m_uint:%\d+]] = OpUConvert %uint [[m_ushort]]
+// CHECK-NEXT:  [[m_float:%\d+]] = OpConvertUToF %float [[m_uint]]
+// CHECK-NEXT:                     OpStore %o [[m_float]]
+  float o = m;
+// CHECK:      [[m_ushort:%\d+]] = OpLoad %ushort %m
+// CHECK-NEXT:   [[m_uint:%\d+]] = OpUConvert %uint [[m_ushort]]
+// CHECK-NEXT:    [[m_int:%\d+]] = OpBitcast %int [[m_uint]]
+// CHECK-NEXT:                     OpStore %p [[m_int]]
+  int   p = m;
+
+
+  // 16-bit int to various 32-bit types.
+  int16_t mm;
+// CHECK:      [[mm_short:%\d+]] = OpLoad %short %mm
+// CHECK-NEXT:   [[mm_int:%\d+]] = OpSConvert %int [[mm_short]]
+// CHECK-NEXT:  [[mm_uint:%\d+]] = OpBitcast %uint [[mm_int]]
+// CHECK-NEXT:                     OpStore %nn [[mm_uint]]
+  uint  nn = mm;
+// CHECK:      [[mm_short:%\d+]] = OpLoad %short %mm
+// CHECK-NEXT:   [[mm_int:%\d+]] = OpSConvert %int [[mm_short]]
+// CHECK-NEXT: [[mm_float:%\d+]] = OpConvertSToF %float [[mm_int]]
+// CHECK-NEXT:                     OpStore %oo [[mm_float]]
+  float oo = mm;
+// CHECK:      [[mm_short:%\d+]] = OpLoad %short %mm
+// CHECK-NEXT:   [[mm_int:%\d+]] = OpSConvert %int [[mm_short]]
+// CHECK-NEXT:                     OpStore %pp [[mm_int]]
+  int   pp = mm;
+}

+ 50 - 0
tools/clang/test/CodeGenSPIRV/cast.flat-conversion.literal-initializer.hlsl

@@ -0,0 +1,50 @@
+// Run: %dxc -T ps_6_0 -E main
+
+struct S {
+  float2   a;
+  float    b;
+  double2  c;
+  double   d;
+  int64_t  e;
+  uint64_t f;
+};
+
+void main() {
+
+// CHECK:              [[inf:%\d+]] = OpFDiv %float %float_1 %float_0
+// CHECK-NEXT:        [[inf2:%\d+]] = OpCompositeConstruct %v2float [[inf]] [[inf]]
+// CHECK-NEXT:  [[inf_double:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT: [[inf2_double:%\d+]] = OpCompositeConstruct %v2double [[inf_double]] [[inf_double]] 
+// CHECK-NEXT:  [[inf_double:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT: [[inf_double_:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT:   [[inf_int64:%\d+]] = OpConvertFToS %long [[inf_double_]]
+// CHECK-NEXT: [[inf_double_:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT:  [[inf_uint64:%\d+]] = OpConvertFToU %ulong [[inf_double_]]
+// CHECK-NEXT:             {{%\d+}} = OpCompositeConstruct %S [[inf2]] [[inf]] [[inf2_double]] [[inf_double]] [[inf_int64]] [[inf_uint64]]
+  S s3 = (S)(1.0 / 0.0);
+
+// CHECK:              [[b:%\d+]] = OpLoad %float %b
+// CHECK-NEXT:  [[b2_float:%\d+]] = OpCompositeConstruct %v2float [[b]] [[b]]
+// CHECK-NEXT:  [[b_double:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT: [[b2_double:%\d+]] = OpCompositeConstruct %v2double [[b_double]] [[b_double]]
+// CHECK-NEXT:  [[b_double:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT: [[b_double_:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT:   [[b_int64:%\d+]] = OpConvertFToS %long [[b_double_]]
+// CHECK-NEXT: [[b_double_:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT:  [[b_uint64:%\d+]] = OpConvertFToU %ulong [[b_double_]]
+// CHECK-NEXT:           {{%\d+}} = OpCompositeConstruct %S [[b2_float]] [[b]] [[b2_double]] [[b_double]] [[b_int64]] [[b_uint64]]
+  float b;
+  S s2 = (S)(b);
+
+
+// CHECK:              [[a:%\d+]] = OpLoad %double %a
+// CHECK-NEXT:   [[a_float:%\d+]] = OpFConvert %float [[a]]
+// CHECK-NEXT:  [[a2_float:%\d+]] = OpCompositeConstruct %v2float [[a_float]] [[a_float]]
+// CHECK-NEXT:   [[a_float:%\d+]] = OpFConvert %float [[a]]
+// CHECK-NEXT: [[a2_double:%\d+]] = OpCompositeConstruct %v2double [[a]] [[a]]
+// CHECK-NEXT:   [[a_int64:%\d+]] = OpConvertFToS %long [[a]]
+// CHECK-NEXT:  [[a_uint64:%\d+]] = OpConvertFToU %ulong [[a]]
+// CHECK-NEXT:           {{%\d+}} = OpCompositeConstruct %S [[a2_float]] [[a_float]] [[a2_double]] [[a]] [[a_int64]] [[a_uint64]]
+  double a;
+  S s1 = (S)(a);
+}

+ 1 - 1
tools/clang/test/CodeGenSPIRV/constant.scalar.16bit.enabled.hlsl

@@ -15,8 +15,8 @@
 // min16uint:  uint16_t(warning)
 // uint16_t:   uint16_t
 
-// CHECK: OpCapability Float16
 // CHECK: OpCapability Int16
+// CHECK: OpExtension "SPV_AMD_gpu_shader_half_float"
 
 // CHECK-NOT: OpDecorate %c_half RelaxedPrecision
 // CHECK-NOT: OpDecorate %c_min10float RelaxedPrecision

+ 7 - 1
tools/clang/test/CodeGenSPIRV/cs.groupshared.hlsl

@@ -5,10 +5,16 @@ struct S {
     float3 f2;
 };
 
+// CHECK-NOT: OpDecorate %a DescriptorSet
+// CHECK-NOT: OpDecorate %b DescriptorSet
+// CHECK-NOT: OpDecorate %c DescriptorSet
+// CHECK-NOT: OpDecorate %d DescriptorSet
+// CHECK-NOT: OpDecorate %s DescriptorSet
+
 // CHECK: %a = OpVariable %_ptr_Workgroup_float Workgroup
 groupshared              float    a;
 // CHECK: %b = OpVariable %_ptr_Workgroup_v3float Workgroup
-groupshared              float3   b;
+static groupshared       float3   b;  // Ignore static modifier
 // CHECK: %c = OpVariable %_ptr_Workgroup_mat2v3float Workgroup
 groupshared column_major float2x3 c;
 // CHECK: %d = OpVariable %_ptr_Workgroup__arr_v2float_uint_5 Workgroup

+ 34 - 0
tools/clang/test/CodeGenSPIRV/op.cbuffer.access.majorness.hlsl

@@ -0,0 +1,34 @@
+// Run: %dxc -T cs_6_0 -E main -Zpr
+
+// CHECK: %SData = OpTypeStruct %_arr_mat3v4float_uint_2 %_arr_mat3v4float_uint_2_0
+struct SData {
+                float3x4 mat1[2];
+   column_major float3x4 mat2[2];
+};
+
+// CHECK: %type_SBufferData = OpTypeStruct %SData %_arr_mat3v4float_uint_2 %_arr_mat3v4float_uint_2_0
+cbuffer SBufferData {
+                SData    BufferData;
+                float3x4 Mat1[2];
+   column_major float3x4 Mat2[2];
+};
+
+// CHECK: [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_SData %SBufferData %int_0
+// CHECK: [[val:%\d+]] = OpLoad %SData [[ptr]]
+// CHECK:     {{%\d+}} = OpCompositeExtract %_arr_mat3v4float_uint_2 %32 0
+// CHECK:     {{%\d+}} = OpCompositeExtract %_arr_mat3v4float_uint_2_0 %32 1
+static const SData Data = BufferData;
+
+RWStructuredBuffer<float4> Out;
+
+[numthreads(4, 4, 4)]
+void main() {
+// CHECK: [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform__arr_mat3v4float_uint_2 %SBufferData %int_1
+// CHECK:     {{%\d+}} = OpLoad %_arr_mat3v4float_uint_2 [[ptr]]
+  float3x4 a[2] = Mat1;
+// CHECK: [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform__arr_mat3v4float_uint_2_0 %SBufferData %int_2
+// CHECK:     {{%\d+}} = OpLoad %_arr_mat3v4float_uint_2_0 [[ptr]]
+  float3x4 b[2] = Mat2;
+
+  Out[0] = Data.mat1[0][0];
+}

+ 19 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-count.hlsl

@@ -0,0 +1,19 @@
+// Run: %dxc -T cs_6_0 -E main
+
+RWStructuredBuffer<uint> values;
+
+// CHECK: OpCapability SubgroupBallotKHR
+// CHECK: OpExtension "SPV_KHR_shader_ballot"
+
+// CHECK: OpEntryPoint GLCompute
+// CHECK-SAME: %SubgroupSize
+
+// CHECK: OpDecorate %SubgroupSize BuiltIn SubgroupSize
+
+// CHECK: %SubgroupSize = OpVariable %_ptr_Input_uint Input
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+// CHECK: OpLoad %uint %SubgroupSize
+    values[id.x] = WaveGetLaneCount();
+}

+ 19 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-index.hlsl

@@ -0,0 +1,19 @@
+// Run: %dxc -T cs_6_0 -E main
+
+RWStructuredBuffer<uint> values;
+
+// CHECK: OpCapability SubgroupBallotKHR
+// CHECK: OpExtension "SPV_KHR_shader_ballot"
+
+// CHECK: OpEntryPoint GLCompute
+// CHECK-SAME: %SubgroupLocalInvocationId
+
+// CHECK: OpDecorate %SubgroupLocalInvocationId BuiltIn SubgroupLocalInvocationId
+
+// CHECK: %SubgroupLocalInvocationId = OpVariable %_ptr_Input_uint Input
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+// CHECK: OpLoad %uint %SubgroupLocalInvocationId
+    values[id.x] = WaveGetLaneIndex();
+}

+ 30 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave-read-lane-first.hlsl

@@ -0,0 +1,30 @@
+// Run: %dxc -T cs_6_0 -E main
+
+// CHECK: OpCapability SubgroupBallotKHR
+// CHECK: OpExtension "SPV_KHR_shader_ballot"
+
+struct S {
+    uint4 val1;
+     int2 val2;
+    float val3;
+};
+
+RWStructuredBuffer<S> values;
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+    uint x = id.x;
+
+    uint4 val1 = values[x].val1;
+     int2 val2 = values[x].val2;
+    float val3 = values[x].val3;
+
+// OpSubgroupFirstInvocationKHR requires that:
+//   Result Type must be a 32-bit integer type or a 32-bit float type scalar.
+
+    // values[x].val1 = WaveReadLaneFirst(val1);
+    // values[x].val2 = WaveReadLaneFirst(val2);
+// CHECK:      [[val3:%\d+]] = OpLoad %float %val3
+// CHECK-NEXT:      {{%\d+}} = OpSubgroupFirstInvocationKHR %float [[val3]]
+    values[x].val3 = WaveReadLaneFirst(val3);
+}

+ 27 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave.builtin.no-dup.hlsl

@@ -0,0 +1,27 @@
+// Run: %dxc -T cs_6_0 -E main
+
+// Some wave ops translate into SPIR-V builtin variables.
+// Test that we are not generating duplicated builtins for multiple calls
+// of the same wave ops.
+RWStructuredBuffer<uint> values;
+
+// CHECK: OpEntryPoint GLCompute
+// CHECK-SAME: %SubgroupSize %SubgroupLocalInvocationId
+
+// CHECK: OpDecorate %SubgroupSize BuiltIn SubgroupSize
+// CHECK-NOT: OpDecorate {{%\w+}} BuiltIn SubgroupSize
+
+// CHECK: OpDecorate %SubgroupLocalInvocationId BuiltIn SubgroupLocalInvocationId
+// CHECK-NOT: OpDecorate {{%\w+}} BuiltIn SubgroupLocalInvocationId
+
+// CHECK: %SubgroupSize = OpVariable %_ptr_Input_uint Input
+// CHECK-NEXT: %SubgroupLocalInvocationId = OpVariable %_ptr_Input_uint Input
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+// CHECK: OpLoad %uint %SubgroupSize
+// CHECK: OpLoad %uint %SubgroupSize
+// CHECK: OpLoad %uint %SubgroupLocalInvocationId
+// CHECK: OpLoad %uint %SubgroupLocalInvocationId
+    values[id.x] = WaveGetLaneCount() + WaveGetLaneCount() + WaveGetLaneIndex() + WaveGetLaneIndex();
+}

+ 1 - 1
tools/clang/test/CodeGenSPIRV/spirv.interface.hs.hlsl

@@ -90,7 +90,7 @@ struct HsPcfOut
 // CHECK: OpDecorate %out_var_BAR Location 0
 // CHECK: OpDecorate %out_var_FOO Location 1
 // CHECK: OpDecorate %out_var_TEXCOORD Location 2
-// CHECK: OpDecorate %out_var_WEIGHT Location 3
+// CHECK: OpDecorate %out_var_WEIGHT Location 6
 
 // Input : clip0 + clip2         : 3 floats
 // Input : cull3 + cull5         : 4 floats

+ 36 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std140.hlsl

@@ -0,0 +1,36 @@
+// Run: %dxc -T vs_6_0 -E main
+
+// CHECK: OpDecorate %_arr_double_uint_3 ArrayStride 16
+// CHECK: OpDecorate %_arr_mat2v3double_uint_2 ArrayStride 64
+// CHECK: OpDecorate %_arr_v2long_uint_1 ArrayStride 16
+
+// CHECK: OpMemberDecorate %type_MyCBuffer 0 Offset 0
+// CHECK: OpMemberDecorate %type_MyCBuffer 1 Offset 8
+// CHECK: OpMemberDecorate %type_MyCBuffer 2 Offset 16
+// CHECK: OpMemberDecorate %type_MyCBuffer 3 Offset 64
+// CHECK: OpMemberDecorate %type_MyCBuffer 4 Offset 96
+// CHECK: OpMemberDecorate %type_MyCBuffer 5 Offset 128
+// CHECK: OpMemberDecorate %type_MyCBuffer 5 MatrixStride 32
+// CHECK: OpMemberDecorate %type_MyCBuffer 5 RowMajor
+// CHECK: OpMemberDecorate %type_MyCBuffer 6 Offset 192
+// CHECK: OpMemberDecorate %type_MyCBuffer 7 Offset 208
+// CHECK: OpMemberDecorate %type_MyCBuffer 8 Offset 224
+// CHECK: OpMemberDecorate %type_MyCBuffer 8 MatrixStride 32
+// CHECK: OpMemberDecorate %type_MyCBuffer 8 ColMajor
+// CHECK: OpMemberDecorate %type_MyCBuffer 9 Offset 352
+
+
+cbuffer MyCBuffer{               // Alignment | Offset + Size                 = Next
+              float     f1;      // 0         | 0        4                      4
+              uint64_t  f2;      // 8         | 8        8                      16
+              double    f3[3];   // 16        | 16       16 (stride) * 3        64
+              float     f4;      // 4         | 64       4                      68
+              int64_t3  f5;      // 32        | 96       8 * 3                  120
+              double3x2 f6;      // 32        | 128      32 * 2                 192    // SPIR-V RowMajor
+              double2x1 f7;      // 16        | 192      16                     208
+              float     f8;      // 4         | 208      4                      212
+    row_major double2x3 f9[2];   // 32        | 224      32 * 4                 352    // SPIR-V ColMajor
+              int64_t2  f10[1];  // 16        | 352      16 (stride)            368
+};                               // 32 (max)                                    384
+
+void main() { }

+ 39 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std430.hlsl

@@ -0,0 +1,39 @@
+// Run: %dxc -T vs_6_0 -E main
+
+// CHECK: OpDecorate %_arr_double_uint_3 ArrayStride 8
+// CHECK: OpDecorate %_arr_mat2v3double_uint_2 ArrayStride 64
+// CHECK: OpDecorate %_arr_v2long_uint_1 ArrayStride 16
+
+// CHECK: OpMemberDecorate %S 0 Offset 0
+// CHECK: OpMemberDecorate %S 1 Offset 8
+// CHECK: OpMemberDecorate %S 2 Offset 16
+// CHECK: OpMemberDecorate %S 3 Offset 40
+// CHECK: OpMemberDecorate %S 4 Offset 64
+// CHECK: OpMemberDecorate %S 5 Offset 96
+// CHECK: OpMemberDecorate %S 5 MatrixStride 32
+// CHECK: OpMemberDecorate %S 5 RowMajor
+// CHECK: OpMemberDecorate %S 6 Offset 160
+// CHECK: OpMemberDecorate %S 7 Offset 176
+// CHECK: OpMemberDecorate %S 8 Offset 192
+// CHECK: OpMemberDecorate %S 8 MatrixStride 32
+// CHECK: OpMemberDecorate %S 8 ColMajor
+// CHECK: OpMemberDecorate %S 9 Offset 320
+
+// CHECK: OpDecorate %_runtimearr_S ArrayStride 352
+
+struct S {                       // Alignment | Offset + Size       = Next
+              float     f1;      // 0         | 0        4            4
+              uint64_t  f2;      // 8         | 8        8            16
+              double    f3[3];   // 8         | 16       8 * 3        40
+              float     f4;      // 4         | 40       4            44
+              int64_t3  f5;      // 32        | 64       8 * 3        88
+              double3x2 f6;      // 32        | 96       32 * 2       160    // SPIR-V RowMajor
+              double2x1 f7;      // 16        | 160      16           176
+              float     f8;      // 4         | 176      4            180
+    row_major double2x3 f9[2];   // 32        | 192      32 * 4       320    // SPIR-V ColMajor
+              int64_t2  f10[1];  // 16        | 320      16           336
+};                               // 32 (max)                          352
+
+StructuredBuffer<S> MySBuffer;
+
+void main() { }

+ 8 - 5
tools/clang/test/CodeGenSPIRV/vk.layout.asbuffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,13 +19,14 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %T 0 Offset 0
-// CHECK: OpMemberDecorate %T 1 Offset 448
+// CHECK: OpMemberDecorate %T 1 Offset 576
 
-// CHECK: OpDecorate %_runtimearr_T ArrayStride 464
+// CHECK: OpDecorate %_runtimearr_T ArrayStride 592
 
 // CHECK: OpMemberDecorate %type_AppendStructuredBuffer_T 0 Offset 0
 // CHECK: OpDecorate %type_AppendStructuredBuffer_T BufferBlock
@@ -36,7 +39,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 struct T {
@@ -49,4 +53,3 @@ AppendStructuredBuffer<T> buffer2;
 float main() : A {
     return 1.0;
 }
-

+ 30 - 26
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.nested.std140.hlsl

@@ -1,39 +1,42 @@
 // Run: %dxc -T vs_6_0 -E main
 
 // Deep nested array of matrices
-// Depp nested majorness
+// Deep nested majorness
 struct R {                         // Alignment    Offset  Size                              Next
     row_major    float2x3 rf1[3];  // 16(vec4)  -> 0     + 3(array) * stride(2 * 16(vec4)) = 96
     column_major float2x3 rf2[4];  // 16(vec4)  -> 96    + 4(array) * stride(3 * 16(vec4)) = 288
                  float2x3 rf3[2];  // 16(vec4)  -> 288   + 2(array) * stride(3 * 16(vec4)) = 384
-                 int      rf4;     // 4         -> 384   + 4                               = 388
-};                                 // 16(max)                                                400 (388 round up to R alignment)
+    row_major    int2x3   rf4[2];  // 16(vec4)  -> 384   + 2(array) * stride(2 * 16(vec4)) = 448
+                 int      rf5;     // 4         -> 448   + 4                               = 452
+};                                 // 16(max)                                                464 (452 round up to R alignment)
 
 // Array of scalars, vectors, matrices, and structs
 struct S {                         // Alignment   Offset  Size                              Next
     float3       sf1[3];           // 16(vec4) -> 0     + 3(array) * 16(vec4)             = 48
     float        sf2[3];           // 4        -> 48    + 3(array) * 16(vec4)             = 96
-    R            sf3[4];           // 16       -> 96    + 4(array) * stride(400)          = 1696
-    row_major    float3x2 sf4[2];  // 16(vec4) -> 1696  + 2(array) * stride(3 * 16(vec4)) = 1792
-    column_major float3x2 sf5[3];  // 16(vec4) -> 1792  + 3(array) * stride(2 * 16(vec4)) = 1888
-                 float3x2 sf6[4];  // 16(vec4) -> 1888  + 4(array) * stride(2 * 16(vec4)) = 2016
-                 float    sf7;     // 4        -> 2016  + 4                               = 2020
-};                                 // 16(max)                                               2032 (2020 round up to S alignment)
+    R            sf3[4];           // 16       -> 96    + 4(array) * stride(464)          = 1952
+    row_major    float3x2 sf4[2];  // 16(vec4) -> 1952  + 2(array) * stride(3 * 16(vec4)) = 2048
+    column_major float3x2 sf5[3];  // 16(vec4) -> 2048  + 3(array) * stride(2 * 16(vec4)) = 2144
+                 float3x2 sf6[4];  // 16(vec4) -> 2144  + 4(array) * stride(2 * 16(vec4)) = 2272
+                 float    sf7;     // 4        -> 2272  + 4                               = 2276
+};                                 // 16(max)                                               2288 (2276 round up to S alignment)
 
 struct T {        // Alignment    Offset  Size              Next
-    R    tf1[2];  // 16        -> 0     + 2(array) * 400  = 800
-    S    tf2[3];  // 16        -> 800   + 3(array) * 2032 = 6896
-    uint tf3;     // 4         -> 6896  + 4               = 6900
-};                // 16(max)                                6912 (6900 round up to T alignment)
+    R    tf1[2];  // 16        -> 0     + 2(array) * 464  = 928
+    S    tf2[3];  // 16        -> 928   + 3(array) * 2288 = 7792
+    uint tf3;     // 4         -> 7792  + 4               = 7796
+};                // 16(max)                                7808 (7796 round up to T alignment)
 
 cbuffer MyCbuffer {  // Alignment   Offset   Size              Next
-    T    t[2];       // 16       -> 0      + 2(array) * 6912 = 13824
-    bool z;          // 4        -> 13824
+    T    t[2];       // 16       -> 0      + 2(array) * 7808 = 15616
+    bool z;          // 4        -> 15616
 };
 
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_4 ArrayStride 48
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 48
+// CHECK:      OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK:      OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK:      OpMemberDecorate %R 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %R 0 MatrixStride 16
@@ -45,11 +48,12 @@ cbuffer MyCbuffer {  // Alignment   Offset   Size              Next
 // CHECK-NEXT: OpMemberDecorate %R 2 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %R 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %R 3 Offset 384
+// CHECK-NEXT: OpMemberDecorate %R 4 Offset 448
 
-// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 400
+// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 464
 // CHECK:      OpDecorate %_arr_v3float_uint_3 ArrayStride 16
 // CHECK:      OpDecorate %_arr_float_uint_3 ArrayStride 16
-// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 400
+// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 464
 
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_2 ArrayStride 48
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_3 ArrayStride 32
@@ -58,27 +62,27 @@ cbuffer MyCbuffer {  // Alignment   Offset   Size              Next
 // CHECK:      OpMemberDecorate %S 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %S 1 Offset 48
 // CHECK-NEXT: OpMemberDecorate %S 2 Offset 96
-// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1696
+// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1952
 // CHECK-NEXT: OpMemberDecorate %S 3 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 3 ColMajor
-// CHECK-NEXT: OpMemberDecorate %S 4 Offset 1792
+// CHECK-NEXT: OpMemberDecorate %S 4 Offset 2048
 // CHECK-NEXT: OpMemberDecorate %S 4 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 4 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 5 Offset 1888
+// CHECK-NEXT: OpMemberDecorate %S 5 Offset 2144
 // CHECK-NEXT: OpMemberDecorate %S 5 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 5 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 6 Offset 2016
+// CHECK-NEXT: OpMemberDecorate %S 6 Offset 2272
 
-// CHECK:      OpDecorate %_arr_S_uint_3 ArrayStride 2032
+// CHECK-NEXT: OpDecorate %_arr_S_uint_3 ArrayStride 2288
 
 // CHECK:      OpMemberDecorate %T 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %T 1 Offset 800
-// CHECK-NEXT: OpMemberDecorate %T 2 Offset 6896
+// CHECK-NEXT: OpMemberDecorate %T 1 Offset 928
+// CHECK-NEXT: OpMemberDecorate %T 2 Offset 7792
 
-// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 6912
+// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 7808
 
 // CHECK-NEXT: OpMemberDecorate %type_MyCbuffer 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %type_MyCbuffer 1 Offset 13824
+// CHECK-NEXT: OpMemberDecorate %type_MyCbuffer 1 Offset 15616
 
 // CHECK:      OpDecorate %type_MyCbuffer Block
 float main() : A {

+ 14 - 11
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.std140.hlsl

@@ -11,13 +11,14 @@ struct S {      // Alignment    Offset                                Size
     float  sf4; // 4         -> 44                                  + 4         = 48
 };              // 16(max)                                                        48(48 round up to S max alignment)
 
-struct T {           // Alignment     Offset                               Size              = Next
-    int      tf1;    // 4          -> 0                                  + 4                 = 4
-    R        tf2[3]; // 16         -> 16 (4 rounded up to R alignment)   + 3 * stride(16)    = 64
-    float3x2 tf3;    // 16(vec4)   -> 64 (64 round up to vec4 alignment) + 2 * stride(vec4)  = 96
-    S        tf4;    // 16         -> 96 (96 round up to S alignment)    + 48                = 144
-    float    tf5;    // 4          -> 144                                + 4                 = 148
-};                   // 16(max)                                                                160(148 round up to T max alignment)
+struct T {                     // Alignment     Offset                               Size              = Next
+              int      tf1;    // 4          -> 0                                  + 4                 = 4
+              R        tf2[3]; // 16         -> 16 (4 rounded up to R alignment)   + 3 * stride(16)    = 64
+              float3x2 tf3;    // 16(vec4)   -> 64 (64 round up to vec4 alignment) + 2 * stride(vec4)  = 96
+              S        tf4;    // 16         -> 96 (96 round up to S alignment)    + 48                = 144
+              float    tf5;    // 4          -> 144                                + 4                 = 148
+    row_major int3x2   tf6;    // 16(vec4)   -> 160 (148 rounded up to vec4)       + 3 * stride(vec4)  = 208
+};                             // 16(max)                                                                208(208 round up to T max alignment)
 
 cbuffer MyCBuffer {              // Alignment   Offset                                 Size                     Next
                  bool     a;     // 4        -> 0                                    +     4                  = 4
@@ -28,9 +29,9 @@ cbuffer MyCBuffer {              // Alignment   Offset
                  float2x1 f;     // 8(vec2)  -> 112 (112 round up to vec2 aligment)  + 2 * 4                  = 120
     row_major    float2x3 g[3];  // 16(vec4) -> 128 (120 round up to vec4 alignment) + 3 * 2 * stride(vec4)   = 224
     column_major float2x2 h[4];  // 16(vec4) -> 224 (224 round up to vec4 alignment) + 4 * 2 * stride(vec4)   = 352
-                 T        t;     // 16       -> 352 (352 round up to vec4 alignment) + 160                    = 512
-                 float    z;     // 4        -> 512
-
+                 T        t;     // 16       -> 352 (352 round up to vec4 alignment) + 208                    = 560
+    row_major    int2x3   y;     // 16(vec4) -> 560 (560 round up to vec4 alignment) + 2 * stride(vec4)       = 592
+                 float    z;     // 4        -> 592
 };
 
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
@@ -52,6 +53,7 @@ cbuffer MyCBuffer {              // Alignment   Offset
 // CHECK-NEXT: OpMemberDecorate %T 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %T 3 Offset 96
 // CHECK-NEXT: OpMemberDecorate %T 4 Offset 144
+// CHECK-NEXT: OpMemberDecorate %T 5 Offset 160
 
 // CHECK:      OpMemberDecorate %type_MyCBuffer 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 1 Offset 4
@@ -70,7 +72,8 @@ cbuffer MyCBuffer {              // Alignment   Offset
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 7 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 7 RowMajor
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 8 Offset 352
-// CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 9 Offset 512
+// CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 9 Offset 560
+// CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 10 Offset 592
 // CHECK-NEXT: OpDecorate %type_MyCBuffer Block
 
 float main() : A {

+ 20 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpc.hlsl

@@ -3,6 +3,9 @@
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5_0 ArrayStride 48
 
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_5 ArrayStride 32
+
 // CHECK: OpMemberDecorate %type_MyCBuffer 0 ColMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 1 RowMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 2 RowMajor
@@ -12,6 +15,8 @@ cbuffer MyCBuffer {
     row_major    float2x3 matrices1[5];
     column_major float2x3 matrices2[5];
                  float2x3 matrices3[5];
+
+    row_major    int2x3   matrices4[5];
 }
 
 void main() {
@@ -22,4 +27,19 @@ void main() {
     float2x3 m1 = matrices1[1];
     float2x3 m2 = matrices2[2];
     float2x3 m3 = matrices3[3];
+
+    // Note: Since non-fp matrices are represented as arrays of vectors, and
+    // due to layout decoration on the rhs of the assignments below,
+    // a load and store is performed for each vector.
+
+// CHECK:          [[ptr_matrices4:%\d+]] = OpAccessChain %_ptr_Uniform__arr__arr_v3int_uint_2_uint_5 %MyCBuffer %int_3
+// CHECK-NEXT:   [[ptr_matrices4_1:%\d+]] = OpAccessChain %_ptr_Uniform__arr_v3int_uint_2 [[ptr_matrices4]] %int_1
+// CHECK-NEXT:       [[matrices4_1:%\d+]] = OpLoad %_arr_v3int_uint_2 [[ptr_matrices4_1]]
+// CHECK-NEXT:  [[matrices4_1_row0:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 0
+// CHECK-NEXT:       [[ptr_m4_row0:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_0
+// CHECK-NEXT:                              OpStore [[ptr_m4_row0]] [[matrices4_1_row0]]
+// CHECK-NEXT:  [[matrices4_1_row1:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 1
+// CHECK-NEXT:       [[ptr_m4_row1:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_1
+// CHECK-NEXT:                              OpStore [[ptr_m4_row1]] [[matrices4_1_row1]]
+    int2x3 m4 = matrices4[1];
 }

+ 31 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpr.hlsl

@@ -3,6 +3,9 @@
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5_0 ArrayStride 48
 
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_5 ArrayStride 32
+
 // CHECK: OpMemberDecorate %type_MyCBuffer 0 ColMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 1 RowMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 2 ColMajor
@@ -12,6 +15,9 @@ cbuffer MyCBuffer {
     row_major    float2x3 matrices1[5];
     column_major float2x3 matrices2[5];
                  float2x3 matrices3[5];
+
+    row_major    int2x3   matrices4[5];
+                 int2x3   matrices5[5];
 }
 
 void main() {
@@ -22,4 +28,29 @@ void main() {
     float2x3 m1 = matrices1[1];
     float2x3 m2 = matrices2[2];
     float2x3 m3 = matrices3[3];
+
+    // Note: Since non-fp matrices are represented as arrays of vectors, and
+    // due to layout decoration on the rhs of the assignments below,
+    // a load and store is performed for each vector.
+
+// CHECK:          [[ptr_matrices4:%\d+]] = OpAccessChain %_ptr_Uniform__arr__arr_v3int_uint_2_uint_5 %MyCBuffer %int_3
+// CHECK-NEXT:   [[ptr_matrices4_1:%\d+]] = OpAccessChain %_ptr_Uniform__arr_v3int_uint_2 [[ptr_matrices4]] %int_1
+// CHECK-NEXT:       [[matrices4_1:%\d+]] = OpLoad %_arr_v3int_uint_2 [[ptr_matrices4_1]]
+// CHECK-NEXT:  [[matrices4_1_row0:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 0
+// CHECK-NEXT:       [[ptr_m4_row0:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_0
+// CHECK-NEXT:                              OpStore [[ptr_m4_row0]] [[matrices4_1_row0]]
+// CHECK-NEXT:  [[matrices4_1_row1:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 1
+// CHECK-NEXT:       [[ptr_m4_row1:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_1
+// CHECK-NEXT:                              OpStore [[ptr_m4_row1]] [[matrices4_1_row1]]
+    int2x3 m4 = matrices4[1];
+// CHECK:          [[ptr_matrices5:%\d+]] = OpAccessChain %_ptr_Uniform__arr__arr_v3int_uint_2_uint_5 %MyCBuffer %int_4
+// CHECK-NEXT:   [[ptr_matrices5_2:%\d+]] = OpAccessChain %_ptr_Uniform__arr_v3int_uint_2 [[ptr_matrices5]] %int_2
+// CHECK-NEXT:       [[matrices5_2:%\d+]] = OpLoad %_arr_v3int_uint_2 [[ptr_matrices5_2]]
+// CHECK-NEXT: [[matrices_5_2_row0:%\d+]] = OpCompositeExtract %v3int [[matrices5_2]] 0
+// CHECK-NEXT:       [[ptr_m5_row0:%\d+]] = OpAccessChain %_ptr_Function_v3int %m5 %uint_0
+// CHECK-NEXT:                              OpStore [[ptr_m5_row0]] [[matrices_5_2_row0]]
+// CHECK-NEXT: [[matrices_5_2_row1:%\d+]] = OpCompositeExtract %v3int [[matrices5_2]] 1
+// CHECK-NEXT:       [[ptr_m5_row1:%\d+]] = OpAccessChain %_ptr_Function_v3int %m5 %uint_1
+// CHECK-NEXT:                              OpStore [[ptr_m5_row1]] [[matrices_5_2_row1]]
+    int2x3 m5 = matrices5[2];
 }

+ 8 - 4
tools/clang/test/CodeGenSPIRV/vk.layout.csbuffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,13 +19,14 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %T 0 Offset 0
-// CHECK: OpMemberDecorate %T 1 Offset 448
+// CHECK: OpMemberDecorate %T 1 Offset 576
 
-// CHECK: OpDecorate %_runtimearr_T ArrayStride 464
+// CHECK: OpDecorate %_runtimearr_T ArrayStride 592
 
 // CHECK: OpMemberDecorate %type_ConsumeStructuredBuffer_T 0 Offset 0
 // CHECK: OpDecorate %type_ConsumeStructuredBuffer_T BufferBlock
@@ -36,7 +39,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 struct T {

+ 20 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.non-fp-matrix.error.hlsl

@@ -0,0 +1,20 @@
+// Run: %dxc -T vs_6_0 -E main
+
+cbuffer MyCBuffer {
+  struct S {
+    int2x3   matrices4[5];
+  } s;
+}
+
+struct T {
+    int2x3   t[5];
+};
+
+RWStructuredBuffer<T> rwsb;
+
+void main() {
+   int2x3 m4 = s.matrices4[1];
+}
+
+// CHECK: :6:5: error: externally initialized non-floating-point column-major matrices not supported yet
+// CHECK: :13:23: error: externally initialized non-floating-point column-major matrices not supported yet

+ 14 - 3
tools/clang/test/CodeGenSPIRV/vk.layout.push-constant.std430.hlsl

@@ -2,28 +2,39 @@
 
 // CHECK: OpDecorate %_arr_v2float_uint_3 ArrayStride 8
 // CHECK: OpDecorate %_arr_mat3v2float_uint_2 ArrayStride 32
+// CHECK: OpDecorate %_arr_v2int_uint_3 ArrayStride 8
+// CHECK: OpDecorate %_arr__arr_v2int_uint_3_uint_2 ArrayStride 24
 
 // CHECK: OpMemberDecorate %T 0 Offset 0
 // CHECK: OpMemberDecorate %T 1 Offset 32
 // CHECK: OpMemberDecorate %T 1 MatrixStride 16
 // CHECK: OpMemberDecorate %T 1 RowMajor
+// CHECK: OpMemberDecorate %T 2 Offset 96
+// CHECK: OpMemberDecorate %T 3 Offset 144
+// CHECK: OpMemberDecorate %T 3 MatrixStride 8
+// CHECK: OpMemberDecorate %T 3 ColMajor
 struct T {
                  float2   f1[3];
     column_major float3x2 f2[2];
+    row_major    int3x2   f4[2];
+    row_major    float3x2 f3[2];
 };
 
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
 // CHECK: OpMemberDecorate %type_PushConstant_S 0 Offset 0
 // CHECK: OpMemberDecorate %type_PushConstant_S 1 Offset 16
 // CHECK: OpMemberDecorate %type_PushConstant_S 2 Offset 32
-// CHECK: OpMemberDecorate %type_PushConstant_S 3 Offset 128
-// CHECK: OpMemberDecorate %type_PushConstant_S 3 MatrixStride 16
-// CHECK: OpMemberDecorate %type_PushConstant_S 3 ColMajor
+// CHECK: OpMemberDecorate %type_PushConstant_S 3 Offset 224
+// CHECK: OpMemberDecorate %type_PushConstant_S 4 Offset 256
+// CHECK: OpMemberDecorate %type_PushConstant_S 4 MatrixStride 16
+// CHECK: OpMemberDecorate %type_PushConstant_S 4 ColMajor
 
 // CHECK: OpDecorate %type_PushConstant_S Block
 struct S {
               float    f1;
               float3   f2;
               T        f4;
+    row_major int2x3   f5;
     row_major float2x3 f3;
 };
 

+ 34 - 26
tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.nested.std430.hlsl

@@ -6,29 +6,31 @@ struct R {                         // Alignment    Offset  Size
     row_major    float2x3 rf1[3];  // 16(vec4)  -> 0     + 3(array) * stride(2 * 16(vec4)) = 96
     column_major float2x3 rf2[4];  // 8(vec2)   -> 96    + 4(array) * stride(3 * 8(vec2))  = 192
                  float2x3 rf3[2];  // 8(vec2)   -> 192   + 2(array) * stride(3 * 8(vec2))  = 240
-                 int      rf4;     // 4         -> 240   + 4                               = 244
-};                                 // 16(max)                                                256 (244 round up to R alignment)
+    row_major    int2x3   rf4[3];  // 16(vec4)  -> 240   + 3(array) * stride(2 * 16(vec4)) = 336
+                 int      rf5;     // 4         -> 336   + 4                               = 340
+};                                 // 16(max)                                                352 (340 round up to R alignment)
 
 // Array of scalars, vectors, matrices, and structs
 struct S {                         // Alignment   Offset  Size                              Next
     float3       sf1[3];           // 16(vec4) -> 0     + 3(array) * 16(vec4)             = 48
     float        sf2[3];           // 4        -> 48    + 3(array) * 4                    = 60
-    R            sf3[4];           // 16       -> 64    + 4(array) * stride(256)          = 1088
-    row_major    float3x2 sf4[2];  // 8(vec2)  -> 1088  + 2(array) * stride(3 * 8(vec2))  = 1136
-    column_major float3x2 sf5[3];  // 16(vec4) -> 1136  + 3(array) * stride(2 * 16(vec4)) = 1232
-                 float3x2 sf6[4];  // 16(vec4) -> 1232  + 4(array) * stride(2 * 16(vec4)) = 1360
-                 float    sf7;     // 4        -> 1360  + 4                               = 1364
-};                                 // 16(max)                                               1376 (1364 round up to S alignment)
+    R            sf3[4];           // 16       -> 64    + 4(array) * stride(352)          = 1472
+    row_major    float3x2 sf4[2];  // 8(vec2)  -> 1472  + 2(array) * stride(3 * 8(vec2))  = 1520
+    column_major float3x2 sf5[3];  // 16(vec4) -> 1520  + 3(array) * stride(2 * 16(vec4)) = 1616
+                 float3x2 sf6[4];  // 16(vec4) -> 1616  + 4(array) * stride(2 * 16(vec4)) = 1744
+    row_major    int3x2   sf7[2];  // 8(vec2)  -> 1744  + 2(array) * stride(3 * 8(vec2))  = 1792
+                 float    sf8;     // 4        -> 1792  + 4                               = 1796
+};                                 // 16(max)                                               1808 (1796 round up to S alignment)
 
 struct T {        // Alignment    Offset  Size              Next
-    R    tf1[2];  // 16        -> 0     + 2(array) * 256  = 512
-    S    tf2[3];  // 16        -> 512   + 3(array) * 1376 = 4640
-    uint tf3;     // 4         -> 4640  + 4               = 4644
-};                // 16(max)                                4656 (4640 round up to T alignment)
+    R    tf1[2];  // 16        -> 0     + 2(array) * 352  = 704
+    S    tf2[3];  // 16        -> 704   + 3(array) * 1808 = 6128
+    uint tf3;     // 4         -> 6128  + 4               = 6132
+};                // 16(max)                                6144 (6132 round up to T alignment)
 
 struct SBuffer {  // Alignment   Offset   Size                 Next
-    T    t[2];       // 16       -> 0      + 2(array) * 4656 = 9312
-    bool z;          // 4        -> 9312
+    T    t[2];       // 16       -> 0      + 2(array) * 6144 = 12288
+    bool z;          // 4        -> 12288
 };
 
 RWStructuredBuffer<SBuffer> MySBuffer;
@@ -36,6 +38,8 @@ RWStructuredBuffer<SBuffer> MySBuffer;
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_4 ArrayStride 24
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 24
+// CHECK:      OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK:      OpDecorate %_arr__arr_v3int_uint_2_uint_3 ArrayStride 32
 
 // CHECK:      OpMemberDecorate %R 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %R 0 MatrixStride 16
@@ -47,42 +51,46 @@ RWStructuredBuffer<SBuffer> MySBuffer;
 // CHECK-NEXT: OpMemberDecorate %R 2 MatrixStride 8
 // CHECK-NEXT: OpMemberDecorate %R 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %R 3 Offset 240
+// CHECK-NEXT: OpMemberDecorate %R 4 Offset 336
 
-// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 256
+// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 352
 // CHECK:      OpDecorate %_arr_v3float_uint_3 ArrayStride 16
 // CHECK:      OpDecorate %_arr_float_uint_3 ArrayStride 4
-// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 256
+// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 352
 
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_2 ArrayStride 24
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_4 ArrayStride 32
+// CHECK:      OpDecorate %_arr_v2int_uint_3 ArrayStride 8
+// CHECK:      OpDecorate %_arr__arr_v2int_uint_3_uint_2 ArrayStride 24
 
 // CHECK:      OpMemberDecorate %S 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %S 1 Offset 48
 // CHECK-NEXT: OpMemberDecorate %S 2 Offset 64
-// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1088
+// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1472
 // CHECK-NEXT: OpMemberDecorate %S 3 MatrixStride 8
 // CHECK-NEXT: OpMemberDecorate %S 3 ColMajor
-// CHECK-NEXT: OpMemberDecorate %S 4 Offset 1136
+// CHECK-NEXT: OpMemberDecorate %S 4 Offset 1520
 // CHECK-NEXT: OpMemberDecorate %S 4 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 4 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 5 Offset 1232
+// CHECK-NEXT: OpMemberDecorate %S 5 Offset 1616
 // CHECK-NEXT: OpMemberDecorate %S 5 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 5 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 6 Offset 1360
+// CHECK-NEXT: OpMemberDecorate %S 6 Offset 1744
+// CHECK-NEXT: OpMemberDecorate %S 7 Offset 1792
 
-// CHECK:      OpDecorate %_arr_S_uint_3 ArrayStride 1376
+// CHECK:      OpDecorate %_arr_S_uint_3 ArrayStride 1808
 
 // CHECK:      OpMemberDecorate %T 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %T 1 Offset 512
-// CHECK-NEXT: OpMemberDecorate %T 2 Offset 4640
+// CHECK-NEXT: OpMemberDecorate %T 1 Offset 704
+// CHECK-NEXT: OpMemberDecorate %T 2 Offset 6128
 
-// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 4656
+// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 6144
 
 // CHECK-NEXT: OpMemberDecorate %SBuffer 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %SBuffer 1 Offset 9312
+// CHECK-NEXT: OpMemberDecorate %SBuffer 1 Offset 12288
 
-// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 9328
+// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 12304
 
 // CHECK:      OpMemberDecorate %type_RWStructuredBuffer_SBuffer 0 Offset 0
 // CHECK-NEXT: OpDecorate %type_RWStructuredBuffer_SBuffer BufferBlock

+ 18 - 11
tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.std430.hlsl

@@ -11,13 +11,14 @@ struct S {      // Alignment    Offset                                Size
     float  sf4; // 4         -> 28                                  + 4         = 32
 };              // 16(max)                                                        32
 
-struct T {           // Alignment     Offset                               Size              = Next
-    int      tf1;    // 4          -> 0                                  + 4                 = 4
-    R        tf2[3]; // 8          -> 8                                  + 3 * stride(8)     = 32
-    float3x2 tf3;    // 16(vec4)   -> 32 (32 round up to vec4 alignment) + 2 * stride(vec4)  = 64
-    S        tf4;    // 16         -> 64 (64 round up to S alignment)    + 32                = 96
-    float    tf5;    // 4          -> 96                                 + 4                 = 100
-};                   // 16(max)                                                                112(100 round up to T max alignment)
+struct T {                      // Alignment     Offset                               Size              = Next
+               int      tf1;    // 4          -> 0                                  + 4                 = 4
+               R        tf2[3]; // 8          -> 8                                  + 3 * stride(8)     = 32
+               float3x2 tf3;    // 16(vec4)   -> 32 (32 round up to vec4 alignment) + 2 * stride(vec4)  = 64
+  row_major    int3x2   tf4;    // 16(vec4)   -> 64 (64 round up to vec4 alignment) + 3 * stride(vec2)  = 88
+               S        tf5;    // 16         -> 96 (88 round up to S alignment)    + 32                = 128
+               float    tf6;    // 4          -> 128                                + 4                 = 132
+};                              // 16(max)                                                                144(132 round up to T max alignment)
 
 struct SBuffer {              // Alignment   Offset                                 Size                     Next
                  bool     a;     // 4        -> 0                                    +     4                  = 4
@@ -28,8 +29,9 @@ struct SBuffer {              // Alignment   Offset
                  float2x1 f;     // 8(vec2)  -> 88 (88 round up to vec2 aligment)    + 2 * 4                  = 96
     row_major    float2x3 g[3];  // 16(vec4) -> 96 (96 round up to vec4 alignment)   + 3 * 2 * stride(vec4)   = 192
     column_major float2x2 h[4];  // 16(vec4) -> 192 (192 round up to vec2 alignment) + 4 * 2 * stride(vec2)   = 256
-                 T        t;     // 16       -> 256 (352 round up to T alignment)    + 112                    = 368
-                 float    z;     // 4        -> 368
+    row_major    int2x3   i[5];  // 16(vec4) -> 256 (256 round up to vec4 alignment) + 5 * 2 * stride(vec4)   = 416
+                 T        t;     // 16       -> 416 (416 round up to T alignment)    + 144                    = 560
+                 float    z;     // 4        -> 560
 
 };
 
@@ -37,10 +39,13 @@ StructuredBuffer<SBuffer> MySBuffer;
 
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat2v2float_uint_4 ArrayStride 16
+// CHECK:      OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK:      OpDecorate %_arr__arr_v3int_uint_2_uint_5 ArrayStride 32
 
 // CHECK:      OpMemberDecorate %R 0 Offset 0
 
 // CHECK:      OpDecorate %_arr_R_uint_3 ArrayStride 8
+// CHECK:      OpDecorate %_arr_v2int_uint_3 ArrayStride 8
 
 // CHECK:      OpMemberDecorate %S 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %S 1 Offset 8
@@ -54,6 +59,7 @@ StructuredBuffer<SBuffer> MySBuffer;
 // CHECK-NEXT: OpMemberDecorate %T 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %T 3 Offset 64
 // CHECK-NEXT: OpMemberDecorate %T 4 Offset 96
+// CHECK-NEXT: OpMemberDecorate %T 5 Offset 128
 
 // CHECK:      OpMemberDecorate %SBuffer 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %SBuffer 1 Offset 4
@@ -72,9 +78,10 @@ StructuredBuffer<SBuffer> MySBuffer;
 // CHECK-NEXT: OpMemberDecorate %SBuffer 7 MatrixStride 8
 // CHECK-NEXT: OpMemberDecorate %SBuffer 7 RowMajor
 // CHECK-NEXT: OpMemberDecorate %SBuffer 8 Offset 256
-// CHECK-NEXT: OpMemberDecorate %SBuffer 9 Offset 368
+// CHECK-NEXT: OpMemberDecorate %SBuffer 9 Offset 416
+// CHECK-NEXT: OpMemberDecorate %SBuffer 10 Offset 560
 
-// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 384
+// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 576
 
 // CHECK:      OpMemberDecorate %type_StructuredBuffer_SBuffer 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %type_StructuredBuffer_SBuffer 0 NonWritable

+ 7 - 3
tools/clang/test/CodeGenSPIRV/vk.layout.tbuffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,11 +19,12 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %type_myTbuffer 0 Offset 0
-// CHECK: OpMemberDecorate %type_myTbuffer 1 Offset 448
+// CHECK: OpMemberDecorate %type_myTbuffer 1 Offset 576
 
 // CHECK: OpDecorate %type_myTbuffer BufferBlock
 
@@ -34,7 +37,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 tbuffer myTbuffer : register(t0)

+ 7 - 3
tools/clang/test/CodeGenSPIRV/vk.layout.texture-buffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,11 +19,12 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %type_TextureBuffer_T 0 Offset 0
-// CHECK: OpMemberDecorate %type_TextureBuffer_T 1 Offset 448
+// CHECK: OpMemberDecorate %type_TextureBuffer_T 1 Offset 576
 
 // CHECK: OpDecorate %type_TextureBuffer_T BufferBlock
 
@@ -34,7 +37,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 struct T {

+ 46 - 0
tools/clang/test/CodeGenSPIRV/vk.location.composite.hlsl

@@ -0,0 +1,46 @@
+// Run: %dxc -T vs_6_0 -E main
+
+// CHECK: OpDecorate %in_var_A Location 0
+// CHECK: OpDecorate %in_var_B Location 1
+// CHECK: OpDecorate %in_var_C Location 2
+// CHECK: OpDecorate %in_var_D Location 4
+// CHECK: OpDecorate %in_var_E Location 6
+// CHECK: OpDecorate %in_var_F Location 8
+// CHECK: OpDecorate %in_var_G Location 16
+
+// CHECK: OpDecorate %out_var_A Location 0
+// CHECK: OpDecorate %out_var_B Location 2
+// CHECK: OpDecorate %out_var_C Location 3
+// CHECK: OpDecorate %out_var_D Location 4
+// CHECK: OpDecorate %out_var_E Location 5
+// CHECK: OpDecorate %out_var_F Location 11
+// CHECK: OpDecorate %out_var_G Location 13
+// CHECK: OpDecorate %out_var_H Location 14
+
+struct S {
+    half2x3  matrix2x3 : A; // 0 (+2)
+    float1x2 vector1x2 : B; // 2 (+1)
+    float3x1 vector3x1 : C; // 3 (+1)
+    float1x1 scalar1x1 : D; // 4 (+1)
+};
+
+struct T {
+    S        s;
+    float2x3 array1[3] : E; // 5  (+2*3)
+    half1x2  array2[2] : F; // 11 (+1*2)
+    half3x1  array3[1] : G; // 13 (+1*1)
+    float    array4[4] : H; // 14 (+1*4)
+};
+
+T main(
+    double    a   : A, // 0  (+1)
+    double2   b   : B, // 1  (+1)
+    double3   c   : C, // 2  (+2)
+    double4   d   : D, // 4  (+2)
+    double2x2 e   : E, // 6  (+1*2)
+    double2x3 f[2]: F, // 8  (+2*2*2)
+    double2x3 g   : G  // 16 (+2*2)
+) {
+    T t = (T)0;
+    return t;
+}

+ 0 - 0
tools/clang/test/vk.cloption.invert-y.vs.hlsl


+ 8 - 3
tools/clang/tools/dxcompiler/dxcontainerbuilder.cpp

@@ -98,8 +98,12 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::AddPart(_In_ UINT32 fourCC, _In_
     IFTBOOL(pSource != nullptr && !IsDxilContainerLike(pSource->GetBufferPointer(),
       pSource->GetBufferSize()),
       E_INVALIDARG);
-    // Only allow adding private data and root signature for now
-    IFTBOOL(fourCC == DxilFourCC::DFCC_RootSignature || fourCC == DxilFourCC::DFCC_PrivateData, E_INVALIDARG);
+    // Only allow adding private data, debug info name and root signature for now
+    IFTBOOL(
+        fourCC == DxilFourCC::DFCC_RootSignature || 
+        fourCC == DxilFourCC::DFCC_ShaderDebugName ||
+        fourCC == DxilFourCC::DFCC_PrivateData, 
+      E_INVALIDARG);
     PartList::iterator it = std::find_if(m_parts.begin(), m_parts.end(), [&](DxilPart part) {
       return part.m_fourCC == fourCC;
     });
@@ -117,9 +121,10 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::RemovePart(_In_ UINT32 fourCC) {
   DxcThreadMalloc TM(m_pMalloc);
   try {
     IFTBOOL(fourCC == DxilFourCC::DFCC_ShaderDebugInfoDXIL ||
+                fourCC == DxilFourCC::DFCC_ShaderDebugName ||
                 fourCC == DxilFourCC::DFCC_RootSignature ||
                 fourCC == DxilFourCC::DFCC_PrivateData,
-            E_INVALIDARG); // You can only remove debug info, rootsignature, or private data blob
+            E_INVALIDARG); // You can only remove debug info, debug info name, rootsignature, or private data blob
     PartList::iterator it =
       std::find_if(m_parts.begin(), m_parts.end(),
         [&](DxilPart part) { return part.m_fourCC == fourCC; });

+ 83 - 7
tools/clang/unittests/HLSL/CompilerTest.cpp

@@ -400,6 +400,7 @@ public:
   TEST_METHOD(CompileWhenWorksThenDisassembleWorks)
   TEST_METHOD(CompileWhenDebugWorksThenStripDebug)
   TEST_METHOD(CompileWhenWorksThenAddRemovePrivate)
+  TEST_METHOD(CompileThenAddCustomDebugName)
   TEST_METHOD(CompileWithRootSignatureThenStripRootSignature)
 
   TEST_METHOD(CompileWhenIncludeThenLoadInvoked)
@@ -2081,12 +2082,12 @@ TEST_F(CompilerTest, CompileWhenWorksThenAddRemovePrivate) {
 
   VERIFY_SUCCEEDED(CreateCompiler(&pCompiler));
   CreateBlobFromText("float4 main() : SV_Target {\r\n"
-                     "  return 0;\r\n"
-                     "}",
-                     &pSource);
+    "  return 0;\r\n"
+    "}",
+    &pSource);
   VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"source.hlsl", L"main",
-                                      L"ps_6_0", nullptr, 0, nullptr, 0,
-                                      nullptr, &pResult));
+    L"ps_6_0", nullptr, 0, nullptr, 0,
+    nullptr, &pResult));
   VERIFY_SUCCEEDED(pResult->GetResult(&pProgram));
   // Append private data blob
   CComPtr<IDxcContainerBuilder> pBuilder;
@@ -2103,9 +2104,9 @@ TEST_F(CompilerTest, CompileWhenWorksThenAddRemovePrivate) {
   CComPtr<IDxcBlob> pNewProgram;
   VERIFY_SUCCEEDED(pResult->GetResult(&pNewProgram));
   hlsl::DxilContainerHeader *pContainerHeader =
-      (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
+    (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
   hlsl::DxilPartHeader *pPartHeader = hlsl::GetDxilPartByType(
-      pContainerHeader, hlsl::DxilFourCC::DFCC_PrivateData);
+    pContainerHeader, hlsl::DxilFourCC::DFCC_PrivateData);
   VERIFY_IS_NOT_NULL(pPartHeader);
   // compare data
   std::string privatePart((const char *)(pPartHeader + 1), privateTxt.size());
@@ -2128,6 +2129,81 @@ TEST_F(CompilerTest, CompileWhenWorksThenAddRemovePrivate) {
   VERIFY_IS_NULL(pPartHeader);
 }
 
+TEST_F(CompilerTest, CompileThenAddCustomDebugName) {
+  CComPtr<IDxcCompiler> pCompiler;
+  CComPtr<IDxcOperationResult> pResult;
+  CComPtr<IDxcBlobEncoding> pSource;
+  CComPtr<IDxcBlob> pProgram;
+
+  VERIFY_SUCCEEDED(CreateCompiler(&pCompiler));
+  CreateBlobFromText("float4 main() : SV_Target {\r\n"
+    "  return 0;\r\n"
+    "}",
+    &pSource);
+
+  LPCWSTR args[] = { L"/Zi", L"/Zss" };
+
+  VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"source.hlsl", L"main",
+    L"ps_6_0", args, _countof(args), nullptr, 0,
+    nullptr, &pResult));
+  VERIFY_SUCCEEDED(pResult->GetResult(&pProgram));
+  // Append private data blob
+  CComPtr<IDxcContainerBuilder> pBuilder;
+  VERIFY_SUCCEEDED(CreateContainerBuilder(&pBuilder));
+
+  const char pNewName[] = "MyOwnUniqueName.lld";
+  //include null terminator:
+  size_t nameBlobPartSize = sizeof(hlsl::DxilShaderDebugName) + _countof(pNewName);
+  // round up to four-byte size:
+  size_t allocatedSize = (nameBlobPartSize + 3) & ~3;
+  auto pNameBlobContent = reinterpret_cast<hlsl::DxilShaderDebugName*>(malloc(allocatedSize));
+  ZeroMemory(pNameBlobContent, allocatedSize); //just to make sure trailing nulls are nulls.
+  pNameBlobContent->Flags = 0;
+  pNameBlobContent->NameLength = _countof(pNewName) - 1; //this is not supposed to include null terminator
+  memcpy(pNameBlobContent + 1, pNewName, _countof(pNewName));
+
+  CComPtr<IDxcBlobEncoding> pDebugName;
+
+  CreateBlobPinned(pNameBlobContent, allocatedSize, CP_UTF8, &pDebugName);
+
+
+  VERIFY_SUCCEEDED(pBuilder->Load(pProgram));
+  // should fail since it already exists:
+  VERIFY_FAILED(pBuilder->AddPart(hlsl::DxilFourCC::DFCC_ShaderDebugName, pDebugName));
+  VERIFY_SUCCEEDED(pBuilder->RemovePart(hlsl::DxilFourCC::DFCC_ShaderDebugName));
+  VERIFY_SUCCEEDED(pBuilder->AddPart(hlsl::DxilFourCC::DFCC_ShaderDebugName, pDebugName));
+  pResult.Release();
+  VERIFY_SUCCEEDED(pBuilder->SerializeContainer(&pResult));
+
+  CComPtr<IDxcBlob> pNewProgram;
+  VERIFY_SUCCEEDED(pResult->GetResult(&pNewProgram));
+  hlsl::DxilContainerHeader *pContainerHeader =
+    (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
+  hlsl::DxilPartHeader *pPartHeader = hlsl::GetDxilPartByType(
+    pContainerHeader, hlsl::DxilFourCC::DFCC_ShaderDebugName);
+  VERIFY_IS_NOT_NULL(pPartHeader);
+  // compare data
+  VERIFY_IS_TRUE(memcmp(pPartHeader + 1, pNameBlobContent, allocatedSize) == 0);
+
+  free(pNameBlobContent);
+
+  // Remove private data blob
+  pBuilder.Release();
+  VERIFY_SUCCEEDED(CreateContainerBuilder(&pBuilder));
+  VERIFY_SUCCEEDED(pBuilder->Load(pNewProgram));
+  VERIFY_SUCCEEDED(pBuilder->RemovePart(hlsl::DxilFourCC::DFCC_ShaderDebugName));
+  pResult.Release();
+  VERIFY_SUCCEEDED(pBuilder->SerializeContainer(&pResult));
+
+  pNewProgram.Release();
+  VERIFY_SUCCEEDED(pResult->GetResult(&pNewProgram));
+  pContainerHeader =
+    (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
+  pPartHeader = hlsl::GetDxilPartByType(
+    pContainerHeader, hlsl::DxilFourCC::DFCC_ShaderDebugName);
+  VERIFY_IS_NULL(pPartHeader);
+}
+
 TEST_F(CompilerTest, CompileWithRootSignatureThenStripRootSignature) {
   CComPtr<IDxcCompiler> pCompiler;
   CComPtr<IDxcOperationResult> pResult;

+ 20 - 20
tools/clang/unittests/HLSL/ShaderOpArithTable.xml

@@ -1760,8 +1760,8 @@
             <Parameter Name="ShaderOp.Arguments">-enable-16bit-types</Parameter>
         </Row>
         <Row Name="HcosHalf">
-            <Parameter Name="Validation.Type">Epsilon</Parameter>
-            <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+            <Parameter Name="Validation.Type">ulp</Parameter>
+            <Parameter Name="Validation.Tolerance">2</Parameter>
             <Parameter Name="ShaderOp.Text"> struct SUnaryFPOp {
                 float16_t input;
                 float16_t output;
@@ -5798,19 +5798,19 @@
             <Parameter Name="ShaderOp.Target">cs_6_2</Parameter>
             <Parameter Name="Validation.Input1">
                 <Value>0x007F0000</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x40000000</Value>
+                <Value>0x807F0000</Value>
+                <Value>0x20000000</Value>
                 <Value>0x00800000</Value>
             </Parameter>
             <Parameter Name="Validation.Input2">
                 <Value>1</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x7F7F0000</Value>
+                <Value>4</Value>
+                <Value>0x607F0000</Value>
                 <Value>0x40000000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected1">
                 <Value>0</Value>
-                <Value>NaN</Value>
+                <Value>0</Value>
                 <Value>0</Value>
                 <Value>0</Value>
             </Parameter>
@@ -5925,25 +5925,25 @@
             <Parameter Name="ShaderOp.Target">cs_6_2</Parameter>
             <Parameter Name="Validation.Input1">
                 <Value>0x007F0000</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x40000000</Value>
+                <Value>0x807F0000</Value>
+                <Value>0x20000000</Value>
                 <Value>0x00800000</Value>
             </Parameter>
             <Parameter Name="Validation.Input2">
                 <Value>1</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x7F7F0000</Value>
+                <Value>4</Value>
+                <Value>0x607F0000</Value>
                 <Value>0x40000000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected1">
                 <Value>0x007F0000</Value>
-                <Value>1</Value>
-                <Value>0x00404040</Value>
+                <Value>0x801FC000</Value>
+                <Value>0x00101010</Value>
                 <Value>0x00400000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected2">
                 <Value>0</Value>
-                <Value>NaN</Value>
+                <Value>0</Value>
                 <Value>0</Value>
                 <Value>0</Value>
             </Parameter>
@@ -6045,20 +6045,20 @@
             <Parameter Name="ShaderOp.Target">cs_6_2</Parameter>
             <Parameter Name="Validation.Input1">
                 <Value>0x007F0000</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x40000000</Value>
+                <Value>0x807F0000</Value>
+                <Value>0x20000000</Value>
                 <Value>0x00800000</Value>
             </Parameter>
             <Parameter Name="Validation.Input2">
                 <Value>1</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x7F7F0000</Value>
+                <Value>4</Value>
+                <Value>0x607F0000</Value>
                 <Value>0x40000000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected1">
                 <Value>0x007F0000</Value>
-                <Value>1</Value>
-                <Value>0x00404040</Value>
+                <Value>0x801FC000</Value>
+                <Value>0x00101010</Value>
                 <Value>0x00400000</Value>
             </Parameter>
             <Parameter Name="ShaderOp.Arguments">-denorm preserve</Parameter>

+ 44 - 2
tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp

@@ -194,8 +194,13 @@ TEST_F(FileTest, BinaryOpVectorArithAssign) {
 TEST_F(FileTest, BinaryOpMatrixArithAssign) {
   runFileTest("binary-op.arith-assign.matrix.hlsl");
 }
-TEST_F(FileTest, BinaryOpMixedArithAssign) {
-  runFileTest("binary-op.arith-assign.mixed.hlsl");
+TEST_F(FileTest, BinaryOpMixedFormArithAssign) {
+  // Test mixing scalar/vector/matrix/etc.
+  runFileTest("binary-op.arith-assign.mixed.form.hlsl");
+}
+TEST_F(FileTest, BinaryOpMixedTypeArithAssign) {
+  // Test mixing float/int/uint/bool/etc.
+  runFileTest("binary-op.arith-assign.mixed.type.hlsl");
 }
 
 // For bitwise binary operators
@@ -273,6 +278,10 @@ TEST_F(FileTest, OpArrayAccess) { runFileTest("op.array.access.hlsl"); }
 TEST_F(FileTest, OpBufferAccess) { runFileTest("op.buffer.access.hlsl"); }
 TEST_F(FileTest, OpRWBufferAccess) { runFileTest("op.rwbuffer.access.hlsl"); }
 TEST_F(FileTest, OpCBufferAccess) { runFileTest("op.cbuffer.access.hlsl"); }
+TEST_F(FileTest, OpCBufferAccessMajorness) {
+  /// Tests that we correctly consider majorness when accessing matrices
+  runFileTest("op.cbuffer.access.majorness.hlsl");
+}
 TEST_F(FileTest, OpConstantBufferAccess) {
   runFileTest("op.constant-buffer.access.hlsl");
 }
@@ -327,9 +336,13 @@ TEST_F(FileTest, CastFlatConversionStruct) {
 TEST_F(FileTest, CastFlatConversionNoOp) {
   runFileTest("cast.flat-conversion.no-op.hlsl");
 }
+TEST_F(FileTest, CastFlatConversionLiteralInitializer) {
+  runFileTest("cast.flat-conversion.literal-initializer.hlsl");
+}
 TEST_F(FileTest, CastExplicitVecToMat) {
   runFileTest("cast.vec-to-mat.explicit.hlsl");
 }
+TEST_F(FileTest, CastBitwidth) { runFileTest("cast.bitwidth.hlsl"); }
 
 // For vector/matrix splatting and trunction
 TEST_F(FileTest, CastTruncateVector) { runFileTest("cast.vector.trunc.hlsl"); }
@@ -969,6 +982,22 @@ TEST_F(FileTest, PrimitiveErrorGS) {
   runFileTest("primitive.error.gs.hlsl", Expect::Failure);
 }
 
+// Shader model 6.0 wave query
+TEST_F(FileTest, SM6WaveGetLaneCount) {
+  runFileTest("sm6.wave-get-lane-count.hlsl");
+}
+TEST_F(FileTest, SM6WaveGetLaneIndex) {
+  runFileTest("sm6.wave-get-lane-index.hlsl");
+}
+TEST_F(FileTest, SM6WaveBuiltInNoDuplicate) {
+  runFileTest("sm6.wave.builtin.no-dup.hlsl");
+}
+
+// Shader model 6.0 wave broadcast
+TEST_F(FileTest, SM6WaveReadLaneFirst) {
+  runFileTest("sm6.wave-read-lane-first.hlsl");
+}
+
 // SPIR-V specific
 TEST_F(FileTest, SpirvStorageClass) { runFileTest("spirv.storage-class.hlsl"); }
 
@@ -1086,6 +1115,9 @@ TEST_F(FileTest, VulkanLocationInputExplicitOutputImplicit) {
 TEST_F(FileTest, VulkanLocationInputImplicitOutputExplicit) {
   runFileTest("vk.location.exp-out.hlsl");
 }
+TEST_F(FileTest, VulkanLocationCompositeTypes) {
+  runFileTest("vk.location.composite.hlsl");
+}
 TEST_F(FileTest, VulkanLocationTooLarge) {
   runFileTest("vk.location.large.hlsl", Expect::Failure);
 }
@@ -1179,6 +1211,12 @@ TEST_F(FileTest, VulkanLayoutTBufferStd430) {
 TEST_F(FileTest, VulkanLayoutTextureBufferStd430) {
   runFileTest("vk.layout.texture-buffer.std430.hlsl");
 }
+TEST_F(FileTest, VulkanLayout64BitTypesStd430) {
+  runFileTest("vk.layout.64bit-types.std430.hlsl");
+}
+TEST_F(FileTest, VulkanLayout64BitTypesStd140) {
+  runFileTest("vk.layout.64bit-types.std140.hlsl");
+}
 
 TEST_F(FileTest, VulkanLayoutPushConstantStd430) {
   runFileTest("vk.layout.push-constant.std430.hlsl");
@@ -1196,6 +1234,10 @@ TEST_F(FileTest, VulkanSubpassInputError) {
   runFileTest("vk.subpass-input.error.hlsl", Expect::Failure);
 }
 
+TEST_F(FileTest, NonFpColMajorError) {
+  runFileTest("vk.layout.non-fp-matrix.error.hlsl", Expect::Failure);
+}
+
 // HS: for different Patch Constant Functions
 TEST_F(FileTest, HullShaderPCFVoid) { runFileTest("hs.pcf.void.hlsl"); }
 TEST_F(FileTest, HullShaderPCFTakesInputPatch) {

+ 59 - 0
utils/hct/hctdb_inst_docs.txt

@@ -588,3 +588,62 @@ dest0, dest1 = USubb(src0, src1)
 * Inst: AttributeAtVertex - returns the values of the attributes at the vertex.
 
 returns the values of the attributes at the vertex. VertexID ranges from 0 to 2.
+
+* Inst: FDiv - returns the quotient of its two operands
+
+%dest = fdiv float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that fast math flag is not used and "fp32-denorm-mode"="preserve".
+When "fp32-denorm-mode"="ftz", denorm inputs should be interpreted as corresponding signed zero, and any resulting denorm is also flushed to zero.
+When fast math is enabled, implementation may use reciprocal form: src0*(1/src1).  This may result in evaluating src0*(+/-)INF from src0*(1/(+/-)denorm).  This may produce NaN in some cases or (+/-)INF in others.
+
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| src0\\src1| -inf     | -F     |  -1   | -denorm | -0 | +0 | +denorm |  +1   |    +F  | +inf | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -inf      | NaN      |   +inf | +inf  | +inf    |+inf|-inf| -inf    |  -inf |   -inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -F        | +0       |   +F   | -src0 | +F      |+inf|-inf| -F      |  src0 |   -F   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -denorm   | +0       | +denorm| -src0 | +F      |+inf|-inf| -F      |  src0 |-denorm | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -0        | +0       |   +0   | +0    | 0       |NaN |NaN | 0       |  -0   |   -0   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +0        | -0       |   -0   | -0    | 0       |NaN |NaN | 0       |  +0   |   +0   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +denorm   | -0       | -denorm| -src0 | -F      |-inf|+inf| +F      |  src0 |+denorm | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +F        | -0       |   -F   | -src0 | -F      |-inf|+inf| +F      |  src0 |   +F   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +inf      | NaN      |   -inf | -inf  | -inf    |-inf|+inf| +inf    |  +inf |   +inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| NaN       | NaN      |   NaN  | NaN   | NaN     |NaN |NaN | NaN     |  NaN  |   NaN  | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+
+* Inst: FAdd - component-wise add
+
+%dest = fadd float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that "fp32-denorm-mode"="preserve". 
+For "fp32-denorm-mode"="ftz" mode, denorm inputs should be treated as corresponding signed zero, and any resulting denorm is also flushed to zero.
+
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| src0\src1| -inf     | -F     | -denorm  | -0 | +0 | +denorm   |    +F  | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -inf     | -inf     |   -inf | -inf     |-inf|-inf| -inf      |   -inf | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -F       | -inf     |   -F   | -F       |src0|src0| -F        |   +/-F | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -denorm  | -inf     |   -F   |-F/denorm |src0|src0| +/-denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +denorm  | -inf     |   -F   |+/-denorm |src0|src0| +F/denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +F       | -inf     |  +/-F  | +F       |src0|src0| +F        |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +inf     | NaN      |   +inf | +inf     |+inf|+inf| +inf      |   +inf | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| NaN      | NaN      |   NaN  | NaN      |NaN |NaN | NaN       |   NaN  | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+

+ 10 - 6
utils/hct/hctdb_test.py

@@ -91,14 +91,18 @@ def add_test_case_float_half(test_name, inst_names, validation_type, validation_
     add_test_case(test_name, inst_names, validation_type, validation_tolerance,
                   float_input_lists, float_output_lists, "cs_6_0", get_shader_text(shader_key, shader_op_name), **kwargs)
     # if half test cases are different from float input lists, use those lists instead for half testings
-    half_input_lists, half_output_lists = float_input_lists, float_output_lists
+    half_input_lists, half_output_lists, half_validation_type, half_validation_tolerance = float_input_lists, float_output_lists, validation_type, validation_tolerance
     if "half_inputs" in kwargs:
         half_input_lists = kwargs["half_inputs"]
     if "half_outputs" in kwargs:
         half_output_lists = kwargs["half_outputs"]
+    if "half_validation_type" in kwargs:
+        half_validation_type = kwargs["half_validation_type"]
+    if "half_validation_tolerance" in kwargs:
+        half_validation_tolerance = kwargs["half_validation_tolerance"]
     # skip relative error test check for half for now
     if validation_type != "Relative":
-        add_test_case(test_name + "Half", inst_names, validation_type, validation_tolerance,
+        add_test_case(test_name + "Half", inst_names, half_validation_type, half_validation_tolerance,
                     half_input_lists, half_output_lists, "cs_6_2",
                     get_shader_text(shader_key.replace("float","half"), shader_op_name), shader_arguments="-enable-16bit-types", **kwargs)
 
@@ -601,7 +605,7 @@ def add_test_cases():
         [['NaN', '-Inf', '-denorm', '-0', '0', 'denorm', 'Inf', '1', '-1']], [[
             'NaN', 'Inf', '1.0', '1.0', '1.0', '1.0', 'Inf', '1.543081',
             '1.543081'
-        ]], "unary float", "cosh")
+        ]], "unary float", "cosh", half_validation_type='ulp', half_validation_tolerance=2)
     add_test_case_float_half('Hsin', ['Hsin'], 'Epsilon', 0.0008,
         [['NaN', '-Inf', '-denorm', '-0', '0', 'denorm', 'Inf', '1', '-1']], [[
             'NaN', '-Inf', '0.0', '0.0', '0.0', '0.0', 'Inf', '1.175201',
@@ -802,9 +806,9 @@ def add_test_cases():
     [['0x0', '0x00FE0000', '0x007F0000', '0x007A0000']],
     'cs_6_2', get_shader_text("binary float", "-"))
     add_test_case_denorm('FDivDenorm', ['FDiv'], 'ulp', 1,
-    [['0x007F0000', '0x007F0000', '0x40000000', '0x00800000'],['1', '0x007F0000', '0x7F7F0000', '0x40000000']],
-    [['0', 'NaN', '0', '0']],
-    [['0x007F0000', '1', '0x00404040', '0x00400000']],
+    [['0x007F0000', '0x807F0000', '0x20000000', '0x00800000'],['1', '4', '0x607F0000', '0x40000000']],
+    [['0', '0', '0', '0']],
+    [['0x007F0000', '0x801FC000', '0x00101010', '0x00400000']],
     'cs_6_2', get_shader_text("binary float", "/"))
     add_test_case_denorm('FMulDenorm', ['FMul'], 'ulp', 1,
     [['0x00000300', '0x007F0000', '0x007F0000', '0x001E0000', '0x00000300'],['128', '1', '0x007F0000', '20', '0x78000000']],

+ 9 - 2
utils/hct/hctgettaef.py

@@ -1,5 +1,6 @@
 import urllib
 import os
+import ssl
 import zipfile
 
 url = "https://github.com/Microsoft/WinObjC/raw/develop/deps/prebuilt/nuget/taef.redist.wlk.1.0.170206001-nativetargets.nupkg"
@@ -11,11 +12,17 @@ if not os.path.isdir(taef_dir):
   os.makedirs(taef_dir)
 
 try:
-  urllib.urlretrieve(url, zipfile_name)
+  ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
+  response = urllib.urlopen(url, context=ctx)
+  f = open(zipfile_name, 'wb')
+  f.write(response.read())
+  f.close()
 except:
   print("Unable to read file with urllib, trying via powershell...")
   from subprocess import check_call
-  cmd = "(new-object System.Net.WebClient).DownloadFile('" + url + "', '" + zipfile_name + "')"
+  cmd = ""
+  cmd += "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12;"
+  cmd += "(new-object System.Net.WebClient).DownloadFile('" + url + "', '" + zipfile_name + "')"
   check_call(['powershell.exe', '-Command', cmd])
 
 z = zipfile.ZipFile(zipfile_name)

Bu fark içinde çok fazla dosya değişikliği olduğu için bazı dosyalar gösterilmiyor