Kaynağa Gözat

Merge branch 'master' into user/texr/rt-merge-rebase

Tex Riddell 7 yıl önce
ebeveyn
işleme
f8e1af0417
61 değiştirilmiş dosya ile 3139 ekleme ve 361 silme
  1. 61 0
      docs/DXIL.rst
  2. 98 24
      docs/SPIR-V.rst
  3. 1 1
      external/SPIRV-Headers
  4. 1 1
      external/SPIRV-Tools
  5. 1 1
      external/googletest
  6. 1 1
      external/re2
  7. 3 2
      include/dxc/HLSL/DxilConstants.h
  8. 9 0
      lib/HLSL/HLOperationLower.cpp
  9. 42 10
      lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
  10. 148 0
      tools/clang/include/clang/SPIRV/InstBuilder.h
  11. 5 1
      tools/clang/include/clang/SPIRV/ModuleBuilder.h
  12. 16 3
      tools/clang/lib/CodeGen/CGExprAgg.cpp
  13. 50 0
      tools/clang/lib/CodeGen/CGHLSLMS.cpp
  14. 88 20
      tools/clang/lib/SPIRV/DeclResultIdMapper.cpp
  15. 24 8
      tools/clang/lib/SPIRV/DeclResultIdMapper.h
  16. 1031 1
      tools/clang/lib/SPIRV/InstBuilderAuto.cpp
  17. 21 3
      tools/clang/lib/SPIRV/ModuleBuilder.cpp
  18. 203 55
      tools/clang/lib/SPIRV/SPIRVEmitter.cpp
  19. 21 5
      tools/clang/lib/SPIRV/SPIRVEmitter.h
  20. 10 1
      tools/clang/lib/SPIRV/SpirvEvalInfo.h
  21. 274 77
      tools/clang/lib/SPIRV/TypeTranslator.cpp
  22. 24 2
      tools/clang/lib/SPIRV/TypeTranslator.h
  23. 27 0
      tools/clang/test/CodeGenHLSL/quick-test/constant_cast.hlsl
  24. 28 0
      tools/clang/test/CodeGenHLSL/quick-test/flat_addrspacecast.hlsl
  25. 4 4
      tools/clang/test/CodeGenSPIRV/bezier.domain.hlsl2spv
  26. 4 4
      tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv
  27. 0 2
      tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.form.hlsl
  28. 26 0
      tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.type.hlsl
  29. 165 0
      tools/clang/test/CodeGenSPIRV/cast.bitwidth.hlsl
  30. 50 0
      tools/clang/test/CodeGenSPIRV/cast.flat-conversion.literal-initializer.hlsl
  31. 1 1
      tools/clang/test/CodeGenSPIRV/constant.scalar.16bit.enabled.hlsl
  32. 7 1
      tools/clang/test/CodeGenSPIRV/cs.groupshared.hlsl
  33. 34 0
      tools/clang/test/CodeGenSPIRV/op.cbuffer.access.majorness.hlsl
  34. 19 0
      tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-count.hlsl
  35. 19 0
      tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-index.hlsl
  36. 30 0
      tools/clang/test/CodeGenSPIRV/sm6.wave-read-lane-first.hlsl
  37. 27 0
      tools/clang/test/CodeGenSPIRV/sm6.wave.builtin.no-dup.hlsl
  38. 1 1
      tools/clang/test/CodeGenSPIRV/spirv.interface.hs.hlsl
  39. 36 0
      tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std140.hlsl
  40. 39 0
      tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std430.hlsl
  41. 8 5
      tools/clang/test/CodeGenSPIRV/vk.layout.asbuffer.std430.hlsl
  42. 30 26
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.nested.std140.hlsl
  43. 14 11
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.std140.hlsl
  44. 20 0
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpc.hlsl
  45. 31 0
      tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpr.hlsl
  46. 8 4
      tools/clang/test/CodeGenSPIRV/vk.layout.csbuffer.std430.hlsl
  47. 20 0
      tools/clang/test/CodeGenSPIRV/vk.layout.non-fp-matrix.error.hlsl
  48. 14 3
      tools/clang/test/CodeGenSPIRV/vk.layout.push-constant.std430.hlsl
  49. 34 26
      tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.nested.std430.hlsl
  50. 18 11
      tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.std430.hlsl
  51. 7 3
      tools/clang/test/CodeGenSPIRV/vk.layout.tbuffer.std430.hlsl
  52. 7 3
      tools/clang/test/CodeGenSPIRV/vk.layout.texture-buffer.std430.hlsl
  53. 46 0
      tools/clang/test/CodeGenSPIRV/vk.location.composite.hlsl
  54. 0 0
      tools/clang/test/vk.cloption.invert-y.vs.hlsl
  55. 8 3
      tools/clang/tools/dxcompiler/dxcontainerbuilder.cpp
  56. 83 7
      tools/clang/unittests/HLSL/CompilerTest.cpp
  57. 20 20
      tools/clang/unittests/HLSL/ShaderOpArithTable.xml
  58. 44 2
      tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp
  59. 59 0
      utils/hct/hctdb_inst_docs.txt
  60. 10 6
      utils/hct/hctdb_test.py
  61. 9 2
      utils/hct/hctgettaef.py

+ 61 - 0
docs/DXIL.rst

@@ -1980,6 +1980,67 @@ ExtractValue  extracts from aggregate
 ============= ======================================================================= =================
 
 
+FAdd
+~~~~
+
+%des = fadd float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that "fp32-denorm-mode"="preserve".
+For "fp32-denorm-mode"="ftz" mode, denorm inputs should be treated as corresponding signed zero, and any resulting denorm is also flushed to zero.
+
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+|src0\\src1| -inf     | -F     | -denorm  | -0 | +0 | +denorm   |    +F  | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -inf     | -inf     |   -inf | -inf     |-inf|-inf| -inf      |   -inf | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -F       | -inf     |   -F   | -F       |src0|src0| -F        |   +/-F | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -denorm  | -inf     |   -F   |-F/denorm |src0|src0| +/-denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +denorm  | -inf     |   -F   |+/-denorm |src0|src0| +F/denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +F       | -inf     |  +/-F  | +F       |src0|src0| +F        |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +inf     | NaN      |   +inf | +inf     |+inf|+inf| +inf      |   +inf | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| NaN      | NaN      |   NaN  | NaN      |NaN |NaN | NaN       |   NaN  | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+
+FDiv
+~~~~
+
+%dest = fdiv float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that fast math flag is not used and "fp32-denorm-mode"="preserve".
+When "fp32-denorm-mode"="ftz", denorm inputs should be interpreted as corresponding signed zero, and any resulting denorm is also flushed to zero.
+When fast math is enabled, an implementation may use the reciprocal form: src0*(1/src1).  This may result in evaluating src0*(+/-)INF from src0*(1/(+/-)denorm).  This may produce NaN in some cases or (+/-)INF in others.
+
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| src0\\src1| -inf     | -F     |  -1   | -denorm | -0 | +0 | +denorm |  +1   |    +F  | +inf | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -inf      | NaN      |   +inf | +inf  | +inf    |+inf|-inf| -inf    |  -inf |   -inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -F        | +0       |   +F   | -src0 | +F      |+inf|-inf| -F      |  src0 |   -F   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -denorm   | +0       | +denorm| -src0 | +F      |+inf|-inf| -F      |  src0 |-denorm | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -0        | +0       |   +0   | +0    | 0       |NaN |NaN | 0       |  -0   |   -0   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +0        | -0       |   -0   | -0    | 0       |NaN |NaN | 0       |  +0   |   +0   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +denorm   | -0       | -denorm| -src0 | -F      |-inf|+inf| +F      |  src0 |+denorm | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +F        | -0       |   -F   | -src0 | -F      |-inf|+inf| +F      |  src0 |   +F   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +inf      | NaN      |   -inf | -inf  | -inf    |-inf|+inf| +inf    |  +inf |   +inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| NaN       | NaN      |   NaN  | NaN   | NaN     |NaN |NaN | NaN     |  NaN  |   NaN  | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+
 .. INSTR-RST:END
 
 Operations via external functions

+ 98 - 24
docs/SPIR-V.rst

@@ -286,6 +286,58 @@ interface variables:
   main([[vk::location(N)]] float4 input: A) : B
   { ... }
 
+Legalization, optimization, validation
+--------------------------------------
+
+After initial translation of the HLSL source code, SPIR-V CodeGen will further
+conduct legalization (if needed), optimization (if requested), and validation
+(if not turned off). All these three stages are outsourced to `SPIRV-Tools <https://github.com/KhronosGroup/SPIRV-Tools>`_.
+Here are the options controlling these stages:
+
+* ``-fcgl``: turn off legalization and optimization
+* ``-Od``: turn off optimization
+* ``-Vd``: turn off validation
+
+Legalization
+~~~~~~~~~~~~
+
+HLSL is a fairly permissive language considering the flexibility it provides for
+manipulating resource objects. The developer can create local copies, pass
+them around as function parameters and return values, as long as after certain
+transformations (function inlining, constant evaluation and propagating, dead
+code elimination, etc.), the compiler can remove all temporary copies and
+pinpoint all uses to unique global resource objects.
+
+Resulting from the above property of HLSL, if we translate into SPIR-V for
+Vulkan literally from the input HLSL source code, we will sometimes generate
+illegal SPIR-V. Certain transformations are needed to legalize the literally
+translated SPIR-V. Performing such transformations at the frontend AST level
+is cumbersome or impossible (e.g., function inlining). They are better
+conducted at the SPIR-V level. Therefore, legalization is delegated to SPIRV-Tools.
+
+Specifically, we need to legalize the following HLSL source code patterns:
+
+* Using resource types in struct types
+* Creating aliases of global resource objects
+* Control flows involving the above cases
+
+Legalization transformations will not run unless the above patterns are
+encountered in the source code.
+
+Optimization
+~~~~~~~~~~~~
+
+Optimization is also delegated to SPIRV-Tools. Right now there is no difference
+between optimization levels greater than zero; they will all invoke the same
+optimization recipe. This may change in the future.
+
+Validation
+~~~~~~~~~~
+
+Validation is turned on by default as the last stage of SPIR-V CodeGen. Failing
+validation, which indicates there is a CodeGen bug, will trigger a fatal error.
+Please file an issue if you see that.
+
 HLSL Types
 ==========
 
@@ -307,7 +359,7 @@ type instructions:
``uint``/``dword``/``uint32_t``                        ``OpTypeInt 32 0``
 ``uint16_t``                   ``-enable-16bit-types`` ``OpTypeInt 16 0`` ``Int16``
 ``half``                                               ``OpTypeFloat 32``
-``half``/``float16_t``         ``-enable-16bit-types`` ``OpTypeFloat 16`` ``Float16`` ``SPV_AMD_gpu_shader_half_float``
+``half``/``float16_t``         ``-enable-16bit-types`` ``OpTypeFloat 16``             ``SPV_AMD_gpu_shader_half_float``
 ``float``/``float32_t``                                ``OpTypeFloat 32``
 ``snorm float``                                        ``OpTypeFloat 32``
 ``unorm float``                                        ``OpTypeFloat 32``
@@ -340,8 +392,8 @@ https://github.com/Microsoft/DirectXShaderCompiler/wiki/16-Bit-Scalar-Types
 ``min16int``                           ``OpTypeInt 32 1`` ``RelaxedPrecision``
 ``min12int``                           ``OpTypeInt 32 1`` ``RelaxedPrecision``
 ``min16uint``                          ``OpTypeInt 32 0`` ``RelaxedPrecision``
-``min16float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                      ``Float16``  ``SPV_AMD_gpu_shader_half_float``
-``min10float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                      ``Float16``  ``SPV_AMD_gpu_shader_half_float``
+``min16float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                                   ``SPV_AMD_gpu_shader_half_float``
+``min10float`` ``-enable-16bit-types`` ``OpTypeFloat 16``                                   ``SPV_AMD_gpu_shader_half_float``
 ``min16int``   ``-enable-16bit-types`` ``OpTypeInt 16 1``                      ``Int16``
 ``min12int``   ``-enable-16bit-types`` ``OpTypeInt 16 1``                      ``Int16``
 ``min16uint``  ``-enable-16bit-types`` ``OpTypeInt 16 0``                      ``Int16``
@@ -458,26 +510,28 @@ Textures
 `Texture types <https://msdn.microsoft.com/en-us/library/windows/desktop/bb509700(v=vs.85).aspx>`_
 are translated into SPIR-V ``OpTypeImage``, with parameters:
 
-======================= ========== ===== ======= == ======= ================ =================
-HLSL Texture Type           Dim    Depth Arrayed MS Sampled  Image Format       Capability
-======================= ========== ===== ======= == ======= ================ =================
-``Texture1D``           ``1D``      0       0    0    1     ``Unknown``
-``Texture2D``           ``2D``      0       0    0    1     ``Unknown``
-``Texture3D``           ``3D``      0       0    0    1     ``Unknown``
-``TextureCube``         ``Cube``    0       0    0    1     ``Unknown``
-``Texture1DArray``      ``1D``      0       1    0    1     ``Unknown``
-``Texture2DArray``      ``2D``      0       1    0    1     ``Unknown``
-``Texture2DMS``         ``2D``      0       0    1    1     ``Unknown``
-``Texture2DMSArray``    ``2D``      0       1    1    1     ``Unknown``      ``ImageMSArray``
-``TextureCubeArray``    ``3D``      0       1    0    1     ``Unknown``
-``Buffer<T>``           ``Buffer``  0       0    0    1     Depends on ``T`` ``SampledBuffer``
-``RWBuffer<T>``         ``Buffer``  0       0    0    2     Depends on ``T`` ``SampledBuffer``
-``RWTexture1D<T>``      ``1D``      0       0    0    2     Depends on ``T``
-``RWTexture2D<T>``      ``2D``      0       0    0    2     Depends on ``T``
-``RWTexture3D<T>``      ``3D``      0       0    0    2     Depends on ``T``
-``RWTexture1DArray<T>`` ``1D``      0       1    0    2     Depends on ``T``
-``RWTexture2DArray<T>`` ``2D``      0       1    0    2     Depends on ``T``
-======================= ========== ===== ======= == ======= ================ =================
+======================= ==================== ===== =================== ========== ===== ======= == ======= ================ =================
+       HLSL                   Vulkan                                        SPIR-V
+----------------------- -------------------------- ------------------------------------------------------------------------------------------
+     Texture Type         Descriptor Type    RO/RW    Storage Class        Dim    Depth Arrayed MS Sampled   Image Format      Capability
+======================= ==================== ===== =================== ========== ===== ======= == ======= ================ =================
+``Texture1D``           Sampled Image         RO   ``UniformConstant`` ``1D``      0       0    0    1     ``Unknown``
+``Texture2D``           Sampled Image         RO   ``UniformConstant`` ``2D``      0       0    0    1     ``Unknown``
+``Texture3D``           Sampled Image         RO   ``UniformConstant`` ``3D``      0       0    0    1     ``Unknown``
+``TextureCube``         Sampled Image         RO   ``UniformConstant`` ``Cube``    0       0    0    1     ``Unknown``
+``Texture1DArray``      Sampled Image         RO   ``UniformConstant`` ``1D``      0       1    0    1     ``Unknown``
+``Texture2DArray``      Sampled Image         RO   ``UniformConstant`` ``2D``      0       1    0    1     ``Unknown``
+``Texture2DMS``         Sampled Image         RO   ``UniformConstant`` ``2D``      0       0    1    1     ``Unknown``
+``Texture2DMSArray``    Sampled Image         RO   ``UniformConstant`` ``2D``      0       1    1    1     ``Unknown``      ``ImageMSArray``
+``TextureCubeArray``    Sampled Image         RO   ``UniformConstant`` ``3D``      0       1    0    1     ``Unknown``
+``Buffer<T>``           Uniform Texel Buffer  RO   ``UniformConstant`` ``Buffer``  0       0    0    1     Depends on ``T`` ``SampledBuffer``
+``RWBuffer<T>``         Storage Texel Buffer  RW   ``UniformConstant`` ``Buffer``  0       0    0    2     Depends on ``T`` ``SampledBuffer``
+``RWTexture1D<T>``      Storage Image         RW   ``UniformConstant`` ``1D``      0       0    0    2     Depends on ``T``
+``RWTexture2D<T>``      Storage Image         RW   ``UniformConstant`` ``2D``      0       0    0    2     Depends on ``T``
+``RWTexture3D<T>``      Storage Image         RW   ``UniformConstant`` ``3D``      0       0    0    2     Depends on ``T``
+``RWTexture1DArray<T>`` Storage Image         RW   ``UniformConstant`` ``1D``      0       1    0    2     Depends on ``T``
+``RWTexture2DArray<T>`` Storage Image         RW   ``UniformConstant`` ``2D``      0       1    0    2     Depends on ``T``
+======================= ==================== ===== =================== ========== ===== ======= == ======= ================ =================
 
 The meanings of the headers in the above table is explained in ``OpTypeImage``
 of the SPIR-V spec.
@@ -771,6 +825,8 @@ placed in the ``Uniform`` or ``UniformConstant`` storage class.
 
   - Global variables with ``groupshared`` modifier will be placed in the
     ``Workgroup`` storage class.
+  - Note that this modifier overrules ``static``; if both ``groupshared`` and
+    ``static`` are applied to a variable, ``static`` will be ignored.
 
- ``uniform``
 
@@ -2257,7 +2313,6 @@ element is the height, and the third is the elements.
 The ``OpImageQuerySize`` instruction is used to get a uint3. The first element is the width, the second
 element is the height, and the third element is the depth.
 
-
 HLSL Shader Stages
 ==================
 
@@ -2424,6 +2479,25 @@ behind ``T`` will be flushed before SPIR-V ``OpEmitVertex`` instruction is
 generated. ``.RestartStrip()`` method calls will be translated into the SPIR-V
 ``OpEndPrimitive`` instruction.
 
+Shader Model 6.0 Wave Intrinsics
+================================
+
+Shader Model 6.0 introduces a set of wave operations, which are translated
+according to the following table:
+
+====================== ============================= =========================
+      Intrinsic               SPIR-V BuiltIn                Extension
+====================== ============================= =========================
+``WaveGetLaneCount()`` ``SubgroupSize``              ``SPV_KHR_shader_ballot``
+``WaveGetLaneIndex()`` ``SubgroupLocalInvocationId`` ``SPV_KHR_shader_ballot``
+====================== ============================= =========================
+
+======================= ================================ =========================
+      Intrinsic               SPIR-V Instruction                Extension
+======================= ================================ =========================
+``WaveReadLaneFirst()`` ``OpSubgroupFirstInvocationKHR`` ``SPV_KHR_shader_ballot``
+======================= ================================ =========================
+
 Vulkan Command-line Options
 ===========================
 

+ 1 - 1
external/SPIRV-Headers

@@ -1 +1 @@
-Subproject commit e0282aa7d54631502b4af567a85d3b6565fd5464
+Subproject commit 02ffc719aa9f9c1dce5ce05743fb1afe6cbf17ea

+ 1 - 1
external/SPIRV-Tools

@@ -1 +1 @@
-Subproject commit 6c75050136a2657dac4501ca16d447852fc69e5f
+Subproject commit 03b8a3fe540e72794646195fe261a679203c13ac

+ 1 - 1
external/googletest

@@ -1 +1 @@
-Subproject commit a5014476f0c49c966e4ac602469cddefc7ed486d
+Subproject commit 703b4a85a21e394252560a89cc856b384b48c286

+ 1 - 1
external/re2

@@ -1 +1 @@
-Subproject commit 715f0dcaafbeda5d9fef58194d9ce256f0317ecf
+Subproject commit c1ed8543f1b703ce200212bb5629ba69a2f9b63a

+ 3 - 2
include/dxc/HLSL/DxilConstants.h

@@ -56,8 +56,9 @@ namespace DXIL {
   const unsigned kMaxStructBufferStride = 2048;
   const unsigned kMaxHSOutputControlPointsTotalScalars = 3968;
   const unsigned kMaxHSOutputPatchConstantTotalScalars = 32*4;
-  const unsigned kMaxOutputTotalScalars = 32*4;
-  const unsigned kMaxInputTotalScalars = 32*4;
+  const unsigned kMaxSignatureTotalVectors = 32;
+  const unsigned kMaxOutputTotalScalars = kMaxSignatureTotalVectors * 4;
+  const unsigned kMaxInputTotalScalars = kMaxSignatureTotalVectors * 4;
   const unsigned kMaxClipOrCullDistanceElementCount = 2;
   const unsigned kMaxClipOrCullDistanceCount = 2 * 4;
   const unsigned kMaxGSOutputVertexCount = 1024;

+ 9 - 0
lib/HLSL/HLOperationLower.cpp

@@ -5409,6 +5409,15 @@ void TranslateCBAddressUserLegacy(Instruction *user, Value *handle,
 
     ldInst->replaceAllUsesWith(newLd);
     ldInst->eraseFromParent();
+  } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(user)) {
+    for (auto it = BCI->user_begin(); it != BCI->user_end(); ) {
+      Instruction *I = cast<Instruction>(*it++);
+      TranslateCBAddressUserLegacy(I,
+                                   handle, legacyIdx, channelOffset, hlslOP,
+                                   prevFieldAnnotation, dxilTypeSys,
+                                   DL, pObjHelper);
+    }
+    BCI->eraseFromParent();
   } else {
     // Must be GEP here
     GetElementPtrInst *GEP = cast<GetElementPtrInst>(user);

+ 42 - 10
lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp

@@ -105,6 +105,7 @@ private:
 
   void RewriteForConstExpr(ConstantExpr *user, IRBuilder<> &Builder);
   void RewriteForGEP(GEPOperator *GEP, IRBuilder<> &Builder);
+  void RewriteForAddrSpaceCast(ConstantExpr *user, IRBuilder<> &Builder);
   void RewriteForLoad(LoadInst *loadInst);
   void RewriteForStore(StoreInst *storeInst);
   void RewriteMemIntrin(MemIntrinsic *MI, Value *OldV);
@@ -3158,6 +3159,22 @@ void SROA_Helper::RewriteCall(CallInst *CI) {
   }
 }
 
+/// RewriteForAddrSpaceCast - Rewrite the AddrSpaceCast which is ConstantExpr.
+void SROA_Helper::RewriteForAddrSpaceCast(ConstantExpr *CE,
+                                          IRBuilder<> &Builder) {
+  SmallVector<Value *, 8> NewCasts;
+  // create new AddrSpaceCast.
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    Value *NewGEP = Builder.CreateAddrSpaceCast(
+        NewElts[i],
+        PointerType::get(NewElts[i]->getType()->getPointerElementType(),
+                         CE->getType()->getPointerAddressSpace()));
+    NewCasts.emplace_back(NewGEP);
+  }
+  SROA_Helper helper(CE, NewCasts, DeadInsts);
+  helper.RewriteForScalarRepl(CE, Builder);
+}
+
 /// RewriteForConstExpr - Rewrite the GEP which is ConstantExpr.
 void SROA_Helper::RewriteForConstExpr(ConstantExpr *CE, IRBuilder<> &Builder) {
   if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
@@ -3167,17 +3184,26 @@ void SROA_Helper::RewriteForConstExpr(ConstantExpr *CE, IRBuilder<> &Builder) {
       return;
     }
   }
+  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+    if (OldVal == CE->getOperand(0)) {
+      // Flatten AddrSpaceCast.
+      RewriteForAddrSpaceCast(CE, Builder);
+      return;
+    }
+  }
   // Skip unused CE. 
   if (CE->use_empty())
     return;
 
-  Instruction *constInst = CE->getAsInstruction();
-  Builder.Insert(constInst);
-  // Replace CE with constInst.
   for (Value::use_iterator UI = CE->use_begin(), E = CE->use_end(); UI != E;) {
     Use &TheUse = *UI++;
-    if (isa<Instruction>(TheUse.getUser()))
-      TheUse.set(constInst);
+    if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
+      IRBuilder<> tmpBuilder(I);
+      // Replace CE with constInst.
+      Instruction *tmpInst = CE->getAsInstruction();
+      tmpBuilder.Insert(tmpInst);
+      TheUse.set(tmpInst);
+    }
     else {
       RewriteForConstExpr(cast<ConstantExpr>(TheUse.getUser()), Builder);
     }
@@ -3788,17 +3814,23 @@ static void ReplaceUnboundedArrayUses(Value *V, Value *Src, IRBuilder<> &Builder
 }
 
 static void ReplaceMemcpy(Value *V, Value *Src, MemCpyInst *MC) {
+  Type *TyV = V->getType()->getPointerElementType();
+  Type *TySrc = Src->getType()->getPointerElementType();
   if (Constant *C = dyn_cast<Constant>(V)) {
-    if (isa<Constant>(Src)) {
-      V->replaceAllUsesWith(Src);
+    if (TyV == TySrc) {
+      if (isa<Constant>(Src)) {
+        V->replaceAllUsesWith(Src);
+      } else {
+        // Replace Constant with a non-Constant.
+        IRBuilder<> Builder(MC);
+        ReplaceConstantWithInst(C, Src, Builder);
+      }
     } else {
-      // Replace Constant with a non-Constant.
       IRBuilder<> Builder(MC);
+      Src = Builder.CreateBitCast(Src, V->getType());
       ReplaceConstantWithInst(C, Src, Builder);
     }
   } else {
-    Type* TyV = V->getType()->getPointerElementType();
-    Type* TySrc = Src->getType()->getPointerElementType();
     if (TyV == TySrc) {
       if (V != Src)
         V->replaceAllUsesWith(Src);

+ 148 - 0
tools/clang/include/clang/SPIRV/InstBuilder.h

@@ -806,6 +806,149 @@ public:
   InstBuilder &opModuleProcessed(std::string process);
   InstBuilder &opExecutionModeId(uint32_t entry_point, spv::ExecutionMode mode);
   InstBuilder &opDecorateId(uint32_t target, spv::Decoration decoration);
+  InstBuilder &opGroupNonUniformElect(uint32_t result_type, uint32_t result_id,
+                                      uint32_t execution);
+  InstBuilder &opGroupNonUniformAll(uint32_t result_type, uint32_t result_id,
+                                    uint32_t execution, uint32_t predicate);
+  InstBuilder &opGroupNonUniformAny(uint32_t result_type, uint32_t result_id,
+                                    uint32_t execution, uint32_t predicate);
+  InstBuilder &opGroupNonUniformAllEqual(uint32_t result_type,
+                                         uint32_t result_id, uint32_t execution,
+                                         uint32_t value);
+  InstBuilder &opGroupNonUniformBroadcast(uint32_t result_type,
+                                          uint32_t result_id,
+                                          uint32_t execution, uint32_t value,
+                                          uint32_t id);
+  InstBuilder &opGroupNonUniformBroadcastFirst(uint32_t result_type,
+                                               uint32_t result_id,
+                                               uint32_t execution,
+                                               uint32_t value);
+  InstBuilder &opGroupNonUniformBallot(uint32_t result_type, uint32_t result_id,
+                                       uint32_t execution, uint32_t predicate);
+  InstBuilder &opGroupNonUniformInverseBallot(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value);
+  InstBuilder &opGroupNonUniformBallotBitExtract(uint32_t result_type,
+                                                 uint32_t result_id,
+                                                 uint32_t execution,
+                                                 uint32_t value,
+                                                 uint32_t index);
+  InstBuilder &opGroupNonUniformBallotBitCount(uint32_t result_type,
+                                               uint32_t result_id,
+                                               uint32_t execution,
+                                               spv::GroupOperation operation,
+                                               uint32_t value);
+  InstBuilder &opGroupNonUniformBallotFindLSB(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value);
+  InstBuilder &opGroupNonUniformBallotFindMSB(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value);
+  InstBuilder &opGroupNonUniformShuffle(uint32_t result_type,
+                                        uint32_t result_id, uint32_t execution,
+                                        uint32_t value, uint32_t id);
+  InstBuilder &opGroupNonUniformShuffleXor(uint32_t result_type,
+                                           uint32_t result_id,
+                                           uint32_t execution, uint32_t value,
+                                           uint32_t mask);
+  InstBuilder &opGroupNonUniformShuffleUp(uint32_t result_type,
+                                          uint32_t result_id,
+                                          uint32_t execution, uint32_t value,
+                                          uint32_t delta);
+  InstBuilder &opGroupNonUniformShuffleDown(uint32_t result_type,
+                                            uint32_t result_id,
+                                            uint32_t execution, uint32_t value,
+                                            uint32_t delta);
+  InstBuilder &opGroupNonUniformIAdd(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFAdd(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformIMul(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFMul(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformSMin(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformUMin(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFMin(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformSMax(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformUMax(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformFMax(uint32_t result_type, uint32_t result_id,
+                                     uint32_t execution,
+                                     spv::GroupOperation operation,
+                                     uint32_t value,
+                                     llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformBitwiseAnd(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformBitwiseOr(uint32_t result_type, uint32_t result_id,
+                             uint32_t execution, spv::GroupOperation operation,
+                             uint32_t value,
+                             llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformBitwiseXor(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformLogicalAnd(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformLogicalOr(uint32_t result_type, uint32_t result_id,
+                             uint32_t execution, spv::GroupOperation operation,
+                             uint32_t value,
+                             llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &
+  opGroupNonUniformLogicalXor(uint32_t result_type, uint32_t result_id,
+                              uint32_t execution, spv::GroupOperation operation,
+                              uint32_t value,
+                              llvm::Optional<uint32_t> cluster_size);
+  InstBuilder &opGroupNonUniformQuadBroadcast(uint32_t result_type,
+                                              uint32_t result_id,
+                                              uint32_t execution,
+                                              uint32_t value, uint32_t index);
+  InstBuilder &opGroupNonUniformQuadSwap(uint32_t result_type,
+                                         uint32_t result_id, uint32_t execution,
+                                         uint32_t value, uint32_t direction);
   InstBuilder &opSubgroupBallotKHR(uint32_t result_type, uint32_t result_id,
                                    uint32_t predicate);
   InstBuilder &opSubgroupFirstInvocationKHR(uint32_t result_type,
@@ -876,6 +1019,11 @@ public:
   InstBuilder &opSubgroupImageBlockWriteINTEL(uint32_t image,
                                               uint32_t coordinate,
                                               uint32_t data);
+  InstBuilder &opDecorateStringGOOGLE(uint32_t target,
+                                      spv::Decoration decoration);
+  InstBuilder &opMemberDecorateStringGOOGLE(uint32_t struct_type,
+                                            uint32_t member,
+                                            spv::Decoration decoration);
 
   // All-in-one methods for creating unary and binary operations.
   InstBuilder &unaryOp(spv::Op op, uint32_t result_type, uint32_t result_id,

+ 5 - 1
tools/clang/include/clang/SPIRV/ModuleBuilder.h

@@ -303,6 +303,9 @@ public:
   /// \brief Creates an OpEndPrimitive instruction.
   void createEndPrimitive();
 
+  /// \brief Creates an OpSubgroupFirstInvocationKHR instruciton.
+  uint32_t createSubgroupFirstInvocation(uint32_t resultType, uint32_t value);
+
   // === SPIR-V Module Structure ===
 
   inline void requireCapability(spv::Capability);
@@ -384,7 +387,8 @@ public:
   uint32_t getFloat32Type();
   uint32_t getFloat64Type();
   uint32_t getVecType(uint32_t elemType, uint32_t elemCount);
-  uint32_t getMatType(QualType elemType, uint32_t colType, uint32_t colCount);
+  uint32_t getMatType(QualType elemType, uint32_t colType, uint32_t colCount,
+                      Type::DecorationSet decorations = {});
   uint32_t getPointerType(uint32_t pointeeType, spv::StorageClass);
   uint32_t getStructType(llvm::ArrayRef<uint32_t> fieldTypes,
                          llvm::StringRef structName = "",

+ 16 - 3
tools/clang/lib/CodeGen/CGExprAgg.cpp

@@ -726,9 +726,22 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) {
       Expr *Src = E->getSubExpr();
       switch (CGF.getEvaluationKind(Ty)) {
       case TEK_Aggregate: {
-        LValue LV = CGF.EmitAggExprToLValue(Src);
-        CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionAggregateCopy(
-            CGF, LV.getAddress(), Src->getType(), DestPtr, E->getType());
+        if (CastExpr *SrcCast = dyn_cast<CastExpr>(Src)) {
+          if (SrcCast->getCastKind() == CK_LValueToRValue) {
+            // Skip the lval to rval cast to reach decl.
+            Src = SrcCast->getSubExpr();
+          }
+        }
+        // Just use decl if possible to skip useless copy.
+        if (DeclRefExpr *SrcDecl = dyn_cast<DeclRefExpr>(Src)) {
+          LValue LV = CGF.EmitLValue(SrcDecl);
+          CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionAggregateCopy(
+              CGF, LV.getAddress(), Src->getType(), DestPtr, E->getType());
+        } else {
+          LValue LV = CGF.EmitAggExprToLValue(Src);
+          CGF.CGM.getHLSLRuntime().EmitHLSLFlatConversionAggregateCopy(
+              CGF, LV.getAddress(), Src->getType(), DestPtr, E->getType());
+        }
       } break;
       case TEK_Scalar: {
         llvm::Value *SrcVal = CGF.EmitScalarExpr(Src);

+ 50 - 0
tools/clang/lib/CodeGen/CGHLSLMS.cpp

@@ -2559,6 +2559,11 @@ void CGMSHLSLRuntime::AddConstant(VarDecl *constDecl, HLCBuffer &CB) {
     // For static inside cbuffer, take as global static.
     // Don't add to cbuffer.
     CGM.EmitGlobal(constDecl);
+    // Add type annotation for static global types.
+    // May need it when cast from cbuf.
+    DxilTypeSystem &dxilTypeSys = m_pHLModule->GetTypeSystem();
+    unsigned arraySize = 0;
+    AddTypeAnnotation(constDecl->getType(), dxilTypeSys, arraySize);
     return;
   }
   // Search defined structure for resource objects and fail
@@ -6144,6 +6149,43 @@ void CGMSHLSLRuntime::EmitHLSLAggregateCopy(CodeGenFunction &CGF, llvm::Value *S
     SmallVector<Value *, 4> idxList;
     EmitHLSLAggregateCopy(CGF, SrcPtr, DestPtr, idxList, Ty, Ty, SrcPtr->getType());
 }
+// To memcpy, need element type match.
+// For struct type, the layout should match in cbuffer layout.
+// struct { float2 x; float3 y; } will not match struct { float3 x; float2 y; }.
+// struct { float2 x; float3 y; } will not match array of float.
+static bool IsTypeMatchForMemcpy(llvm::Type *SrcTy, llvm::Type *DestTy) {
+  llvm::Type *SrcEltTy = dxilutil::GetArrayEltTy(SrcTy);
+  llvm::Type *DestEltTy = dxilutil::GetArrayEltTy(DestTy);
+  if (SrcEltTy == DestEltTy)
+    return true;
+
+  llvm::StructType *SrcST = dyn_cast<llvm::StructType>(SrcEltTy);
+  llvm::StructType *DestST = dyn_cast<llvm::StructType>(DestEltTy);
+  if (SrcST && DestST) {
+    // Only allow identical struct.
+    return SrcST->isLayoutIdentical(DestST);
+  } else if (!SrcST && !DestST) {
+    // For basic type, if one is array, one is not array, layout is different.
+    // If both array, type mismatch. If both basic, copy should be fine.
+    // So all return false.
+    return false;
+  } else {
+    // One struct, one basic type.
+    // Make sure all struct element match the basic type and basic type is
+    // vector4.
+    llvm::StructType *ST = SrcST ? SrcST : DestST;
+    llvm::Type *Ty = SrcST ? DestEltTy : SrcEltTy;
+    if (!Ty->isVectorTy())
+      return false;
+    if (Ty->getVectorNumElements() != 4)
+      return false;
+    for (llvm::Type *EltTy : ST->elements()) {
+      if (EltTy != Ty)
+        return false;
+    }
+    return true;
+  }
+}
 
 void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF, llvm::Value *SrcPtr,
     clang::QualType SrcTy,
@@ -6162,6 +6204,14 @@ void CGMSHLSLRuntime::EmitHLSLFlatConversionAggregateCopy(CodeGenFunction &CGF,
     unsigned sizeDest = TheModule.getDataLayout().getTypeAllocSize(DestPtrTy);
     CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, std::max(sizeSrc, sizeDest), 1);
     return;
+  } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(DestPtr)) {
+    if (GV->isInternalLinkage(GV->getLinkage()) &&
+        IsTypeMatchForMemcpy(SrcPtrTy, DestPtrTy)) {
+      unsigned sizeSrc = TheModule.getDataLayout().getTypeAllocSize(SrcPtrTy);
+      unsigned sizeDest = TheModule.getDataLayout().getTypeAllocSize(DestPtrTy);
+      CGF.Builder.CreateMemCpy(DestPtr, SrcPtr, std::min(sizeSrc, sizeDest), 1);
+      return;
+    }
   }
 
   // It is possiable to implement EmitHLSLAggregateCopy, EmitHLSLAggregateStore

+ 88 - 20
tools/clang/lib/SPIRV/DeclResultIdMapper.cpp

@@ -304,15 +304,13 @@ SpirvEvalInfo DeclResultIdMapper::getDeclEvalInfo(const ValueDecl *decl,
           cast<VarDecl>(decl)->getType(),
           // We need to set decorateLayout here to avoid creating SPIR-V
           // instructions for the current type without decorations.
-          info->info.getLayoutRule(), info->isRowMajor);
+          info->info.getLayoutRule(), info->info.isRowMajor());
 
       const uint32_t elemId = theBuilder.createAccessChain(
           theBuilder.getPointerType(varType, info->info.getStorageClass()),
           info->info, {theBuilder.getConstantInt32(info->indexInCTBuffer)});
 
-      return SpirvEvalInfo(elemId)
-          .setStorageClass(info->info.getStorageClass())
-          .setLayoutRule(info->info.getLayoutRule());
+      return info->info.substResultId(elemId);
     } else {
       return *info;
     }
@@ -383,8 +381,8 @@ uint32_t DeclResultIdMapper::createFileVar(const VarDecl *var,
 uint32_t DeclResultIdMapper::createExternVar(const VarDecl *var) {
   auto storageClass = spv::StorageClass::UniformConstant;
   auto rule = LayoutRule::Void;
-  bool isMatType = false;     // Whether this var is of matrix type
-  bool isACRWSBuffer = false; // Whether its {Append|Consume|RW}StructuredBuffer
+  bool isMatType = false;     // Whether is matrix that needs struct wrap
+  bool isACRWSBuffer = false; // Whether is {Append|Consume|RW}StructuredBuffer
 
   if (var->getAttr<HLSLGroupSharedAttr>()) {
     // For CS groupshared variables
@@ -432,11 +430,18 @@ uint32_t DeclResultIdMapper::createExternVar(const VarDecl *var) {
   astDecls[var] =
       SpirvEvalInfo(id).setStorageClass(storageClass).setLayoutRule(rule);
   if (isMatType) {
+    astDecls[var].info.setRowMajor(
+        typeTranslator.isRowMajorMatrix(var->getType(), var));
+
     // We have wrapped the stand-alone matrix inside a struct. Mark it as
     // needing an extra index to access.
     astDecls[var].indexInCTBuffer = 0;
   }
 
+  // Variables in Workgroup do not need descriptor decorations.
+  if (storageClass == spv::StorageClass::Workgroup)
+    return id;
+
   const auto *regAttr = getResourceBinding(var);
   const auto *bindingAttr = var->getAttr<VKBindingAttr>();
   const auto *counterBindingAttr = var->getAttr<VKCounterBindingAttr>();
@@ -573,12 +578,13 @@ uint32_t DeclResultIdMapper::createCTBuffer(const HLSLBufferDecl *decl) {
     const auto *varDecl = cast<VarDecl>(subDecl);
     const bool isRowMajor =
         typeTranslator.isRowMajorMatrix(varDecl->getType(), varDecl);
-    astDecls[varDecl] = {SpirvEvalInfo(bufferVar)
-                             .setStorageClass(spv::StorageClass::Uniform)
-                             .setLayoutRule(decl->isCBuffer()
-                                                ? LayoutRule::GLSLStd140
-                                                : LayoutRule::GLSLStd430),
-                         index++, isRowMajor};
+    astDecls[varDecl] =
+        SpirvEvalInfo(bufferVar)
+            .setStorageClass(spv::StorageClass::Uniform)
+            .setLayoutRule(decl->isCBuffer() ? LayoutRule::GLSLStd140
+                                             : LayoutRule::GLSLStd430)
+            .setRowMajor(isRowMajor);
+    astDecls[varDecl].indexInCTBuffer = index++;
   }
   resourceVars.emplace_back(
       bufferVar, ResourceVar::Category::Other, getResourceBinding(decl),
@@ -793,12 +799,19 @@ public:
   /// Uses the given location.
   void useLoc(uint32_t loc) { usedLocs.set(loc); }
 
-  /// Uses the next available location.
-  uint32_t useNextLoc() {
+  /// Uses the next |count| available location.
+  int useNextLocs(uint32_t count) {
     while (usedLocs[nextLoc])
       nextLoc++;
-    usedLocs.set(nextLoc);
-    return nextLoc++;
+
+    int toUse = nextLoc;
+
+    for (uint32_t i = 0; i < count; ++i) {
+      assert(!usedLocs[nextLoc]);
+      usedLocs.set(nextLoc++);
+    }
+
+    return toUse;
   }
 
   /// Returns true if the given location number is already used.
@@ -976,7 +989,8 @@ bool DeclResultIdMapper::finalizeStageIOLocations(bool forInput) {
   }
 
   for (const auto *var : vars)
-    theBuilder.decorateLocation(var->getSpirvId(), locSet.useNextLoc());
+    theBuilder.decorateLocation(var->getSpirvId(),
+                                locSet.useNextLocs(var->getLocationCount()));
 
   return true;
 }
@@ -1257,9 +1271,11 @@ bool DeclResultIdMapper::createStageVars(const hlsl::SigPoint *sigPoint,
       typeId = theBuilder.getArrayType(typeId,
                                        theBuilder.getConstantUint32(arraySize));
 
-    StageVar stageVar(sigPoint, semanticToUse->str, semanticToUse->semantic,
-                      semanticToUse->name, semanticToUse->index, builtinAttr,
-                      typeId);
+    StageVar stageVar(
+        sigPoint, semanticToUse->str, semanticToUse->semantic,
+        semanticToUse->name, semanticToUse->index, builtinAttr, typeId,
+        // For HS/DS/GS, we have already stripped the outmost arrayness on type.
+        typeTranslator.getLocationCount(type));
     const auto name = namePrefix.str() + "." + stageVar.getSemanticStr();
     const uint32_t varId =
         createSpirvStageVar(&stageVar, decl, name, semanticToUse->loc);
@@ -1673,6 +1689,58 @@ void DeclResultIdMapper::decoratePSInterpolationMode(const NamedDecl *decl,
   }
 }
 
+uint32_t DeclResultIdMapper::getBuiltinVar(spv::BuiltIn builtIn) {
+  // Guarantee uniqueness
+  switch (builtIn) {
+  case spv::BuiltIn::SubgroupSize:
+    if (laneCountBuiltinId)
+      return laneCountBuiltinId;
+    break;
+  case spv::BuiltIn::SubgroupLocalInvocationId:
+    if (laneIndexBuiltinId)
+      return laneIndexBuiltinId;
+    break;
+  default:
+    // Only allow the two cases we know about
+    assert(false && "unsupported builtin case");
+    return 0;
+  }
+
+  // Both of them require the SPV_KHR_shader_ballot extension.
+  theBuilder.addExtension("SPV_KHR_shader_ballot");
+  theBuilder.requireCapability(spv::Capability::SubgroupBallotKHR);
+
+  uint32_t type = theBuilder.getUint32Type();
+
+  // Create a dummy StageVar for this builtin variable
+  const uint32_t varId =
+      theBuilder.addStageBuiltinVar(type, spv::StorageClass::Input, builtIn);
+
+  const hlsl::SigPoint *sigPoint =
+      hlsl::SigPoint::GetSigPoint(hlsl::SigPointFromInputQual(
+          hlsl::DxilParamInputQual::In, shaderModel.GetKind(),
+          /*isPatchConstant=*/false));
+
+  StageVar stageVar(sigPoint, /*semaStr=*/"", hlsl::Semantic::GetInvalid(),
+                    /*semaName=*/"", /*semaIndex=*/0, /*builtinAttr=*/nullptr,
+                    type, /*locCount=*/0);
+
+  stageVar.setIsSpirvBuiltin();
+  stageVar.setSpirvId(varId);
+  stageVars.push_back(stageVar);
+
+  switch (builtIn) {
+  case spv::BuiltIn::SubgroupSize:
+    laneCountBuiltinId = varId;
+    break;
+  case spv::BuiltIn::SubgroupLocalInvocationId:
+    laneIndexBuiltinId = varId;
+    break;
+  }
+
+  return varId;
+}
+
 uint32_t DeclResultIdMapper::createSpirvStageVar(StageVar *stageVar,
                                                  const NamedDecl *decl,
                                                  const llvm::StringRef name,

+ 24 - 8
tools/clang/lib/SPIRV/DeclResultIdMapper.h

@@ -38,11 +38,13 @@ public:
   inline StageVar(const hlsl::SigPoint *sig, llvm::StringRef semaStr,
                   const hlsl::Semantic *sema, llvm::StringRef semaName,
                   uint32_t semaIndex, const VKBuiltInAttr *builtin,
-                  uint32_t type)
+
+                  uint32_t type, uint32_t locCount)
       : sigPoint(sig), semanticStr(semaStr), semantic(sema),
         semanticName(semaName), semanticIndex(semaIndex), builtinAttr(builtin),
         typeId(type), valueId(0), isBuiltin(false),
-        storageClass(spv::StorageClass::Max), location(nullptr) {
+        storageClass(spv::StorageClass::Max), location(nullptr),
+        locationCount(locCount) {
     isBuiltin = builtinAttr != nullptr;
   }
 
@@ -68,6 +70,8 @@ public:
   const VKLocationAttr *getLocationAttr() const { return location; }
   void setLocationAttr(const VKLocationAttr *loc) { location = loc; }
 
+  uint32_t getLocationCount() const { return locationCount; }
+
 private:
   /// HLSL SigPoint. It uniquely identifies each set of parameters that may be
   /// input or output for each entry point.
@@ -92,6 +96,8 @@ private:
   spv::StorageClass storageClass;
   /// Location assignment if input/output variable.
   const VKLocationAttr *location;
+  /// How many locations this stage variable takes.
+  uint32_t locationCount;
 };
 
 class ResourceVar {
@@ -255,6 +261,9 @@ public:
                             ModuleBuilder &builder,
                             const EmitSPIRVOptions &spirvOptions);
 
+  /// \brief Returns the <result-id> for a SPIR-V builtin variable.
+  uint32_t getBuiltinVar(spv::BuiltIn builtIn);
+
   /// \brief Creates the stage output variables by parsing the semantics
   /// attached to the given function's parameter or return value and returns
   /// true on success. SPIR-V instructions will also be generated to update the
@@ -347,8 +356,8 @@ private:
     /// Default constructor to satisfy DenseMap
     DeclSpirvInfo() : info(0), indexInCTBuffer(-1) {}
 
-    DeclSpirvInfo(const SpirvEvalInfo &info_, int index = -1, bool row = false)
-        : info(info_), indexInCTBuffer(index), isRowMajor(row) {}
+    DeclSpirvInfo(const SpirvEvalInfo &info_, int index = -1)
+        : info(info_), indexInCTBuffer(index) {}
 
     /// Implicit conversion to SpirvEvalInfo.
     operator SpirvEvalInfo() const { return info; }
@@ -357,8 +366,6 @@ private:
     /// Value >= 0 means that this decl is a VarDecl inside a cbuffer/tbuffer
     /// and this is the index; value < 0 means this is just a standalone decl.
     int indexInCTBuffer;
-    /// Whether this decl should be row major.
-    bool isRowMajor;
   };
 
   /// \brief Returns the SPIR-V information for the given decl.
@@ -559,7 +566,7 @@ private:
   /// the children of this decl, and the children of this decl will be using
   /// the semantic in inheritSemantic, with index increasing sequentially.
   bool createStageVars(const hlsl::SigPoint *sigPoint, const NamedDecl *decl,
-                       bool asInput, QualType type, uint32_t arraySize,
+                       bool asInput, QualType asType, uint32_t arraySize,
                        const llvm::StringRef namePrefix,
                        llvm::Optional<uint32_t> invocationId, uint32_t *value,
                        bool noWriteBack, SemanticInfo *inheritSemantic);
@@ -648,6 +655,15 @@ private:
   /// to the <type-id>
   llvm::DenseMap<const DeclContext *, uint32_t> ctBufferPCTypeIds;
 
+  /// <result-id> for the SPIR-V builtin variables accessed by
+  /// WaveGetLaneCount() and WaveGetLaneIndex().
+  ///
+  /// These are the only two cases that SPIR-V builtin variables are accessed
+  /// using HLSL intrinsic function calls. All other builtin variables are
+  /// accessed using stage IO variables.
+  uint32_t laneCountBuiltinId;
+  uint32_t laneIndexBuiltinId;
+
   /// Whether the translated SPIR-V binary needs legalization.
   ///
   /// The following cases will require legalization:
@@ -718,7 +734,7 @@ DeclResultIdMapper::DeclResultIdMapper(const hlsl::ShaderModel &model,
     : shaderModel(model), theBuilder(builder), spirvOptions(options),
       astContext(context), diags(context.getDiagnostics()),
       typeTranslator(context, builder, diags, options), entryFunctionId(0),
-      needsLegalization(false),
+      laneCountBuiltinId(0), laneIndexBuiltinId(0), needsLegalization(false),
       glPerVertex(model, context, builder, typeTranslator, options.invertY) {}
 
 bool DeclResultIdMapper::decorateStageIOLocations() {

Dosya farkı çok büyük olduğundan ihmal edildi
+ 1031 - 1
tools/clang/lib/SPIRV/InstBuilderAuto.cpp


+ 21 - 3
tools/clang/lib/SPIRV/ModuleBuilder.cpp

@@ -705,6 +705,18 @@ void ModuleBuilder::createEndPrimitive() {
   insertPoint->appendInstruction(std::move(constructSite));
 }
 
+uint32_t ModuleBuilder::createSubgroupFirstInvocation(uint32_t resultType,
+                                                      uint32_t value) {
+  assert(insertPoint && "null insert point");
+  addExtension("SPV_KHR_shader_ballot");
+  requireCapability(spv::Capability::SubgroupBallotKHR);
+
+  uint32_t resultId = theContext.takeNextId();
+  instBuilder.opSubgroupFirstInvocationKHR(resultType, resultId, value).x();
+  insertPoint->appendInstruction(std::move(constructSite));
+  return resultId;
+}
+
 void ModuleBuilder::addExecutionMode(uint32_t entryPointId,
                                      spv::ExecutionMode em,
                                      llvm::ArrayRef<uint32_t> params) {
@@ -835,12 +847,17 @@ IMPL_GET_PRIMITIVE_TYPE(Float32)
 
 #undef IMPL_GET_PRIMITIVE_TYPE
 
+// Note: At the moment, Float16 capability should not be added for Vulkan 1.0.
+// It is not a required capability, and adding the SPV_AMD_gpu_half_float does
+// not enable this capability. Any driver that supports float16 in Vulkan 1.0
+// should accept this extension.
 #define IMPL_GET_PRIMITIVE_TYPE_WITH_CAPABILITY(ty, cap)                       \
                                                                                \
   uint32_t ModuleBuilder::get##ty##Type() {                                    \
-    requireCapability(spv::Capability::cap);                                   \
     if (spv::Capability::cap == spv::Capability::Float16)                      \
       theModule.addExtension("SPV_AMD_gpu_shader_half_float");                 \
+    else                                                                       \
+      requireCapability(spv::Capability::cap);                                 \
     const Type *type = Type::get##ty(theContext);                              \
     const uint32_t typeId = theContext.getResultIdForType(type);               \
     theModule.addType(type, typeId);                                           \
@@ -881,7 +898,8 @@ uint32_t ModuleBuilder::getVecType(uint32_t elemType, uint32_t elemCount) {
 }
 
 uint32_t ModuleBuilder::getMatType(QualType elemType, uint32_t colType,
-                                   uint32_t colCount) {
+                                   uint32_t colCount,
+                                   Type::DecorationSet decorations) {
   // NOTE: According to Item "Data rules" of SPIR-V Spec 2.16.1 "Universal
   // Validation Rules":
   //   Matrix types can only be parameterized with floating-point types.
@@ -889,7 +907,7 @@ uint32_t ModuleBuilder::getMatType(QualType elemType, uint32_t colType,
   // So we need special handling of non-fp matrices. We emulate non-fp
   // matrices as an array of vectors.
   if (!elemType->isFloatingType())
-    return getArrayType(colType, getConstantUint32(colCount));
+    return getArrayType(colType, getConstantUint32(colCount), decorations);
 
   const Type *type = Type::getMatrix(theContext, colType, colCount);
   const uint32_t typeId = theContext.getResultIdForType(type);

+ 203 - 55
tools/clang/lib/SPIRV/SPIRVEmitter.cpp

@@ -159,7 +159,12 @@ const Expr *isStructuredBufferLoad(const Expr *expr, const Expr **index) {
 /// Returns true if the given VarDecl will be translated into a SPIR-V variable
 /// not in the Private or Function storage class.
 inline bool isExternalVar(const VarDecl *var) {
-  return var->isExternallyVisible() && !var->isStaticDataMember();
+  // Class static variables should be put in the Private storage class.
+  // groupshared variables are allowed to be declared as "static". But we still
+  // need to put them in the Workgroup storage class. That is, when seeing
+  // "static groupshared", ignore "static".
+  return var->isExternallyVisible() ? !var->isStaticDataMember()
+                                    : var->getAttr<HLSLGroupSharedAttr>();
 }
 
 /// Returns the referenced variable's DeclContext if the given expr is
@@ -778,8 +783,8 @@ SpirvEvalInfo SPIRVEmitter::loadIfGLValue(const Expr *expr,
   if (const auto *declContext = isConstantTextureBufferDeclRef(expr)) {
     valType = declIdMapper.getCTBufferPushConstantTypeId(declContext);
   } else {
-    valType =
-        typeTranslator.translateType(expr->getType(), info.getLayoutRule());
+    valType = typeTranslator.translateType(
+        expr->getType(), info.getLayoutRule(), info.isRowMajor());
   }
   return info.setResultId(theBuilder.createLoad(valType, info)).setRValue();
 }
@@ -1055,6 +1060,14 @@ void SPIRVEmitter::doHLSLBufferDecl(const HLSLBufferDecl *bufferDecl) {
       for (const auto *annotation : varMember->getUnusualAnnotations())
         if (const auto *packing = dyn_cast<hlsl::ConstantPacking>(annotation))
           emitWarning("packoffset ignored since not supported", packing->Loc);
+
+      // We cannot handle external initialization of column-major matrices now.
+      if (typeTranslator.isOrContainsNonFpColMajorMatrix(varMember->getType(),
+                                                         varMember)) {
+        emitError("externally initialized non-floating-point column-major "
+                  "matrices not supported yet",
+                  varMember->getLocation());
+      }
     }
   }
   if (!validateVKAttributes(bufferDecl))
@@ -1084,6 +1097,14 @@ void SPIRVEmitter::doVarDecl(const VarDecl *decl) {
   if (!validateVKAttributes(decl))
     return;
 
+  // We cannot handle external initialization of column-major matrices now.
+  if (isExternalVar(decl) &&
+      typeTranslator.isOrContainsNonFpColMajorMatrix(decl->getType(), decl)) {
+    emitError("externally initialized non-floating-point column-major "
+              "matrices not supported yet",
+              decl->getLocation());
+  }
+
   if (decl->hasAttr<VKConstantIdAttr>()) {
     // This is a VarDecl for specialization constant.
     createSpecConstant(decl);
@@ -1724,7 +1745,6 @@ void SPIRVEmitter::doSwitchStmt(const SwitchStmt *switchStmt,
 
 SpirvEvalInfo
 SPIRVEmitter::doArraySubscriptExpr(const ArraySubscriptExpr *expr) {
-
   llvm::SmallVector<uint32_t, 4> indices;
   auto info = loadIfAliasVarRef(collectArrayStructIndices(expr, &indices));
 
@@ -1757,7 +1777,8 @@ SpirvEvalInfo SPIRVEmitter::doBinaryOperator(const BinaryOperator *expr) {
   }
 
   return processBinaryOp(expr->getLHS(), expr->getRHS(), opcode,
-                         expr->getType(), expr->getSourceRange());
+                         expr->getLHS()->getType(), expr->getType(),
+                         expr->getSourceRange());
 }
 
 SpirvEvalInfo SPIRVEmitter::doCallExpr(const CallExpr *callExpr) {
@@ -2154,13 +2175,22 @@ SpirvEvalInfo SPIRVEmitter::doCastExpr(const CastExpr *expr) {
       return SpirvEvalInfo(subExprId).setRValue().setConstant();
     }
 
-    // Try to evaluate 'literal float' as float rather than double.
+    TypeTranslator::LiteralTypeHint hint(typeTranslator);
+    // Try to evaluate float literals as float rather than double.
     if (const auto *floatLiteral = dyn_cast<FloatingLiteral>(subExpr)) {
       subExprId = tryToEvaluateAsFloat32(floatLiteral->getValue());
       if (subExprId)
         evalType = astContext.FloatTy;
     }
-    // Try to evaluate 'literal int' as 32-bit int rather than 64-bit int.
+    // Evaluate 'literal float' initializer type as float rather than double.
+    // TODO: This could result in rounding error if the initializer is a
+    // non-literal expression that requires larger than 32 bits and has the
+    // 'literal float' type.
+    else if (subExprType->isSpecificBuiltinType(BuiltinType::LitFloat)) {
+      evalType = astContext.FloatTy;
+      hint.setHint(astContext.FloatTy);
+    }
+    // Try to evaluate integer literals as 32-bit int rather than 64-bit int.
     else if (const auto *intLiteral = dyn_cast<IntegerLiteral>(subExpr)) {
       const bool isSigned = subExprType->isSignedIntegerType();
       subExprId = tryToEvaluateAsInt32(intLiteral->getValue(), isSigned);
@@ -2229,15 +2259,18 @@ uint32_t SPIRVEmitter::processFlatConversion(const QualType type,
         case BuiltinType::Bool:
           return castToBool(initId, initType, ty);
         // Target type is an integer variant.
-        // TODO: Add long and ulong.
         case BuiltinType::Int:
         case BuiltinType::Short:
         case BuiltinType::Min12Int:
         case BuiltinType::UShort:
         case BuiltinType::UInt:
+        case BuiltinType::Long:
+        case BuiltinType::LongLong:
+        case BuiltinType::ULong:
+        case BuiltinType::ULongLong:
           return castToInt(initId, initType, ty, srcLoc);
         // Target type is a float variant.
-        // TODO: Add double.
+        case BuiltinType::Double:
         case BuiltinType::Float:
         case BuiltinType::Half:
         case BuiltinType::Min10Float:
@@ -2340,8 +2373,9 @@ SPIRVEmitter::doCompoundAssignOperator(const CompoundAssignOperator *expr) {
   const auto *lhs = expr->getLHS();
 
   SpirvEvalInfo lhsPtr = 0;
-  const auto result = processBinaryOp(lhs, rhs, opcode, expr->getType(),
-                                      expr->getSourceRange(), &lhsPtr);
+  const auto result =
+      processBinaryOp(lhs, rhs, opcode, expr->getComputationLHSType(),
+                      expr->getType(), expr->getSourceRange(), &lhsPtr);
   return processAssignment(lhs, result, true, lhsPtr);
 }
 
@@ -4506,9 +4540,36 @@ SpirvEvalInfo SPIRVEmitter::processAssignment(const Expr *lhs,
 void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
                               const SpirvEvalInfo &rhsVal,
                               const QualType lhsValType) {
+
+  // Lambda for cases where we want to store per each array element.
+  const auto storeValueForEachArrayElement = [this, &lhsPtr,
+                                              &rhsVal](uint32_t arraySize,
+                                                       QualType arrayElemType) {
+    for (uint32_t i = 0; i < arraySize; ++i) {
+      const auto subRhsValType =
+          typeTranslator.translateType(arrayElemType, rhsVal.getLayoutRule());
+      const auto subRhsVal =
+          theBuilder.createCompositeExtract(subRhsValType, rhsVal, {i});
+      const auto subLhsPtrType = theBuilder.getPointerType(
+          typeTranslator.translateType(arrayElemType, lhsPtr.getLayoutRule()),
+          lhsPtr.getStorageClass());
+      const auto subLhsPtr = theBuilder.createAccessChain(
+          subLhsPtrType, lhsPtr, {theBuilder.getConstantUint32(i)});
+
+      storeValue(lhsPtr.substResultId(subLhsPtr),
+                 rhsVal.substResultId(subRhsVal), arrayElemType);
+    }
+  };
+
+  QualType matElemType = {};
+  uint32_t numRows = 0, numCols = 0;
+  const bool lhsIsMat =
+      typeTranslator.isMxNMatrix(lhsValType, &matElemType, &numRows, &numCols);
+  const bool lhsIsFloatMat = lhsIsMat && matElemType->isFloatingType();
+  const bool lhsIsNonFpMat = lhsIsMat && !matElemType->isFloatingType();
+
   if (typeTranslator.isScalarType(lhsValType) ||
-      typeTranslator.isVectorType(lhsValType) ||
-      typeTranslator.isMxNMatrix(lhsValType)) {
+      typeTranslator.isVectorType(lhsValType) || lhsIsFloatMat) {
     theBuilder.createStore(lhsPtr, rhsVal);
   } else if (TypeTranslator::isOpaqueType(lhsValType)) {
     // Resource types are represented using RecordType in the AST.
@@ -4545,16 +4606,24 @@ void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
     // Note: this check should happen after those setting needsLegalization.
     // TODO: is this optimization always correct?
     theBuilder.createStore(lhsPtr, rhsVal);
+  } else if (lhsIsNonFpMat) {
+    // Note: This check should happen before the RecordType check.
+    // Non-fp matrices are represented as arrays of vectors in SPIR-V.
+    // Each array element is a vector. Get the QualType for the vector.
+    const auto elemType = astContext.getExtVectorType(matElemType, numCols);
+    storeValueForEachArrayElement(numRows, elemType);
   } else if (const auto *recordType = lhsValType->getAs<RecordType>()) {
     uint32_t index = 0;
     for (const auto *field : recordType->getDecl()->fields()) {
+      bool isRowMajor =
+          typeTranslator.isRowMajorMatrix(field->getType(), field);
       const auto subRhsValType = typeTranslator.translateType(
-          field->getType(), rhsVal.getLayoutRule());
+          field->getType(), rhsVal.getLayoutRule(), isRowMajor);
       const auto subRhsVal =
           theBuilder.createCompositeExtract(subRhsValType, rhsVal, {index});
       const auto subLhsPtrType = theBuilder.getPointerType(
-          typeTranslator.translateType(field->getType(),
-                                       lhsPtr.getLayoutRule()),
+          typeTranslator.translateType(field->getType(), lhsPtr.getLayoutRule(),
+                                       isRowMajor),
           lhsPtr.getStorageClass());
       const auto subLhsPtr = theBuilder.createAccessChain(
           subLhsPtrType, lhsPtr, {theBuilder.getConstantUint32(index)});
@@ -4569,21 +4638,7 @@ void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
     // TODO: handle extra large array size?
     const auto size =
         static_cast<uint32_t>(arrayType->getSize().getZExtValue());
-
-    for (uint32_t i = 0; i < size; ++i) {
-      const auto subRhsValType =
-          typeTranslator.translateType(elemType, rhsVal.getLayoutRule());
-      const auto subRhsVal =
-          theBuilder.createCompositeExtract(subRhsValType, rhsVal, {i});
-      const auto subLhsPtrType = theBuilder.getPointerType(
-          typeTranslator.translateType(elemType, lhsPtr.getLayoutRule()),
-          lhsPtr.getStorageClass());
-      const auto subLhsPtr = theBuilder.createAccessChain(
-          subLhsPtrType, lhsPtr, {theBuilder.getConstantUint32(i)});
-
-      storeValue(lhsPtr.substResultId(subLhsPtr),
-                 rhsVal.substResultId(subRhsVal), elemType);
-    }
+    storeValueForEachArrayElement(size, elemType);
   } else {
     emitError("storing value of type %0 unimplemented", {}) << lhsValType;
   }
@@ -4591,22 +4646,24 @@ void SPIRVEmitter::storeValue(const SpirvEvalInfo &lhsPtr,
 
 SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
                                             const BinaryOperatorKind opcode,
+                                            const QualType computationType,
                                             const QualType resultType,
                                             SourceRange sourceRange,
                                             SpirvEvalInfo *lhsInfo,
                                             const spv::Op mandateGenOpcode) {
-  const uint32_t resultTypeId = typeTranslator.translateType(resultType);
+  const QualType lhsType = lhs->getType();
+  const QualType rhsType = rhs->getType();
 
   // Binary logical operations (such as ==, !=, etc) that return a boolean type
   // may get a literal (e.g. 0, 1, etc.) as lhs or rhs args. Since only
   // non-zero-ness of these literals matter, they can be translated as 32-bits.
   TypeTranslator::LiteralTypeHint hint(typeTranslator);
   if (resultType->isBooleanType()) {
-    if (lhs->getType()->isSpecificBuiltinType(BuiltinType::LitInt) ||
-        rhs->getType()->isSpecificBuiltinType(BuiltinType::LitInt))
+    if (lhsType->isSpecificBuiltinType(BuiltinType::LitInt) ||
+        rhsType->isSpecificBuiltinType(BuiltinType::LitInt))
       hint.setHint(astContext.IntTy);
-    if (lhs->getType()->isSpecificBuiltinType(BuiltinType::LitFloat) ||
-        rhs->getType()->isSpecificBuiltinType(BuiltinType::LitFloat))
+    if (lhsType->isSpecificBuiltinType(BuiltinType::LitFloat) ||
+        rhsType->isSpecificBuiltinType(BuiltinType::LitFloat))
       hint.setHint(astContext.FloatTy);
   }
 
@@ -4614,7 +4671,7 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
   // onto each element vector iff the operands are not degenerated matrices
   // and we don't have a matrix specific SPIR-V instruction for the operation.
   if (!isSpirvMatrixOp(mandateGenOpcode) &&
-      TypeTranslator::isMxNMatrix(lhs->getType())) {
+      TypeTranslator::isMxNMatrix(lhsType)) {
     return processMatrixBinaryOp(lhs, rhs, opcode, sourceRange);
   }
 
@@ -4626,11 +4683,8 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
     return doExpr(rhs);
   }
 
-  const spv::Op spvOp = (mandateGenOpcode == spv::Op::Max)
-                            ? translateOp(opcode, lhs->getType())
-                            : mandateGenOpcode;
-
   SpirvEvalInfo rhsVal = 0, lhsPtr = 0, lhsVal = 0;
+
   if (BinaryOperator::isCompoundAssignmentOp(opcode)) {
     // Evalute rhs before lhs
     rhsVal = loadIfGLValue(rhs);
@@ -4640,6 +4694,12 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
     if (!lhsPtr.isRValue() && !isVectorShuffle(lhs)) {
       lhsVal = loadIfGLValue(lhs, lhsPtr);
     }
+    // For compound assignments, the AST does not insert the proper implicit
+    // cast when lhs and rhs have different types, so we need to manually cast
+    // lhs to the computation type.
+    if (computationType != lhsType)
+      lhsVal.setResultId(
+          castToType(lhsVal, lhsType, computationType, lhs->getExprLoc()));
   } else {
     // Evalute lhs before rhs
     lhsPtr = doExpr(lhs);
@@ -4650,6 +4710,10 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
   if (lhsInfo)
     *lhsInfo = lhsPtr;
 
+  const spv::Op spvOp = (mandateGenOpcode == spv::Op::Max)
+                            ? translateOp(opcode, computationType)
+                            : mandateGenOpcode;
+
   switch (opcode) {
   case BO_Add:
   case BO_Sub:
@@ -4679,19 +4743,32 @@ SpirvEvalInfo SPIRVEmitter::processBinaryOp(const Expr *lhs, const Expr *rhs,
   case BO_XorAssign:
   case BO_ShlAssign:
   case BO_ShrAssign: {
+
     // To evaluate this expression as an OpSpecConstantOp, we need to make sure
     // both operands are constant and at least one of them is a spec constant.
     if (lhsVal.isConstant() && rhsVal.isConstant() &&
         (lhsVal.isSpecConstant() || rhsVal.isSpecConstant()) &&
         isAcceptedSpecConstantBinaryOp(spvOp)) {
       const auto valId = theBuilder.createSpecConstantBinaryOp(
-          spvOp, resultTypeId, lhsVal, rhsVal);
+          spvOp, typeTranslator.translateType(resultType), lhsVal, rhsVal);
       return SpirvEvalInfo(valId).setRValue().setSpecConstant();
     }
 
     // Normal binary operation
-    const auto valId =
-        theBuilder.createBinaryOp(spvOp, resultTypeId, lhsVal, rhsVal);
+    uint32_t valId = 0;
+    if (BinaryOperator::isCompoundAssignmentOp(opcode)) {
+      valId = theBuilder.createBinaryOp(
+          spvOp, typeTranslator.translateType(computationType), lhsVal, rhsVal);
+      // For compound assignments, the AST does not insert the proper implicit
+      // cast when lhs and rhs have different types, so we need to manually
+      // cast the result back to lhs' type.
+      if (computationType != lhsType)
+        valId = castToType(valId, computationType, lhsType, lhs->getExprLoc());
+    } else {
+      valId = theBuilder.createBinaryOp(
+          spvOp, typeTranslator.translateType(resultType), lhsVal, rhsVal);
+    }
+
     auto result = SpirvEvalInfo(valId).setRValue();
     if (lhsVal.isRelaxedPrecision() || rhsVal.isRelaxedPrecision())
       result.setRelaxedPrecision();
@@ -4977,12 +5054,12 @@ SPIRVEmitter::tryToGenFloatVectorScale(const BinaryOperator *expr) {
         if (isa<CompoundAssignOperator>(expr)) {
           SpirvEvalInfo lhsPtr = 0;
           const auto result = processBinaryOp(
-              lhs, cast->getSubExpr(), expr->getOpcode(), vecType, range,
-              &lhsPtr, spv::Op::OpVectorTimesScalar);
+              lhs, cast->getSubExpr(), expr->getOpcode(), vecType, vecType,
+              range, &lhsPtr, spv::Op::OpVectorTimesScalar);
           return processAssignment(lhs, result, true, lhsPtr);
         } else {
           return processBinaryOp(lhs, cast->getSubExpr(), expr->getOpcode(),
-                                 vecType, range, nullptr,
+                                 vecType, vecType, range, nullptr,
                                  spv::Op::OpVectorTimesScalar);
         }
       }
@@ -4998,7 +5075,7 @@ SPIRVEmitter::tryToGenFloatVectorScale(const BinaryOperator *expr) {
         // OpVectorTimesScalar requires the first operand to be a vector and
         // the second to be a scalar.
         return processBinaryOp(rhs, cast->getSubExpr(), expr->getOpcode(),
-                               vecType, range, nullptr,
+                               vecType, vecType, range, nullptr,
                                spv::Op::OpVectorTimesScalar);
       }
     }
@@ -5043,11 +5120,11 @@ SPIRVEmitter::tryToGenFloatMatrixScale(const BinaryOperator *expr) {
           SpirvEvalInfo lhsPtr = 0;
           const auto result =
               processBinaryOp(lhs, cast->getSubExpr(), expr->getOpcode(),
-                              matType, range, &lhsPtr, opcode);
+                              matType, matType, range, &lhsPtr, opcode);
           return processAssignment(lhs, result, true, lhsPtr);
         } else {
           return processBinaryOp(lhs, cast->getSubExpr(), expr->getOpcode(),
-                                 matType, range, nullptr, opcode);
+                                 matType, matType, range, nullptr, opcode);
         }
       }
     }
@@ -5063,7 +5140,7 @@ SPIRVEmitter::tryToGenFloatMatrixScale(const BinaryOperator *expr) {
         // OpMatrixTimesScalar requires the first operand to be a matrix and
         // the second to be a scalar.
         return processBinaryOp(rhs, cast->getSubExpr(), expr->getOpcode(),
-                               matType, range, nullptr, opcode);
+                               matType, matType, range, nullptr, opcode);
       }
     }
   }
@@ -5557,7 +5634,7 @@ uint32_t SPIRVEmitter::castToBool(const uint32_t fromVal, QualType fromType,
   return theBuilder.createBinaryOp(spvOp, boolType, fromVal, zeroVal);
 }
 
-uint32_t SPIRVEmitter::castToInt(const uint32_t fromVal, QualType fromType,
+uint32_t SPIRVEmitter::castToInt(uint32_t fromVal, QualType fromType,
                                  QualType toIntType, SourceLocation srcLoc) {
   if (TypeTranslator::isSameScalarOrVecType(fromType, toIntType))
     return fromVal;
@@ -5571,11 +5648,18 @@ uint32_t SPIRVEmitter::castToInt(const uint32_t fromVal, QualType fromType,
   }
 
   if (isSintOrVecOfSintType(fromType) || isUintOrVecOfUintType(fromType)) {
-    // TODO: handle different bitwidths
+    // First convert the source to the bitwidth of the destination if necessary.
+    uint32_t convertedType = 0;
+    fromVal = convertBitwidth(fromVal, fromType, toIntType, &convertedType);
+    // If bitwidth conversion was the only thing we needed to do, we're done.
+    if (convertedType == typeTranslator.translateType(toIntType))
+      return fromVal;
     return theBuilder.createUnaryOp(spv::Op::OpBitcast, intType, fromVal);
   }
 
   if (isFloatOrVecOfFloatType(fromType)) {
+    // First convert the source to the bitwidth of the destination if necessary.
+    fromVal = convertBitwidth(fromVal, fromType, toIntType);
     if (isSintOrVecOfSintType(toIntType)) {
       return theBuilder.createUnaryOp(spv::Op::OpConvertFToS, intType, fromVal);
     } else if (isUintOrVecOfUintType(toIntType)) {
@@ -5619,7 +5703,41 @@ uint32_t SPIRVEmitter::castToInt(const uint32_t fromVal, QualType fromType,
   return 0;
 }
 
-uint32_t SPIRVEmitter::castToFloat(const uint32_t fromVal, QualType fromType,
+// Converts 'fromVal' (of scalar/vector type 'fromType') to the element
+// bitwidth of 'toType', keeping the scalar kind (float/sint/uint) of the
+// source. Returns 'fromVal' unchanged when no conversion is needed. If
+// 'resultType' is non-null, the SPIR-V <type-id> of the returned value is
+// written to it (only on the equal-bitwidth and converted paths; it is left
+// untouched on the literal-type early return below).
+uint32_t SPIRVEmitter::convertBitwidth(uint32_t fromVal, QualType fromType,
+                                       QualType toType, uint32_t *resultType) {
+  // At the moment, we will not make bitwidth conversions for literal int and
+  // literal float types because they always indicate 64-bit and do not
+  // represent what SPIR-V was actually resolved to.
+  // TODO: If the evaluated type is added to SpirvEvalInfo, change 'fromVal' to
+  // SpirvEvalInfo and use it to handle literal types more accurately.
+  if (fromType->isSpecificBuiltinType(BuiltinType::LitFloat) ||
+      fromType->isSpecificBuiltinType(BuiltinType::LitInt))
+    return fromVal;
+
+  // No-op when the source already has the destination's bitwidth.
+  const auto fromBitwidth = typeTranslator.getElementSpirvBitwidth(fromType);
+  const auto toBitwidth = typeTranslator.getElementSpirvBitwidth(toType);
+  if (fromBitwidth == toBitwidth) {
+    if (resultType)
+      *resultType = typeTranslator.translateType(fromType);
+    return fromVal;
+  }
+
+  // We want the 'fromType' with the 'toBitwidth'.
+  const uint32_t targetTypeId =
+      typeTranslator.getTypeWithCustomBitwidth(fromType, toBitwidth);
+  if (resultType)
+    *resultType = targetTypeId;
+
+  // Pick the SPIR-V width-conversion opcode by the scalar kind of the source:
+  // OpFConvert for floats, OpSConvert/OpUConvert for signed/unsigned ints.
+  if (isFloatOrVecOfFloatType(fromType))
+    return theBuilder.createUnaryOp(spv::Op::OpFConvert, targetTypeId, fromVal);
+  if (isSintOrVecOfSintType(fromType))
+    return theBuilder.createUnaryOp(spv::Op::OpSConvert, targetTypeId, fromVal);
+  if (isUintOrVecOfUintType(fromType))
+    return theBuilder.createUnaryOp(spv::Op::OpUConvert, targetTypeId, fromVal);
+  llvm_unreachable("invalid type passed to convertBitwidth");
+}
+
+uint32_t SPIRVEmitter::castToFloat(uint32_t fromVal, QualType fromType,
                                    QualType toFloatType,
                                    SourceLocation srcLoc) {
   if (TypeTranslator::isSameScalarOrVecType(fromType, toFloatType))
@@ -5634,15 +5752,20 @@ uint32_t SPIRVEmitter::castToFloat(const uint32_t fromVal, QualType fromType,
   }
 
   if (isSintOrVecOfSintType(fromType)) {
+    // First convert the source to the bitwidth of the destination if necessary.
+    fromVal = convertBitwidth(fromVal, fromType, toFloatType);
     return theBuilder.createUnaryOp(spv::Op::OpConvertSToF, floatType, fromVal);
   }
 
   if (isUintOrVecOfUintType(fromType)) {
+    // First convert the source to the bitwidth of the destination if necessary.
+    fromVal = convertBitwidth(fromVal, fromType, toFloatType);
     return theBuilder.createUnaryOp(spv::Op::OpConvertUToF, floatType, fromVal);
   }
 
   if (isFloatOrVecOfFloatType(fromType)) {
-    return theBuilder.createUnaryOp(spv::Op::OpFConvert, floatType, fromVal);
+    // This is the case of float to float conversion with different bitwidths.
+    return convertBitwidth(fromVal, fromType, toFloatType);
   }
 
   // Casting matrix types
@@ -5895,6 +6018,31 @@ SpirvEvalInfo SPIRVEmitter::processIntrinsicCallExpr(const CallExpr *callExpr) {
   case hlsl::IntrinsicOp::IOP_f32tof16:
     retVal = processIntrinsicF32ToF16(callExpr);
     break;
+  case hlsl::IntrinsicOp::IOP_WaveGetLaneCount: {
+    const uint32_t retType =
+        typeTranslator.translateType(callExpr->getCallReturnType(astContext));
+    const uint32_t varId =
+        declIdMapper.getBuiltinVar(spv::BuiltIn::SubgroupSize);
+    retVal = theBuilder.createLoad(retType, varId);
+  } break;
+  case hlsl::IntrinsicOp::IOP_WaveGetLaneIndex: {
+    const uint32_t retType =
+        typeTranslator.translateType(callExpr->getCallReturnType(astContext));
+    const uint32_t varId =
+        declIdMapper.getBuiltinVar(spv::BuiltIn::SubgroupLocalInvocationId);
+    retVal = theBuilder.createLoad(retType, varId);
+  } break;
+  case hlsl::IntrinsicOp::IOP_WaveReadLaneFirst: {
+    const auto retType = callExpr->getCallReturnType(astContext);
+    if (!retType->isScalarType()) {
+      emitError("vector overloads of WaveReadLaneFirst unimplemented",
+                callExpr->getExprLoc());
+      return 0;
+    }
+    const uint32_t retTypeId = typeTranslator.translateType(retType);
+    retVal = theBuilder.createSubgroupFirstInvocation(
+        retTypeId, doExpr(callExpr->getArg(0)));
+  } break;
   case hlsl::IntrinsicOp::IOP_abort:
   case hlsl::IntrinsicOp::IOP_GetRenderTargetSampleCount:
   case hlsl::IntrinsicOp::IOP_GetRenderTargetSamplePosition: {

+ 21 - 5
tools/clang/lib/SPIRV/SPIRVEmitter.h

@@ -148,12 +148,20 @@ private:
                   QualType lhsValType);
 
   /// Generates the necessary instructions for conducting the given binary
-  /// operation on lhs and rhs. If lhsResultId is not nullptr, the evaluated
-  /// pointer from lhs during the process will be written into it. If
-  /// mandateGenOpcode is not spv::Op::Max, it will used as the SPIR-V opcode
-  /// instead of deducing from Clang frontend opcode.
+  /// operation on lhs and rhs.
+  ///
+  /// computationType is the type for LHS and RHS when doing computation, while
+  /// resultType is the type of the whole binary operation. They can be
+  /// different for compound assignments like <some-int-value> *=
+  /// <some-float-value>, where computationType is float and resultType is int.
+  ///
+  /// If lhsResultId is not nullptr, the evaluated pointer from lhs during the
+  /// process will be written into it. If mandateGenOpcode is not spv::Op::Max,
+  /// it will used as the SPIR-V opcode instead of deducing from Clang frontend
+  /// opcode.
   SpirvEvalInfo processBinaryOp(const Expr *lhs, const Expr *rhs,
-                                BinaryOperatorKind opcode, QualType resultType,
+                                BinaryOperatorKind opcode,
+                                QualType computationType, QualType resultType,
                                 SourceRange, SpirvEvalInfo *lhsInfo = nullptr,
                                 spv::Op mandateGenOpcode = spv::Op::Max);
 
@@ -283,6 +291,14 @@ private:
   bool validateVKAttributes(const NamedDecl *decl);
 
 private:
+  /// Converts the given value from the bitwidth of 'fromType' to the bitwidth
+  /// of 'toType'. If the two have the same bitwidth, returns the value itself.
+  /// If resultType is not nullptr, the resulting value's type will be written
+  /// to resultType. Panics if the given types are not scalar or vector of
+  /// float/integer type.
+  uint32_t convertBitwidth(uint32_t value, QualType fromType, QualType toType,
+                           uint32_t *resultType = nullptr);
+
   /// Processes the given expr, casts the result into the given bool (vector)
   /// type and returns the <result-id> of the casted value.
   uint32_t castToBool(uint32_t value, QualType fromType, QualType toType);

+ 10 - 1
tools/clang/lib/SPIRV/SpirvEvalInfo.h

@@ -100,6 +100,9 @@ public:
   inline SpirvEvalInfo &setRelaxedPrecision();
   bool isRelaxedPrecision() const { return isRelaxedPrecision_; }
 
+  inline SpirvEvalInfo &setRowMajor(bool);
+  bool isRowMajor() const { return isRowMajor_; }
+
 private:
   uint32_t resultId;
   /// Indicates whether this evaluation result contains alias variables
@@ -119,13 +122,14 @@ private:
   bool isConstant_;
   bool isSpecConstant_;
   bool isRelaxedPrecision_;
+  bool isRowMajor_;
 };
 
 SpirvEvalInfo::SpirvEvalInfo(uint32_t id)
     : resultId(id), containsAlias(false),
       storageClass(spv::StorageClass::Function), layoutRule(LayoutRule::Void),
       isRValue_(false), isConstant_(false), isSpecConstant_(false),
-      isRelaxedPrecision_(false) {}
+      isRelaxedPrecision_(false), isRowMajor_(false) {}
 
 SpirvEvalInfo &SpirvEvalInfo::setResultId(uint32_t id) {
   resultId = id;
@@ -174,6 +178,11 @@ SpirvEvalInfo &SpirvEvalInfo::setRelaxedPrecision() {
   return *this;
 }
 
+// Records whether the evaluated value is a row-major matrix (or contains
+// one); returns *this to allow call chaining, matching the other setters.
+SpirvEvalInfo &SpirvEvalInfo::setRowMajor(bool rm) {
+  isRowMajor_ = rm;
+  return *this;
+}
+
 } // end namespace spirv
 } // end namespace clang
 

+ 274 - 77
tools/clang/lib/SPIRV/TypeTranslator.cpp

@@ -203,11 +203,219 @@ void TypeTranslator::popIntendedLiteralType() {
   intendedLiteralTypes.pop();
 }
 
+// Returns the number of Vulkan interface locations the given stage input or
+// output type occupies, recursing through vectors, matrices, typedefs,
+// references, pointers, and constant arrays. Emits an error and returns 0 for
+// unsupported types.
+uint32_t TypeTranslator::getLocationCount(QualType type) {
+  // See Vulkan spec 14.1.4. Location Assignment for the complete set of rules.
+
+  const auto canonicalType = type.getCanonicalType();
+  if (canonicalType != type)
+    return getLocationCount(canonicalType);
+
+  // Inputs and outputs of the following types consume a single interface
+  // location:
+  // * 16-bit scalar and vector types, and
+  // * 32-bit scalar and vector types, and
+  // * 64-bit scalar and 2-component vector types.
+
+  // 64-bit three- and four- component vectors consume two consecutive
+  // locations.
+
+  // Primitive types
+  if (isScalarType(type))
+    return 1;
+
+  // Vector types
+  {
+    QualType elemType = {};
+    uint32_t elemCount = {};
+    if (isVectorType(type, &elemType, &elemCount)) {
+      const auto *builtinType = elemType->getAs<BuiltinType>();
+      // Only 64-bit vectors with 3 or 4 components need two locations; all
+      // other element kinds fall through the switch and consume one.
+      switch (builtinType->getKind()) {
+      case BuiltinType::Double:
+      case BuiltinType::LongLong:
+      case BuiltinType::ULongLong:
+        if (elemCount >= 3)
+          return 2;
+      }
+      return 1;
+    }
+  }
+
+  // If the declared input or output is an n * m 16- , 32- or 64- bit matrix,
+  // it will be assigned multiple locations starting with the location
+  // specified. The number of locations assigned for each matrix will be the
+  // same as for an n-element array of m-component vectors.
+
+  // Matrix types
+  {
+    QualType elemType = {};
+    uint32_t rowCount = 0, colCount = 0;
+    if (isMxNMatrix(type, &elemType, &rowCount, &colCount))
+      return getLocationCount(astContext.getExtVectorType(elemType, colCount)) *
+             rowCount;
+  }
+
+  // Typedefs
+  if (const auto *typedefType = type->getAs<TypedefType>())
+    return getLocationCount(typedefType->desugar());
+
+  // Reference types
+  if (const auto *refType = type->getAs<ReferenceType>())
+    return getLocationCount(refType->getPointeeType());
+
+  // Pointer types
+  if (const auto *ptrType = type->getAs<PointerType>())
+    return getLocationCount(ptrType->getPointeeType());
+
+  // If a declared input or output is an array of size n and each element takes
+  // m locations, it will be assigned m * n consecutive locations starting with
+  // the location specified.
+
+  // Array types
+  if (const auto *arrayType = astContext.getAsConstantArrayType(type))
+    return getLocationCount(arrayType->getElementType()) *
+           static_cast<uint32_t>(arrayType->getSize().getZExtValue());
+
+  // Struct type
+  // NOTE(review): 'structType' is only used for the type check; depending on
+  // build flags this may trigger an unused-variable warning — confirm.
+  if (const auto *structType = type->getAs<RecordType>()) {
+    assert(false && "all structs should already be flattened");
+    return 0;
+  }
+
+  emitError(
+      "calculating number of occupied locations for type %0 unimplemented")
+      << type;
+  return 0;
+}
+
+// Returns the SPIR-V <type-id> that has the same shape and scalar kind
+// (float / signed int / unsigned int, scalar or vector) as 'type' but with
+// the requested element 'bitwidth' (16, 32, or 64). Panics on bool or any
+// non-int/non-float type, and on unsupported bitwidths.
+uint32_t TypeTranslator::getTypeWithCustomBitwidth(QualType type,
+                                                   uint32_t bitwidth) {
+  // Cases where the given type is a vector of float/int: rebuild the vector
+  // around the element type converted to the requested bitwidth.
+  {
+    QualType elemType = {};
+    uint32_t elemCount = 0;
+    const bool isVec = isVectorType(type, &elemType, &elemCount);
+    if (isVec) {
+      return theBuilder.getVecType(
+          getTypeWithCustomBitwidth(elemType, bitwidth), elemCount);
+    }
+  }
+
+  // Scalar cases.
+  assert(!type->isBooleanType());
+  assert(type->isIntegerType() || type->isFloatingType());
+  if (type->isFloatingType()) {
+    switch (bitwidth) {
+    case 16:
+      return theBuilder.getFloat16Type();
+    case 32:
+      return theBuilder.getFloat32Type();
+    case 64:
+      return theBuilder.getFloat64Type();
+    }
+  }
+  if (type->isSignedIntegerType()) {
+    switch (bitwidth) {
+    case 16:
+      return theBuilder.getInt16Type();
+    case 32:
+      return theBuilder.getInt32Type();
+    case 64:
+      return theBuilder.getInt64Type();
+    }
+  }
+  if (type->isUnsignedIntegerType()) {
+    switch (bitwidth) {
+    case 16:
+      return theBuilder.getUint16Type();
+    case 32:
+      return theBuilder.getUint32Type();
+    case 64:
+      return theBuilder.getUint64Type();
+    }
+  }
+  llvm_unreachable(
+      "invalid type or bitwidth passed to getTypeWithCustomBitwidth");
+}
+
+// Returns the bitwidth (16, 32, or 64) that the element of the given scalar
+// or vector type resolves to in SPIR-V. Literal int/float types are resolved
+// via the intended-literal-type hint stack when a hint is present. Panics on
+// any other type.
+uint32_t TypeTranslator::getElementSpirvBitwidth(QualType type) {
+  const auto canonicalType = type.getCanonicalType();
+  if (canonicalType != type)
+    return getElementSpirvBitwidth(canonicalType);
+
+  // Vector types: the bitwidth is that of the element type.
+  {
+    QualType elemType = {};
+    if (isVectorType(type, &elemType))
+      return getElementSpirvBitwidth(elemType);
+  }
+
+  // Scalar types
+  // NOTE(review): 'isScalar' is only used in the assert; this may trigger an
+  // unused-variable warning in release builds — confirm.
+  QualType ty = {};
+  const bool isScalar = isScalarType(type, &ty);
+  assert(isScalar);
+  if (const auto *builtinType = ty->getAs<BuiltinType>()) {
+    switch (builtinType->getKind()) {
+    case BuiltinType::Int:
+    case BuiltinType::UInt:
+    case BuiltinType::Float:
+      return 32;
+    case BuiltinType::Double:
+    case BuiltinType::LongLong:
+    case BuiltinType::ULongLong:
+      return 64;
+    // min16int (short), ushort, min12int, half, and min10float are treated as
+    // 16-bit if '-enable-16bit-types' option is enabled. They are treated as
+    // 32-bit otherwise.
+    case BuiltinType::Short:
+    case BuiltinType::UShort:
+    case BuiltinType::Min12Int:
+    case BuiltinType::Half:
+    case BuiltinType::Min10Float: {
+      if (spirvOptions.enable16BitTypes)
+        return 16;
+      else
+        return 32;
+    }
+    case BuiltinType::LitFloat: {
+      // First try to see if there are any hints about how this literal type
+      // is going to be used. If so, use the hint.
+      if (getIntendedLiteralType(ty) != ty) {
+        return getElementSpirvBitwidth(getIntendedLiteralType(ty));
+      }
+
+      // No hint: fall back on the AST float semantics, clamped to 32/64.
+      const auto &semantics = astContext.getFloatTypeSemantics(type);
+      const auto bitwidth = llvm::APFloat::getSizeInBits(semantics);
+      if (bitwidth <= 32)
+        return 32;
+      else
+        return 64;
+    }
+    case BuiltinType::LitInt: {
+      // First try to see if there are any hints about how this literal type
+      // is going to be used. If so, use the hint.
+      if (getIntendedLiteralType(ty) != ty) {
+        return getElementSpirvBitwidth(getIntendedLiteralType(ty));
+      }
+
+      const auto bitwidth = astContext.getIntWidth(type);
+      // All integer variants with bitwidth larger than 32 are represented
+      // as 64-bit int in SPIR-V.
+      // All integer variants with bitwidth of 32 or less are represented as
+      // 32-bit int in SPIR-V.
+      return bitwidth > 32 ? 64 : 32;
+    }
+    }
+  }
+  llvm_unreachable("invalid type passed to getElementSpirvBitwidth");
+}
+
 uint32_t TypeTranslator::translateType(QualType type, LayoutRule rule,
                                        bool isRowMajor) {
   // We can only apply row_major to matrices or arrays of matrices.
+  // isRowMajor will be ignored for scalar and vector types.
   if (isRowMajor)
-    assert(isMxNMatrix(type) || type->isArrayType());
+    assert(type->isScalarType() || type->isArrayType() ||
+           hlsl::IsHLSLVecMatType(type));
 
   // Try to translate the canonical type first
   const auto canonicalType = type.getCanonicalType();
@@ -224,80 +432,30 @@ uint32_t TypeTranslator::translateType(QualType type, LayoutRule rule,
           return theBuilder.getVoidType();
         case BuiltinType::Bool:
           return theBuilder.getBoolType();
+        // All the ints
         case BuiltinType::Int:
-          return theBuilder.getInt32Type();
         case BuiltinType::UInt:
-          return theBuilder.getUint32Type();
-        case BuiltinType::Float:
-          return theBuilder.getFloat32Type();
-        case BuiltinType::Double:
-          return theBuilder.getFloat64Type();
+        case BuiltinType::Short:
+        case BuiltinType::Min12Int:
+        case BuiltinType::UShort:
         case BuiltinType::LongLong:
-          return theBuilder.getInt64Type();
         case BuiltinType::ULongLong:
-          return theBuilder.getUint64Type();
-        // min16int (short), and min12int are treated as 16-bit Int if
-        // '-enable-16bit-types' option is enabled. They are treated as 32-bit
-        // Int otherwise.
-        case BuiltinType::Short:
-        case BuiltinType::Min12Int: {
-          if (spirvOptions.enable16BitTypes)
-            return theBuilder.getInt16Type();
-          else
-            return theBuilder.getInt32Type();
-        }
-        // min16uint (ushort) is treated as 16-bit Uint if '-enable-16bit-types'
-        // option is enabled. It is treated as 32-bit Uint otherwise.
-        case BuiltinType::UShort: {
-          if (spirvOptions.enable16BitTypes)
-            return theBuilder.getUint16Type();
-          else
-            return theBuilder.getUint32Type();
-        }
-        // min16float (half), and min10float are all translated to
-        // 32-bit float in SPIR-V.
-        // min16float (half), and min10float are treated as 16-bit float if
-        // '-enable-16bit-types' option is enabled. They are treated as 32-bit
-        // float otherwise.
+        // All the floats
+        case BuiltinType::Float:
+        case BuiltinType::Double:
         case BuiltinType::Half:
         case BuiltinType::Min10Float: {
-          if (spirvOptions.enable16BitTypes)
-            return theBuilder.getFloat16Type();
-          else
-            return theBuilder.getFloat32Type();
+          const auto bitwidth = getElementSpirvBitwidth(ty);
+          return getTypeWithCustomBitwidth(ty, bitwidth);
         }
+        // Literal types. First try to resolve them using hints.
+        case BuiltinType::LitInt:
         case BuiltinType::LitFloat: {
-          // First try to see if there are any hints about how this literal type
-          // is going to be used. If so, use the hint.
-          if (getIntendedLiteralType(ty) != ty) {
-            return translateType(getIntendedLiteralType(ty));
-          }
-
-          const auto &semantics = astContext.getFloatTypeSemantics(type);
-          const auto bitwidth = llvm::APFloat::getSizeInBits(semantics);
-          if (bitwidth <= 32)
-            return theBuilder.getFloat32Type();
-          else
-            return theBuilder.getFloat64Type();
-        }
-        case BuiltinType::LitInt: {
-          // First try to see if there are any hints about how this literal type
-          // is going to be used. If so, use the hint.
           if (getIntendedLiteralType(ty) != ty) {
             return translateType(getIntendedLiteralType(ty));
           }
-
-          const auto bitwidth = astContext.getIntWidth(type);
-          // All integer variants with bitwidth larger than 32 are represented
-          // as 64-bit int in SPIR-V.
-          // All integer variants with bitwidth of 32 or less are represented as
-          // 32-bit int in SPIR-V.
-          if (type->isSignedIntegerType())
-            return bitwidth > 32 ? theBuilder.getInt64Type()
-                                 : theBuilder.getInt32Type();
-          else
-            return bitwidth > 32 ? theBuilder.getUint64Type()
-                                 : theBuilder.getUint32Type();
+          const auto bitwidth = getElementSpirvBitwidth(ty);
+          return getTypeWithCustomBitwidth(ty, bitwidth);
         }
         default:
           emitError("primitive type %0 unimplemented")
@@ -345,20 +503,22 @@ uint32_t TypeTranslator::translateType(QualType type, LayoutRule rule,
     QualType elemType = {};
     uint32_t rowCount = 0, colCount = 0;
     if (isMxNMatrix(type, &elemType, &rowCount, &colCount)) {
-
-      // We cannot handle external initialization of column-major matrices now.
-      if (!elemType->isFloatingType() && rule != LayoutRule::Void &&
-          !isRowMajor) {
-        emitError(
-            "externally initialized column-major matrices not supported yet");
-        return 0;
-      }
-
       // HLSL matrices are row major, while SPIR-V matrices are column major.
       // We are mapping what HLSL semantically mean a row into a column here.
       const uint32_t vecType =
           theBuilder.getVecType(translateType(elemType), colCount);
-      return theBuilder.getMatType(elemType, vecType, rowCount);
+
+      // If the matrix element type is not float, it is represented as an array
+      // of vectors, and should therefore have the ArrayStride decoration.
+      llvm::SmallVector<const Decoration *, 4> decorations;
+      if (!elemType->isFloatingType() && rule != LayoutRule::Void) {
+        uint32_t stride = 0;
+        (void)getAlignmentAndSize(type, rule, isRowMajor, &stride);
+        decorations.push_back(
+            Decoration::getArrayStride(*theBuilder.getSPIRVContext(), stride));
+      }
+
+      return theBuilder.getMatType(elemType, vecType, rowCount, decorations);
     }
   }
 
@@ -746,6 +906,35 @@ bool TypeTranslator::isMxNMatrix(QualType type, QualType *elemType,
   return true;
 }
 
+bool TypeTranslator::isOrContainsNonFpColMajorMatrix(QualType type,
+                                                     const Decl *decl) const {
+  const auto isColMajorDecl = [this](const Decl *decl) {
+    return decl->hasAttr<HLSLColumnMajorAttr>() ||
+           !decl->hasAttr<HLSLRowMajorAttr>() && !spirvOptions.defaultRowMajor;
+  };
+
+  QualType elemType = {};
+  if (isMxNMatrix(type, &elemType) && !elemType->isFloatingType()) {
+    return isColMajorDecl(decl);
+  }
+
+  if (const auto *arrayType = astContext.getAsConstantArrayType(type)) {
+    if (isMxNMatrix(arrayType->getElementType(), &elemType) &&
+        !elemType->isFloatingType())
+      return isColMajorDecl(decl);
+  }
+
+  if (const auto *structType = type->getAs<RecordType>()) {
+    const auto *decl = structType->getDecl();
+    for (const auto *field : decl->fields()) {
+      if (isOrContainsNonFpColMajorMatrix(field->getType(), field))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 bool TypeTranslator::isRowMajorMatrix(QualType type, const Decl *decl) const {
   if (!isMxNMatrix(type) && !type->isArrayType())
     return false;
@@ -907,7 +1096,12 @@ TypeTranslator::getLayoutDecorations(const DeclContext *decl, LayoutRule rule) {
       // MatrixStride on the field. So skip possible arrays here.
       fieldType = arrayType->getElementType();
     }
-    if (isMxNMatrix(fieldType)) {
+
+    // Non-floating point matrices are represented as arrays of vectors, and
+    // therefore ColMajor and RowMajor decorations should not be applied to
+    // them.
+    QualType elemType = {};
+    if (isMxNMatrix(fieldType, &elemType) && elemType->isFloatingType()) {
       memberAlignment = memberSize = stride = 0;
       std::tie(memberAlignment, memberSize) =
           getAlignmentAndSize(fieldType, rule, isRowMajor, &stride);
@@ -1172,8 +1366,7 @@ TypeTranslator::getAlignmentAndSize(QualType type, LayoutRule rule,
   //
   // 8. If the member is an array of S row-major matrices with C columns and R
   //    rows, the matrix is stored identically to a row of S X R row vectors
-  //    with C
-  //    components each, according to rule (4).
+  //    with C components each, according to rule (4).
   //
   // 9. If the member is a structure, the base alignment of the structure is N,
   //    where N is the largest base alignment value of any of its members, and
@@ -1207,6 +1400,10 @@ TypeTranslator::getAlignmentAndSize(QualType type, LayoutRule rule,
         case BuiltinType::UInt:
         case BuiltinType::Float:
           return {4, 4};
+        case BuiltinType::Double:
+        case BuiltinType::LongLong:
+        case BuiltinType::ULongLong:
+          return {8, 8};
         default:
           emitError("primitive type %0 unimplemented")
               << builtinType->getTypeClassName();

+ 24 - 2
tools/clang/lib/SPIRV/TypeTranslator.h

@@ -125,6 +125,20 @@ public:
   /// \brief Returns true if the given type is SubpassInputMS.
   static bool isSubpassInputMS(QualType);
 
+  /// \brief Evaluates the given type at the given bitwidth and returns the
+  /// result-id for it. Panics if the given type is not a scalar or vector of
+  /// float or integer type. For example: if QualType of an int4 and bitwidth of
+  /// 64 is passed in, the result-id of a SPIR-V vector of size 4 of signed
+  /// 64-bit integers is returned.
+  /// Acceptable bitwidths are 16, 32, and 64.
+  uint32_t getTypeWithCustomBitwidth(QualType type, uint32_t bitwidth);
+
+  /// \brief Returns the realized bitwidth of the given type when represented in
+  /// SPIR-V. Panics if the given type is not a scalar or vector of float or
+  /// integer. In case of vectors, it returns the realized SPIR-V bitwidth of
+  /// the vector elements.
+  uint32_t getElementSpirvBitwidth(QualType type);
+
   /// \brief Returns true if the given type will be translated into a SPIR-V
   /// scalar type. This includes normal scalar types, vectors of size 1, and
   /// 1x1 matrices. If scalarType is not nullptr, writes the scalar type to
@@ -164,10 +178,15 @@ public:
                           uint32_t *rowCount = nullptr,
                           uint32_t *colCount = nullptr);
 
-  /// \broef returns true if type is a matrix and matrix is row major
-  /// If decl is not nullptr, is is checked for attributes specifying majorness
+  /// \brief Returns true if type is a matrix and matrix is row major
+  /// If decl is not nullptr, it is checked for attributes specifying majorness.
   bool isRowMajorMatrix(QualType type, const Decl *decl = nullptr) const;
 
+  /// \brief Returns true if the decl type is a non-floating-point matrix and
+  /// the matrix is column major, or if it is an array/struct containing such
+  /// matrices.
+  bool isOrContainsNonFpColMajorMatrix(QualType type, const Decl *decl) const;
+
   /// \brief Returns true if the two types are the same scalar or vector type,
   /// regardless of constness and literalness.
   static bool isSameScalarOrVecType(QualType type1, QualType type2);
@@ -221,6 +240,9 @@ public:
   llvm::SmallVector<const Decoration *, 4>
   getLayoutDecorations(const DeclContext *decl, LayoutRule rule);
 
+  /// \brief Returns how many sequential locations are consumed by a given type.
+  uint32_t getLocationCount(QualType type);
+
 private:
   /// \brief Wrapper method to create an error message and report it
   /// in the diagnostic engine associated with this consumer.

+ 27 - 0
tools/clang/test/CodeGenHLSL/quick-test/constant_cast.hlsl

@@ -0,0 +1,27 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+
+
+// Make sure no store is generated.
+// CHECK-NOT:store {{.*}},
+
+struct ST
+{
+    float4 a;
+    float4 b;
+    float4 c;
+};
+
+
+cbuffer cbModelSkinningConstants : register ( b4 )
+{
+    float4 v[ 2 * 256 * 3 ];
+
+    static const float4 v2d[ 512 ] [ 3 ] = v ;
+    static const ST vst[ 512 ] = v;
+} ;
+
+
+float4 main(int i:I) : SV_Target {
+  return v2d[i][1] + vst[i].b;
+}

+ 28 - 0
tools/clang/test/CodeGenHLSL/quick-test/flat_addrspacecast.hlsl

@@ -0,0 +1,28 @@
+// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
+
+// Make sure we generate an addrspacecast.
+// CHECK: addrspacecast (float addrspace(3)*
+
+struct ST
+{
+	float3 a; // center
+	float3 b; // half extents
+
+        void func(float3 x, float3 y)
+	{
+		a = x + y;
+		b = x * y;
+	}
+};
+
+groupshared ST myST;
+StructuredBuffer<ST> buf0;
+float3 a;
+float3 b;
+RWBuffer<float3> buf1;
+[numthreads(8,8,1)]
+void main() {
+  myST = buf0[0];
+  myST.func(a, b);
+  buf1[0] = myST.b;
+}

+ 4 - 4
tools/clang/test/CodeGenSPIRV/bezier.domain.hlsl2spv

@@ -107,10 +107,10 @@ DS_OUTPUT BezierEvalDS( HS_CONSTANT_DATA_OUTPUT input,
 // OpDecorate %gl_TessCoord Patch
 // OpDecorate %in_var_BEZIERPOS Location 0
 // OpDecorate %in_var_TANGENT Location 1
-// OpDecorate %in_var_TANUCORNER Location 2
-// OpDecorate %in_var_TANVCORNER Location 3
-// OpDecorate %in_var_TANWEIGHTS Location 4
-// OpDecorate %in_var_TEXCOORD Location 5
+// OpDecorate %in_var_TANUCORNER Location 5
+// OpDecorate %in_var_TANVCORNER Location 9
+// OpDecorate %in_var_TANWEIGHTS Location 13
+// OpDecorate %in_var_TEXCOORD Location 14
 // OpDecorate %out_var_NORMAL Location 0
 // OpDecorate %out_var_TEXCOORD Location 1
 // OpDecorate %out_var_TANGENT Location 2

+ 4 - 4
tools/clang/test/CodeGenSPIRV/bezier.hull.hlsl2spv

@@ -129,10 +129,10 @@ BEZIER_CONTROL_POINT SubDToBezierHS(InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POIN
 // OpDecorate %in_var_TANGENT Location 2
 // OpDecorate %out_var_BEZIERPOS Location 0
 // OpDecorate %out_var_TANGENT Location 1
-// OpDecorate %out_var_TANUCORNER Location 2
-// OpDecorate %out_var_TANVCORNER Location 3
-// OpDecorate %out_var_TANWEIGHTS Location 4
-// OpDecorate %out_var_TEXCOORD Location 5
+// OpDecorate %out_var_TANUCORNER Location 5
+// OpDecorate %out_var_TANVCORNER Location 9
+// OpDecorate %out_var_TANWEIGHTS Location 13
+// OpDecorate %out_var_TEXCOORD Location 14
 // %void = OpTypeVoid
 // %3 = OpTypeFunction %void
 // %float = OpTypeFloat 32

+ 0 - 2
tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.hlsl → tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.form.hlsl

@@ -1,8 +1,6 @@
 // Run: %dxc -T vs_6_0 -E main
 
 void main() {
-// CHECK-LABEL: %bb_entry = OpLabel
-
     float4 a;
     float s;
 

+ 26 - 0
tools/clang/test/CodeGenSPIRV/binary-op.arith-assign.mixed.type.hlsl

@@ -0,0 +1,26 @@
+// Run: %dxc -T vs_6_0 -E main
+
+void main() {
+    uint uVal;
+    bool bVal;
+
+    float fVal;
+    int iVal;
+
+    // No conversion of lhs
+// CHECK:      [[b_bool:%\d+]] = OpLoad %bool %bVal
+// CHECK-NEXT: [[b_uint:%\d+]] = OpSelect %uint [[b_bool]] %uint_1 %uint_0
+// CHECK-NEXT: [[u_uint:%\d+]] = OpLoad %uint %uVal
+// CHECK-NEXT:    [[add:%\d+]] = OpIAdd %uint [[u_uint]] [[b_uint]]
+// CHECK-NEXT:                   OpStore %uVal [[add]]
+    uVal += bVal;
+
+    // Convert lhs to the type of rhs, do computation, and then convert back
+// CHECK:        [[f_float:%\d+]] = OpLoad %float %fVal
+// CHECK-NEXT:     [[i_int:%\d+]] = OpLoad %int %iVal
+// CHECK-NEXT:   [[i_float:%\d+]] = OpConvertSToF %float [[i_int]]
+// CHECK-NEXT: [[mul_float:%\d+]] = OpFMul %float [[i_float]] [[f_float]]
+// CHECK-NEXT:   [[mul_int:%\d+]] = OpConvertFToS %int [[mul_float]]
+// CHECK-NEXT:                      OpStore %iVal [[mul_int]]
+    iVal *= fVal;
+}

+ 165 - 0
tools/clang/test/CodeGenSPIRV/cast.bitwidth.hlsl

@@ -0,0 +1,165 @@
+// Run: %dxc -T ps_6_2 -E main -enable-16bit-types
+
+void main() {
+
+  // 32-bit uint to various 64-bit types.
+  uint a;
+// CHECK:            [[a:%\d+]] = OpLoad %uint %a
+// CHECK-NEXT: [[a_ulong:%\d+]] = OpUConvert %ulong [[a]]
+// CHECK-NEXT:                    OpStore %b [[a_ulong]]
+  uint64_t b = a;
+// CHECK:            [[a:%\d+]] = OpLoad %uint %a
+// CHECK-NEXT: [[a_ulong:%\d+]] = OpUConvert %ulong [[a]]
+// CHECK-NEXT:[[a_double:%\d+]] = OpConvertUToF %double [[a_ulong]]
+// CHECK-NEXT:                    OpStore %c [[a_double]]
+  double   c = a;
+// CHECK:            [[a:%\d+]] = OpLoad %uint %a
+// CHECK-NEXT: [[a_ulong:%\d+]] = OpUConvert %ulong [[a]]
+// CHECK-NEXT:  [[a_long:%\d+]] = OpBitcast %long [[a_ulong]]
+// CHECK-NEXT:                    OpStore %d [[a_long]]
+  int64_t  d = a;
+
+
+  // 32-bit int to various 64-bit types.
+  int aa;
+// CHECK:            [[aa:%\d+]] = OpLoad %int %aa
+// CHECK-NEXT:  [[aa_long:%\d+]] = OpSConvert %long [[aa]]
+// CHECK-NEXT: [[aa_ulong:%\d+]] = OpBitcast %ulong [[aa_long]]
+// CHECK-NEXT:                     OpStore %bb [[aa_ulong]]
+  uint64_t bb = aa;
+// CHECK:             [[aa:%\d+]] = OpLoad %int %aa
+// CHECK-NEXT:   [[aa_long:%\d+]] = OpSConvert %long [[aa]]
+// CHECK-NEXT: [[aa_double:%\d+]] = OpConvertSToF %double [[aa_long]]
+// CHECK-NEXT:                      OpStore %cc [[aa_double]]
+  double   cc = aa;
+// CHECK:           [[aa:%\d+]] = OpLoad %int %aa
+// CHECK-NEXT: [[aa_long:%\d+]] = OpSConvert %long [[aa]]
+// CHECK-NEXT:                    OpStore %dd [[aa_long]]
+  int64_t  dd = aa;
+
+
+  // 32-bit float to various 64-bit types.
+  float aaa;
+// CHECK:             [[aaa:%\d+]] = OpLoad %float %aaa
+// CHECK-NEXT: [[aaa_double:%\d+]] = OpFConvert %double [[aaa]]
+// CHECK-NEXT:  [[aaa_ulong:%\d+]] = OpConvertFToU %ulong [[aaa_double]]
+// CHECK-NEXT:                       OpStore %bbb [[aaa_ulong]]
+  uint64_t bbb = aaa;
+// CHECK:             [[aaa:%\d+]] = OpLoad %float %aaa
+// CHECK-NEXT: [[aaa_double:%\d+]] = OpFConvert %double [[aaa]]
+// CHECK-NEXT:                       OpStore %ccc [[aaa_double]]
+  double   ccc = aaa;
+// CHECK:             [[aaa:%\d+]] = OpLoad %float %aaa
+// CHECK-NEXT: [[aaa_double:%\d+]] = OpFConvert %double [[aaa]]
+// CHECK-NEXT:   [[aaa_long:%\d+]] = OpConvertFToS %long [[aaa_double]]
+// CHECK-NEXT:                       OpStore %ddd [[aaa_long]]
+  int64_t  ddd = aaa;
+
+
+  // 64-bit uint to various 32-bit types.
+  uint64_t e;
+// CHECK:      [[e64:%\d+]] = OpLoad %ulong %e
+// CHECK-NEXT: [[e32:%\d+]] = OpUConvert %uint [[e64]]
+// CHECK-NEXT:                OpStore %f [[e32]]
+  uint  f = e;
+// CHECK:          [[e64:%\d+]] = OpLoad %ulong %e
+// CHECK-NEXT:     [[e32:%\d+]] = OpUConvert %uint [[e64]]
+// CHECK-NEXT: [[e_float:%\d+]] = OpConvertUToF %float [[e32]]
+// CHECK-NEXT:                    OpStore %g [[e_float]]
+  float g = e;
+// CHECK:        [[e64:%\d+]] = OpLoad %ulong %e
+// CHECK-NEXT:   [[e32:%\d+]] = OpUConvert %uint [[e64]]
+// CHECK-NEXT: [[e_int:%\d+]] = OpBitcast %int [[e32]]
+// CHECK-NEXT:                  OpStore %h [[e_int]]
+  int   h = e;
+
+
+  // 64-bit int to various 32-bit types.
+  int64_t ee;
+// CHECK:           [[e:%\d+]] = OpLoad %long %ee
+// CHECK-NEXT:  [[e_int:%\d+]] = OpSConvert %int [[e]]
+// CHECK-NEXT: [[e_uint:%\d+]] = OpBitcast %uint [[e_int]]
+// CHECK-NEXT:                   OpStore %ff [[e_uint]]
+  uint  ff = ee;
+// CHECK:            [[e:%\d+]] = OpLoad %long %ee
+// CHECK-NEXT:   [[e_int:%\d+]] = OpSConvert %int [[e]]
+// CHECK-NEXT: [[e_float:%\d+]] = OpConvertSToF %float [[e_int]]
+// CHECK-NEXT:                    OpStore %gg [[e_float]]
+  float gg = ee;
+// CHECK:          [[e:%\d+]] = OpLoad %long %ee
+// CHECK-NEXT: [[e_int:%\d+]] = OpSConvert %int [[e]]
+// CHECK-NEXT:                  OpStore %hh [[e_int]]
+  int   hh = ee;
+
+
+  // 64-bit float to various 32-bit types.
+  double eee;
+// CHECK:         [[e64:%\d+]] = OpLoad %double %eee
+// CHECK-NEXT:    [[e32:%\d+]] = OpFConvert %float [[e64]]
+// CHECK-NEXT: [[e_uint:%\d+]] = OpConvertFToU %uint [[e32]]
+// CHECK-NEXT:                   OpStore %fff [[e_uint]]
+  uint  fff = eee;
+// CHECK:              [[e:%\d+]] = OpLoad %double %eee
+// CHECK-NEXT:   [[e_float:%\d+]] = OpFConvert %float [[e]]
+// CHECK-NEXT:                      OpStore %ggg [[e_float]]
+  float ggg = eee;
+// CHECK:            [[e:%\d+]] = OpLoad %double %eee
+// CHECK-NEXT: [[e_float:%\d+]] = OpFConvert %float [[e]]
+// CHECK-NEXT:   [[e_int:%\d+]] = OpConvertFToS %int [[e_float]]
+// CHECK-NEXT:                    OpStore %hhh [[e_int]]
+  int   hhh = eee;
+
+
+  // Vector case: 64-bit float to various 32-bit types.
+  double2 i;
+// CHECK:      [[i_double:%\d+]] = OpLoad %v2double %i
+// CHECK-NEXT:  [[i_float:%\d+]] = OpFConvert %v2float [[i_double]]
+// CHECK-NEXT:   [[i_uint:%\d+]] = OpConvertFToU %v2uint [[i_float]]
+// CHECK-NEXT:                     OpStore %j [[i_uint]]
+  uint2   j = i;
+// CHECK:      [[i_double:%\d+]] = OpLoad %v2double %i
+// CHECK-NEXT:  [[i_float:%\d+]] = OpFConvert %v2float [[i_double]]
+// CHECK-NEXT:    [[i_int:%\d+]] = OpConvertFToS %v2int [[i_float]]
+// CHECK-NEXT:                     OpStore %k [[i_int]]
+  int2    k = i;
+// CHECK:      [[i_double:%\d+]] = OpLoad %v2double %i
+// CHECK-NEXT:  [[i_float:%\d+]] = OpFConvert %v2float [[i_double]]
+// CHECK-NEXT:                     OpStore %l [[i_float]]
+  float2  l = i;
+
+
+  // 16-bit uint to various 32-bit types.
+  uint16_t m;
+// CHECK:      [[m_ushort:%\d+]] = OpLoad %ushort %m
+// CHECK-NEXT:   [[m_uint:%\d+]] = OpUConvert %uint [[m_ushort]]
+// CHECK-NEXT:                     OpStore %n [[m_uint]]
+  uint  n = m;
+// CHECK:      [[m_ushort:%\d+]] = OpLoad %ushort %m
+// CHECK-NEXT:   [[m_uint:%\d+]] = OpUConvert %uint [[m_ushort]]
+// CHECK-NEXT:  [[m_float:%\d+]] = OpConvertUToF %float [[m_uint]]
+// CHECK-NEXT:                     OpStore %o [[m_float]]
+  float o = m;
+// CHECK:      [[m_ushort:%\d+]] = OpLoad %ushort %m
+// CHECK-NEXT:   [[m_uint:%\d+]] = OpUConvert %uint [[m_ushort]]
+// CHECK-NEXT:    [[m_int:%\d+]] = OpBitcast %int [[m_uint]]
+// CHECK-NEXT:                     OpStore %p [[m_int]]
+  int   p = m;
+
+
+  // 16-bit int to various 32-bit types.
+  int16_t mm;
+// CHECK:      [[mm_short:%\d+]] = OpLoad %short %mm
+// CHECK-NEXT:   [[mm_int:%\d+]] = OpSConvert %int [[mm_short]]
+// CHECK-NEXT:  [[mm_uint:%\d+]] = OpBitcast %uint [[mm_int]]
+// CHECK-NEXT:                     OpStore %nn [[mm_uint]]
+  uint  nn = mm;
+// CHECK:      [[mm_short:%\d+]] = OpLoad %short %mm
+// CHECK-NEXT:   [[mm_int:%\d+]] = OpSConvert %int [[mm_short]]
+// CHECK-NEXT: [[mm_float:%\d+]] = OpConvertSToF %float [[mm_int]]
+// CHECK-NEXT:                     OpStore %oo [[mm_float]]
+  float oo = mm;
+// CHECK:      [[mm_short:%\d+]] = OpLoad %short %mm
+// CHECK-NEXT:   [[mm_int:%\d+]] = OpSConvert %int [[mm_short]]
+// CHECK-NEXT:                     OpStore %pp [[mm_int]]
+  int   pp = mm;
+}

+ 50 - 0
tools/clang/test/CodeGenSPIRV/cast.flat-conversion.literal-initializer.hlsl

@@ -0,0 +1,50 @@
+// Run: %dxc -T ps_6_0 -E main
+
+struct S {
+  float2   a;
+  float    b;
+  double2  c;
+  double   d;
+  int64_t  e;
+  uint64_t f;
+};
+
+void main() {
+
+// CHECK:              [[inf:%\d+]] = OpFDiv %float %float_1 %float_0
+// CHECK-NEXT:        [[inf2:%\d+]] = OpCompositeConstruct %v2float [[inf]] [[inf]]
+// CHECK-NEXT:  [[inf_double:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT: [[inf2_double:%\d+]] = OpCompositeConstruct %v2double [[inf_double]] [[inf_double]] 
+// CHECK-NEXT:  [[inf_double:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT: [[inf_double_:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT:   [[inf_int64:%\d+]] = OpConvertFToS %long [[inf_double_]]
+// CHECK-NEXT: [[inf_double_:%\d+]] = OpFConvert %double [[inf]]
+// CHECK-NEXT:  [[inf_uint64:%\d+]] = OpConvertFToU %ulong [[inf_double_]]
+// CHECK-NEXT:             {{%\d+}} = OpCompositeConstruct %S [[inf2]] [[inf]] [[inf2_double]] [[inf_double]] [[inf_int64]] [[inf_uint64]]
+  S s3 = (S)(1.0 / 0.0);
+
+// CHECK:              [[b:%\d+]] = OpLoad %float %b
+// CHECK-NEXT:  [[b2_float:%\d+]] = OpCompositeConstruct %v2float [[b]] [[b]]
+// CHECK-NEXT:  [[b_double:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT: [[b2_double:%\d+]] = OpCompositeConstruct %v2double [[b_double]] [[b_double]]
+// CHECK-NEXT:  [[b_double:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT: [[b_double_:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT:   [[b_int64:%\d+]] = OpConvertFToS %long [[b_double_]]
+// CHECK-NEXT: [[b_double_:%\d+]] = OpFConvert %double [[b]]
+// CHECK-NEXT:  [[b_uint64:%\d+]] = OpConvertFToU %ulong [[b_double_]]
+// CHECK-NEXT:           {{%\d+}} = OpCompositeConstruct %S [[b2_float]] [[b]] [[b2_double]] [[b_double]] [[b_int64]] [[b_uint64]]
+  float b;
+  S s2 = (S)(b);
+
+
+// CHECK:              [[a:%\d+]] = OpLoad %double %a
+// CHECK-NEXT:   [[a_float:%\d+]] = OpFConvert %float [[a]]
+// CHECK-NEXT:  [[a2_float:%\d+]] = OpCompositeConstruct %v2float [[a_float]] [[a_float]]
+// CHECK-NEXT:   [[a_float:%\d+]] = OpFConvert %float [[a]]
+// CHECK-NEXT: [[a2_double:%\d+]] = OpCompositeConstruct %v2double [[a]] [[a]]
+// CHECK-NEXT:   [[a_int64:%\d+]] = OpConvertFToS %long [[a]]
+// CHECK-NEXT:  [[a_uint64:%\d+]] = OpConvertFToU %ulong [[a]]
+// CHECK-NEXT:           {{%\d+}} = OpCompositeConstruct %S [[a2_float]] [[a_float]] [[a2_double]] [[a]] [[a_int64]] [[a_uint64]]
+  double a;
+  S s1 = (S)(a);
+}

+ 1 - 1
tools/clang/test/CodeGenSPIRV/constant.scalar.16bit.enabled.hlsl

@@ -15,8 +15,8 @@
 // min16uint:  uint16_t(warning)
 // uint16_t:   uint16_t
 
-// CHECK: OpCapability Float16
 // CHECK: OpCapability Int16
+// CHECK: OpExtension "SPV_AMD_gpu_shader_half_float"
 
 // CHECK-NOT: OpDecorate %c_half RelaxedPrecision
 // CHECK-NOT: OpDecorate %c_min10float RelaxedPrecision

+ 7 - 1
tools/clang/test/CodeGenSPIRV/cs.groupshared.hlsl

@@ -5,10 +5,16 @@ struct S {
     float3 f2;
 };
 
+// CHECK-NOT: OpDecorate %a DescriptorSet
+// CHECK-NOT: OpDecorate %b DescriptorSet
+// CHECK-NOT: OpDecorate %c DescriptorSet
+// CHECK-NOT: OpDecorate %d DescriptorSet
+// CHECK-NOT: OpDecorate %s DescriptorSet
+
 // CHECK: %a = OpVariable %_ptr_Workgroup_float Workgroup
 groupshared              float    a;
 // CHECK: %b = OpVariable %_ptr_Workgroup_v3float Workgroup
-groupshared              float3   b;
+static groupshared       float3   b;  // Ignore static modifier
 // CHECK: %c = OpVariable %_ptr_Workgroup_mat2v3float Workgroup
 groupshared column_major float2x3 c;
 // CHECK: %d = OpVariable %_ptr_Workgroup__arr_v2float_uint_5 Workgroup

+ 34 - 0
tools/clang/test/CodeGenSPIRV/op.cbuffer.access.majorness.hlsl

@@ -0,0 +1,34 @@
+// Run: %dxc -T cs_6_0 -E main -Zpr
+
+// CHECK: %SData = OpTypeStruct %_arr_mat3v4float_uint_2 %_arr_mat3v4float_uint_2_0
+struct SData {
+                float3x4 mat1[2];
+   column_major float3x4 mat2[2];
+};
+
+// CHECK: %type_SBufferData = OpTypeStruct %SData %_arr_mat3v4float_uint_2 %_arr_mat3v4float_uint_2_0
+cbuffer SBufferData {
+                SData    BufferData;
+                float3x4 Mat1[2];
+   column_major float3x4 Mat2[2];
+};
+
+// CHECK: [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_SData %SBufferData %int_0
+// CHECK: [[val:%\d+]] = OpLoad %SData [[ptr]]
+// CHECK:     {{%\d+}} = OpCompositeExtract %_arr_mat3v4float_uint_2 %32 0
+// CHECK:     {{%\d+}} = OpCompositeExtract %_arr_mat3v4float_uint_2_0 %32 1
+static const SData Data = BufferData;
+
+RWStructuredBuffer<float4> Out;
+
+[numthreads(4, 4, 4)]
+void main() {
+// CHECK: [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform__arr_mat3v4float_uint_2 %SBufferData %int_1
+// CHECK:     {{%\d+}} = OpLoad %_arr_mat3v4float_uint_2 [[ptr]]
+  float3x4 a[2] = Mat1;
+// CHECK: [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform__arr_mat3v4float_uint_2_0 %SBufferData %int_2
+// CHECK:     {{%\d+}} = OpLoad %_arr_mat3v4float_uint_2_0 [[ptr]]
+  float3x4 b[2] = Mat2;
+
+  Out[0] = Data.mat1[0][0];
+}

+ 19 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-count.hlsl

@@ -0,0 +1,19 @@
+// Run: %dxc -T cs_6_0 -E main
+
+RWStructuredBuffer<uint> values;
+
+// CHECK: OpCapability SubgroupBallotKHR
+// CHECK: OpExtension "SPV_KHR_shader_ballot"
+
+// CHECK: OpEntryPoint GLCompute
+// CHECK-SAME: %SubgroupSize
+
+// CHECK: OpDecorate %SubgroupSize BuiltIn SubgroupSize
+
+// CHECK: %SubgroupSize = OpVariable %_ptr_Input_uint Input
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+// CHECK: OpLoad %uint %SubgroupSize
+    values[id.x] = WaveGetLaneCount();
+}

+ 19 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave-get-lane-index.hlsl

@@ -0,0 +1,19 @@
+// Run: %dxc -T cs_6_0 -E main
+
+RWStructuredBuffer<uint> values;
+
+// CHECK: OpCapability SubgroupBallotKHR
+// CHECK: OpExtension "SPV_KHR_shader_ballot"
+
+// CHECK: OpEntryPoint GLCompute
+// CHECK-SAME: %SubgroupLocalInvocationId
+
+// CHECK: OpDecorate %SubgroupLocalInvocationId BuiltIn SubgroupLocalInvocationId
+
+// CHECK: %SubgroupLocalInvocationId = OpVariable %_ptr_Input_uint Input
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+// CHECK: OpLoad %uint %SubgroupLocalInvocationId
+    values[id.x] = WaveGetLaneIndex();
+}

+ 30 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave-read-lane-first.hlsl

@@ -0,0 +1,30 @@
+// Run: %dxc -T cs_6_0 -E main
+
+// CHECK: OpCapability SubgroupBallotKHR
+// CHECK: OpExtension "SPV_KHR_shader_ballot"
+
+struct S {
+    uint4 val1;
+     int2 val2;
+    float val3;
+};
+
+RWStructuredBuffer<S> values;
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+    uint x = id.x;
+
+    uint4 val1 = values[x].val1;
+     int2 val2 = values[x].val2;
+    float val3 = values[x].val3;
+
+// OpSubgroupFirstInvocationKHR requires that:
+//   Result Type must be a 32-bit integer type or a 32-bit float type scalar.
+
+    // values[x].val1 = WaveReadLaneFirst(val1);
+    // values[x].val2 = WaveReadLaneFirst(val2);
+// CHECK:      [[val3:%\d+]] = OpLoad %float %val3
+// CHECK-NEXT:      {{%\d+}} = OpSubgroupFirstInvocationKHR %float [[val3]]
+    values[x].val3 = WaveReadLaneFirst(val3);
+}

+ 27 - 0
tools/clang/test/CodeGenSPIRV/sm6.wave.builtin.no-dup.hlsl

@@ -0,0 +1,27 @@
+// Run: %dxc -T cs_6_0 -E main
+
+// Some wave ops translate into SPIR-V builtin variables.
+// Test that we are not generating duplicated builtins for multiple calls
+// of the same wave ops.
+RWStructuredBuffer<uint> values;
+
+// CHECK: OpEntryPoint GLCompute
+// CHECK-SAME: %SubgroupSize %SubgroupLocalInvocationId
+
+// CHECK: OpDecorate %SubgroupSize BuiltIn SubgroupSize
+// CHECK-NOT: OpDecorate {{%\w+}} BuiltIn SubgroupSize
+
+// CHECK: OpDecorate %SubgroupLocalInvocationId BuiltIn SubgroupLocalInvocationId
+// CHECK-NOT: OpDecorate {{%\w+}} BuiltIn SubgroupLocalInvocationId
+
+// CHECK: %SubgroupSize = OpVariable %_ptr_Input_uint Input
+// CHECK-NEXT: %SubgroupLocalInvocationId = OpVariable %_ptr_Input_uint Input
+
+[numthreads(32, 1, 1)]
+void main(uint3 id: SV_DispatchThreadID) {
+// CHECK: OpLoad %uint %SubgroupSize
+// CHECK: OpLoad %uint %SubgroupSize
+// CHECK: OpLoad %uint %SubgroupLocalInvocationId
+// CHECK: OpLoad %uint %SubgroupLocalInvocationId
+    values[id.x] = WaveGetLaneCount() + WaveGetLaneCount() + WaveGetLaneIndex() + WaveGetLaneIndex();
+}

+ 1 - 1
tools/clang/test/CodeGenSPIRV/spirv.interface.hs.hlsl

@@ -90,7 +90,7 @@ struct HsPcfOut
 // CHECK: OpDecorate %out_var_BAR Location 0
 // CHECK: OpDecorate %out_var_FOO Location 1
 // CHECK: OpDecorate %out_var_TEXCOORD Location 2
-// CHECK: OpDecorate %out_var_WEIGHT Location 3
+// CHECK: OpDecorate %out_var_WEIGHT Location 6
 
 // Input : clip0 + clip2         : 3 floats
 // Input : cull3 + cull5         : 4 floats

+ 36 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std140.hlsl

@@ -0,0 +1,36 @@
+// Run: %dxc -T vs_6_0 -E main
+
+// CHECK: OpDecorate %_arr_double_uint_3 ArrayStride 16
+// CHECK: OpDecorate %_arr_mat2v3double_uint_2 ArrayStride 64
+// CHECK: OpDecorate %_arr_v2long_uint_1 ArrayStride 16
+
+// CHECK: OpMemberDecorate %type_MyCBuffer 0 Offset 0
+// CHECK: OpMemberDecorate %type_MyCBuffer 1 Offset 8
+// CHECK: OpMemberDecorate %type_MyCBuffer 2 Offset 16
+// CHECK: OpMemberDecorate %type_MyCBuffer 3 Offset 64
+// CHECK: OpMemberDecorate %type_MyCBuffer 4 Offset 96
+// CHECK: OpMemberDecorate %type_MyCBuffer 5 Offset 128
+// CHECK: OpMemberDecorate %type_MyCBuffer 5 MatrixStride 32
+// CHECK: OpMemberDecorate %type_MyCBuffer 5 RowMajor
+// CHECK: OpMemberDecorate %type_MyCBuffer 6 Offset 192
+// CHECK: OpMemberDecorate %type_MyCBuffer 7 Offset 208
+// CHECK: OpMemberDecorate %type_MyCBuffer 8 Offset 224
+// CHECK: OpMemberDecorate %type_MyCBuffer 8 MatrixStride 32
+// CHECK: OpMemberDecorate %type_MyCBuffer 8 ColMajor
+// CHECK: OpMemberDecorate %type_MyCBuffer 9 Offset 352
+
+
+cbuffer MyCBuffer{               // Alignment | Offset + Size                 = Next
+              float     f1;      // 0         | 0        4                      4
+              uint64_t  f2;      // 8         | 8        8                      16
+              double    f3[3];   // 16        | 16       16 (stride) * 3        64
+              float     f4;      // 4         | 64       4                      68
+              int64_t3  f5;      // 32        | 96       8 * 3                  120
+              double3x2 f6;      // 32        | 128      32 * 2                 192    // SPIR-V RowMajor
+              double2x1 f7;      // 16        | 192      16                     208
+              float     f8;      // 4         | 208      4                      212
+    row_major double2x3 f9[2];   // 32        | 224      32 * 4                 352    // SPIR-V ColMajor
+              int64_t2  f10[1];  // 16        | 352      16 (stride)            368
+};                               // 32 (max)                                    384
+
+void main() { }

+ 39 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.64bit-types.std430.hlsl

@@ -0,0 +1,39 @@
+// Run: %dxc -T vs_6_0 -E main
+
+// CHECK: OpDecorate %_arr_double_uint_3 ArrayStride 8
+// CHECK: OpDecorate %_arr_mat2v3double_uint_2 ArrayStride 64
+// CHECK: OpDecorate %_arr_v2long_uint_1 ArrayStride 16
+
+// CHECK: OpMemberDecorate %S 0 Offset 0
+// CHECK: OpMemberDecorate %S 1 Offset 8
+// CHECK: OpMemberDecorate %S 2 Offset 16
+// CHECK: OpMemberDecorate %S 3 Offset 40
+// CHECK: OpMemberDecorate %S 4 Offset 64
+// CHECK: OpMemberDecorate %S 5 Offset 96
+// CHECK: OpMemberDecorate %S 5 MatrixStride 32
+// CHECK: OpMemberDecorate %S 5 RowMajor
+// CHECK: OpMemberDecorate %S 6 Offset 160
+// CHECK: OpMemberDecorate %S 7 Offset 176
+// CHECK: OpMemberDecorate %S 8 Offset 192
+// CHECK: OpMemberDecorate %S 8 MatrixStride 32
+// CHECK: OpMemberDecorate %S 8 ColMajor
+// CHECK: OpMemberDecorate %S 9 Offset 320
+
+// CHECK: OpDecorate %_runtimearr_S ArrayStride 352
+
+struct S {                       // Alignment | Offset + Size       = Next
+              float     f1;      // 0         | 0        4            4
+              uint64_t  f2;      // 8         | 8        8            16
+              double    f3[3];   // 8         | 16       8 * 3        40
+              float     f4;      // 4         | 40       4            44
+              int64_t3  f5;      // 32        | 64       8 * 3        88
+              double3x2 f6;      // 32        | 96       32 * 2       160    // SPIR-V RowMajor
+              double2x1 f7;      // 16        | 160      16           176
+              float     f8;      // 4         | 176      4            180
+    row_major double2x3 f9[2];   // 32        | 192      32 * 4       320    // SPIR-V ColMajor
+              int64_t2  f10[1];  // 16        | 320      16           336
+};                               // 32 (max)                          352
+
+StructuredBuffer<S> MySBuffer;
+
+void main() { }

+ 8 - 5
tools/clang/test/CodeGenSPIRV/vk.layout.asbuffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,13 +19,14 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %T 0 Offset 0
-// CHECK: OpMemberDecorate %T 1 Offset 448
+// CHECK: OpMemberDecorate %T 1 Offset 576
 
-// CHECK: OpDecorate %_runtimearr_T ArrayStride 464
+// CHECK: OpDecorate %_runtimearr_T ArrayStride 592
 
 // CHECK: OpMemberDecorate %type_AppendStructuredBuffer_T 0 Offset 0
 // CHECK: OpDecorate %type_AppendStructuredBuffer_T BufferBlock
@@ -36,7 +39,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 struct T {
@@ -49,4 +53,3 @@ AppendStructuredBuffer<T> buffer2;
 float main() : A {
     return 1.0;
 }
-

+ 30 - 26
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.nested.std140.hlsl

@@ -1,39 +1,42 @@
 // Run: %dxc -T vs_6_0 -E main
 
 // Deep nested array of matrices
-// Depp nested majorness
+// Deep nested majorness
 struct R {                         // Alignment    Offset  Size                              Next
     row_major    float2x3 rf1[3];  // 16(vec4)  -> 0     + 3(array) * stride(2 * 16(vec4)) = 96
     column_major float2x3 rf2[4];  // 16(vec4)  -> 96    + 4(array) * stride(3 * 16(vec4)) = 288
                  float2x3 rf3[2];  // 16(vec4)  -> 288   + 2(array) * stride(3 * 16(vec4)) = 384
-                 int      rf4;     // 4         -> 384   + 4                               = 388
-};                                 // 16(max)                                                400 (388 round up to R alignment)
+    row_major    int2x3   rf4[2];  // 16(vec4)  -> 384   + 2(array) * stride(2 * 16(vec4)) = 448
+                 int      rf5;     // 4         -> 448   + 4                               = 452
+};                                 // 16(max)                                                464 (452 round up to R alignment)
 
 // Array of scalars, vectors, matrices, and structs
 struct S {                         // Alignment   Offset  Size                              Next
     float3       sf1[3];           // 16(vec4) -> 0     + 3(array) * 16(vec4)             = 48
     float        sf2[3];           // 4        -> 48    + 3(array) * 16(vec4)             = 96
-    R            sf3[4];           // 16       -> 96    + 4(array) * stride(400)          = 1696
-    row_major    float3x2 sf4[2];  // 16(vec4) -> 1696  + 2(array) * stride(3 * 16(vec4)) = 1792
-    column_major float3x2 sf5[3];  // 16(vec4) -> 1792  + 3(array) * stride(2 * 16(vec4)) = 1888
-                 float3x2 sf6[4];  // 16(vec4) -> 1888  + 4(array) * stride(2 * 16(vec4)) = 2016
-                 float    sf7;     // 4        -> 2016  + 4                               = 2020
-};                                 // 16(max)                                               2032 (2020 round up to S alignment)
+    R            sf3[4];           // 16       -> 96    + 4(array) * stride(464)          = 1952
+    row_major    float3x2 sf4[2];  // 16(vec4) -> 1952  + 2(array) * stride(3 * 16(vec4)) = 2048
+    column_major float3x2 sf5[3];  // 16(vec4) -> 2048  + 3(array) * stride(2 * 16(vec4)) = 2144
+                 float3x2 sf6[4];  // 16(vec4) -> 2144  + 4(array) * stride(2 * 16(vec4)) = 2272
+                 float    sf7;     // 4        -> 2272  + 4                               = 2276
+};                                 // 16(max)                                               2288 (2276 round up to S alignment)
 
 struct T {        // Alignment    Offset  Size              Next
-    R    tf1[2];  // 16        -> 0     + 2(array) * 400  = 800
-    S    tf2[3];  // 16        -> 800   + 3(array) * 2032 = 6896
-    uint tf3;     // 4         -> 6896  + 4               = 6900
-};                // 16(max)                                6912 (6900 round up to T alignment)
+    R    tf1[2];  // 16        -> 0     + 2(array) * 464  = 928
+    S    tf2[3];  // 16        -> 928   + 3(array) * 2288 = 7792
+    uint tf3;     // 4         -> 7792  + 4               = 7796
+};                // 16(max)                                7808 (7796 round up to T alignment)
 
 cbuffer MyCbuffer {  // Alignment   Offset   Size              Next
-    T    t[2];       // 16       -> 0      + 2(array) * 6912 = 13824
-    bool z;          // 4        -> 13824
+    T    t[2];       // 16       -> 0      + 2(array) * 7808 = 15616
+    bool z;          // 4        -> 15616
 };
 
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_4 ArrayStride 48
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 48
+// CHECK:      OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK:      OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK:      OpMemberDecorate %R 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %R 0 MatrixStride 16
@@ -45,11 +48,12 @@ cbuffer MyCbuffer {  // Alignment   Offset   Size              Next
 // CHECK-NEXT: OpMemberDecorate %R 2 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %R 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %R 3 Offset 384
+// CHECK-NEXT: OpMemberDecorate %R 4 Offset 448
 
-// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 400
+// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 464
 // CHECK:      OpDecorate %_arr_v3float_uint_3 ArrayStride 16
 // CHECK:      OpDecorate %_arr_float_uint_3 ArrayStride 16
-// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 400
+// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 464
 
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_2 ArrayStride 48
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_3 ArrayStride 32
@@ -58,27 +62,27 @@ cbuffer MyCbuffer {  // Alignment   Offset   Size              Next
 // CHECK:      OpMemberDecorate %S 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %S 1 Offset 48
 // CHECK-NEXT: OpMemberDecorate %S 2 Offset 96
-// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1696
+// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1952
 // CHECK-NEXT: OpMemberDecorate %S 3 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 3 ColMajor
-// CHECK-NEXT: OpMemberDecorate %S 4 Offset 1792
+// CHECK-NEXT: OpMemberDecorate %S 4 Offset 2048
 // CHECK-NEXT: OpMemberDecorate %S 4 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 4 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 5 Offset 1888
+// CHECK-NEXT: OpMemberDecorate %S 5 Offset 2144
 // CHECK-NEXT: OpMemberDecorate %S 5 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 5 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 6 Offset 2016
+// CHECK-NEXT: OpMemberDecorate %S 6 Offset 2272
 
-// CHECK:      OpDecorate %_arr_S_uint_3 ArrayStride 2032
+// CHECK-NEXT: OpDecorate %_arr_S_uint_3 ArrayStride 2288
 
 // CHECK:      OpMemberDecorate %T 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %T 1 Offset 800
-// CHECK-NEXT: OpMemberDecorate %T 2 Offset 6896
+// CHECK-NEXT: OpMemberDecorate %T 1 Offset 928
+// CHECK-NEXT: OpMemberDecorate %T 2 Offset 7792
 
-// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 6912
+// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 7808
 
 // CHECK-NEXT: OpMemberDecorate %type_MyCbuffer 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %type_MyCbuffer 1 Offset 13824
+// CHECK-NEXT: OpMemberDecorate %type_MyCbuffer 1 Offset 15616
 
 // CHECK:      OpDecorate %type_MyCbuffer Block
 float main() : A {

+ 14 - 11
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.std140.hlsl

@@ -11,13 +11,14 @@ struct S {      // Alignment    Offset                                Size
     float  sf4; // 4         -> 44                                  + 4         = 48
 };              // 16(max)                                                        48(48 round up to S max alignment)
 
-struct T {           // Alignment     Offset                               Size              = Next
-    int      tf1;    // 4          -> 0                                  + 4                 = 4
-    R        tf2[3]; // 16         -> 16 (4 rounded up to R alignment)   + 3 * stride(16)    = 64
-    float3x2 tf3;    // 16(vec4)   -> 64 (64 round up to vec4 alignment) + 2 * stride(vec4)  = 96
-    S        tf4;    // 16         -> 96 (96 round up to S alignment)    + 48                = 144
-    float    tf5;    // 4          -> 144                                + 4                 = 148
-};                   // 16(max)                                                                160(148 round up to T max alignment)
+struct T {                     // Alignment     Offset                               Size              = Next
+              int      tf1;    // 4          -> 0                                  + 4                 = 4
+              R        tf2[3]; // 16         -> 16 (4 rounded up to R alignment)   + 3 * stride(16)    = 64
+              float3x2 tf3;    // 16(vec4)   -> 64 (64 round up to vec4 alignment) + 2 * stride(vec4)  = 96
+              S        tf4;    // 16         -> 96 (96 round up to S alignment)    + 48                = 144
+              float    tf5;    // 4          -> 144                                + 4                 = 148
+    row_major int3x2   tf6;    // 16(vec4)   -> 160 (148 rounded up to vec4)       + 3 * stride(vec4)  = 208
+};                             // 16(max)                                                                208(208 round up to T max alignment)
 
 cbuffer MyCBuffer {              // Alignment   Offset                                 Size                     Next
                  bool     a;     // 4        -> 0                                    +     4                  = 4
@@ -28,9 +29,9 @@ cbuffer MyCBuffer {              // Alignment   Offset
                  float2x1 f;     // 8(vec2)  -> 112 (112 round up to vec2 aligment)  + 2 * 4                  = 120
     row_major    float2x3 g[3];  // 16(vec4) -> 128 (120 round up to vec4 alignment) + 3 * 2 * stride(vec4)   = 224
     column_major float2x2 h[4];  // 16(vec4) -> 224 (224 round up to vec4 alignment) + 4 * 2 * stride(vec4)   = 352
-                 T        t;     // 16       -> 352 (352 round up to vec4 alignment) + 160                    = 512
-                 float    z;     // 4        -> 512
-
+                 T        t;     // 16       -> 352 (352 round up to vec4 alignment) + 208                    = 560
+    row_major    int2x3   y;     // 16(vec4) -> 560 (560 round up to vec4 alignment) + 2 * stride(vec4)       = 592
+                 float    z;     // 4        -> 592
 };
 
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
@@ -52,6 +53,7 @@ cbuffer MyCBuffer {              // Alignment   Offset
 // CHECK-NEXT: OpMemberDecorate %T 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %T 3 Offset 96
 // CHECK-NEXT: OpMemberDecorate %T 4 Offset 144
+// CHECK-NEXT: OpMemberDecorate %T 5 Offset 160
 
 // CHECK:      OpMemberDecorate %type_MyCBuffer 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 1 Offset 4
@@ -70,7 +72,8 @@ cbuffer MyCBuffer {              // Alignment   Offset
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 7 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 7 RowMajor
 // CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 8 Offset 352
-// CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 9 Offset 512
+// CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 9 Offset 560
+// CHECK-NEXT: OpMemberDecorate %type_MyCBuffer 10 Offset 592
 // CHECK-NEXT: OpDecorate %type_MyCBuffer Block
 
 float main() : A {

+ 20 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpc.hlsl

@@ -3,6 +3,9 @@
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5_0 ArrayStride 48
 
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_5 ArrayStride 32
+
 // CHECK: OpMemberDecorate %type_MyCBuffer 0 ColMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 1 RowMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 2 RowMajor
@@ -12,6 +15,8 @@ cbuffer MyCBuffer {
     row_major    float2x3 matrices1[5];
     column_major float2x3 matrices2[5];
                  float2x3 matrices3[5];
+
+    row_major    int2x3   matrices4[5];
 }
 
 void main() {
@@ -22,4 +27,19 @@ void main() {
     float2x3 m1 = matrices1[1];
     float2x3 m2 = matrices2[2];
     float2x3 m3 = matrices3[3];
+
+    // Note: Since non-fp matrices are represented as arrays of vectors, and
+    // due to layout decoration on the rhs of the assignments below,
+    // a load and store is performed for each vector.
+
+// CHECK:          [[ptr_matrices4:%\d+]] = OpAccessChain %_ptr_Uniform__arr__arr_v3int_uint_2_uint_5 %MyCBuffer %int_3
+// CHECK-NEXT:   [[ptr_matrices4_1:%\d+]] = OpAccessChain %_ptr_Uniform__arr_v3int_uint_2 [[ptr_matrices4]] %int_1
+// CHECK-NEXT:       [[matrices4_1:%\d+]] = OpLoad %_arr_v3int_uint_2 [[ptr_matrices4_1]]
+// CHECK-NEXT:  [[matrices4_1_row0:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 0
+// CHECK-NEXT:       [[ptr_m4_row0:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_0
+// CHECK-NEXT:                              OpStore [[ptr_m4_row0]] [[matrices4_1_row0]]
+// CHECK-NEXT:  [[matrices4_1_row1:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 1
+// CHECK-NEXT:       [[ptr_m4_row1:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_1
+// CHECK-NEXT:                              OpStore [[ptr_m4_row1]] [[matrices4_1_row1]]
+    int2x3 m4 = matrices4[1];
 }

+ 31 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.cbuffer.zpr.hlsl

@@ -3,6 +3,9 @@
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_5_0 ArrayStride 48
 
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_5 ArrayStride 32
+
 // CHECK: OpMemberDecorate %type_MyCBuffer 0 ColMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 1 RowMajor
 // CHECK: OpMemberDecorate %type_MyCBuffer 2 ColMajor
@@ -12,6 +15,9 @@ cbuffer MyCBuffer {
     row_major    float2x3 matrices1[5];
     column_major float2x3 matrices2[5];
                  float2x3 matrices3[5];
+
+    row_major    int2x3   matrices4[5];
+                 int2x3   matrices5[5];
 }
 
 void main() {
@@ -22,4 +28,29 @@ void main() {
     float2x3 m1 = matrices1[1];
     float2x3 m2 = matrices2[2];
     float2x3 m3 = matrices3[3];
+
+    // Note: Since non-fp matrices are represented as arrays of vectors, and
+    // due to layout decoration on the rhs of the assignments below,
+    // a load and store is performed for each vector.
+
+// CHECK:          [[ptr_matrices4:%\d+]] = OpAccessChain %_ptr_Uniform__arr__arr_v3int_uint_2_uint_5 %MyCBuffer %int_3
+// CHECK-NEXT:   [[ptr_matrices4_1:%\d+]] = OpAccessChain %_ptr_Uniform__arr_v3int_uint_2 [[ptr_matrices4]] %int_1
+// CHECK-NEXT:       [[matrices4_1:%\d+]] = OpLoad %_arr_v3int_uint_2 [[ptr_matrices4_1]]
+// CHECK-NEXT:  [[matrices4_1_row0:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 0
+// CHECK-NEXT:       [[ptr_m4_row0:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_0
+// CHECK-NEXT:                              OpStore [[ptr_m4_row0]] [[matrices4_1_row0]]
+// CHECK-NEXT:  [[matrices4_1_row1:%\d+]] = OpCompositeExtract %v3int [[matrices4_1]] 1
+// CHECK-NEXT:       [[ptr_m4_row1:%\d+]] = OpAccessChain %_ptr_Function_v3int %m4 %uint_1
+// CHECK-NEXT:                              OpStore [[ptr_m4_row1]] [[matrices4_1_row1]]
+    int2x3 m4 = matrices4[1];
+// CHECK:          [[ptr_matrices5:%\d+]] = OpAccessChain %_ptr_Uniform__arr__arr_v3int_uint_2_uint_5 %MyCBuffer %int_4
+// CHECK-NEXT:   [[ptr_matrices5_2:%\d+]] = OpAccessChain %_ptr_Uniform__arr_v3int_uint_2 [[ptr_matrices5]] %int_2
+// CHECK-NEXT:       [[matrices5_2:%\d+]] = OpLoad %_arr_v3int_uint_2 [[ptr_matrices5_2]]
+// CHECK-NEXT: [[matrices_5_2_row0:%\d+]] = OpCompositeExtract %v3int [[matrices5_2]] 0
+// CHECK-NEXT:       [[ptr_m5_row0:%\d+]] = OpAccessChain %_ptr_Function_v3int %m5 %uint_0
+// CHECK-NEXT:                              OpStore [[ptr_m5_row0]] [[matrices_5_2_row0]]
+// CHECK-NEXT: [[matrices_5_2_row1:%\d+]] = OpCompositeExtract %v3int [[matrices5_2]] 1
+// CHECK-NEXT:       [[ptr_m5_row1:%\d+]] = OpAccessChain %_ptr_Function_v3int %m5 %uint_1
+// CHECK-NEXT:                              OpStore [[ptr_m5_row1]] [[matrices_5_2_row1]]
+    int2x3 m5 = matrices5[2];
 }

+ 8 - 4
tools/clang/test/CodeGenSPIRV/vk.layout.csbuffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,13 +19,14 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %T 0 Offset 0
-// CHECK: OpMemberDecorate %T 1 Offset 448
+// CHECK: OpMemberDecorate %T 1 Offset 576
 
-// CHECK: OpDecorate %_runtimearr_T ArrayStride 464
+// CHECK: OpDecorate %_runtimearr_T ArrayStride 592
 
 // CHECK: OpMemberDecorate %type_ConsumeStructuredBuffer_T 0 Offset 0
 // CHECK: OpDecorate %type_ConsumeStructuredBuffer_T BufferBlock
@@ -36,7 +39,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 struct T {

+ 20 - 0
tools/clang/test/CodeGenSPIRV/vk.layout.non-fp-matrix.error.hlsl

@@ -0,0 +1,20 @@
+// Run: %dxc -T vs_6_0 -E main
+
+cbuffer MyCBuffer {
+  struct S {
+    int2x3   matrices4[5];
+  } s;
+}
+
+struct T {
+    int2x3   t[5];
+};
+
+RWStructuredBuffer<T> rwsb;
+
+void main() {
+   int2x3 m4 = s.matrices4[1];
+}
+
+// CHECK: :6:5: error: externally initialized non-floating-point column-major matrices not supported yet
+// CHECK: :13:23: error: externally initialized non-floating-point column-major matrices not supported yet

+ 14 - 3
tools/clang/test/CodeGenSPIRV/vk.layout.push-constant.std430.hlsl

@@ -2,28 +2,39 @@
 
 // CHECK: OpDecorate %_arr_v2float_uint_3 ArrayStride 8
 // CHECK: OpDecorate %_arr_mat3v2float_uint_2 ArrayStride 32
+// CHECK: OpDecorate %_arr_v2int_uint_3 ArrayStride 8
+// CHECK: OpDecorate %_arr__arr_v2int_uint_3_uint_2 ArrayStride 24
 
 // CHECK: OpMemberDecorate %T 0 Offset 0
 // CHECK: OpMemberDecorate %T 1 Offset 32
 // CHECK: OpMemberDecorate %T 1 MatrixStride 16
 // CHECK: OpMemberDecorate %T 1 RowMajor
+// CHECK: OpMemberDecorate %T 2 Offset 96
+// CHECK: OpMemberDecorate %T 3 Offset 144
+// CHECK: OpMemberDecorate %T 3 MatrixStride 8
+// CHECK: OpMemberDecorate %T 3 ColMajor
 struct T {
                  float2   f1[3];
     column_major float3x2 f2[2];
+    row_major    int3x2   f4[2];
+    row_major    float3x2 f3[2];
 };
 
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
 // CHECK: OpMemberDecorate %type_PushConstant_S 0 Offset 0
 // CHECK: OpMemberDecorate %type_PushConstant_S 1 Offset 16
 // CHECK: OpMemberDecorate %type_PushConstant_S 2 Offset 32
-// CHECK: OpMemberDecorate %type_PushConstant_S 3 Offset 128
-// CHECK: OpMemberDecorate %type_PushConstant_S 3 MatrixStride 16
-// CHECK: OpMemberDecorate %type_PushConstant_S 3 ColMajor
+// CHECK: OpMemberDecorate %type_PushConstant_S 3 Offset 224
+// CHECK: OpMemberDecorate %type_PushConstant_S 4 Offset 256
+// CHECK: OpMemberDecorate %type_PushConstant_S 4 MatrixStride 16
+// CHECK: OpMemberDecorate %type_PushConstant_S 4 ColMajor
 
 // CHECK: OpDecorate %type_PushConstant_S Block
 struct S {
               float    f1;
               float3   f2;
               T        f4;
+    row_major int2x3   f5;
     row_major float2x3 f3;
 };
 

+ 34 - 26
tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.nested.std430.hlsl

@@ -6,29 +6,31 @@ struct R {                         // Alignment    Offset  Size
     row_major    float2x3 rf1[3];  // 16(vec4)  -> 0     + 3(array) * stride(2 * 16(vec4)) = 96
     column_major float2x3 rf2[4];  // 8(vec2)   -> 96    + 4(array) * stride(3 * 8(vec2))  = 192
                  float2x3 rf3[2];  // 8(vec2)   -> 192   + 2(array) * stride(3 * 8(vec2))  = 240
-                 int      rf4;     // 4         -> 240   + 4                               = 244
-};                                 // 16(max)                                                256 (244 round up to R alignment)
+    row_major    int2x3   rf4[3];  // 16(vec4)  -> 240   + 3(array) * stride(2 * 16(vec4)) = 336
+                 int      rf5;     // 4         -> 336   + 4                               = 340
+};                                 // 16(max)                                                352 (340 round up to R alignment)
 
 // Array of scalars, vectors, matrices, and structs
 struct S {                         // Alignment   Offset  Size                              Next
     float3       sf1[3];           // 16(vec4) -> 0     + 3(array) * 16(vec4)             = 48
     float        sf2[3];           // 4        -> 48    + 3(array) * 4                    = 60
-    R            sf3[4];           // 16       -> 64    + 4(array) * stride(256)          = 1088
-    row_major    float3x2 sf4[2];  // 8(vec2)  -> 1088  + 2(array) * stride(3 * 8(vec2))  = 1136
-    column_major float3x2 sf5[3];  // 16(vec4) -> 1136  + 3(array) * stride(2 * 16(vec4)) = 1232
-                 float3x2 sf6[4];  // 16(vec4) -> 1232  + 4(array) * stride(2 * 16(vec4)) = 1360
-                 float    sf7;     // 4        -> 1360  + 4                               = 1364
-};                                 // 16(max)                                               1376 (1364 round up to S alignment)
+    R            sf3[4];           // 16       -> 64    + 4(array) * stride(352)          = 1472
+    row_major    float3x2 sf4[2];  // 8(vec2)  -> 1472  + 2(array) * stride(3 * 8(vec2))  = 1520
+    column_major float3x2 sf5[3];  // 16(vec4) -> 1520  + 3(array) * stride(2 * 16(vec4)) = 1616
+                 float3x2 sf6[4];  // 16(vec4) -> 1616  + 4(array) * stride(2 * 16(vec4)) = 1744
+    row_major    int3x2   sf7[2];  // 8(vec2)  -> 1744  + 2(array) * stride(3 * 8(vec2))  = 1792
+                 float    sf8;     // 4        -> 1792  + 4                               = 1796
+};                                 // 16(max)                                               1808 (1796 round up to S alignment)
 
 struct T {        // Alignment    Offset  Size              Next
-    R    tf1[2];  // 16        -> 0     + 2(array) * 256  = 512
-    S    tf2[3];  // 16        -> 512   + 3(array) * 1376 = 4640
-    uint tf3;     // 4         -> 4640  + 4               = 4644
-};                // 16(max)                                4656 (4640 round up to T alignment)
+    R    tf1[2];  // 16        -> 0     + 2(array) * 352  = 704
+    S    tf2[3];  // 16        -> 704   + 3(array) * 1808 = 6128
+    uint tf3;     // 4         -> 6128  + 4               = 6132
+};                // 16(max)                                6144 (6132 round up to T alignment)
 
 struct SBuffer {  // Alignment   Offset   Size                 Next
-    T    t[2];       // 16       -> 0      + 2(array) * 4656 = 9312
-    bool z;          // 4        -> 9312
+    T    t[2];       // 16       -> 0      + 2(array) * 6144 = 12288
+    bool z;          // 4        -> 12288
 };
 
 RWStructuredBuffer<SBuffer> MySBuffer;
@@ -36,6 +38,8 @@ RWStructuredBuffer<SBuffer> MySBuffer;
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_4 ArrayStride 24
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 24
+// CHECK:      OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK:      OpDecorate %_arr__arr_v3int_uint_2_uint_3 ArrayStride 32
 
 // CHECK:      OpMemberDecorate %R 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %R 0 MatrixStride 16
@@ -47,42 +51,46 @@ RWStructuredBuffer<SBuffer> MySBuffer;
 // CHECK-NEXT: OpMemberDecorate %R 2 MatrixStride 8
 // CHECK-NEXT: OpMemberDecorate %R 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %R 3 Offset 240
+// CHECK-NEXT: OpMemberDecorate %R 4 Offset 336
 
-// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 256
+// CHECK:      OpDecorate %_arr_R_uint_2 ArrayStride 352
 // CHECK:      OpDecorate %_arr_v3float_uint_3 ArrayStride 16
 // CHECK:      OpDecorate %_arr_float_uint_3 ArrayStride 4
-// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 256
+// CHECK:      OpDecorate %_arr_R_uint_4 ArrayStride 352
 
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_2 ArrayStride 24
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat3v2float_uint_4 ArrayStride 32
+// CHECK:      OpDecorate %_arr_v2int_uint_3 ArrayStride 8
+// CHECK:      OpDecorate %_arr__arr_v2int_uint_3_uint_2 ArrayStride 24
 
 // CHECK:      OpMemberDecorate %S 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %S 1 Offset 48
 // CHECK-NEXT: OpMemberDecorate %S 2 Offset 64
-// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1088
+// CHECK-NEXT: OpMemberDecorate %S 3 Offset 1472
 // CHECK-NEXT: OpMemberDecorate %S 3 MatrixStride 8
 // CHECK-NEXT: OpMemberDecorate %S 3 ColMajor
-// CHECK-NEXT: OpMemberDecorate %S 4 Offset 1136
+// CHECK-NEXT: OpMemberDecorate %S 4 Offset 1520
 // CHECK-NEXT: OpMemberDecorate %S 4 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 4 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 5 Offset 1232
+// CHECK-NEXT: OpMemberDecorate %S 5 Offset 1616
 // CHECK-NEXT: OpMemberDecorate %S 5 MatrixStride 16
 // CHECK-NEXT: OpMemberDecorate %S 5 RowMajor
-// CHECK-NEXT: OpMemberDecorate %S 6 Offset 1360
+// CHECK-NEXT: OpMemberDecorate %S 6 Offset 1744
+// CHECK-NEXT: OpMemberDecorate %S 7 Offset 1792
 
-// CHECK:      OpDecorate %_arr_S_uint_3 ArrayStride 1376
+// CHECK:      OpDecorate %_arr_S_uint_3 ArrayStride 1808
 
 // CHECK:      OpMemberDecorate %T 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %T 1 Offset 512
-// CHECK-NEXT: OpMemberDecorate %T 2 Offset 4640
+// CHECK-NEXT: OpMemberDecorate %T 1 Offset 704
+// CHECK-NEXT: OpMemberDecorate %T 2 Offset 6128
 
-// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 4656
+// CHECK:      OpDecorate %_arr_T_uint_2 ArrayStride 6144
 
 // CHECK-NEXT: OpMemberDecorate %SBuffer 0 Offset 0
-// CHECK-NEXT: OpMemberDecorate %SBuffer 1 Offset 9312
+// CHECK-NEXT: OpMemberDecorate %SBuffer 1 Offset 12288
 
-// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 9328
+// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 12304
 
 // CHECK:      OpMemberDecorate %type_RWStructuredBuffer_SBuffer 0 Offset 0
 // CHECK-NEXT: OpDecorate %type_RWStructuredBuffer_SBuffer BufferBlock

+ 18 - 11
tools/clang/test/CodeGenSPIRV/vk.layout.sbuffer.std430.hlsl

@@ -11,13 +11,14 @@ struct S {      // Alignment    Offset                                Size
     float  sf4; // 4         -> 28                                  + 4         = 32
 };              // 16(max)                                                        32
 
-struct T {           // Alignment     Offset                               Size              = Next
-    int      tf1;    // 4          -> 0                                  + 4                 = 4
-    R        tf2[3]; // 8          -> 8                                  + 3 * stride(8)     = 32
-    float3x2 tf3;    // 16(vec4)   -> 32 (32 round up to vec4 alignment) + 2 * stride(vec4)  = 64
-    S        tf4;    // 16         -> 64 (64 round up to S alignment)    + 32                = 96
-    float    tf5;    // 4          -> 96                                 + 4                 = 100
-};                   // 16(max)                                                                112(100 round up to T max alignment)
+struct T {                      // Alignment     Offset                               Size              = Next
+               int      tf1;    // 4          -> 0                                  + 4                 = 4
+               R        tf2[3]; // 8          -> 8                                  + 3 * stride(8)     = 32
+               float3x2 tf3;    // 16(vec4)   -> 32 (32 round up to vec4 alignment) + 2 * stride(vec4)  = 64
+  row_major    int3x2   tf4;    // 16(vec4)   -> 64 (64 round up to vec4 alignment) + 3 * stride(vec2)  = 88
+               S        tf5;    // 16         -> 96 (88 round up to S alignment)    + 32                = 128
+               float    tf6;    // 4          -> 128                                + 4                 = 132
+};                              // 16(max)                                                                144(132 round up to T max alignment)
 
 struct SBuffer {              // Alignment   Offset                                 Size                     Next
                  bool     a;     // 4        -> 0                                    +     4                  = 4
@@ -28,8 +29,9 @@ struct SBuffer {              // Alignment   Offset
                  float2x1 f;     // 8(vec2)  -> 88 (88 round up to vec2 aligment)    + 2 * 4                  = 96
     row_major    float2x3 g[3];  // 16(vec4) -> 96 (96 round up to vec4 alignment)   + 3 * 2 * stride(vec4)   = 192
     column_major float2x2 h[4];  // 16(vec4) -> 192 (192 round up to vec2 alignment) + 4 * 2 * stride(vec2)   = 256
-                 T        t;     // 16       -> 256 (352 round up to T alignment)    + 112                    = 368
-                 float    z;     // 4        -> 368
+    row_major    int2x3   i[5];  // 16(vec4) -> 256 (256 round up to vec4 alignment) + 5 * 2 * stride(vec4)   = 416
+                 T        t;     // 16       -> 416 (416 round up to T alignment)    + 144                    = 560
+                 float    z;     // 4        -> 560
 
 };
 
@@ -37,10 +39,13 @@ StructuredBuffer<SBuffer> MySBuffer;
 
 // CHECK:      OpDecorate %_arr_mat2v3float_uint_3 ArrayStride 32
 // CHECK:      OpDecorate %_arr_mat2v2float_uint_4 ArrayStride 16
+// CHECK:      OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK:      OpDecorate %_arr__arr_v3int_uint_2_uint_5 ArrayStride 32
 
 // CHECK:      OpMemberDecorate %R 0 Offset 0
 
 // CHECK:      OpDecorate %_arr_R_uint_3 ArrayStride 8
+// CHECK:      OpDecorate %_arr_v2int_uint_3 ArrayStride 8
 
 // CHECK:      OpMemberDecorate %S 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %S 1 Offset 8
@@ -54,6 +59,7 @@ StructuredBuffer<SBuffer> MySBuffer;
 // CHECK-NEXT: OpMemberDecorate %T 2 RowMajor
 // CHECK-NEXT: OpMemberDecorate %T 3 Offset 64
 // CHECK-NEXT: OpMemberDecorate %T 4 Offset 96
+// CHECK-NEXT: OpMemberDecorate %T 5 Offset 128
 
 // CHECK:      OpMemberDecorate %SBuffer 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %SBuffer 1 Offset 4
@@ -72,9 +78,10 @@ StructuredBuffer<SBuffer> MySBuffer;
 // CHECK-NEXT: OpMemberDecorate %SBuffer 7 MatrixStride 8
 // CHECK-NEXT: OpMemberDecorate %SBuffer 7 RowMajor
 // CHECK-NEXT: OpMemberDecorate %SBuffer 8 Offset 256
-// CHECK-NEXT: OpMemberDecorate %SBuffer 9 Offset 368
+// CHECK-NEXT: OpMemberDecorate %SBuffer 9 Offset 416
+// CHECK-NEXT: OpMemberDecorate %SBuffer 10 Offset 560
 
-// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 384
+// CHECK:      OpDecorate %_runtimearr_SBuffer ArrayStride 576
 
 // CHECK:      OpMemberDecorate %type_StructuredBuffer_SBuffer 0 Offset 0
 // CHECK-NEXT: OpMemberDecorate %type_StructuredBuffer_SBuffer 0 NonWritable

+ 7 - 3
tools/clang/test/CodeGenSPIRV/vk.layout.tbuffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,11 +19,12 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %type_myTbuffer 0 Offset 0
-// CHECK: OpMemberDecorate %type_myTbuffer 1 Offset 448
+// CHECK: OpMemberDecorate %type_myTbuffer 1 Offset 576
 
 // CHECK: OpDecorate %type_myTbuffer BufferBlock
 
@@ -34,7 +37,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 tbuffer myTbuffer : register(t0)

+ 7 - 3
tools/clang/test/CodeGenSPIRV/vk.layout.texture-buffer.std430.hlsl

@@ -4,6 +4,8 @@
 // CHECK: OpDecorate %_arr_v3float_uint_2 ArrayStride 16
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2 ArrayStride 32
 // CHECK: OpDecorate %_arr_mat2v3float_uint_2_0 ArrayStride 24
+// CHECK: OpDecorate %_arr_v3int_uint_2 ArrayStride 16
+// CHECK: OpDecorate %_arr__arr_v3int_uint_2_uint_2 ArrayStride 32
 
 // CHECK: OpMemberDecorate %S 0 Offset 0
 // CHECK: OpMemberDecorate %S 1 Offset 16
@@ -17,11 +19,12 @@
 // CHECK: OpMemberDecorate %S 4 MatrixStride 8
 // CHECK: OpMemberDecorate %S 4 RowMajor
 // CHECK: OpMemberDecorate %S 5 Offset 208
+// CHECK: OpMemberDecorate %S 6 Offset 272
 
-// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 224
+// CHECK: OpDecorate %_arr_S_uint_2 ArrayStride 288
 
 // CHECK: OpMemberDecorate %type_TextureBuffer_T 0 Offset 0
-// CHECK: OpMemberDecorate %type_TextureBuffer_T 1 Offset 448
+// CHECK: OpMemberDecorate %type_TextureBuffer_T 1 Offset 576
 
 // CHECK: OpDecorate %type_TextureBuffer_T BufferBlock
 
@@ -34,7 +37,8 @@ struct S {
     row_major    float2x3 c[2];
     column_major float2x3 d[2];
                  float2x3 e[2];
-                 int      f;
+    row_major    int2x3   f[2];
+                 int      g;
 };
 
 struct T {

+ 46 - 0
tools/clang/test/CodeGenSPIRV/vk.location.composite.hlsl

@@ -0,0 +1,46 @@
+// Run: %dxc -T vs_6_0 -E main
+
+// CHECK: OpDecorate %in_var_A Location 0
+// CHECK: OpDecorate %in_var_B Location 1
+// CHECK: OpDecorate %in_var_C Location 2
+// CHECK: OpDecorate %in_var_D Location 4
+// CHECK: OpDecorate %in_var_E Location 6
+// CHECK: OpDecorate %in_var_F Location 8
+// CHECK: OpDecorate %in_var_G Location 16
+
+// CHECK: OpDecorate %out_var_A Location 0
+// CHECK: OpDecorate %out_var_B Location 2
+// CHECK: OpDecorate %out_var_C Location 3
+// CHECK: OpDecorate %out_var_D Location 4
+// CHECK: OpDecorate %out_var_E Location 5
+// CHECK: OpDecorate %out_var_F Location 11
+// CHECK: OpDecorate %out_var_G Location 13
+// CHECK: OpDecorate %out_var_H Location 14
+
+struct S {
+    half2x3  matrix2x3 : A; // 0 (+2)
+    float1x2 vector1x2 : B; // 2 (+1)
+    float3x1 vector3x1 : C; // 3 (+1)
+    float1x1 scalar1x1 : D; // 4 (+1)
+};
+
+struct T {
+    S        s;
+    float2x3 array1[3] : E; // 5  (+2*3)
+    half1x2  array2[2] : F; // 11 (+1*2)
+    half3x1  array3[1] : G; // 13 (+1*1)
+    float    array4[4] : H; // 14 (+1*4)
+};
+
+T main(
+    double    a   : A, // 0  (+1)
+    double2   b   : B, // 1  (+1)
+    double3   c   : C, // 2  (+2)
+    double4   d   : D, // 4  (+2)
+    double2x2 e   : E, // 6  (+1*2)
+    double2x3 f[2]: F, // 8  (+2*2*2)
+    double2x3 g   : G  // 16 (+2*2)
+) {
+    T t = (T)0;
+    return t;
+}

+ 0 - 0
tools/clang/test/vk.cloption.invert-y.vs.hlsl


+ 8 - 3
tools/clang/tools/dxcompiler/dxcontainerbuilder.cpp

@@ -98,8 +98,12 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::AddPart(_In_ UINT32 fourCC, _In_
     IFTBOOL(pSource != nullptr && !IsDxilContainerLike(pSource->GetBufferPointer(),
       pSource->GetBufferSize()),
       E_INVALIDARG);
-    // Only allow adding private data and root signature for now
-    IFTBOOL(fourCC == DxilFourCC::DFCC_RootSignature || fourCC == DxilFourCC::DFCC_PrivateData, E_INVALIDARG);
+    // Only allow adding private data, debug info name and root signature for now
+    IFTBOOL(
+        fourCC == DxilFourCC::DFCC_RootSignature || 
+        fourCC == DxilFourCC::DFCC_ShaderDebugName ||
+        fourCC == DxilFourCC::DFCC_PrivateData, 
+      E_INVALIDARG);
     PartList::iterator it = std::find_if(m_parts.begin(), m_parts.end(), [&](DxilPart part) {
       return part.m_fourCC == fourCC;
     });
@@ -117,9 +121,10 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::RemovePart(_In_ UINT32 fourCC) {
   DxcThreadMalloc TM(m_pMalloc);
   try {
     IFTBOOL(fourCC == DxilFourCC::DFCC_ShaderDebugInfoDXIL ||
+                fourCC == DxilFourCC::DFCC_ShaderDebugName ||
                 fourCC == DxilFourCC::DFCC_RootSignature ||
                 fourCC == DxilFourCC::DFCC_PrivateData,
-            E_INVALIDARG); // You can only remove debug info, rootsignature, or private data blob
+            E_INVALIDARG); // You can only remove debug info, debug info name, rootsignature, or private data blob
     PartList::iterator it =
       std::find_if(m_parts.begin(), m_parts.end(),
         [&](DxilPart part) { return part.m_fourCC == fourCC; });

+ 83 - 7
tools/clang/unittests/HLSL/CompilerTest.cpp

@@ -400,6 +400,7 @@ public:
   TEST_METHOD(CompileWhenWorksThenDisassembleWorks)
   TEST_METHOD(CompileWhenDebugWorksThenStripDebug)
   TEST_METHOD(CompileWhenWorksThenAddRemovePrivate)
+  TEST_METHOD(CompileThenAddCustomDebugName)
   TEST_METHOD(CompileWithRootSignatureThenStripRootSignature)
 
   TEST_METHOD(CompileWhenIncludeThenLoadInvoked)
@@ -2081,12 +2082,12 @@ TEST_F(CompilerTest, CompileWhenWorksThenAddRemovePrivate) {
 
   VERIFY_SUCCEEDED(CreateCompiler(&pCompiler));
   CreateBlobFromText("float4 main() : SV_Target {\r\n"
-                     "  return 0;\r\n"
-                     "}",
-                     &pSource);
+    "  return 0;\r\n"
+    "}",
+    &pSource);
   VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"source.hlsl", L"main",
-                                      L"ps_6_0", nullptr, 0, nullptr, 0,
-                                      nullptr, &pResult));
+    L"ps_6_0", nullptr, 0, nullptr, 0,
+    nullptr, &pResult));
   VERIFY_SUCCEEDED(pResult->GetResult(&pProgram));
   // Append private data blob
   CComPtr<IDxcContainerBuilder> pBuilder;
@@ -2103,9 +2104,9 @@ TEST_F(CompilerTest, CompileWhenWorksThenAddRemovePrivate) {
   CComPtr<IDxcBlob> pNewProgram;
   VERIFY_SUCCEEDED(pResult->GetResult(&pNewProgram));
   hlsl::DxilContainerHeader *pContainerHeader =
-      (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
+    (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
   hlsl::DxilPartHeader *pPartHeader = hlsl::GetDxilPartByType(
-      pContainerHeader, hlsl::DxilFourCC::DFCC_PrivateData);
+    pContainerHeader, hlsl::DxilFourCC::DFCC_PrivateData);
   VERIFY_IS_NOT_NULL(pPartHeader);
   // compare data
   std::string privatePart((const char *)(pPartHeader + 1), privateTxt.size());
@@ -2128,6 +2129,81 @@ TEST_F(CompilerTest, CompileWhenWorksThenAddRemovePrivate) {
   VERIFY_IS_NULL(pPartHeader);
 }
 
+TEST_F(CompilerTest, CompileThenAddCustomDebugName) {
+  CComPtr<IDxcCompiler> pCompiler;
+  CComPtr<IDxcOperationResult> pResult;
+  CComPtr<IDxcBlobEncoding> pSource;
+  CComPtr<IDxcBlob> pProgram;
+
+  VERIFY_SUCCEEDED(CreateCompiler(&pCompiler));
+  CreateBlobFromText("float4 main() : SV_Target {\r\n"
+    "  return 0;\r\n"
+    "}",
+    &pSource);
+
+  LPCWSTR args[] = { L"/Zi", L"/Zss" };
+
+  VERIFY_SUCCEEDED(pCompiler->Compile(pSource, L"source.hlsl", L"main",
+    L"ps_6_0", args, _countof(args), nullptr, 0,
+    nullptr, &pResult));
+  VERIFY_SUCCEEDED(pResult->GetResult(&pProgram));
+  // Append private data blob
+  CComPtr<IDxcContainerBuilder> pBuilder;
+  VERIFY_SUCCEEDED(CreateContainerBuilder(&pBuilder));
+
+  const char pNewName[] = "MyOwnUniqueName.lld";
+  //include null terminator:
+  size_t nameBlobPartSize = sizeof(hlsl::DxilShaderDebugName) + _countof(pNewName);
+  // round up to four-byte size:
+  size_t allocatedSize = (nameBlobPartSize + 3) & ~3;
+  auto pNameBlobContent = reinterpret_cast<hlsl::DxilShaderDebugName*>(malloc(allocatedSize));
+  ZeroMemory(pNameBlobContent, allocatedSize); //just to make sure trailing nulls are nulls.
+  pNameBlobContent->Flags = 0;
+  pNameBlobContent->NameLength = _countof(pNewName) - 1; //this is not supposed to include null terminator
+  memcpy(pNameBlobContent + 1, pNewName, _countof(pNewName));
+
+  CComPtr<IDxcBlobEncoding> pDebugName;
+
+  CreateBlobPinned(pNameBlobContent, allocatedSize, CP_UTF8, &pDebugName);
+
+
+  VERIFY_SUCCEEDED(pBuilder->Load(pProgram));
+  // should fail since it already exists:
+  VERIFY_FAILED(pBuilder->AddPart(hlsl::DxilFourCC::DFCC_ShaderDebugName, pDebugName));
+  VERIFY_SUCCEEDED(pBuilder->RemovePart(hlsl::DxilFourCC::DFCC_ShaderDebugName));
+  VERIFY_SUCCEEDED(pBuilder->AddPart(hlsl::DxilFourCC::DFCC_ShaderDebugName, pDebugName));
+  pResult.Release();
+  VERIFY_SUCCEEDED(pBuilder->SerializeContainer(&pResult));
+
+  CComPtr<IDxcBlob> pNewProgram;
+  VERIFY_SUCCEEDED(pResult->GetResult(&pNewProgram));
+  hlsl::DxilContainerHeader *pContainerHeader =
+    (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
+  hlsl::DxilPartHeader *pPartHeader = hlsl::GetDxilPartByType(
+    pContainerHeader, hlsl::DxilFourCC::DFCC_ShaderDebugName);
+  VERIFY_IS_NOT_NULL(pPartHeader);
+  // compare data
+  VERIFY_IS_TRUE(memcmp(pPartHeader + 1, pNameBlobContent, allocatedSize) == 0);
+
+  free(pNameBlobContent);
+
+  // Remove private data blob
+  pBuilder.Release();
+  VERIFY_SUCCEEDED(CreateContainerBuilder(&pBuilder));
+  VERIFY_SUCCEEDED(pBuilder->Load(pNewProgram));
+  VERIFY_SUCCEEDED(pBuilder->RemovePart(hlsl::DxilFourCC::DFCC_ShaderDebugName));
+  pResult.Release();
+  VERIFY_SUCCEEDED(pBuilder->SerializeContainer(&pResult));
+
+  pNewProgram.Release();
+  VERIFY_SUCCEEDED(pResult->GetResult(&pNewProgram));
+  pContainerHeader =
+    (hlsl::DxilContainerHeader *)(pNewProgram->GetBufferPointer());
+  pPartHeader = hlsl::GetDxilPartByType(
+    pContainerHeader, hlsl::DxilFourCC::DFCC_ShaderDebugName);
+  VERIFY_IS_NULL(pPartHeader);
+}
+
 TEST_F(CompilerTest, CompileWithRootSignatureThenStripRootSignature) {
   CComPtr<IDxcCompiler> pCompiler;
   CComPtr<IDxcOperationResult> pResult;

+ 20 - 20
tools/clang/unittests/HLSL/ShaderOpArithTable.xml

@@ -1760,8 +1760,8 @@
             <Parameter Name="ShaderOp.Arguments">-enable-16bit-types</Parameter>
         </Row>
         <Row Name="HcosHalf">
-            <Parameter Name="Validation.Type">Epsilon</Parameter>
-            <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+            <Parameter Name="Validation.Type">ulp</Parameter>
+            <Parameter Name="Validation.Tolerance">2</Parameter>
             <Parameter Name="ShaderOp.Text"> struct SUnaryFPOp {
                 float16_t input;
                 float16_t output;
@@ -5798,19 +5798,19 @@
             <Parameter Name="ShaderOp.Target">cs_6_2</Parameter>
             <Parameter Name="Validation.Input1">
                 <Value>0x007F0000</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x40000000</Value>
+                <Value>0x807F0000</Value>
+                <Value>0x20000000</Value>
                 <Value>0x00800000</Value>
             </Parameter>
             <Parameter Name="Validation.Input2">
                 <Value>1</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x7F7F0000</Value>
+                <Value>4</Value>
+                <Value>0x607F0000</Value>
                 <Value>0x40000000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected1">
                 <Value>0</Value>
-                <Value>NaN</Value>
+                <Value>0</Value>
                 <Value>0</Value>
                 <Value>0</Value>
             </Parameter>
@@ -5925,25 +5925,25 @@
             <Parameter Name="ShaderOp.Target">cs_6_2</Parameter>
             <Parameter Name="Validation.Input1">
                 <Value>0x007F0000</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x40000000</Value>
+                <Value>0x807F0000</Value>
+                <Value>0x20000000</Value>
                 <Value>0x00800000</Value>
             </Parameter>
             <Parameter Name="Validation.Input2">
                 <Value>1</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x7F7F0000</Value>
+                <Value>4</Value>
+                <Value>0x607F0000</Value>
                 <Value>0x40000000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected1">
                 <Value>0x007F0000</Value>
-                <Value>1</Value>
-                <Value>0x00404040</Value>
+                <Value>0x801FC000</Value>
+                <Value>0x00101010</Value>
                 <Value>0x00400000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected2">
                 <Value>0</Value>
-                <Value>NaN</Value>
+                <Value>0</Value>
                 <Value>0</Value>
                 <Value>0</Value>
             </Parameter>
@@ -6045,20 +6045,20 @@
             <Parameter Name="ShaderOp.Target">cs_6_2</Parameter>
             <Parameter Name="Validation.Input1">
                 <Value>0x007F0000</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x40000000</Value>
+                <Value>0x807F0000</Value>
+                <Value>0x20000000</Value>
                 <Value>0x00800000</Value>
             </Parameter>
             <Parameter Name="Validation.Input2">
                 <Value>1</Value>
-                <Value>0x007F0000</Value>
-                <Value>0x7F7F0000</Value>
+                <Value>4</Value>
+                <Value>0x607F0000</Value>
                 <Value>0x40000000</Value>
             </Parameter>
             <Parameter Name="Validation.Expected1">
                 <Value>0x007F0000</Value>
-                <Value>1</Value>
-                <Value>0x00404040</Value>
+                <Value>0x801FC000</Value>
+                <Value>0x00101010</Value>
                 <Value>0x00400000</Value>
             </Parameter>
             <Parameter Name="ShaderOp.Arguments">-denorm preserve</Parameter>

+ 44 - 2
tools/clang/unittests/SPIRV/CodeGenSPIRVTest.cpp

@@ -194,8 +194,13 @@ TEST_F(FileTest, BinaryOpVectorArithAssign) {
 TEST_F(FileTest, BinaryOpMatrixArithAssign) {
   runFileTest("binary-op.arith-assign.matrix.hlsl");
 }
-TEST_F(FileTest, BinaryOpMixedArithAssign) {
-  runFileTest("binary-op.arith-assign.mixed.hlsl");
+TEST_F(FileTest, BinaryOpMixedFormArithAssign) {
+  // Test mixing scalar/vector/matrix/etc.
+  runFileTest("binary-op.arith-assign.mixed.form.hlsl");
+}
+TEST_F(FileTest, BinaryOpMixedTypeArithAssign) {
+  // Test mixing float/int/uint/bool/etc.
+  runFileTest("binary-op.arith-assign.mixed.type.hlsl");
 }
 
 // For bitwise binary operators
@@ -273,6 +278,10 @@ TEST_F(FileTest, OpArrayAccess) { runFileTest("op.array.access.hlsl"); }
 TEST_F(FileTest, OpBufferAccess) { runFileTest("op.buffer.access.hlsl"); }
 TEST_F(FileTest, OpRWBufferAccess) { runFileTest("op.rwbuffer.access.hlsl"); }
 TEST_F(FileTest, OpCBufferAccess) { runFileTest("op.cbuffer.access.hlsl"); }
+TEST_F(FileTest, OpCBufferAccessMajorness) {
+  /// Tests that we correctly consider majorness when accessing matrices
+  runFileTest("op.cbuffer.access.majorness.hlsl");
+}
 TEST_F(FileTest, OpConstantBufferAccess) {
   runFileTest("op.constant-buffer.access.hlsl");
 }
@@ -327,9 +336,13 @@ TEST_F(FileTest, CastFlatConversionStruct) {
 TEST_F(FileTest, CastFlatConversionNoOp) {
   runFileTest("cast.flat-conversion.no-op.hlsl");
 }
+TEST_F(FileTest, CastFlatConversionLiteralInitializer) {
+  runFileTest("cast.flat-conversion.literal-initializer.hlsl");
+}
 TEST_F(FileTest, CastExplicitVecToMat) {
   runFileTest("cast.vec-to-mat.explicit.hlsl");
 }
+TEST_F(FileTest, CastBitwidth) { runFileTest("cast.bitwidth.hlsl"); }
 
 // For vector/matrix splatting and trunction
 TEST_F(FileTest, CastTruncateVector) { runFileTest("cast.vector.trunc.hlsl"); }
@@ -969,6 +982,22 @@ TEST_F(FileTest, PrimitiveErrorGS) {
   runFileTest("primitive.error.gs.hlsl", Expect::Failure);
 }
 
+// Shader model 6.0 wave query
+TEST_F(FileTest, SM6WaveGetLaneCount) {
+  runFileTest("sm6.wave-get-lane-count.hlsl");
+}
+TEST_F(FileTest, SM6WaveGetLaneIndex) {
+  runFileTest("sm6.wave-get-lane-index.hlsl");
+}
+TEST_F(FileTest, SM6WaveBuiltInNoDuplicate) {
+  runFileTest("sm6.wave.builtin.no-dup.hlsl");
+}
+
+// Shader model 6.0 wave broadcast
+TEST_F(FileTest, SM6WaveReadLaneFirst) {
+  runFileTest("sm6.wave-read-lane-first.hlsl");
+}
+
 // SPIR-V specific
 TEST_F(FileTest, SpirvStorageClass) { runFileTest("spirv.storage-class.hlsl"); }
 
@@ -1086,6 +1115,9 @@ TEST_F(FileTest, VulkanLocationInputExplicitOutputImplicit) {
 TEST_F(FileTest, VulkanLocationInputImplicitOutputExplicit) {
   runFileTest("vk.location.exp-out.hlsl");
 }
+TEST_F(FileTest, VulkanLocationCompositeTypes) {
+  runFileTest("vk.location.composite.hlsl");
+}
 TEST_F(FileTest, VulkanLocationTooLarge) {
   runFileTest("vk.location.large.hlsl", Expect::Failure);
 }
@@ -1179,6 +1211,12 @@ TEST_F(FileTest, VulkanLayoutTBufferStd430) {
 TEST_F(FileTest, VulkanLayoutTextureBufferStd430) {
   runFileTest("vk.layout.texture-buffer.std430.hlsl");
 }
+TEST_F(FileTest, VulkanLayout64BitTypesStd430) {
+  runFileTest("vk.layout.64bit-types.std430.hlsl");
+}
+TEST_F(FileTest, VulkanLayout64BitTypesStd140) {
+  runFileTest("vk.layout.64bit-types.std140.hlsl");
+}
 
 TEST_F(FileTest, VulkanLayoutPushConstantStd430) {
   runFileTest("vk.layout.push-constant.std430.hlsl");
@@ -1196,6 +1234,10 @@ TEST_F(FileTest, VulkanSubpassInputError) {
   runFileTest("vk.subpass-input.error.hlsl", Expect::Failure);
 }
 
+TEST_F(FileTest, NonFpColMajorError) {
+  runFileTest("vk.layout.non-fp-matrix.error.hlsl", Expect::Failure);
+}
+
 // HS: for different Patch Constant Functions
 TEST_F(FileTest, HullShaderPCFVoid) { runFileTest("hs.pcf.void.hlsl"); }
 TEST_F(FileTest, HullShaderPCFTakesInputPatch) {

+ 59 - 0
utils/hct/hctdb_inst_docs.txt

@@ -588,3 +588,62 @@ dest0, dest1 = USubb(src0, src1)
 * Inst: AttributeAtVertex - returns the values of the attributes at the vertex.
 
 returns the values of the attributes at the vertex. VertexID ranges from 0 to 2.
+
+* Inst: FDiv - returns the quotient of its two operands
+
+%dest = fdiv float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that fast math flag is not used and "fp32-denorm-mode"="preserve".
+When "fp32-denorm-mode"="ftz", denorm inputs should be interpreted as corresponding signed zero, and any resulting denorm is also flushed to zero.
+When fast math is enabled, implementation may use reciprocal form: src0*(1/src1).  This may result in evaluating src0*(+/-)INF from src0*(1/(+/-)denorm).  This may produce NaN in some cases or (+/-)INF in others.
+
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| src0\\src1| -inf     | -F     |  -1   | -denorm | -0 | +0 | +denorm |  +1   |    +F  | +inf | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -inf      | NaN      |   +inf | +inf  | +inf    |+inf|-inf| -inf    |  -inf |   -inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -F        | +0       |   +F   | -src0 | +F      |+inf|-inf| -F      |  src0 |   -F   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -denorm   | +0       | +denorm| -src0 | +F      |+inf|-inf| -F      |  src0 |-denorm | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| -0        | +0       |   +0   | +0    | 0       |NaN |NaN | 0       |  -0   |   -0   | -0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +0        | -0       |   -0   | -0    | 0       |NaN |NaN | 0       |  +0   |   +0   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +denorm   | -0       | -denorm| -src0 | -F      |-inf|+inf| +F      |  src0 |+denorm | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +F        | -0       |   -F   | -src0 | -F      |-inf|+inf| +F      |  src0 |   +F   | +0   | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| +inf      | NaN      |   -inf | -inf  | -inf    |-inf|+inf| +inf    |  +inf |   +inf | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+| NaN       | NaN      |   NaN  | NaN   | NaN     |NaN |NaN | NaN     |  NaN  |   NaN  | NaN  | NaN |
++-----------+----------+--------+-------+---------+----+----+---------+-------+--------+------+-----+
+
+* Inst: FAdd - component-wise add
+
+%dest = fadd float %src0, %src1
+
+The following table shows the results obtained when executing the instruction with various classes of numbers, assuming that "fp32-denorm-mode"="preserve". 
+For "fp32-denorm-mode"="ftz" mode, denorm inputs should be treated as corresponding signed zero, and any resulting denorm is also flushed to zero.
+
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| src0\src1| -inf     | -F     | -denorm  | -0 | +0 | +denorm   |    +F  | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -inf     | -inf     |   -inf | -inf     |-inf|-inf| -inf      |   -inf | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -F       | -inf     |   -F   | -F       |src0|src0| -F        |   +/-F | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -denorm  | -inf     |   -F   |-F/denorm |src0|src0| +/-denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| -0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +0       | -inf     |   src1 | src1     |-0  |+0  | src1      |   src1 | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +denorm  | -inf     |   -F   |+/-denorm |src0|src0| +F/denorm |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +F       | -inf     |  +/-F  | +F       |src0|src0| +F        |   +F   | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| +inf     | NaN      |   +inf | +inf     |+inf|+inf| +inf      |   +inf | +inf | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+
+| NaN      | NaN      |   NaN  | NaN      |NaN |NaN | NaN       |   NaN  | NaN  | NaN |
++----------+----------+--------+----------+----+----+-----------+--------+------+-----+

+ 10 - 6
utils/hct/hctdb_test.py

@@ -91,14 +91,18 @@ def add_test_case_float_half(test_name, inst_names, validation_type, validation_
     add_test_case(test_name, inst_names, validation_type, validation_tolerance,
                   float_input_lists, float_output_lists, "cs_6_0", get_shader_text(shader_key, shader_op_name), **kwargs)
     # if half test cases are different from float input lists, use those lists instead for half testings
-    half_input_lists, half_output_lists = float_input_lists, float_output_lists
+    half_input_lists, half_output_lists, half_validation_type, half_validation_tolerance = float_input_lists, float_output_lists, validation_type, validation_tolerance
     if "half_inputs" in kwargs:
         half_input_lists = kwargs["half_inputs"]
     if "half_outputs" in kwargs:
         half_output_lists = kwargs["half_outputs"]
+    if "half_validation_type" in kwargs:
+        half_validation_type = kwargs["half_validation_type"]
+    if "half_validation_tolerance" in kwargs:
+        half_validation_tolerance = kwargs["half_validation_tolerance"]
     # skip relative error test check for half for now
     if validation_type != "Relative":
-        add_test_case(test_name + "Half", inst_names, validation_type, validation_tolerance,
+        add_test_case(test_name + "Half", inst_names, half_validation_type, half_validation_tolerance,
                     half_input_lists, half_output_lists, "cs_6_2",
                     get_shader_text(shader_key.replace("float","half"), shader_op_name), shader_arguments="-enable-16bit-types", **kwargs)
 
@@ -601,7 +605,7 @@ def add_test_cases():
         [['NaN', '-Inf', '-denorm', '-0', '0', 'denorm', 'Inf', '1', '-1']], [[
             'NaN', 'Inf', '1.0', '1.0', '1.0', '1.0', 'Inf', '1.543081',
             '1.543081'
-        ]], "unary float", "cosh")
+        ]], "unary float", "cosh", half_validation_type='ulp', half_validation_tolerance=2)
     add_test_case_float_half('Hsin', ['Hsin'], 'Epsilon', 0.0008,
         [['NaN', '-Inf', '-denorm', '-0', '0', 'denorm', 'Inf', '1', '-1']], [[
             'NaN', '-Inf', '0.0', '0.0', '0.0', '0.0', 'Inf', '1.175201',
@@ -802,9 +806,9 @@ def add_test_cases():
     [['0x0', '0x00FE0000', '0x007F0000', '0x007A0000']],
     'cs_6_2', get_shader_text("binary float", "-"))
     add_test_case_denorm('FDivDenorm', ['FDiv'], 'ulp', 1,
-    [['0x007F0000', '0x007F0000', '0x40000000', '0x00800000'],['1', '0x007F0000', '0x7F7F0000', '0x40000000']],
-    [['0', 'NaN', '0', '0']],
-    [['0x007F0000', '1', '0x00404040', '0x00400000']],
+    [['0x007F0000', '0x807F0000', '0x20000000', '0x00800000'],['1', '4', '0x607F0000', '0x40000000']],
+    [['0', '0', '0', '0']],
+    [['0x007F0000', '0x801FC000', '0x00101010', '0x00400000']],
     'cs_6_2', get_shader_text("binary float", "/"))
     add_test_case_denorm('FMulDenorm', ['FMul'], 'ulp', 1,
     [['0x00000300', '0x007F0000', '0x007F0000', '0x001E0000', '0x00000300'],['128', '1', '0x007F0000', '20', '0x78000000']],

+ 9 - 2
utils/hct/hctgettaef.py

@@ -1,5 +1,6 @@
 import urllib
 import os
+import ssl
 import zipfile
 
 url = "https://github.com/Microsoft/WinObjC/raw/develop/deps/prebuilt/nuget/taef.redist.wlk.1.0.170206001-nativetargets.nupkg"
@@ -11,11 +12,17 @@ if not os.path.isdir(taef_dir):
   os.makedirs(taef_dir)
 
 try:
-  urllib.urlretrieve(url, zipfile_name)
+  ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
+  response = urllib.urlopen(url, context=ctx)
+  f = open(zipfile_name, 'wb')
+  f.write(response.read())
+  f.close()
 except:
   print("Unable to read file with urllib, trying via powershell...")
   from subprocess import check_call
-  cmd = "(new-object System.Net.WebClient).DownloadFile('" + url + "', '" + zipfile_name + "')"
+  cmd = ""
+  cmd += "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12;"
+  cmd += "(new-object System.Net.WebClient).DownloadFile('" + url + "', '" + zipfile_name + "')"
   check_call(['powershell.exe', '-Command', cmd])
 
 z = zipfile.ZipFile(zipfile_name)

Bu fark içinde çok fazla dosya değişikliği olduğu için bazı dosyalar gösterilmiyor