6 years ago · b6a7b82644
--- a/tools/clang/include/clang/SPIRV/AstTypeProbe.h
+++ b/tools/clang/include/clang/SPIRV/AstTypeProbe.h
@@ -38,6 +38,9 @@ bool isScalarType(QualType type, QualType *scalarType = nullptr);
 
				 bool isVectorType(QualType type, QualType *elemType = nullptr,
			
 
				                   uint32_t *elemCount = nullptr);
			
 
				 
			
 
				+/// Returns true if the given type is an array with a constant, known size (i.e. the ASTContext can view it as a constant array type).
			
 
				+bool isConstantArrayType(const ASTContext &, QualType);
			
 
				+
			
 
				 /// Returns true if the given type is enum type based on AST parse.
			
 
				 bool isEnumType(QualType type);
			
 
				 
			
--- a/tools/clang/lib/SPIRV/AstTypeProbe.cpp
+++ b/tools/clang/lib/SPIRV/AstTypeProbe.cpp
@@ -152,6 +152,10 @@ bool isVectorType(QualType type, QualType *elemType, uint32_t *elemCount) {
 
				   return isVec;
			
 
				 }
			
 
				 
			
 
				+bool isConstantArrayType(const ASTContext &astContext, QualType type) {  // True when `type` is a constant-size array.
			
 
				+  return astContext.getAsConstantArrayType(type) != nullptr;  // A non-null result is treated as "is a constant array".
			
 
				+}
			
 
				+
			
 
				 bool isEnumType(QualType type) {
			
 
				   if (isa<EnumType>(type.getTypePtr()))
			
 
				     return true;
			
--- a/tools/clang/lib/SPIRV/RawBufferMethods.cpp
+++ b/tools/clang/lib/SPIRV/RawBufferMethods.cpp
@@ -33,7 +33,7 @@ RawBufferHandler::bitCastToNumericalOrBool(SpirvInstruction *instr,
 
				   if (isSameType(astContext, fromType, toType))
			
 
				     return instr;
			
 
				 
			
 
				-  if (toType->isBooleanType())
			
 
				+  if (toType->isBooleanType() || fromType->isBooleanType())
			
 
				     return theEmitter.castToType(instr, fromType, toType, loc);
			
 
				 
			
 
				   // Perform a bitcast
			
@@ -185,148 +185,6 @@ SpirvInstruction *RawBufferHandler::load16BitsAtBitOffset16(
 
				   return result;
			
 
				 }
			
 
				 
			
 
				-SpirvInstruction *RawBufferHandler::load32BitsAtBitOffset16(
			
 
				-    SpirvInstruction *buffer, SpirvInstruction *&index,
			
 
				-    QualType target32BitType, uint32_t &bitOffset) {
			
 
				-  assert(bitOffset == 16);
			
 
				-  const auto loc = buffer->getSourceLocation();
			
 
				-  SpirvInstruction *result = nullptr;
			
 
				-  SpirvInstruction *ptr = nullptr;
			
 
				-  auto *constUint0 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				-  auto *constUint1 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				-  auto *constUint16 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
			
 
				-
			
 
				-  // The underlying element type of the ByteAddressBuffer is uint. Since the
			
 
				-  // bitOffset is not zero, we need to perform two load operations.
			
 
				-
			
 
				-  // Load the first 32-bit uint. Only its 16 MSBs matter.
			
 
				-  // The 16 MSBs of the loaded value becomes the 16 LSBs of the result.
			
 
				-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				-                                     {constUint0, index}, loc);
			
 
				-  SpirvInstruction *lsb =
			
 
				-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
			
 
				-
			
 
				-  // Right shift by 16 bits leaves the upper 16 bits as 0.
			
 
				-  lsb = spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical,
			
 
				-                                  astContext.UnsignedIntTy, lsb, constUint16,
			
 
				-                                  loc);
			
 
				-
			
 
				-  // Increment the base index
			
 
				-  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
			
 
				-                                    index, constUint1, loc);
			
 
				-
			
 
				-  // Load the second 32-bit uint. Only its 16 LSBs matter.
			
 
				-  // The 16 LSBs of the loaded value becomes the 16 MSBs of the result.
			
 
				-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				-                                     {constUint0, index}, loc);
			
 
				-  SpirvInstruction *msb =
			
 
				-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
			
 
				-
			
 
				-  // Left shift by 16 bits leaves the lower 16 bits as 0.
			
 
				-  msb = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
			
 
				-                                  astContext.UnsignedIntTy, msb, constUint16,
			
 
				-                                  loc);
			
 
				-
			
 
				-  // Bitwise Or the MSBs and LSBs to get the resulting 32-bit value.
			
 
				-  result = spvBuilder.createBinaryOp(spv::Op::OpBitwiseOr,
			
 
				-                                     astContext.UnsignedIntTy, lsb, msb, loc);
			
 
				-
			
 
				-  result = bitCastToNumericalOrBool(result, astContext.UnsignedIntTy,
			
 
				-                                    target32BitType, loc);
			
 
				-  result->setRValue();
			
 
				-
			
 
				-  // Now that a 32-bit load at bit-offset 16 has been performed, the next load
			
 
				-  // should be done at *the next base index* at bit-offset 16.
			
 
				-  // The base index has already been incremented.
			
 
				-  bitOffset = (bitOffset + 32) % 32;
			
 
				-
			
 
				-  return result;
			
 
				-}
			
 
				-
			
 
				-SpirvInstruction *RawBufferHandler::load64BitsAtBitOffset16(
			
 
				-    SpirvInstruction *buffer, SpirvInstruction *&index,
			
 
				-    QualType target64BitType, uint32_t &bitOffset) {
			
 
				-  assert(bitOffset == 16);
			
 
				-  const auto loc = buffer->getSourceLocation();
			
 
				-  SpirvInstruction *result = nullptr;
			
 
				-  SpirvInstruction *ptr = nullptr;
			
 
				-  auto *constUint0 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				-  auto *constUint1 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				-  auto *constUint16 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
			
 
				-  auto *constUint48 =
			
 
				-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 48));
			
 
				-
			
 
				-  // The underlying element type of the ByteAddressBuffer is uint. Since the
			
 
				-  // bitOffset is 16, we need to perform three load operations.
			
 
				-  // Use 16 bits from the first load, all the 32 bits from the second load, and
			
 
				-  // 16 bits from the third load.
			
 
				-
			
 
				-  // Load the first 32-bit uint. Only its 16 MSBs matter.
			
 
				-  // Right shift by 16 bits leaves the upper 16 bits as 0.
			
 
				-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				-                                     {constUint0, index}, loc);
			
 
				-  SpirvInstruction *first16 =
			
 
				-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
			
 
				-
			
 
				-  // Incremenet the index and load a 32-bit uint.
			
 
				-  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
			
 
				-                                    index, constUint1, loc);
			
 
				-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				-                                     {constUint0, index}, loc);
			
 
				-  SpirvInstruction *middle32 =
			
 
				-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
			
 
				-
			
 
				-  // Incremenet the index and load a 32-bit uint. Only its 16 LSBs matter.
			
 
				-  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
			
 
				-                                    index, constUint1, loc);
			
 
				-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				-                                     {constUint0, index}, loc);
			
 
				-  SpirvInstruction *last16 =
			
 
				-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
			
 
				-
			
 
				-  // Convert all parts to 64 bits
			
 
				-  first16 = spvBuilder.createUnaryOp(
			
 
				-      spv::Op::OpUConvert, astContext.UnsignedLongLongTy, first16, loc);
			
 
				-  middle32 = spvBuilder.createUnaryOp(
			
 
				-      spv::Op::OpUConvert, astContext.UnsignedLongLongTy, middle32, loc);
			
 
				-  last16 = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
			
 
				-                                    astContext.UnsignedLongLongTy, last16, loc);
			
 
				-
			
 
				-  // Perform: (first16 >> 16) | (middle32 << 16) | (last16 << 48)
			
 
				-  first16 = spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical,
			
 
				-                                      astContext.UnsignedLongLongTy, first16,
			
 
				-                                      constUint16, loc);
			
 
				-  middle32 = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
			
 
				-                                       astContext.UnsignedLongLongTy, middle32,
			
 
				-                                       constUint16, loc);
			
 
				-  last16 = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
			
 
				-                                     astContext.UnsignedLongLongTy, last16,
			
 
				-                                     constUint48, loc);
			
 
				-
			
 
				-  result = spvBuilder.createBinaryOp(spv::Op::OpBitwiseOr,
			
 
				-                                     astContext.UnsignedLongLongTy, first16,
			
 
				-                                     middle32, loc);
			
 
				-  result = spvBuilder.createBinaryOp(
			
 
				-      spv::Op::OpBitwiseOr, astContext.UnsignedLongLongTy, result, last16, loc);
			
 
				-
			
 
				-  result = bitCastToNumericalOrBool(result, astContext.UnsignedLongLongTy,
			
 
				-                                    target64BitType, loc);
			
 
				-  result->setRValue();
			
 
				-
			
 
				-  // Now that a 64-bit load at bit-offset 16 has been performed, the next load
			
 
				-  // should be done at *the base index + 2* at bit-offset 16.
			
 
				-  // The base index has already been incremented twice.
			
 
				-  bitOffset = (bitOffset + 64) % 32;
			
 
				-
			
 
				-  return result;
			
 
				-}
			
 
				-
			
 
				 SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
			
 
				     SpirvInstruction *buffer, SpirvInstruction *&index,
			
 
				     const QualType targetType, uint32_t &bitOffset) {
			
@@ -372,11 +230,12 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
 
				         return load16BitsAtBitOffset16(buffer, index, targetType, bitOffset);
			
 
				         break;
			
 
				       case 32:
			
 
				-        return load32BitsAtBitOffset16(buffer, index, targetType, bitOffset);
			
 
				-        break;
			
 
				       case 64:
			
 
				-        return load64BitsAtBitOffset16(buffer, index, targetType, bitOffset);
			
 
				-        break;
			
 
				+        theEmitter.emitError(
			
 
				+            "templated buffer load should not result in loading "
			
 
				+            "32-bit or 64-bit values at bit offset 16",
			
 
				+            loc);
			
 
				+        return nullptr;
			
 
				       default:
			
 
				         theEmitter.emitError(
			
 
				             "templated load of ByteAddressBuffer is only implemented for "
			
@@ -462,7 +321,6 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
 
				   // As a result, there might exist some padding after some struct members.
			
 
				   if (const auto *structType = targetType->getAs<RecordType>()) {
			
 
				     const auto *decl = structType->getDecl();
			
 
				-    assert(bitOffset == 0);
			
 
				     SpirvInstruction *originalIndex = index;
			
 
				     uint32_t originalBitOffset = bitOffset;
			
 
				     llvm::SmallVector<SpirvInstruction *, 4> loadedElems;
			
@@ -481,15 +339,16 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
 
				           field->getType(), theEmitter.getSpirvOptions().sBufferLayoutRule,
			
 
				           /*isRowMajor*/ llvm::None, &stride);
			
 
				       fieldOffsetInBytes = roundToPow2(fieldOffsetInBytes, fieldAlignment);
			
 
				+      const auto wordOffset =
			
 
				+          ((originalBitOffset / 8) + fieldOffsetInBytes) / 4;
			
 
				+      bitOffset = (((originalBitOffset / 8) + fieldOffsetInBytes) % 4) * 8;
			
 
				 
			
 
				-      if (fieldOffsetInBytes != 0) {
			
 
				+      if (wordOffset != 0) {
			
 
				         // Divide the fieldOffset by 4 to figure out how much to increment the
			
 
				         // index into the buffer (increment occurs by 32-bit words since the
			
 
				         // underlying type is an array of uints).
			
 
				         // The remainder by four tells us the *byte offset* (then multiply by 8
			
 
				         // to get bit offset).
			
 
				-        auto wordOffset = fieldOffsetInBytes / 4;
			
 
				-        bitOffset = (fieldOffsetInBytes % 4) * 8;
			
 
				         index = spvBuilder.createBinaryOp(
			
 
				             spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
			
 
				             spvBuilder.getConstantInt(astContext.UnsignedIntTy,
			
@@ -512,16 +371,14 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
 
				     // 8 * (4 + 1) = 40
			
 
				     assert(structAlignment != 0);
			
 
				     uint32_t newByteOffset = roundToPow2(structSize, structAlignment);
			
 
				-    uint32_t newWordOffset = newByteOffset / 4;
			
 
				+    uint32_t newWordOffset = ((originalBitOffset / 8) + newByteOffset) / 4;
			
 
				+    bitOffset = 8 * (((originalBitOffset / 8) + newByteOffset) % 4);
			
 
				     index = spvBuilder.createBinaryOp(
			
 
				         spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
			
 
				         spvBuilder.getConstantInt(astContext.UnsignedIntTy,
			
 
				                                   llvm::APInt(32, newWordOffset)),
			
 
				         loc);
			
 
				 
			
 
				-    // New bitOffset should be zero because after loading the struct, we will
			
 
				-    // be loading at the next aligned address.
			
 
				-    bitOffset = 0;
			
 
				     result = spvBuilder.createCompositeConstruct(targetType, loadedElems, loc);
			
 
				     result->setRValue();
			
 
				     return result;
			
@@ -530,5 +387,391 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
 
				   llvm_unreachable("templated buffer load unimplemented for type");
			
 
				 }
			
 
				 
			
 
				+void RawBufferHandler::store16BitsAtBitOffset0(SpirvInstruction *value,  // Emits a store of a 16-bit `value` into the low half of the uint word at `index`.
			
 
				+                                               SpirvInstruction *buffer,  // Buffer addressed below as {0, index}.
			
 
				+                                               SpirvInstruction *&index,  // Word index; this function does not modify it.
			
 
				+                                               const QualType valueType) {  // Type of `value` before it is converted to ushort.
			
 
				+  const auto loc = buffer->getSourceLocation();  // All emitted instructions reuse the buffer's source location.
			
 
				+  SpirvInstruction *result = nullptr;
			
 
				+  auto *constUint0 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				+  // The underlying element type of the ByteAddressBuffer is uint, so the
			
 
				+  // 16-bit value has to be widened and stored as a full 32-bit word.
			
 
				+  auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				+                                           {constUint0, index}, loc);
			
 
				+  result = bitCastToNumericalOrBool(value, valueType,  // Reinterpret/convert the value as an unsigned 16-bit integer.
			
 
				+                                    astContext.UnsignedShortTy, loc);
			
 
				+  result = spvBuilder.createUnaryOp(spv::Op::OpUConvert,  // Widen ushort -> uint; the value ends up in the low 16 bits.
			
 
				+                                    astContext.UnsignedIntTy, result, loc);
			
 
				+  spvBuilder.createStore(ptr, result, loc);  // Writes the whole word; the previous word content is not read or merged.
			
 
				+}
			
 
				+
			
 
				+void RawBufferHandler::store16BitsAtBitOffset16(SpirvInstruction *value,  // Emits a store of a 16-bit `value` into the high half of the uint word at `index`.
			
 
				+                                               SpirvInstruction *buffer,  // Buffer addressed below as {0, index}.
			
 
				+                                               SpirvInstruction *&index,  // Word index; advanced by one word before returning.
			
 
				+                                               const QualType valueType) {  // Type of `value` before it is converted to ushort.
			
 
				+  const auto loc = buffer->getSourceLocation();  // All emitted instructions reuse the buffer's source location.
			
 
				+  SpirvInstruction *result = nullptr;
			
 
				+  auto *constUint0 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				+  auto *constUint1 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				+  auto *constUint16 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
			
 
				+  // The underlying element type of the ByteAddressBuffer is uint, so the
			
 
				+  // 16-bit value has to be merged into a full 32-bit word.
			
 
				+  auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				+                                           {constUint0, index}, loc);
			
 
				+  result = bitCastToNumericalOrBool(value, valueType,  // Reinterpret/convert the value as an unsigned 16-bit integer.
			
 
				+                                    astContext.UnsignedShortTy, loc);
			
 
				+  result = spvBuilder.createUnaryOp(spv::Op::OpUConvert,  // Widen ushort -> uint.
			
 
				+                                    astContext.UnsignedIntTy, result, loc);
			
 
				+  result = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,  // Move the value into bits 16..31.
			
 
				+                                     astContext.UnsignedIntTy, result,
			
 
				+                                     constUint16, loc);
			
 
				+  result = spvBuilder.createBinaryOp(  // Merge with the word currently in the buffer.
			
 
				+      spv::Op::OpBitwiseOr, astContext.UnsignedIntTy,
			
 
				+      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc), result, loc);  // NOTE(review): the existing upper 16 bits are not cleared before the OR — presumably they are expected to be zero; confirm.
			
 
				+  spvBuilder.createStore(ptr, result, loc);
			
 
				+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,  // This word is now full; point at the next one.
			
 
				+                                    index, constUint1, loc);
			
 
				+}
			
 
				+
			
 
				+void RawBufferHandler::store32BitsAtBitOffset0(SpirvInstruction *value,  // Emits a store of a 32-bit `value` into the uint word at `index`.
			
 
				+                                               SpirvInstruction *buffer,  // Buffer addressed below as {0, index}.
			
 
				+                                               SpirvInstruction *&index,  // Word index; advanced by one word before returning.
			
 
				+                                               const QualType valueType) {  // Type of `value` before it is converted to uint.
			
 
				+  const auto loc = buffer->getSourceLocation();  // All emitted instructions reuse the buffer's source location.
			
 
				+  auto *constUint0 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				+  auto *constUint1 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				+  // The underlying element type of the ByteAddressBuffer is uint, so the
			
 
				+  // value is converted to uint and stored as one 32-bit word.
			
 
				+  auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				+                                           {constUint0, index}, loc);
			
 
				+  value =
			
 
				+      bitCastToNumericalOrBool(value, valueType, astContext.UnsignedIntTy, loc);  // Only the local pointer copy `value` is reassigned.
			
 
				+  spvBuilder.createStore(ptr, value, loc);
			
 
				+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,  // Point at the word after the one just written.
			
 
				+                                    index, constUint1, loc);
			
 
				+}
			
 
				+
			
 
				+void RawBufferHandler::store64BitsAtBitOffset0(SpirvInstruction *value,  // Emits stores of a 64-bit `value` as two consecutive uint words starting at `index`.
			
 
				+                                               SpirvInstruction *buffer,  // Buffer addressed below as {0, index}.
			
 
				+                                               SpirvInstruction *&index,  // Word index; advanced by two words before returning.
			
 
				+                                               const QualType valueType) {  // Type of `value` before it is converted to a 64-bit unsigned integer.
			
 
				+  const auto loc = buffer->getSourceLocation();  // All emitted instructions reuse the buffer's source location.
			
 
				+  auto *constUint0 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				+  auto *constUint1 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				+  auto *constUint32 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 32));
			
 
				+
			
 
				+  // The underlying element type of the ByteAddressBuffer is uint, so the
			
 
				+  // 64-bit value has to be stored as two 32-bit words.
			
 
				+  auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				+                                           {constUint0, index}, loc);
			
 
				+  // First convert the 64-bit value to uint64_t. Then extract two 32-bit words
			
 
				+  // from it.
			
 
				+  value = bitCastToNumericalOrBool(value, valueType,
			
 
				+                                   astContext.UnsignedLongLongTy, loc);
			
 
				+
			
 
				+  // Use OpUConvert to perform truncation (produces the least significant bits).
			
 
				+  SpirvInstruction *lsb = spvBuilder.createUnaryOp(
			
 
				+      spv::Op::OpUConvert, astContext.UnsignedIntTy, value, loc);
			
 
				+
			
 
				+  // Shift uint64_t to the right by 32 bits and truncate to get the most
			
 
				+  // significant bits.
			
 
				+  SpirvInstruction *msb = spvBuilder.createUnaryOp(
			
 
				+      spv::Op::OpUConvert, astContext.UnsignedIntTy,
			
 
				+      spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical,
			
 
				+                                astContext.UnsignedLongLongTy, value,
			
 
				+                                constUint32, loc),
			
 
				+      loc);
			
 
				+
			
 
				+  spvBuilder.createStore(ptr, lsb, loc);  // Low word goes to the original index.
			
 
				+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
			
 
				+                                    index, constUint1, loc);
			
 
				+  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				+                                     {constUint0, index}, loc);
			
 
				+  spvBuilder.createStore(ptr, msb, loc);  // High word goes to index + 1.
			
 
				+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,  // Leave index pointing past both written words.
			
 
				+                                    index, constUint1, loc);
			
 
				+}
			
 
				+
			
 
				+void RawBufferHandler::storeArrayOfScalars(  // Emits stores for a sequence of scalars of type `valueType`, packing 16-bit elements two per uint word.
			
 
				+    std::deque<SpirvInstruction *> values, SpirvInstruction *buffer,  // `values` is taken by value, so the deque is copied on every call.
			
 
				+    SpirvInstruction *&index, const QualType valueType, uint32_t &bitOffset,  // `index` and `bitOffset` are in/out and are updated as words are written.
			
 
				+    SourceLocation loc) {
			
 
				+  auto *constUint0 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				+  auto *constUint1 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				+  auto *constUint16 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
			
 
				+  const auto storeWidth = getElementSpirvBitwidth(  // Bit width of one element; selects the branch below.
			
 
				+      astContext, valueType, theEmitter.getSpirvOptions().enable16BitTypes);
			
 
				+  const uint32_t elemCount = values.size();
			
 
				+
			
 
				+  if (storeWidth == 16u) {
			
 
				+    uint32_t elemIndex = 0;
			
 
				+    if (bitOffset == 16) {
			
 
				+      // The current word is half full: put the first element into its upper
			
 
				+      store16BitsAtBitOffset16(values[0], buffer, index, valueType);  // 16 bits. NOTE(review): values[0] is read without checking that `values` is non-empty — confirm callers.
			
 
				+      bitOffset = 0;  // store16BitsAtBitOffset16 advanced `index`, so we are word-aligned again.
			
 
				+      ++elemIndex;
			
 
				+    }
			
 
				+    // Store the remaining elements two at a time, one uint word per pair.
			
 
				+    for (; elemIndex < elemCount; elemIndex = elemIndex + 2) {
			
 
				+      // The underlying element type of the ByteAddressBuffer is uint. So we
			
 
				+      // need to store a 32-bit value by combining two 16-bit values.
			
 
				+      SpirvInstruction *word = nullptr;
			
 
				+      word = bitCastToNumericalOrBool(values[elemIndex], valueType,
			
 
				+                                      astContext.UnsignedShortTy, loc);
			
 
				+      // Zero-extend to 32 bits.
			
 
				+      word = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
			
 
				+                                      astContext.UnsignedIntTy, word, loc);
			
 
				+      if (elemIndex + 1 < elemCount) {  // A second element exists: it becomes the upper 16 bits of the word.
			
 
				+        SpirvInstruction *msb = nullptr;
			
 
				+        msb = bitCastToNumericalOrBool(values[elemIndex + 1], valueType,
			
 
				+                                       astContext.UnsignedShortTy, loc);
			
 
				+        msb = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
			
 
				+                                       astContext.UnsignedIntTy, msb, loc);
			
 
				+        msb = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
			
 
				+                                        astContext.UnsignedIntTy, msb,
			
 
				+                                        constUint16, loc);
			
 
				+        word = spvBuilder.createBinaryOp(
			
 
				+            spv::Op::OpBitwiseOr, astContext.UnsignedIntTy, word, msb, loc);
			
 
				+        // We will store two 16-bit values.
			
 
				+        bitOffset = (bitOffset + 32) % 32;  // Leaves bitOffset unchanged for any value below 32.
			
 
				+      } else {
			
 
				+        // We will store one 16-bit value.
			
 
				+        bitOffset = (bitOffset + 16) % 32;
			
 
				+      }
			
 
				+
			
 
				+      auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
			
 
				+                                               {constUint0, index}, loc);
			
 
				+      spvBuilder.createStore(ptr, word, loc);
			
 
				+      index = spvBuilder.createBinaryOp(  // NOTE(review): index also advances when only one 16-bit value was written (bitOffset then non-zero) — confirm the next store targets the intended word.
			
 
				+          spv::Op::OpIAdd, astContext.UnsignedIntTy, index, constUint1, loc);
			
 
				+    }
			
 
				+  } else if (storeWidth == 32u || storeWidth == 64u) {  // NOTE(review): any other width falls through and emits nothing, without a diagnostic.
			
 
				+    assert(bitOffset == 0);  // 32/64-bit elements are only stored word-aligned here.
			
 
				+    for (uint32_t i = 0; i < elemCount; ++i)
			
 
				+      processTemplatedStoreToBuffer(values[i], buffer, index, valueType, bitOffset);  // Delegate each element to the general store routine.
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+QualType RawBufferHandler::serializeToScalarsOrStruct(  // Flattens every entry of `*values` (each of type `valueType`) in place and returns the resulting element type.
			
 
				+    std::deque<SpirvInstruction *> *values, QualType valueType,  // `*values` is rewritten: composites are replaced by their extracted components, in order.
			
 
				+    SourceLocation loc) {
			
 
				+  uint32_t size = values->size();  // Snapshot: number of composites to expand in this pass.
			
 
				+
			
 
				+  // Vector type: replace each vector by its elemCount components.
			
 
				+  {
			
 
				+    QualType elemType = {};
			
 
				+    uint32_t elemCount = 0;
			
 
				+    if (isVectorType(valueType, &elemType, &elemCount)) {
			
 
				+      for (uint32_t i = 0; i < size; ++i) {
			
 
				+        for (uint32_t j = 0; j < elemCount; ++j) {
			
 
				+          values->push_back(spvBuilder.createCompositeExtract(  // Components are appended at the back ...
			
 
				+              elemType, values->front(), {j}, loc));
			
 
				+        }
			
 
				+        values->pop_front();  // ... and the expanded composite is dropped from the front, preserving order.
			
 
				+      }
			
 
				+      return elemType;  // Returned directly, without a further recursive pass.
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Matrix type: replace each matrix by its numRows * numCols elements.
			
 
				+  {
			
 
				+    QualType elemType = {};
			
 
				+    uint32_t numRows = 0, numCols = 0;
			
 
				+    if (isMxNMatrix(valueType, &elemType, &numRows, &numCols)) {
			
 
				+      for (uint32_t i = 0; i < size; ++i) {
			
 
				+        for (uint32_t j = 0; j < numRows; ++j) {
			
 
				+          for (uint32_t k = 0; k < numCols; ++k) {
			
 
				+            // TODO: This is currently doing a row_major matrix store. We must
			
 
				+            // investigate whether we also need to implement it for
			
 
				+            // column_major.
			
 
				+            values->push_back(spvBuilder.createCompositeExtract(
			
 
				+                elemType, values->front(), {j, k}, loc));
			
 
				+          }
			
 
				+        }
			
 
				+        values->pop_front();
			
 
				+      }
			
 
				+      return serializeToScalarsOrStruct(values, elemType, loc);  // Recurse on the element type.
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Array type: replace each constant-size array by its elements.
			
 
				+  {
			
 
				+    if (const auto *arrType = astContext.getAsConstantArrayType(valueType)) {
			
 
				+      const uint32_t arrElemCount =
			
 
				+          static_cast<uint32_t>(arrType->getSize().getZExtValue());
			
 
				+      const QualType arrElemType = arrType->getElementType();
			
 
				+      for (uint32_t i = 0; i < size; ++i) {
			
 
				+        for (uint32_t j = 0; j < arrElemCount; ++j) {
			
 
				+          values->push_back(spvBuilder.createCompositeExtract(
			
 
				+              arrElemType, values->front(), {j}, loc));
			
 
				+        }
			
 
				+        values->pop_front();
			
 
				+      }
			
 
				+      return serializeToScalarsOrStruct(values, arrElemType, loc);  // Elements may themselves be vectors, matrices or arrays, so recurse.
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  if (isScalarType(valueType))  // Base case: nothing left to flatten.
			
 
				+    return valueType;
			
 
				+
			
 
				+  if (const auto *structType = valueType->getAs<RecordType>())  // Structs are left intact; `structType` is only used as the condition.
			
 
				+    return valueType;
			
 
				+
			
 
				+  llvm_unreachable("unhandled type when serializing an array");
			
 
				+}
			
 
				+
			
 
				+void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value,
			
 
				+                                                     SpirvInstruction *buffer,
			
 
				+                                                     SpirvInstruction *&index,
			
 
				+                                                     const QualType valueType,
			
 
				+                                                     uint32_t &bitOffset) {
			
 
				+  assert(bitOffset == 0 || bitOffset == 16);
			
 
				+  const auto loc = buffer->getSourceLocation();
			
 
				+  auto *constUint0 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
			
 
				+  auto *constUint1 =
			
 
				+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
			
 
				+
			
 
				+  // Scalar types
			
 
				+  if (isScalarType(valueType)) {
			
 
				+    auto storeWidth = getElementSpirvBitwidth(
			
 
				+        astContext, valueType, theEmitter.getSpirvOptions().enable16BitTypes);
			
 
				+    switch (bitOffset) {
			
 
				+    case 0: {
			
 
				+      switch (storeWidth) {
			
 
				+      case 16:
			
 
				+        store16BitsAtBitOffset0(value, buffer, index, valueType);
			
 
				+        return;
			
 
				+      case 32:
			
 
				+        store32BitsAtBitOffset0(value, buffer, index, valueType);
			
 
				+        return;
			
 
				+      case 64:
			
 
				+        store64BitsAtBitOffset0(value, buffer, index, valueType);
			
 
				+        return;
			
 
				+      default:
			
 
				+        theEmitter.emitError(
			
 
				+            "templated load of ByteAddressBuffer is only implemented for "
			
 
				+            "16, 32, and 64-bit types",
			
 
				+            loc);
			
 
				+        return;
			
 
				+      }
			
 
				+    }
			
 
				+    case 16: {
			
 
				+      // The only legal store at offset 16 is by a 16-bit value.
			
 
				+      assert(storeWidth == 16);
			
 
				+      store16BitsAtBitOffset16(value, buffer, index, valueType);
			
 
				+      return;
			
 
				+    }
			
 
				+    default:
			
 
				+      theEmitter.emitError(
			
 
				+          "templated load of ByteAddressBuffer is only implemented for "
			
 
				+          "16, 32, and 64-bit types",
			
 
				+          loc);
			
 
				+      return;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Vectors, Matrices, and Arrays can all be serialized and stored.
			
 
				+  if (isVectorType(valueType) || isMxNMatrix(valueType) ||
			
 
				+      isConstantArrayType(astContext, valueType)) {
			
 
				+    std::deque<SpirvInstruction *> elems;
			
 
				+    elems.push_back(value);
			
 
				+    auto serializedType = serializeToScalarsOrStruct(&elems, valueType, loc);
			
 
				+    if (isScalarType(serializedType)) {
			
 
				+      storeArrayOfScalars(elems, buffer, index, serializedType, bitOffset, loc);
			
 
				+    } else if (const auto *structType = serializedType->getAs<RecordType>()) {
			
 
				+      for (auto elem : elems)
			
 
				+        processTemplatedStoreToBuffer(elem, buffer, index, serializedType,
			
 
				+                                      bitOffset);
			
 
				+    }
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // Struct types
			
 
				+  // The "natural" layout for structure types dictates that structs are
			
 
				+  // aligned like their field with the largest alignment.
			
 
				+  // As a result, there might exist some padding after some struct members.
			
 
				+  if (const auto *structType = valueType->getAs<RecordType>()) {
			
 
				+    const auto *decl = structType->getDecl();
			
 
				+    SpirvInstruction *originalIndex = index;
			
 
				+    const auto originalBitOffset = bitOffset;
			
 
				+    uint32_t fieldOffsetInBytes = 0;
			
 
				+    uint32_t structAlignment = 0, structSize = 0, stride = 0;
			
 
				+    std::tie(structAlignment, structSize) =
			
 
				+        AlignmentSizeCalculator(astContext, theEmitter.getSpirvOptions())
			
 
				+            .getAlignmentAndSize(valueType,
			
 
				+                                 theEmitter.getSpirvOptions().sBufferLayoutRule,
			
 
				+                                 llvm::None, &stride);
			
 
				+    uint32_t fieldIndex = 0;
			
 
				+    for (const auto *field : decl->fields()) {
			
 
				+      AlignmentSizeCalculator alignmentCalc(astContext,
			
 
				+                                            theEmitter.getSpirvOptions());
			
 
				+      uint32_t fieldSize = 0, fieldAlignment = 0;
			
 
				+      std::tie(fieldAlignment, fieldSize) = alignmentCalc.getAlignmentAndSize(
			
 
				+          field->getType(), theEmitter.getSpirvOptions().sBufferLayoutRule,
			
 
				+          /*isRowMajor*/ llvm::None, &stride);
			
 
				+      fieldOffsetInBytes = roundToPow2(fieldOffsetInBytes, fieldAlignment);
			
 
				+      const auto wordOffset =
			
 
				+          ((originalBitOffset / 8) + fieldOffsetInBytes) / 4;
			
 
				+      bitOffset = (((originalBitOffset / 8) + fieldOffsetInBytes) % 4) * 8;
			
 
				+
			
 
				+      if (wordOffset != 0) {
			
 
				+        // Divide the fieldOffset by 4 to figure out how much to increment the
			
 
				+        // index into the buffer (increment occurs by 32-bit words since the
			
 
				+        // underlying type is an array of uints).
			
 
				+        // The remainder by four tells us the *byte offset* (then multiply by 8
			
 
				+        // to get bit offset).
			
 
				+        index = spvBuilder.createBinaryOp(
			
 
				+            spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
			
 
				+            spvBuilder.getConstantInt(astContext.UnsignedIntTy,
			
 
				+                                      llvm::APInt(32, wordOffset)),
			
 
				+            loc);
			
 
				+      }
			
 
				+
			
 
				+      processTemplatedStoreToBuffer(
			
 
				+          spvBuilder.createCompositeExtract(field->getType(), value,
			
 
				+                                            {fieldIndex}, loc),
			
 
				+          buffer, index, field->getType(), bitOffset);
			
 
				+
			
 
				+      fieldOffsetInBytes += fieldSize;
			
 
				+      ++fieldIndex;
			
 
				+    }
			
 
				+
			
 
				+    // After we're done with storing the entire struct, we need to update the
			
 
				+    // index (in case we are storing an array of structs).
			
 
				+    //
			
 
				+    // Example: struct alignment = 8. struct size = 34 bytes
			
 
				+    // (34 / 8) = 4 full words
			
 
				+    // (34 % 8) = 2 > 0, therefore need to move to the next aligned address
			
 
				+    // So the starting byte offset after storing the entire struct is:
			
 
				+    // 8 * (4 + 1) = 40
			
 
				+    assert(structAlignment != 0);
			
 
				+    uint32_t newByteOffset = roundToPow2(structSize, structAlignment);
			
 
				+    uint32_t newWordOffset = ((originalBitOffset / 8) + newByteOffset) / 4;
			
 
				+    bitOffset = 8 * (((originalBitOffset / 8) + newByteOffset) % 4);
			
 
				+    index = spvBuilder.createBinaryOp(
			
 
				+        spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
			
 
				+        spvBuilder.getConstantInt(astContext.UnsignedIntTy,
			
 
				+                                  llvm::APInt(32, newWordOffset)),
			
 
				+        loc);
			
 
				+
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  llvm_unreachable("templated buffer store unimplemented for type");
			
 
				+}
			
 
				+
			
 
				 } // namespace spirv
			
 
				 } // namespace clang
			
--- a/tools/clang/lib/SPIRV/RawBufferMethods.h
+++ b/tools/clang/lib/SPIRV/RawBufferMethods.h
@@ -28,8 +28,10 @@ public:
 
				   /// which is a runtime array in SPIR-V. This method works by loading one or
			
 
				   /// more uints, and performing necessary casts and composite constructions
			
 
				   /// to build the 'targetType'. The 'offset' parameter can be used for finer
			
 
				-  /// grained load of bitwidths smaller than 32-bits. Example: targetType =
			
 
				-  /// uint16_t, address=0, offset=0
			
 
				+  /// grained load of bitwidths smaller than 32-bits.
			
 
				+  ///
			
 
				+  /// Example:
			
 
				+  /// targetType = uint16_t, address=0, offset=0
			
 
				   ///                 --> Load the first 16-bit uint starting at address 0.
			
 
				   /// targetType = uint16_t, address=0, offset=16
			
 
				   ///                 --> Load the second 16-bit uint starting at address 0.
			
@@ -38,6 +40,25 @@ public:
 
				                                                    const QualType targetType,
			
 
				                                                    uint32_t &bitOffset);
			
 
				 
			
 
				+  /// \brief Performs RWByteAddressBuffer.Store<T>(address, value).
			
 
				+  /// RWByteAddressBuffers are represented in SPIR-V as structs with only one
			
 
				+  /// member which is a runtime array of uints. This method works by decomposing
			
 
				+  /// the given |value| to reach numeric/bool types. Then performs necessary
			
 
				+  /// casts to uints and stores them in the underlying runtime array.
			
 
				+  /// The |bitOffset| parameter can be used for finer-grained bit-offset
			
 
				+  /// control.
			
 
				+  ///
			
 
				+  /// Example:
			
 
				+  /// valueType = uint16_t, address=0, offset=0
			
 
				+  ///                 --> Store to the first 16-bit uint starting at address 0.
			
 
				+  /// valueType = uint16_t, address=0, offset=16
			
 
				+  ///                 --> Store to the second 16-bit uint starting at address 0.
			
 
				+  void processTemplatedStoreToBuffer(SpirvInstruction *value,
			
 
				+                                     SpirvInstruction *buffer,
			
 
				+                                     SpirvInstruction *&index,
			
 
				+                                     const QualType valueType,
			
 
				+                                     uint32_t &bitOffset);
			
 
				+
			
 
				 private:
			
 
				   SpirvInstruction *load16BitsAtBitOffset0(SpirvInstruction *buffer,
			
 
				                                            SpirvInstruction *&index,
			
@@ -59,15 +80,36 @@ private:
 
				                                             QualType target16BitType,
			
 
				                                             uint32_t &bitOffset);
			
 
				 
			
 
				-  SpirvInstruction *load32BitsAtBitOffset16(SpirvInstruction *buffer,
			
 
				-                                            SpirvInstruction *&index,
			
 
				-                                            QualType target32BitType,
			
 
				-                                            uint32_t &bitOffset);
			
 
				+private:
			
 
				+  void store16BitsAtBitOffset0(SpirvInstruction *value,
			
 
				+                               SpirvInstruction *buffer,
			
 
				+                               SpirvInstruction *&index,
			
 
				+                               const QualType valueType);
			
 
				 
			
 
				-  SpirvInstruction *load64BitsAtBitOffset16(SpirvInstruction *buffer,
			
 
				-                                            SpirvInstruction *&index,
			
 
				-                                            QualType target64BitType,
			
 
				-                                            uint32_t &bitOffset);
			
 
				+  void store32BitsAtBitOffset0(SpirvInstruction *value,
			
 
				+                               SpirvInstruction *buffer,
			
 
				+                               SpirvInstruction *&index,
			
 
				+                               const QualType valueType);
			
 
				+
			
 
				+  void store64BitsAtBitOffset0(SpirvInstruction *value,
			
 
				+                               SpirvInstruction *buffer,
			
 
				+                               SpirvInstruction *&index,
			
 
				+                               const QualType valueType);
			
 
				+
			
 
				+  void store16BitsAtBitOffset16(SpirvInstruction *value,
			
 
				+                                SpirvInstruction *buffer,
			
 
				+                                SpirvInstruction *&index,
			
 
				+                                const QualType valueType);
			
 
				+
			
 
				+  void storeArrayOfScalars(std::deque<SpirvInstruction *> values,
			
 
				+                           SpirvInstruction *buffer, SpirvInstruction *&index,
			
 
				+                           const QualType valueType, uint32_t &bitOffset,
			
 
				+                           SourceLocation);
			
 
				+
			
 
				+  /// \brief Serializes the given values into their components until a scalar or
			
 
				+  /// a struct has been reached. Returns the most basic type it reaches.
			
 
				+  QualType serializeToScalarsOrStruct(std::deque<SpirvInstruction *> *values,
			
 
				+                                      QualType valueType, SourceLocation);
			
 
				 
			
 
				 private:
			
 
				   /// \brief Performs an OpBitCast from |fromType| to |toType| on the given
			
--- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp
+++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp
@@ -3317,8 +3317,9 @@ SpirvInstruction *SpirvEmitter::processByteAddressBufferLoadStore(
 
				   // Store3, Store4 intrinsic functions.
			
 
				   const bool isTemplatedLoadOrStore =
			
 
				       (numWords == 1) &&
			
 
				-      (doStore ? expr->getArg(1)->getType() != astContext.UnsignedIntTy
			
 
				-               : expr->getType() != astContext.UnsignedIntTy);
			
 
				+      (doStore ? !expr->getArg(1)->getType()->isSpecificBuiltinType(
			
 
				+                     BuiltinType::UInt)
			
 
				+               : !expr->getType()->isSpecificBuiltinType(BuiltinType::UInt));
			
 
				 
			
 
				   // Do a OpShiftRightLogical by 2 (divide by 4 to get aligned memory
			
 
				   // access). The AST always casts the address to unsinged integer, so shift
			
@@ -3329,13 +3330,20 @@ SpirvInstruction *SpirvEmitter::processByteAddressBufferLoadStore(
 
				       spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical, addressType,
			
 
				                                 byteAddress, constUint2, expr->getExprLoc());
			
 
				 
			
 
				-  if (isTemplatedLoadOrStore && !doStore) {
			
 
				+  if (isTemplatedLoadOrStore) {
			
 
				     // Templated load. Need to (potentially) perform more
			
 
				     // loads/casts/composite-constructs.
			
 
				     uint32_t bitOffset = 0;
			
 
				-    RawBufferHandler rawBufferHandler(*this);
			
 
				-    return rawBufferHandler.processTemplatedLoadFromBuffer(
			
 
				-        objectInfo, address, expr->getType(), bitOffset);
			
 
				+    if (doStore) {
			
 
				+      auto *values = doExpr(expr->getArg(1));
			
 
				+      RawBufferHandler(*this).processTemplatedStoreToBuffer(
			
 
				+          values, objectInfo, address, expr->getArg(1)->getType(), bitOffset);
			
 
				+      return nullptr;
			
 
				+    } else {
			
 
				+      RawBufferHandler rawBufferHandler(*this);
			
 
				+      return rawBufferHandler.processTemplatedLoadFromBuffer(
			
 
				+          objectInfo, address, expr->getType(), bitOffset);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   // Perform access chain into the RWByteAddressBuffer.
			
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl
@@ -110,7 +110,7 @@ ByteAddressBuffer buf;
 
				 
			
 
				 // CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr:%\d+]]
			
 
				 // CHECK: [[val0_uint:%\d+]] = OpLoad %uint [[ptr]]
			
 
				-// CHECK:      [[val0:%\d+]] = OpBitcast %int %174
			
 
				+// CHECK:      [[val0:%\d+]] = OpBitcast %int [[val0_uint]]
			
 
				 // CHECK:   [[newAddr:%\d+]] = OpIAdd %uint [[addr]] %uint_1
			
 
				 // CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr]]
			
 
				 // CHECK: [[val1_uint:%\d+]] = OpLoad %uint [[ptr]]
			
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.struct3.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.struct3.hlsl
@@ -0,0 +1,95 @@
 
				+// Run: %dxc -T cs_6_2 -E main -enable-16bit-types -fvk-use-dx-layout
			
 
				+
			
 
				+ByteAddressBuffer buf;
			
 
				+RWByteAddressBuffer buf2;
			
 
				+
			
 
				+struct T {
			
 
				+  float16_t x[2];
			
 
				+};
			
 
				+
			
 
				+struct S {
			
 
				+  float16_t a;
			
 
				+  T e[2];
			
 
				+};
			
 
				+
			
 
				+[numthreads(64, 1, 1)]
			
 
				+void main(uint3 tid : SV_DispatchThreadId) {
			
 
				+  S sArr[2] = buf.Load<S[2]>(tid.x);
			
 
				+  buf2.Store<S[2]>(tid.x, sArr);
			
 
				+}
			
 
				+
			
 
				+// Note: the DX layout tightly packs all members of S and its sub-structures.
			
 
				+// It stores elements at the following byte offsets:
			
 
				+// 0, 2, 4, 6, 8, 10, 12, 14, 16, 18
			
 
				+//
			
 
				+//                              |-----------------------|
			
 
				+// address 0:                   |     a     | e[0].x[0] |
			
 
				+//                              |-----------------------|
			
 
				+// address 1 (byte offset 4):   | e[0].x[1] | e[1].x[0] |
			
 
				+//                              |-----------------------|
			
 
				+// address 2 (byte offset 8):   | e[1].x[1] |     a     |
			
 
				+//                              |-----------------------|
			
 
				+// address 3 (byte offset 12)   | e[0].x[0] | e[0].x[1] |
			
 
				+//                              |-----------------------|
			
 
				+// address 4 (byte offset 16)   | e[1].x[0] | e[1].x[1] |
			
 
				+//                              |-----------------------|
			
 
				+//
			
 
				+
			
 
				+// CHECK:      [[tidx_ptr:%\d+]] = OpAccessChain %_ptr_Function_uint %tid %int_0
			
 
				+// CHECK:          [[tidx:%\d+]] = OpLoad %uint [[tidx_ptr]]
			
 
				+// CHECK:      [[address0:%\d+]] = OpShiftRightLogical %uint [[tidx]] %uint_2
			
 
				+// CHECK:          [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address0]]
			
 
				+// CHECK:         [[word0:%\d+]] = OpLoad %uint [[ptr0]]
			
 
				+// CHECK:      [[word0u16:%\d+]] = OpUConvert %ushort [[word0]]
			
 
				+// CHECK:             [[a:%\d+]] = OpBitcast %half [[word0u16]]
			
 
				+// CHECK:          [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address0]]
			
 
				+// CHECK:         [[word0:%\d+]] = OpLoad %uint [[ptr0]]
			
 
				+// CHECK:    [[word0upper:%\d+]] = OpShiftRightLogical %uint [[word0]] %uint_16
			
 
				+// CHECK: [[word0upperu16:%\d+]] = OpUConvert %ushort [[word0upper]]
			
 
				+// CHECK:           [[x_0:%\d+]] = OpBitcast %half [[word0upperu16]]
			
 
				+// CHECK:      [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
			
 
				+// CHECK:          [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address1]]
			
 
				+// CHECK:         [[word1:%\d+]] = OpLoad %uint [[ptr1]]
			
 
				+// CHECK:      [[word1u16:%\d+]] = OpUConvert %ushort [[word1]]
			
 
				+// CHECK:           [[x_1:%\d+]] = OpBitcast %half [[word1u16]]
			
 
				+// CHECK:             [[x:%\d+]] = OpCompositeConstruct %_arr_half_uint_2 [[x_0]] [[x_1]]
			
 
				+// CHECK:      [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
			
 
				+// CHECK:           [[e_0:%\d+]] = OpCompositeConstruct %T [[x]]
			
 
				+// CHECK:          [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address1]]
			
 
				+// CHECK:         [[word1:%\d+]] = OpLoad %uint [[ptr1]]
			
 
				+// CHECK:    [[word1upper:%\d+]] = OpShiftRightLogical %uint [[word1]] %uint_16
			
 
				+// CHECK: [[word1upperu16:%\d+]] = OpUConvert %ushort [[word1upper]]
			
 
				+// CHECK:           [[x_0:%\d+]] = OpBitcast %half [[word1upperu16]]
			
 
				+// CHECK:      [[address2:%\d+]] = OpIAdd %uint [[address1]] %uint_1
			
 
				+// CHECK:          [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address2]]
			
 
				+// CHECK:         [[word2:%\d+]] = OpLoad %uint [[ptr2]]
			
 
				+// CHECK:      [[word2u16:%\d+]] = OpUConvert %ushort [[word2]]
			
 
				+// CHECK:           [[x_1:%\d+]] = OpBitcast %half [[word2u16]]
			
 
				+// CHECK:             [[x:%\d+]] = OpCompositeConstruct %_arr_half_uint_2 [[x_0]] [[x_1]]
			
 
				+// CHECK:           [[e_1:%\d+]] = OpCompositeConstruct %T [[x]]
			
 
				+// CHECK:             [[e:%\d+]] = OpCompositeConstruct %_arr_T_uint_2 [[e_0]] [[e_1]]
			
 
				+// CHECK:      [[address2:%\d+]] = OpIAdd %uint [[address0]] %uint_2
			
 
				+// CHECK:           [[s_0:%\d+]] = OpCompositeConstruct %S [[a]] [[e]]
			
 
				+//
			
 
				+// Now start with the second 'S' object
			
 
				+//
			
 
				+// CHECK:          [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address2]]
			
 
				+// CHECK:         [[word2:%\d+]] = OpLoad %uint [[ptr2]]
			
 
				+// CHECK:  [[word2upper16:%\d+]] = OpShiftRightLogical %uint [[word2]] %uint_16
			
 
				+// CHECK: [[word2upperu16:%\d+]] = OpUConvert %ushort [[word2upper16]]
			
 
				+// CHECK:             [[a:%\d+]] = OpBitcast %half [[word2upperu16]]
			
 
				+// CHECK:      [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
			
 
				+// CHECK:      [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
			
 
				+// CHECK:               {{%\d+}} = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address3]]
			
 
				+// CHECK:               {{%\d+}} = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address3]]
			
 
				+// CHECK:                          OpCompositeConstruct %_arr_half_uint_2
			
 
				+// CHECK:      [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
			
 
				+// CHECK:                          OpCompositeConstruct %T
			
 
				+// CHECK:                          OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address4]]
			
 
				+// CHECK:                          OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address4]]
			
 
				+// CHECK:                          OpCompositeConstruct %_arr_half_uint_2
			
 
				+// CHECK:                          OpCompositeConstruct %T
			
 
				+// CHECK:                          OpCompositeConstruct %_arr_T_uint_2
			
 
				+// CHECK:                          OpCompositeConstruct %S
			
 
				+// CHECK:                          OpCompositeConstruct %_arr_S_uint_2
			
 
				+// CHECK:                          OpStore %sArr {{%\d+}}
			
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl
@@ -0,0 +1,717 @@
 
				+// Run: %dxc -T cs_6_2 -E main -enable-16bit-types -fvk-use-dx-layout
			
 
				+
			
 
				+ByteAddressBuffer buf;
			
 
				+RWByteAddressBuffer buf2;
			
 
				+
			
 
				+struct T {
			
 
				+  float16_t x[5];
			
 
				+};
			
 
				+
			
 
				+struct U {
			
 
				+  float16_t v[3];
			
 
				+  uint w;
			
 
				+};
			
 
				+
			
 
				+struct S {
			
 
				+  float16_t3 a[3];
			
 
				+  double c;
			
 
				+  T t;
			
 
				+  double b;
			
 
				+  float16_t d;
			
 
				+  T e[2];
			
 
				+  U f[2];
			
 
				+  float16_t z;
			
 
				+};
			
 
				+
			
 
				+[numthreads(64, 1, 1)]
			
 
				+void main(uint3 tid : SV_DispatchThreadId) {
			
 
				+  S sArr[2] = buf.Load<S[2]>(tid.x);
			
 
				+  buf2.Store<S[2]>(tid.x, sArr);
			
 
				+}
			
 
				+
			
 
				+// Note: the following indices are taken from the DXIL compilation:
			
 
				+//
			
 
				+//
			
 
				+//                           // sArr[0] starts
			
 
				+//
			
 
				+//  %3 = 0                    // a[0] starts at byte offset 0
			
 
				+//  %8 = add i32 %3, 6        // a[1] starts at byte offset 6
			
 
				+// %13 = add i32 %3, 12       // a[2] starts at byte offset 12
			
 
				+//                            // since the next member is a 'double' it does not
			
 
				+//                            // start at offset 18 or 20. It starts at offset 24.
			
 
				+//                            // byte [18-23] inclusive are PADDING.
			
 
				+// %18 = add i32 %3, 24       // c starts at offset 24 (6 words)
			
 
				+// %23 = add i32 %3, 32       // t.x[0] starts at byte offset 32 (8 words)
			
 
				+// %26 = add i32 %3, 34       // t.x[1] starts at byte offset 34
			
 
				+// %29 = add i32 %3, 36       // t.x[2] starts at byte offset 36
			
 
				+// %32 = add i32 %3, 38       // t.x[3] starts at byte offset 38
			
 
				+// %35 = add i32 %3, 40       // t.x[4] starts at byte offset 40
			
 
				+//                            // byte [42-47] inclusive are PADDING.
			
 
				+// %38 = add i32 %3, 48       // b starts at byte offset 48 (12 words)
			
 
				+// %43 = add i32 %3, 56       // d starts at byte offset 56 (14 words)
			
 
				+//                            // even though 'e' is the next struct member,
			
 
				+//                            // it does NOT start at an aligned address (does not start at 64 byte offset).
			
 
				+// %46 = add i32 %3, 58       // e[0].x[0] starts at byte offset 58
			
 
				+// %49 = add i32 %3, 60       // e[0].x[1] starts at byte offset 60
			
 
				+// %52 = add i32 %3, 62       // e[0].x[2] starts at byte offset 62
			
 
				+// %55 = add i32 %3, 64       // e[0].x[3] starts at byte offset 64
			
 
				+// %58 = add i32 %3, 66       // e[0].x[4] starts at byte offset 66
			
 
				+// %61 = add i32 %3, 68       // e[1].x[0] starts at byte offset 68
			
 
				+// %64 = add i32 %3, 70       // e[1].x[1] starts at byte offset 70
			
 
				+// %67 = add i32 %3, 72       // e[1].x[2] starts at byte offset 72
			
 
				+// %70 = add i32 %3, 74       // e[1].x[3] starts at byte offset 74
			
 
				+// %73 = add i32 %3, 76       // e[1].x[4] starts at byte offset 76
			
 
				+//                            // 'f' starts at the next aligned address
			
 
				+//                            // byte [78-79] inclusive are PADDING
			
 
				+// %76 = add i32 %3, 80       // f[0].v[0] starts at byte offset 80 (20 words)
			
 
				+// %79 = add i32 %3, 82       // f[0].v[1] starts at byte offset 82
			
 
				+// %82 = add i32 %3, 84       // f[0].v[2] starts at byte offset 84
			
 
				+//                            // byte [86-87] inclusive are PADDING
			
 
				+// %85 = add i32 %3, 88       // f[0].w starts at byte offset 88 (22 words)
			
 
				+// %88 = add i32 %3, 92       // f[1].v[0] starts at byte offset 92
			
 
				+// %91 = add i32 %3, 94       // f[1].v[1] starts at byte offset 94
			
 
				+// %94 = add i32 %3, 96       // f[1].v[2] starts at byte offset 96
			
 
				+//                            // byte [98-99] inclusive are PADDING
			
 
				+// %97 = add i32 %3, 100      // f[1].w starts at byte offset 100 (25 words)
			
 
				+// %100 = add i32 %3, 104     // z starts at byte offset 104 (26 words)
			
 
				+//
			
 
				+//                           // sArr[1] starts
			
 
				+//
			
 
				+//                           // byte [106-111] inclusive are PADDING
			
 
				+//
			
 
				+//                           // ALL the following offsets are similar to offsets
			
 
				+//                           // of sArr[0], shifted by 112 bytes.
			
 
				+//
			
 
				+// %103 = add i32 %3, 112
			
 
				+// %108 = add i32 %3, 118
			
 
				+// %113 = add i32 %3, 124
			
 
				+// %118 = add i32 %3, 136
			
 
				+// %123 = add i32 %3, 144
			
 
				+// %126 = add i32 %3, 146
			
 
				+// %129 = add i32 %3, 148
			
 
				+// %132 = add i32 %3, 150
			
 
				+// %135 = add i32 %3, 152
			
 
				+// %138 = add i32 %3, 160
			
 
				+// %143 = add i32 %3, 168
			
 
				+// %146 = add i32 %3, 170
			
 
				+// %149 = add i32 %3, 172
			
 
				+// %152 = add i32 %3, 174
			
 
				+// %155 = add i32 %3, 176
			
 
				+// %158 = add i32 %3, 178
			
 
				+// %161 = add i32 %3, 180
			
 
				+// %164 = add i32 %3, 182
			
 
				+// %167 = add i32 %3, 184
			
 
				+// %170 = add i32 %3, 186
			
 
				+// %173 = add i32 %3, 188
			
 
				+// %176 = add i32 %3, 192
			
 
				+// %179 = add i32 %3, 194
			
 
				+// %182 = add i32 %3, 196
			
 
				+// %185 = add i32 %3, 200
			
 
				+// %188 = add i32 %3, 204
			
 
				+// %191 = add i32 %3, 206
			
 
				+// %194 = add i32 %3, 208
			
 
				+// %197 = add i32 %3, 212
			
 
				+// %200 = add i32 %3, 216
			
 
				+
			
 
				+// Initialization of sArr array.
			
 
				+// CHECK: OpStore %sArr {{%\d+}}
			
 
				+//
			
 
				+// Check for templated 'Store' method.
			
 
				+//
			
 
				+// CHECK:          [[tidx_ptr:%\d+]] = OpAccessChain %_ptr_Function_uint %tid %int_0
			
 
				+// CHECK:              [[tidx:%\d+]] = OpLoad %uint [[tidx_ptr]]
			
 
				+// CHECK:          [[address0:%\d+]] = OpShiftRightLogical %uint [[tidx]] %uint_2
			
 
				+// CHECK:              [[sArr:%\d+]] = OpLoad %_arr_S_uint_2 %sArr
			
 
				+// CHECK:                [[s0:%\d+]] = OpCompositeExtract %S [[sArr]] 0
			
 
				+// CHECK:                [[s1:%\d+]] = OpCompositeExtract %S [[sArr]] 1
			
 
				+// CHECK:                 [[a:%\d+]] = OpCompositeExtract %_arr_v3half_uint_3 [[s0]] 0
			
 
				+// CHECK:                [[a0:%\d+]] = OpCompositeExtract %v3half [[a]] 0
			
 
				+// CHECK:                [[a1:%\d+]] = OpCompositeExtract %v3half [[a]] 1
			
 
				+// CHECK:                [[a2:%\d+]] = OpCompositeExtract %v3half [[a]] 2
			
 
				+// CHECK:               [[a00:%\d+]] = OpCompositeExtract %half [[a0]] 0
			
 
				+// CHECK:               [[a01:%\d+]] = OpCompositeExtract %half [[a0]] 1
			
 
				+// CHECK:               [[a02:%\d+]] = OpCompositeExtract %half [[a0]] 2
			
 
				+// CHECK:               [[a10:%\d+]] = OpCompositeExtract %half [[a1]] 0
			
 
				+// CHECK:               [[a11:%\d+]] = OpCompositeExtract %half [[a1]] 1
			
 
				+// CHECK:               [[a12:%\d+]] = OpCompositeExtract %half [[a1]] 2
			
 
				+// CHECK:               [[a20:%\d+]] = OpCompositeExtract %half [[a2]] 0
			
 
				+// CHECK:               [[a21:%\d+]] = OpCompositeExtract %half [[a2]] 1
			
 
				+// CHECK:               [[a22:%\d+]] = OpCompositeExtract %half [[a2]] 2
			
 
				+// CHECK:         [[a00_16bit:%\d+]] = OpBitcast %ushort [[a00]]
			
 
				+// CHECK:         [[a00_32bit:%\d+]] = OpUConvert %uint [[a00_16bit]]
			
 
				+// CHECK:         [[a01_16bit:%\d+]] = OpBitcast %ushort [[a01]]
			
 
				+// CHECK:         [[a01_32bit:%\d+]] = OpUConvert %uint [[a01_16bit]]
			
 
				+// CHECK: [[a01_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a01_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a00_32bit]] [[a01_32bit_shifted]]
			
 
				+// CHECK:              [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address0]]
			
 
				+// CHECK:                             OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:          [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
			
 
				+// CHECK:         [[a02_16bit:%\d+]] = OpBitcast %ushort [[a02]]
			
 
				+// CHECK:         [[a02_32bit:%\d+]] = OpUConvert %uint [[a02_16bit]]
			
 
				+// CHECK:         [[a10_16bit:%\d+]] = OpBitcast %ushort [[a10]]
			
 
				+// CHECK:         [[a10_32bit:%\d+]] = OpUConvert %uint [[a10_16bit]]
			
 
				+// CHECK: [[a10_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a10_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a02_32bit]] [[a10_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address1]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:          [[address2:%\d+]] = OpIAdd %uint [[address1]] %uint_1
			
 
				+// CHECK:         [[a11_16bit:%\d+]] = OpBitcast %ushort [[a11]]
			
 
				+// CHECK:         [[a11_32bit:%\d+]] = OpUConvert %uint [[a11_16bit]]
			
 
				+// CHECK:         [[a12_16bit:%\d+]] = OpBitcast %ushort [[a12]]
			
 
				+// CHECK:         [[a12_32bit:%\d+]] = OpUConvert %uint [[a12_16bit]]
			
 
				+// CHECK: [[a12_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a12_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a11_32bit]] [[a12_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address2]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:          [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
			
 
				+// CHECK:         [[a20_16bit:%\d+]] = OpBitcast %ushort [[a20]]
			
 
				+// CHECK:         [[a20_32bit:%\d+]] = OpUConvert %uint [[a20_16bit]]
			
 
				+// CHECK:         [[a21_16bit:%\d+]] = OpBitcast %ushort [[a21]]
			
 
				+// CHECK:         [[a21_32bit:%\d+]] = OpUConvert %uint [[a21_16bit]]
			
 
				+// CHECK: [[a21_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a21_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a20_32bit]] [[a21_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address3]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:          [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
			
 
				+// CHECK:         [[a22_16bit:%\d+]] = OpBitcast %ushort [[a22]]
			
 
				+// CHECK:         [[a22_32bit:%\d+]] = OpUConvert %uint [[a22_16bit]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address4]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[a22_32bit]]
			
 
				+
			
 
				+//
			
 
				+// The second member of S starts at byte offset 24 (6 words)
			
 
				+//
			
 
				+// CHECK: [[address6:%\d+]] = OpIAdd %uint [[address0]] %uint_6
			
 
				+//
			
 
				+// CHECK:             [[c:%\d+]] = OpCompositeExtract %double [[s0]] 1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address6]]
			
 
				+// CHECK:         [[c_u64:%\d+]] = OpBitcast %ulong [[c]]
			
 
				+// CHECK:       [[c_word0:%\d+]] = OpUConvert %uint [[c_u64]]
			
 
				+// CHECK: [[c_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[c_u64]] %uint_32
			
 
				+// CHECK:       [[c_word1:%\d+]] = OpUConvert %uint [[c_u64_shifted]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[c_word0]]
			
 
				+// CHECK:      [[address7:%\d+]] = OpIAdd %uint [[address6]] %uint_1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address7]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[c_word1]]
			
 
				+
			
 
				+//
			
 
				+// The third member of S starts at byte offset 32 (8 words)
			
 
				+//
			
 
				+// CHECK: [[address8:%\d+]] = OpIAdd %uint [[address0]] %uint_8
			
 
				+//
			
 
				+// CHECK:              [[t:%\d+]] = OpCompositeExtract %T [[s0]] 2
			
 
				+// CHECK:              [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[t]] 0
			
 
				+// CHECK:             [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
			
 
				+// CHECK:             [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
			
 
				+// CHECK:             [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
			
 
				+// CHECK:             [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
			
 
				+// CHECK:             [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
			
 
				+// CHECK:         [[x0_u16:%\d+]] = OpBitcast %ushort [[x0]]
			
 
				+// CHECK:         [[x0_u32:%\d+]] = OpUConvert %uint [[x0_u16]]
			
 
				+// CHECK:         [[x1_u16:%\d+]] = OpBitcast %ushort [[x1]]
			
 
				+// CHECK:         [[x1_u32:%\d+]] = OpUConvert %uint [[x1_u16]]
			
 
				+// CHECK: [[x1_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1_u32]] %uint_16
			
 
				+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x0_u32]] [[x1_u32_shifted]]
			
 
				+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address8]]
			
 
				+// CHECK:                           OpStore [[ptr]] [[word]]
			
 
				+// CHECK:       [[address9:%\d+]] = OpIAdd %uint [[address8]] %uint_1
			
 
				+// CHECK:         [[x2_u16:%\d+]] = OpBitcast %ushort [[x2]]
			
 
				+// CHECK:         [[x2_u32:%\d+]] = OpUConvert %uint [[x2_u16]]
			
 
				+// CHECK:         [[x3_u16:%\d+]] = OpBitcast %ushort [[x3]]
			
 
				+// CHECK:         [[x3_u32:%\d+]] = OpUConvert %uint [[x3_u16]]
			
 
				+// CHECK: [[x3_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3_u32]] %uint_16
			
 
				+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x2_u32]] [[x3_u32_shifted]]
			
 
				+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address9]]
			
 
				+// CHECK:                           OpStore [[ptr]] [[word]]
			
 
				+// CHECK:      [[address10:%\d+]] = OpIAdd %uint [[address9]] %uint_1
			
 
				+// CHECK:         [[x4_u16:%\d+]] = OpBitcast %ushort [[x4]]
			
 
				+// CHECK:         [[x4_u32:%\d+]] = OpUConvert %uint [[x4_u16]]
			
 
				+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address10]]
			
 
				+// CHECK:                           OpStore [[ptr]] [[x4_u32]]
			
 
				+
			
 
				+//
			
 
				+// The fourth member of S starts at byte offset 48 (12 words)
			
 
				+//
			
 
				+// CHECK: [[address12:%\d+]] = OpIAdd %uint [[address0]] %uint_12
			
 
				+//
			
 
				+// CHECK:             [[b:%\d+]] = OpCompositeExtract %double [[s0]] 3
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address12]]
			
 
				+// CHECK:         [[b_u64:%\d+]] = OpBitcast %ulong [[b]]
			
 
				+// CHECK:       [[b_word0:%\d+]] = OpUConvert %uint [[b_u64]]
			
 
				+// CHECK: [[b_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[b_u64]] %uint_32
			
 
				+// CHECK:       [[b_word1:%\d+]] = OpUConvert %uint [[b_u64_shifted]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[b_word0]]
			
 
				+// CHECK:     [[address13:%\d+]] = OpIAdd %uint [[address12]] %uint_1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address13]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[b_word1]]
			
 
				+
			
 
				+//
			
 
				+// The fifth member of S starts at byte offset 56 (14 words)
			
 
				+//
			
 
				+// CHECK: [[address14:%\d+]] = OpIAdd %uint [[address0]] %uint_14
			
 
				+//
			
 
				+// CHECK:     [[d:%\d+]] = OpCompositeExtract %half [[s0]] 4
			
 
				+// CHECK:   [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address14]]
			
 
				+// CHECK: [[d_u16:%\d+]] = OpBitcast %ushort [[d]]
			
 
				+// CHECK: [[d_u32:%\d+]] = OpUConvert %uint [[d_u16]]
			
 
				+// CHECK:                  OpStore [[ptr]] [[d_u32]]
			
 
				+
			
 
				+//
			
 
				+// The sixth member of S starts at byte offset 58 (14 words + 16bit offset)
			
 
				+// This is an extraordinary case of alignment. Since the sixth member only
			
 
				+// contains fp16, and the fifth member was also fp16, DX packs them tightly.
			
 
				+// As a result, store must occur at non-aligned offset.
			
 
				+// e[0] takes the following byte offsets: 58, 60, 62, 64, 66.
			
 
				+// e[1] takes the following byte offsets: 68, 70, 72, 74, 76.
			
 
				+// (60-64 = index 15. 64-68 = index 16)
			
 
				+// (68-72 = index 17. 72-76 = index 18)
			
 
				+// (76-78 = first half of index 19)
			
 
				+//
			
 
				+// CHECK:     [[address14:%\d+]] = OpIAdd %uint [[address0]] %uint_14
			
 
				+// CHECK:             [[e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[s0]] 5
			
 
				+// CHECK:            [[e0:%\d+]] = OpCompositeExtract %T [[e]] 0
			
 
				+// CHECK:            [[e1:%\d+]] = OpCompositeExtract %T [[e]] 1
			
 
				+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e0]] 0
			
 
				+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
			
 
				+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
			
 
				+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
			
 
				+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
			
 
				+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address14]]
			
 
				+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
			
 
				+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
			
 
				+// CHECK: [[x0u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x0u32]] %uint_16
			
 
				+// CHECK:  [[existingWord:%\d+]] = OpLoad %uint [[ptr]]
			
 
				+// CHECK:       [[newWord:%\d+]] = OpBitwiseOr %uint [[existingWord]] [[x0u32_shifted]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[newWord]]
			
 
				+
			
 
				+// CHECK:     [[address15:%\d+]] = OpIAdd %uint [[address14]] %uint_1
			
 
				+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
			
 
				+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
			
 
				+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
			
 
				+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
			
 
				+// CHECK: [[x2u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x2u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x1u32]] [[x2u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address15]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address16:%\d+]] = OpIAdd %uint [[address15]] %uint_1
			
 
				+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
			
 
				+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
			
 
				+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
			
 
				+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
			
 
				+// CHECK: [[x4u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x4u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x3u32]] [[x4u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address16]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address17:%\d+]] = OpIAdd %uint [[address14]] %uint_3
			
 
				+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e1]] 0
			
 
				+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
			
 
				+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
			
 
				+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
			
 
				+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
			
 
				+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
			
 
				+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
			
 
				+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
			
 
				+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
			
 
				+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
			
 
				+// CHECK: [[x1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x0u32]] [[x1u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address17]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address18:%\d+]] = OpIAdd %uint [[address17]] %uint_1
			
 
				+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
			
 
				+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
			
 
				+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
			
 
				+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
			
 
				+// CHECK: [[x3u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x2u32]] [[x3u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address18]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address19:%\d+]] = OpIAdd %uint [[address18]] %uint_1
			
 
				+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
			
 
				+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address19]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[x4u32]]
			
 
				+
			
 
				+//
			
 
				+// The seventh member of S starts at byte offset 80 (20 words), so:
			
 
				+// for f[0]:
			
 
				+// v should start at byte offset 80 (20 words)
			
 
				+// w should start at byte offset 88 (22 words)
			
 
				+// for f[1]:
			
 
				+// v should start at byte offset 92 (23 words)
			
 
				+// w should start at byte offset 100 (25 words)
			
 
				+//
			
 
				+// CHECK:     [[address20:%\d+]] = OpIAdd %uint [[address0]] %uint_20
			
 
				+// CHECK:             [[f:%\d+]] = OpCompositeExtract %_arr_U_uint_2 [[s0]] 6
			
 
				+// CHECK:            [[u0:%\d+]] = OpCompositeExtract %U [[f]] 0
			
 
				+// CHECK:            [[u1:%\d+]] = OpCompositeExtract %U [[f]] 1
			
 
				+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u0]] 0
			
 
				+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
			
 
				+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
			
 
				+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
			
 
				+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
			
 
				+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
			
 
				+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
			
 
				+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
			
 
				+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address20]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK: [[address21:%\d+]] = OpIAdd %uint [[address20]] %uint_1
			
 
				+// CHECK:     [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
			
 
				+// CHECK:     [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
			
 
				+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address21]]
			
 
				+// CHECK:                      OpStore [[ptr]] [[v2u32]]
			
 
				+
			
 
				+// CHECK: [[address22:%\d+]] = OpIAdd %uint [[address20]] %uint_2
			
 
				+// CHECK:         [[w:%\d+]] = OpCompositeExtract %uint [[u0]] 1
			
 
				+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address22]]
			
 
				+// CHECK:                      OpStore [[ptr]] [[w]]
			
 
				+
			
 
				+// CHECK:     [[address23:%\d+]] = OpIAdd %uint [[address20]] %uint_3
			
 
				+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u1]] 0
			
 
				+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
			
 
				+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
			
 
				+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
			
 
				+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
			
 
				+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
			
 
				+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
			
 
				+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
			
 
				+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address23]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address24:%\d+]] = OpIAdd %uint [[address23]] %uint_1
			
 
				+// CHECK:         [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
			
 
				+// CHECK:         [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address24]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[v2u32]]
			
 
				+
			
 
				+// CHECK:     [[address25:%\d+]] = OpIAdd %uint [[address23]] %uint_2
			
 
				+// CHECK:             [[w:%\d+]] = OpCompositeExtract %uint [[u1]] 1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address25]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[w]]
			
 
				+
			
 
				+//
			
 
				+// The eighth member of S starts at byte offset 104 (26 words)
			
 
				+//
			
 
				+// CHECK: [[address26:%\d+]] = OpIAdd %uint [[address0]] %uint_26
			
 
				+// CHECK:         [[z:%\d+]] = OpCompositeExtract %half [[s0]] 7
			
 
				+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address26]]
			
 
				+// CHECK:      [[zu16:%\d+]] = OpBitcast %ushort [[z]]
			
 
				+// CHECK:      [[zu32:%\d+]] = OpUConvert %uint [[zu16]]
			
 
				+// CHECK:                      OpStore [[ptr]] [[zu32]]
			
 
				+
			
 
				+///////////////////////////////////////////////////////////////////////////////
			
 
				+///////////////////////////////////////////////////////////////////////////////
			
 
				+//
			
 
				+//
			
 
				+// We have an array of S structures (sArr). The second member (sArr[1]) should
			
 
				+// start at an aligned address. A structure's alignment is the maximum alignment
			
 
				+// of its members.
			
 
				+// In this example, sArr[1] should start at byte offset 112 (28 words)
			
 
				+// It should *NOT* start at byte offset 108 (27 words).
			
 
				+//
			
 
				+//
			
 
				+// CHECK: [[address28:%\d+]] = OpIAdd %uint [[address0]] %uint_28
			
 
				+//
			
 
				+// CHECK:                 [[a:%\d+]] = OpCompositeExtract %_arr_v3half_uint_3 [[s1]] 0
			
 
				+// CHECK:                [[a0:%\d+]] = OpCompositeExtract %v3half [[a]] 0
			
 
				+// CHECK:                [[a1:%\d+]] = OpCompositeExtract %v3half [[a]] 1
			
 
				+// CHECK:                [[a2:%\d+]] = OpCompositeExtract %v3half [[a]] 2
			
 
				+// CHECK:               [[a00:%\d+]] = OpCompositeExtract %half [[a0]] 0
			
 
				+// CHECK:               [[a01:%\d+]] = OpCompositeExtract %half [[a0]] 1
			
 
				+// CHECK:               [[a02:%\d+]] = OpCompositeExtract %half [[a0]] 2
			
 
				+// CHECK:               [[a10:%\d+]] = OpCompositeExtract %half [[a1]] 0
			
 
				+// CHECK:               [[a11:%\d+]] = OpCompositeExtract %half [[a1]] 1
			
 
				+// CHECK:               [[a12:%\d+]] = OpCompositeExtract %half [[a1]] 2
			
 
				+// CHECK:               [[a20:%\d+]] = OpCompositeExtract %half [[a2]] 0
			
 
				+// CHECK:               [[a21:%\d+]] = OpCompositeExtract %half [[a2]] 1
			
 
				+// CHECK:               [[a22:%\d+]] = OpCompositeExtract %half [[a2]] 2
			
 
				+// CHECK:         [[a00_16bit:%\d+]] = OpBitcast %ushort [[a00]]
			
 
				+// CHECK:         [[a00_32bit:%\d+]] = OpUConvert %uint [[a00_16bit]]
			
 
				+// CHECK:         [[a01_16bit:%\d+]] = OpBitcast %ushort [[a01]]
			
 
				+// CHECK:         [[a01_32bit:%\d+]] = OpUConvert %uint [[a01_16bit]]
			
 
				+// CHECK: [[a01_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a01_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a00_32bit]] [[a01_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address28]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:         [[address29:%\d+]] = OpIAdd %uint [[address28]] %uint_1
			
 
				+// CHECK:         [[a02_16bit:%\d+]] = OpBitcast %ushort [[a02]]
			
 
				+// CHECK:         [[a02_32bit:%\d+]] = OpUConvert %uint [[a02_16bit]]
			
 
				+// CHECK:         [[a10_16bit:%\d+]] = OpBitcast %ushort [[a10]]
			
 
				+// CHECK:         [[a10_32bit:%\d+]] = OpUConvert %uint [[a10_16bit]]
			
 
				+// CHECK: [[a10_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a10_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a02_32bit]] [[a10_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address29]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:         [[address30:%\d+]] = OpIAdd %uint [[address29]] %uint_1
			
 
				+// CHECK:         [[a11_16bit:%\d+]] = OpBitcast %ushort [[a11]]
			
 
				+// CHECK:         [[a11_32bit:%\d+]] = OpUConvert %uint [[a11_16bit]]
			
 
				+// CHECK:         [[a12_16bit:%\d+]] = OpBitcast %ushort [[a12]]
			
 
				+// CHECK:         [[a12_32bit:%\d+]] = OpUConvert %uint [[a12_16bit]]
			
 
				+// CHECK: [[a12_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a12_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a11_32bit]] [[a12_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address30]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:         [[address31:%\d+]] = OpIAdd %uint [[address30]] %uint_1
			
 
				+// CHECK:         [[a20_16bit:%\d+]] = OpBitcast %ushort [[a20]]
			
 
				+// CHECK:         [[a20_32bit:%\d+]] = OpUConvert %uint [[a20_16bit]]
			
 
				+// CHECK:         [[a21_16bit:%\d+]] = OpBitcast %ushort [[a21]]
			
 
				+// CHECK:         [[a21_32bit:%\d+]] = OpUConvert %uint [[a21_16bit]]
			
 
				+// CHECK: [[a21_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a21_32bit]] %uint_16
			
 
				+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a20_32bit]] [[a21_32bit_shifted]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address31]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[word]]
			
 
				+// CHECK:         [[address32:%\d+]] = OpIAdd %uint [[address31]] %uint_1
			
 
				+// CHECK:         [[a22_16bit:%\d+]] = OpBitcast %ushort [[a22]]
			
 
				+// CHECK:         [[a22_32bit:%\d+]] = OpUConvert %uint [[a22_16bit]]
			
 
				+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address32]]
			
 
				+// CHECK:                              OpStore [[ptr]] [[a22_32bit]]
			
 
				+
			
 
				+//
			
 
				+// The second member of S starts at byte offset 24 (6 words)
			
 
				+//
			
 
				+// CHECK: [[address34:%\d+]] = OpIAdd %uint [[address28]] %uint_6
			
 
				+//
			
 
				+// CHECK:             [[c:%\d+]] = OpCompositeExtract %double [[s1]] 1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address34]]
			
 
				+// CHECK:         [[c_u64:%\d+]] = OpBitcast %ulong [[c]]
			
 
				+// CHECK:       [[c_word0:%\d+]] = OpUConvert %uint [[c_u64]]
			
 
				+// CHECK: [[c_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[c_u64]] %uint_32
			
 
				+// CHECK:       [[c_word1:%\d+]] = OpUConvert %uint [[c_u64_shifted]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[c_word0]]
			
 
				+// CHECK:     [[address35:%\d+]] = OpIAdd %uint [[address34]] %uint_1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address35]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[c_word1]]
			
 
				+
			
 
				+//
			
 
				+// The third member of S starts at byte offset 32 (8 words)
			
 
				+//
			
 
				+// CHECK: [[address36:%\d+]] = OpIAdd %uint [[address28]] %uint_8
			
 
				+//
			
 
				+// CHECK:              [[t:%\d+]] = OpCompositeExtract %T [[s1]] 2
			
 
				+// CHECK:              [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[t]] 0
			
 
				+// CHECK:             [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
			
 
				+// CHECK:             [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
			
 
				+// CHECK:             [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
			
 
				+// CHECK:             [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
			
 
				+// CHECK:             [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
			
 
				+// CHECK:         [[x0_u16:%\d+]] = OpBitcast %ushort [[x0]]
			
 
				+// CHECK:         [[x0_u32:%\d+]] = OpUConvert %uint [[x0_u16]]
			
 
				+// CHECK:         [[x1_u16:%\d+]] = OpBitcast %ushort [[x1]]
			
 
				+// CHECK:         [[x1_u32:%\d+]] = OpUConvert %uint [[x1_u16]]
			
 
				+// CHECK: [[x1_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1_u32]] %uint_16
			
 
				+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x0_u32]] [[x1_u32_shifted]]
			
 
				+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address36]]
			
 
				+// CHECK:                           OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:      [[address37:%\d+]] = OpIAdd %uint [[address36]] %uint_1
			
 
				+// CHECK:         [[x2_u16:%\d+]] = OpBitcast %ushort [[x2]]
			
 
				+// CHECK:         [[x2_u32:%\d+]] = OpUConvert %uint [[x2_u16]]
			
 
				+// CHECK:         [[x3_u16:%\d+]] = OpBitcast %ushort [[x3]]
			
 
				+// CHECK:         [[x3_u32:%\d+]] = OpUConvert %uint [[x3_u16]]
			
 
				+// CHECK: [[x3_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3_u32]] %uint_16
			
 
				+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x2_u32]] [[x3_u32_shifted]]
			
 
				+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address37]]
			
 
				+// CHECK:                           OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:      [[address38:%\d+]] = OpIAdd %uint [[address37]] %uint_1
			
 
				+// CHECK:         [[x4_u16:%\d+]] = OpBitcast %ushort [[x4]]
			
 
				+// CHECK:         [[x4_u32:%\d+]] = OpUConvert %uint [[x4_u16]]
			
 
				+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address38]]
			
 
				+// CHECK:                           OpStore [[ptr]] [[x4_u32]]
			
 
				+
			
 
				+//
			
 
				+// The fourth member of S starts at byte offset 48 (12 words)
			
 
				+//
			
 
				+// CHECK: [[address40:%\d+]] = OpIAdd %uint [[address28]] %uint_12
			
 
				+//
			
 
				+// CHECK:             [[b:%\d+]] = OpCompositeExtract %double [[s1]] 3
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address40]]
			
 
				+// CHECK:         [[b_u64:%\d+]] = OpBitcast %ulong [[b]]
			
 
				+// CHECK:       [[b_word0:%\d+]] = OpUConvert %uint [[b_u64]]
			
 
				+// CHECK: [[b_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[b_u64]] %uint_32
			
 
				+// CHECK:       [[b_word1:%\d+]] = OpUConvert %uint [[b_u64_shifted]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[b_word0]]
			
 
				+// CHECK:     [[address41:%\d+]] = OpIAdd %uint [[address40]] %uint_1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address41]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[b_word1]]
			
 
				+
			
 
				+//
			
 
				+// The fifth member of S starts at byte offset 56 (14 words)
			
 
				+//
			
 
				+// CHECK: [[address42:%\d+]] = OpIAdd %uint [[address28]] %uint_14
			
 
				+//
			
 
				+// CHECK:     [[d:%\d+]] = OpCompositeExtract %half [[s1]] 4
			
 
				+// CHECK:   [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address42]]
			
 
				+// CHECK: [[d_u16:%\d+]] = OpBitcast %ushort [[d]]
			
 
				+// CHECK: [[d_u32:%\d+]] = OpUConvert %uint [[d_u16]]
			
 
				+// CHECK:                  OpStore [[ptr]] [[d_u32]]
			
 
				+
			
 
				+//
			
 
				+// The sixth member of S starts at byte offset 58 (14 words + 16bit offset)
			
 
				+// This is an extraordinary case of alignment. Since the sixth member only
			
 
				+// contains fp16, and the fifth member was also fp16, DX packs them tightly.
			
 
				+// As a result, store must occur at non-aligned offset.
			
 
				+// e[0] takes the following byte offsets: 58, 60, 62, 64, 66.
			
 
				+// e[1] takes the following byte offsets: 68, 70, 72, 74, 76.
			
 
				+// (60-64 = index 15. 64-68 = index 16)
			
 
				+// (68-72 = index 17. 72-76 = index 18)
			
 
				+// (76-78 = first half of index 19)
			
 
				+//
			
 
				+// CHECK:     [[address42:%\d+]] = OpIAdd %uint [[address28]] %uint_14
			
 
				+// CHECK:             [[e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[s1]] 5
			
 
				+// CHECK:            [[e0:%\d+]] = OpCompositeExtract %T [[e]] 0
			
 
				+// CHECK:            [[e1:%\d+]] = OpCompositeExtract %T [[e]] 1
			
 
				+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e0]] 0
			
 
				+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
			
 
				+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
			
 
				+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
			
 
				+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
			
 
				+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address42]]
			
 
				+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
			
 
				+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
			
 
				+// CHECK: [[x0u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x0u32]] %uint_16
			
 
				+// CHECK:  [[existingWord:%\d+]] = OpLoad %uint [[ptr]]
			
 
				+// CHECK:       [[newWord:%\d+]] = OpBitwiseOr %uint [[existingWord]] [[x0u32_shifted]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[newWord]]
			
 
				+
			
 
				+// CHECK:     [[address43:%\d+]] = OpIAdd %uint [[address42]] %uint_1
			
 
				+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
			
 
				+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
			
 
				+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
			
 
				+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
			
 
				+// CHECK: [[x2u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x2u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x1u32]] [[x2u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address43]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address44:%\d+]] = OpIAdd %uint [[address43]] %uint_1
			
 
				+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
			
 
				+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
			
 
				+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
			
 
				+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
			
 
				+// CHECK: [[x4u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x4u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x3u32]] [[x4u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address44]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address45:%\d+]] = OpIAdd %uint [[address42]] %uint_3
			
 
				+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e1]] 0
			
 
				+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
			
 
				+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
			
 
				+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
			
 
				+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
			
 
				+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
			
 
				+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
			
 
				+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
			
 
				+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
			
 
				+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
			
 
				+// CHECK: [[x1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x0u32]] [[x1u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address45]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address46:%\d+]] = OpIAdd %uint [[address45]] %uint_1
			
 
				+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
			
 
				+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
			
 
				+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
			
 
				+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
			
 
				+// CHECK: [[x3u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x2u32]] [[x3u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address46]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address47:%\d+]] = OpIAdd %uint [[address46]] %uint_1
			
 
				+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
			
 
				+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address47]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[x4u32]]
			
 
				+
			
 
				+//
			
 
				+// The seventh member of S starts at byte offset 80 (20 words), so:
			
 
				+// for f[0]:
			
 
				+// v should start at byte offset 80 (20 words)
			
 
				+// w should start at byte offset 88 (22 words)
			
 
				+// for f[1]:
			
 
				+// v should start at byte offset 92 (23 words)
			
 
				+// w should start at byte offset 100 (25 words)
			
 
				+//
			
 
				+// CHECK:     [[address48:%\d+]] = OpIAdd %uint [[address28]] %uint_20
			
 
				+// CHECK:             [[f:%\d+]] = OpCompositeExtract %_arr_U_uint_2 [[s1]] 6
			
 
				+// CHECK:            [[u0:%\d+]] = OpCompositeExtract %U [[f]] 0
			
 
				+// CHECK:            [[u1:%\d+]] = OpCompositeExtract %U [[f]] 1
			
 
				+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u0]] 0
			
 
				+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
			
 
				+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
			
 
				+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
			
 
				+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
			
 
				+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
			
 
				+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
			
 
				+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
			
 
				+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address48]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK: [[address49:%\d+]] = OpIAdd %uint [[address48]] %uint_1
			
 
				+// CHECK:     [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
			
 
				+// CHECK:     [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
			
 
				+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address49]]
			
 
				+// CHECK:                      OpStore [[ptr]] [[v2u32]]
			
 
				+
			
 
				+// CHECK: [[address50:%\d+]] = OpIAdd %uint [[address48]] %uint_2
			
 
				+// CHECK:         [[w:%\d+]] = OpCompositeExtract %uint [[u0]] 1
			
 
				+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address50]]
			
 
				+// CHECK:                      OpStore [[ptr]] [[w]]
			
 
				+
			
 
				+// CHECK:     [[address51:%\d+]] = OpIAdd %uint [[address48]] %uint_3
			
 
				+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u1]] 0
			
 
				+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
			
 
				+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
			
 
				+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
			
 
				+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
			
 
				+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
			
 
				+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
			
 
				+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
			
 
				+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
			
 
				+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address51]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[word]]
			
 
				+
			
 
				+// CHECK:     [[address52:%\d+]] = OpIAdd %uint [[address51]] %uint_1
			
 
				+// CHECK:         [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
			
 
				+// CHECK:         [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address52]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[v2u32]]
			
 
				+
			
 
				+// CHECK:     [[address53:%\d+]] = OpIAdd %uint [[address51]] %uint_2
			
 
				+// CHECK:             [[w:%\d+]] = OpCompositeExtract %uint [[u1]] 1
			
 
				+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address53]]
			
 
				+// CHECK:                          OpStore [[ptr]] [[w]]
			
 
				+
			
 
				+//
			
 
				+// The eighth member of S starts at byte offset 104 (26 words)
			
 
				+//
			
 
				+// CHECK: [[address54:%\d+]] = OpIAdd %uint [[address28]] %uint_26
			
 
				+// CHECK:         [[z:%\d+]] = OpCompositeExtract %half [[s1]] 7
			
 
				+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address54]]
			
 
				+// CHECK:      [[zu16:%\d+]] = OpBitcast %ushort [[z]]
			
 
				+// CHECK:      [[zu32:%\d+]] = OpUConvert %uint [[zu16]]
			
 
				+// CHECK:                      OpStore [[ptr]] [[zu32]]
			
--- a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct2.hlsl
+++ b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct2.hlsl
@@ -0,0 +1,123 @@
 
				+// Run: %dxc -T cs_6_2 -E main -enable-16bit-types -fvk-use-dx-layout
			
 
				+
			
 
				+ByteAddressBuffer buf;  // source of the templated Load in main()
			
 
				+RWByteAddressBuffer buf2;  // destination of the templated Store in main()
			
 
				+
			
 
				+struct T {  // two 16-bit floats, 2 bytes apart under the DX layout (see note below)
			
 
				+  float16_t x[2];
			
 
				+};
			
 
				+
			
 
				+struct S {  // one S is five halves (10 bytes): a, e[0].x[0..1], e[1].x[0..1]
			
 
				+  float16_t a;
			
 
				+  T e[2];
			
 
				+};
			
 
				+
			
 
				+[numthreads(64, 1, 1)]
			
 
				+void main(uint3 tid : SV_DispatchThreadId) {
			
 
				+  S sArr[2] = buf.Load<S[2]>(tid.x);  // read two consecutive S values at byte address tid.x
			
 
				+  buf2.Store<S[2]>(tid.x, sArr);  // write them back at the same address; verified by the FileCheck directives below
			
 
				+}
			
 
				+
			
 
				+// Note: the DX layout tightly packs all members of S and its sub-structures.
			
 
				+// It stores elements at the following byte offsets:
			
 
				+// 0, 2, 4, 6, 8, 10, 12, 14, 16, 18
			
 
				+//
			
 
				+//                              |-----------------------|
			
 
				+// address 0:                   |     a     | e[0].x[0] |
			
 
				+//                              |-----------------------|
			
 
				+// address 1 (byte offset 4):   | e[0].x[1] | e[1].x[0] |
			
 
				+//                              |-----------------------|
			
 
				+// address 2 (byte offset 8):   | e[1].x[1] |     a     |
			
 
				+//                              |-----------------------|
			
 
				+// address 3 (byte offset 12):  | e[0].x[0] | e[0].x[1] |
			
 
				+//                              |-----------------------|
			
 
				+// address 4 (byte offset 16):  | e[1].x[0] | e[1].x[1] |
			
 
				+//                              |-----------------------|
			
 
				+//
			
 
				+
			
 
				+// CHECK: OpStore %sArr
			
 
				+// CHECK: OpAccessChain %_ptr_Function_uint %tid %int_0
			
 
				+// CHECK: [[address0:%\d+]] = OpShiftRightLogical %uint {{%\d+}} %uint_2
			
 
				+// CHECK:     [[sArr:%\d+]] = OpLoad %_arr_S_uint_2 %sArr
			
 
				+// CHECK:    [[sArr0:%\d+]] = OpCompositeExtract %S [[sArr]] 0
			
 
				+// CHECK:    [[sArr1:%\d+]] = OpCompositeExtract %S [[sArr]] 1
			
 
				+// CHECK:     [[s0_a:%\d+]] = OpCompositeExtract %half [[sArr0]] 0
			
 
				+// CHECK:     [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address0]]
			
 
				+// CHECK: OpBitcast %ushort
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpStore [[ptr0]]
			
 
				+// CHECK:     [[s0_e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[sArr0]] 1
			
 
				+// CHECK:    [[s0_e0:%\d+]] = OpCompositeExtract %T [[s0_e]] 0
			
 
				+// CHECK:    [[s0_e1:%\d+]] = OpCompositeExtract %T [[s0_e]] 1
			
 
				+// CHECK:  [[s0_e0_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s0_e0]] 0
			
 
				+// CHECK: [[s0_e0_x0:%\d+]] = OpCompositeExtract %half [[s0_e0_x]] 0
			
 
				+// CHECK: [[s0_e0_x1:%\d+]] = OpCompositeExtract %half [[s0_e0_x]] 1
			
 
				+// CHECK:     [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address0]]
			
 
				+// CHECK: OpBitcast %ushort [[s0_e0_x0]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpShiftLeftLogical %uint
			
 
				+// CHECK: [[oldWord0:%\d+]] = OpLoad %uint [[ptr0]]
			
 
				+// CHECK: [[newWord0:%\d+]] = OpBitwiseOr %uint [[oldWord0]] {{%\d+}}
			
 
				+// CHECK:                     OpStore [[ptr0]] [[newWord0]]
			
 
				+// CHECK: [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
			
 
				+// CHECK: OpBitcast %ushort [[s0_e0_x1]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK:     [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address1]]
			
 
				+// CHECK:                     OpStore [[ptr1]] {{%\d+}}
			
 
				+// CHECK: [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
			
 
				+// CHECK:  [[s0_e1_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s0_e1]] 0
			
 
				+// CHECK: [[s0_e1_x0:%\d+]] = OpCompositeExtract %half [[s0_e1_x]] 0
			
 
				+// CHECK: [[s0_e1_x1:%\d+]] = OpCompositeExtract %half [[s0_e1_x]] 1
			
 
				+// CHECK:     [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address1]]
			
 
				+// CHECK: OpBitcast %ushort [[s0_e1_x0]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
			
 
				+// CHECK: [[oldWord1:%\d+]] = OpLoad %uint [[ptr1]]
			
 
				+// CHECK: [[newWord1:%\d+]] = OpBitwiseOr %uint [[oldWord1]] {{%\d+}}
			
 
				+// CHECK:                     OpStore [[ptr1]] [[newWord1]]
			
 
				+
			
 
				+// CHECK: [[address2:%\d+]] = OpIAdd %uint [[address1]] %uint_1
			
 
				+// CHECK: OpBitcast %ushort [[s0_e1_x1]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK:     [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address2]]
			
 
				+// CHECK:                     OpStore [[ptr2]] {{%\d+}}
			
 
				+// CHECK: [[address2:%\d+]] = OpIAdd %uint [[address0]] %uint_2
			
 
				+// CHECK:     [[s1_a:%\d+]] = OpCompositeExtract %half [[sArr1]] 0
			
 
				+// CHECK:     [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address2]]
			
 
				+// CHECK: OpBitcast %ushort [[s1_a]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
			
 
				+// CHECK: [[oldWord2:%\d+]] = OpLoad %uint [[ptr2]]
			
 
				+// CHECK: [[newWord2:%\d+]] = OpBitwiseOr %uint [[oldWord2]] {{%\d+}}
			
 
				+// CHECK:                     OpStore [[ptr2]] [[newWord2]]
			
 
				+
			
 
				+// CHECK: [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
			
 
				+// CHECK: [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
			
 
				+// CHECK:     [[s1_e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[sArr1]] 1
			
 
				+// CHECK:    [[s1_e0:%\d+]] = OpCompositeExtract %T [[s1_e]] 0
			
 
				+// CHECK:    [[s1_e1:%\d+]] = OpCompositeExtract %T [[s1_e]] 1
			
 
				+// CHECK:  [[s1_e0_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s1_e0]] 0
			
 
				+// CHECK: [[s1_e0_x0:%\d+]] = OpCompositeExtract %half [[s1_e0_x]] 0
			
 
				+// CHECK: [[s1_e0_x1:%\d+]] = OpCompositeExtract %half [[s1_e0_x]] 1
			
 
				+// CHECK: OpBitcast %ushort [[s1_e0_x0]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpBitcast %ushort [[s1_e0_x1]]
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
			
 
				+// CHECK: OpBitwiseOr %uint
			
 
				+// CHECK: [[ptr3:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address3]]
			
 
				+// CHECK: OpStore [[ptr3]] {{%\d+}}
			
 
				+
			
 
				+// CHECK: [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
			
 
				+// CHECK: [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
			
 
				+// CHECK:  [[s1_e1_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s1_e1]] 0
			
 
				+// CHECK: [[s1_e1_x0:%\d+]] = OpCompositeExtract %half [[s1_e1_x]] 0
			
 
				+// CHECK: [[s1_e1_x1:%\d+]] = OpCompositeExtract %half [[s1_e1_x]] 1
			
 
				+// CHECK: OpBitcast %ushort
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpBitcast %ushort
			
 
				+// CHECK: OpUConvert %uint
			
 
				+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
			
 
				+// CHECK: OpBitwiseOr %uint
			
 
				+// CHECK: [[ptr4:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address4]]
			
 
				+// CHECK: OpStore [[ptr4]] {{%\d+}}
			
--- a/tools/clang/unittests/SPIRV/CodeGenSpirvTest.cpp
+++ b/tools/clang/unittests/SPIRV/CodeGenSpirvTest.cpp
@@ -898,9 +898,18 @@ TEST_F(FileTest, ByteAddressBufferTemplatedLoadStruct) {
 
				 TEST_F(FileTest, ByteAddressBufferTemplatedLoadStruct2) {
			
 
				   runFileTest("method.byte-address-buffer.templated-load.struct2.hlsl");
			
 
				 }
			
 
				+TEST_F(FileTest, ByteAddressBufferTemplatedLoadStruct3) {  // templated Load, struct3 case
			
 
				+  runFileTest("method.byte-address-buffer.templated-load.struct3.hlsl");
			
 
				+}
			
 
				 TEST_F(FileTest, ByteAddressBufferStore) {
			
 
				   runFileTest("method.byte-address-buffer.store.hlsl");
			
 
				 }
			
 
				+TEST_F(FileTest, ByteAddressBufferTemplatedStoreStruct) {  // templated Store, first struct case
			
 
				+  runFileTest("method.byte-address-buffer.templated-store.struct.hlsl");
			
 
				+}
			
 
				+TEST_F(FileTest, ByteAddressBufferTemplatedStoreStruct2) {  // templated Store, struct2 case (S[2] with nested T[2])
			
 
				+  runFileTest("method.byte-address-buffer.templated-store.struct2.hlsl");
			
 
				+}
			
 
				 TEST_F(FileTest, ByteAddressBufferGetDimensions) {
			
 
				   runFileTest("method.byte-address-buffer.get-dimensions.hlsl");
			
 
				 }