Browse Source

[spirv] support ByteAddressBuffer templated Store<T> (#2428)

* [spirv] Remove load methods that cannot be used.

In order to maintain correctness in alignment, there should be no case
when we need to load a 32-bit or 64-bit value at an offset of 16 bits.

* [spirv] ByteAddressBuffer templated Store<T>.

* Remove bitOffset for Store method.

* Fix struct offset bug and add test for it.

* Fix more bugs in struct offset for both load and store and add tests.
Ehsan 6 years ago
parent
commit
b6a7b82644

+ 3 - 0
tools/clang/include/clang/SPIRV/AstTypeProbe.h

@@ -38,6 +38,9 @@ bool isScalarType(QualType type, QualType *scalarType = nullptr);
 bool isVectorType(QualType type, QualType *elemType = nullptr,
                   uint32_t *elemCount = nullptr);
 
+/// Returns true if the given type is an array with constant known size.
+bool isConstantArrayType(const ASTContext &, QualType);
+
 /// Returns true if the given type is enum type based on AST parse.
 bool isEnumType(QualType type);
 

+ 4 - 0
tools/clang/lib/SPIRV/AstTypeProbe.cpp

@@ -152,6 +152,10 @@ bool isVectorType(QualType type, QualType *elemType, uint32_t *elemCount) {
   return isVec;
 }
 
+bool isConstantArrayType(const ASTContext &astContext, QualType type) {
+  // The ASTContext lookup yields a null pointer when |type| is not a
+  // constant-size array.
+  const auto *constArrayType = astContext.getAsConstantArrayType(type);
+  return constArrayType != nullptr;
+}
+
 bool isEnumType(QualType type) {
   if (isa<EnumType>(type.getTypePtr()))
     return true;

+ 398 - 155
tools/clang/lib/SPIRV/RawBufferMethods.cpp

@@ -33,7 +33,7 @@ RawBufferHandler::bitCastToNumericalOrBool(SpirvInstruction *instr,
   if (isSameType(astContext, fromType, toType))
     return instr;
 
-  if (toType->isBooleanType())
+  if (toType->isBooleanType() || fromType->isBooleanType())
     return theEmitter.castToType(instr, fromType, toType, loc);
 
   // Perform a bitcast
@@ -185,148 +185,6 @@ SpirvInstruction *RawBufferHandler::load16BitsAtBitOffset16(
   return result;
 }
 
-SpirvInstruction *RawBufferHandler::load32BitsAtBitOffset16(
-    SpirvInstruction *buffer, SpirvInstruction *&index,
-    QualType target32BitType, uint32_t &bitOffset) {
-  assert(bitOffset == 16);
-  const auto loc = buffer->getSourceLocation();
-  SpirvInstruction *result = nullptr;
-  SpirvInstruction *ptr = nullptr;
-  auto *constUint0 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
-  auto *constUint1 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
-  auto *constUint16 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
-
-  // The underlying element type of the ByteAddressBuffer is uint. Since the
-  // bitOffset is not zero, we need to perform two load operations.
-
-  // Load the first 32-bit uint. Only its 16 MSBs matter.
-  // The 16 MSBs of the loaded value becomes the 16 LSBs of the result.
-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
-                                     {constUint0, index}, loc);
-  SpirvInstruction *lsb =
-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
-
-  // Right shift by 16 bits leaves the upper 16 bits as 0.
-  lsb = spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical,
-                                  astContext.UnsignedIntTy, lsb, constUint16,
-                                  loc);
-
-  // Increment the base index
-  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
-                                    index, constUint1, loc);
-
-  // Load the second 32-bit uint. Only its 16 LSBs matter.
-  // The 16 LSBs of the loaded value becomes the 16 MSBs of the result.
-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
-                                     {constUint0, index}, loc);
-  SpirvInstruction *msb =
-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
-
-  // Left shift by 16 bits leaves the lower 16 bits as 0.
-  msb = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
-                                  astContext.UnsignedIntTy, msb, constUint16,
-                                  loc);
-
-  // Bitwise Or the MSBs and LSBs to get the resulting 32-bit value.
-  result = spvBuilder.createBinaryOp(spv::Op::OpBitwiseOr,
-                                     astContext.UnsignedIntTy, lsb, msb, loc);
-
-  result = bitCastToNumericalOrBool(result, astContext.UnsignedIntTy,
-                                    target32BitType, loc);
-  result->setRValue();
-
-  // Now that a 32-bit load at bit-offset 16 has been performed, the next load
-  // should be done at *the next base index* at bit-offset 16.
-  // The base index has already been incremented.
-  bitOffset = (bitOffset + 32) % 32;
-
-  return result;
-}
-
-SpirvInstruction *RawBufferHandler::load64BitsAtBitOffset16(
-    SpirvInstruction *buffer, SpirvInstruction *&index,
-    QualType target64BitType, uint32_t &bitOffset) {
-  assert(bitOffset == 16);
-  const auto loc = buffer->getSourceLocation();
-  SpirvInstruction *result = nullptr;
-  SpirvInstruction *ptr = nullptr;
-  auto *constUint0 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
-  auto *constUint1 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
-  auto *constUint16 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
-  auto *constUint48 =
-      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 48));
-
-  // The underlying element type of the ByteAddressBuffer is uint. Since the
-  // bitOffset is 16, we need to perform three load operations.
-  // Use 16 bits from the first load, all the 32 bits from the second load, and
-  // 16 bits from the third load.
-
-  // Load the first 32-bit uint. Only its 16 MSBs matter.
-  // Right shift by 16 bits leaves the upper 16 bits as 0.
-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
-                                     {constUint0, index}, loc);
-  SpirvInstruction *first16 =
-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
-
-  // Incremenet the index and load a 32-bit uint.
-  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
-                                    index, constUint1, loc);
-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
-                                     {constUint0, index}, loc);
-  SpirvInstruction *middle32 =
-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
-
-  // Incremenet the index and load a 32-bit uint. Only its 16 LSBs matter.
-  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
-                                    index, constUint1, loc);
-  ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
-                                     {constUint0, index}, loc);
-  SpirvInstruction *last16 =
-      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
-
-  // Convert all parts to 64 bits
-  first16 = spvBuilder.createUnaryOp(
-      spv::Op::OpUConvert, astContext.UnsignedLongLongTy, first16, loc);
-  middle32 = spvBuilder.createUnaryOp(
-      spv::Op::OpUConvert, astContext.UnsignedLongLongTy, middle32, loc);
-  last16 = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
-                                    astContext.UnsignedLongLongTy, last16, loc);
-
-  // Perform: (first16 >> 16) | (middle32 << 16) | (last16 << 48)
-  first16 = spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical,
-                                      astContext.UnsignedLongLongTy, first16,
-                                      constUint16, loc);
-  middle32 = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
-                                       astContext.UnsignedLongLongTy, middle32,
-                                       constUint16, loc);
-  last16 = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
-                                     astContext.UnsignedLongLongTy, last16,
-                                     constUint48, loc);
-
-  result = spvBuilder.createBinaryOp(spv::Op::OpBitwiseOr,
-                                     astContext.UnsignedLongLongTy, first16,
-                                     middle32, loc);
-  result = spvBuilder.createBinaryOp(
-      spv::Op::OpBitwiseOr, astContext.UnsignedLongLongTy, result, last16, loc);
-
-  result = bitCastToNumericalOrBool(result, astContext.UnsignedLongLongTy,
-                                    target64BitType, loc);
-  result->setRValue();
-
-  // Now that a 64-bit load at bit-offset 16 has been performed, the next load
-  // should be done at *the base index + 2* at bit-offset 16.
-  // The base index has already been incremented twice.
-  bitOffset = (bitOffset + 64) % 32;
-
-  return result;
-}
-
 SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
     SpirvInstruction *buffer, SpirvInstruction *&index,
     const QualType targetType, uint32_t &bitOffset) {
@@ -372,11 +230,12 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
         return load16BitsAtBitOffset16(buffer, index, targetType, bitOffset);
         break;
       case 32:
-        return load32BitsAtBitOffset16(buffer, index, targetType, bitOffset);
-        break;
       case 64:
-        return load64BitsAtBitOffset16(buffer, index, targetType, bitOffset);
-        break;
+        theEmitter.emitError(
+            "templated buffer load should not result in loading "
+            "32-bit or 64-bit values at bit offset 16",
+            loc);
+        return nullptr;
       default:
         theEmitter.emitError(
             "templated load of ByteAddressBuffer is only implemented for "
@@ -462,7 +321,6 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
   // As a result, there might exist some padding after some struct members.
   if (const auto *structType = targetType->getAs<RecordType>()) {
     const auto *decl = structType->getDecl();
-    assert(bitOffset == 0);
     SpirvInstruction *originalIndex = index;
     uint32_t originalBitOffset = bitOffset;
     llvm::SmallVector<SpirvInstruction *, 4> loadedElems;
@@ -481,15 +339,16 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
           field->getType(), theEmitter.getSpirvOptions().sBufferLayoutRule,
           /*isRowMajor*/ llvm::None, &stride);
       fieldOffsetInBytes = roundToPow2(fieldOffsetInBytes, fieldAlignment);
+      const auto wordOffset =
+          ((originalBitOffset / 8) + fieldOffsetInBytes) / 4;
+      bitOffset = (((originalBitOffset / 8) + fieldOffsetInBytes) % 4) * 8;
 
-      if (fieldOffsetInBytes != 0) {
+      if (wordOffset != 0) {
         // Divide the fieldOffset by 4 to figure out how much to increment the
         // index into the buffer (increment occurs by 32-bit words since the
         // underlying type is an array of uints).
         // The remainder by four tells us the *byte offset* (then multiply by 8
         // to get bit offset).
-        auto wordOffset = fieldOffsetInBytes / 4;
-        bitOffset = (fieldOffsetInBytes % 4) * 8;
         index = spvBuilder.createBinaryOp(
             spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
             spvBuilder.getConstantInt(astContext.UnsignedIntTy,
@@ -512,16 +371,14 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
     // 8 * (4 + 1) = 40
     assert(structAlignment != 0);
     uint32_t newByteOffset = roundToPow2(structSize, structAlignment);
-    uint32_t newWordOffset = newByteOffset / 4;
+    uint32_t newWordOffset = ((originalBitOffset / 8) + newByteOffset) / 4;
+    bitOffset = 8 * (((originalBitOffset / 8) + newByteOffset) % 4);
     index = spvBuilder.createBinaryOp(
         spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
         spvBuilder.getConstantInt(astContext.UnsignedIntTy,
                                   llvm::APInt(32, newWordOffset)),
         loc);
 
-    // New bitOffset should be zero because after loading the struct, we will
-    // be loading at the next aligned address.
-    bitOffset = 0;
     result = spvBuilder.createCompositeConstruct(targetType, loadedElems, loc);
     result->setRValue();
     return result;
@@ -530,5 +387,391 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
   llvm_unreachable("templated buffer load unimplemented for type");
 }
 
+// Emits a store of the 16-bit |value| into the 32-bit word at |index| of
+// |buffer|. The value is zero-extended to 32 bits and written as the whole
+// word, so the upper 16 bits of that word are overwritten with zeros.
+// |index| is left unchanged by this method.
+void RawBufferHandler::store16BitsAtBitOffset0(SpirvInstruction *value,
+                                               SpirvInstruction *buffer,
+                                               SpirvInstruction *&index,
+                                               const QualType valueType) {
+  const auto srcLoc = buffer->getSourceLocation();
+  auto *zero =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
+
+  // The underlying element type of the ByteAddressBuffer is uint, therefore
+  // a full 32-bit word is addressed and written.
+  auto *wordPtr = spvBuilder.createAccessChain(astContext.UnsignedIntTy,
+                                               buffer, {zero, index}, srcLoc);
+
+  // Reinterpret the value as a ushort, then widen it to the storage type.
+  SpirvInstruction *asUshort = bitCastToNumericalOrBool(
+      value, valueType, astContext.UnsignedShortTy, srcLoc);
+  SpirvInstruction *asUint = spvBuilder.createUnaryOp(
+      spv::Op::OpUConvert, astContext.UnsignedIntTy, asUshort, srcLoc);
+
+  spvBuilder.createStore(wordPtr, asUint, srcLoc);
+}
+
+// Emits a store of the 16-bit |value| into the upper half of the 32-bit word
+// at |index| of |buffer|, then advances |index| by one word.
+//
+// The word is read, its upper 16 bits are cleared, and the shifted value is
+// OR-ed in, so the lower 16 bits already in memory are preserved and stale
+// upper bits cannot leak into the stored value.
+void RawBufferHandler::store16BitsAtBitOffset16(SpirvInstruction *value,
+                                               SpirvInstruction *buffer,
+                                               SpirvInstruction *&index,
+                                               const QualType valueType) {
+  const auto loc = buffer->getSourceLocation();
+  auto *constUint0 =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
+  auto *constUint1 =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
+  auto *constUint16 =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
+  // Keeps only the lower 16 bits of the word that is already in memory.
+  auto *constLower16Mask = spvBuilder.getConstantInt(
+      astContext.UnsignedIntTy, llvm::APInt(32, 0xFFFFu));
+
+  // The underlying element type of the ByteAddressBuffer is uint. So we
+  // need to store a 32-bit value.
+  auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
+                                           {constUint0, index}, loc);
+
+  // Move the 16-bit value into the upper half of a 32-bit word.
+  SpirvInstruction *upperHalf = bitCastToNumericalOrBool(
+      value, valueType, astContext.UnsignedShortTy, loc);
+  upperHalf = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
+                                       astContext.UnsignedIntTy, upperHalf, loc);
+  upperHalf = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
+                                        astContext.UnsignedIntTy, upperHalf,
+                                        constUint16, loc);
+
+  // Read-modify-write: clear the upper half of the existing word before
+  // merging; a plain OR would combine the new value with whatever bits were
+  // previously stored there.
+  SpirvInstruction *lowerHalf =
+      spvBuilder.createLoad(astContext.UnsignedIntTy, ptr, loc);
+  lowerHalf = spvBuilder.createBinaryOp(spv::Op::OpBitwiseAnd,
+                                        astContext.UnsignedIntTy, lowerHalf,
+                                        constLower16Mask, loc);
+  SpirvInstruction *word = spvBuilder.createBinaryOp(
+      spv::Op::OpBitwiseOr, astContext.UnsignedIntTy, lowerHalf, upperHalf,
+      loc);
+  spvBuilder.createStore(ptr, word, loc);
+
+  // The word is now complete; the next store starts at the following word.
+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
+                                    index, constUint1, loc);
+}
+
+// Emits a store of the 32-bit |value| into the word at |index| of |buffer|,
+// then advances |index| by one word.
+void RawBufferHandler::store32BitsAtBitOffset0(SpirvInstruction *value,
+                                               SpirvInstruction *buffer,
+                                               SpirvInstruction *&index,
+                                               const QualType valueType) {
+  const auto srcLoc = buffer->getSourceLocation();
+  auto *zero =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
+  auto *one =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
+
+  // The underlying element type of the ByteAddressBuffer is uint, so the
+  // value is converted to uint and written as one whole word.
+  auto *wordPtr = spvBuilder.createAccessChain(astContext.UnsignedIntTy,
+                                               buffer, {zero, index}, srcLoc);
+  SpirvInstruction *asUint = bitCastToNumericalOrBool(
+      value, valueType, astContext.UnsignedIntTy, srcLoc);
+  spvBuilder.createStore(wordPtr, asUint, srcLoc);
+
+  // Step to the next word for whatever is stored afterwards.
+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
+                                    index, one, srcLoc);
+}
+
+// Emits a store of the 64-bit |value| as two consecutive 32-bit words of
+// |buffer| starting at |index|: the least significant word first, then the
+// most significant word. |index| is advanced by two words.
+void RawBufferHandler::store64BitsAtBitOffset0(SpirvInstruction *value,
+                                               SpirvInstruction *buffer,
+                                               SpirvInstruction *&index,
+                                               const QualType valueType) {
+  const auto srcLoc = buffer->getSourceLocation();
+  auto *zero =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
+  auto *one =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
+  auto *thirtyTwo =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 32));
+
+  // The underlying element type of the ByteAddressBuffer is uint, so two
+  // 32-bit words are needed. Address the first one.
+  auto *lowPtr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
+                                              {zero, index}, srcLoc);
+
+  // Work on a uint64_t view of the value.
+  SpirvInstruction *asUint64 = bitCastToNumericalOrBool(
+      value, valueType, astContext.UnsignedLongLongTy, srcLoc);
+
+  // OpUConvert to a narrower type truncates, yielding the low 32 bits.
+  SpirvInstruction *lowWord = spvBuilder.createUnaryOp(
+      spv::Op::OpUConvert, astContext.UnsignedIntTy, asUint64, srcLoc);
+
+  // Shift the high 32 bits down, then truncate the same way.
+  SpirvInstruction *shifted = spvBuilder.createBinaryOp(
+      spv::Op::OpShiftRightLogical, astContext.UnsignedLongLongTy, asUint64,
+      thirtyTwo, srcLoc);
+  SpirvInstruction *highWord = spvBuilder.createUnaryOp(
+      spv::Op::OpUConvert, astContext.UnsignedIntTy, shifted, srcLoc);
+
+  // First word: least significant bits.
+  spvBuilder.createStore(lowPtr, lowWord, srcLoc);
+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
+                                    index, one, srcLoc);
+
+  // Second word: most significant bits.
+  auto *highPtr = spvBuilder.createAccessChain(astContext.UnsignedIntTy,
+                                               buffer, {zero, index}, srcLoc);
+  spvBuilder.createStore(highPtr, highWord, srcLoc);
+  index = spvBuilder.createBinaryOp(spv::Op::OpIAdd, astContext.UnsignedIntTy,
+                                    index, one, srcLoc);
+}
+
+// Emits stores for a flat sequence of scalar |values|, all of type
+// |valueType|, into |buffer| starting at word |index| and bit |bitOffset|.
+//
+// 16-bit elements are packed two per 32-bit word. If the sequence starts at
+// bit offset 16, the first element completes the current word. If a single
+// element is left over at the end, it is written to the lower half of its
+// word (the upper half is written as zero), |bitOffset| becomes 16 and
+// |index| keeps pointing at that word so that a following 16-bit store
+// completes it.
+//
+// 32-bit and 64-bit elements are stored one by one through
+// processTemplatedStoreToBuffer and must start at bit offset 0.
+void RawBufferHandler::storeArrayOfScalars(
+    std::deque<SpirvInstruction *> values, SpirvInstruction *buffer,
+    SpirvInstruction *&index, const QualType valueType, uint32_t &bitOffset,
+    SourceLocation loc) {
+  const uint32_t elemCount = static_cast<uint32_t>(values.size());
+  // Nothing to store; also protects the values[0] access below.
+  if (elemCount == 0)
+    return;
+
+  auto *constUint0 =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 0));
+  auto *constUint1 =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 1));
+  auto *constUint16 =
+      spvBuilder.getConstantInt(astContext.UnsignedIntTy, llvm::APInt(32, 16));
+  const auto storeWidth = getElementSpirvBitwidth(
+      astContext, valueType, theEmitter.getSpirvOptions().enable16BitTypes);
+
+  if (storeWidth == 16u) {
+    assert(bitOffset == 0 || bitOffset == 16);
+    uint32_t elemIndex = 0;
+    if (bitOffset == 16) {
+      // First store the first element at offset 16 of the last memory index.
+      // This also advances |index| to the next word.
+      store16BitsAtBitOffset16(values[0], buffer, index, valueType);
+      bitOffset = 0;
+      ++elemIndex;
+    }
+
+    // From here on every iteration starts at bit offset 0 of a fresh word.
+    for (; elemIndex < elemCount; elemIndex += 2) {
+      const bool hasUpperHalf = elemIndex + 1 < elemCount;
+
+      // The underlying element type of the ByteAddressBuffer is uint. So we
+      // need to store a 32-bit value by combining two 16-bit values.
+      SpirvInstruction *word = bitCastToNumericalOrBool(
+          values[elemIndex], valueType, astContext.UnsignedShortTy, loc);
+      // Zero-extend to 32 bits.
+      word = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
+                                      astContext.UnsignedIntTy, word, loc);
+      if (hasUpperHalf) {
+        SpirvInstruction *msb = bitCastToNumericalOrBool(
+            values[elemIndex + 1], valueType, astContext.UnsignedShortTy, loc);
+        msb = spvBuilder.createUnaryOp(spv::Op::OpUConvert,
+                                       astContext.UnsignedIntTy, msb, loc);
+        msb = spvBuilder.createBinaryOp(spv::Op::OpShiftLeftLogical,
+                                        astContext.UnsignedIntTy, msb,
+                                        constUint16, loc);
+        word = spvBuilder.createBinaryOp(
+            spv::Op::OpBitwiseOr, astContext.UnsignedIntTy, word, msb, loc);
+      }
+
+      auto *ptr = spvBuilder.createAccessChain(astContext.UnsignedIntTy, buffer,
+                                               {constUint0, index}, loc);
+      spvBuilder.createStore(ptr, word, loc);
+
+      if (hasUpperHalf) {
+        // Two 16-bit values were stored: the word is full, move on to the
+        // next one. The bit offset stays at 0.
+        index = spvBuilder.createBinaryOp(
+            spv::Op::OpIAdd, astContext.UnsignedIntTy, index, constUint1, loc);
+      } else {
+        // Only the lower half was stored. The next 16-bit value belongs in
+        // the upper half of this same word, so |index| must not advance.
+        bitOffset = 16;
+      }
+    }
+  } else if (storeWidth == 32u || storeWidth == 64u) {
+    assert(bitOffset == 0);
+    for (uint32_t i = 0; i < elemCount; ++i)
+      processTemplatedStoreToBuffer(values[i], buffer, index, valueType, bitOffset);
+  }
+}
+
+// Flattens the composites held in |values|, all of type |valueType|, in
+// place. Each vector, matrix, or constant-size array at the front of the
+// queue is replaced by its components, which are appended at the back in
+// order. This repeats until the component type is a scalar or a struct, and
+// that type is returned.
+QualType RawBufferHandler::serializeToScalarsOrStruct(
+    std::deque<SpirvInstruction *> *values, QualType valueType,
+    SourceLocation loc) {
+  // Only the entries present on entry are expanded in this pass; the
+  // components pushed to the back must not be revisited here.
+  const uint32_t pendingCount = values->size();
+
+  // Vector type: one extract per component.
+  {
+    QualType componentType = {};
+    uint32_t componentCount = 0;
+    if (isVectorType(valueType, &componentType, &componentCount)) {
+      for (uint32_t n = 0; n < pendingCount; ++n) {
+        SpirvInstruction *vec = values->front();
+        for (uint32_t c = 0; c < componentCount; ++c)
+          values->push_back(
+              spvBuilder.createCompositeExtract(componentType, vec, {c}, loc));
+        values->pop_front();
+      }
+      return componentType;
+    }
+  }
+
+  // Matrix type: one extract per (row, column) pair, rows outermost.
+  {
+    QualType componentType = {};
+    uint32_t rowCount = 0, colCount = 0;
+    if (isMxNMatrix(valueType, &componentType, &rowCount, &colCount)) {
+      for (uint32_t n = 0; n < pendingCount; ++n) {
+        SpirvInstruction *mat = values->front();
+        for (uint32_t row = 0; row < rowCount; ++row) {
+          for (uint32_t col = 0; col < colCount; ++col) {
+            // TODO: This is currently doing a row_major matrix store. We must
+            // investigate whether we also need to implement it for
+            // column_major.
+            values->push_back(spvBuilder.createCompositeExtract(
+                componentType, mat, {row, col}, loc));
+          }
+        }
+        values->pop_front();
+      }
+      return serializeToScalarsOrStruct(values, componentType, loc);
+    }
+  }
+
+  // Array type: one extract per element, then keep flattening the elements.
+  if (const auto *arrType = astContext.getAsConstantArrayType(valueType)) {
+    const QualType arrElemType = arrType->getElementType();
+    const uint32_t arrElemCount =
+        static_cast<uint32_t>(arrType->getSize().getZExtValue());
+    for (uint32_t n = 0; n < pendingCount; ++n) {
+      SpirvInstruction *arr = values->front();
+      for (uint32_t e = 0; e < arrElemCount; ++e)
+        values->push_back(
+            spvBuilder.createCompositeExtract(arrElemType, arr, {e}, loc));
+      values->pop_front();
+    }
+    return serializeToScalarsOrStruct(values, arrElemType, loc);
+  }
+
+  // Scalars and structs are not decomposed any further.
+  if (isScalarType(valueType) || valueType->getAs<RecordType>())
+    return valueType;
+
+  llvm_unreachable("unhandled type when serializing an array");
+}
+
+// Emits the stores needed to write |value| of type |valueType| into |buffer|
+// starting at word |index| and bit |bitOffset| (0 or 16).
+//
+// - Scalars are written with the width-specific store helpers. 32-bit and
+//   64-bit scalars may only be stored at bit offset 0; an error is emitted
+//   otherwise.
+// - Vectors, matrices, and constant-size arrays are flattened and stored
+//   element by element.
+// - Structs are stored field by field, with each field placed at its aligned
+//   offset relative to the start of the struct. Afterwards |index| and
+//   |bitOffset| are set to the position following the (aligned) struct.
+void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value,
+                                                     SpirvInstruction *buffer,
+                                                     SpirvInstruction *&index,
+                                                     const QualType valueType,
+                                                     uint32_t &bitOffset) {
+  assert(bitOffset == 0 || bitOffset == 16);
+  const auto loc = buffer->getSourceLocation();
+
+  // Scalar types
+  if (isScalarType(valueType)) {
+    const auto storeWidth = getElementSpirvBitwidth(
+        astContext, valueType, theEmitter.getSpirvOptions().enable16BitTypes);
+    switch (bitOffset) {
+    case 0: {
+      switch (storeWidth) {
+      case 16:
+        store16BitsAtBitOffset0(value, buffer, index, valueType);
+        return;
+      case 32:
+        store32BitsAtBitOffset0(value, buffer, index, valueType);
+        return;
+      case 64:
+        store64BitsAtBitOffset0(value, buffer, index, valueType);
+        return;
+      default:
+        theEmitter.emitError(
+            "templated store of ByteAddressBuffer is only implemented for "
+            "16, 32, and 64-bit types",
+            loc);
+        return;
+      }
+    }
+    case 16: {
+      // The only legal store at offset 16 is by a 16-bit value. Report
+      // anything else instead of relying on an assert, so that builds without
+      // assertions do not silently emit a truncated store.
+      if (storeWidth != 16) {
+        theEmitter.emitError(
+            "templated buffer store should not result in storing "
+            "32-bit or 64-bit values at bit offset 16",
+            loc);
+        return;
+      }
+      store16BitsAtBitOffset16(value, buffer, index, valueType);
+      return;
+    }
+    default:
+      theEmitter.emitError(
+          "templated store of ByteAddressBuffer is only implemented for "
+          "bit offsets 0 and 16",
+          loc);
+      return;
+    }
+  }
+
+  // Vectors, Matrices, and Arrays can all be serialized and stored.
+  if (isVectorType(valueType) || isMxNMatrix(valueType) ||
+      isConstantArrayType(astContext, valueType)) {
+    std::deque<SpirvInstruction *> elems;
+    elems.push_back(value);
+    auto serializedType = serializeToScalarsOrStruct(&elems, valueType, loc);
+    if (isScalarType(serializedType)) {
+      storeArrayOfScalars(elems, buffer, index, serializedType, bitOffset, loc);
+    } else if (serializedType->getAs<RecordType>()) {
+      // Each struct store leaves |index| and |bitOffset| at the start of the
+      // next struct.
+      for (auto *elem : elems)
+        processTemplatedStoreToBuffer(elem, buffer, index, serializedType,
+                                      bitOffset);
+    }
+    return;
+  }
+
+  // Struct types
+  // The "natural" layout for structure types dictates that structs are
+  // aligned like their field with the largest alignment.
+  // As a result, there might exist some padding after some struct members.
+  if (const auto *structType = valueType->getAs<RecordType>()) {
+    const auto *decl = structType->getDecl();
+    SpirvInstruction *originalIndex = index;
+    const auto originalBitOffset = bitOffset;
+    uint32_t fieldOffsetInBytes = 0;
+    uint32_t structAlignment = 0, structSize = 0, stride = 0;
+    std::tie(structAlignment, structSize) =
+        AlignmentSizeCalculator(astContext, theEmitter.getSpirvOptions())
+            .getAlignmentAndSize(valueType,
+                                 theEmitter.getSpirvOptions().sBufferLayoutRule,
+                                 llvm::None, &stride);
+    uint32_t fieldIndex = 0;
+    for (const auto *field : decl->fields()) {
+      AlignmentSizeCalculator alignmentCalc(astContext,
+                                            theEmitter.getSpirvOptions());
+      uint32_t fieldSize = 0, fieldAlignment = 0;
+      std::tie(fieldAlignment, fieldSize) = alignmentCalc.getAlignmentAndSize(
+          field->getType(), theEmitter.getSpirvOptions().sBufferLayoutRule,
+          /*isRowMajor*/ llvm::None, &stride);
+      fieldOffsetInBytes = roundToPow2(fieldOffsetInBytes, fieldAlignment);
+
+      // Byte position of the field relative to the word the struct starts in.
+      // Dividing by 4 gives the number of 32-bit words to add to the base
+      // index (the underlying type is an array of uints); the remainder is
+      // the byte offset inside that word (multiplied by 8 to get bits).
+      const uint32_t fieldBytePos =
+          (originalBitOffset / 8) + fieldOffsetInBytes;
+      const uint32_t wordOffset = fieldBytePos / 4;
+      bitOffset = (fieldBytePos % 4) * 8;
+
+      if (wordOffset != 0) {
+        index = spvBuilder.createBinaryOp(
+            spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
+            spvBuilder.getConstantInt(astContext.UnsignedIntTy,
+                                      llvm::APInt(32, wordOffset)),
+            loc);
+      } else {
+        // The field lives in the struct's first word. Rebase explicitly so
+        // that any advance made while storing a previous field is discarded.
+        index = originalIndex;
+      }
+
+      processTemplatedStoreToBuffer(
+          spvBuilder.createCompositeExtract(field->getType(), value,
+                                            {fieldIndex}, loc),
+          buffer, index, field->getType(), bitOffset);
+
+      fieldOffsetInBytes += fieldSize;
+      ++fieldIndex;
+    }
+
+    // After we're done with storing the entire struct, we need to update the
+    // index (in case we are storing an array of structs).
+    //
+    // Example: struct alignment = 8. struct size = 34 bytes
+    // (34 / 8) = 4 full words
+    // (34 % 8) = 2 > 0, therefore need to move to the next aligned address
+    // So the starting byte offset after storing the entire struct is:
+    // 8 * (4 + 1) = 40
+    assert(structAlignment != 0);
+    const uint32_t newByteOffset = roundToPow2(structSize, structAlignment);
+    const uint32_t nextBytePos = (originalBitOffset / 8) + newByteOffset;
+    const uint32_t newWordOffset = nextBytePos / 4;
+    bitOffset = 8 * (nextBytePos % 4);
+    index = spvBuilder.createBinaryOp(
+        spv::Op::OpIAdd, astContext.UnsignedIntTy, originalIndex,
+        spvBuilder.getConstantInt(astContext.UnsignedIntTy,
+                                  llvm::APInt(32, newWordOffset)),
+        loc);
+
+    return;
+  }
+
+  llvm_unreachable("templated buffer store unimplemented for type");
+}
+
 } // namespace spirv
 } // namespace clang

+ 52 - 10
tools/clang/lib/SPIRV/RawBufferMethods.h

@@ -28,8 +28,10 @@ public:
   /// which is a runtime array in SPIR-V. This method works by loading one or
   /// more uints, and performing necessary casts and composite constructions
   /// to build the 'targetType'. The 'offset' parameter can be used for finer
-  /// grained load of bitwidths smaller than 32-bits. Example: targetType =
-  /// uint16_t, address=0, offset=0
+  /// grained load of bitwidths smaller than 32-bits.
+  ///
+  /// Example:
+  /// targetType = uint16_t, address=0, offset=0
   ///                 --> Load the first 16-bit uint starting at address 0.
   /// targetType = uint16_t, address=0, offset=16
   ///                 --> Load the second 16-bit uint starting at address 0.
@@ -38,6 +40,25 @@ public:
                                                    const QualType targetType,
                                                    uint32_t &bitOffset);
 
+  /// \brief Performs RWByteAddressBuffer.Store<T>(address, value).
+  /// RWByteAddressBuffers are represented in SPIR-V as structs with only one
+  /// member which is a runtime array of uints. This method works by decomposing
+  /// the given |value| to reach numeric/bool types. Then performs necessary
+  /// casts to uints and stores them in the underlying runtime array.
+  /// The |bitOffset| parameter can be used for finer-grained bit-offset
+  /// control.
+  ///
+  /// Example:
+  /// valueType = uint16_t, address=0, offset=0
+  ///                 --> Store to the first 16-bit uint starting at address 0.
+  /// valueType = uint16_t, address=0, offset=16
+  ///                 --> Store to the second 16-bit uint starting at address 0.
+  void processTemplatedStoreToBuffer(SpirvInstruction *value,
+                                     SpirvInstruction *buffer,
+                                     SpirvInstruction *&index,
+                                     const QualType valueType,
+                                     uint32_t &bitOffset);
+
 private:
   SpirvInstruction *load16BitsAtBitOffset0(SpirvInstruction *buffer,
                                            SpirvInstruction *&index,
@@ -59,15 +80,36 @@ private:
                                             QualType target16BitType,
                                             uint32_t &bitOffset);
 
-  SpirvInstruction *load32BitsAtBitOffset16(SpirvInstruction *buffer,
-                                            SpirvInstruction *&index,
-                                            QualType target32BitType,
-                                            uint32_t &bitOffset);
+private:
+  void store16BitsAtBitOffset0(SpirvInstruction *value,
+                               SpirvInstruction *buffer,
+                               SpirvInstruction *&index,
+                               const QualType valueType);
 
-  SpirvInstruction *load64BitsAtBitOffset16(SpirvInstruction *buffer,
-                                            SpirvInstruction *&index,
-                                            QualType target64BitType,
-                                            uint32_t &bitOffset);
+  void store32BitsAtBitOffset0(SpirvInstruction *value,
+                               SpirvInstruction *buffer,
+                               SpirvInstruction *&index,
+                               const QualType valueType);
+
+  void store64BitsAtBitOffset0(SpirvInstruction *value,
+                               SpirvInstruction *buffer,
+                               SpirvInstruction *&index,
+                               const QualType valueType);
+
+  void store16BitsAtBitOffset16(SpirvInstruction *value,
+                                SpirvInstruction *buffer,
+                                SpirvInstruction *&index,
+                                const QualType valueType);
+
+  void storeArrayOfScalars(std::deque<SpirvInstruction *> values,
+                           SpirvInstruction *buffer, SpirvInstruction *&index,
+                           const QualType valueType, uint32_t &bitOffset,
+                           SourceLocation);
+
+  /// \brief Serializes the given values into their components until a scalar or
+  /// a struct has been reached. Returns the most basic type it reaches.
+  QualType serializeToScalarsOrStruct(std::deque<SpirvInstruction *> *values,
+                                      QualType valueType, SourceLocation);
 
 private:
   /// \brief Performs an OpBitCast from |fromType| to |toType| on the given

+ 14 - 6
tools/clang/lib/SPIRV/SpirvEmitter.cpp

@@ -3317,8 +3317,9 @@ SpirvInstruction *SpirvEmitter::processByteAddressBufferLoadStore(
   // Store3, Store4 intrinsic functions.
   const bool isTemplatedLoadOrStore =
       (numWords == 1) &&
-      (doStore ? expr->getArg(1)->getType() != astContext.UnsignedIntTy
-               : expr->getType() != astContext.UnsignedIntTy);
+      (doStore ? !expr->getArg(1)->getType()->isSpecificBuiltinType(
+                     BuiltinType::UInt)
+               : !expr->getType()->isSpecificBuiltinType(BuiltinType::UInt));
 
   // Do a OpShiftRightLogical by 2 (divide by 4 to get aligned memory
   // access). The AST always casts the address to unsigned integer, so shift
@@ -3329,13 +3330,20 @@ SpirvInstruction *SpirvEmitter::processByteAddressBufferLoadStore(
       spvBuilder.createBinaryOp(spv::Op::OpShiftRightLogical, addressType,
                                 byteAddress, constUint2, expr->getExprLoc());
 
-  if (isTemplatedLoadOrStore && !doStore) {
+  if (isTemplatedLoadOrStore) {
     // Templated load or store. Need to (potentially) perform more
     // loads/stores/casts/composite-constructs/composite-extracts.
     uint32_t bitOffset = 0;
-    RawBufferHandler rawBufferHandler(*this);
-    return rawBufferHandler.processTemplatedLoadFromBuffer(
-        objectInfo, address, expr->getType(), bitOffset);
+    if (doStore) {
+      auto *values = doExpr(expr->getArg(1));
+      RawBufferHandler(*this).processTemplatedStoreToBuffer(
+          values, objectInfo, address, expr->getArg(1)->getType(), bitOffset);
+      return nullptr;
+    } else {
+      RawBufferHandler rawBufferHandler(*this);
+      return rawBufferHandler.processTemplatedLoadFromBuffer(
+          objectInfo, address, expr->getType(), bitOffset);
+    }
   }
 
   // Perform access chain into the RWByteAddressBuffer.

+ 1 - 1
tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.scalar.hlsl

@@ -110,7 +110,7 @@ ByteAddressBuffer buf;
 
 // CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[addr:%\d+]]
 // CHECK: [[val0_uint:%\d+]] = OpLoad %uint [[ptr]]
-// CHECK:      [[val0:%\d+]] = OpBitcast %int %174
+// CHECK:      [[val0:%\d+]] = OpBitcast %int [[val0_uint]]
 // CHECK:   [[newAddr:%\d+]] = OpIAdd %uint [[addr]] %uint_1
 // CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[newAddr]]
 // CHECK: [[val1_uint:%\d+]] = OpLoad %uint [[ptr]]

+ 95 - 0
tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.struct3.hlsl

@@ -0,0 +1,95 @@
+// Run: %dxc -T cs_6_2 -E main -enable-16bit-types -fvk-use-dx-layout
+
+ByteAddressBuffer buf;
+RWByteAddressBuffer buf2;
+
+struct T {
+  float16_t x[2];
+};
+
+struct S {
+  float16_t a;
+  T e[2];
+};
+
+[numthreads(64, 1, 1)]
+void main(uint3 tid : SV_DispatchThreadId) {
+  S sArr[2] = buf.Load<S[2]>(tid.x);
+  buf2.Store<S[2]>(tid.x, sArr);
+}
+
+// Note: the DX layout tightly packs all members of S and its sub-structures.
+// It stores elements at the following byte offsets:
+// 0, 2, 4, 6, 8, 10, 12, 14, 16, 18
+//
+//                              |-----------------------|
+// address 0:                   |     a     | e[0].x[0] |
+//                              |-----------------------|
+// address 1 (byte offset 4):   | e[0].x[1] | e[1].x[0] |
+//                              |-----------------------|
+// address 2 (byte offset 8):   | e[1].x[1] |     a     |
+//                              |-----------------------|
+// address 3 (byte offset 12):  | e[0].x[0] | e[0].x[1] |
+//                              |-----------------------|
+// address 4 (byte offset 16):  | e[1].x[0] | e[1].x[1] |
+//                              |-----------------------|
+//
+
+// CHECK:      [[tidx_ptr:%\d+]] = OpAccessChain %_ptr_Function_uint %tid %int_0
+// CHECK:          [[tidx:%\d+]] = OpLoad %uint [[tidx_ptr]]
+// CHECK:      [[address0:%\d+]] = OpShiftRightLogical %uint [[tidx]] %uint_2
+// CHECK:          [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address0]]
+// CHECK:         [[word0:%\d+]] = OpLoad %uint [[ptr0]]
+// CHECK:      [[word0u16:%\d+]] = OpUConvert %ushort [[word0]]
+// CHECK:             [[a:%\d+]] = OpBitcast %half [[word0u16]]
+// CHECK:          [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address0]]
+// CHECK:         [[word0:%\d+]] = OpLoad %uint [[ptr0]]
+// CHECK:    [[word0upper:%\d+]] = OpShiftRightLogical %uint [[word0]] %uint_16
+// CHECK: [[word0upperu16:%\d+]] = OpUConvert %ushort [[word0upper]]
+// CHECK:           [[x_0:%\d+]] = OpBitcast %half [[word0upperu16]]
+// CHECK:      [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
+// CHECK:          [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address1]]
+// CHECK:         [[word1:%\d+]] = OpLoad %uint [[ptr1]]
+// CHECK:      [[word1u16:%\d+]] = OpUConvert %ushort [[word1]]
+// CHECK:           [[x_1:%\d+]] = OpBitcast %half [[word1u16]]
+// CHECK:             [[x:%\d+]] = OpCompositeConstruct %_arr_half_uint_2 [[x_0]] [[x_1]]
+// CHECK:      [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
+// CHECK:           [[e_0:%\d+]] = OpCompositeConstruct %T [[x]]
+// CHECK:          [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address1]]
+// CHECK:         [[word1:%\d+]] = OpLoad %uint [[ptr1]]
+// CHECK:    [[word1upper:%\d+]] = OpShiftRightLogical %uint [[word1]] %uint_16
+// CHECK: [[word1upperu16:%\d+]] = OpUConvert %ushort [[word1upper]]
+// CHECK:           [[x_0:%\d+]] = OpBitcast %half [[word1upperu16]]
+// CHECK:      [[address2:%\d+]] = OpIAdd %uint [[address1]] %uint_1
+// CHECK:          [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address2]]
+// CHECK:         [[word2:%\d+]] = OpLoad %uint [[ptr2]]
+// CHECK:      [[word2u16:%\d+]] = OpUConvert %ushort [[word2]]
+// CHECK:           [[x_1:%\d+]] = OpBitcast %half [[word2u16]]
+// CHECK:             [[x:%\d+]] = OpCompositeConstruct %_arr_half_uint_2 [[x_0]] [[x_1]]
+// CHECK:           [[e_1:%\d+]] = OpCompositeConstruct %T [[x]]
+// CHECK:             [[e:%\d+]] = OpCompositeConstruct %_arr_T_uint_2 [[e_0]] [[e_1]]
+// CHECK:      [[address2:%\d+]] = OpIAdd %uint [[address0]] %uint_2
+// CHECK:           [[s_0:%\d+]] = OpCompositeConstruct %S [[a]] [[e]]
+//
+// Now start with the second 'S' object
+//
+// CHECK:          [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address2]]
+// CHECK:         [[word2:%\d+]] = OpLoad %uint [[ptr2]]
+// CHECK:  [[word2upper16:%\d+]] = OpShiftRightLogical %uint [[word2]] %uint_16
+// CHECK: [[word2upperu16:%\d+]] = OpUConvert %ushort [[word2upper16]]
+// CHECK:             [[a:%\d+]] = OpBitcast %half [[word2upperu16]]
+// CHECK:      [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
+// CHECK:      [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
+// CHECK:               {{%\d+}} = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address3]]
+// CHECK:               {{%\d+}} = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address3]]
+// CHECK:                          OpCompositeConstruct %_arr_half_uint_2
+// CHECK:      [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
+// CHECK:                          OpCompositeConstruct %T
+// CHECK:                          OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address4]]
+// CHECK:                          OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[address4]]
+// CHECK:                          OpCompositeConstruct %_arr_half_uint_2
+// CHECK:                          OpCompositeConstruct %T
+// CHECK:                          OpCompositeConstruct %_arr_T_uint_2
+// CHECK:                          OpCompositeConstruct %S
+// CHECK:                          OpCompositeConstruct %_arr_S_uint_2
+// CHECK:                          OpStore %sArr {{%\d+}}

+ 717 - 0
tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct.hlsl

@@ -0,0 +1,717 @@
+// Run: %dxc -T cs_6_2 -E main -enable-16bit-types -fvk-use-dx-layout
+
+ByteAddressBuffer buf;
+RWByteAddressBuffer buf2;
+
+struct T {
+  float16_t x[5];
+};
+
+struct U {
+  float16_t v[3];
+  uint w;
+};
+
+struct S {
+  float16_t3 a[3];
+  double c;
+  T t;
+  double b;
+  float16_t d;
+  T e[2];
+  U f[2];
+  float16_t z;
+};
+
+[numthreads(64, 1, 1)]
+void main(uint3 tid : SV_DispatchThreadId) {
+  S sArr[2] = buf.Load<S[2]>(tid.x);
+  buf2.Store<S[2]>(tid.x, sArr);
+}
+
+// Note: the following indices are taken from the DXIL compilation:
+//
+//
+//                           // sArr[0] starts
+//
+//  %3 = 0                    // a[0] starts at byte offset 0
+//  %8 = add i32 %3, 6        // a[1] starts at byte offset 6
+// %13 = add i32 %3, 12       // a[2] starts at byte offset 12
+//                            // since the next member is a 'double' it does not
+//                            // start at offset 18 or 20. It starts at offset 24.
+//                            // byte [18-23] inclusive are PADDING.
+// %18 = add i32 %3, 24       // c starts at offset 24 (6 words)
+// %23 = add i32 %3, 32       // t.x[0] starts at byte offset 32 (8 words)
+// %26 = add i32 %3, 34       // t.x[1] starts at byte offset 34
+// %29 = add i32 %3, 36       // t.x[2] starts at byte offset 36
+// %32 = add i32 %3, 38       // t.x[3] starts at byte offset 38
+// %35 = add i32 %3, 40       // t.x[4] starts at byte offset 40
+//                            // byte [42-47] inclusive are PADDING.
+// %38 = add i32 %3, 48       // b starts at byte offset 48 (12 words)
+// %43 = add i32 %3, 56       // d starts at byte offset 56 (14 words)
+//                            // even though 'e' is the next struct member,
+//                            // it does NOT start at an aligned address (does not start at 64 byte offset).
+// %46 = add i32 %3, 58       // e[0].x[0] starts at byte offset 58
+// %49 = add i32 %3, 60       // e[0].x[1] starts at byte offset 60
+// %52 = add i32 %3, 62       // e[0].x[2] starts at byte offset 62
+// %55 = add i32 %3, 64       // e[0].x[3] starts at byte offset 64
+// %58 = add i32 %3, 66       // e[0].x[4] starts at byte offset 66
+// %61 = add i32 %3, 68       // e[1].x[0] starts at byte offset 68
+// %64 = add i32 %3, 70       // e[1].x[1] starts at byte offset 70
+// %67 = add i32 %3, 72       // e[1].x[2] starts at byte offset 72
+// %70 = add i32 %3, 74       // e[1].x[3] starts at byte offset 74
+// %73 = add i32 %3, 76       // e[1].x[4] starts at byte offset 76
+//                            // 'f' starts at the next aligned address
+//                            // byte [78-79] inclusive are PADDING
+// %76 = add i32 %3, 80       // f[0].v[0] starts at byte offset 80 (20 words)
+// %79 = add i32 %3, 82       // f[0].v[1] starts at byte offset 82
+// %82 = add i32 %3, 84       // f[0].v[2] starts at byte offset 84
+//                            // byte [86-87] inclusive are PADDING
+// %85 = add i32 %3, 88       // f[0].w starts at byte offset 88 (22 words)
+// %88 = add i32 %3, 92       // f[1].v[0] starts at byte offset 92
+// %91 = add i32 %3, 94       // f[1].v[1] starts at byte offset 94
+// %94 = add i32 %3, 96       // f[1].v[2] starts at byte offset 96
+//                            // byte [98-99] inclusive are PADDING
+// %97 = add i32 %3, 100      // f[1].w starts at byte offset 100 (25 words)
+// %100 = add i32 %3, 104     // z starts at byte offset 104 (26 words)
+//
+//                           // sArr[1] starts
+//
+//                           // byte [106-111] inclusive are PADDING
+//
+//                           // ALL the following offsets are similar to offsets
+//                           // of sArr[0], shifted by 112 bytes.
+//
+// %103 = add i32 %3, 112
+// %108 = add i32 %3, 118
+// %113 = add i32 %3, 124
+// %118 = add i32 %3, 136
+// %123 = add i32 %3, 144
+// %126 = add i32 %3, 146
+// %129 = add i32 %3, 148
+// %132 = add i32 %3, 150
+// %135 = add i32 %3, 152
+// %138 = add i32 %3, 160
+// %143 = add i32 %3, 168
+// %146 = add i32 %3, 170
+// %149 = add i32 %3, 172
+// %152 = add i32 %3, 174
+// %155 = add i32 %3, 176
+// %158 = add i32 %3, 178
+// %161 = add i32 %3, 180
+// %164 = add i32 %3, 182
+// %167 = add i32 %3, 184
+// %170 = add i32 %3, 186
+// %173 = add i32 %3, 188
+// %176 = add i32 %3, 192
+// %179 = add i32 %3, 194
+// %182 = add i32 %3, 196
+// %185 = add i32 %3, 200
+// %188 = add i32 %3, 204
+// %191 = add i32 %3, 206
+// %194 = add i32 %3, 208
+// %197 = add i32 %3, 212
+// %200 = add i32 %3, 216
+
+// Initialization of sArr array.
+// CHECK: OpStore %sArr {{%\d+}}
+//
+// Check for templated 'Store' method.
+//
+// CHECK:          [[tidx_ptr:%\d+]] = OpAccessChain %_ptr_Function_uint %tid %int_0
+// CHECK:              [[tidx:%\d+]] = OpLoad %uint [[tidx_ptr]]
+// CHECK:          [[address0:%\d+]] = OpShiftRightLogical %uint [[tidx]] %uint_2
+// CHECK:              [[sArr:%\d+]] = OpLoad %_arr_S_uint_2 %sArr
+// CHECK:                [[s0:%\d+]] = OpCompositeExtract %S [[sArr]] 0
+// CHECK:                [[s1:%\d+]] = OpCompositeExtract %S [[sArr]] 1
+// CHECK:                 [[a:%\d+]] = OpCompositeExtract %_arr_v3half_uint_3 [[s0]] 0
+// CHECK:                [[a0:%\d+]] = OpCompositeExtract %v3half [[a]] 0
+// CHECK:                [[a1:%\d+]] = OpCompositeExtract %v3half [[a]] 1
+// CHECK:                [[a2:%\d+]] = OpCompositeExtract %v3half [[a]] 2
+// CHECK:               [[a00:%\d+]] = OpCompositeExtract %half [[a0]] 0
+// CHECK:               [[a01:%\d+]] = OpCompositeExtract %half [[a0]] 1
+// CHECK:               [[a02:%\d+]] = OpCompositeExtract %half [[a0]] 2
+// CHECK:               [[a10:%\d+]] = OpCompositeExtract %half [[a1]] 0
+// CHECK:               [[a11:%\d+]] = OpCompositeExtract %half [[a1]] 1
+// CHECK:               [[a12:%\d+]] = OpCompositeExtract %half [[a1]] 2
+// CHECK:               [[a20:%\d+]] = OpCompositeExtract %half [[a2]] 0
+// CHECK:               [[a21:%\d+]] = OpCompositeExtract %half [[a2]] 1
+// CHECK:               [[a22:%\d+]] = OpCompositeExtract %half [[a2]] 2
+// CHECK:         [[a00_16bit:%\d+]] = OpBitcast %ushort [[a00]]
+// CHECK:         [[a00_32bit:%\d+]] = OpUConvert %uint [[a00_16bit]]
+// CHECK:         [[a01_16bit:%\d+]] = OpBitcast %ushort [[a01]]
+// CHECK:         [[a01_32bit:%\d+]] = OpUConvert %uint [[a01_16bit]]
+// CHECK: [[a01_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a01_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a00_32bit]] [[a01_32bit_shifted]]
+// CHECK:              [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address0]]
+// CHECK:                             OpStore [[ptr]] [[word]]
+
+// CHECK:          [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
+// CHECK:         [[a02_16bit:%\d+]] = OpBitcast %ushort [[a02]]
+// CHECK:         [[a02_32bit:%\d+]] = OpUConvert %uint [[a02_16bit]]
+// CHECK:         [[a10_16bit:%\d+]] = OpBitcast %ushort [[a10]]
+// CHECK:         [[a10_32bit:%\d+]] = OpUConvert %uint [[a10_16bit]]
+// CHECK: [[a10_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a10_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a02_32bit]] [[a10_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address1]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+
+// CHECK:          [[address2:%\d+]] = OpIAdd %uint [[address1]] %uint_1
+// CHECK:         [[a11_16bit:%\d+]] = OpBitcast %ushort [[a11]]
+// CHECK:         [[a11_32bit:%\d+]] = OpUConvert %uint [[a11_16bit]]
+// CHECK:         [[a12_16bit:%\d+]] = OpBitcast %ushort [[a12]]
+// CHECK:         [[a12_32bit:%\d+]] = OpUConvert %uint [[a12_16bit]]
+// CHECK: [[a12_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a12_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a11_32bit]] [[a12_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address2]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+
+// CHECK:          [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
+// CHECK:         [[a20_16bit:%\d+]] = OpBitcast %ushort [[a20]]
+// CHECK:         [[a20_32bit:%\d+]] = OpUConvert %uint [[a20_16bit]]
+// CHECK:         [[a21_16bit:%\d+]] = OpBitcast %ushort [[a21]]
+// CHECK:         [[a21_32bit:%\d+]] = OpUConvert %uint [[a21_16bit]]
+// CHECK: [[a21_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a21_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a20_32bit]] [[a21_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address3]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+
+// CHECK:          [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
+// CHECK:         [[a22_16bit:%\d+]] = OpBitcast %ushort [[a22]]
+// CHECK:         [[a22_32bit:%\d+]] = OpUConvert %uint [[a22_16bit]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address4]]
+// CHECK:                              OpStore [[ptr]] [[a22_32bit]]
+
+//
+// The second member of S starts at byte offset 24 (6 words)
+//
+// CHECK: [[address6:%\d+]] = OpIAdd %uint [[address0]] %uint_6
+//
+// CHECK:             [[c:%\d+]] = OpCompositeExtract %double [[s0]] 1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address6]]
+// CHECK:         [[c_u64:%\d+]] = OpBitcast %ulong [[c]]
+// CHECK:       [[c_word0:%\d+]] = OpUConvert %uint [[c_u64]]
+// CHECK: [[c_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[c_u64]] %uint_32
+// CHECK:       [[c_word1:%\d+]] = OpUConvert %uint [[c_u64_shifted]]
+// CHECK:                          OpStore [[ptr]] [[c_word0]]
+// CHECK:      [[address7:%\d+]] = OpIAdd %uint [[address6]] %uint_1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address7]]
+// CHECK:                          OpStore [[ptr]] [[c_word1]]
+
+//
+// The third member of S starts at byte offset 32 (8 words)
+//
+// CHECK: [[address8:%\d+]] = OpIAdd %uint [[address0]] %uint_8
+//
+// CHECK:              [[t:%\d+]] = OpCompositeExtract %T [[s0]] 2
+// CHECK:              [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[t]] 0
+// CHECK:             [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
+// CHECK:             [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
+// CHECK:             [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
+// CHECK:             [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
+// CHECK:             [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
+// CHECK:         [[x0_u16:%\d+]] = OpBitcast %ushort [[x0]]
+// CHECK:         [[x0_u32:%\d+]] = OpUConvert %uint [[x0_u16]]
+// CHECK:         [[x1_u16:%\d+]] = OpBitcast %ushort [[x1]]
+// CHECK:         [[x1_u32:%\d+]] = OpUConvert %uint [[x1_u16]]
+// CHECK: [[x1_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1_u32]] %uint_16
+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x0_u32]] [[x1_u32_shifted]]
+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address8]]
+// CHECK:                           OpStore [[ptr]] [[word]]
+// CHECK:       [[address9:%\d+]] = OpIAdd %uint [[address8]] %uint_1
+// CHECK:         [[x2_u16:%\d+]] = OpBitcast %ushort [[x2]]
+// CHECK:         [[x2_u32:%\d+]] = OpUConvert %uint [[x2_u16]]
+// CHECK:         [[x3_u16:%\d+]] = OpBitcast %ushort [[x3]]
+// CHECK:         [[x3_u32:%\d+]] = OpUConvert %uint [[x3_u16]]
+// CHECK: [[x3_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3_u32]] %uint_16
+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x2_u32]] [[x3_u32_shifted]]
+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address9]]
+// CHECK:                           OpStore [[ptr]] [[word]]
+// CHECK:      [[address10:%\d+]] = OpIAdd %uint [[address9]] %uint_1
+// CHECK:         [[x4_u16:%\d+]] = OpBitcast %ushort [[x4]]
+// CHECK:         [[x4_u32:%\d+]] = OpUConvert %uint [[x4_u16]]
+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address10]]
+// CHECK:                           OpStore [[ptr]] [[x4_u32]]
+
+//
+// The fourth member of S starts at byte offset 48 (12 words)
+//
+// CHECK: [[address12:%\d+]] = OpIAdd %uint [[address0]] %uint_12
+//
+// CHECK:             [[b:%\d+]] = OpCompositeExtract %double [[s0]] 3
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address12]]
+// CHECK:         [[b_u64:%\d+]] = OpBitcast %ulong [[b]]
+// CHECK:       [[b_word0:%\d+]] = OpUConvert %uint [[b_u64]]
+// CHECK: [[b_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[b_u64]] %uint_32
+// CHECK:       [[b_word1:%\d+]] = OpUConvert %uint [[b_u64_shifted]]
+// CHECK:                          OpStore [[ptr]] [[b_word0]]
+// CHECK:     [[address13:%\d+]] = OpIAdd %uint [[address12]] %uint_1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address13]]
+// CHECK:                          OpStore [[ptr]] [[b_word1]]
+
+//
+// The fifth member of S starts at byte offset 56 (14 words)
+//
+// CHECK: [[address14:%\d+]] = OpIAdd %uint [[address0]] %uint_14
+//
+// CHECK:     [[d:%\d+]] = OpCompositeExtract %half [[s0]] 4
+// CHECK:   [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address14]]
+// CHECK: [[d_u16:%\d+]] = OpBitcast %ushort [[d]]
+// CHECK: [[d_u32:%\d+]] = OpUConvert %uint [[d_u16]]
+// CHECK:                  OpStore [[ptr]] [[d_u32]]
+
+//
+// The sixth member of S starts at byte offset 58 (14 words + 16bit offset)
+// This is an extraordinary case of alignment. Since the sixth member only
+// contains fp16, and the fifth member was also fp16, DX packs them tightly.
+// As a result, store must occur at non-aligned offset.
+// e[0] takes the following byte offsets: 58, 60, 62, 64, 66.
+// e[1] takes the following byte offsets: 68, 70, 72, 74, 76.
+// (60-64 = index 15. 64-68 = index 16)
+// (68-72 = index 17. 72-76 = index 18)
+// (76-78 = first half of index 19)
+//
+// CHECK:     [[address14:%\d+]] = OpIAdd %uint [[address0]] %uint_14
+// CHECK:             [[e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[s0]] 5
+// CHECK:            [[e0:%\d+]] = OpCompositeExtract %T [[e]] 0
+// CHECK:            [[e1:%\d+]] = OpCompositeExtract %T [[e]] 1
+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e0]] 0
+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address14]]
+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
+// CHECK: [[x0u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x0u32]] %uint_16
+// CHECK:  [[existingWord:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:       [[newWord:%\d+]] = OpBitwiseOr %uint [[existingWord]] [[x0u32_shifted]]
+// CHECK:                          OpStore [[ptr]] [[newWord]]
+
+// CHECK:     [[address15:%\d+]] = OpIAdd %uint [[address14]] %uint_1
+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
+// CHECK: [[x2u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x2u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x1u32]] [[x2u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address15]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address16:%\d+]] = OpIAdd %uint [[address15]] %uint_1
+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
+// CHECK: [[x4u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x4u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x3u32]] [[x4u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address16]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address17:%\d+]] = OpIAdd %uint [[address14]] %uint_3
+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e1]] 0
+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
+// CHECK: [[x1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x0u32]] [[x1u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address17]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address18:%\d+]] = OpIAdd %uint [[address17]] %uint_1
+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
+// CHECK: [[x3u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x2u32]] [[x3u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address18]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address19:%\d+]] = OpIAdd %uint [[address18]] %uint_1
+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address19]]
+// CHECK:                          OpStore [[ptr]] [[x4u32]]
+
+//
+// The seventh member of S starts at byte offset 80 (20 words), so:
+// for f[0]:
+// v should start at byte offset 80 (20 words)
+// w should start at byte offset 88 (22 words)
+// for f[1]:
+// v should start at byte offset 92 (23 words)
+// w should start at byte offset 100 (25 words)
+//
+// CHECK:     [[address20:%\d+]] = OpIAdd %uint [[address0]] %uint_20
+// CHECK:             [[f:%\d+]] = OpCompositeExtract %_arr_U_uint_2 [[s0]] 6
+// CHECK:            [[u0:%\d+]] = OpCompositeExtract %U [[f]] 0
+// CHECK:            [[u1:%\d+]] = OpCompositeExtract %U [[f]] 1
+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u0]] 0
+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address20]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK: [[address21:%\d+]] = OpIAdd %uint [[address20]] %uint_1
+// CHECK:     [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
+// CHECK:     [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address21]]
+// CHECK:                      OpStore [[ptr]] [[v2u32]]
+
+// CHECK: [[address22:%\d+]] = OpIAdd %uint [[address20]] %uint_2
+// CHECK:         [[w:%\d+]] = OpCompositeExtract %uint [[u0]] 1
+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address22]]
+// CHECK:                      OpStore [[ptr]] [[w]]
+
+// CHECK:     [[address23:%\d+]] = OpIAdd %uint [[address20]] %uint_3
+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u1]] 0
+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address23]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address24:%\d+]] = OpIAdd %uint [[address23]] %uint_1
+// CHECK:         [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
+// CHECK:         [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address24]]
+// CHECK:                          OpStore [[ptr]] [[v2u32]]
+
+// CHECK:     [[address25:%\d+]] = OpIAdd %uint [[address23]] %uint_2
+// CHECK:             [[w:%\d+]] = OpCompositeExtract %uint [[u1]] 1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address25]]
+// CHECK:                          OpStore [[ptr]] [[w]]
+
+//
+// The eighth member of S starts at byte offset 104 (26 words)
+//
+// CHECK: [[address26:%\d+]] = OpIAdd %uint [[address0]] %uint_26
+// CHECK:         [[z:%\d+]] = OpCompositeExtract %half [[s0]] 7
+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address26]]
+// CHECK:      [[zu16:%\d+]] = OpBitcast %ushort [[z]]
+// CHECK:      [[zu32:%\d+]] = OpUConvert %uint [[zu16]]
+// CHECK:                      OpStore [[ptr]] [[zu32]]
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+// We have an array of S structures (sArr). The second element (sArr[1]) should
+// start at an aligned address. A structure's alignment is the maximum alignment
+// of its members.
+// In this example, sArr[1] should start at byte offset 112 (28 words)
+// It should *NOT* start at byte offset 108 (27 words).
+//
+//
+// CHECK: [[address28:%\d+]] = OpIAdd %uint [[address0]] %uint_28
+//
+// CHECK:                 [[a:%\d+]] = OpCompositeExtract %_arr_v3half_uint_3 [[s1]] 0
+// CHECK:                [[a0:%\d+]] = OpCompositeExtract %v3half [[a]] 0
+// CHECK:                [[a1:%\d+]] = OpCompositeExtract %v3half [[a]] 1
+// CHECK:                [[a2:%\d+]] = OpCompositeExtract %v3half [[a]] 2
+// CHECK:               [[a00:%\d+]] = OpCompositeExtract %half [[a0]] 0
+// CHECK:               [[a01:%\d+]] = OpCompositeExtract %half [[a0]] 1
+// CHECK:               [[a02:%\d+]] = OpCompositeExtract %half [[a0]] 2
+// CHECK:               [[a10:%\d+]] = OpCompositeExtract %half [[a1]] 0
+// CHECK:               [[a11:%\d+]] = OpCompositeExtract %half [[a1]] 1
+// CHECK:               [[a12:%\d+]] = OpCompositeExtract %half [[a1]] 2
+// CHECK:               [[a20:%\d+]] = OpCompositeExtract %half [[a2]] 0
+// CHECK:               [[a21:%\d+]] = OpCompositeExtract %half [[a2]] 1
+// CHECK:               [[a22:%\d+]] = OpCompositeExtract %half [[a2]] 2
+// CHECK:         [[a00_16bit:%\d+]] = OpBitcast %ushort [[a00]]
+// CHECK:         [[a00_32bit:%\d+]] = OpUConvert %uint [[a00_16bit]]
+// CHECK:         [[a01_16bit:%\d+]] = OpBitcast %ushort [[a01]]
+// CHECK:         [[a01_32bit:%\d+]] = OpUConvert %uint [[a01_16bit]]
+// CHECK: [[a01_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a01_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a00_32bit]] [[a01_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address28]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+
+// CHECK:         [[address29:%\d+]] = OpIAdd %uint [[address28]] %uint_1
+// CHECK:         [[a02_16bit:%\d+]] = OpBitcast %ushort [[a02]]
+// CHECK:         [[a02_32bit:%\d+]] = OpUConvert %uint [[a02_16bit]]
+// CHECK:         [[a10_16bit:%\d+]] = OpBitcast %ushort [[a10]]
+// CHECK:         [[a10_32bit:%\d+]] = OpUConvert %uint [[a10_16bit]]
+// CHECK: [[a10_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a10_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a02_32bit]] [[a10_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address29]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+
+// CHECK:         [[address30:%\d+]] = OpIAdd %uint [[address29]] %uint_1
+// CHECK:         [[a11_16bit:%\d+]] = OpBitcast %ushort [[a11]]
+// CHECK:         [[a11_32bit:%\d+]] = OpUConvert %uint [[a11_16bit]]
+// CHECK:         [[a12_16bit:%\d+]] = OpBitcast %ushort [[a12]]
+// CHECK:         [[a12_32bit:%\d+]] = OpUConvert %uint [[a12_16bit]]
+// CHECK: [[a12_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a12_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a11_32bit]] [[a12_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address30]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+
+// CHECK:         [[address31:%\d+]] = OpIAdd %uint [[address30]] %uint_1
+// CHECK:         [[a20_16bit:%\d+]] = OpBitcast %ushort [[a20]]
+// CHECK:         [[a20_32bit:%\d+]] = OpUConvert %uint [[a20_16bit]]
+// CHECK:         [[a21_16bit:%\d+]] = OpBitcast %ushort [[a21]]
+// CHECK:         [[a21_32bit:%\d+]] = OpUConvert %uint [[a21_16bit]]
+// CHECK: [[a21_32bit_shifted:%\d+]] = OpShiftLeftLogical %uint [[a21_32bit]] %uint_16
+// CHECK:              [[word:%\d+]] = OpBitwiseOr %uint [[a20_32bit]] [[a21_32bit_shifted]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address31]]
+// CHECK:                              OpStore [[ptr]] [[word]]
+// CHECK:         [[address32:%\d+]] = OpIAdd %uint [[address31]] %uint_1
+// CHECK:         [[a22_16bit:%\d+]] = OpBitcast %ushort [[a22]]
+// CHECK:         [[a22_32bit:%\d+]] = OpUConvert %uint [[a22_16bit]]
+// CHECK:               [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address32]]
+// CHECK:                              OpStore [[ptr]] [[a22_32bit]]
+
+//
+// The second member of S starts at byte offset 24 (6 words)
+//
+// CHECK: [[address34:%\d+]] = OpIAdd %uint [[address28]] %uint_6
+//
+// CHECK:             [[c:%\d+]] = OpCompositeExtract %double [[s1]] 1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address34]]
+// CHECK:         [[c_u64:%\d+]] = OpBitcast %ulong [[c]]
+// CHECK:       [[c_word0:%\d+]] = OpUConvert %uint [[c_u64]]
+// CHECK: [[c_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[c_u64]] %uint_32
+// CHECK:       [[c_word1:%\d+]] = OpUConvert %uint [[c_u64_shifted]]
+// CHECK:                          OpStore [[ptr]] [[c_word0]]
+// CHECK:     [[address35:%\d+]] = OpIAdd %uint [[address34]] %uint_1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address35]]
+// CHECK:                          OpStore [[ptr]] [[c_word1]]
+
+//
+// The third member of S starts at byte offset 32 (8 words)
+//
+// CHECK: [[address36:%\d+]] = OpIAdd %uint [[address28]] %uint_8
+//
+// CHECK:              [[t:%\d+]] = OpCompositeExtract %T [[s1]] 2
+// CHECK:              [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[t]] 0
+// CHECK:             [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
+// CHECK:             [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
+// CHECK:             [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
+// CHECK:             [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
+// CHECK:             [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
+// CHECK:         [[x0_u16:%\d+]] = OpBitcast %ushort [[x0]]
+// CHECK:         [[x0_u32:%\d+]] = OpUConvert %uint [[x0_u16]]
+// CHECK:         [[x1_u16:%\d+]] = OpBitcast %ushort [[x1]]
+// CHECK:         [[x1_u32:%\d+]] = OpUConvert %uint [[x1_u16]]
+// CHECK: [[x1_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1_u32]] %uint_16
+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x0_u32]] [[x1_u32_shifted]]
+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address36]]
+// CHECK:                           OpStore [[ptr]] [[word]]
+
+// CHECK:      [[address37:%\d+]] = OpIAdd %uint [[address36]] %uint_1
+// CHECK:         [[x2_u16:%\d+]] = OpBitcast %ushort [[x2]]
+// CHECK:         [[x2_u32:%\d+]] = OpUConvert %uint [[x2_u16]]
+// CHECK:         [[x3_u16:%\d+]] = OpBitcast %ushort [[x3]]
+// CHECK:         [[x3_u32:%\d+]] = OpUConvert %uint [[x3_u16]]
+// CHECK: [[x3_u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3_u32]] %uint_16
+// CHECK:           [[word:%\d+]] = OpBitwiseOr %uint [[x2_u32]] [[x3_u32_shifted]]
+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address37]]
+// CHECK:                           OpStore [[ptr]] [[word]]
+
+// CHECK:      [[address38:%\d+]] = OpIAdd %uint [[address37]] %uint_1
+// CHECK:         [[x4_u16:%\d+]] = OpBitcast %ushort [[x4]]
+// CHECK:         [[x4_u32:%\d+]] = OpUConvert %uint [[x4_u16]]
+// CHECK:            [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address38]]
+// CHECK:                           OpStore [[ptr]] [[x4_u32]]
+
+//
+// The fourth member of S starts at byte offset 48 (12 words)
+//
+// CHECK: [[address40:%\d+]] = OpIAdd %uint [[address28]] %uint_12
+//
+// CHECK:             [[b:%\d+]] = OpCompositeExtract %double [[s1]] 3
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address40]]
+// CHECK:         [[b_u64:%\d+]] = OpBitcast %ulong [[b]]
+// CHECK:       [[b_word0:%\d+]] = OpUConvert %uint [[b_u64]]
+// CHECK: [[b_u64_shifted:%\d+]] = OpShiftRightLogical %ulong [[b_u64]] %uint_32
+// CHECK:       [[b_word1:%\d+]] = OpUConvert %uint [[b_u64_shifted]]
+// CHECK:                          OpStore [[ptr]] [[b_word0]]
+// CHECK:     [[address41:%\d+]] = OpIAdd %uint [[address40]] %uint_1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address41]]
+// CHECK:                          OpStore [[ptr]] [[b_word1]]
+
+//
+// The fifth member of S starts at byte offset 56 (14 words)
+//
+// CHECK: [[address42:%\d+]] = OpIAdd %uint [[address28]] %uint_14
+//
+// CHECK:     [[d:%\d+]] = OpCompositeExtract %half [[s1]] 4
+// CHECK:   [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address42]]
+// CHECK: [[d_u16:%\d+]] = OpBitcast %ushort [[d]]
+// CHECK: [[d_u32:%\d+]] = OpUConvert %uint [[d_u16]]
+// CHECK:                  OpStore [[ptr]] [[d_u32]]
+
+//
+// The sixth member of S starts at byte offset 58 (14 words + 16bit offset)
+// This is an extraordinary case of alignment. Since the sixth member only
+// contains fp16, and the fifth member was also fp16, DX packs them tightly.
+// As a result, store must occur at non-aligned offset.
+// e[0] takes the following byte offsets: 58, 60, 62, 64, 66.
+// e[1] takes the following byte offsets: 68, 70, 72, 74, 76.
+// (60-64 = index 15. 64-68 = index 16)
+// (68-72 = index 17. 72-76 = index 18)
+// (76-78 = first half of index 19)
+//
+// CHECK:     [[address42:%\d+]] = OpIAdd %uint [[address28]] %uint_14
+// CHECK:             [[e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[s1]] 5
+// CHECK:            [[e0:%\d+]] = OpCompositeExtract %T [[e]] 0
+// CHECK:            [[e1:%\d+]] = OpCompositeExtract %T [[e]] 1
+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e0]] 0
+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address42]]
+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
+// CHECK: [[x0u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x0u32]] %uint_16
+// CHECK:  [[existingWord:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:       [[newWord:%\d+]] = OpBitwiseOr %uint [[existingWord]] [[x0u32_shifted]]
+// CHECK:                          OpStore [[ptr]] [[newWord]]
+
+// CHECK:     [[address43:%\d+]] = OpIAdd %uint [[address42]] %uint_1
+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
+// CHECK: [[x2u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x2u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x1u32]] [[x2u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address43]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address44:%\d+]] = OpIAdd %uint [[address43]] %uint_1
+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
+// CHECK: [[x4u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x4u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x3u32]] [[x4u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address44]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address45:%\d+]] = OpIAdd %uint [[address42]] %uint_3
+// CHECK:             [[x:%\d+]] = OpCompositeExtract %_arr_half_uint_5 [[e1]] 0
+// CHECK:            [[x0:%\d+]] = OpCompositeExtract %half [[x]] 0
+// CHECK:            [[x1:%\d+]] = OpCompositeExtract %half [[x]] 1
+// CHECK:            [[x2:%\d+]] = OpCompositeExtract %half [[x]] 2
+// CHECK:            [[x3:%\d+]] = OpCompositeExtract %half [[x]] 3
+// CHECK:            [[x4:%\d+]] = OpCompositeExtract %half [[x]] 4
+// CHECK:         [[x0u16:%\d+]] = OpBitcast %ushort [[x0]]
+// CHECK:         [[x0u32:%\d+]] = OpUConvert %uint [[x0u16]]
+// CHECK:         [[x1u16:%\d+]] = OpBitcast %ushort [[x1]]
+// CHECK:         [[x1u32:%\d+]] = OpUConvert %uint [[x1u16]]
+// CHECK: [[x1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x1u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x0u32]] [[x1u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address45]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address46:%\d+]] = OpIAdd %uint [[address45]] %uint_1
+// CHECK:         [[x2u16:%\d+]] = OpBitcast %ushort [[x2]]
+// CHECK:         [[x2u32:%\d+]] = OpUConvert %uint [[x2u16]]
+// CHECK:         [[x3u16:%\d+]] = OpBitcast %ushort [[x3]]
+// CHECK:         [[x3u32:%\d+]] = OpUConvert %uint [[x3u16]]
+// CHECK: [[x3u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[x3u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[x2u32]] [[x3u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address46]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address47:%\d+]] = OpIAdd %uint [[address46]] %uint_1
+// CHECK:         [[x4u16:%\d+]] = OpBitcast %ushort [[x4]]
+// CHECK:         [[x4u32:%\d+]] = OpUConvert %uint [[x4u16]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address47]]
+// CHECK:                          OpStore [[ptr]] [[x4u32]]
+
+//
+// The seventh member of S starts at byte offset 80 (20 words), so:
+// for f[0]:
+// v should start at byte offset 80 (20 words)
+// w should start at byte offset 88 (22 words)
+// for f[1]:
+// v should start at byte offset 92 (23 words)
+// w should start at byte offset 100 (25 words)
+//
+// CHECK:     [[address48:%\d+]] = OpIAdd %uint [[address28]] %uint_20
+// CHECK:             [[f:%\d+]] = OpCompositeExtract %_arr_U_uint_2 [[s1]] 6
+// CHECK:            [[u0:%\d+]] = OpCompositeExtract %U [[f]] 0
+// CHECK:            [[u1:%\d+]] = OpCompositeExtract %U [[f]] 1
+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u0]] 0
+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address48]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK: [[address49:%\d+]] = OpIAdd %uint [[address48]] %uint_1
+// CHECK:     [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
+// CHECK:     [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address49]]
+// CHECK:                      OpStore [[ptr]] [[v2u32]]
+
+// CHECK: [[address50:%\d+]] = OpIAdd %uint [[address48]] %uint_2
+// CHECK:         [[w:%\d+]] = OpCompositeExtract %uint [[u0]] 1
+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address50]]
+// CHECK:                      OpStore [[ptr]] [[w]]
+
+// CHECK:     [[address51:%\d+]] = OpIAdd %uint [[address48]] %uint_3
+// CHECK:             [[v:%\d+]] = OpCompositeExtract %_arr_half_uint_3 [[u1]] 0
+// CHECK:            [[v0:%\d+]] = OpCompositeExtract %half [[v]] 0
+// CHECK:            [[v1:%\d+]] = OpCompositeExtract %half [[v]] 1
+// CHECK:            [[v2:%\d+]] = OpCompositeExtract %half [[v]] 2
+// CHECK:         [[v0u16:%\d+]] = OpBitcast %ushort [[v0]]
+// CHECK:         [[v0u32:%\d+]] = OpUConvert %uint [[v0u16]]
+// CHECK:         [[v1u16:%\d+]] = OpBitcast %ushort [[v1]]
+// CHECK:         [[v1u32:%\d+]] = OpUConvert %uint [[v1u16]]
+// CHECK: [[v1u32_shifted:%\d+]] = OpShiftLeftLogical %uint [[v1u32]] %uint_16
+// CHECK:          [[word:%\d+]] = OpBitwiseOr %uint [[v0u32]] [[v1u32_shifted]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address51]]
+// CHECK:                          OpStore [[ptr]] [[word]]
+
+// CHECK:     [[address52:%\d+]] = OpIAdd %uint [[address51]] %uint_1
+// CHECK:         [[v2u16:%\d+]] = OpBitcast %ushort [[v2]]
+// CHECK:         [[v2u32:%\d+]] = OpUConvert %uint [[v2u16]]
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address52]]
+// CHECK:                          OpStore [[ptr]] [[v2u32]]
+
+// CHECK:     [[address53:%\d+]] = OpIAdd %uint [[address51]] %uint_2
+// CHECK:             [[w:%\d+]] = OpCompositeExtract %uint [[u1]] 1
+// CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address53]]
+// CHECK:                          OpStore [[ptr]] [[w]]
+
+//
+// The eighth member of S starts at byte offset 104 (26 words)
+//
+// CHECK: [[address54:%\d+]] = OpIAdd %uint [[address28]] %uint_26
+// CHECK:         [[z:%\d+]] = OpCompositeExtract %half [[s1]] 7
+// CHECK:       [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address54]]
+// CHECK:      [[zu16:%\d+]] = OpBitcast %ushort [[z]]
+// CHECK:      [[zu32:%\d+]] = OpUConvert %uint [[zu16]]
+// CHECK:                      OpStore [[ptr]] [[zu32]]

+ 123 - 0
tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.struct2.hlsl

@@ -0,0 +1,123 @@
+// Run: %dxc -T cs_6_2 -E main -enable-16bit-types -fvk-use-dx-layout
+
+ByteAddressBuffer buf;
+RWByteAddressBuffer buf2;
+
+struct T {  // Leaf struct holding only 16-bit floats.
+  float16_t x[2];  // Two halves, stored at consecutive 2-byte offsets (see layout note below).
+};
+
+struct S {  // Takes 10 bytes in the layout note below: a (2 bytes) then e (8 bytes).
+  float16_t a;  // Shares a 32-bit word with a neighboring half (see layout note below).
+  T e[2];  // Packed directly after a, with no padding in between.
+};
+
+[numthreads(64, 1, 1)]
+void main(uint3 tid : SV_DispatchThreadId) {  // Round-trips an S[2] from buf to buf2.
+  S sArr[2] = buf.Load<S[2]>(tid.x);  // Templated load of the whole array at byte offset tid.x.
+  buf2.Store<S[2]>(tid.x, sArr);  // Templated store; the CHECK lines below match its SPIR-V.
+}
+
+// Note: the DX layout tightly packs all members of S and its sub-structures.
+// It stores elements at the following byte offsets:
+// 0, 2, 4, 6, 8, 10, 12, 14, 16, 18
+//
+//                              |-----------------------|
+// address 0:                   |     a     | e[0].x[0] |
+//                              |-----------------------|
+// address 1 (byte offset 4):   | e[0].x[1] | e[1].x[0] |
+//                              |-----------------------|
+// address 2 (byte offset 8):   | e[1].x[1] |     a     |
+//                              |-----------------------|
+// address 3 (byte offset 12)   | e[0].x[0] | e[0].x[1] |
+//                              |-----------------------|
+// address 4 (byte offset 16)   | e[1].x[0] | e[1].x[1] |
+//                              |-----------------------|
+//
+
+// CHECK: OpStore %sArr
+// CHECK: OpAccessChain %_ptr_Function_uint %tid %int_0
+// CHECK: [[address0:%\d+]] = OpShiftRightLogical %uint {{%\d+}} %uint_2
+// CHECK:     [[sArr:%\d+]] = OpLoad %_arr_S_uint_2 %sArr
+// CHECK:    [[sArr0:%\d+]] = OpCompositeExtract %S [[sArr]] 0
+// CHECK:    [[sArr1:%\d+]] = OpCompositeExtract %S [[sArr]] 1
+// CHECK:     [[s0_a:%\d+]] = OpCompositeExtract %half [[sArr0]] 0
+// CHECK:     [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address0]]
+// CHECK: OpBitcast %ushort
+// CHECK: OpUConvert %uint
+// CHECK: OpStore [[ptr0]]
+// CHECK:     [[s0_e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[sArr0]] 1
+// CHECK:    [[s0_e0:%\d+]] = OpCompositeExtract %T [[s0_e]] 0
+// CHECK:    [[s0_e1:%\d+]] = OpCompositeExtract %T [[s0_e]] 1
+// CHECK:  [[s0_e0_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s0_e0]] 0
+// CHECK: [[s0_e0_x0:%\d+]] = OpCompositeExtract %half [[s0_e0_x]] 0
+// CHECK: [[s0_e0_x1:%\d+]] = OpCompositeExtract %half [[s0_e0_x]] 1
+// CHECK:     [[ptr0:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address0]]
+// CHECK: OpBitcast %ushort [[s0_e0_x0]]
+// CHECK: OpUConvert %uint
+// CHECK: OpShiftLeftLogical %uint
+// CHECK: [[oldWord0:%\d+]] = OpLoad %uint [[ptr0]]
+// CHECK: [[newWord0:%\d+]] = OpBitwiseOr %uint [[oldWord0]] {{%\d+}}
+// CHECK:                     OpStore [[ptr0]] [[newWord0]]
+// CHECK: [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
+// CHECK: OpBitcast %ushort [[s0_e0_x1]]
+// CHECK: OpUConvert %uint
+// CHECK:     [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address1]]
+// CHECK:                     OpStore [[ptr1]] {{%\d+}}
+// CHECK: [[address1:%\d+]] = OpIAdd %uint [[address0]] %uint_1
+// CHECK:  [[s0_e1_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s0_e1]] 0
+// CHECK: [[s0_e1_x0:%\d+]] = OpCompositeExtract %half [[s0_e1_x]] 0
+// CHECK: [[s0_e1_x1:%\d+]] = OpCompositeExtract %half [[s0_e1_x]] 1
+// CHECK:     [[ptr1:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address1]]
+// CHECK: OpBitcast %ushort [[s0_e1_x0]]
+// CHECK: OpUConvert %uint
+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
+// CHECK: [[oldWord1:%\d+]] = OpLoad %uint [[ptr1]]
+// CHECK: [[newWord1:%\d+]] = OpBitwiseOr %uint [[oldWord1]] {{%\d+}}
+// CHECK:                     OpStore [[ptr1]] [[newWord1]]
+
+// CHECK: [[address2:%\d+]] = OpIAdd %uint [[address1]] %uint_1
+// CHECK: OpBitcast %ushort [[s0_e1_x1]]
+// CHECK: OpUConvert %uint
+// CHECK:     [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address2]]
+// CHECK:                     OpStore [[ptr2]] {{%\d+}}
+// CHECK: [[address2:%\d+]] = OpIAdd %uint [[address0]] %uint_2
+// CHECK:     [[s1_a:%\d+]] = OpCompositeExtract %half [[sArr1]] 0
+// CHECK:     [[ptr2:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address2]]
+// CHECK: OpBitcast %ushort [[s1_a]]
+// CHECK: OpUConvert %uint
+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
+// CHECK: [[oldWord2:%\d+]] = OpLoad %uint [[ptr2]]
+// CHECK: [[newWord2:%\d+]] = OpBitwiseOr %uint [[oldWord2]] {{%\d+}}
+// CHECK:                     OpStore [[ptr2]] [[newWord2]]
+
+// CHECK: [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
+// CHECK: [[address3:%\d+]] = OpIAdd %uint [[address2]] %uint_1
+// CHECK:     [[s1_e:%\d+]] = OpCompositeExtract %_arr_T_uint_2 [[sArr1]] 1
+// CHECK:    [[s1_e0:%\d+]] = OpCompositeExtract %T [[s1_e]] 0
+// CHECK:    [[s1_e1:%\d+]] = OpCompositeExtract %T [[s1_e]] 1
+// CHECK:  [[s1_e0_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s1_e0]] 0
+// CHECK: [[s1_e0_x0:%\d+]] = OpCompositeExtract %half [[s1_e0_x]] 0
+// CHECK: [[s1_e0_x1:%\d+]] = OpCompositeExtract %half [[s1_e0_x]] 1
+// CHECK: OpBitcast %ushort [[s1_e0_x0]]
+// CHECK: OpUConvert %uint
+// CHECK: OpBitcast %ushort [[s1_e0_x1]]
+// CHECK: OpUConvert %uint
+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
+// CHECK: OpBitwiseOr %uint
+// CHECK: [[ptr3:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address3]]
+// CHECK: OpStore [[ptr3]] {{%\d+}}
+
+// CHECK: [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
+// CHECK: [[address4:%\d+]] = OpIAdd %uint [[address3]] %uint_1
+// CHECK:  [[s1_e1_x:%\d+]] = OpCompositeExtract %_arr_half_uint_2 [[s1_e1]] 0
+// CHECK: [[s1_e1_x0:%\d+]] = OpCompositeExtract %half [[s1_e1_x]] 0
+// CHECK: [[s1_e1_x1:%\d+]] = OpCompositeExtract %half [[s1_e1_x]] 1
+// CHECK: OpBitcast %ushort
+// CHECK: OpUConvert %uint
+// CHECK: OpBitcast %ushort
+// CHECK: OpUConvert %uint
+// CHECK: OpShiftLeftLogical %uint {{%\d+}} %uint_16
+// CHECK: OpBitwiseOr %uint
+// CHECK: [[ptr4:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[address4]]
+// CHECK: OpStore [[ptr4]] {{%\d+}}

+ 9 - 0
tools/clang/unittests/SPIRV/CodeGenSpirvTest.cpp

@@ -898,9 +898,18 @@ TEST_F(FileTest, ByteAddressBufferTemplatedLoadStruct) {
 TEST_F(FileTest, ByteAddressBufferTemplatedLoadStruct2) {
   runFileTest("method.byte-address-buffer.templated-load.struct2.hlsl");
 }
+TEST_F(FileTest, ByteAddressBufferTemplatedLoadStruct3) {
+  runFileTest("method.byte-address-buffer.templated-load.struct3.hlsl");
+}
 TEST_F(FileTest, ByteAddressBufferStore) {
   runFileTest("method.byte-address-buffer.store.hlsl");
 }
+TEST_F(FileTest, ByteAddressBufferTemplatedStoreStruct) {
+  runFileTest("method.byte-address-buffer.templated-store.struct.hlsl");
+}
+TEST_F(FileTest, ByteAddressBufferTemplatedStoreStruct2) {  // packed fp16 S[2]
+  runFileTest("method.byte-address-buffer.templated-store.struct2.hlsl");
+}
 TEST_F(FileTest, ByteAddressBufferGetDimensions) {
   runFileTest("method.byte-address-buffer.get-dimensions.hlsl");
 }