Преглед на файлове

Multi device copy pass and fixes (#17883)

* Introduce multi-device CopyPass and smaller fixes

Signed-off-by: Martin Winter <[email protected]>
Co-authored-by: Martin Sattlecker <[email protected]>
Co-authored-by: Martin Winter <[email protected]>
Co-authored-by: Joerg H. Mueller <[email protected]>
Martin Winter преди 1 година
родител
ревизия
2cedafc8be

+ 3 - 2
Gems/Atom/Feature/Common/Code/Source/RayTracing/RayTracingFeatureProcessor.cpp

@@ -42,12 +42,13 @@ namespace AZ
         void RayTracingFeatureProcessor::Activate()
         {
             auto deviceMask{RHI::RHISystemInterface::Get()->GetRayTracingSupport()};
+            m_rayTracingEnabled = (deviceMask != RHI::MultiDevice::NoDevices);
 
-            if (deviceMask == RHI::MultiDevice::NoDevices)
+            if (!m_rayTracingEnabled)
             {
                 return;
             }
-
+            
             m_transformServiceFeatureProcessor = GetParentScene()->GetFeatureProcessor<TransformServiceFeatureProcessor>();
 
             // initialize the ray tracing buffer pools

+ 1 - 1
Gems/Atom/RHI/Code/Include/Atom/RHI/MultiDeviceDrawItem.h

@@ -151,7 +151,7 @@ namespace AZ::RHI
         {
             for (auto& [deviceIndex, drawItem] : m_deviceDrawItemPtrs)
             {
-                drawItem->m_pipelineState = pipelineState->GetDevicePipelineState(deviceIndex).get();
+                drawItem->m_pipelineState = pipelineState ? pipelineState->GetDevicePipelineState(deviceIndex).get() : nullptr;
             }
         }
 

+ 49 - 0
Gems/Atom/RHI/Code/Include/Atom/RHI/ScopeProducerFunction.h

@@ -88,4 +88,53 @@ namespace AZ::RHI
         CompileFunction m_compileFunction;
         ExecuteFunction m_executeFunction;
     };
+
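+    // Helper class that builds a scope producer from three callbacks (prepare, compile, execute); each ScopeProducer override forwards to the matching callback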
+    class ScopeProducerFunctionNoData final : public RHI::ScopeProducer
+    {
+    public:
+        AZ_CLASS_ALLOCATOR(ScopeProducerFunctionNoData, SystemAllocator);
+
+        using PrepareFunction = AZStd::function<void(RHI::FrameGraphInterface)>;
+        using CompileFunction = AZStd::function<void(const RHI::FrameGraphCompileContext&)>;
+        using ExecuteFunction = AZStd::function<void(const RHI::FrameGraphExecuteContext&)>;
+
+        ScopeProducerFunctionNoData(
+            const RHI::ScopeId& scopeId,
+            PrepareFunction prepareFunction,
+            CompileFunction compileFunction,
+            ExecuteFunction executeFunction,
+            HardwareQueueClass hardwareQueueClass = HardwareQueueClass::Graphics,
+            int deviceIndex = RHI::MultiDevice::InvalidDeviceIndex)
+            : ScopeProducer(scopeId, deviceIndex)
+            , m_prepareFunction{ AZStd::move(prepareFunction) }
+            , m_compileFunction{ AZStd::move(compileFunction) }
+            , m_executeFunction{ AZStd::move(executeFunction) }
+        {
+            InitScope(scopeId, hardwareQueueClass, deviceIndex);
+        }
+
+    private:
+        //////////////////////////////////////////////////////////////////////////
+        // ScopeProducer overrides
+        void SetupFrameGraphDependencies(RHI::FrameGraphInterface builder) override
+        {
+            m_prepareFunction(builder);
+        }
+
+        void CompileResources(const RHI::FrameGraphCompileContext& context) override
+        {
+            m_compileFunction(context);
+        }
+
+        void BuildCommandList(const RHI::FrameGraphExecuteContext& context) override
+        {
+            m_executeFunction(context);
+        }
+        //////////////////////////////////////////////////////////////////////////
+
+        PrepareFunction m_prepareFunction;
+        CompileFunction m_compileFunction;
+        ExecuteFunction m_executeFunction;
+    };
 }

+ 8 - 7
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Buffer/BufferSystemInterface.h

@@ -30,13 +30,14 @@ namespace AZ
         //!     or you want to have more control over the pool such as define your own budget for the pool and not share the pool with others.
         enum class CommonBufferPoolType : uint8_t
         {
-            Constant = 0,           //<! For structured constants. They are often used as ConstantBuffer in shaders
-            StaticInputAssembly,    //<! For input assembly buffers that are not updated often. 
-            DynamicInputAssembly,   //<! For input assembly buffers that are updated per frame
-            ReadBack,               //<! For gpu write cpu read buffers which is mainly used to read back gpu data
-            ReadWrite,              //<! For gpu read/write buffers. They are often used as both StructuredBuffer and RWStructuredBuffer in different shaders
-            ReadOnly,               //<! For buffers which are read only. They are usually only used as StructuredBuffer in shaders
-            Indirect,               //<! For buffers which are used as indirect call arguments
+            Constant = 0, //<! For structured constants. They are often used as ConstantBuffer in shaders
+            StaticInputAssembly, //<! For input assembly buffers that are not updated often.
+            DynamicInputAssembly, //<! For input assembly buffers that are updated per frame
+            ReadBack, //<! For gpu write cpu read buffers which is mainly used to read back gpu data
+            Staging, //<! For cpu write gpu read buffers in host memory which are mainly used as the copy source when uploading data to the gpu
+            ReadWrite, //<! For gpu read/write buffers. They are often used as both StructuredBuffer and RWStructuredBuffer in different shaders
+            ReadOnly, //<! For buffers which are read only. They are usually only used as StructuredBuffer in shaders
+            Indirect, //<! For buffers which are used as indirect call arguments
 
             Count,
             Invalid = Count

+ 44 - 13
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Pass/CopyPass.h

@@ -7,36 +7,35 @@
  */
 #pragma once
 
-#include <Atom/RHI/MultiDeviceCopyItem.h>
 #include <Atom/RHI.Reflect/AttachmentEnums.h>
 #include <Atom/RHI.Reflect/Scissor.h>
 #include <Atom/RHI.Reflect/Viewport.h>
+#include <Atom/RHI/MultiDeviceCopyItem.h>
+#include <Atom/RHI/ScopeProducer.h>
 
 #include <Atom/RPI.Reflect/Pass/CopyPassData.h>
 
-#include <Atom/RPI.Public/Pass/RenderPass.h>
+#include <Atom/RPI.Public/Pass/Pass.h>
 
 namespace AZ
 {
     namespace RPI
     {
         //! A copy pass is a leaf pass (pass with no children) used for copying images and buffers on the GPU.
-        class CopyPass
-            : public RenderPass
+        class CopyPass : public Pass
         {
             AZ_RPI_PASS(CopyPass);
 
         public:
-            AZ_RTTI(CopyPass, "{7387500D-B1BA-4916-B38C-24F5C8DAF839}", RenderPass);
+            AZ_RTTI(CopyPass, "{7387500D-B1BA-4916-B38C-24F5C8DAF839}", Pass);
             AZ_CLASS_ALLOCATOR(CopyPass, SystemAllocator);
-            virtual ~CopyPass() = default;
+            virtual ~CopyPass();
 
             static Ptr<CopyPass> Create(const PassDescriptor& descriptor);
 
         protected:
             explicit CopyPass(const PassDescriptor& descriptor);
 
-            // Sets up the copy item to perform an image to image copy
             void CopyBuffer(const RHI::FrameGraphCompileContext& context);
             void CopyImage(const RHI::FrameGraphCompileContext& context);
             void CopyBufferToImage(const RHI::FrameGraphCompileContext& context);
@@ -44,20 +43,52 @@ namespace AZ
 
             // Pass behavior overrides
             void BuildInternal() override;
+            void FrameBeginInternal(FramePrepareParams params) override;
+            void ResetInternal() override;
 
             // Scope producer functions...
-            void SetupFrameGraphDependencies(RHI::FrameGraphInterface frameGraph) override;
-            void CompileResources(const RHI::FrameGraphCompileContext& context) override;
-            void BuildCommandListInternal(const RHI::FrameGraphExecuteContext& context) override;
+            void SetupFrameGraphDependenciesSameDevice(RHI::FrameGraphInterface frameGraph);
+            void CompileResourcesSameDevice(const RHI::FrameGraphCompileContext& context);
+            void BuildCommandListInternalSameDevice(const RHI::FrameGraphExecuteContext& context);
+            void SetupFrameGraphDependenciesDeviceToHost(RHI::FrameGraphInterface frameGraph);
+            void CompileResourcesDeviceToHost(const RHI::FrameGraphCompileContext& context);
+            void BuildCommandListInternalDeviceToHost(const RHI::FrameGraphExecuteContext& context);
+            void SetupFrameGraphDependenciesHostToDevice(RHI::FrameGraphInterface frameGraph);
+            void CompileResourcesHostToDevice(const RHI::FrameGraphCompileContext& context);
+            void BuildCommandListInternalHostToDevice(const RHI::FrameGraphExecuteContext& context);
 
             // Retrieves the copy item type based on the input and output attachment type
             RHI::CopyItemType GetCopyItemType();
 
             // The copy item submitted to the command list
-            RHI::MultiDeviceCopyItem m_copyItem;
+            RHI::MultiDeviceCopyItem m_copyItemSameDevice;
+            RHI::MultiDeviceCopyItem m_copyItemDeviceToHost;
+            RHI::MultiDeviceCopyItem m_copyItemHostToDevice;
+            AZStd::shared_ptr<AZ::RHI::ScopeProducer> m_copyScopeProducerSameDevice;
+            AZStd::shared_ptr<AZ::RHI::ScopeProducer> m_copyScopeProducerDeviceToHost;
+            AZStd::shared_ptr<AZ::RHI::ScopeProducer> m_copyScopeProducerHostToDevice;
 
             // Potential data provided by the PassRequest
             CopyPassData m_data;
+
+            RHI::HardwareQueueClass m_hardwareQueueClass = RHI::HardwareQueueClass::Graphics;
+
+            enum class CopyMode
+            {
+                SameDevice,
+                DifferentDevicesIntermediateHost,
+                Invalid
+            };
+            CopyMode m_copyMode = CopyMode::Invalid;
+
+            constexpr static int MaxFrames = RHI::Limits::Device::FrameCountMax;
+            int m_currentBufferIndex = 0;
+            AZStd::array<Data::Instance<Buffer>, MaxFrames> m_device1HostBuffer;
+            AZStd::array<Data::Instance<Buffer>, MaxFrames> m_device2HostBuffer;
+            AZStd::array<AZ::u64, MaxFrames> m_deviceHostBufferByteCount;
+            AZStd::array<Ptr<RHI::MultiDeviceFence>, MaxFrames> m_device1SignalFence;
+            AZStd::array<Ptr<RHI::MultiDeviceFence>, MaxFrames> m_device2WaitFence;
+            RHI::SingleDeviceImageSubresourceLayout m_inputImageLayout;
         };
-    }   // namespace RPI
-}   // namespace AZ
+    } // namespace RPI
+} // namespace AZ

+ 4 - 0
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Pass/Pass.h

@@ -51,6 +51,7 @@ namespace AZ
     {
         class FrameGraphBuilder;
         class FrameGraphAttachmentInterface;
+        class FrameGraphInterface;
     }
 
     namespace RPI
@@ -192,6 +193,9 @@ namespace AZ
             //! Adds an attachment binding to the list of this Pass' attachment bindings
             void AddAttachmentBinding(PassAttachmentBinding attachmentBinding);
 
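+            // Declares the attachments bound to this pass to the frame graph. NOTE(review): slotType presumably limits this to bindings of that slot type (Uninitialized = all) - confirm against the implementation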
+            void DeclareAttachmentsToFrameGraph(RHI::FrameGraphInterface frameGraph, PassSlotType slotType = PassSlotType::Uninitialized) const;
+
             // Returns a reference to the N-th input binding, where N is the index passed to the function
             PassAttachmentBinding& GetInputBinding(uint32_t index);
 

+ 0 - 3
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Pass/RenderPass.h

@@ -73,9 +73,6 @@ namespace AZ
 
             virtual void BuildCommandListInternal([[maybe_unused]] const RHI::FrameGraphExecuteContext& context){};
 
-            // Binds all attachments from the pass 
-            void DeclareAttachmentsToFrameGraph(RHI::FrameGraphInterface frameGraph) const;
-
             // Declares explicitly set dependencies between passes (execute after and execute before)
             // Note most pass ordering is determined by attachments. This is only used for
             // dependencies between passes that don't have any attachments/connections in common.

+ 3 - 0
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/RPIUtils.h

@@ -56,6 +56,9 @@ namespace AZ
         //! Loads a streaming image asset for the given file path
         Data::Instance<RPI::StreamingImage> LoadStreamingTexture(AZStd::string_view path);
 
+        //! Returns the single-plane format for the given ImageView aspect of a two-plane (DepthStencil) format; any other format is returned unchanged
+        RHI::Format FindFormatForAspect(RHI::Format format, RHI::ImageAspect imageAspect);
+
         //! Looks for a three arguments attribute named @attributeName in the given shader asset.
         //! Assigns the value to each non-null output variables.
         //! @param shaderAsset

+ 4 - 0
Gems/Atom/RPI/Code/Include/Atom/RPI.Reflect/Pass/CopyPassData.h

@@ -51,6 +51,10 @@ namespace AZ
             RHI::ImageSubresource m_imageDestinationSubresource;
             RHI::Origin m_imageDestinationOrigin;
 
+            // Device indices
+            int m_sourceDeviceIndex = RHI::MultiDevice::InvalidDeviceIndex;
+            int m_destinationDeviceIndex = RHI::MultiDevice::InvalidDeviceIndex; //@TODO maybe a mask so we can broadcast?
+
             // If set to true, pass will automatically create a transient output attachment based on input
             // If false, the output target of the copy will need to be specified
             bool m_cloneInput = true;

+ 5 - 0
Gems/Atom/RPI/Code/Source/RPI.Public/Buffer/BufferSystem.cpp

@@ -117,6 +117,11 @@ namespace AZ
                 bufferPoolDesc.m_heapMemoryLevel = RHI::HeapMemoryLevel::Host;
                 bufferPoolDesc.m_hostMemoryAccess = RHI::HostMemoryAccess::Read;
                 break;
+            case CommonBufferPoolType::Staging:
+                bufferPoolDesc.m_bindFlags = RHI::BufferBindFlags::CopyRead;
+                bufferPoolDesc.m_heapMemoryLevel = RHI::HeapMemoryLevel::Host;
+                bufferPoolDesc.m_hostMemoryAccess = RHI::HostMemoryAccess::Write;
+                break;
             case CommonBufferPoolType::ReadWrite:
                 // Add CopyRead flag too since it's often we need to read back GPU attachment buffers.
                 bufferPoolDesc.m_bindFlags =

+ 8 - 89
Gems/Atom/RPI/Code/Source/RPI.Public/Pass/AttachmentReadback.cpp

@@ -32,85 +32,6 @@ namespace AZ
 {
     namespace RPI
     {
-        // Helper class to build scope producer with functions
-        class ScopeProducerFunction final
-            : public RHI::ScopeProducer
-        {
-        public:
-            AZ_CLASS_ALLOCATOR(ScopeProducerFunction, SystemAllocator);
-
-            using PrepareFunction = AZStd::function<void(RHI::FrameGraphInterface)>;
-            using CompileFunction = AZStd::function<void(const RHI::FrameGraphCompileContext&)>;
-            using ExecuteFunction = AZStd::function<void(const RHI::FrameGraphExecuteContext&)>;
-
-            ScopeProducerFunction(
-                const RHI::ScopeId& scopeId,
-                PrepareFunction prepareFunction,
-                CompileFunction compileFunction,
-                ExecuteFunction executeFunction)
-                : ScopeProducer(scopeId)
-                , m_prepareFunction{ AZStd::move(prepareFunction) }
-                , m_compileFunction{ AZStd::move(compileFunction) }
-                , m_executeFunction{ AZStd::move(executeFunction) }
-            {}
-
-        private:
-            //////////////////////////////////////////////////////////////////////////
-            // ScopeProducer overrides
-            void SetupFrameGraphDependencies(RHI::FrameGraphInterface builder) override
-            {
-                m_prepareFunction(builder);
-            }
-
-            void CompileResources(const RHI::FrameGraphCompileContext& context) override
-            {
-                m_compileFunction(context);
-            }
-
-            void BuildCommandList(const RHI::FrameGraphExecuteContext& context) override
-            {
-                m_executeFunction(context);
-            }
-            //////////////////////////////////////////////////////////////////////////
-
-            PrepareFunction m_prepareFunction;
-            CompileFunction m_compileFunction;
-            ExecuteFunction m_executeFunction;
-        };
-
-
-        // Find a format for formats with two planars (DepthStencil) based on its ImageView's aspect flag
-        RHI::Format FindFormatForAspect(RHI::Format format, RHI::ImageAspect imageAspect)
-        {
-            RHI::ImageAspectFlags imageAspectFlags = RHI::GetImageAspectFlags(format);
-
-            // only need to convert is the source contains two aspects
-            if (imageAspectFlags == RHI::ImageAspectFlags::DepthStencil)
-            {
-                switch (imageAspect)
-                {
-                case RHI::ImageAspect::Stencil:
-                    return RHI::Format::R8_UINT;
-                case RHI::ImageAspect::Depth:
-                {
-                    switch (format)
-                    {
-                    case RHI::Format::D32_FLOAT_S8X24_UINT:
-                        return RHI::Format::R32_FLOAT;
-                    case RHI::Format::D24_UNORM_S8_UINT:
-                        return RHI::Format::R32_UINT;
-                    case RHI::Format::D16_UNORM_S8_UINT:
-                        return RHI::Format::R16_UNORM;
-                    default:
-                        AZ_Assert(false, "Unknown DepthStencil format. Please update this function");
-                        return RHI::Format::R32_FLOAT;
-                    }
-                }
-                }
-            }
-            return format;
-        }
-
         AttachmentReadback::AttachmentReadback(const RHI::ScopeId& scopeId) : m_dispatchItem(RHI::MultiDevice::AllDevices)
         {
             for(uint32_t i = 0; i < RHI::Limits::Device::FrameCountMax; i++)
@@ -162,12 +83,11 @@ namespace AZ
             m_decomposeOutputImageIndex = m_decomposeSrg->FindShaderInputImageIndex(Name("m_outputImage"));
 
             // build scope producer for copying
-            m_copyScopeProducer = AZStd::make_shared<ScopeProducerFunction>(
-                    scopeId,
-                    AZStd::bind(&AttachmentReadback::CopyPrepare, this, AZStd::placeholders::_1),
-                    AZStd::bind(&AttachmentReadback::CopyCompile, this, AZStd::placeholders::_1),
-                    AZStd::bind(&AttachmentReadback::CopyExecute, this, AZStd::placeholders::_1)
-                );
+            m_copyScopeProducer = AZStd::make_shared<RHI::ScopeProducerFunctionNoData>(
+                scopeId,
+                AZStd::bind(&AttachmentReadback::CopyPrepare, this, AZStd::placeholders::_1),
+                AZStd::bind(&AttachmentReadback::CopyCompile, this, AZStd::placeholders::_1),
+                AZStd::bind(&AttachmentReadback::CopyExecute, this, AZStd::placeholders::_1));
 
             m_state = ReadbackState::Idle;
         }
@@ -255,12 +175,11 @@ namespace AZ
                 if (m_imageDescriptor.m_multisampleState.m_samples > 1)
                 {
                     m_copyAttachmentId = RHI::AttachmentId(AZStd::string::format("%s_Decomposed", m_attachmentId.GetCStr()));
-                    m_decomposeScopeProducer = AZStd::make_shared<ScopeProducerFunction>(
+                    m_decomposeScopeProducer = AZStd::make_shared<RHI::ScopeProducerFunctionNoData>(
                         m_copyAttachmentId,
                         AZStd::bind(&AttachmentReadback::DecomposePrepare, this, AZStd::placeholders::_1),
                         AZStd::bind(&AttachmentReadback::DecomposeCompile, this, AZStd::placeholders::_1),
-                        AZStd::bind(&AttachmentReadback::DecomposeExecute, this, AZStd::placeholders::_1)
-                        );
+                        AZStd::bind(&AttachmentReadback::DecomposeExecute, this, AZStd::placeholders::_1));
                 }
             }
             return true;
@@ -652,5 +571,5 @@ namespace AZ
             }
             return true;
         }
-    }   // namespace RPI
+    } // namespace RPI
 }   // namespace AZ

+ 393 - 22
Gems/Atom/RPI/Code/Source/RPI.Public/Pass/CopyPass.cpp

@@ -8,21 +8,29 @@
 
 #include <Atom/RHI/CommandList.h>
 #include <Atom/RHI/DrawListTagRegistry.h>
+#include <Atom/RHI/FrameGraphBuilder.h>
 #include <Atom/RHI/RHISystemInterface.h>
-
-#include <Atom/RPI.Public/RenderPipeline.h>
+#include <Atom/RHI/ScopeProducerFunction.h>
+#include <Atom/RPI.Public/Pass/CopyPass.h>
+#include <Atom/RPI.Public/Pass/PassUtils.h>
+#include <Atom/RPI.Public/Pass/RenderPass.h>
 #include <Atom/RPI.Public/RPISystemInterface.h>
+#include <Atom/RPI.Public/RPIUtils.h>
+#include <Atom/RPI.Public/RenderPipeline.h>
 #include <Atom/RPI.Public/Scene.h>
 #include <Atom/RPI.Public/View.h>
-#include <Atom/RPI.Public/Pass/CopyPass.h>
-#include <Atom/RPI.Public/Pass/PassUtils.h>
 
 namespace AZ
 {
     namespace RPI
-    {
+    {        
         // --- Creation & Initialization ---
 
+        CopyPass::~CopyPass()
+        {
+            ResetInternal();
+        }
+
         Ptr<CopyPass> CopyPass::Create(const PassDescriptor& descriptor)
         {
             Ptr<CopyPass> pass = aznew CopyPass(descriptor);
@@ -30,7 +38,7 @@ namespace AZ
         }
 
         CopyPass::CopyPass(const PassDescriptor& descriptor)
-            : RenderPass(descriptor)
+            : Pass(descriptor)
         {
             const CopyPassData* copyData = PassUtils::GetPassData<CopyPassData>(descriptor);
 
@@ -84,6 +92,71 @@ namespace AZ
                 "CopyPass must have exactly 2 bindings: 1 input and 1 output. %s has %d bindings.",
                 GetPathName().GetCStr(), m_attachmentBindings.size());
 
+            bool sameDevice = (m_data.m_sourceDeviceIndex == -1 && m_data.m_destinationDeviceIndex == -1) ||
+                m_data.m_sourceDeviceIndex == m_data.m_destinationDeviceIndex;
+            AZ_Assert(
+                sameDevice || (m_data.m_sourceDeviceIndex != -1 && m_data.m_destinationDeviceIndex != -1),
+                "CopyPass: Either source and destination device indices must be invalid, or both must be valid");
+
+            m_copyMode = sameDevice ? CopyMode::SameDevice : CopyMode::DifferentDevicesIntermediateHost;
+
+            if (m_copyMode == CopyMode::SameDevice)
+            {
+                m_copyScopeProducerSameDevice = AZStd::make_shared<RHI::ScopeProducerFunctionNoData>(
+                    RHI::ScopeId{ GetPathName() },
+                    AZStd::bind(&CopyPass::SetupFrameGraphDependenciesSameDevice, this, AZStd::placeholders::_1),
+                    AZStd::bind(&CopyPass::CompileResourcesSameDevice, this, AZStd::placeholders::_1),
+                    AZStd::bind(&CopyPass::BuildCommandListInternalSameDevice, this, AZStd::placeholders::_1),
+                    m_hardwareQueueClass);
+            }
+            else if (m_copyMode == CopyMode::DifferentDevicesIntermediateHost)
+            {
+                [[maybe_unused]] auto* device1 =
+                    RHI::RHISystemInterface::Get()->GetDevice(m_data.m_sourceDeviceIndex != RHI::MultiDevice::InvalidDeviceIndex ? m_data.m_sourceDeviceIndex : RHI::MultiDevice::DefaultDeviceIndex);
+                AZ_Assert(
+                    device1->GetFeatures().m_signalFenceFromCPU,
+                    "CopyPass: Device to device copy is only possible if all devices support signalling fences from the CPU");
+                [[maybe_unused]] auto* device2 =
+                    RHI::RHISystemInterface::Get()->GetDevice(m_data.m_destinationDeviceIndex != RHI::MultiDevice::InvalidDeviceIndex ? m_data.m_destinationDeviceIndex : RHI::MultiDevice::DefaultDeviceIndex);
+                AZ_Assert(
+                    device2->GetFeatures().m_signalFenceFromCPU,
+                    "CopyPass: Device to device copy is only possible if all devices support signalling fences from the CPU");
+
+                // Initialize #MaxFrames fences that are signaled on device 1 and perform the copy between the host staging buffers from device 1 to device 2
+                for (auto& fence : m_device1SignalFence)
+                {
+                    fence = new RHI::MultiDeviceFence();
+                    AZ_Assert(fence != nullptr, "CopyPass failed to create a fence");
+                    [[maybe_unused]] RHI::ResultCode result = fence->Init(RHI::MultiDevice::AllDevices, RHI::FenceState::Signaled);
+                    AZ_Assert(result == RHI::ResultCode::Success, "CopyPass failed to init fence");
+                }
+
+                // Initialize #MaxFrames fences that can be waited for on device 2 before data is uploaded to device 2
+                for (auto& fence : m_device2WaitFence)
+                {
+                    fence = new RHI::MultiDeviceFence();
+                    AZ_Assert(fence != nullptr, "CopyPass failed to create a fence");
+                    [[maybe_unused]] auto result = fence->Init(RHI::MultiDevice::AllDevices, RHI::FenceState::Signaled);
+                    AZ_Assert(result == RHI::ResultCode::Success, "CopyPass failed to init fence");
+                }
+
+                m_copyScopeProducerDeviceToHost = AZStd::make_shared<RHI::ScopeProducerFunctionNoData>(
+                    RHI::ScopeId{ AZStd::string(GetPathName().GetStringView()) },
+                    AZStd::bind(&CopyPass::SetupFrameGraphDependenciesDeviceToHost, this, AZStd::placeholders::_1),
+                    AZStd::bind(&CopyPass::CompileResourcesDeviceToHost, this, AZStd::placeholders::_1),
+                    AZStd::bind(&CopyPass::BuildCommandListInternalDeviceToHost, this, AZStd::placeholders::_1),
+                    m_hardwareQueueClass,
+                    m_data.m_sourceDeviceIndex);
+
+                m_copyScopeProducerHostToDevice = AZStd::make_shared<RHI::ScopeProducerFunctionNoData>(
+                    RHI::ScopeId{ AZStd::string(GetPathName().GetStringView()) + "_2" },
+                    AZStd::bind(&CopyPass::SetupFrameGraphDependenciesHostToDevice, this, AZStd::placeholders::_1),
+                    AZStd::bind(&CopyPass::CompileResourcesHostToDevice, this, AZStd::placeholders::_1),
+                    AZStd::bind(&CopyPass::BuildCommandListInternalHostToDevice, this, AZStd::placeholders::_1),
+                    m_hardwareQueueClass,
+                    m_data.m_destinationDeviceIndex);
+            }
+            
             // Create transient attachment based on input if required
             if (m_data.m_cloneInput)
             {
@@ -109,14 +182,54 @@ namespace AZ
             }
         }
 
+        void CopyPass::FrameBeginInternal(Pass::FramePrepareParams params)
+        {
+            if (m_copyMode == CopyMode::SameDevice)
+            {
+                params.m_frameGraphBuilder->ImportScopeProducer(*m_copyScopeProducerSameDevice);
+            }
+            else if (m_copyMode == CopyMode::DifferentDevicesIntermediateHost)
+            {
+                params.m_frameGraphBuilder->ImportScopeProducer(*m_copyScopeProducerDeviceToHost);
+                params.m_frameGraphBuilder->ImportScopeProducer(*m_copyScopeProducerHostToDevice);
+                m_currentBufferIndex = (m_currentBufferIndex + 1) % MaxFrames;
+                m_device1SignalFence[m_currentBufferIndex]->Reset();
+                m_device2WaitFence[m_currentBufferIndex]->Reset();
+            }
+        }
+
+        void CopyPass::ResetInternal()
+        {
+            Pass::ResetInternal();
+            if (m_copyMode == CopyMode::DifferentDevicesIntermediateHost)
+            {
+                for (auto& fence : m_device1SignalFence)
+                {
+                    fence
+                        ->GetDeviceFence(
+                            m_data.m_sourceDeviceIndex != RHI::MultiDevice::InvalidDeviceIndex ? m_data.m_sourceDeviceIndex
+                                                                                               : RHI::MultiDevice::DefaultDeviceIndex)
+                        ->WaitOnCpu();
+                }
+                for (auto& fence : m_device2WaitFence)
+                {
+                    fence
+                        ->GetDeviceFence(
+                            m_data.m_destinationDeviceIndex != RHI::MultiDevice::InvalidDeviceIndex ? m_data.m_destinationDeviceIndex
+                                                                                                    : RHI::MultiDevice::DefaultDeviceIndex)
+                        ->WaitOnCpu();
+                }
+            }
+        }
+
         // --- Scope producer functions ---
 
-        void CopyPass::SetupFrameGraphDependencies(RHI::FrameGraphInterface frameGraph)
+        void CopyPass::SetupFrameGraphDependenciesSameDevice(RHI::FrameGraphInterface frameGraph)
         {
-            RenderPass::SetupFrameGraphDependencies(frameGraph);
+            DeclareAttachmentsToFrameGraph(frameGraph);
         }
 
-        void CopyPass::CompileResources(const RHI::FrameGraphCompileContext& context)
+        void CopyPass::CompileResourcesSameDevice(const RHI::FrameGraphCompileContext& context)
         {
             RHI::CopyItemType copyType = GetCopyItemType();
             switch (copyType)
@@ -138,11 +251,268 @@ namespace AZ
             }
         }
 
-        void CopyPass::BuildCommandListInternal(const RHI::FrameGraphExecuteContext& context)
+        // Execute step of the same-device copy scope: submits m_copyItemSameDevice for the device of the
+        // executing context, unless the copy item is marked invalid.
+        void CopyPass::BuildCommandListInternalSameDevice(const RHI::FrameGraphExecuteContext& context)
         {
-            if (m_copyItem.m_type != RHI::CopyItemType::Invalid)
+            if (m_copyItemSameDevice.m_type != RHI::CopyItemType::Invalid)
             {
-                context.GetCommandList()->Submit(m_copyItem.GetDeviceCopyItem(context.GetDeviceIndex()));
+                context.GetCommandList()->Submit(m_copyItemSameDevice.GetDeviceCopyItem(context.GetDeviceIndex()));
+            }
+        }
+
+        // Frame graph setup for the device-to-host scope: declares all attachments and lets the scope signal
+        // the device 1 fence of the current frame slot.
+        void CopyPass::SetupFrameGraphDependenciesDeviceToHost(RHI::FrameGraphInterface frameGraph)
+        {
+            // Every attachment is declared, including the outputs: an image to image copy needs the size of the
+            // output image, and the frame graph has to know that the two scopes depend on each other
+            DeclareAttachmentsToFrameGraph(frameGraph);
+
+            auto& signalFence = *m_device1SignalFence[m_currentBufferIndex];
+            frameGraph.SignalFence(signalFence);
+        }
+
+        // Compile step of the device-to-host scope. Builds m_copyItemDeviceToHost so that the input attachment is copied
+        // into the read-back host buffer of the current frame slot. Both intermediate host buffers of that slot are
+        // (re)created whenever the required byte count changes. If a required attachment cannot be found, a warning is
+        // emitted and the copy item is left invalid.
+        void CopyPass::CompileResourcesDeviceToHost(const RHI::FrameGraphCompileContext& context)
+        {
+            // Start from an invalid copy item (as CompileResourcesHostToDevice does) so that an early-out below never
+            // leaves a copy item compiled in a previous frame to be submitted again
+            m_copyItemDeviceToHost = {};
+            m_copyItemDeviceToHost.m_type = RHI::CopyItemType::Invalid;
+
+            // (Re)creates the two intermediate host buffers of the current frame slot when the byte count changed
+            const auto ensureHostBuffers = [this](AZ::u64 byteCount)
+            {
+                if (m_deviceHostBufferByteCount[m_currentBufferIndex] == byteCount)
+                {
+                    return;
+                }
+                m_deviceHostBufferByteCount[m_currentBufferIndex] = byteCount;
+
+                RPI::CommonBufferDescriptor desc;
+                desc.m_poolType = RPI::CommonBufferPoolType::ReadBack;
+                desc.m_bufferName = AZStd::string(GetPathName().GetStringView()) + "_hostbuffer";
+                desc.m_byteCount = m_deviceHostBufferByteCount[m_currentBufferIndex];
+                m_device1HostBuffer[m_currentBufferIndex] = BufferSystemInterface::Get()->CreateBufferFromCommonPool(desc);
+
+                // The second buffer is the copy source of the host-to-device scope, so it is taken from the staging pool
+                // for both the image and the buffer path
+                desc.m_bufferName = AZStd::string(GetPathName().GetStringView()) + "_hostbuffer2";
+                desc.m_poolType = RPI::CommonBufferPoolType::Staging;
+                m_device2HostBuffer[m_currentBufferIndex] = BufferSystemInterface::Get()->CreateBufferFromCommonPool(desc);
+            };
+
+            RHI::CopyItemType copyType = GetCopyItemType();
+            auto inputId = GetInputBinding(0).GetAttachment()->GetAttachmentId();
+            switch (copyType)
+            {
+            case AZ::RHI::CopyItemType::Image:
+                [[fallthrough]];
+            case AZ::RHI::CopyItemType::ImageToBuffer:
+                {
+                    // copy image to read back buffer since only buffer can be accessed by host
+                    const auto* sourceImage = context.GetImage(inputId);
+                    if (!sourceImage)
+                    {
+                        AZ_Warning("AttachmentReadback", false, "Failed to find attachment image %s for copy to buffer", inputId.GetCStr());
+                        return;
+                    }
+                    const auto& sourceImageDescriptor = sourceImage->GetDescriptor();
+                    const uint16_t sourceMipSlice = m_data.m_imageSourceSubresource.m_mipSlice;
+                    RHI::ImageSubresourceRange sourceRange(sourceMipSlice, sourceMipSlice, 0, 0);
+                    sourceRange.m_aspectFlags = RHI::ImageAspectFlags::Color;
+
+                    // Formats with a depth aspect are copied through their depth aspect, everything else as color
+                    RHI::ImageAspect sourceImageAspect = RHI::ImageAspect::Color;
+                    RHI::ImageAspectFlags sourceImageAspectFlags = RHI::GetImageAspectFlags(sourceImageDescriptor.m_format);
+                    if (RHI::CheckBitsAll(sourceImageAspectFlags, RHI::ImageAspectFlags::Depth))
+                    {
+                        sourceImageAspect = RHI::ImageAspect::Depth;
+                        sourceRange.m_aspectFlags = RHI::ImageAspectFlags::Depth;
+                    }
+
+                    AZStd::vector<RHI::SingleDeviceImageSubresourceLayout> sourceImageSubResourcesLayouts;
+                    sourceImageSubResourcesLayouts.resize_no_construct(sourceImageDescriptor.m_mipLevels);
+                    size_t sourceTotalSizeInBytes = 0;
+                    sourceImage->GetDeviceImage(m_data.m_sourceDeviceIndex)
+                        ->GetSubresourceLayouts(sourceRange, sourceImageSubResourcesLayouts.data(), &sourceTotalSizeInBytes);
+
+                    ensureHostBuffers(sourceTotalSizeInBytes);
+
+                    // copy descriptor for copying image to buffer
+                    RHI::MultiDeviceCopyImageToBufferDescriptor copyImageToBufferDesc;
+                    copyImageToBufferDesc.m_mdSourceImage = sourceImage;
+                    copyImageToBufferDesc.m_sourceSize = sourceImageSubResourcesLayouts[sourceMipSlice].m_size;
+                    copyImageToBufferDesc.m_sourceSubresource = RHI::ImageSubresource(sourceMipSlice, 0 /*arraySlice*/, sourceImageAspect);
+                    copyImageToBufferDesc.m_destinationOffset = 0;
+
+                    if (copyType == RHI::CopyItemType::ImageToBuffer)
+                    {
+                        // The final destination is a buffer: lay the data out like the source image
+                        copyImageToBufferDesc.m_destinationBytesPerRow = sourceImageSubResourcesLayouts[sourceMipSlice].m_bytesPerRow;
+                        copyImageToBufferDesc.m_destinationBytesPerImage = sourceImageSubResourcesLayouts[sourceMipSlice].m_bytesPerImage;
+                        copyImageToBufferDesc.m_mdDestinationBuffer = m_device1HostBuffer[m_currentBufferIndex]->GetRHIBuffer();
+                        copyImageToBufferDesc.m_destinationFormat = FindFormatForAspect(sourceImageDescriptor.m_format, sourceImageAspect);
+                    }
+                    else
+                    {
+                        // The final destination is an image: lay the data out like the destination image
+                        auto outputId = GetOutputBinding(0).GetAttachment()->GetAttachmentId();
+                        const auto* destImage = context.GetImage(outputId);
+                        if (!destImage)
+                        {
+                            // Report the attachment that is actually missing (the output, not the input)
+                            AZ_Warning(
+                                "AttachmentReadback", false, "Failed to find attachment image %s for copy to buffer", outputId.GetCStr());
+                            return;
+                        }
+
+                        const auto& destImageDescriptor = destImage->GetDescriptor();
+                        // NOTE(review): the destination mip slice is read from m_imageSourceSubresource and the destination
+                        // layout is queried on m_sourceDeviceIndex below - confirm that neither should use the destination
+                        // counterpart
+                        const uint16_t destMipSlice = m_data.m_imageSourceSubresource.m_mipSlice;
+                        RHI::ImageSubresourceRange destRange(destMipSlice, destMipSlice, 0, 0);
+                        destRange.m_aspectFlags = RHI::ImageAspectFlags::Color;
+
+                        RHI::ImageAspect destImageAspect = RHI::ImageAspect::Color;
+                        RHI::ImageAspectFlags destImageAspectFlags = RHI::GetImageAspectFlags(destImageDescriptor.m_format);
+                        if (RHI::CheckBitsAll(destImageAspectFlags, RHI::ImageAspectFlags::Depth))
+                        {
+                            destImageAspect = RHI::ImageAspect::Depth;
+                            destRange.m_aspectFlags = RHI::ImageAspectFlags::Depth;
+                        }
+
+                        AZStd::vector<RHI::SingleDeviceImageSubresourceLayout> destImageSubResourcesLayouts;
+                        destImageSubResourcesLayouts.resize_no_construct(destImageDescriptor.m_mipLevels);
+                        size_t destTotalSizeInBytes = 0;
+                        destImage->GetDeviceImage(m_data.m_sourceDeviceIndex)
+                            ->GetSubresourceLayouts(destRange, destImageSubResourcesLayouts.data(), &destTotalSizeInBytes);
+
+                        copyImageToBufferDesc.m_destinationBytesPerRow = destImageSubResourcesLayouts[destMipSlice].m_bytesPerRow;
+                        copyImageToBufferDesc.m_destinationBytesPerImage = destImageSubResourcesLayouts[destMipSlice].m_bytesPerImage;
+                        copyImageToBufferDesc.m_mdDestinationBuffer = m_device1HostBuffer[m_currentBufferIndex]->GetRHIBuffer();
+                        copyImageToBufferDesc.m_destinationFormat = FindFormatForAspect(destImageDescriptor.m_format, destImageAspect);
+                    }
+
+                    // Remembered for the host-to-device scope, which reads it when the destination is an image
+                    m_inputImageLayout = sourceImageSubResourcesLayouts[sourceMipSlice];
+
+                    m_copyItemDeviceToHost = copyImageToBufferDesc;
+                }
+                break;
+            case AZ::RHI::CopyItemType::Buffer:
+                [[fallthrough]];
+            case AZ::RHI::CopyItemType::BufferToImage:
+                {
+                    const auto* buffer = context.GetBuffer(inputId);
+                    if (!buffer)
+                    {
+                        AZ_Warning("AttachmentReadback", false, "Failed to find attachment buffer %s for copy to buffer", inputId.GetCStr());
+                        return;
+                    }
+
+                    ensureHostBuffers(buffer->GetDescriptor().m_byteCount);
+
+                    // copy buffer
+                    RHI::MultiDeviceCopyBufferDescriptor copyBuffer;
+                    copyBuffer.m_mdSourceBuffer = buffer;
+                    copyBuffer.m_mdDestinationBuffer = m_device1HostBuffer[m_currentBufferIndex]->GetRHIBuffer();
+                    copyBuffer.m_size = aznumeric_cast<uint32_t>(m_deviceHostBufferByteCount[m_currentBufferIndex]);
+
+                    m_copyItemDeviceToHost = copyBuffer;
+                }
+                break;
+            default:
+                break;
+            }
+        }
+
+        // Execute step of the device-to-host scope. Submits the compiled copy item (if it is valid) and registers an
+        // asynchronous CPU callback on the device 1 signal fence of the current frame slot. The callback copies the
+        // content of the device 1 host buffer into the device 2 host buffer and then signals the device 2 wait fence
+        // from the CPU.
+        void CopyPass::BuildCommandListInternalDeviceToHost(const RHI::FrameGraphExecuteContext& context)
+        {
+            if (m_copyItemDeviceToHost.m_type != RHI::CopyItemType::Invalid)
+            {
+                context.GetCommandList()->Submit(m_copyItemDeviceToHost.GetDeviceCopyItem(context.GetDeviceIndex()));
+            }
+
+            // Once signaled on device 1, we can map the host staging buffers on device 1 and 2 and copy data from 1 -> 2 and then signal the upload on device 2
+            m_device1SignalFence[m_currentBufferIndex]
+                ->GetDeviceFence(context.GetDeviceIndex())
+                ->WaitOnCpuAsync(
+                    // The frame slot is captured by value because m_currentBufferIndex may have advanced when this runs
+                    [this, bufferIndex = m_currentBufferIndex]()
+                    {
+                        auto& sourceHostBuffer = m_device1HostBuffer[bufferIndex];
+                        auto& destinationHostBuffer = m_device2HostBuffer[bufferIndex];
+
+                        // The host buffers are only created by CompileResourcesDeviceToHost; if that bailed out before
+                        // creating them there is nothing to copy
+                        if (sourceHostBuffer && destinationHostBuffer)
+                        {
+                            auto bufferSize = destinationHostBuffer->GetBufferSize();
+                            void* data1 = sourceHostBuffer->Map(bufferSize, 0)[m_data.m_sourceDeviceIndex];
+                            void* data2 = destinationHostBuffer->Map(bufferSize, 0)[m_data.m_destinationDeviceIndex];
+
+                            // Never hand a null pointer to memcpy if one of the buffers has no mapping for its device
+                            if (data1 && data2)
+                            {
+                                memcpy(data2, data1, bufferSize);
+                            }
+                            sourceHostBuffer->Unmap();
+                            destinationHostBuffer->Unmap();
+                        }
+
+                        // Always signal: the host-to-device scope waits on this fence
+                        m_device2WaitFence[bufferIndex]->GetDeviceFence(m_data.m_destinationDeviceIndex)->SignalOnCpu();
+                    });
+        }
+
+        // Frame graph setup for the host-to-device scope: declares the output attachments, sets up the execution
+        // order of the scope and lets it wait on the device 2 fence of the current frame slot.
+        void CopyPass::SetupFrameGraphDependenciesHostToDevice(RHI::FrameGraphInterface frameGraph)
+        {
+            // Only the output slots are declared for this scope
+            DeclareAttachmentsToFrameGraph(frameGraph, PassSlotType::Output);
+
+            // NOTE(review): the scope is ordered after m_copyScopeProducerHostToDevice - confirm that this is the
+            // intended producer and not the device-to-host one
+            frameGraph.ExecuteAfter(m_copyScopeProducerHostToDevice->GetScopeId());
+
+            for (Pass* executeBeforePass : m_executeBeforePasses)
+            {
+                // Passes that are not render passes are skipped
+                if (auto* renderPass = azrtti_cast<RenderPass*>(executeBeforePass))
+                {
+                    frameGraph.ExecuteBefore(renderPass->GetScopeId());
+                }
+            }
+
+            auto& waitFence = *m_device2WaitFence[m_currentBufferIndex];
+            frameGraph.WaitFence(waitFence);
+        }
+
+        // Compile step of the host-to-device scope. Builds m_copyItemHostToDevice so that the device 2 host buffer of
+        // the current frame slot is copied into the output attachment (a buffer or an image, depending on the copy
+        // type). The copy item is reset to invalid first and stays invalid when the host buffer or the destination
+        // attachment is not available, in which case a warning is emitted.
+        void CopyPass::CompileResourcesHostToDevice(const RHI::FrameGraphCompileContext& context)
+        {
+            m_copyItemHostToDevice = {};
+            m_copyItemHostToDevice.m_type = RHI::CopyItemType::Invalid;
+            PassAttachmentBinding& copyDest = GetOutputBinding(0);
+            auto outputId = copyDest.GetAttachment()->GetAttachmentId();
+
+            // The host buffer is created by the device-to-host compile step; without it there is nothing to upload
+            const auto& hostBuffer = m_device2HostBuffer[m_currentBufferIndex];
+            if (!hostBuffer)
+            {
+                AZ_Warning("CopyPass", false, "Host buffer for copy to attachment %s is not available", outputId.GetCStr());
+                return;
+            }
+
+            RHI::CopyItemType copyType = GetCopyItemType();
+            switch (copyType)
+            {
+            case AZ::RHI::CopyItemType::Buffer:
+                [[fallthrough]];
+            case AZ::RHI::CopyItemType::ImageToBuffer:
+                {
+                    const auto* buffer = context.GetBuffer(outputId);
+                    if (!buffer)
+                    {
+                        AZ_Warning("CopyPass", false, "Failed to find attachment buffer %s for copy from host", outputId.GetCStr());
+                        return;
+                    }
+
+                    RHI::MultiDeviceCopyBufferDescriptor copyBuffer;
+                    copyBuffer.m_mdSourceBuffer = hostBuffer->GetRHIBuffer();
+                    copyBuffer.m_mdDestinationBuffer = buffer;
+                    copyBuffer.m_size = aznumeric_cast<uint32_t>(hostBuffer->GetBufferSize());
+
+                    m_copyItemHostToDevice = copyBuffer;
+                }
+                break;
+            case AZ::RHI::CopyItemType::Image:
+                [[fallthrough]];
+            case AZ::RHI::CopyItemType::BufferToImage:
+                {
+                    const auto* destinationImage = context.GetImage(outputId);
+                    if (!destinationImage)
+                    {
+                        AZ_Warning("CopyPass", false, "Failed to find attachment image %s for copy from host", outputId.GetCStr());
+                        return;
+                    }
+
+                    RHI::MultiDeviceCopyBufferToImageDescriptor copyDesc;
+
+                    const auto* sourceBuffer = hostBuffer->GetRHIBuffer();
+                    copyDesc.m_mdSourceBuffer = sourceBuffer;
+
+                    copyDesc.m_sourceOffset = 0;
+                    if (copyType == RHI::CopyItemType::BufferToImage)
+                    {
+                        // The original source was a buffer: its layout comes from the pass data
+                        copyDesc.m_sourceBytesPerRow = m_data.m_bufferSourceBytesPerRow;
+                        copyDesc.m_sourceBytesPerImage = m_data.m_bufferSourceBytesPerImage;
+                        copyDesc.m_sourceSize = m_data.m_sourceSize;
+                    }
+                    else
+                    {
+                        // The original source was an image: use the layout stored in m_inputImageLayout
+                        copyDesc.m_sourceBytesPerRow = m_inputImageLayout.m_bytesPerRow;
+                        copyDesc.m_sourceBytesPerImage = m_inputImageLayout.m_bytesPerImage;
+                        copyDesc.m_sourceSize = m_inputImageLayout.m_size;
+                    }
+
+                    // Destination Image
+                    copyDesc.m_mdDestinationImage = destinationImage;
+                    copyDesc.m_destinationOrigin = m_data.m_imageDestinationOrigin;
+                    copyDesc.m_destinationSubresource = m_data.m_imageDestinationSubresource;
+
+                    m_copyItemHostToDevice = copyDesc;
+                }
+                break;
+            default:
+                break;
+            }
+        }
+
+        // Execute step of the host-to-device scope: submits the copy item built by CompileResourcesHostToDevice
+        // for the device of the executing context, unless the copy item is marked invalid.
+        void CopyPass::BuildCommandListInternalHostToDevice(const RHI::FrameGraphExecuteContext& context)
+        {
+            if (m_copyItemHostToDevice.m_type != RHI::CopyItemType::Invalid)
+            {
+                context.GetCommandList()->Submit(m_copyItemHostToDevice.GetDeviceCopyItem(context.GetDeviceIndex()));
             }
         }
 
@@ -154,7 +524,7 @@ namespace AZ
 
             // Source Buffer
             PassAttachmentBinding& copySource = GetInputBinding(0);
-            const AZ::RHI::MultiDeviceBuffer* sourceBuffer = context.GetBuffer(copySource.GetAttachment()->GetAttachmentId());
+            const auto* sourceBuffer = context.GetBuffer(copySource.GetAttachment()->GetAttachmentId());
             copyDesc.m_mdSourceBuffer = sourceBuffer;
             copyDesc.m_size = static_cast<uint32_t>(sourceBuffer->GetDescriptor().m_byteCount);
             copyDesc.m_sourceOffset = m_data.m_bufferSourceOffset;
@@ -162,9 +532,10 @@ namespace AZ
             // Destination Buffer
             PassAttachmentBinding& copyDest = GetOutputBinding(0);
             copyDesc.m_mdDestinationBuffer = context.GetBuffer(copyDest.GetAttachment()->GetAttachmentId());
+            copyDesc.m_mdDestinationBuffer = context.GetBuffer(copyDest.GetAttachment()->GetAttachmentId());
             copyDesc.m_destinationOffset = m_data.m_bufferDestinationOffset;
 
-            m_copyItem = copyDesc;
+            m_copyItemSameDevice = copyDesc;
         }
 
         void CopyPass::CopyImage(const RHI::FrameGraphCompileContext& context)
@@ -173,7 +544,7 @@ namespace AZ
 
             // Source Image
             PassAttachmentBinding& copySource = GetInputBinding(0);
-            const AZ::RHI::MultiDeviceImage* sourceImage = context.GetImage(copySource.GetAttachment()->GetAttachmentId());
+            const auto* sourceImage = context.GetImage(copySource.GetAttachment()->GetAttachmentId());
             copyDesc.m_mdSourceImage = sourceImage;
             copyDesc.m_sourceSize = sourceImage->GetDescriptor().m_size;
             copyDesc.m_sourceOrigin = m_data.m_imageSourceOrigin;
@@ -185,7 +556,7 @@ namespace AZ
             copyDesc.m_destinationOrigin = m_data.m_imageDestinationOrigin;
             copyDesc.m_destinationSubresource = m_data.m_imageDestinationSubresource;
 
-            m_copyItem = copyDesc;
+            m_copyItemSameDevice = copyDesc;
         }
 
         void CopyPass::CopyBufferToImage(const RHI::FrameGraphCompileContext& context)
@@ -194,7 +565,7 @@ namespace AZ
 
             // Source Buffer
             PassAttachmentBinding& copySource = GetInputBinding(0);
-            const AZ::RHI::MultiDeviceBuffer* sourceBuffer = context.GetBuffer(copySource.GetAttachment()->GetAttachmentId());
+            const auto* sourceBuffer = context.GetBuffer(copySource.GetAttachment()->GetAttachmentId());
             copyDesc.m_mdSourceBuffer = sourceBuffer;
             copyDesc.m_sourceSize = m_data.m_sourceSize;
             copyDesc.m_sourceOffset = m_data.m_bufferSourceOffset;
@@ -207,7 +578,7 @@ namespace AZ
             copyDesc.m_destinationOrigin = m_data.m_imageDestinationOrigin;
             copyDesc.m_destinationSubresource = m_data.m_imageDestinationSubresource;
 
-            m_copyItem = copyDesc;
+            m_copyItemSameDevice = copyDesc;
         }
 
         void CopyPass::CopyImageToBuffer(const RHI::FrameGraphCompileContext& context)
@@ -216,7 +587,7 @@ namespace AZ
 
             // Source Image
             PassAttachmentBinding& copySource = GetInputBinding(0);
-            const AZ::RHI::MultiDeviceImage* sourceImage = context.GetImage(copySource.GetAttachment()->GetAttachmentId());
+            const auto* sourceImage = context.GetImage(copySource.GetAttachment()->GetAttachmentId());
             copyDesc.m_mdSourceImage = sourceImage;
             copyDesc.m_sourceSize = sourceImage->GetDescriptor().m_size;
             copyDesc.m_sourceOrigin = m_data.m_imageSourceOrigin;
@@ -229,8 +600,8 @@ namespace AZ
             copyDesc.m_destinationBytesPerRow = m_data.m_bufferDestinationBytesPerRow;
             copyDesc.m_destinationBytesPerImage = m_data.m_bufferDestinationBytesPerImage;
 
-            m_copyItem = copyDesc;
+            m_copyItemSameDevice = copyDesc;
         }
 
-    }   // namespace RPI
-}   // namespace AZ
+    } // namespace RPI
+} // namespace AZ

+ 30 - 0
Gems/Atom/RPI/Code/Source/RPI.Public/Pass/Pass.cpp

@@ -829,6 +829,36 @@ namespace AZ
             }
         }
 
+        // Declares the attachment bindings of this pass to the frame graph. With slotType == PassSlotType::Uninitialized
+        // every binding is considered, otherwise only bindings whose slot type equals slotType. Bindings without an
+        // attachment, or whose attachment is not valid in the frame graph's attachment database, are skipped.
+        void Pass::DeclareAttachmentsToFrameGraph(RHI::FrameGraphInterface frameGraph, PassSlotType slotType) const
+        {
+            const bool declareAllSlots = (slotType == PassSlotType::Uninitialized);
+
+            for (const PassAttachmentBinding& binding : m_attachmentBindings)
+            {
+                if (!declareAllSlots && slotType != binding.m_slotType)
+                {
+                    continue;
+                }
+
+                const auto& attachment = binding.GetAttachment();
+                if (attachment == nullptr ||
+                    !frameGraph.GetAttachmentDatabase().IsAttachmentValid(attachment->GetAttachmentId()))
+                {
+                    continue;
+                }
+
+                // Image and buffer attachments are declared through their respective scope descriptor
+                const auto& scopeDesc = binding.m_unifiedScopeDesc;
+                switch (scopeDesc.GetType())
+                {
+                case RHI::AttachmentType::Image:
+                    frameGraph.UseAttachment(scopeDesc.GetAsImage(), binding.GetAttachmentAccess(), binding.m_scopeAttachmentUsage);
+                    break;
+                case RHI::AttachmentType::Buffer:
+                    frameGraph.UseAttachment(scopeDesc.GetAsBuffer(), binding.GetAttachmentAccess(), binding.m_scopeAttachmentUsage);
+                    break;
+                default:
+                    AZ_Assert(false, "Error, trying to bind an attachment that is neither an image nor a buffer!");
+                    break;
+                }
+            }
+        }
+
         void Pass::SetupInputsFromTemplate()
         {
             if (m_template)

+ 0 - 27
Gems/Atom/RPI/Code/Source/RPI.Public/Pass/RenderPass.cpp

@@ -230,33 +230,6 @@ namespace AZ
             m_lastDeviceIndex = context.GetDeviceIndex();
         }
 
-        void RenderPass::DeclareAttachmentsToFrameGraph(RHI::FrameGraphInterface frameGraph) const
-        {
-            for (const PassAttachmentBinding& attachmentBinding : m_attachmentBindings)
-            {
-                if (attachmentBinding.GetAttachment() != nullptr &&
-                    frameGraph.GetAttachmentDatabase().IsAttachmentValid(attachmentBinding.GetAttachment()->GetAttachmentId()))
-                {
-                    switch (attachmentBinding.m_unifiedScopeDesc.GetType())
-                    {
-                    case RHI::AttachmentType::Image:
-                    {
-                        frameGraph.UseAttachment(attachmentBinding.m_unifiedScopeDesc.GetAsImage(), attachmentBinding.GetAttachmentAccess(), attachmentBinding.m_scopeAttachmentUsage);
-                        break;
-                    }
-                    case RHI::AttachmentType::Buffer:
-                    {
-                        frameGraph.UseAttachment(attachmentBinding.m_unifiedScopeDesc.GetAsBuffer(), attachmentBinding.GetAttachmentAccess(), attachmentBinding.m_scopeAttachmentUsage);
-                        break;
-                    }
-                    default:
-                        AZ_Assert(false, "Error, trying to bind an attachment that is neither an image nor a buffer!");
-                        break;
-                    }
-                }
-            }
-        }
-
         void RenderPass::DeclarePassDependenciesToFrameGraph(RHI::FrameGraphInterface frameGraph) const
         {
             for (Pass* pass : m_executeAfterPasses)

+ 32 - 0
Gems/Atom/RPI/Code/Source/RPI.Public/RPIUtils.cpp

@@ -682,6 +682,38 @@ namespace AZ
             return RPI::StreamingImage::FindOrCreate(streamingImageAsset);
         }
 
+        // Returns the single-aspect format to use for the given aspect of a combined depth-stencil format.
+        // Formats whose aspect flags are not exactly DepthStencil, and aspects other than Depth or Stencil,
+        // yield the input format unchanged.
+        RHI::Format FindFormatForAspect(RHI::Format format, RHI::ImageAspect imageAspect)
+        {
+            // Only a source containing both aspects needs to be converted
+            if (RHI::GetImageAspectFlags(format) != RHI::ImageAspectFlags::DepthStencil)
+            {
+                return format;
+            }
+
+            if (imageAspect == RHI::ImageAspect::Stencil)
+            {
+                return RHI::Format::R8_UINT;
+            }
+
+            if (imageAspect != RHI::ImageAspect::Depth)
+            {
+                return format;
+            }
+
+            // Depth aspect: pick the format matching the depth part of the combined format
+            switch (format)
+            {
+            case RHI::Format::D32_FLOAT_S8X24_UINT:
+                return RHI::Format::R32_FLOAT;
+            case RHI::Format::D24_UNORM_S8_UINT:
+                return RHI::Format::R32_UINT;
+            case RHI::Format::D16_UNORM_S8_UINT:
+                return RHI::Format::R16_UNORM;
+            default:
+                AZ_Assert(false, "Unknown DepthStencil format. Please update this function");
+                return RHI::Format::R32_FLOAT;
+            }
+        }
+
         //! A helper function for GetComputeShaderNumThreads(), to consolidate error messages, etc.
         static bool GetAttributeArgumentByIndex(
             const Data::Asset<ShaderAsset>& shaderAsset,

+ 3 - 2
Gems/Atom/RPI/Code/Source/RPI.Reflect/Pass/PassData.cpp

@@ -150,9 +150,10 @@ namespace AZ
                     ->Field("ImageSourceOrigin", &CopyPassData::m_imageSourceOrigin)
                     ->Field("ImageDestinationSubresource", &CopyPassData::m_imageDestinationSubresource)
                     ->Field("ImageDestinationOrigin", &CopyPassData::m_imageDestinationOrigin)
+                    ->Field("SourceDeviceIndex", &CopyPassData::m_sourceDeviceIndex)
+                    ->Field("DestinationDeviceIndex", &CopyPassData::m_destinationDeviceIndex)
                     ->Field("CloneInput", &CopyPassData::m_cloneInput)
-                    ->Field("UseCopyQueue", &CopyPassData::m_useCopyQueue)
-                    ;
+                    ->Field("UseCopyQueue", &CopyPassData::m_useCopyQueue);
             }
         }