Selaa lähdekoodia

Merge pull request #1110 from aws-lumberyard-dev/Atom/dmcdiar/ATOM-15517

[ATOM-15517] Software Occlusion Culling
dmcdiarmid-ly 4 vuotta sitten
vanhempi
commit
d19d2aff9d
56 muutettua tiedostoa jossa 5464 lisäystä ja 51 poistoa
  1. 22 0
      Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneTransparentVisualization.material
  2. 22 0
      Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneVisualization.material
  3. 3 0
      Gems/Atom/Feature/Common/Assets/Models/OcclusionCullingPlane.fbx
  4. 43 0
      Gems/Atom/Feature/Common/Code/Include/Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h
  5. 4 0
      Gems/Atom/Feature/Common/Code/Source/CommonSystemComponent.cpp
  6. 113 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.cpp
  7. 65 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.h
  8. 146 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.cpp
  9. 66 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h
  10. 4 0
      Gems/Atom/Feature/Common/Code/atom_feature_common_files.cmake
  11. 1 0
      Gems/Atom/Feature/Common/Code/atom_feature_common_public_files.cmake
  12. 15 2
      Gems/Atom/RPI/Code/CMakeLists.txt
  13. 98 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/CompilerSpecific.inl
  14. 181 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/LICENSE.txt
  15. 456 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp
  16. 592 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.h
  17. 243 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp
  18. 309 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp
  19. 2053 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingCommon.inl
  20. 6 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/PackageInfo.json
  21. 19 5
      Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Culling.h
  22. 12 1
      Gems/Atom/RPI/Code/Include/Atom/RPI.Public/View.h
  23. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Android.h
  24. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Platform.h
  25. 13 0
      Gems/Atom/RPI/Code/Source/Platform/Android/PAL_android.cmake
  26. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Android/platform_android_files.cmake
  27. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Linux.h
  28. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Platform.h
  29. 1 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/PAL_linux.cmake
  30. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/platform_linux_files.cmake
  31. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Mac.h
  32. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Platform.h
  33. 1 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/PAL_mac.cmake
  34. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/platform_mac_files.cmake
  35. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Platform.h
  36. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Windows.h
  37. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/PAL_windows.cmake
  38. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/platform_windows_files.cmake
  39. 14 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_Platform.h
  40. 14 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_iOS.h
  41. 13 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/PAL_ios.cmake
  42. 15 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/platform_ios_files.cmake
  43. 203 41
      Gems/Atom/RPI/Code/Source/RPI.Public/Culling.cpp
  44. 38 1
      Gems/Atom/RPI/Code/Source/RPI.Public/View.cpp
  45. 18 0
      Gems/Atom/RPI/Code/atom_rpi_masked_occlusion_files.cmake
  46. 1 1
      Gems/Atom/RPI/Code/atom_rpi_public_files.cmake
  47. 4 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/Module.cpp
  48. 95 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.cpp
  49. 43 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h
  50. 43 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.cpp
  51. 37 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.h
  52. 22 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h
  53. 143 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.cpp
  54. 81 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h
  55. 2 0
      Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_editor_files.cmake
  56. 4 0
      Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_files.cmake

+ 22 - 0
Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneTransparentVisualization.material

@@ -0,0 +1,22 @@
+{
+    "materialType": "Materials\\Types\\StandardPBR.materialtype",
+    "propertyLayoutVersion": 3,
+    "properties": {
+        "general": {
+            "enableShadows": false,
+            "enableDirectionalLights": false,
+            "enablePunctualLights": false,
+            "enableAreaLights": false,
+            "enableIBL":  true
+        },
+        "baseColor": {
+           "color": [ 0.0, 1.0, 0.0 ]
+        },
+        "opacity": {
+            "alphaSource": "None",
+            "doubleSided": true,
+            "factor": 0.25,
+            "mode": "TintedTransparent"
+        }
+    }
+}

+ 22 - 0
Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneVisualization.material

@@ -0,0 +1,22 @@
+{
+    "materialType": "Materials\\Types\\StandardPBR.materialtype",
+    "propertyLayoutVersion": 3,
+    "properties": {
+        "general": {
+            "enableShadows": false,
+            "enableDirectionalLights": false,
+            "enablePunctualLights": false,
+            "enableAreaLights": false,
+            "enableIBL":  true
+        },
+        "baseColor": {
+           "color": [ 0.0, 1.0, 0.0 ]
+        },
+        "opacity": {
+            "alphaSource": "None",
+            "doubleSided": true,
+            "factor": 1.0,
+            "mode": "TintedTransparent"
+        }
+    }
+}

+ 3 - 0
Gems/Atom/Feature/Common/Assets/Models/OcclusionCullingPlane.fbx

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75cdf73fcb9698a76a38294a1cf927a4fb41a34869e0429e1f02bf8d361a7258
+size 20400

+ 43 - 0
Gems/Atom/Feature/Common/Code/Include/Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h

@@ -0,0 +1,43 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <AzCore/base.h>
+#include <AzCore/Math/Transform.h>
+#include <Atom/RPI.Public/FeatureProcessor.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        class OcclusionCullingPlane;
+
+        using OcclusionCullingPlaneHandle = AZStd::shared_ptr<OcclusionCullingPlane>;
+
+        // OcclusionCullingPlaneFeatureProcessorInterface provides an interface to the feature processor for code outside of Atom
+        class OcclusionCullingPlaneFeatureProcessorInterface
+            : public RPI::FeatureProcessor
+        {
+        public:
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneFeatureProcessorInterface, "{50F6B45E-A622-44EC-B962-DE25FBD44095}");
+
+            // creates a new occlusion culling plane at the given world transform and returns a shared handle to it
+            virtual OcclusionCullingPlaneHandle AddOcclusionCullingPlane(const AZ::Transform& transform) = 0;
+            // removes the plane from the feature processor and resets the caller's handle to nullptr
+            virtual void RemoveOcclusionCullingPlane(OcclusionCullingPlaneHandle& handle) = 0;
+            // returns true if the handle refers to an occlusion culling plane (i.e., it is non-null)
+            virtual bool IsValidOcclusionCullingPlaneHandle(const OcclusionCullingPlaneHandle& occlusionCullingPlane) const = 0;
+            // updates the world transform of an existing occlusion culling plane
+            virtual void SetTransform(const OcclusionCullingPlaneHandle& occlusionCullingPlane, const AZ::Transform& transform) = 0;
+            // enables or disables the plane's participation in occlusion culling
+            virtual void SetEnabled(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool enabled) = 0;
+            // shows or hides the plane's visualization mesh
+            virtual void ShowVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool showVisualization) = 0;
+            // switches between the standard and transparent visualization materials
+            virtual void SetTransparentVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool transparentVisualization) = 0;
+        };
+    } // namespace Render
+} // namespace AZ

+ 4 - 0
Gems/Atom/Feature/Common/Code/Source/CommonSystemComponent.cpp

@@ -103,6 +103,7 @@
 #include <ReflectionScreenSpace/ReflectionScreenSpaceBlurPass.h>
 #include <ReflectionScreenSpace/ReflectionScreenSpaceBlurChildPass.h>
 #include <ReflectionScreenSpace/ReflectionCopyFrameBufferPass.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h>
 
 namespace AZ
 {
@@ -138,6 +139,7 @@ namespace AZ
             DiffuseProbeGridFeatureProcessor::Reflect(context);
             DiffuseGlobalIlluminationFeatureProcessor::Reflect(context);
             RayTracingFeatureProcessor::Reflect(context);
+            OcclusionCullingPlaneFeatureProcessor::Reflect(context);
 
             if (SerializeContext* serialize = azrtti_cast<SerializeContext*>(context))
             {
@@ -195,6 +197,7 @@ namespace AZ
             AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<DiffuseProbeGridFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<DiffuseGlobalIlluminationFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<RayTracingFeatureProcessor>();
+            AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<OcclusionCullingPlaneFeatureProcessor>();
 
             // Add SkyBox pass
             auto* passSystem = RPI::PassSystemInterface::Get();
@@ -301,6 +304,7 @@ namespace AZ
             AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<SkyBoxFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<TransformServiceFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<AuxGeomFeatureProcessor>();
+            AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<OcclusionCullingPlaneFeatureProcessor>();
         }
 
         void CommonSystemComponent::LoadPassTemplateMappings()

+ 113 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.cpp

@@ -0,0 +1,113 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h>
+#include <AzCore/Math/Random.h>
+#include <Atom/RPI.Public/Scene.h>
+#include <Atom/RPI.Reflect/Asset/AssetUtils.h>
+#include <Atom/RPI.Reflect/Material/MaterialAsset.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        static const char* OcclusionCullingPlaneDrawListTag("occlusioncullingplanevisualization");
+
+        //! Disconnects from the asset bus and releases the visualization mesh.
+        OcclusionCullingPlane::~OcclusionCullingPlane()
+        {
+            Data::AssetBus::MultiHandler::BusDisconnect();
+
+            // m_meshFeatureProcessor is only assigned in Init(); guard against
+            // destroying a plane that was never initialized, which would
+            // otherwise dereference a null pointer here
+            if (m_meshFeatureProcessor)
+            {
+                m_meshFeatureProcessor->ReleaseMesh(m_visualizationMeshHandle);
+            }
+        }
+
+        //! Acquires the visualization mesh from the scene's MeshFeatureProcessor and
+        //! kicks off the (asynchronous) load of the visualization material.
+        void OcclusionCullingPlane::Init(RPI::Scene* scene)
+        {
+            AZ_Assert(scene, "OcclusionCullingPlane::Init called with a null Scene pointer");
+
+            m_meshFeatureProcessor = scene->GetFeatureProcessor<Render::MeshFeatureProcessorInterface>();
+
+            // load visualization plane model and material
+            m_visualizationModelAsset = AZ::RPI::AssetUtils::GetAssetByProductPath<AZ::RPI::ModelAsset>(
+                "Models/OcclusionCullingPlane.azmodel",
+                AZ::RPI::AssetUtils::TraceLevel::Assert);
+
+            // the visualization mesh is presentation-only: keep it out of reflection
+            // cubemaps and raytracing, and place it at the identity transform until
+            // SetTransform is called
+            m_visualizationMeshHandle = m_meshFeatureProcessor->AcquireMesh(m_visualizationModelAsset);
+            m_meshFeatureProcessor->SetExcludeFromReflectionCubeMaps(m_visualizationMeshHandle, true);
+            m_meshFeatureProcessor->SetRayTracingEnabled(m_visualizationMeshHandle, false);
+            m_meshFeatureProcessor->SetTransform(m_visualizationMeshHandle, AZ::Transform::CreateIdentity());
+
+            SetVisualizationMaterial();
+        }
+
+        //! Selects the opaque or transparent visualization material based on
+        //! m_transparentVisualization and queues it for loading. The material is
+        //! applied to the mesh in OnAssetReady once the load completes.
+        void OcclusionCullingPlane::SetVisualizationMaterial()
+        {
+            AZStd::string materialAssetPath;
+            if (m_transparentVisualization)
+            {
+                materialAssetPath = "Materials/OcclusionCullingPlane/OcclusionCullingPlaneTransparentVisualization.azmaterial";
+            }
+            else
+            {
+                materialAssetPath = "Materials/OcclusionCullingPlane/OcclusionCullingPlaneVisualization.azmaterial";
+            }
+
+            // connect to the asset bus for this asset Id so OnAssetReady/OnAssetError fire when the load finishes
+            RPI::AssetUtils::TraceLevel traceLevel = AZ::RPI::AssetUtils::TraceLevel::Assert;
+            m_visualizationMaterialAsset = AZ::RPI::AssetUtils::GetAssetByProductPath<AZ::RPI::MaterialAsset>(materialAssetPath.c_str(), traceLevel);
+            m_visualizationMaterialAsset.QueueLoad();
+            Data::AssetBus::MultiHandler::BusConnect(m_visualizationMaterialAsset.GetId());
+        }
+
+        //! AssetBus handler: applies the visualization material to the mesh once it
+        //! has finished loading. Ignores any other asset this handler may be
+        //! connected to (this is a MultiHandler).
+        void OcclusionCullingPlane::OnAssetReady(Data::Asset<Data::AssetData> asset)
+        {
+            if (m_visualizationMaterialAsset.GetId() == asset.GetId())
+            {
+                m_visualizationMaterialAsset = asset;
+                Data::AssetBus::MultiHandler::BusDisconnect(asset.GetId());
+
+                m_visualizationMaterial = AZ::RPI::Material::FindOrCreate(m_visualizationMaterialAsset);
+                m_meshFeatureProcessor->SetMaterialAssignmentMap(m_visualizationMeshHandle, m_visualizationMaterial);
+            }
+        }
+
+        //! AssetBus handler: reports a failed visualization asset load and stops
+        //! listening for that asset. The plane keeps whatever material it had.
+        void OcclusionCullingPlane::OnAssetError(Data::Asset<Data::AssetData> asset)
+        {
+            AZ_Error("OcclusionCullingPlane", false, "Failed to load OcclusionCullingPlane visualization asset %s", asset.ToString<AZStd::string>().c_str());
+            Data::AssetBus::MultiHandler::BusDisconnect(asset.GetId());
+        }
+
+        //! Stores the plane's world transform and keeps the visualization mesh in sync with it.
+        void OcclusionCullingPlane::SetTransform(const AZ::Transform& transform)
+        {
+            m_transform = transform;
+
+            // update visualization plane transform
+            m_meshFeatureProcessor->SetTransform(m_visualizationMeshHandle, transform);
+        }
+
+        //! Shows or hides the visualization mesh.
+        void OcclusionCullingPlane::ShowVisualization(bool showVisualization)
+        {
+            if (m_showVisualization != showVisualization)
+            {
+                // record the new state; without this assignment the change-detection
+                // above compares against a stale value and m_showVisualization
+                // desyncs from the mesh's actual visible state after the first call
+                m_showVisualization = showVisualization;
+
+                m_meshFeatureProcessor->SetVisible(m_visualizationMeshHandle, showVisualization);
+                SetVisualizationMaterial();
+            }
+        }
+
+        //! Switches the visualization between the standard and transparent materials,
+        //! re-triggering the material load only when the state actually changes.
+        void OcclusionCullingPlane::SetTransparentVisualization(bool transparentVisualization)
+        {
+            if (m_transparentVisualization != transparentVisualization)
+            {
+                m_transparentVisualization = transparentVisualization;
+                SetVisualizationMaterial();
+            }
+        }
+
+    } // namespace Render
+} // namespace AZ

+ 65 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.h

@@ -0,0 +1,65 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h>
+#include <Atom/Feature/Mesh/MeshFeatureProcessorInterface.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! This class represents an OcclusionCullingPlane which is used to cull meshes that are inside the view frustum
+        class OcclusionCullingPlane final
+            : public AZ::Data::AssetBus::MultiHandler
+        {
+        public:
+            OcclusionCullingPlane() = default;
+            ~OcclusionCullingPlane();
+
+            //! Acquires the visualization mesh and material from the given scene; must be called before any other method
+            void Init(RPI::Scene* scene);
+
+            //! Sets the plane's world transform and moves the visualization mesh with it
+            void SetTransform(const AZ::Transform& transform);
+            const AZ::Transform& GetTransform() const { return m_transform; }
+
+            //! Enables or disables this plane's participation in occlusion culling
+            void SetEnabled(bool enabled) { m_enabled = enabled; }
+            bool GetEnabled() const { return m_enabled; }
+
+            // enables or disables rendering of the visualization plane
+            void ShowVisualization(bool showVisualization);
+
+            // sets the visualization to transparent mode
+            void SetTransparentVisualization(bool transparentVisualization);
+
+        private:
+
+            // selects and queues the load of the visualization material matching m_transparentVisualization
+            void SetVisualizationMaterial();
+
+            // AZ::Data::AssetBus::Handler overrides...
+            void OnAssetReady(Data::Asset<Data::AssetData> asset) override;
+            void OnAssetError(Data::Asset<Data::AssetData> asset) override;
+
+            AZ::Transform m_transform;                  // world transform of the plane
+            bool m_enabled = true;                      // whether the plane participates in occlusion culling
+            bool m_showVisualization = true;            // whether the visualization mesh is rendered
+            bool m_transparentVisualization = false;    // whether the transparent visualization material is used
+
+            // visualization
+            AZ::Render::MeshFeatureProcessorInterface* m_meshFeatureProcessor = nullptr;   // cached from the scene in Init(); not owned
+            Data::Asset<RPI::ModelAsset> m_visualizationModelAsset;
+            Data::Asset<RPI::MaterialAsset> m_visualizationMaterialAsset;
+            Data::Instance<RPI::Material> m_visualizationMaterial;
+            AZ::Render::MeshFeatureProcessorInterface::MeshHandle m_visualizationMeshHandle;
+        };
+    } // namespace Render
+} // namespace AZ

+ 146 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.cpp

@@ -0,0 +1,146 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h>
+#include <AzCore/std/smart_ptr/make_shared.h>
+#include <AzCore/std/smart_ptr/intrusive_ptr.h>
+#include <Atom/RPI.Public/Scene.h>
+#include <Atom/RPI.Public/Culling.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! Registers this feature processor with the serialization system so it can
+        //! be created by the FeatureProcessorFactory.
+        void OcclusionCullingPlaneFeatureProcessor::Reflect(ReflectContext* context)
+        {
+            if (auto* serializeContext = azrtti_cast<SerializeContext*>(context))
+            {
+                serializeContext
+                    ->Class<OcclusionCullingPlaneFeatureProcessor, FeatureProcessor>()
+                    ->Version(0);
+            }
+        }
+
+        void OcclusionCullingPlaneFeatureProcessor::Activate()
+        {
+            // pre-size both lists to avoid reallocation for typical scene sizes
+            m_occlusionCullingPlanes.reserve(InitialOcclusionCullingPlanesAllocationSize);
+            m_rpiOcclusionPlanes.reserve(InitialOcclusionCullingPlanesAllocationSize);
+
+            // subscribe to scene notifications so OnBeginPrepareRender is called each frame
+            EnableSceneNotification();
+        }
+
+        void OcclusionCullingPlaneFeatureProcessor::Deactivate()
+        {
+            // warn about leaked handles: components are expected to call
+            // RemoveOcclusionCullingPlane before the feature processor shuts down
+            AZ_Warning("OcclusionCullingPlaneFeatureProcessor", m_occlusionCullingPlanes.size() == 0,
+                "Deactivating the OcclusionCullingPlaneFeatureProcessor, but there are still outstanding occlusion planes. Components\n"
+                "using OcclusionCullingPlaneHandles should free them before the OcclusionCullingPlaneFeatureProcessor is deactivated.\n"
+            );
+
+            DisableSceneNotification();
+        }
+
+        //! Rebuilds the RPI occlusion plane list (world-space corners + bounding AABB
+        //! per enabled plane) and hands it to the culling scene. Only runs when a
+        //! plane was added/removed/changed since the last frame.
+        void OcclusionCullingPlaneFeatureProcessor::OnBeginPrepareRender()
+        {
+            if (m_rpiListNeedsUpdate)
+            {
+                // rebuild the RPI occlusion list
+                m_rpiOcclusionPlanes.clear();
+
+                for (auto& occlusionCullingPlane : m_occlusionCullingPlanes)
+                {
+                    if (!occlusionCullingPlane->GetEnabled())
+                    {
+                        continue;
+                    }
+
+                    RPI::CullingScene::OcclusionPlane rpiOcclusionPlane;
+
+                    // unit plane corners in local space (plane lies in the local XZ plane)
+                    static const Vector3 BL = Vector3(-0.5f, 0.0f, -0.5f);
+                    static const Vector3 TL = Vector3(-0.5f, 0.0f,  0.5f);
+                    static const Vector3 TR = Vector3( 0.5f, 0.0f,  0.5f);
+                    static const Vector3 BR = Vector3( 0.5f, 0.0f, -0.5f);
+
+                    const AZ::Transform& transform = occlusionCullingPlane->GetTransform();
+
+                    // convert corners to world space
+                    rpiOcclusionPlane.m_cornerBL = transform.TransformPoint(BL);
+                    rpiOcclusionPlane.m_cornerTL = transform.TransformPoint(TL);
+                    rpiOcclusionPlane.m_cornerTR = transform.TransformPoint(TR);
+                    rpiOcclusionPlane.m_cornerBR = transform.TransformPoint(BR);
+
+                    // build a world space AABB from all four corners: for a plane
+                    // rotated off-axis the min/max of just the BL/TR corners does
+                    // not bound the TL/BR corners
+                    Aabb aabb = Aabb::CreateNull();
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerBL);
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerTL);
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerTR);
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerBR);
+                    rpiOcclusionPlane.m_aabb = aabb;
+
+                    m_rpiOcclusionPlanes.push_back(rpiOcclusionPlane);
+                }
+
+                GetParentScene()->GetCullingScene()->SetOcclusionPlanes(m_rpiOcclusionPlanes);
+
+                m_rpiListNeedsUpdate = false;
+            }
+        }
+
+        //! Creates, initializes, and registers a new occlusion culling plane at the
+        //! given world transform; the returned shared handle keeps the plane alive.
+        OcclusionCullingPlaneHandle OcclusionCullingPlaneFeatureProcessor::AddOcclusionCullingPlane(const AZ::Transform& transform)
+        {
+            AZStd::shared_ptr<OcclusionCullingPlane> occlusionCullingPlane = AZStd::make_shared<OcclusionCullingPlane>();
+            occlusionCullingPlane->Init(GetParentScene());
+            occlusionCullingPlane->SetTransform(transform);
+            m_occlusionCullingPlanes.push_back(occlusionCullingPlane);
+            // flag the RPI occlusion list for rebuild on the next OnBeginPrepareRender
+            m_rpiListNeedsUpdate = true;
+
+            return occlusionCullingPlane;
+        }
+
+        //! Unregisters the plane from the feature processor and resets the caller's
+        //! handle. The plane itself is destroyed once the last handle is released.
+        void OcclusionCullingPlaneFeatureProcessor::RemoveOcclusionCullingPlane(OcclusionCullingPlaneHandle& occlusionCullingPlane)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "RemoveOcclusionCullingPlane called with an invalid handle");
+
+            auto itEntry = AZStd::find_if(m_occlusionCullingPlanes.begin(), m_occlusionCullingPlanes.end(), [&](AZStd::shared_ptr<OcclusionCullingPlane> const& entry)
+            {
+                return (entry == occlusionCullingPlane);
+            });
+
+            AZ_Assert(itEntry != m_occlusionCullingPlanes.end(), "RemoveOcclusionCullingPlane called with an occlusion plane that is not in the occlusion plane list");
+
+            // guard the erase explicitly: AZ_Assert compiles out in release builds,
+            // and erasing end() would be undefined behavior
+            if (itEntry != m_occlusionCullingPlanes.end())
+            {
+                m_occlusionCullingPlanes.erase(itEntry);
+                m_rpiListNeedsUpdate = true;
+            }
+            occlusionCullingPlane = nullptr;
+        }
+
+        //! Moves an existing occlusion culling plane and flags the RPI list for rebuild.
+        void OcclusionCullingPlaneFeatureProcessor::SetTransform(const OcclusionCullingPlaneHandle& occlusionCullingPlane, const AZ::Transform& transform)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "SetTransform called with an invalid handle");
+            occlusionCullingPlane->SetTransform(transform);
+            m_rpiListNeedsUpdate = true;
+        }
+
+        //! Enables or disables a plane's participation in occlusion culling and
+        //! flags the RPI list for rebuild.
+        void OcclusionCullingPlaneFeatureProcessor::SetEnabled(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool enabled)
+        {
+            // message corrected to match the method name, consistent with the sibling methods
+            AZ_Assert(occlusionCullingPlane.get(), "SetEnabled called with an invalid handle");
+            occlusionCullingPlane->SetEnabled(enabled);
+            m_rpiListNeedsUpdate = true;
+        }
+
+        //! Shows or hides the plane's visualization mesh. Does not affect culling,
+        //! so the RPI occlusion list does not need a rebuild.
+        void OcclusionCullingPlaneFeatureProcessor::ShowVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool showVisualization)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "ShowVisualization called with an invalid handle");
+            occlusionCullingPlane->ShowVisualization(showVisualization);
+        }
+
+        //! Switches a plane's visualization material between standard and transparent.
+        //! Visual-only: no RPI occlusion list rebuild is required.
+        void OcclusionCullingPlaneFeatureProcessor::SetTransparentVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool transparentVisualization)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "SetTransparentVisualization called with an invalid handle");
+            occlusionCullingPlane->SetTransparentVisualization(transparentVisualization);
+        }
+    } // namespace Render
+} // namespace AZ

+ 66 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h

@@ -0,0 +1,66 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlane.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! This class manages OcclusionCullingPlanes which are used to cull meshes that are inside the view frustum
+        class OcclusionCullingPlaneFeatureProcessor final
+            : public OcclusionCullingPlaneFeatureProcessorInterface
+        {
+        public:
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneFeatureProcessor, "{C3DE91D7-EF7A-4A82-A55F-E22BC52074EA}", OcclusionCullingPlaneFeatureProcessorInterface);
+
+            static void Reflect(AZ::ReflectContext* context);
+
+            OcclusionCullingPlaneFeatureProcessor() = default;
+            virtual ~OcclusionCullingPlaneFeatureProcessor() = default;
+
+            // OcclusionCullingPlaneFeatureProcessorInterface overrides
+            OcclusionCullingPlaneHandle AddOcclusionCullingPlane(const AZ::Transform& transform) override;
+            void RemoveOcclusionCullingPlane(OcclusionCullingPlaneHandle& handle) override;
+            bool IsValidOcclusionCullingPlaneHandle(const OcclusionCullingPlaneHandle& occlusionCullingPlane) const override { return (occlusionCullingPlane.get() != nullptr); }
+            void SetTransform(const OcclusionCullingPlaneHandle& occlusionCullingPlane, const AZ::Transform& transform) override;
+            void SetEnabled(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool enable) override;
+            void ShowVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool showVisualization) override;
+            void SetTransparentVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool transparentVisualization) override;
+
+            // FeatureProcessor overrides
+            void Activate() override;
+            void Deactivate() override;
+
+            // RPI::SceneNotificationBus overrides ...
+            // rebuilds and publishes the RPI occlusion plane list when m_rpiListNeedsUpdate is set
+            void OnBeginPrepareRender() override;
+
+            // retrieve the full list of occlusion planes
+            using OcclusionCullingPlaneVector = AZStd::vector<AZStd::shared_ptr<OcclusionCullingPlane>>;
+            OcclusionCullingPlaneVector& GetOcclusionCullingPlanes() { return m_occlusionCullingPlanes; }
+
+        private:
+            AZ_DISABLE_COPY_MOVE(OcclusionCullingPlaneFeatureProcessor);
+
+            // list of occlusion planes
+            // initial reserve size for both lists; planes beyond this simply trigger a reallocation
+            const size_t InitialOcclusionCullingPlanesAllocationSize = 64;
+            OcclusionCullingPlaneVector m_occlusionCullingPlanes;
+
+            // prebuilt list of RPI scene occlusion planes
+            RPI::CullingScene::OcclusionPlaneVector m_rpiOcclusionPlanes;
+            // set whenever a plane is added/removed/moved/enabled; consumed by OnBeginPrepareRender
+            bool m_rpiListNeedsUpdate = false;
+        };
+    } // namespace Render
+} // namespace AZ

+ 4 - 0
Gems/Atom/Feature/Common/Code/atom_feature_common_files.cmake

@@ -175,6 +175,10 @@ set(FILES
     Source/MorphTargets/MorphTargetComputePass.h
     Source/MorphTargets/MorphTargetDispatchItem.cpp
     Source/MorphTargets/MorphTargetDispatchItem.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.cpp
+    Source/OcclusionCullingPlane/OcclusionCullingPlane.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlane.cpp
     Source/PostProcess/PostProcessBase.cpp
     Source/PostProcess/PostProcessBase.h
     Source/PostProcess/PostProcessFeatureProcessor.cpp

+ 1 - 0
Gems/Atom/Feature/Common/Code/atom_feature_common_public_files.cmake

@@ -45,6 +45,7 @@ set(FILES
     Include/Atom/Feature/ParamMacros/StartParamFunctionsVirtual.inl
     Include/Atom/Feature/ParamMacros/StartParamMembers.inl
     Include/Atom/Feature/ParamMacros/StartParamSerializeContext.inl
+    Include/Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h
     Include/Atom/Feature/PostProcess/PostProcessFeatureProcessorInterface.h
     Include/Atom/Feature/PostProcess/PostProcessParams.inl
     Include/Atom/Feature/PostProcess/PostProcessSettings.inl

+ 15 - 2
Gems/Atom/RPI/Code/CMakeLists.txt

@@ -9,6 +9,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #
 
+ly_get_list_relative_pal_filename(pal_source_dir ${CMAKE_CURRENT_LIST_DIR}/Source/Platform/${PAL_PLATFORM_NAME})
+
+#for PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED and PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED
+include(${pal_source_dir}/PAL_${PAL_PLATFORM_NAME_LOWERCASE}.cmake)
+
+if(PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED)
+    set(MASKED_OCCLUSION_CULLING_FILES "atom_rpi_masked_occlusion_files.cmake")
+else()
+    set(MASKED_OCCLUSION_CULLING_FILES "")
+endif()
+
 ly_add_target(
     NAME Atom_RPI.Public STATIC
     NAMESPACE Gem
@@ -16,11 +27,15 @@ ly_add_target(
         atom_rpi_reflect_files.cmake
         atom_rpi_public_files.cmake
         ../Assets/atom_rpi_asset_files.cmake
+        ${pal_source_dir}/platform_${PAL_PLATFORM_NAME_LOWERCASE}_files.cmake
+        ${MASKED_OCCLUSION_CULLING_FILES}
     INCLUDE_DIRECTORIES
         PRIVATE
             Source
+            ${pal_source_dir}
         PUBLIC
             Include
+            External
     BUILD_DEPENDENCIES
         PRIVATE
             AZ::AtomCore
@@ -159,8 +174,6 @@ if(PAL_TRAIT_BUILD_HOST_TOOLS)
     ly_get_list_relative_pal_filename(pal_source_dir ${CMAKE_CURRENT_LIST_DIR}/Source/Platform/${PAL_PLATFORM_NAME})
     ly_get_list_relative_pal_filename(common_source_dir ${CMAKE_CURRENT_LIST_DIR}/Source/Platform/Common)
 
-    include(${pal_source_dir}/PAL_${PAL_PLATFORM_NAME_LOWERCASE}.cmake) #for PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED
-
     if(NOT PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED)
 
         # Create a stub

+ 98 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/CompilerSpecific.inl

@@ -0,0 +1,98 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common shared include file to hide compiler/os specific functions from the rest of the code. 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+	#define __MICROSOFT_COMPILER
+#endif
+
+#if defined(_WIN32)	&& (defined(_MSC_VER) || defined(__INTEL_COMPILER) || defined(__clang__)) // Windows: MSVC / Intel compiler / clang
+	#include <intrin.h>
+	#include <new.h>
+
+	#define FORCE_INLINE __forceinline
+
+	FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
+	{
+		unsigned long idx;
+		_BitScanForward(&idx, *mask);
+		*mask &= *mask - 1;
+		return idx;
+	}
+
+	FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
+	{
+		return _aligned_malloc(size, alignment);
+	}
+
+	FORCE_INLINE void aligned_free(void *ptr)
+	{
+		_aligned_free(ptr);
+	}
+
+#elif defined(__GNUG__)	|| defined(__clang__) // G++ or clang
+	#include <cpuid.h>
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__)
+	#include <malloc/malloc.h> // memalign
+#else
+	#include <malloc.h> // memalign
+#endif
+	#include <mm_malloc.h>
+	#include <immintrin.h>
+	#include <new>
+
+	#define FORCE_INLINE inline
+
+	FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
+	{
+		unsigned long idx;
+		idx = __builtin_ctzl(*mask);
+		*mask &= *mask - 1;
+		return idx;
+	}
+
+	FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
+	{
+		return memalign(alignment, size);
+	}
+
+	FORCE_INLINE void aligned_free(void *ptr)
+	{
+		free(ptr);
+	}
+
+	FORCE_INLINE void __cpuidex(int* cpuinfo, int function, int subfunction)
+	{
+		__cpuid_count(function, subfunction, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+	}
+
+	FORCE_INLINE unsigned long long _xgetbv(unsigned int index)
+	{
+		unsigned int eax, edx;
+		__asm__ __volatile__(
+			"xgetbv;"
+			: "=a" (eax), "=d"(edx)
+			: "c" (index)
+		);
+		return ((unsigned long long)edx << 32) | eax;
+	}
+
+#else
+	#error Unsupported compiler
+#endif

+ 181 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/LICENSE.txt

@@ -0,0 +1,181 @@
+ 
+Apache License
+ Version 2.0, January 2004
+
+ http://www.apache.org/licenses/ 
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and 
+distribution as defined by Sections 1 through 9 of this document. 
+
+"Licensor" shall mean the copyright owner or entity authorized by the copyright 
+owner that is granting the License. 
+
+"Legal Entity" shall mean the union of the acting entity and all other entities 
+that control, are controlled by, or are under common control with that entity. 
+For the purposes of this definition, "control" means (i) the power, direct or 
+indirect, to cause the direction or management of such entity, whether by 
+contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 
+outstanding shares, or (iii) beneficial ownership of such entity. 
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising 
+permissions granted by this License. 
+
+"Source" form shall mean the preferred form for making modifications, including 
+but not limited to software source code, documentation source, and configuration 
+files. 
+
+"Object" form shall mean any form resulting from mechanical transformation or 
+translation of a Source form, including but not limited to compiled object code, 
+generated documentation, and conversions to other media types. 
+
+"Work" shall mean the work of authorship, whether in Source or Object form, made 
+available under the License, as indicated by a copyright notice that is included 
+in or attached to the work (an example is provided in the Appendix below). 
+
+"Derivative Works" shall mean any work, whether in Source or Object form, that 
+is based on (or derived from) the Work and for which the editorial revisions, 
+annotations, elaborations, or other modifications represent, as a whole, an 
+original work of authorship. For the purposes of this License, Derivative Works 
+shall not include works that remain separable from, or merely link (or bind by 
+name) to the interfaces of, the Work and Derivative Works thereof. 
+
+"Contribution" shall mean any work of authorship, including the original version 
+of the Work and any modifications or additions to that Work or Derivative Works 
+thereof, that is intentionally submitted to Licensor for inclusion in the Work 
+by the copyright owner or by an individual or Legal Entity authorized to submit 
+on behalf of the copyright owner. For the purposes of this definition, 
+"submitted" means any form of electronic, verbal, or written communication sent 
+to the Licensor or its representatives, including but not limited to 
+communication on electronic mailing lists, source code control systems, and 
+issue tracking systems that are managed by, or on behalf of, the Licensor for 
+the purpose of discussing and improving the Work, but excluding communication 
+that is conspicuously marked or otherwise designated in writing by the copyright 
+owner as "Not a Contribution." 
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on behalf 
+of whom a Contribution has been received by Licensor and subsequently 
+incorporated within the Work. 
+
+2. Grant of Copyright License. Subject to the terms and conditions of this 
+License, each Contributor hereby grants to You a perpetual, worldwide, 
+non-exclusive, no-charge, royalty-free, irrevocable copyright license to 
+reproduce, prepare Derivative Works of, publicly display, publicly perform, 
+sublicense, and distribute the Work and such Derivative Works in Source or 
+Object form. 
+
+3. Grant of Patent License. Subject to the terms and conditions of this License, 
+each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, 
+no-charge, royalty-free, irrevocable (except as stated in this section) patent 
+license to make, have made, use, offer to sell, sell, import, and otherwise 
+transfer the Work, where such license applies only to those patent claims 
+licensable by such Contributor that are necessarily infringed by their 
+Contribution(s) alone or by combination of their Contribution(s) with the Work 
+to which such Contribution(s) was submitted. If You institute patent litigation 
+against any entity (including a cross-claim or counterclaim in a lawsuit) 
+alleging that the Work or a Contribution incorporated within the Work 
+constitutes direct or contributory patent infringement, then any patent licenses 
+granted to You under this License for that Work shall terminate as of the date 
+such litigation is filed. 
+
+4. Redistribution. You may reproduce and distribute copies of the Work or 
+Derivative Works thereof in any medium, with or without modifications, and in 
+Source or Object form, provided that You meet the following conditions: 
+  You must give any other recipients of the Work or Derivative Works a copy of 
+  this License; and 
+
+
+  You must cause any modified files to carry prominent notices stating that You 
+  changed the files; and 
+
+
+  You must retain, in the Source form of any Derivative Works that You 
+  distribute, all copyright, patent, trademark, and attribution notices from the 
+  Source form of the Work, excluding those notices that do not pertain to any 
+  part of the Derivative Works; and 
+
+
+  If the Work includes a "NOTICE" text file as part of its distribution, then 
+  any Derivative Works that You distribute must include a readable copy of the 
+  attribution notices contained within such NOTICE file, excluding those notices 
+  that do not pertain to any part of the Derivative Works, in at least one of 
+  the following places: within a NOTICE text file distributed as part of the 
+  Derivative Works; within the Source form or documentation, if provided along 
+  with the Derivative Works; or, within a display generated by the Derivative 
+  Works, if and wherever such third-party notices normally appear. The contents 
+  of the NOTICE file are for informational purposes only and do not modify the 
+  License. You may add Your own attribution notices within Derivative Works that 
+  You distribute, alongside or as an addendum to the NOTICE text from the Work, 
+  provided that such additional attribution notices cannot be construed as 
+  modifying the License.
+You may add Your own copyright statement to Your modifications and may provide 
+additional or different license terms and conditions for use, reproduction, or 
+distribution of Your modifications, or for any such Derivative Works as a whole, 
+provided Your use, reproduction, and distribution of the Work otherwise complies 
+with the conditions stated in this License. 
+
+5. Submission of Contributions. Unless You explicitly state otherwise, any 
+Contribution intentionally submitted for inclusion in the Work by You to the 
+Licensor shall be under the terms and conditions of this License, without any 
+additional terms or conditions. Notwithstanding the above, nothing herein shall 
+supersede or modify the terms of any separate license agreement you may have 
+executed with Licensor regarding such Contributions. 
+
+6. Trademarks. This License does not grant permission to use the trade names, 
+trademarks, service marks, or product names of the Licensor, except as required 
+for reasonable and customary use in describing the origin of the Work and 
+reproducing the content of the NOTICE file. 
+
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 
+writing, Licensor provides the Work (and each Contributor provides its 
+Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
+KIND, either express or implied, including, without limitation, any warranties 
+or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 
+PARTICULAR PURPOSE. You are solely responsible for determining the 
+appropriateness of using or redistributing the Work and assume any risks 
+associated with Your exercise of permissions under this License. 
+
+8. Limitation of Liability. In no event and under no legal theory, whether in 
+tort (including negligence), contract, or otherwise, unless required by 
+applicable law (such as deliberate and grossly negligent acts) or agreed to in 
+writing, shall any Contributor be liable to You for damages, including any 
+direct, indirect, special, incidental, or consequential damages of any character 
+arising as a result of this License or out of the use or inability to use the 
+Work (including but not limited to damages for loss of goodwill, work stoppage, 
+computer failure or malfunction, or any and all other commercial damages or 
+losses), even if such Contributor has been advised of the possibility of such 
+damages. 
+
+9. Accepting Warranty or Additional Liability. While redistributing the Work or 
+Derivative Works thereof, You may choose to offer, and charge a fee for, 
+acceptance of support, warranty, indemnity, or other liability obligations 
+and/or rights consistent with this License. However, in accepting such 
+obligations, You may act only on Your own behalf and on Your sole 
+responsibility, not on behalf of any other Contributor, and only if You agree to 
+indemnify, defend, and hold each Contributor harmless for any liability incurred 
+by, or claims asserted against, such Contributor by reason of your accepting any 
+such warranty or additional liability. 
+
+END OF TERMS AND CONDITIONS 
+
+APPENDIX: How to apply the Apache License to your work 
+
+To apply the Apache License to your work, attach the following boilerplate 
+notice, with the fields enclosed by brackets "[]" replaced with your own 
+identifying information. (Don't include the brackets!) The text should be 
+enclosed in the appropriate comment syntax for the file format. We also 
+recommend that a file or class name and description of purpose be included on 
+the same "printed page" as the copyright notice for easier identification within 
+third-party archives. 
+
+Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, 
+Version 2.0 (the "License"); you may not use this file except in compliance with 
+the License. You may obtain a copy of the License at 
+http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or 
+agreed to in writing, software distributed under the License is distributed on 
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
+or implied. See the License for the specific language governing permissions and 
+limitations under the License.

+ 456 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp

@@ -0,0 +1,456 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#include <vector>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include "MaskedOcclusionCulling.h"
+#include "CompilerSpecific.inl"
+
+#if MOC_RECORDER_ENABLE
+#include "FrameRecorder.h"
+#endif
+
+#if defined(__AVX__) || defined(__AVX2__)
+	// For performance reasons, the MaskedOcclusionCullingAVX2/512.cpp files should be compiled with VEX encoding for SSE instructions (to avoid 
+	// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, this file
+	// _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to use lowest supported target platform 
+	// (/arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
+	#error The MaskedOcclusionCulling.cpp should be compiled with lowest supported target platform, e.g. /arch:SSE2
+#endif
+
+static MaskedOcclusionCulling::Implementation DetectCPUFeatures(MaskedOcclusionCulling::pfnAlignedAlloc alignedAlloc, MaskedOcclusionCulling::pfnAlignedFree alignedFree)
+{
+	struct CpuInfo { int regs[4]; };
+
+	// Get regular CPUID values
+	int regs[4];
+	__cpuidex(regs, 0, 0);
+
+    //  MOCVectorAllocator<CpuInfo> mocalloc( alignedAlloc, alignedFree );
+    //  std::vector<CpuInfo, MOCVectorAllocator<CpuInfo>> cpuId( mocalloc ), cpuIdEx( mocalloc );
+    //  cpuId.resize( regs[0] );
+    size_t cpuIdCount = regs[0];
+    CpuInfo * cpuId = (CpuInfo*)alignedAlloc( 64, sizeof(CpuInfo) * cpuIdCount );
+    
+	for (size_t i = 0; i < cpuIdCount; ++i)
+		__cpuidex(cpuId[i].regs, (int)i, 0);
+
+	// Get extended CPUID values
+	__cpuidex(regs, 0x80000000, 0);
+
+    //cpuIdEx.resize(regs[0] - 0x80000000);
+    size_t cpuIdExCount = regs[0] - 0x80000000;
+    CpuInfo * cpuIdEx = (CpuInfo*)alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdExCount );
+
+    for (size_t i = 0; i < cpuIdExCount; ++i)
+		__cpuidex(cpuIdEx[i].regs, 0x80000000 + (int)i, 0);
+
+	#define TEST_BITS(A, B)            (((A) & (B)) == (B))
+	#define TEST_FMA_MOVE_OXSAVE       (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 12) | (1 << 22) | (1 << 27)))
+	#define TEST_LZCNT                 (cpuIdExCount >= 1 && TEST_BITS(cpuIdEx[1].regs[2], 0x20))
+	#define TEST_SSE41                 (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 19)))
+	#define TEST_XMM_YMM               (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 2) | (1 << 1)))
+	#define TEST_OPMASK_ZMM            (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 7) | (1 << 6) | (1 << 5)))
+	#define TEST_BMI1_BMI2_AVX2        (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 3) | (1 << 5) | (1 << 8)))
+	#define TEST_AVX512_F_BW_DQ        (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 16) | (1 << 17) | (1 << 30)))
+
+    MaskedOcclusionCulling::Implementation retVal = MaskedOcclusionCulling::SSE2;
+	if (TEST_FMA_MOVE_OXSAVE && TEST_LZCNT && TEST_SSE41)
+	{
+		if (TEST_XMM_YMM && TEST_OPMASK_ZMM && TEST_BMI1_BMI2_AVX2 && TEST_AVX512_F_BW_DQ)
+			retVal = MaskedOcclusionCulling::AVX512;
+		else if (TEST_XMM_YMM && TEST_BMI1_BMI2_AVX2)
+			retVal = MaskedOcclusionCulling::AVX2;
+	} 
+    else if (TEST_SSE41)
+		retVal = MaskedOcclusionCulling::SSE41;
+    alignedFree( cpuId );
+    alignedFree( cpuIdEx );
+    return retVal;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Utility functions (not directly related to the algorithm/rasterizer)
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void MaskedOcclusionCulling::TransformVertices(const float *mtx, const float *inVtx, float *xfVtx, unsigned int nVtx, const VertexLayout &vtxLayout)
+{
+	// This function pretty slow, about 10-20% slower than if the vertices are stored in aligned SOA form.
+	if (nVtx == 0)
+		return;
+
+	// Load matrix and swizzle out the z component. For post-multiplication (OGL), the matrix is assumed to be column 
+	// major, with one column per SSE register. For pre-multiplication (DX), the matrix is assumed to be row major.
+	__m128 mtxCol0 = _mm_loadu_ps(mtx);
+	__m128 mtxCol1 = _mm_loadu_ps(mtx + 4);
+	__m128 mtxCol2 = _mm_loadu_ps(mtx + 8);
+	__m128 mtxCol3 = _mm_loadu_ps(mtx + 12);
+
+	int stride = vtxLayout.mStride;
+	const char *vPtr = (const char *)inVtx;
+	float *outPtr = xfVtx;
+
+	// Iterate through all vertices and transform
+	for (unsigned int vtx = 0; vtx < nVtx; ++vtx)
+	{
+		__m128 xVal = _mm_load1_ps((float*)(vPtr));
+		__m128 yVal = _mm_load1_ps((float*)(vPtr + vtxLayout.mOffsetY));
+		__m128 zVal = _mm_load1_ps((float*)(vPtr + vtxLayout.mOffsetZ));
+
+		__m128 xform = _mm_add_ps(_mm_mul_ps(mtxCol0, xVal), _mm_add_ps(_mm_mul_ps(mtxCol1, yVal), _mm_add_ps(_mm_mul_ps(mtxCol2, zVal), mtxCol3)));
+		_mm_storeu_ps(outPtr, xform);
+		vPtr += stride;
+		outPtr += 4;
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Typedefs
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
+typedef MaskedOcclusionCulling::pfnAlignedFree  pfnAlignedFree;
+typedef MaskedOcclusionCulling::VertexLayout    VertexLayout;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common SSE2/SSE4.1 defines
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_LANES             4
+#define TILE_HEIGHT_SHIFT      2
+
+#define SIMD_LANE_IDX _mm_setr_epi32(0, 1, 2, 3)
+
+#define SIMD_SUB_TILE_COL_OFFSET _mm_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET _mm_setzero_si128()
+#define SIMD_SUB_TILE_COL_OFFSET_F _mm_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET_F _mm_setzero_ps()
+
+#define SIMD_LANE_YCOORD_I _mm_setr_epi32(128, 384, 640, 896)
+#define SIMD_LANE_YCOORD_F _mm_setr_ps(128.0f, 384.0f, 640.0f, 896.0f)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common SSE2/SSE4.1 functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef __m128 __mw;
+typedef __m128i __mwi;
+
+#define _mmw_set1_ps                _mm_set1_ps
+#define _mmw_setzero_ps             _mm_setzero_ps
+#define _mmw_and_ps                 _mm_and_ps
+#define _mmw_or_ps                  _mm_or_ps
+#define _mmw_xor_ps                 _mm_xor_ps
+#define _mmw_not_ps(a)              _mm_xor_ps((a), _mm_castsi128_ps(_mm_set1_epi32(~0)))
+#define _mmw_andnot_ps              _mm_andnot_ps
+#define _mmw_neg_ps(a)              _mm_xor_ps((a), _mm_set1_ps(-0.0f))
+#define _mmw_abs_ps(a)              _mm_and_ps((a), _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)))
+#define _mmw_add_ps                 _mm_add_ps
+#define _mmw_sub_ps                 _mm_sub_ps
+#define _mmw_mul_ps                 _mm_mul_ps
+#define _mmw_div_ps                 _mm_div_ps
+#define _mmw_min_ps                 _mm_min_ps
+#define _mmw_max_ps                 _mm_max_ps
+#define _mmw_movemask_ps            _mm_movemask_ps
+#define _mmw_cmpge_ps(a,b)          _mm_cmpge_ps(a, b)
+#define _mmw_cmpgt_ps(a,b)          _mm_cmpgt_ps(a, b)
+#define _mmw_cmpeq_ps(a,b)          _mm_cmpeq_ps(a, b)
+#define _mmw_fmadd_ps(a,b,c)        _mm_add_ps(_mm_mul_ps(a,b), c)
+#define _mmw_fmsub_ps(a,b,c)        _mm_sub_ps(_mm_mul_ps(a,b), c)
+#define _mmw_shuffle_ps             _mm_shuffle_ps
+#define _mmw_insertf32x4_ps(a,b,c)  (b)
+#define _mmw_cvtepi32_ps            _mm_cvtepi32_ps
+#define _mmw_blendv_epi32(a,b,c)    simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
+
+#define _mmw_set1_epi32             _mm_set1_epi32
+#define _mmw_setzero_epi32          _mm_setzero_si128
+#define _mmw_and_epi32              _mm_and_si128
+#define _mmw_or_epi32               _mm_or_si128
+#define _mmw_xor_epi32              _mm_xor_si128
+#define _mmw_not_epi32(a)           _mm_xor_si128((a), _mm_set1_epi32(~0))
+#define _mmw_andnot_epi32           _mm_andnot_si128
+#define _mmw_neg_epi32(a)           _mm_sub_epi32(_mm_set1_epi32(0), (a))
+#define _mmw_add_epi32              _mm_add_epi32
+#define _mmw_sub_epi32              _mm_sub_epi32
+#define _mmw_subs_epu16             _mm_subs_epu16
+#define _mmw_cmpeq_epi32            _mm_cmpeq_epi32
+#define _mmw_cmpgt_epi32            _mm_cmpgt_epi32
+#define _mmw_srai_epi32             _mm_srai_epi32
+#define _mmw_srli_epi32             _mm_srli_epi32
+#define _mmw_slli_epi32             _mm_slli_epi32
+#define _mmw_cvtps_epi32            _mm_cvtps_epi32
+#define _mmw_cvttps_epi32           _mm_cvttps_epi32
+
+#define _mmx_fmadd_ps               _mmw_fmadd_ps
+#define _mmx_max_epi32              _mmw_max_epi32
+#define _mmx_min_epi32              _mmw_min_epi32
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD casting functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
+template<> FORCE_INLINE __m128  simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A) { return A; }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
+
+#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
+	FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
+		union accessor { simd_type m_native; base_type m_array[elements]; }; \
+		is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
+		return acs->m_array; \
+	}
+
+MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
+MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized SSE input assembly function for general vertex gather 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+FORCE_INLINE void GatherVertices(__m128 *vtxX, __m128 *vtxY, __m128 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
+{
+	for (int lane = 0; lane < numLanes; lane++)
+	{
+		for (int i = 0; i < 3; i++)
+		{
+			char *vPtrX = (char *)inVtx + inTrisPtr[lane * 3 + i] * vtxLayout.mStride;
+			char *vPtrY = vPtrX + vtxLayout.mOffsetY;
+			char *vPtrW = vPtrX + vtxLayout.mOffsetW;
+
+			simd_f32(vtxX[i])[lane] = *((float*)vPtrX);
+			simd_f32(vtxY[i])[lane] = *((float*)vPtrY);
+			simd_f32(vtxW[i])[lane] = *((float*)vPtrW);
+		}
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SSE4.1 version
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace MaskedOcclusionCullingSSE41
+{
+	FORCE_INLINE __m128i _mmw_mullo_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); }
+	FORCE_INLINE __m128i _mmw_min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); }
+	FORCE_INLINE __m128i _mmw_max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); }
+	FORCE_INLINE __m128i _mmw_abs_epi32(const __m128i &a) { return _mm_abs_epi32(a); }
+	FORCE_INLINE __m128 _mmw_blendv_ps(const __m128 &a, const __m128 &b, const __m128 &c) { return _mm_blendv_ps(a, b, c); }
+	FORCE_INLINE int _mmw_testz_epi32(const __m128i &a, const __m128i &b) { return _mm_testz_si128(a, b); }
+	FORCE_INLINE __m128 _mmx_dp4_ps(const __m128 &a, const __m128 &b) { return _mm_dp_ps(a, b, 0xFF); }
+	FORCE_INLINE __m128 _mmw_floor_ps(const __m128 &a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+	FORCE_INLINE __m128 _mmw_ceil_ps(const __m128 &a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);	}
+	FORCE_INLINE __m128i _mmw_transpose_epi8(const __m128i &a)
+	{
+		const __m128i shuff = _mm_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
+		return _mm_shuffle_epi8(a, shuff);
+	}
+	FORCE_INLINE __m128i _mmw_sllv_ones(const __m128i &ishift)
+	{
+		__m128i shift = _mm_min_epi32(ishift, _mm_set1_epi32(32));
+
+		// Uses lookup tables and _mm_shuffle_epi8 to perform _mm_sllv_epi32(~0, shift)
+		const __m128i byteShiftLUT = _mm_setr_epi8((char)0xFF, (char)0xFE, (char)0xFC, (char)0xF8, (char)0xF0, (char)0xE0, (char)0xC0, (char)0x80, 0, 0, 0, 0, 0, 0, 0, 0);
+		const __m128i byteShiftOffset = _mm_setr_epi8(0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24);
+		const __m128i byteShiftShuffle = _mm_setr_epi8(0x0, 0x0, 0x0, 0x0, 0x4, 0x4, 0x4, 0x4, 0x8, 0x8, 0x8, 0x8, 0xC, 0xC, 0xC, 0xC);
+
+		__m128i byteShift = _mm_shuffle_epi8(shift, byteShiftShuffle);
+		byteShift = _mm_min_epi8(_mm_subs_epu8(byteShift, byteShiftOffset), _mm_set1_epi8(8));
+		__m128i retMask = _mm_shuffle_epi8(byteShiftLUT, byteShift);
+
+		return retMask;
+	}
+
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE41;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SSE2 version
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace MaskedOcclusionCullingSSE2
+{
+	FORCE_INLINE __m128i _mmw_mullo_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		// Do products for even / odd lanes & merge the result
+		__m128i even = _mm_and_si128(_mm_mul_epu32(a, b), _mm_setr_epi32(~0, 0, ~0, 0));
+		__m128i odd = _mm_slli_epi64(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 32);
+		return _mm_or_si128(even, odd);
+	}
+	FORCE_INLINE __m128i _mmw_min_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		__m128i cond = _mm_cmpgt_epi32(a, b);
+		return _mm_or_si128(_mm_andnot_si128(cond, a), _mm_and_si128(cond, b));
+	}
+	FORCE_INLINE __m128i _mmw_max_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		__m128i cond = _mm_cmpgt_epi32(b, a);
+		return _mm_or_si128(_mm_andnot_si128(cond, a), _mm_and_si128(cond, b));
+	}
+	FORCE_INLINE __m128i _mmw_abs_epi32(const __m128i &a)
+	{
+		__m128i mask = _mm_cmplt_epi32(a, _mm_setzero_si128());
+		return _mm_add_epi32(_mm_xor_si128(a, mask), _mm_srli_epi32(mask, 31));
+	}
+	FORCE_INLINE int _mmw_testz_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		return _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(a, b), _mm_setzero_si128())) == 0xFFFF;
+	}
+	FORCE_INLINE __m128 _mmw_blendv_ps(const __m128 &a, const __m128 &b, const __m128 &c)
+	{	
+		__m128 cond = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(c), 31));
+		return _mm_or_ps(_mm_andnot_ps(cond, a), _mm_and_ps(cond, b));
+	}
+	FORCE_INLINE __m128 _mmx_dp4_ps(const __m128 &a, const __m128 &b)
+	{ 
+		// Product and two shuffle/adds pairs (similar to hadd_ps)
+		__m128 prod = _mm_mul_ps(a, b);
+		__m128 dp = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)));
+		dp = _mm_add_ps(dp, _mm_shuffle_ps(dp, dp, _MM_SHUFFLE(0, 1, 2, 3)));
+		return dp;
+	}
+	FORCE_INLINE __m128 _mmw_floor_ps(const __m128 &a)
+	{ 
+		int originalMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
+		__m128 rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+		_MM_SET_ROUNDING_MODE(originalMode);
+		return rounded;
+	}
+	FORCE_INLINE __m128 _mmw_ceil_ps(const __m128 &a)
+	{ 
+		int originalMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
+		__m128 rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+		_MM_SET_ROUNDING_MODE(originalMode);
+		return rounded;
+	}
+	FORCE_INLINE __m128i _mmw_transpose_epi8(const __m128i &a)
+	{
+		// Perform transpose through two 16->8 bit pack and byte shifts
+		__m128i res = a;
+		const __m128i mask = _mm_setr_epi8(~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0);
+		res = _mm_packus_epi16(_mm_and_si128(res, mask), _mm_srli_epi16(res, 8));
+		res = _mm_packus_epi16(_mm_and_si128(res, mask), _mm_srli_epi16(res, 8));
+		return res;
+	}
+	FORCE_INLINE __m128i _mmw_sllv_ones(const __m128i &ishift)
+	{
+		__m128i shift = _mmw_min_epi32(ishift, _mm_set1_epi32(32));
+		
+		// Uses scalar approach to perform _mm_sllv_epi32(~0, shift)
+		static const unsigned int maskLUT[33] = {
+			~0U << 0, ~0U << 1, ~0U << 2 ,  ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6 , ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10 , ~0U << 11, ~0U << 12, ~0U << 13, ~0U << 14 , ~0U << 15,
+			~0U << 16, ~0U << 17, ~0U << 18 , ~0U << 19, ~0U << 20, ~0U << 21, ~0U << 22 , ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26 , ~0U << 27, ~0U << 28, ~0U << 29, ~0U << 30 , ~0U << 31,
+			0U };
+
+		__m128i retMask;
+		simd_i32(retMask)[0] = (int)maskLUT[simd_i32(shift)[0]];
+		simd_i32(retMask)[1] = (int)maskLUT[simd_i32(shift)[1]];
+		simd_i32(retMask)[2] = (int)maskLUT[simd_i32(shift)[2]];
+		simd_i32(retMask)[3] = (int)maskLUT[simd_i32(shift)[3]];
+		return retMask;
+	}
+
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE2;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Object construction and allocation
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+namespace MaskedOcclusionCullingAVX512
+{
+	extern MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
+}
+
+namespace MaskedOcclusionCullingAVX2
+{
+	extern MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
+}
+
+MaskedOcclusionCulling *MaskedOcclusionCulling::Create(Implementation RequestedSIMD)
+{
+	return Create(RequestedSIMD, aligned_alloc, aligned_free);
+}
+
+MaskedOcclusionCulling *MaskedOcclusionCulling::Create(Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+{
+	MaskedOcclusionCulling *object = nullptr;
+
+	MaskedOcclusionCulling::Implementation impl = DetectCPUFeatures(alignedAlloc, alignedFree);
+
+	if (RequestedSIMD < impl)
+		impl = RequestedSIMD;
+
+	// Return best supported version
+	if (object == nullptr && impl >= AVX512)
+		object = MaskedOcclusionCullingAVX512::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use AVX512 version
+	if (object == nullptr && impl >= AVX2)
+		object = MaskedOcclusionCullingAVX2::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use AVX2 version
+	if (object == nullptr && impl >= SSE41)
+		object = MaskedOcclusionCullingSSE41::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use SSE4.1 version
+	if (object == nullptr)
+		object = MaskedOcclusionCullingSSE2::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use SSE2 (slow) version
+
+	return object;
+}
+
+void MaskedOcclusionCulling::Destroy(MaskedOcclusionCulling *moc)
+{
+	pfnAlignedFree alignedFreeCallback = moc->mAlignedFreeCallback;
+	moc->~MaskedOcclusionCulling();
+	alignedFreeCallback(moc);
+}

+ 592 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.h

@@ -0,0 +1,592 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+/*!
+ *  \file MaskedOcclusionCulling.h
+ *  \brief Masked Occlusion Culling
+ * 
+ *  General information
+ *   - Input to all API functions are (x,y,w) clip-space coordinates (x positive left, y positive up, w positive away from camera).
+ *     We entirely skip the z component and instead compute it as 1 / w, see next bullet. For TestRect the input is NDC (x/w, y/w).
+ *   - We use a simple z = 1 / w transform, which is a bit faster than OGL/DX depth transforms. Thus, depth is REVERSED and z = 0 at
+ *     the far plane and z = inf at w = 0. We also have to use a GREATER depth function, which explains why all the conservative
+ *     tests will be reversed compared to what you might be used to (for example zMaxTri >= zMinBuffer is a visibility test)
+ *   - We support different layouts for vertex data (basic AoS and SoA), but note that it's beneficial to store the position data
+ *     as tightly in memory as possible to reduce cache misses. Big strides are bad, so it's beneficial to keep position as a separate
+ *     stream (rather than bundled with attributes) or to keep a copy of the position data for the occlusion culling system.
+ *   - The resolution width must be a multiple of 8 and height a multiple of 4.
+ *   - The hierarchical Z buffer is stored OpenGL-style with the y axis pointing up. This includes the scissor box.
+ *   - This code is only tested with Visual Studio 2015, but should hopefully be easy to port to other compilers.
+ */
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Defines used to configure the implementation
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QUICK_MASK
+/*!
+ * Configure the algorithm used for updating and merging hierarchical z buffer entries. If QUICK_MASK
+ * is defined to 1, use the algorithm from the paper "Masked Software Occlusion Culling", which has good
+ * balance between performance and low leakage. If QUICK_MASK is defined to 0, use the algorithm from
+ * "Masked Depth Culling for Graphics Hardware" which has less leakage, but also lower performance.
+ */
+#define QUICK_MASK                      1
+
+#endif
+
+#ifndef USE_D3D
+/*!
+ * Configures the library for use with Direct3D (default) or OpenGL rendering. This changes whether the 
+ * screen space Y axis points downwards (D3D) or upwards (OGL), and is primarily important in combination 
+ * with the PRECISE_COVERAGE define, where this is important to ensure correct rounding and tie-breaker
+ * behaviour. It also affects the ScissorRect screen space coordinates.
+ */
+#define USE_D3D                         1
+
+#endif
+
+#ifndef PRECISE_COVERAGE
+/*!
+ * Define PRECISE_COVERAGE to 1 to more closely match GPU rasterization rules. The increased precision comes
+ * at a cost of slightly lower performance.
+ */
+#define PRECISE_COVERAGE                1
+
+#endif
+
+#ifndef USE_AVX512
+/*!
+ * Define USE_AVX512 to 1 to enable experimental AVX-512 support. It's currently mostly untested and only
+ * validated on simple examples using Intel SDE. Older compilers may not support AVX-512 intrinsics.
+ */
+#define USE_AVX512                      0
+
+#endif
+
+#ifndef CLIPPING_PRESERVES_ORDER
+/*!
+ * Define CLIPPING_PRESERVES_ORDER to 1 to prevent clipping from reordering triangle rasterization
+ * order; This comes at a cost (approx 3-4%) but removes one source of temporal frame-to-frame instability.
+ */
+#define CLIPPING_PRESERVES_ORDER        1
+
+#endif
+
+#ifndef ENABLE_STATS
+/*!
+ * Define ENABLE_STATS to 1 to gather various statistics during occlusion culling. Can be used for profiling 
+ * and debugging. Note that enabling this function will reduce performance significantly.
+ */
+#define ENABLE_STATS                    0
+
+#endif
+
+#ifndef MOC_RECORDER_ENABLE
+/*!
+ * Define MOC_RECORDER_ENABLE to 1 to enable frame recorder (see FrameRecorder.h/cpp for details)
+ */
+#define MOC_RECORDER_ENABLE		        0
+
+#endif
+
+#if MOC_RECORDER_ENABLE
+#ifndef MOC_RECORDER_ENABLE_PLAYBACK
+/*!
+ * Define MOC_RECORDER_ENABLE_PLAYBACK to 1 to enable compilation of the playback code (not needed 
+   for recording)
+ */
+#define MOC_RECORDER_ENABLE_PLAYBACK    0
+#endif
+#endif
+
+
+#if MOC_RECORDER_ENABLE
+
+#include <mutex>
+
+class FrameRecorder;
+
+#endif // #if MOC_RECORDER_ENABLE
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Masked occlusion culling class
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class MaskedOcclusionCulling 
+{
+public:
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Memory management callback functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	typedef void *(*pfnAlignedAlloc)(size_t alignment, size_t size);
+	typedef void  (*pfnAlignedFree) (void *ptr);
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Enums
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	enum Implementation 
+	{
+		SSE2   = 0,
+		SSE41  = 1,
+		AVX2   = 2,
+		AVX512 = 3
+	};
+
+	enum BackfaceWinding
+	{
+		BACKFACE_NONE = 0,
+		BACKFACE_CW   = 1,
+		BACKFACE_CCW  = 2,
+	};
+
+	enum CullingResult
+	{
+		VISIBLE     = 0x0,
+		OCCLUDED    = 0x1,
+		VIEW_CULLED = 0x3
+	};
+
+	enum ClipPlanes
+	{
+		CLIP_PLANE_NONE   = 0x00,
+		CLIP_PLANE_NEAR   = 0x01,
+		CLIP_PLANE_LEFT   = 0x02,
+		CLIP_PLANE_RIGHT  = 0x04,
+		CLIP_PLANE_BOTTOM = 0x08,
+		CLIP_PLANE_TOP    = 0x10,
+		CLIP_PLANE_SIDES  = (CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP),
+		CLIP_PLANE_ALL    = (CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP | CLIP_PLANE_NEAR)
+	};
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Structs
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	/*!
+	 * Used to specify custom vertex layout. Memory offsets to y and w coordinates are set through 
+	 * mOffsetY and mOffsetW, and vertex stride is given by mStride. It's possible to configure both 
+	 * AoS and SoA layouts. Note that large strides may cause more cache misses and decrease 
+	 * performance. It is advisable to store position data as compactly in memory as possible.
+	 */
+	struct VertexLayout
+	{
+		VertexLayout() {}
+		VertexLayout(int stride, int offsetY, int offsetZW) :
+			mStride(stride), mOffsetY(offsetY), mOffsetW(offsetZW) {}
+
+		int mStride;      //!< byte stride between vertices
+		int mOffsetY;     //!< byte offset from X to Y coordinate
+		union {
+			int mOffsetZ; //!< byte offset from X to Z coordinate
+			int mOffsetW; //!< byte offset from X to W coordinate
+		};
+	};
+
+	/*!
+	 * Used to control scissoring during rasterization. Note that we only provide coarse scissor support. 
+	 * The scissor box x coordinates must be a multiple of 32, and the y coordinates a multiple of 8. 
+	 * Scissoring is mainly meant as a means of enabling binning (sort middle) rasterizers in case
+	 * application developers want to use that approach for multithreading.
+	 */
+	struct ScissorRect
+	{
+		ScissorRect() {}
+		ScissorRect(int minX, int minY, int maxX, int maxY) :
+			mMinX(minX), mMinY(minY), mMaxX(maxX), mMaxY(maxY) {}
+
+		int mMinX; //!< Screen space X coordinate for left side of scissor rect, inclusive and must be a multiple of 32
+		int mMinY; //!< Screen space Y coordinate for bottom side of scissor rect, inclusive and must be a multiple of 8
+		int mMaxX; //!< Screen space X coordinate for right side of scissor rect, <B>non</B> inclusive and must be a multiple of 32
+		int mMaxY; //!< Screen space Y coordinate for top side of scissor rect, <B>non</B> inclusive and must be a multiple of 8
+	};
+
+	/*!
+	 * Used to specify storage area for a binlist, containing triangles. This struct is used for binning 
+	 * and multithreading. The host application is responsible for allocating memory for the binlists.
+	 */
+	struct TriList
+	{
+		unsigned int mNumTriangles; //!< Maximum number of triangles that may be stored in mPtr
+		unsigned int mTriIdx;       //!< Index of next triangle to be written, clear before calling BinTriangles to start from the beginning of the list
+		float		 *mPtr;         //!< Scratchpad buffer allocated by the host application
+	};
+
+	/*!
+	 * Statistics that can be gathered during occluder rendering and visibility to aid debugging 
+	 * and profiling. Must be enabled by changing the ENABLE_STATS define.
+	 */
+	struct OcclusionCullingStatistics
+	{
+		struct
+		{
+			long long mNumProcessedTriangles;  //!< Number of occluder triangles processed in total
+			long long mNumRasterizedTriangles; //!< Number of occluder triangles passing view frustum and backface culling
+			long long mNumTilesTraversed;      //!< Number of tiles traversed by the rasterizer
+			long long mNumTilesUpdated;        //!< Number of tiles where the hierarchical z buffer was updated
+			long long mNumTilesMerged;         //!< Number of tiles where the hierarchical z buffer was merged
+		} mOccluders;
+
+		struct
+		{
+			long long mNumProcessedRectangles; //!< Number of rects processed (TestRect())
+			long long mNumProcessedTriangles;  //!< Number of occludee triangles processed (TestTriangles())
+			long long mNumRasterizedTriangles; //!< Number of occludee triangles passing view frustum and backface culling
+			long long mNumTilesTraversed;      //!< Number of tiles traversed by triangle & rect rasterizers
+		} mOccludees;
+	};
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	/*!
+	 * \brief Creates a new object with default state, no z buffer attached/allocated.
+	 */
+	static MaskedOcclusionCulling *Create(Implementation RequestedSIMD = AVX512);
+	
+	/*!
+	 * \brief Creates a new object with default state, no z buffer attached/allocated.
+	 * \param alignedAlloc Pointer to a callback function used when allocating memory
+	 * \param alignedFree Pointer to a callback function used when freeing memory
+	 */
+	static MaskedOcclusionCulling *Create(Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
+
+	/*!
+	 * \brief Destroys an object and frees the z buffer memory. Note that you cannot 
+	 * use the delete operator, and should rather use this function to free up memory.
+	 */
+	static void Destroy(MaskedOcclusionCulling *moc);
+
+	/*!
+	 * \brief Sets the resolution of the hierarchical depth buffer. This function will
+	 *        re-allocate the current depth buffer (if present). The contents of the
+	 *        buffer is undefined until ClearBuffer() is called.
+	 *
+	 * \param width The width of the buffer in pixels, must be a multiple of 8
+	 * \param height The height of the buffer in pixels, must be a multiple of 4
+	 */
+	virtual void SetResolution(unsigned int width, unsigned int height) = 0;
+
+	/*!
+	* \brief Gets the resolution of the hierarchical depth buffer. 
+	*
+	* \param width Output: The width of the buffer in pixels
+	* \param height Output: The height of the buffer in pixels
+	*/
+	virtual void GetResolution(unsigned int &width, unsigned int &height) const = 0;
+
+	/*!
+	 * \brief Returns the tile size for the current implementation.
+	 *
+	 * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param outBinWidth Output: The width of the single bin in pixels (except for the 
+	 *        rightmost bin width, which is extended to resolution width)
+	 * \param outBinHeight Output: The height of the single bin in pixels (except for the 
+	 *        bottommost bin height, which is extended to resolution height)
+	 */
+	virtual void ComputeBinWidthHeight(unsigned int nBinsW, unsigned int nBinsH, unsigned int & outBinWidth, unsigned int & outBinHeight) = 0;
+
+	/*!
+	 * \brief Sets the distance for the near clipping plane. Default is nearDist = 0.
+	 *
+	 * \param nearDist The distance to the near clipping plane, given as clip space w
+	 */
+	virtual void SetNearClipPlane(float nearDist) = 0;
+
+	/*!
+	* \brief Gets the distance for the near clipping plane. 
+	*/
+	virtual float GetNearClipPlane() const = 0;
+
+	/*!
+	 * \brief Clears the hierarchical depth buffer.
+	 */
+	virtual void ClearBuffer() = 0;
+
+	/*!
+	* \brief Merge a second hierarchical depth buffer into the main buffer.
+	*/
+	virtual void MergeBuffer(MaskedOcclusionCulling* BufferB) = 0;
+
+	/*! 
+	 * \brief Renders a mesh of occluder triangles and updates the hierarchical z buffer
+	 *        with conservative depth values.
+	 *
+	 * This function is optimized for vertex layouts with stride 16 and y and w
+	 * offsets of 4 and 12 bytes, respectively.
+	 *
+	 * \param inVtx Pointer to an array of input vertices, should point to the x component
+	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+	 *        in clip space. The memory layout can be changed using vtxLayout.
+	 * \param inTris Pointer to an array of vertex indices. Each triangle is created 
+	 *        from three indices consecutively fetched from the array.
+	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+	 *        entries)
+	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
+	 *        performing projection. If nullptr is passed the transform step will be skipped
+	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+	 *        and will not be rasterized. You may use BACKFACE_NONE to disable culling for
+	 *        double sided geometry
+	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+	 *        triangle clipper. Can be used as an optimization if your application can 
+	 *        determine (for example during culling) that a group of triangles does not 
+	 *        intersect a certain frustum plane. However, setting an incorrect mask may 
+	 *        cause out of bounds memory accesses.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible.
+	 * \return Will return VIEW_CULLED if all triangles are either outside the frustum or
+	 *         backface culled, returns VISIBLE otherwise.
+	 */
+	virtual CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)) = 0;
+
+	/*!
+	 * \brief Occlusion query for a rectangle with a given depth. The rectangle is given 
+	 *        in normalized device coordinates where (x,y) coordinates between [-1,1] map 
+	 *        to the visible screen area. The query uses a GREATER_EQUAL (reversed) depth 
+	 *        test meaning that depth values equal to the contents of the depth buffer are
+	 *        counted as visible.
+	 *
+	 * \param xmin NDC coordinate of the left side of the rectangle.
+	 * \param ymin NDC coordinate of the bottom side of the rectangle.
+	 * \param xmax NDC coordinate of the right side of the rectangle.
+	 * \param ymax NDC coordinate of the top side of the rectangle.
+	 * \param wmin Clip space W coordinate for the rectangle.
+	 * \return The query will return VISIBLE if the rectangle may be visible, OCCLUDED
+	 *         if the rectangle is occluded by a previously rendered  object, or VIEW_CULLED
+	 *         if the rectangle is outside the view frustum.
+	 */
+	virtual CullingResult TestRect(float xmin, float ymin, float xmax, float ymax, float wmin) const = 0;
+
+	/*!
+	 * \brief This function is similar to RenderTriangles(), but performs an occlusion
+	 *        query instead and does not update the hierarchical z buffer. The query uses 
+	 *        a GREATER_EQUAL (reversed) depth test meaning that depth values equal to the 
+	 *        contents of the depth buffer are counted as visible.
+	 *
+	 * This function is optimized for vertex layouts with stride 16 and y and w
+	 * offsets of 4 and 12 bytes, respectively.
+	 *
+	 * \param inVtx Pointer to an array of input vertices, should point to the x component
+	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+	 *        in clip space. The memory layout can be changed using vtxLayout.
+	 * \param inTris Pointer to an array of triangle indices. Each triangle is created 
+	 *        from three indices consecutively fetched from the array.
+	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+	 *        entries)
+	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
+	 *        performing projection. If nullptr is passed the transform step will be skipped
+	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+	 *        and will not be occlusion tested. You may use BACKFACE_NONE to disable culling
+	 *        for double sided geometry
+	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+	 *        triangle clipper. Can be used as an optimization if your application can
+	 *        determine (for example during culling) that a group of triangles does not
+	 *        intersect a certain frustum plane. However, setting an incorrect mask may
+	 *        cause out of bounds memory accesses.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible.
+	 * \return The query will return VISIBLE if the triangle mesh may be visible, OCCLUDED
+	 *         if the mesh is occluded by a previously rendered object, or VIEW_CULLED if all
+	 *         triangles are entirely outside the view frustum or backface culled.
+	 */
+	virtual CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)) = 0;
+
+	/*!
+	 * \brief Perform input assembly, clipping, projection, triangle setup, and write
+	 *        triangles to the screen space bins they overlap. This function can be used to
+	 *        distribute work for threading (See the CullingThreadpool class for an example)
+	 *
+	 * \param inVtx Pointer to an array of input vertices, should point to the x component
+	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+	 *        in clip space. The memory layout can be changed using vtxLayout.
+	 * \param inTris Pointer to an array of vertex indices. Each triangle is created
+	 *        from three indices consecutively fetched from the array.
+	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+	 *        entries)
+	 * \param triLists Pointer to an array of TriList objects with one TriList object per
+	 *        bin. If a triangle overlaps a bin, it will be written to the corresponding
+	 *        trilist. Note that this method appends the triangles to the current list, to
+	 *        start writing from the beginning of the list, set triList.mTriIdx = 0
+	 * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
+	 *        performing projection. If nullptr is passed the transform step will be skipped
+	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+	 *        triangle clipper. Can be used as an optimization if your application can
+	 *        determine (for example during culling) that a group of triangles does not
+	 *        intersect a certain frustum plane. However, setting an incorrect mask may
+	 *        cause out of bounds memory accesses.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible.
+	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+	 *        and will not be binned / rasterized. You may use BACKFACE_NONE to disable culling
+	 *        for double sided geometry
+	 */
+	virtual void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)) = 0;
+
+	/*!
+	 * \brief Renders all occluder triangles in a trilist. This function can be used in
+	 *        combination with BinTriangles() to create a threaded (binning) rasterizer. The
+	 *        bins can be processed independently by different threads without risking writing
+	 *        to overlapping memory regions.
+	 *
+	 * \param triLists A triangle list, filled using the BinTriangles() function that is to
+	 *        be rendered.
+	 * \param scissor A scissor box limiting the rendering region to the bin. The size of each
+	 *        bin must be a multiple of 32x8 pixels due to implementation constraints. For a
+	 *        render target with (width, height) resolution and (nBinsW, nBinsH) bins, the
+	 *        size of a bin is:
+	 *          binWidth = (width / nBinsW) - (width / nBinsW) % 32;
+	 *          binHeight = (height / nBinsH) - (height / nBinsH) % 8;
+	 *        The last row and column of tiles have a different size:
+	 *          lastColBinWidth = width - (nBinsW-1)*binWidth;
+	 *          lastRowBinHeight = height - (nBinsH-1)*binHeight;
+	 */
+	virtual void RenderTrilist(const TriList &triList, const ScissorRect *scissor) = 0;
+
+	/*!
+	 * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation.
+	 *        Intended for visualizing the hierarchical depth buffer for debugging. The 
+	 *        buffer is written in scanline order, from the top to bottom (D3D) or bottom to 
+	 *        top (OGL) of the surface. See the USE_D3D define.
+	 *
+	 * \param depthData Pointer to memory where the per-pixel depth data is written. Must
+	 *        hold storage for at least width*height elements as set by setResolution.
+	 */
+	virtual void ComputePixelDepthBuffer(float *depthData, bool flipY) = 0;
+	
+	/*!
+	 * \brief Fetch occlusion culling statistics, returns zeroes if ENABLE_STATS define is
+	 *        not defined. The statistics can be used for profiling or debugging.
+	 */
+	virtual OcclusionCullingStatistics GetStatistics() = 0;
+
+	/*!
+	 * \brief Returns the implementation (CPU instruction set) version of this object.
+	 */
+	virtual Implementation GetImplementation() = 0;
+
+	/*!
+	 * \brief Utility function for transforming vertices and outputting them to an (x,y,z,w)
+	 *        format suitable for the occluder rasterization and occludee testing functions.
+	 *
+	 * \param mtx Pointer to matrix data. The matrix should be column major for post 
+	 *        multiplication (OGL) and row major for pre-multiplication (DX). This is 
+	 *        consistent with OpenGL / DirectX behavior.
+	 * \param inVtx Pointer to an array of input vertices. The input vertices are given as
+	 *        (x,y,z) coordinates. The memory layout can be changed using vtxLayout.
+	 * \param xfVtx Pointer to an array to store transformed vertices. The transformed
+	 *        vertices are always stored as array of structs (AoS) (x,y,z,w) packed in memory.
+	 * \param nVtx Number of vertices to transform.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible. Note that for this function, the
+	 *        w-component is assumed to be 1.0.
+	 */
+	static void TransformVertices(const float *mtx, const float *inVtx, float *xfVtx, unsigned int nVtx, const VertexLayout &vtxLayout = VertexLayout(12, 4, 8));
+
+	/*!
+	 * \brief Get used memory alloc/free callbacks.
+     */
+    void GetAllocFreeCallback( pfnAlignedAlloc & allocCallback, pfnAlignedFree & freeCallback ) { allocCallback = mAlignedAllocCallback, freeCallback = mAlignedFreeCallback; }
+
+#if MOC_RECORDER_ENABLE
+    /*!
+	 * \brief Start recording subsequent rasterization and testing calls using the FrameRecorder.
+     *        The function calls that are recorded are:
+     *         - ClearBuffer
+	 *         - RenderTriangles
+     *         - TestTriangles
+     *         - TestRect
+     *        All inputs and outputs are recorded, which can be used for correctness validation
+     *        and performance testing.
+     *
+	 * \param outputFilePath Pointer to name of the output file. 
+	 * \return 'true' if recording was started successfully, 'false' otherwise (file access error).
+	 */
+    bool RecorderStart( const char * outputFilePath ) const;
+
+    /*!
+	 * \brief Stop recording, flush output and release used memory.
+	 */
+    void RecorderStop( ) const;
+
+    /*!
+	 * \brief Manually record triangles. This is called automatically from MaskedOcclusionCulling::RenderTriangles 
+     *  if the recording is started, but not from BinTriangles/RenderTrilist (used in multithreaded codepath), in
+     *  which case it has to be called manually.
+     *
+     * \param inVtx Pointer to an array of input vertices, should point to the x component
+     *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+     *        in clip space. The memory layout can be changed using vtxLayout.
+     * \param inTris Pointer to an array of triangle indices. Each triangle is created
+     *        from three indices consecutively fetched from the array.
+     * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+     *        entries)
+     * \param modelToClipMatrix all vertices will be transformed by this matrix before
+     *        performing projection. If nullptr is passed the transform step will be skipped
+     * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+     *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+     *        and will not be occlusion tested. You may use BACKFACE_NONE to disable culling
+     *        for double sided geometry
+     * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+     *        triangle clipper. Can be used as an optimization if your application can
+     *        determine (for example during culling) that a group of triangles does not
+     *        intersect a certain frustum plane. However, setting an incorrect mask may
+     *        cause out of bounds memory accesses.
+     * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
+     *        description). For best performance, it is advisable to store position data
+     *        as compactly in memory as possible.
+     * \param cullingResult cull result value expected to be returned by executing the
+     *        RenderTriangles call with recorded parameters.
+	 */
+    // Note: when using the binned/multithreaded codepath, the binned data must be merged back
+    // into the original layout; in that case, call this manually from your threadpool
+    // implementation (already added to CullingThreadpool).
+    // If recording is not enabled, calling this function will do nothing.
+    void RecordRenderTriangles( const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix = nullptr, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, BackfaceWinding bfWinding = BACKFACE_CW, const VertexLayout &vtxLayout = VertexLayout( 16, 4, 12 ), CullingResult cullingResult = (CullingResult)-1 );
+#endif // #if MOC_RECORDER_ENABLE
+
+protected:
+	pfnAlignedAlloc mAlignedAllocCallback;
+	pfnAlignedFree  mAlignedFreeCallback;
+
+	mutable OcclusionCullingStatistics mStats;
+
+#if MOC_RECORDER_ENABLE
+    mutable FrameRecorder * mRecorder;
+    mutable std::mutex mRecorderMutex;
+#endif // #if MOC_RECORDER_ENABLE
+
+	virtual ~MaskedOcclusionCulling() {}
+};

+ 243 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp

@@ -0,0 +1,243 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include "MaskedOcclusionCulling.h"
+#include "CompilerSpecific.inl"
+
+#if MOC_RECORDER_ENABLE
+#include "FrameRecorder.h"
+#endif
+
+#if defined(__MICROSOFT_COMPILER) && _MSC_VER < 1900
+	// If you remove/comment this error, the code will compile & use the SSE41 version instead. 
+	#error Versions older than Visual Studio 2015 are not supported due to compiler bug(s)
+#endif
+
+#if !defined(__MICROSOFT_COMPILER) || _MSC_VER >= 1900
+
+// For performance reasons, the MaskedOcclusionCullingAVX2.cpp file should be compiled with VEX encoding for SSE instructions (to avoid 
+// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, the SSE
+// version in MaskedOcclusionCulling.cpp _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to 
+// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
+#ifndef __AVX2__
+	#error For best performance, MaskedOcclusionCullingAVX2.cpp should be compiled with /arch:AVX2
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific defines and constants
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_LANES             8
+#define TILE_HEIGHT_SHIFT      3
+
+#define SIMD_LANE_IDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7)
+
+#define SIMD_SUB_TILE_COL_OFFSET _mm256_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET _mm256_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
+#define SIMD_SUB_TILE_COL_OFFSET_F _mm256_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET_F _mm256_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
+
+#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF)
+
+#define SIMD_LANE_YCOORD_I _mm256_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920)
+#define SIMD_LANE_YCOORD_F _mm256_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific typedefs and functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef __m256 __mw;
+typedef __m256i __mwi;
+
+#define _mmw_set1_ps                _mm256_set1_ps
+#define _mmw_setzero_ps             _mm256_setzero_ps
+#define _mmw_and_ps                 _mm256_and_ps
+#define _mmw_or_ps                  _mm256_or_ps
+#define _mmw_xor_ps                 _mm256_xor_ps
+#define _mmw_not_ps(a)              _mm256_xor_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(~0)))
+#define _mmw_andnot_ps              _mm256_andnot_ps
+#define _mmw_neg_ps(a)              _mm256_xor_ps((a), _mm256_set1_ps(-0.0f))
+#define _mmw_abs_ps(a)              _mm256_and_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)))
+#define _mmw_add_ps                 _mm256_add_ps
+#define _mmw_sub_ps                 _mm256_sub_ps
+#define _mmw_mul_ps                 _mm256_mul_ps
+#define _mmw_div_ps                 _mm256_div_ps
+#define _mmw_min_ps                 _mm256_min_ps
+#define _mmw_max_ps                 _mm256_max_ps
+#define _mmw_fmadd_ps               _mm256_fmadd_ps
+#define _mmw_fmsub_ps               _mm256_fmsub_ps
+#define _mmw_movemask_ps            _mm256_movemask_ps
+#define _mmw_blendv_ps              _mm256_blendv_ps
+#define _mmw_cmpge_ps(a,b)          _mm256_cmp_ps(a, b, _CMP_GE_OQ)
+#define _mmw_cmpgt_ps(a,b)          _mm256_cmp_ps(a, b, _CMP_GT_OQ)
+#define _mmw_cmpeq_ps(a,b)          _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
+#define _mmw_floor_ps(x)            _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
+#define _mmw_ceil_ps(x)             _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
+#define _mmw_shuffle_ps             _mm256_shuffle_ps
+#define _mmw_insertf32x4_ps         _mm256_insertf128_ps
+#define _mmw_cvtepi32_ps            _mm256_cvtepi32_ps
+#define _mmw_blendv_epi32(a,b,c)    simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
+
+#define _mmw_set1_epi32             _mm256_set1_epi32
+#define _mmw_setzero_epi32          _mm256_setzero_si256
+#define _mmw_and_epi32              _mm256_and_si256
+#define _mmw_or_epi32               _mm256_or_si256
+#define _mmw_xor_epi32              _mm256_xor_si256
+#define _mmw_not_epi32(a)           _mm256_xor_si256((a), _mm256_set1_epi32(~0))
+#define _mmw_andnot_epi32           _mm256_andnot_si256
+#define _mmw_neg_epi32(a)           _mm256_sub_epi32(_mm256_set1_epi32(0), (a))
+#define _mmw_add_epi32              _mm256_add_epi32
+#define _mmw_sub_epi32              _mm256_sub_epi32
+#define _mmw_min_epi32              _mm256_min_epi32
+#define _mmw_max_epi32              _mm256_max_epi32
+#define _mmw_subs_epu16             _mm256_subs_epu16
+#define _mmw_mullo_epi32            _mm256_mullo_epi32
+#define _mmw_cmpeq_epi32            _mm256_cmpeq_epi32
+#define _mmw_testz_epi32            _mm256_testz_si256
+#define _mmw_cmpgt_epi32            _mm256_cmpgt_epi32
+#define _mmw_srai_epi32             _mm256_srai_epi32
+#define _mmw_srli_epi32             _mm256_srli_epi32
+#define _mmw_slli_epi32             _mm256_slli_epi32
+#define _mmw_sllv_ones(x)           _mm256_sllv_epi32(SIMD_BITS_ONE, x)
+#define _mmw_transpose_epi8(x)      _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
+#define _mmw_abs_epi32              _mm256_abs_epi32
+#define _mmw_cvtps_epi32            _mm256_cvtps_epi32
+#define _mmw_cvttps_epi32           _mm256_cvttps_epi32
+
+#define _mmx_dp4_ps(a, b)           _mm_dp_ps(a, b, 0xFF)
+#define _mmx_fmadd_ps               _mm_fmadd_ps
+#define _mmx_max_epi32              _mm_max_epi32
+#define _mmx_min_epi32              _mm_min_epi32
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD casting functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// simd_cast<T>(x): uniform helper to reinterpret between same-width SIMD register types
+// (bit cast, zero cost) or splat a scalar into all lanes. Used by the width-generic
+// _mmw_* macros above so the common .inl code can be SIMD-width agnostic.
+template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
+template<> FORCE_INLINE __m128  simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A) { return A; }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(float A) { return _mm256_set1_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256i A) { return _mm256_castsi256_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256 A) { return A; }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(int A) { return _mm256_set1_epi32(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256 A) { return _mm256_castps_si256(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256i A) { return A; }
+
+// Generates simd_f32()/simd_i32() accessors that expose the individual lanes of a SIMD
+// register as a plain scalar array. A union is used for the reinterpretation to avoid
+// strict-aliasing violations from a direct pointer cast.
+#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
+	FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
+		union accessor { simd_type m_native; base_type m_array[elements]; }; \
+		is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
+		return acs->m_array; \
+	}
+
+MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
+MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
+
+MAKE_ACCESSOR(simd_f32, __m256, float, , 8)
+MAKE_ACCESSOR(simd_f32, __m256, float, const, 8)
+MAKE_ACCESSOR(simd_i32, __m256, int, , 8)
+MAKE_ACCESSOR(simd_i32, __m256i, int, const, 8)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized AVX input assembly function for general vertex gather 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
+
+// Gathers the three vertices of up to 8 triangles (one triangle per AVX2 lane) from an
+// indexed triangle list into SoA registers. Only (x, y, w) are fetched; the input memory
+// layout (stride in bytes, y/w byte offsets) is described by vtxLayout.
+FORCE_INLINE void GatherVertices(__m256 *vtxX, __m256 *vtxY, __m256 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
+{
+	assert(numLanes >= 1);
+
+	// Lane i reads the index triple starting at inTrisPtr[3*i].
+	const __m256i SIMD_TRI_IDX_OFFSET = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
+	// SIMD_LANE_MASK[n] has the low n lanes set to all-ones, the rest zero.
+	static const __m256i SIMD_LANE_MASK[9] = {
+		_mm256_setr_epi32( 0,  0,  0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0,  0,  0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0,  0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0)
+	};
+
+	// Compute per-lane index list offset that guards against out of bounds memory accesses:
+	// lanes beyond numLanes get offset 0 and harmlessly re-read the first triangle.
+	__m256i safeTriIdxOffset = _mm256_and_si256(SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes]);
+
+	// Fetch triangle indices and scale them by the vertex stride to get byte offsets
+	// (the gathers below use scale 1, so vtxIdx holds byte offsets — implies mStride is in bytes).
+	__m256i vtxIdx[3];
+	vtxIdx[0] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 0, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[1] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 1, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[2] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 2, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
+
+	char *vPtr = (char *)inVtx;
+
+	// Fetch triangle vertices: x at the base offset, y and w at their layout byte offsets.
+	for (int i = 0; i < 3; i++)
+	{
+		vtxX[i] = _mm256_i32gather_ps((float *)vPtr, vtxIdx[i], 1);
+		vtxY[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetY), vtxIdx[i], 1);
+		vtxW[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetW), vtxIdx[i], 1);
+	}
+}
+
+// AVX2 implementation namespace. The SIMD-width-independent algorithm is textually
+// included below (MaskedOcclusionCullingCommon.inl) and compiles against the 8-lane
+// __mw/__mwi typedefs and _mmw_* macros defined above.
+namespace MaskedOcclusionCullingAVX2
+{
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX2;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	// Allocates 64-byte-aligned storage via the caller-supplied allocator and constructs
+	// the implementation in place. NOTE(review): placement new means the object must be
+	// destroyed and freed through the matching free callback, not plain delete — confirm
+	// the library's Destroy path does this.
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+#else
+
+// Fallback namespace, compiled when the toolchain cannot build the AVX2 path (see the
+// #if guard above). Returning nullptr signals that this instruction set is unavailable;
+// presumably the dispatcher then falls back to SSE — confirm callers handle nullptr.
+// NOTE(review): parameters are intentionally unused here and may trigger
+// unused-parameter warnings.
+namespace MaskedOcclusionCullingAVX2
+{
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		return nullptr;
+	}
+};
+
+#endif

+ 309 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp

@@ -0,0 +1,309 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include "MaskedOcclusionCulling.h"
+#include "CompilerSpecific.inl"
+
+#if MOC_RECORDER_ENABLE
+#include "FrameRecorder.h"
+#endif
+
+// Make sure compiler supports AVX-512 intrinsics: Visual Studio 2017 (Update 3) || Intel C++ Compiler 16.0 || Clang 4.0 || GCC 5.0
+#if USE_AVX512 != 0 && ((defined(_MSC_VER) && _MSC_VER >= 1911) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1600) || (defined(__clang__) && __clang_major__ >= 4) || (defined(__GNUC__) && __GNUC__ >= 5))
+
+// The MaskedOcclusionCullingAVX512.cpp file should be compiled with avx2/avx512 architecture options turned on in the compiler. However, the SSE
+// version in MaskedOcclusionCulling.cpp _must_ be compiled with SSE2 architecture to allow backwards compatibility. Best practice is to 
+// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
+#ifndef __AVX2__
+	#error For best performance, MaskedOcclusionCullingAVX512.cpp should be compiled with /arch:AVX2
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific defines and constants
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_LANES             16
+#define TILE_HEIGHT_SHIFT      4
+
+#define SIMD_LANE_IDX _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+
+#define SIMD_SUB_TILE_COL_OFFSET _mm512_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET _mm512_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3)
+#define SIMD_SUB_TILE_COL_OFFSET_F _mm512_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET_F _mm512_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3)
+
+#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm512_set_epi32(0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400)
+
+#define SIMD_LANE_YCOORD_I _mm512_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920, 2176, 2432, 2688, 2944, 3200, 3456, 3712, 3968)
+#define SIMD_LANE_YCOORD_F _mm512_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f, 2176.0f, 2432.0f, 2688.0f, 2944.0f, 3200.0f, 3456.0f, 3712.0f, 3968.0f)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific typedefs and functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef __m512 __mw;
+typedef __m512i __mwi;
+
+#define _mmw_set1_ps                _mm512_set1_ps
+#define _mmw_setzero_ps             _mm512_setzero_ps
+#define _mmw_and_ps                 _mm512_and_ps
+#define _mmw_or_ps                  _mm512_or_ps
+#define _mmw_xor_ps                 _mm512_xor_ps
+#define _mmw_not_ps(a)              _mm512_xor_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(~0)))
+#define _mmw_andnot_ps              _mm512_andnot_ps
+#define _mmw_neg_ps(a)              _mm512_xor_ps((a), _mm512_set1_ps(-0.0f))
+#define _mmw_abs_ps(a)              _mm512_and_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)))
+#define _mmw_add_ps                 _mm512_add_ps
+#define _mmw_sub_ps                 _mm512_sub_ps
+#define _mmw_mul_ps                 _mm512_mul_ps
+#define _mmw_div_ps                 _mm512_div_ps
+#define _mmw_min_ps                 _mm512_min_ps
+#define _mmw_max_ps                 _mm512_max_ps
+#define _mmw_fmadd_ps               _mm512_fmadd_ps
+#define _mmw_fmsub_ps               _mm512_fmsub_ps
+#define _mmw_shuffle_ps             _mm512_shuffle_ps
+#define _mmw_insertf32x4_ps         _mm512_insertf32x4
+#define _mmw_cvtepi32_ps            _mm512_cvtepi32_ps
+#define _mmw_blendv_epi32(a,b,c)    simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
+
+#define _mmw_set1_epi32             _mm512_set1_epi32
+#define _mmw_setzero_epi32          _mm512_setzero_si512
+#define _mmw_and_epi32              _mm512_and_si512
+#define _mmw_or_epi32               _mm512_or_si512
+#define _mmw_xor_epi32              _mm512_xor_si512
+#define _mmw_not_epi32(a)           _mm512_xor_si512((a), _mm512_set1_epi32(~0))
+#define _mmw_andnot_epi32           _mm512_andnot_si512
+#define _mmw_neg_epi32(a)           _mm512_sub_epi32(_mm512_set1_epi32(0), (a))
+#define _mmw_add_epi32              _mm512_add_epi32
+#define _mmw_sub_epi32              _mm512_sub_epi32
+#define _mmw_min_epi32              _mm512_min_epi32
+#define _mmw_max_epi32              _mm512_max_epi32
+#define _mmw_subs_epu16             _mm512_subs_epu16
+#define _mmw_mullo_epi32            _mm512_mullo_epi32
+#define _mmw_srai_epi32             _mm512_srai_epi32
+#define _mmw_srli_epi32             _mm512_srli_epi32
+#define _mmw_slli_epi32             _mm512_slli_epi32
+#define _mmw_sllv_ones(x)           _mm512_sllv_epi32(SIMD_BITS_ONE, x)
+#define _mmw_transpose_epi8(x)      _mm512_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
+#define _mmw_abs_epi32              _mm512_abs_epi32
+#define _mmw_cvtps_epi32            _mm512_cvtps_epi32
+#define _mmw_cvttps_epi32           _mm512_cvttps_epi32
+
+#define _mmx_dp4_ps(a, b)           _mm_dp_ps(a, b, 0xFF)
+#define _mmx_fmadd_ps               _mm_fmadd_ps
+#define _mmx_max_epi32              _mm_max_epi32
+#define _mmx_min_epi32              _mm_min_epi32
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD casting functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// simd_cast<T>(x): uniform helper to reinterpret between same-width SIMD register types
+// (bit cast, zero cost) or splat a scalar into all lanes. The AVX-512 build provides
+// 128/256/512-bit variants so the shared .inl code can remain width agnostic.
+template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
+template<> FORCE_INLINE __m128  simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A) { return A; }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(float A) { return _mm256_set1_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256i A) { return _mm256_castsi256_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256 A) { return A; }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(int A) { return _mm256_set1_epi32(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256 A) { return _mm256_castps_si256(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256i A) { return A; }
+template<> FORCE_INLINE __m512  simd_cast<__m512>(float A) { return _mm512_set1_ps(A); }
+template<> FORCE_INLINE __m512  simd_cast<__m512>(__m512i A) { return _mm512_castsi512_ps(A); }
+template<> FORCE_INLINE __m512  simd_cast<__m512>(__m512 A) { return A; }
+template<> FORCE_INLINE __m512i simd_cast<__m512i>(int A) { return _mm512_set1_epi32(A); }
+template<> FORCE_INLINE __m512i simd_cast<__m512i>(__m512 A) { return _mm512_castps_si512(A); }
+template<> FORCE_INLINE __m512i simd_cast<__m512i>(__m512i A) { return A; }
+
+// Generates simd_f32()/simd_i32() accessors that expose the individual lanes of a SIMD
+// register as a plain scalar array. A union is used for the reinterpretation to avoid
+// strict-aliasing violations from a direct pointer cast.
+#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
+	FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
+		union accessor { simd_type m_native; base_type m_array[elements]; }; \
+		is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
+		return acs->m_array; \
+	}
+
+MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
+MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
+
+MAKE_ACCESSOR(simd_f32, __m256, float, , 8)
+MAKE_ACCESSOR(simd_f32, __m256, float, const, 8)
+MAKE_ACCESSOR(simd_i32, __m256i, int, , 8)
+MAKE_ACCESSOR(simd_i32, __m256i, int, const, 8)
+
+MAKE_ACCESSOR(simd_f32, __m512, float, , 16)
+MAKE_ACCESSOR(simd_f32, __m512, float, const, 16)
+MAKE_ACCESSOR(simd_i32, __m512i, int, , 16)
+MAKE_ACCESSOR(simd_i32, __m512i, int, const, 16)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized AVX input assembly function for general vertex gather 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
+
+// Gathers the three vertices of up to 16 triangles (one triangle per AVX-512 lane) from
+// an indexed triangle list into SoA registers. Only (x, y, w) are fetched; the input
+// memory layout (stride in bytes, y/w byte offsets) is described by vtxLayout.
+// Note: AVX-512 gather intrinsics take (index, base) — the reverse argument order of the
+// AVX2 gathers in MaskedOcclusionCullingAVX2.cpp.
+FORCE_INLINE void GatherVertices(__m512 *vtxX, __m512 *vtxY, __m512 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
+{
+	assert(numLanes >= 1);
+
+	// Lane i reads the index triple starting at inTrisPtr[3*i].
+	const __m512i SIMD_TRI_IDX_OFFSET = _mm512_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45);
+	// SIMD_LANE_MASK[n] has the low n lanes set to all-ones, the rest zero.
+	static const __m512i SIMD_LANE_MASK[17] = {
+		_mm512_setr_epi32( 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0)
+	};
+
+	// Compute per-lane index list offset that guards against out of bounds memory accesses:
+	// lanes beyond numLanes get offset 0 and harmlessly re-read the first triangle.
+	__m512i safeTriIdxOffset = _mm512_and_si512(SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes]);
+
+	// Fetch triangle indices and scale them by the vertex stride to get byte offsets
+	// (the gathers below use scale 1, so vtxIdx holds byte offsets — implies mStride is in bytes).
+	__m512i vtxIdx[3];
+	vtxIdx[0] = _mmw_mullo_epi32(_mm512_i32gather_epi32(safeTriIdxOffset, (const int*)inTrisPtr + 0, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[1] = _mmw_mullo_epi32(_mm512_i32gather_epi32(safeTriIdxOffset, (const int*)inTrisPtr + 1, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[2] = _mmw_mullo_epi32(_mm512_i32gather_epi32(safeTriIdxOffset, (const int*)inTrisPtr + 2, 4), _mmw_set1_epi32(vtxLayout.mStride));
+
+	char *vPtr = (char *)inVtx;
+
+	// Fetch triangle vertices: x at the base offset, y and w at their layout byte offsets.
+	for (int i = 0; i < 3; i++)
+	{
+		vtxX[i] = _mm512_i32gather_ps(vtxIdx[i], (float *)vPtr, 1);
+		vtxY[i] = _mm512_i32gather_ps(vtxIdx[i], (float *)(vPtr + vtxLayout.mOffsetY), 1);
+		vtxW[i] = _mm512_i32gather_ps(vtxIdx[i], (float *)(vPtr + vtxLayout.mOffsetW), 1);
+	}
+}
+
+namespace MaskedOcclusionCullingAVX512
+{
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Poorly implemented functions. TODO: fix common (maskedOcclusionCullingCommon.inl) code to improve perf
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	FORCE_INLINE __m512 _mmw_floor_ps(__m512 x)
+	{
+		return _mm512_roundscale_ps(x, 1); // 1 = floor
+	}
+
+	FORCE_INLINE __m512 _mmw_ceil_ps(__m512 x)
+	{
+		return _mm512_roundscale_ps(x, 2); // 2 = ceil
+	}
+
+	FORCE_INLINE __m512i _mmw_cmpeq_epi32(__m512i a, __m512i b)
+	{
+		__mmask16 mask = _mm512_cmpeq_epi32_mask(a, b);
+		return _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0));
+	}
+
+	FORCE_INLINE __m512i _mmw_cmpgt_epi32(__m512i a, __m512i b)
+	{
+		__mmask16 mask = _mm512_cmpgt_epi32_mask(a, b);
+		return _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0));
+	}
+
+	FORCE_INLINE bool _mmw_testz_epi32(__m512i a, __m512i b)
+	{
+		__mmask16 mask = _mm512_cmpeq_epi32_mask(_mm512_and_si512(a, b), _mm512_set1_epi32(0));
+		return mask == 0xFFFF;
+	}
+
+	FORCE_INLINE __m512 _mmw_cmpge_ps(__m512 a, __m512 b)
+	{
+		__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ);
+		return _mm512_castsi512_ps(_mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)));
+	}
+
+	FORCE_INLINE __m512 _mmw_cmpgt_ps(__m512 a, __m512 b)
+	{
+		__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ);
+		return _mm512_castsi512_ps(_mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)));
+	}
+
+	FORCE_INLINE __m512 _mmw_cmpeq_ps(__m512 a, __m512 b)
+	{
+		__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+		return _mm512_castsi512_ps(_mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)));
+	}
+
+	FORCE_INLINE __mmask16 _mmw_movemask_ps(const __m512 &a)
+	{
+		__mmask16 mask = _mm512_cmp_epi32_mask(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x80000000)), _mm512_set1_epi32(0), 4);	// a & 0x80000000 != 0
+		return mask;
+	}
+
+	FORCE_INLINE __m512 _mmw_blendv_ps(const __m512 &a, const __m512 &b, const __m512 &c)
+	{
+		__mmask16 mask = _mmw_movemask_ps(c);
+		return _mm512_mask_mov_ps(a, mask, b);
+	} 
+
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX512;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+#else
+
+namespace MaskedOcclusionCullingAVX512
+{
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		return nullptr;
+	}
+};
+
+#endif

+ 2053 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingCommon.inl

@@ -0,0 +1,2053 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common SIMD math utility functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T> FORCE_INLINE T max(const T &a, const T &b) { return a > b ? a : b; }
+template<typename T> FORCE_INLINE T min(const T &a, const T &b) { return a < b ? a : b; }
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common defines and constants
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_ALL_LANES_MASK    ((1 << SIMD_LANES) - 1)
+
+// Tile dimensions are 32xN pixels. These values are not tweakable and the code must also be modified
+// to support different tile sizes as it is tightly coupled with the SSE/AVX register size
+#define TILE_WIDTH_SHIFT       5
+#define TILE_WIDTH             (1 << TILE_WIDTH_SHIFT)
+#define TILE_HEIGHT            (1 << TILE_HEIGHT_SHIFT)
+
+// Sub-tiles (used for updating the masked HiZ buffer) are 8x4 tiles, so there are 4x2 sub-tiles in a tile
+#define SUB_TILE_WIDTH          8
+#define SUB_TILE_HEIGHT         4
+
+// The number of fixed point bits used to represent vertex coordinates / edge slopes.
+#if PRECISE_COVERAGE != 0
+	#define FP_BITS             8
+	#define FP_HALF_PIXEL       (1 << (FP_BITS - 1))
+	#define FP_INV              (1.0f / (float)(1 << FP_BITS))
+#else
+	// Note that too low precision, without precise coverage, may cause overshoots / false coverage during rasterization.
+	// This is configured for 14 bits for AVX512 and 16 bits for SSE. Max tile slope delta is roughly 
+	// (screenWidth + 2*(GUARD_BAND_PIXEL_SIZE + 1)) * (2^FP_BITS * (TILE_HEIGHT + GUARD_BAND_PIXEL_SIZE + 1))  
+	// and must fit in 31 bits. With this config, max image resolution (width) is ~3272, so stay well clear of this limit. 
+	#define FP_BITS             (19 - TILE_HEIGHT_SHIFT)
+#endif
+
+// Tile dimensions in fixed point coordinates
+#define FP_TILE_HEIGHT_SHIFT    (FP_BITS + TILE_HEIGHT_SHIFT)
+#define FP_TILE_HEIGHT          (1 << FP_TILE_HEIGHT_SHIFT)
+
+// Maximum number of triangles that may be generated during clipping. We process SIMD_LANES triangles at a time and
+// clip against 5 planes, so the max should be 5*8 = 40 (we immediately draw the first clipped triangle).
+// This number must be a power of two.
+#define MAX_CLIPPED             (8*SIMD_LANES)
+#define MAX_CLIPPED_WRAP        (MAX_CLIPPED - 1)
+
+// Size of guard band in pixels. Clipping doesn't seem to be very expensive so we use a small guard band
+// to improve rasterization performance. It's not recommended to set the guard band to zero, as this may
+// cause leakage along the screen border due to precision/rounding.
+#define GUARD_BAND_PIXEL_SIZE   1.0f
+
+// We classify triangles as big if the bounding box is wider than this given threshold and use a tighter
+// but slightly more expensive traversal algorithm. This improves performance greatly for sliver triangles
+#define BIG_TRIANGLE            3
+
+// Only gather statistics if enabled.
+#if ENABLE_STATS != 0
+	#define STATS_ADD(var, val)     _InterlockedExchangeAdd64( &var, val )
+#else
+	#define STATS_ADD(var, val)
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD common defines (constant values)
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_BITS_ONE       _mmw_set1_epi32(~0)
+#define SIMD_BITS_ZERO      _mmw_setzero_epi32()
+#define SIMD_TILE_WIDTH     _mmw_set1_epi32(TILE_WIDTH)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Vertex fetch utility function, need to be in global namespace due to template specialization
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<int N> FORCE_INLINE void VtxFetch4(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes)
+{
+	// Fetch 4 vectors (matching 1 sse part of the SIMD register), and continue to the next
+	const int ssePart = (SIMD_LANES / 4) - N;
+	for (int k = 0; k < 4; k++)
+	{
+		int lane = 4 * ssePart + k;
+		if (numLanes > lane)
+			v[k] = _mmw_insertf32x4_ps(v[k], _mm_loadu_ps(&inVtx[inTrisPtr[lane * 3 + triVtx] << 2]), ssePart);
+	}
+	VtxFetch4<N - 1>(v, inTrisPtr, triVtx, inVtx, numLanes);
+}
+
+template<> FORCE_INLINE void VtxFetch4<0>(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes) 
+{
+	// Workaround for unused parameter warning
+	(void)v; (void)inTrisPtr; (void)triVtx; (void)inVtx; (void)numLanes;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Private class containing the implementation
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class MaskedOcclusionCullingPrivate : public MaskedOcclusionCulling
+{
+public:
+	struct ZTile
+	{
+		__mw        mZMin[2];
+		__mwi       mMask;
+	};
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Member variables
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	__mw            mHalfWidth;
+	__mw            mHalfHeight;
+	__mw            mCenterX;
+	__mw            mCenterY;
+	__m128          mCSFrustumPlanes[5];
+	__m128          mIHalfSize;
+	__m128          mICenter;
+	__m128i         mIScreenSize;
+
+	float           mNearDist;
+	int             mWidth;
+	int             mHeight;
+	int             mTilesWidth;
+	int             mTilesHeight;
+
+	ZTile           *mMaskedHiZBuffer;
+	ScissorRect     mFullscreenScissor;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Constructors and state handling
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCullingPrivate(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree) : mFullscreenScissor(0, 0, 0, 0)
+	{
+		mMaskedHiZBuffer = nullptr;
+		mAlignedAllocCallback = alignedAlloc;
+		mAlignedFreeCallback = alignedFree;
+#if MOC_RECORDER_ENABLE
+        mRecorder = nullptr;
+#endif
+
+		SetNearClipPlane(0.0f);
+		mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[1] = _mm_setr_ps(1.0f, 0.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f, 0.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[3] = _mm_setr_ps(0.0f, 1.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[4] = _mm_setr_ps(0.0f, -1.0f, 1.0f, 0.0f);
+
+		memset(&mStats, 0, sizeof(OcclusionCullingStatistics));
+
+		SetResolution(0, 0);
+	}
+
+	~MaskedOcclusionCullingPrivate() override
+	{
+		if (mMaskedHiZBuffer != nullptr)
+			mAlignedFreeCallback(mMaskedHiZBuffer);
+		mMaskedHiZBuffer = nullptr;
+
+#if MOC_RECORDER_ENABLE
+        assert( mRecorder == nullptr ); // forgot to call StopRecording()?
+#endif
+	}
+
+	void SetResolution(unsigned int width, unsigned int height) override
+	{
+		// Resolution must be a multiple of the subtile size
+		assert(width % SUB_TILE_WIDTH == 0 && height % SUB_TILE_HEIGHT == 0);
+#if PRECISE_COVERAGE == 0
+		// Test if combination of resolution & FP_BITS bits may cause 32-bit overflow. Note that the maximum resolution estimate
+		// is only an estimate (not conservative). It's advisable to stay well below the limit.
+		assert(width < ((1U << 31) - 1U) / ((1U << FP_BITS) * (TILE_HEIGHT + (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f))) - (2U * (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f)));
+#endif
+
+		// Delete current masked hierarchical Z buffer
+		if (mMaskedHiZBuffer != nullptr)
+			mAlignedFreeCallback(mMaskedHiZBuffer);
+		mMaskedHiZBuffer = nullptr;
+
+		// Setup various resolution dependent constant values
+		mWidth = (int)width;
+		mHeight = (int)height;
+		mTilesWidth = (int)(width + TILE_WIDTH - 1) >> TILE_WIDTH_SHIFT;
+		mTilesHeight = (int)(height + TILE_HEIGHT - 1) >> TILE_HEIGHT_SHIFT;
+		mCenterX = _mmw_set1_ps((float)mWidth  * 0.5f);
+		mCenterY = _mmw_set1_ps((float)mHeight * 0.5f);
+		mICenter = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f);
+		mHalfWidth = _mmw_set1_ps((float)mWidth  * 0.5f);
+#if USE_D3D != 0
+		mHalfHeight = _mmw_set1_ps((float)-mHeight * 0.5f);
+		mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)-mHeight * 0.5f, (float)-mHeight * 0.5f);
+#else
+		mHalfHeight = _mmw_set1_ps((float)mHeight * 0.5f);
+		mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f);
+#endif
+		mIScreenSize = _mm_setr_epi32(mWidth - 1, mWidth - 1, mHeight - 1, mHeight - 1);
+
+		// Setup a full screen scissor rectangle
+		mFullscreenScissor.mMinX = 0;
+		mFullscreenScissor.mMinY = 0;
+		mFullscreenScissor.mMaxX = mTilesWidth << TILE_WIDTH_SHIFT;
+		mFullscreenScissor.mMaxY = mTilesHeight << TILE_HEIGHT_SHIFT;
+
+		// Adjust clip planes to include a small guard band to avoid clipping leaks
+        if (mWidth > 0.0f && mHeight > 0.0f)
+        {
+            float guardBandWidth = (2.0f / (float)mWidth) * GUARD_BAND_PIXEL_SIZE;
+            float guardBandHeight = (2.0f / (float)mHeight) * GUARD_BAND_PIXEL_SIZE;
+            mCSFrustumPlanes[1] = _mm_setr_ps(1.0f - guardBandWidth, 0.0f, 1.0f, 0.0f);
+            mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f + guardBandWidth, 0.0f, 1.0f, 0.0f);
+            mCSFrustumPlanes[3] = _mm_setr_ps(0.0f, 1.0f - guardBandHeight, 1.0f, 0.0f);
+            mCSFrustumPlanes[4] = _mm_setr_ps(0.0f, -1.0f + guardBandHeight, 1.0f, 0.0f);
+        }
+
+		// Allocate masked hierarchical Z buffer (if zero size leave at nullptr)
+		if(mTilesWidth * mTilesHeight > 0)
+			mMaskedHiZBuffer = (ZTile *)mAlignedAllocCallback(64, sizeof(ZTile) * mTilesWidth * mTilesHeight);
+	}
+
+	void GetResolution(unsigned int &width, unsigned int &height) const override
+	{
+		width = mWidth;
+		height = mHeight;
+	}
+
+	void ComputeBinWidthHeight(unsigned int nBinsW, unsigned int nBinsH, unsigned int & outBinWidth, unsigned int & outBinHeight) override
+	{
+		outBinWidth = (mWidth / nBinsW) - ((mWidth / nBinsW) % TILE_WIDTH);
+		outBinHeight = (mHeight / nBinsH) - ((mHeight / nBinsH) % TILE_HEIGHT);
+	}
+
+    void SetNearClipPlane(float nearDist) override
+	{
+		// Setup the near frustum plane
+		mNearDist = nearDist;
+		mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, -nearDist);
+	}
+
+	float GetNearClipPlane() const override
+	{
+		return mNearDist;
+	}
+
+	void ClearBuffer() override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		// Iterate through all depth tiles and clear to default values
+		for (int i = 0; i < mTilesWidth * mTilesHeight; i++)
+		{
+			mMaskedHiZBuffer[i].mMask = _mmw_setzero_epi32();
+
+			// Clear z0 to beyond infinity to ensure we never merge with clear data
+			mMaskedHiZBuffer[i].mZMin[0] = _mmw_set1_ps(-1.0f);
+#if QUICK_MASK != 0
+			// Clear z1 to nearest depth value as it is pushed back on each update
+			mMaskedHiZBuffer[i].mZMin[1] = _mmw_set1_ps(FLT_MAX);
+#else
+			mMaskedHiZBuffer[i].mZMin[1] = _mmw_setzero_ps();
+#endif
+		}
+
+#if ENABLE_STATS != 0
+		memset(&mStats, 0, sizeof(OcclusionCullingStatistics));
+#endif
+
+#if MOC_RECORDER_ENABLE != 0
+        {
+            std::lock_guard<std::mutex> lock( mRecorderMutex );
+            if( mRecorder != nullptr ) mRecorder->RecordClearBuffer();
+        }
+#endif
+	}
+
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// MergeBuffer
+	// Utility Function merges another MOC buffer into the existing one
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	void MergeBuffer(MaskedOcclusionCulling* BufferB) override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		//// Iterate through all depth tiles and merge the 2 tiles
+		for (int i = 0; i < mTilesWidth * mTilesHeight; i++)
+		{
+			__mw *zMinB = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mZMin;
+			__mw *zMinA = mMaskedHiZBuffer[i].mZMin;
+			__mwi RastMaskB = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask;
+
+#if QUICK_MASK != 0
+			// z0 is cleared to beyond infinity (negative), so the sign bit of zMinB[0] indicates whether the source tile is still in a clear state
+			__mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31);
+			// Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state
+			sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO);
+			if (!_mmw_testz_epi32(sign0, sign0))
+			{
+				STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
+				zMinA[0] = _mmw_max_ps(zMinA[0], zMinB[0]);
+
+				__mwi rastMask = mMaskedHiZBuffer[i].mMask;
+				__mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
+				// Mask out all subtiles failing the depth test (don't update these subtiles)
+				deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zMinA[1], zMinA[0])), 31));
+				mMaskedHiZBuffer[i].mMask = _mmw_andnot_epi32(deadLane, rastMask);
+			}
+
+			// Set 32bit value to -1 if any pixels are set inside the coverage mask for a subtile
+			__mwi LiveTile = _mmw_cmpeq_epi32(RastMaskB, SIMD_BITS_ZERO);
+			// invert to have bits set for clear subtiles
+			__mwi t0inv = _mmw_not_epi32(LiveTile);
+			// VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear)
+			if (!_mmw_testz_epi32(t0inv, t0inv))
+			{
+				STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
+				UpdateTileQuick(i, RastMaskB, zMinB[1]);
+			}
+#else 
+			// z0 is cleared to beyond infinity (negative), so the sign bit of this buffer's zMin[0] indicates whether the tile is still in a clear state
+			__mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(mMaskedHiZBuffer[i].mZMin[0]), 31);
+			// sign1 becomes all ones for tiles that contain data (non-negative zMin[0])
+			sign1 = _mmw_cmpeq_epi32(sign1, SIMD_BITS_ZERO);
+
+			// Set 32bit value to -1 if any pixels are set inside the coverage mask for a subtile
+			__mwi LiveTile1 = _mmw_cmpeq_epi32(mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO);
+			// invert to have bits set for clear subtiles
+			__mwi t1inv = _mmw_not_epi32(LiveTile1);
+			// VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear)
+			if (_mmw_testz_epi32(sign1, sign1) && _mmw_testz_epi32(t1inv, t1inv))
+			{
+				mMaskedHiZBuffer[i].mMask = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask;
+				mMaskedHiZBuffer[i].mZMin[0] = zMinB[0];
+				mMaskedHiZBuffer[i].mZMin[1] = zMinB[1];
+			}
+			else
+			{
+				// z0 is cleared to beyond infinity (negative), so the sign bit of zMinB[0] indicates whether the source tile is still in a clear state
+				__mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31);
+				sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO);
+				// Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state
+				if (!_mmw_testz_epi32(sign0, sign0))
+				{
+					// build a mask for Zmin[0], full if the layer has been completed, or partial if tile is still partly filled.
+					// cant just use the completement of the mask, as tiles might not get updated by merge 
+					__mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[1]), 31);
+					__mwi LayerMask0 = _mmw_not_epi32(sign1);
+					__mwi LayerMask1 = _mmw_not_epi32(((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask);
+					__mwi rastMask = _mmw_or_epi32(LayerMask0, LayerMask1);
+
+					UpdateTileAccurate(i, rastMask, zMinB[0]);
+				}
+
+				// Set 32bit value to -1 if any pixels are set inside the coverage mask for a subtile
+				__mwi LiveTile = _mmw_cmpeq_epi32(((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO);
+				// invert to have bits set for clear subtiles
+				__mwi t0inv = _mmw_not_epi32(LiveTile);
+				// VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear)
+				if (!_mmw_testz_epi32(t0inv, t0inv))
+				{
+					UpdateTileAccurate(i, ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask, zMinB[1]);
+				}
+
+				//if (_mmw_testz_epi32(sign0, sign0) && _mmw_testz_epi32(t0inv, t0inv))
+				//	STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
+
+			}
+
+#endif
+		}
+	}
+
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Polygon clipping functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	FORCE_INLINE int ClipPolygon(__m128 *outVtx, __m128 *inVtx, const __m128 &plane, int n) const
+	{
+		__m128 p0 = inVtx[n - 1];
+		__m128 dist0 = _mmx_dp4_ps(p0, plane);
+
+		// Loop over all polygon edges and compute intersection with clip plane (if any)
+		int nout = 0;
+		for (int k = 0; k < n; k++)
+		{
+			__m128 p1 = inVtx[k];
+			__m128 dist1 = _mmx_dp4_ps(p1, plane);
+			int dist0Neg = _mm_movemask_ps(dist0);
+			if (!dist0Neg)	// dist0 > 0.0f
+				outVtx[nout++] = p0;
+
+			// Edge intersects the clip plane if dist0 and dist1 have opposing signs
+			if (_mm_movemask_ps(_mm_xor_ps(dist0, dist1)))
+			{
+				// Always clip from the positive side to avoid T-junctions
+				if (!dist0Neg)
+				{
+					__m128 t = _mm_div_ps(dist0, _mm_sub_ps(dist0, dist1));
+					outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p1, p0), t, p0);
+				}
+				else
+				{
+					__m128 t = _mm_div_ps(dist1, _mm_sub_ps(dist1, dist0));
+					outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p0, p1), t, p1);
+				}
+			}
+
+			dist0 = dist1;
+			p0 = p1;
+		}
+		return nout;
+	}
+
+	template<ClipPlanes CLIP_PLANE> void TestClipPlane(__mw *vtxX, __mw *vtxY, __mw *vtxW, unsigned int &straddleMask, unsigned int &triMask, ClipPlanes clipPlaneMask)
+	{
+		straddleMask = 0;
+		// Skip masked clip planes
+		if (!(clipPlaneMask & CLIP_PLANE))
+			return;
+
+		// Evaluate all 3 vertices against the frustum plane
+		__mw planeDp[3];
+		for (int i = 0; i < 3; ++i)
+		{
+			switch (CLIP_PLANE)
+			{
+			case ClipPlanes::CLIP_PLANE_LEFT:   planeDp[i] = _mmw_add_ps(vtxW[i], vtxX[i]); break;
+			case ClipPlanes::CLIP_PLANE_RIGHT:  planeDp[i] = _mmw_sub_ps(vtxW[i], vtxX[i]); break;
+			case ClipPlanes::CLIP_PLANE_BOTTOM: planeDp[i] = _mmw_add_ps(vtxW[i], vtxY[i]); break;
+			case ClipPlanes::CLIP_PLANE_TOP:    planeDp[i] = _mmw_sub_ps(vtxW[i], vtxY[i]); break;
+			case ClipPlanes::CLIP_PLANE_NEAR:   planeDp[i] = _mmw_sub_ps(vtxW[i], _mmw_set1_ps(mNearDist)); break;
+			}
+		}
+
+		// Look at FP sign and determine if tri is inside, outside or straddles the frustum plane
+		__mw inside = _mmw_andnot_ps(planeDp[0], _mmw_andnot_ps(planeDp[1], _mmw_not_ps(planeDp[2])));
+		__mw outside = _mmw_and_ps(planeDp[0], _mmw_and_ps(planeDp[1], planeDp[2]));
+		unsigned int inMask = (unsigned int)_mmw_movemask_ps(inside);
+		unsigned int outMask = (unsigned int)_mmw_movemask_ps(outside);
+		straddleMask = (~outMask) & (~inMask);
+		triMask &= ~outMask;
+	}
+
+	FORCE_INLINE void ClipTriangleAndAddToBuffer(__mw *vtxX, __mw *vtxY, __mw *vtxW, __m128 *clippedTrisBuffer, int &clipWriteIdx, unsigned int &triMask, unsigned int triClipMask, ClipPlanes clipPlaneMask)
+	{
+		if (!triClipMask)
+			return;
+
+		// Inside test all 3 triangle vertices against all active frustum planes
+		unsigned int straddleMask[5];
+		TestClipPlane<ClipPlanes::CLIP_PLANE_NEAR>(vtxX, vtxY, vtxW, straddleMask[0], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_LEFT>(vtxX, vtxY, vtxW, straddleMask[1], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_RIGHT>(vtxX, vtxY, vtxW, straddleMask[2], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_BOTTOM>(vtxX, vtxY, vtxW, straddleMask[3], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_TOP>(vtxX, vtxY, vtxW, straddleMask[4], triMask, clipPlaneMask);
+
+        // Clip triangle against straddling planes and add to the clipped triangle buffer
+		__m128 vtxBuf[2][8];
+
+#if CLIPPING_PRESERVES_ORDER != 0
+		unsigned int clipMask = triClipMask & triMask;
+		unsigned int clipAndStraddleMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & clipMask;
+        // no clipping needed after all - early out
+        if (clipAndStraddleMask == 0)
+			return;
+		while( clipMask )
+		{
+			// Find and setup next triangle to clip
+			unsigned int triIdx = find_clear_lsb(&clipMask);
+			unsigned int triBit = (1U << triIdx);
+			assert(triIdx < SIMD_LANES);
+
+			int bufIdx = 0;
+			int nClippedVerts = 3;
+			for (int i = 0; i < 3; i++)
+				vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f);
+
+			// Clip triangle with straddling planes. 
+			for (int i = 0; i < 5; ++i)
+			{
+				if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i))) // <- second part maybe not needed?
+				{
+					nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts);
+					bufIdx ^= 1;
+				}
+			}
+
+			if (nClippedVerts >= 3)
+			{
+                // Write all triangles into the clip buffer and process them next loop iteration
+				clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
+				clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][1];
+				clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][2];
+				clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
+				for (int i = 2; i < nClippedVerts - 1; i++)
+				{
+					clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
+					clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i];
+					clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1];
+					clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
+				}
+			}
+		}
+        // since all triangles were copied to clip buffer for next iteration, skip further processing
+		triMask = 0;
+#else
+		unsigned int clipMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & (triClipMask & triMask);
+		while (clipMask)
+		{
+			// Find and setup next triangle to clip
+			unsigned int triIdx = find_clear_lsb(&clipMask);
+			unsigned int triBit = (1U << triIdx);
+			assert(triIdx < SIMD_LANES);
+
+			int bufIdx = 0;
+			int nClippedVerts = 3;
+			for (int i = 0; i < 3; i++)
+				vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f);
+
+			// Clip triangle with straddling planes. 
+			for (int i = 0; i < 5; ++i)
+			{
+				if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i)))
+				{
+					nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts);
+					bufIdx ^= 1;
+				}
+			}
+
+			if (nClippedVerts >= 3)
+			{
+				// Write the first triangle back into the list of currently processed triangles
+				for (int i = 0; i < 3; i++)
+				{
+					simd_f32(vtxX[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[0];
+					simd_f32(vtxY[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[1];
+					simd_f32(vtxW[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[2];
+				}
+				// Write the remaining triangles into the clip buffer and process them next loop iteration
+				for (int i = 2; i < nClippedVerts - 1; i++)
+				{
+					clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
+					clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i];
+					clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1];
+					clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
+				}
+			}
+			else // Kill triangles that were removed by clipping
+				triMask &= ~triBit;
+		}
+#endif
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Vertex transform & projection
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	FORCE_INLINE void TransformVerts(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *modelToClipMatrix)
+	{
+		if (modelToClipMatrix != nullptr)
+		{
+			for (int i = 0; i < 3; ++i)
+			{
+				__mw tmpX, tmpY, tmpW;
+				tmpX = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[0]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[4]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[8]), _mmw_set1_ps(modelToClipMatrix[12]))));
+				tmpY = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[1]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[5]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[9]), _mmw_set1_ps(modelToClipMatrix[13]))));
+				tmpW = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[3]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[7]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[11]), _mmw_set1_ps(modelToClipMatrix[15]))));
+				vtxX[i] = tmpX;	vtxY[i] = tmpY;	vtxW[i] = tmpW;
+			}
+		}
+	}
+
+#if PRECISE_COVERAGE != 0
+	// Projects clip-space vertices to screen space and snaps them to a fixed-point grid
+	// with FP_BITS fractional bits (precise-coverage rasterizer path). Per vertex it
+	// produces: ipVtxX/ipVtxY = snapped fixed-point integer coordinates, pVtxX/pVtxY =
+	// the same snapped coordinates converted back to float (so triangle setup and
+	// coverage computation agree exactly on vertex positions), and pVtxZ = 1/w, the
+	// depth value used throughout the masked hierarchical z-buffer. For D3D the output
+	// vertex order is reversed, which flips the triangle winding so both APIs end up
+	// with the rasterizer's expected front-face convention.
+	FORCE_INLINE void ProjectVertices(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW)
+	{
+#if USE_D3D != 0
+		static const int vertexOrder[] = {2, 1, 0};
+#else
+		static const int vertexOrder[] = {0, 1, 2};
+#endif
+
+		// Project vertices and transform to screen space. Snap to sub-pixel coordinates with FP_BITS precision.
+		for (int i = 0; i < 3; i++)
+		{
+			// Read in input order (i), write in API-dependent order (idx).
+			int idx = vertexOrder[i];
+			__mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]);
+			__mw screenX = _mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX);
+			__mw screenY = _mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY);
+			ipVtxX[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenX, _mmw_set1_ps(float(1 << FP_BITS))));
+			ipVtxY[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenY, _mmw_set1_ps(float(1 << FP_BITS))));
+			// Round-trip through the fixed-point representation so the float coords match the snapped ones exactly.
+			pVtxX[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[idx]), _mmw_set1_ps(FP_INV));
+			pVtxY[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[idx]), _mmw_set1_ps(FP_INV));
+			pVtxZ[idx] = rcpW;
+		}
+	}
+#else
+	// Projects clip-space vertices to screen space at whole-pixel precision (the
+	// non-PRECISE_COVERAGE path). Per vertex it produces pVtxX/pVtxY = rounded screen
+	// coordinates and pVtxZ = 1/w, the depth value used by the masked hierarchical
+	// z-buffer. For D3D the output vertex order is reversed, which flips the triangle
+	// winding to match the rasterizer's expected front-face convention.
+	FORCE_INLINE void ProjectVertices(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW)
+	{
+#if USE_D3D != 0
+		static const int vertexOrder[] = {2, 1, 0};
+#else
+		static const int vertexOrder[] = {0, 1, 2};
+#endif
+		// Project vertices and transform to screen space. Round to nearest integer pixel coordinate
+		for (int i = 0; i < 3; i++)
+		{
+			// Read in input order (i), write in API-dependent order (idx).
+			int idx = vertexOrder[i];
+			__mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]);
+
+			// The rounding modes are set to match HW rasterization with OpenGL. In practice our samples are placed
+			// in the (1,0) corner of each pixel, while HW rasterizer uses (0.5, 0.5). We get (1,0) because of the 
+			// floor used when interpolating along triangle edges. The rounding modes match an offset of (0.5, -0.5)
+			pVtxX[idx] = _mmw_ceil_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX));
+			pVtxY[idx] = _mmw_floor_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY));
+			pVtxZ[idx] = rcpW;
+		}
+	}
+#endif
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Common SSE/AVX input assembly functions, note that there are specialized gathers for the general case in the SSE/AVX specific files
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	// Gathers and transposes triangle vertex data from an indexed (x, y, z, w)-packed
+	// vertex buffer into SoA SIMD registers: vtxX/vtxY/vtxW receive the 1st, 2nd and
+	// 4th float of each referenced vertex respectively (one register per triangle
+	// corner i = 0..2). The 3rd float (z) is never read -- NOTE(review): this fast
+	// path appears to assume depth is later derived from w; confirm against callers.
+	FORCE_INLINE void GatherVerticesFast(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes)
+	{
+		// This function assumes that the vertex layout is four packed x, y, z, w-values.
+		// Since the layout is known we can get some additional performance by using a 
+		// more optimized gather strategy.
+		assert(numLanes >= 1);
+
+		// Gather vertices 
+		__mw v[4], swz[4];
+		for (int i = 0; i < 3; i++)
+		{
+			// Load 4 (x,y,z,w) vectors per SSE part of the SIMD register (so 4 vectors for SSE, 8 vectors for AVX)
+			// this fetch uses templates to unroll the loop
+			VtxFetch4<SIMD_LANES / 4>(v, inTrisPtr, i, inVtx, numLanes);
+
+			// Transpose each individual SSE part of the SSE/AVX register (similar to _MM_TRANSPOSE4_PS)
+			// 0x44 interleaves the low halves {x, y} of a vertex pair, 0xEE the high halves {z, w}.
+			swz[0] = _mmw_shuffle_ps(v[0], v[1], 0x44);
+			swz[2] = _mmw_shuffle_ps(v[0], v[1], 0xEE);
+			swz[1] = _mmw_shuffle_ps(v[2], v[3], 0x44);
+			swz[3] = _mmw_shuffle_ps(v[2], v[3], 0xEE);
+
+			// 0x88 picks the even elements (x), 0xDD the odd elements (y resp. w).
+			vtxX[i] = _mmw_shuffle_ps(swz[0], swz[1], 0x88);
+			vtxY[i] = _mmw_shuffle_ps(swz[0], swz[1], 0xDD);
+			vtxW[i] = _mmw_shuffle_ps(swz[2], swz[3], 0xDD);
+		}
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Rasterization functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	// Computes a per-lane (per-triangle) screen-space bounding box from the projected
+	// vertices, expands it to whole-tile granularity, and clips it to the scissor
+	// rectangle. The max side is grown by one full tile before masking so it stays a
+	// conservative (exclusive) upper bound after the align-down. The scissor bounds
+	// are applied as-is and are presumably already tile-aligned -- TODO confirm at the
+	// call sites.
+	FORCE_INLINE void ComputeBoundingBox(__mwi &bbminX, __mwi &bbminY, __mwi &bbmaxX, __mwi &bbmaxY, const __mw *vX, const __mw *vY, const ScissorRect *scissor)
+	{
+		// Masks that align a coordinate down to the containing tile boundary (TILE_WIDTH/HEIGHT are powers of two).
+		static const __mwi SIMD_PAD_W_MASK = _mmw_set1_epi32(~(TILE_WIDTH - 1));
+		static const __mwi SIMD_PAD_H_MASK = _mmw_set1_epi32(~(TILE_HEIGHT - 1));
+
+		// Find Min/Max vertices (truncating float -> int conversion)
+		bbminX = _mmw_cvttps_epi32(_mmw_min_ps(vX[0], _mmw_min_ps(vX[1], vX[2])));
+		bbminY = _mmw_cvttps_epi32(_mmw_min_ps(vY[0], _mmw_min_ps(vY[1], vY[2])));
+		bbmaxX = _mmw_cvttps_epi32(_mmw_max_ps(vX[0], _mmw_max_ps(vX[1], vX[2])));
+		bbmaxY = _mmw_cvttps_epi32(_mmw_max_ps(vY[0], _mmw_max_ps(vY[1], vY[2])));
+
+		// Clamp to tile boundaries
+		bbminX = _mmw_and_epi32(bbminX, SIMD_PAD_W_MASK);
+		bbmaxX = _mmw_and_epi32(_mmw_add_epi32(bbmaxX, _mmw_set1_epi32(TILE_WIDTH)), SIMD_PAD_W_MASK);
+		bbminY = _mmw_and_epi32(bbminY, SIMD_PAD_H_MASK);
+		bbmaxY = _mmw_and_epi32(_mmw_add_epi32(bbmaxY, _mmw_set1_epi32(TILE_HEIGHT)), SIMD_PAD_H_MASK);
+
+		// Clip to scissor
+		bbminX = _mmw_max_epi32(bbminX, _mmw_set1_epi32(scissor->mMinX));
+		bbmaxX = _mmw_min_epi32(bbmaxX, _mmw_set1_epi32(scissor->mMaxX));
+		bbminY = _mmw_max_epi32(bbminY, _mmw_set1_epi32(scissor->mMinY));
+		bbmaxY = _mmw_min_epi32(bbmaxY, _mmw_set1_epi32(scissor->mMaxY));
+	}
+
+#if PRECISE_COVERAGE != 0
+	// Fixed-point variant (PRECISE_COVERAGE). Per SIMD lane, rotates the triangle
+	// (v0,v1,v2) -> (v1,v2,v0) wherever the mask's sign bit is set; after two passes
+	// v0 is the vertex with the lowest y while the winding order is preserved. The
+	// rotate condition is sign(y1-y0) | sign(y2-y0) | (y2 == y0), i.e. rotate while
+	// another vertex lies below v0, with the equality term presumably canonicalizing
+	// ties on flat edges -- NOTE(review): behavior for degenerate/flat triangles
+	// follows the upstream library; do not "simplify" the mask.
+	FORCE_INLINE void SortVertices(__mwi *vX, __mwi *vY)
+	{
+		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+		for (int i = 0; i < 2; i++)
+		{
+			__mwi ey1 = _mmw_sub_epi32(vY[1], vY[0]);
+			__mwi ey2 = _mmw_sub_epi32(vY[2], vY[0]);
+			__mwi swapMask = _mmw_or_epi32(_mmw_or_epi32(ey1, ey2), _mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO));
+			__mwi sX, sY;
+			// blendv picks the second operand where the mask sign bit is set, so this is a 3-way rotate.
+			sX = _mmw_blendv_epi32(vX[2], vX[0], swapMask);
+			vX[0] = _mmw_blendv_epi32(vX[0], vX[1], swapMask);
+			vX[1] = _mmw_blendv_epi32(vX[1], vX[2], swapMask);
+			vX[2] = sX;
+			sY = _mmw_blendv_epi32(vY[2], vY[0], swapMask);
+			vY[0] = _mmw_blendv_epi32(vY[0], vY[1], swapMask);
+			vY[1] = _mmw_blendv_epi32(vY[1], vY[2], swapMask);
+			vY[2] = sY;
+		}
+	}
+
+	// Fixed-point variant (PRECISE_COVERAGE). Makes every surviving triangle CCW for
+	// the rasterizer and returns the lane mask of front-facing triangles according to
+	// bfWinding. ccwMask has the sign bit set in lanes whose triangle is already CCW.
+	FORCE_INLINE int CullBackfaces(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding)
+	{
+		// Reverse vertex order if non cw faces are considered front facing (rasterizer code requires CCW order)
+		if (!(bfWinding & BACKFACE_CW))
+		{
+			// Swap v0 and v2 in the CW lanes only (blendv keeps the original where ccwMask is set),
+			// updating both the fixed-point and the float copies of the coordinates.
+			__mw tmpX, tmpY, tmpZ;
+			__mwi itmpX, itmpY;
+			itmpX = _mmw_blendv_epi32(ipVtxX[2], ipVtxX[0], simd_cast<__mwi>(ccwMask));
+			itmpY = _mmw_blendv_epi32(ipVtxY[2], ipVtxY[0], simd_cast<__mwi>(ccwMask));
+			tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask);
+			tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask);
+			tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask);
+			ipVtxX[2] = _mmw_blendv_epi32(ipVtxX[0], ipVtxX[2], simd_cast<__mwi>(ccwMask));
+			ipVtxY[2] = _mmw_blendv_epi32(ipVtxY[0], ipVtxY[2], simd_cast<__mwi>(ccwMask));
+			pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask);
+			pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask);
+			pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask);
+			ipVtxX[0] = itmpX;
+			ipVtxY[0] = itmpY;
+			pVtxX[0] = tmpX;
+			pVtxY[0] = tmpY;
+			pVtxZ[0] = tmpZ;
+		}
+
+		// Return a lane mask with all front faces set
+		return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask));
+	}
+#else
+	// Float variant. Per SIMD lane, rotates the triangle (v0,v1,v2) -> (v1,v2,v0)
+	// wherever the mask's sign bit is set; after two passes v0 is the vertex with the
+	// lowest y while the winding order is preserved. NOTE(review): the equality term
+	// compares the raw float bits of (y2-y0) against integer zero, which matches
+	// +0.0f but not -0.0f -- this mirrors the upstream library; do not "fix" it here.
+	FORCE_INLINE void SortVertices(__mw *vX, __mw *vY)
+	{
+		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+		for (int i = 0; i < 2; i++)
+		{
+			__mw ey1 = _mmw_sub_ps(vY[1], vY[0]);
+			__mw ey2 = _mmw_sub_ps(vY[2], vY[0]);
+			__mw swapMask = _mmw_or_ps(_mmw_or_ps(ey1, ey2), simd_cast<__mw>(_mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO)));
+			__mw sX, sY;
+			// blendv picks the second operand where the mask sign bit is set, so this is a 3-way rotate.
+			sX = _mmw_blendv_ps(vX[2], vX[0], swapMask);
+			vX[0] = _mmw_blendv_ps(vX[0], vX[1], swapMask);
+			vX[1] = _mmw_blendv_ps(vX[1], vX[2], swapMask);
+			vX[2] = sX;
+			sY = _mmw_blendv_ps(vY[2], vY[0], swapMask);
+			vY[0] = _mmw_blendv_ps(vY[0], vY[1], swapMask);
+			vY[1] = _mmw_blendv_ps(vY[1], vY[2], swapMask);
+			vY[2] = sY;
+		}
+	}
+
+	// Float variant. Makes every surviving triangle CCW for the rasterizer and returns
+	// the lane mask of front-facing triangles according to bfWinding. ccwMask has the
+	// sign bit set in lanes whose triangle is already CCW.
+	FORCE_INLINE int CullBackfaces(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding)
+	{
+		// Reverse vertex order if non cw faces are considered front facing (rasterizer code requires CCW order)
+		if (!(bfWinding & BACKFACE_CW))
+		{
+			// Swap v0 and v2 in the CW lanes only (blendv keeps the original where ccwMask is set).
+			__mw tmpX, tmpY, tmpZ;
+			tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask);
+			tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask);
+			tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask);
+			pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask);
+			pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask);
+			pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask);
+			pVtxX[0] = tmpX;
+			pVtxY[0] = tmpY;
+			pVtxZ[0] = tmpZ;
+		}
+
+		// Return a lane mask with all front faces set
+		return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask));
+	}
+#endif
+
+	// Derives the screen-space depth-plane gradients (zPixelDx, zPixelDy) for
+	// z(x, y) = z0 + dx*x + dy*y from the three projected vertices, using edge
+	// vectors relative to v0 and d = 1 / edge determinant (twice the signed triangle
+	// area). Degenerate triangles yield a divide toward infinity here; callers are
+	// expected to have culled zero-area triangles -- NOTE(review): verify upstream.
+	FORCE_INLINE void ComputeDepthPlane(const __mw *pVtxX, const __mw *pVtxY, const __mw *pVtxZ, __mw &zPixelDx, __mw &zPixelDy) const
+	{
+		// Setup z(x,y) = z0 + dx*x + dy*y screen space depth plane equation
+		__mw x2 = _mmw_sub_ps(pVtxX[2], pVtxX[0]);
+		__mw x1 = _mmw_sub_ps(pVtxX[1], pVtxX[0]);
+		__mw y1 = _mmw_sub_ps(pVtxY[1], pVtxY[0]);
+		__mw y2 = _mmw_sub_ps(pVtxY[2], pVtxY[0]);
+		__mw z1 = _mmw_sub_ps(pVtxZ[1], pVtxZ[0]);
+		__mw z2 = _mmw_sub_ps(pVtxZ[2], pVtxZ[0]);
+		__mw d = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_fmsub_ps(x1, y2, _mmw_mul_ps(y1, x2)));
+		zPixelDx = _mmw_mul_ps(_mmw_fmsub_ps(z1, y2, _mmw_mul_ps(y1, z2)), d);
+		zPixelDy = _mmw_mul_ps(_mmw_fmsub_ps(x1, z2, _mmw_mul_ps(z1, x2)), d);
+	}
+
+	// Merges one triangle's coverage and interpolated depth (zTriv, per 8x4 subtile)
+	// into the two-layer masked hierarchical z-buffer entry for tileIdx, using the
+	// fast heuristic (QUICK_MASK path). Depth is 1/w, so a LARGER value is CLOSER to
+	// the observer; zMin[0] is the conservative reference layer, zMin[1] the working
+	// layer being accumulated, and mMask flags which pixels belong to layer 1.
+	FORCE_INLINE void UpdateTileQuick(int tileIdx, const __mwi &coverage, const __mw &zTriv)
+	{
+		// Update heuristic used in the paper "Masked Software Occlusion Culling", 
+		// good balance between performance and accuracy
+		STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1);
+		assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight);
+
+		__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
+		__mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin;
+
+		// Swizzle coverage mask to 8x4 subtiles and test if any subtiles are not covered at all
+		__mwi rastMask = coverage;
+		__mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
+
+		// Mask out all subtiles failing the depth test (don't update these subtiles)
+		// (arithmetic shift by 31 broadcasts the sign of zTriv - zMin[0]: all-ones where the triangle lies behind layer 0)
+		deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zTriv, zMin[0])), 31));
+		rastMask = _mmw_andnot_epi32(deadLane, rastMask);
+
+		// Use distance heuristic to discard layer 1 if incoming triangle is significantly nearer to observer
+		// than the buffer contents. See Section 3.2 in "Masked Software Occlusion Culling"
+		__mwi coveredLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ONE);
+		__mw diff = _mmw_fmsub_ps(zMin[1], _mmw_set1_ps(2.0f), _mmw_add_ps(zTriv, zMin[0]));
+		__mwi discardLayerMask = _mmw_andnot_epi32(deadLane, _mmw_or_epi32(_mmw_srai_epi32(simd_cast<__mwi>(diff), 31), coveredLane));
+
+		// Update the mask with incoming triangle coverage
+		mask = _mmw_or_epi32(_mmw_andnot_epi32(discardLayerMask, mask), rastMask);
+
+		__mwi maskFull = _mmw_cmpeq_epi32(mask, SIMD_BITS_ONE);
+
+		// Compute new value for zMin[1]. This has one of four outcomes: zMin[1] = min(zMin[1], zTriv),  zMin[1] = zTriv, 
+		// zMin[1] = FLT_MAX or unchanged, depending on if the layer is updated, discarded, fully covered, or not updated
+		__mw opA = _mmw_blendv_ps(zTriv, zMin[1], simd_cast<__mw>(deadLane));
+		__mw opB = _mmw_blendv_ps(zMin[1], zTriv, simd_cast<__mw>(discardLayerMask));
+		__mw z1min = _mmw_min_ps(opA, opB);
+		zMin[1] = _mmw_blendv_ps(z1min, _mmw_set1_ps(FLT_MAX), simd_cast<__mw>(maskFull));
+
+		// Propagate zMin[1] back to zMin[0] if tile was fully covered, and update the mask
+		zMin[0] = _mmw_blendv_ps(zMin[0], z1min, simd_cast<__mw>(maskFull));
+		mMaskedHiZBuffer[tileIdx].mMask = _mmw_andnot_epi32(maskFull, mask);
+	}
+
+	// Merges one triangle's coverage and interpolated depth (zTriv, per 8x4 subtile)
+	// into the two-layer masked hierarchical z-buffer entry for tileIdx, using the
+	// more exact per-pixel merging heuristic (QUICK_MASK == 0 path). Depth is 1/w,
+	// so a LARGER value is CLOSER. Compared to UpdateTileQuick, this tests each
+	// pixel against the layer it actually belongs to and merges the triangle into
+	// whichever layer minimizes the depth discontinuity.
+	FORCE_INLINE void UpdateTileAccurate(int tileIdx, const __mwi &coverage, const __mw &zTriv)
+	{
+		assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight);
+
+		__mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin;
+		__mwi &mask = mMaskedHiZBuffer[tileIdx].mMask;
+
+		// Swizzle coverage mask to 8x4 subtiles
+		__mwi rastMask = coverage;
+
+		// Perform individual depth tests with layer 0 & 1 and mask out all failing pixels 
+		// (srai by 31 broadcasts the sign bit: all-ones where zTriv is in front of the layer's depth)
+		__mw sdist0 = _mmw_sub_ps(zMin[0], zTriv);
+		__mw sdist1 = _mmw_sub_ps(zMin[1], zTriv);
+		__mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(sdist0), 31);
+		__mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(sdist1), 31);
+		__mwi triMask = _mmw_and_epi32(rastMask, _mmw_or_epi32(_mmw_andnot_epi32(mask, sign0), _mmw_and_epi32(mask, sign1)));
+
+		// Early out if no pixels survived the depth test (this test is more accurate than
+		// the early culling test in TraverseScanline())
+		__mwi t0 = _mmw_cmpeq_epi32(triMask, SIMD_BITS_ZERO);
+		__mwi t0inv = _mmw_not_epi32(t0);
+		if (_mmw_testz_epi32(t0inv, t0inv))
+			return;
+
+		STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1);
+
+		// In fully-dead subtiles, substitute layer 0's depth so the heuristic below stays well-defined.
+		__mw zTri = _mmw_blendv_ps(zTriv, zMin[0], simd_cast<__mw>(t0));
+
+		// Test if incoming triangle completely overwrites layer 0 or 1
+		__mwi layerMask0 = _mmw_andnot_epi32(triMask, _mmw_not_epi32(mask));
+		__mwi layerMask1 = _mmw_andnot_epi32(triMask, mask);
+		__mwi lm0 = _mmw_cmpeq_epi32(layerMask0, SIMD_BITS_ZERO);
+		__mwi lm1 = _mmw_cmpeq_epi32(layerMask1, SIMD_BITS_ZERO);
+		__mw z0 = _mmw_blendv_ps(zMin[0], zTri, simd_cast<__mw>(lm0));
+		__mw z1 = _mmw_blendv_ps(zMin[1], zTri, simd_cast<__mw>(lm1));
+
+		// Compute distances used for merging heuristic
+		__mw d0 = _mmw_abs_ps(sdist0);
+		__mw d1 = _mmw_abs_ps(sdist1);
+		__mw d2 = _mmw_abs_ps(_mmw_sub_ps(z0, z1));
+
+		// Find minimum distance (only the sign bits of these float differences are consumed below)
+		__mwi c01 = simd_cast<__mwi>(_mmw_sub_ps(d0, d1));
+		__mwi c02 = simd_cast<__mwi>(_mmw_sub_ps(d0, d2));
+		__mwi c12 = simd_cast<__mwi>(_mmw_sub_ps(d1, d2));
+		// Two tests indicating which layer the incoming triangle will merge with or 
+		// overwrite. d0min indicates that the triangle will overwrite layer 0, and 
+		// d1min flags that the triangle will overwrite layer 1.
+		__mwi d0min = _mmw_or_epi32(_mmw_and_epi32(c01, c02), _mmw_or_epi32(lm0, t0));
+		__mwi d1min = _mmw_andnot_epi32(d0min, _mmw_or_epi32(c12, lm1));
+
+		///////////////////////////////////////////////////////////////////////////////
+		// Update depth buffer entry. NOTE: we always merge into layer 0, so if the 
+		// triangle should be merged with layer 1, we first swap layer 0 & 1 and then
+		// merge into layer 0.
+		///////////////////////////////////////////////////////////////////////////////
+
+		// Update mask based on which layer the triangle overwrites or was merged into
+		__mw inner = _mmw_blendv_ps(simd_cast<__mw>(triMask), simd_cast<__mw>(layerMask1), simd_cast<__mw>(d0min));
+		mask = simd_cast<__mwi>(_mmw_blendv_ps(inner, simd_cast<__mw>(layerMask0), simd_cast<__mw>(d1min)));
+
+		// Update the zMin[0] value. There are four outcomes: overwrite with layer 1,
+		// merge with layer 1, merge with zTri or overwrite with layer 1 and then merge
+		// with zTri.
+		__mw e0 = _mmw_blendv_ps(z0, z1, simd_cast<__mw>(d1min));
+		__mw e1 = _mmw_blendv_ps(z1, zTri, simd_cast<__mw>(_mmw_or_epi32(d1min, d0min)));
+		zMin[0] = _mmw_min_ps(e0, e1);
+
+		// Update the zMin[1] value. There are three outcomes: keep current value,
+		// overwrite with zTri, or overwrite with z1
+		__mw z1t = _mmw_blendv_ps(zTri, z1, simd_cast<__mw>(d0min));
+		zMin[1] = _mmw_blendv_ps(z1t, z0, simd_cast<__mw>(d1min));
+	}
+
+	// Traverses one horizontal row of 32xN tiles, from leftOffset to rightOffset
+	// (tile units relative to tileIdx, the row's first tile index). Template params:
+	// TEST_Z selects occlusion-query mode (return VISIBLE as soon as any subtile
+	// passes) vs. occluder rendering (update the HiZ buffer, return VISIBLE at the
+	// end); NRIGHT/NLEFT are how many right/left triangle edges bound the coverage
+	// on this row (2 on the row containing the middle vertex, otherwise 1).
+	// events[] holds the fixed-point per-scanline edge crossings, rightEvent /
+	// leftEvent index into it, zTriMin/zTriMax clamp the interpolated depth, and
+	// iz0 / zx are the depth interpolant at the row start and its per-tile x step.
+	template<int TEST_Z, int NRIGHT, int NLEFT>
+	FORCE_INLINE int TraverseScanline(int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, const __mwi *events, const __mw &zTriMin, const __mw &zTriMax, const __mw &iz0, float zx)
+	{
+		// Floor edge events to integer pixel coordinates (shift out fixed point bits)
+		int eventOffset = leftOffset << TILE_WIDTH_SHIFT;
+		__mwi right[NRIGHT], left[NLEFT];
+		for (int i = 0; i < NRIGHT; ++i)
+			right[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[rightEvent + i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO);
+		for (int i = 0; i < NLEFT; ++i)
+			left[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[leftEvent - i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO);
+
+		__mw z0 = _mmw_add_ps(iz0, _mmw_set1_ps(zx*leftOffset));
+		int tileIdxEnd = tileIdx + rightOffset;
+		tileIdx += leftOffset;
+		for (;;)
+		{
+			if (TEST_Z)
+				STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1);
+			else
+				STATS_ADD(mStats.mOccluders.mNumTilesTraversed, 1);
+
+			// Perform a coarse test to quickly discard occluded tiles
+#if QUICK_MASK != 0
+			// Only use the reference layer (layer 0) to cull as it is always conservative
+			__mw zMinBuf = mMaskedHiZBuffer[tileIdx].mZMin[0];
+#else
+			// Compute zMin for the overlapped layers 
+			__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
+			__mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0))));
+			__mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32())));
+			__mw zMinBuf = _mmw_min_ps(zMin0, zMin1);
+#endif
+			// All sign bits set means every subtile already occludes the triangle (depth is 1/w: larger = closer).
+			__mw dist0 = _mmw_sub_ps(zTriMax, zMinBuf);
+			if (_mmw_movemask_ps(dist0) != SIMD_ALL_LANES_MASK)
+			{
+				// Compute coverage mask for entire 32xN using shift operations
+				__mwi accumulatedMask = _mmw_sllv_ones(left[0]);
+				for (int i = 1; i < NLEFT; ++i)
+					accumulatedMask = _mmw_and_epi32(accumulatedMask, _mmw_sllv_ones(left[i]));
+				for (int i = 0; i < NRIGHT; ++i)
+					accumulatedMask = _mmw_andnot_epi32(_mmw_sllv_ones(right[i]), accumulatedMask);
+
+				if (TEST_Z)
+				{
+					// Perform a conservative visibility test (test zMax against buffer for each covered 8x4 subtile)
+					__mw zSubTileMax = _mmw_min_ps(z0, zTriMax);
+					__mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zSubTileMax, zMinBuf));
+
+					__mwi rastMask = _mmw_transpose_epi8(accumulatedMask);
+					__mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
+					zPass = _mmw_andnot_epi32(deadLane, zPass);
+
+					if (!_mmw_testz_epi32(zPass, zPass))
+						return CullingResult::VISIBLE;
+				}
+				else
+				{
+					// Compute interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry
+					__mw zSubTileMin = _mmw_max_ps(z0, zTriMin);
+#if QUICK_MASK != 0
+					UpdateTileQuick(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin);
+#else 
+					UpdateTileAccurate(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin);
+#endif
+				}
+			}
+
+			// Update buffer address, interpolate z and edge events
+			tileIdx++;
+			if (tileIdx >= tileIdxEnd)
+				break;
+			z0 = _mmw_add_ps(z0, _mmw_set1_ps(zx));
+			for (int i = 0; i < NRIGHT; ++i)
+				right[i] = _mmw_subs_epu16(right[i], SIMD_TILE_WIDTH);	// Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits)
+			for (int i = 0; i < NLEFT; ++i)
+				left[i] = _mmw_subs_epu16(left[i], SIMD_TILE_WIDTH);
+		}
+
+		return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE;
+	}
+
+
+	// Rasterizes a single triangle -- SIMD lane triIdx of the batch set up by
+	// RasterizeTriangleBatch -- over the masked hierarchical z-buffer. The triangle
+	// is split at the middle vertex into a bottom half (bounded by edges 0 and 2),
+	// one middle tile row (where all three edges matter) and a top half. Template
+	// params: TEST_Z = occlusion query (early out VISIBLE) vs. occluder rendering;
+	// TIGHT_TRAVERSAL = track per-row start/end tiles for large triangles instead of
+	// scanning the whole bounding-box width; MID_VTX_RIGHT = the middle vertex lies
+	// on the right side, which selects which edge indices bound the top half.
+	// The PRECISE_COVERAGE variant additionally carries exact fixed-point remainder
+	// bookkeeping so edge events never drift from the true rational slope.
+	template<int TEST_Z, int TIGHT_TRAVERSAL, int MID_VTX_RIGHT>
+#if PRECISE_COVERAGE != 0
+	FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mw *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy, const __mwi *edgeY, const __mwi *absEdgeX, const __mwi *slopeSign, const __mwi *eventStartRemainder, const __mwi *slopeTileRemainder)
+#else
+	FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mwi *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy)
+#endif
+	{
+		if (TEST_Z)
+			STATS_ADD(mStats.mOccludees.mNumRasterizedTriangles, 1);
+		else
+			STATS_ADD(mStats.mOccluders.mNumRasterizedTriangles, 1);
+
+		int cullResult;
+
+#if PRECISE_COVERAGE != 0
+		#define LEFT_EDGE_BIAS -1
+		#define RIGHT_EDGE_BIAS 1
+		#define UPDATE_TILE_EVENTS_Y(i) \
+				triEventRemainder[i] = _mmw_sub_epi32(triEventRemainder[i], triSlopeTileRemainder[i]); \
+				__mwi overflow##i = _mmw_srai_epi32(triEventRemainder[i], 31); \
+				triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow##i, triEdgeY[i])); \
+				triEvent[i] = _mmw_add_epi32(triEvent[i], _mmw_add_epi32(triSlopeTileDelta[i], _mmw_and_epi32(overflow##i, triSlopeSign[i])))
+
+		// Broadcast lane triIdx of each per-triangle quantity to all SIMD lanes (one lane per scanline).
+		__mwi triEvent[3], triSlopeSign[3], triSlopeTileDelta[3], triEdgeY[3], triSlopeTileRemainder[3], triEventRemainder[3];
+		for (int i = 0; i < 3; ++i)
+		{
+			triSlopeSign[i] = _mmw_set1_epi32(simd_i32(slopeSign[i])[triIdx]);
+			triSlopeTileDelta[i] = _mmw_set1_epi32(simd_i32(slopeTileDelta[i])[triIdx]);
+			triEdgeY[i] = _mmw_set1_epi32(simd_i32(edgeY[i])[triIdx]);
+			triSlopeTileRemainder[i] = _mmw_set1_epi32(simd_i32(slopeTileRemainder[i])[triIdx]);
+
+			__mw triSlope = _mmw_set1_ps(simd_f32(slope[i])[triIdx]);
+			__mwi triAbsEdgeX = _mmw_set1_epi32(simd_i32(absEdgeX[i])[triIdx]);
+			__mwi triStartRemainder = _mmw_set1_epi32(simd_i32(eventStartRemainder[i])[triIdx]);
+			__mwi triEventStart = _mmw_set1_epi32(simd_i32(eventStart[i])[triIdx]);
+
+			// Exact per-scanline edge stepping: carry the division remainder so repeated
+			// integer stepping matches the true rational slope without accumulation error.
+			__mwi scanlineDelta = _mmw_cvttps_epi32(_mmw_mul_ps(triSlope, SIMD_LANE_YCOORD_F));
+			__mwi scanlineSlopeRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(triAbsEdgeX, SIMD_LANE_YCOORD_I), _mmw_mullo_epi32(_mmw_abs_epi32(scanlineDelta), triEdgeY[i]));
+
+			triEventRemainder[i] = _mmw_sub_epi32(triStartRemainder, scanlineSlopeRemainder);
+			__mwi overflow = _mmw_srai_epi32(triEventRemainder[i], 31);
+			triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow, triEdgeY[i]));
+			triEvent[i] = _mmw_add_epi32(_mmw_add_epi32(triEventStart, scanlineDelta), _mmw_and_epi32(overflow, triSlopeSign[i]));
+		}
+
+#else
+		#define LEFT_EDGE_BIAS 0
+		#define RIGHT_EDGE_BIAS 0
+		#define UPDATE_TILE_EVENTS_Y(i)		triEvent[i] = _mmw_add_epi32(triEvent[i], triSlopeTileDelta[i]);
+
+		// Get deltas used to increment edge events each time we traverse one scanline of tiles
+		__mwi triSlopeTileDelta[3];
+		triSlopeTileDelta[0] = _mmw_set1_epi32(simd_i32(slopeTileDelta[0])[triIdx]);
+		triSlopeTileDelta[1] = _mmw_set1_epi32(simd_i32(slopeTileDelta[1])[triIdx]);
+		triSlopeTileDelta[2] = _mmw_set1_epi32(simd_i32(slopeTileDelta[2])[triIdx]);
+
+		// Setup edge events for first batch of SIMD_LANES scanlines
+		__mwi triEvent[3];
+		triEvent[0] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[0])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[0])[triIdx])));
+		triEvent[1] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[1])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[1])[triIdx])));
+		triEvent[2] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[2])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[2])[triIdx])));
+#endif
+
+		// For big triangles track start & end tile for each scanline and only traverse the valid region
+		int startDelta, endDelta, topDelta, startEvent, endEvent, topEvent;
+		if (TIGHT_TRAVERSAL)
+		{
+			startDelta = simd_i32(slopeTileDelta[2])[triIdx] + LEFT_EDGE_BIAS;
+			endDelta = simd_i32(slopeTileDelta[0])[triIdx] + RIGHT_EDGE_BIAS;
+			topDelta = simd_i32(slopeTileDelta[1])[triIdx] + (MID_VTX_RIGHT ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS);
+
+			// Compute conservative bounds for the edge events over a 32xN tile
+			startEvent = simd_i32(eventStart[2])[triIdx] + min(0, startDelta);
+			endEvent = simd_i32(eventStart[0])[triIdx] + max(0, endDelta) + (TILE_WIDTH << FP_BITS);
+			if (MID_VTX_RIGHT)
+				topEvent = simd_i32(eventStart[1])[triIdx] + max(0, topDelta) + (TILE_WIDTH << FP_BITS);
+			else
+				topEvent = simd_i32(eventStart[1])[triIdx] + min(0, topDelta);
+		}
+
+		// The bounding box starts at or below the middle-vertex row: bottom half first.
+		if (tileRowIdx <= tileMidRowIdx)
+		{
+			int tileStopIdx = min(tileEndRowIdx, tileMidRowIdx);
+			// Traverse the bottom half of the triangle
+			while (tileRowIdx < tileStopIdx)
+			{
+				int start = 0, end = bbWidth;
+				if (TIGHT_TRAVERSAL)
+				{
+					// Compute tighter start and endpoints to avoid traversing empty space
+					start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+					end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+					startEvent += startDelta;
+					endEvent += endDelta;
+				}
+
+				// Traverse the scanline and update the masked hierarchical z buffer
+				cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
+
+				if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+					return CullingResult::VISIBLE;
+
+				// move to the next scanline of tiles, update edge events and interpolate z
+				tileRowIdx += mTilesWidth;
+				z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+				UPDATE_TILE_EVENTS_Y(0);
+				UPDATE_TILE_EVENTS_Y(2);
+			}
+
+			// Traverse the middle scanline of tiles. We must consider all three edges only in this region
+			if (tileRowIdx < tileEndRowIdx)
+			{
+				int start = 0, end = bbWidth;
+				if (TIGHT_TRAVERSAL)
+				{
+					// Compute tighter start and endpoints to avoid traversing lots of empty space
+					start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+					end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+
+					// Switch the traversal start / end to account for the upper side edge
+					endEvent = MID_VTX_RIGHT ? topEvent : endEvent;
+					endDelta = MID_VTX_RIGHT ? topDelta : endDelta;
+					startEvent = MID_VTX_RIGHT ? startEvent : topEvent;
+					startDelta = MID_VTX_RIGHT ? startDelta : topDelta;
+					startEvent += startDelta;
+					endEvent += endDelta;
+				}
+
+				// Traverse the scanline and update the masked hierarchical z buffer. 
+				if (MID_VTX_RIGHT)
+					cullResult = TraverseScanline<TEST_Z, 2, 1>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
+				else
+					cullResult = TraverseScanline<TEST_Z, 1, 2>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
+
+				if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+					return CullingResult::VISIBLE;
+
+				tileRowIdx += mTilesWidth;
+			}
+
+			// Traverse the top half of the triangle
+			if (tileRowIdx < tileEndRowIdx)
+			{
+				// move to the next scanline of tiles, update edge events and interpolate z
+				z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+				int i0 = MID_VTX_RIGHT + 0;
+				int i1 = MID_VTX_RIGHT + 1;
+				UPDATE_TILE_EVENTS_Y(i0);
+				UPDATE_TILE_EVENTS_Y(i1);
+				for (;;)
+				{
+					int start = 0, end = bbWidth;
+					if (TIGHT_TRAVERSAL)
+					{
+						// Compute tighter start and endpoints to avoid traversing lots of empty space
+						start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						startEvent += startDelta;
+						endEvent += endDelta;
+					}
+
+					// Traverse the scanline and update the masked hierarchical z buffer
+					cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx);
+
+					if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+						return CullingResult::VISIBLE;
+
+					// move to the next scanline of tiles, update edge events and interpolate z
+					tileRowIdx += mTilesWidth;
+					if (tileRowIdx >= tileEndRowIdx)
+						break;
+					z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+					UPDATE_TILE_EVENTS_Y(i0);
+					UPDATE_TILE_EVENTS_Y(i1);
+				}
+			}
+		}
+		else
+		{
+			// The bounding box starts above the middle-vertex row: only the top half overlaps it.
+			if (TIGHT_TRAVERSAL)
+			{
+				// For large triangles, switch the traversal start / end to account for the upper side edge
+				endEvent = MID_VTX_RIGHT ? topEvent : endEvent;
+				endDelta = MID_VTX_RIGHT ? topDelta : endDelta;
+				startEvent = MID_VTX_RIGHT ? startEvent : topEvent;
+				startDelta = MID_VTX_RIGHT ? startDelta : topDelta;
+			}
+
+			// Traverse the top half of the triangle
+			if (tileRowIdx < tileEndRowIdx)
+			{
+				int i0 = MID_VTX_RIGHT + 0;
+				int i1 = MID_VTX_RIGHT + 1;
+				for (;;)
+				{
+					int start = 0, end = bbWidth;
+					if (TIGHT_TRAVERSAL)
+					{
+						// Compute tighter start and endpoints to avoid traversing lots of empty space
+						start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						startEvent += startDelta;
+						endEvent += endDelta;
+					}
+
+					// Traverse the scanline and update the masked hierarchical z buffer
+					cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx);
+
+					if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+						return CullingResult::VISIBLE;
+
+					// move to the next scanline of tiles, update edge events and interpolate z
+					tileRowIdx += mTilesWidth;
+					if (tileRowIdx >= tileEndRowIdx)
+						break;
+					z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+					UPDATE_TILE_EVENTS_Y(i0);
+					UPDATE_TILE_EVENTS_Y(i1);
+				}
+			}
+		}
+
+		return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE;
+	}
+
+	template<bool TEST_Z>
+#if PRECISE_COVERAGE != 0
+	FORCE_INLINE int RasterizeTriangleBatch(__mwi ipVtxX[3], __mwi ipVtxY[3], __mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor)
+#else
+	FORCE_INLINE int RasterizeTriangleBatch(__mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor)
+#endif
+	{
+		int cullResult = CullingResult::VIEW_CULLED;
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Compute bounding box and clamp to tile coordinates
+		//////////////////////////////////////////////////////////////////////////////
+
+		__mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY;
+		ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, scissor);
+
+		// Clamp bounding box to tiles (it's already padded in computeBoundingBox)
+		__mwi bbTileMinX = _mmw_srai_epi32(bbPixelMinX, TILE_WIDTH_SHIFT);
+		__mwi bbTileMinY = _mmw_srai_epi32(bbPixelMinY, TILE_HEIGHT_SHIFT);
+		__mwi bbTileMaxX = _mmw_srai_epi32(bbPixelMaxX, TILE_WIDTH_SHIFT);
+		__mwi bbTileMaxY = _mmw_srai_epi32(bbPixelMaxY, TILE_HEIGHT_SHIFT);
+		__mwi bbTileSizeX = _mmw_sub_epi32(bbTileMaxX, bbTileMinX);
+		__mwi bbTileSizeY = _mmw_sub_epi32(bbTileMaxY, bbTileMinY);
+
+		// Cull triangles with zero bounding box
+		__mwi bboxSign = _mmw_or_epi32(_mmw_sub_epi32(bbTileSizeX, _mmw_set1_epi32(1)), _mmw_sub_epi32(bbTileSizeY, _mmw_set1_epi32(1)));
+		triMask &= ~_mmw_movemask_ps(simd_cast<__mw>(bboxSign)) & SIMD_ALL_LANES_MASK;
+		if (triMask == 0x0)
+			return cullResult;
+
+		if (!TEST_Z)
+			cullResult = CullingResult::VISIBLE;
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Set up screen space depth plane
+		//////////////////////////////////////////////////////////////////////////////
+
+		__mw zPixelDx, zPixelDy;
+		ComputeDepthPlane(pVtxX, pVtxY, pVtxZ, zPixelDx, zPixelDy);
+
+		// Compute z value at min corner of bounding box. Offset to make sure z is conservative for all 8x4 subtiles
+		__mw bbMinXV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinX), pVtxX[0]);
+		__mw bbMinYV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinY), pVtxY[0]);
+		__mw zPlaneOffset = _mmw_fmadd_ps(zPixelDx, bbMinXV0, _mmw_fmadd_ps(zPixelDy, bbMinYV0, pVtxZ[0]));
+		__mw zTileDx = _mmw_mul_ps(zPixelDx, _mmw_set1_ps((float)TILE_WIDTH));
+		__mw zTileDy = _mmw_mul_ps(zPixelDy, _mmw_set1_ps((float)TILE_HEIGHT));
+		if (TEST_Z)
+		{
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH))));
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT))));
+		}
+		else
+		{
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH))));
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT))));
+		}
+
+		// Compute Zmin and Zmax for the triangle (used to narrow the range for difficult tiles)
+		__mw zMin = _mmw_min_ps(pVtxZ[0], _mmw_min_ps(pVtxZ[1], pVtxZ[2]));
+		__mw zMax = _mmw_max_ps(pVtxZ[0], _mmw_max_ps(pVtxZ[1], pVtxZ[2]));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Sort vertices (v0 has lowest Y, and the rest is in winding order) and
+		// compute edges. Also find the middle vertex and compute tile
+		//////////////////////////////////////////////////////////////////////////////
+
+#if PRECISE_COVERAGE != 0
+
+		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+		SortVertices(ipVtxX, ipVtxY);
+
+		// Compute edges
+		__mwi edgeX[3] = { _mmw_sub_epi32(ipVtxX[1], ipVtxX[0]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[1]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[0]) };
+		__mwi edgeY[3] = { _mmw_sub_epi32(ipVtxY[1], ipVtxY[0]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[1]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[0]) };
+
+		// Classify if the middle vertex is on the left or right and compute its position
+		int midVtxRight = ~_mmw_movemask_ps(simd_cast<__mw>(edgeY[1]));
+		__mwi midPixelX = _mmw_blendv_epi32(ipVtxX[1], ipVtxX[2], edgeY[1]);
+		__mwi midPixelY = _mmw_blendv_epi32(ipVtxY[1], ipVtxY[2], edgeY[1]);
+		__mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(midPixelY, SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT + FP_BITS);
+		__mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY));
+
+		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of 
+		// the edge originating from the middle vertex.
+		__mwi xDiffi[2], yDiffi[2];
+		xDiffi[0] = _mmw_sub_epi32(ipVtxX[0], _mmw_slli_epi32(bbPixelMinX, FP_BITS));
+		xDiffi[1] = _mmw_sub_epi32(midPixelX, _mmw_slli_epi32(bbPixelMinX, FP_BITS));
+		yDiffi[0] = _mmw_sub_epi32(ipVtxY[0], _mmw_slli_epi32(bbPixelMinY, FP_BITS));
+		yDiffi[1] = _mmw_sub_epi32(midPixelY, _mmw_slli_epi32(bbMidTileY, FP_BITS + TILE_HEIGHT_SHIFT));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Edge slope setup - Note we do not conform to DX/GL rasterization rules
+		//////////////////////////////////////////////////////////////////////////////
+
+		// Potentially flip edge to ensure that all edges have positive Y slope.
+		edgeX[1] = _mmw_blendv_epi32(edgeX[1], _mmw_neg_epi32(edgeX[1]), edgeY[1]);
+		edgeY[1] = _mmw_abs_epi32(edgeY[1]);
+
+		// Compute floating point slopes
+		__mw slope[3];
+		slope[0] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[0]), _mmw_cvtepi32_ps(edgeY[0]));
+		slope[1] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[1]), _mmw_cvtepi32_ps(edgeY[1]));
+		slope[2] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[2]), _mmw_cvtepi32_ps(edgeY[2]));
+
+		// Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
+		// width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that 
+		// vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
+		__mw  horizontalSlopeDelta = _mmw_set1_ps(2.0f * ((float)mWidth + 2.0f*(GUARD_BAND_PIXEL_SIZE + 1.0f)));
+		__mwi horizontalSlope0 = _mmw_cmpeq_epi32(edgeY[0], _mmw_setzero_epi32());
+		__mwi horizontalSlope1 = _mmw_cmpeq_epi32(edgeY[1], _mmw_setzero_epi32());
+		slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, simd_cast<__mw>(horizontalSlope0));
+		slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), simd_cast<__mw>(horizontalSlope1));
+
+		__mwi vy[3] = { yDiffi[0], yDiffi[1], yDiffi[0] };
+		__mwi offset0 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[0], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS)));
+		__mwi offset1 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[1], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS)));
+		vy[0] = _mmw_blendv_epi32(yDiffi[0], offset0, horizontalSlope0);
+		vy[1] = _mmw_blendv_epi32(yDiffi[1], offset1, horizontalSlope1);
+
+		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of 
+		// the edge originating from the middle vertex.
+		__mwi slopeSign[3], absEdgeX[3];
+		__mwi slopeTileDelta[3], eventStartRemainder[3], slopeTileRemainder[3], eventStart[3];
+		for (int i = 0; i < 3; i++)
+		{
+			// Common, compute slope sign (used to propagate the remainder term when overflowing) is postive or negative x-direction
+			slopeSign[i] = _mmw_blendv_epi32(_mmw_set1_epi32(1), _mmw_set1_epi32(-1), edgeX[i]);
+			absEdgeX[i] = _mmw_abs_epi32(edgeX[i]);
+
+			// Delta and error term for one vertical tile step. The exact delta is exactDelta = edgeX / edgeY, due to limited precision we 
+			// repersent the delta as delta = qoutient + remainder / edgeY, where quotient = int(edgeX / edgeY). In this case, since we step 
+			// one tile of scanlines at a time, the slope is computed for a tile-sized step.
+			slopeTileDelta[i] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_set1_ps(FP_TILE_HEIGHT)));
+			slopeTileRemainder[i] = _mmw_sub_epi32(_mmw_slli_epi32(absEdgeX[i], FP_TILE_HEIGHT_SHIFT), _mmw_mullo_epi32(_mmw_abs_epi32(slopeTileDelta[i]), edgeY[i]));
+
+			// Jump to bottom scanline of tile row, this is the bottom of the bounding box, or the middle vertex of the triangle.
+			// The jump can be in both positive and negative y-direction due to clipping / offscreen vertices.
+			__mwi tileStartDir = _mmw_blendv_epi32(slopeSign[i], _mmw_neg_epi32(slopeSign[i]), vy[i]);
+			__mwi tieBreaker = _mmw_blendv_epi32(_mmw_set1_epi32(0), _mmw_set1_epi32(1), tileStartDir);
+			__mwi tileStartSlope = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_cvtepi32_ps(_mmw_neg_epi32(vy[i]))));
+			__mwi tileStartRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(absEdgeX[i], _mmw_abs_epi32(vy[i])), _mmw_mullo_epi32(_mmw_abs_epi32(tileStartSlope), edgeY[i]));
+			
+			eventStartRemainder[i] = _mmw_sub_epi32(tileStartRemainder, tieBreaker);
+			__mwi overflow = _mmw_srai_epi32(eventStartRemainder[i], 31);
+			eventStartRemainder[i] = _mmw_add_epi32(eventStartRemainder[i], _mmw_and_epi32(overflow, edgeY[i]));
+			eventStartRemainder[i] = _mmw_blendv_epi32(eventStartRemainder[i], _mmw_sub_epi32(_mmw_sub_epi32(edgeY[i], eventStartRemainder[i]), _mmw_set1_epi32(1)), vy[i]);
+			
+			//eventStart[i] = xDiffi[i & 1] + tileStartSlope + (overflow & tileStartDir) + _mmw_set1_epi32(FP_HALF_PIXEL - 1) + tieBreaker;
+			eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(xDiffi[i & 1], tileStartSlope), _mmw_and_epi32(overflow, tileStartDir));
+			eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(eventStart[i], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), tieBreaker);
+		}
+
+#else // PRECISE_COVERAGE
+
+		SortVertices(pVtxX, pVtxY);
+
+		// Compute edges
+		__mw edgeX[3] = { _mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxX[2], pVtxX[1]), _mmw_sub_ps(pVtxX[2], pVtxX[0]) };
+		__mw edgeY[3] = { _mmw_sub_ps(pVtxY[1], pVtxY[0]), _mmw_sub_ps(pVtxY[2], pVtxY[1]), _mmw_sub_ps(pVtxY[2], pVtxY[0]) };
+
+		// Classify if the middle vertex is on the left or right and compute its position
+		int midVtxRight = ~_mmw_movemask_ps(edgeY[1]);
+		__mw midPixelX = _mmw_blendv_ps(pVtxX[1], pVtxX[2], edgeY[1]);
+		__mw midPixelY = _mmw_blendv_ps(pVtxY[1], pVtxY[2], edgeY[1]);
+		__mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(_mmw_cvttps_epi32(midPixelY), SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT);
+		__mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Edge slope setup - Note we do not conform to DX/GL rasterization rules
+		//////////////////////////////////////////////////////////////////////////////
+
+		// Compute floating point slopes
+		__mw slope[3];
+		slope[0] = _mmw_div_ps(edgeX[0], edgeY[0]);
+		slope[1] = _mmw_div_ps(edgeX[1], edgeY[1]);
+		slope[2] = _mmw_div_ps(edgeX[2], edgeY[2]);
+
+		// Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
+		// width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that 
+		// vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
+		__mw horizontalSlopeDelta = _mmw_set1_ps((float)mWidth + 2.0f*(GUARD_BAND_PIXEL_SIZE + 1.0f));
+		slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, _mmw_cmpeq_ps(edgeY[0], _mmw_setzero_ps()));
+		slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), _mmw_cmpeq_ps(edgeY[1], _mmw_setzero_ps()));
+
+		// Convert floaing point slopes to fixed point
+		__mwi slopeFP[3];
+		slopeFP[0] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[0], _mmw_set1_ps(1 << FP_BITS)));
+		slopeFP[1] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[1], _mmw_set1_ps(1 << FP_BITS)));
+		slopeFP[2] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[2], _mmw_set1_ps(1 << FP_BITS)));
+
+		// Fan out edge slopes to avoid (rare) cracks at vertices. We increase right facing slopes 
+		// by 1 LSB, which results in overshooting vertices slightly, increasing triangle coverage. 
+		// e0 is always right facing, e1 depends on if the middle vertex is on the left or right
+		slopeFP[0] = _mmw_add_epi32(slopeFP[0], _mmw_set1_epi32(1));
+		slopeFP[1] = _mmw_add_epi32(slopeFP[1], _mmw_srli_epi32(_mmw_not_epi32(simd_cast<__mwi>(edgeY[1])), 31));
+
+		// Compute slope deltas for an SIMD_LANES scanline step (tile height)
+		__mwi slopeTileDelta[3];
+		slopeTileDelta[0] = _mmw_slli_epi32(slopeFP[0], TILE_HEIGHT_SHIFT);
+		slopeTileDelta[1] = _mmw_slli_epi32(slopeFP[1], TILE_HEIGHT_SHIFT);
+		slopeTileDelta[2] = _mmw_slli_epi32(slopeFP[2], TILE_HEIGHT_SHIFT);
+
+		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of 
+		// the edge originating from the middle vertex.
+		__mwi xDiffi[2], yDiffi[2];
+		xDiffi[0] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(pVtxX[0]), bbPixelMinX), FP_BITS);
+		xDiffi[1] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(midPixelX), bbPixelMinX), FP_BITS);
+		yDiffi[0] = _mmw_sub_epi32(_mmw_cvttps_epi32(pVtxY[0]), bbPixelMinY);
+		yDiffi[1] = _mmw_sub_epi32(_mmw_cvttps_epi32(midPixelY), _mmw_slli_epi32(bbMidTileY, TILE_HEIGHT_SHIFT));
+
+		__mwi eventStart[3];
+		eventStart[0] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[0], yDiffi[0]));
+		eventStart[1] = _mmw_sub_epi32(xDiffi[1], _mmw_mullo_epi32(slopeFP[1], yDiffi[1]));
+		eventStart[2] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[2], yDiffi[0]));
+#endif
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Split bounding box into bottom - middle - top region.
+		//////////////////////////////////////////////////////////////////////////////
+
+		__mwi bbBottomIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(bbTileMinY, _mmw_set1_epi32(mTilesWidth)));
+		__mwi bbTopIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(_mmw_add_epi32(bbTileMinY, bbTileSizeY), _mmw_set1_epi32(mTilesWidth)));
+		__mwi bbMidIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(midTileY, _mmw_set1_epi32(mTilesWidth)));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Loop over non-culled triangle and change SIMD axis to per-pixel
+		//////////////////////////////////////////////////////////////////////////////
+		while (triMask)
+		{
+			unsigned int triIdx = find_clear_lsb(&triMask);
+			int triMidVtxRight = (midVtxRight >> triIdx) & 1;
+
+			// Get Triangle Zmin zMax
+			__mw zTriMax = _mmw_set1_ps(simd_f32(zMax)[triIdx]);
+			__mw zTriMin = _mmw_set1_ps(simd_f32(zMin)[triIdx]);
+
+			// Setup Zmin value for first set of 8x4 subtiles
+			__mw z0 = _mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDx)[triIdx]), SIMD_SUB_TILE_COL_OFFSET_F,
+				_mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDy)[triIdx]), SIMD_SUB_TILE_ROW_OFFSET_F, _mmw_set1_ps(simd_f32(zPlaneOffset)[triIdx])));
+			float zx = simd_f32(zTileDx)[triIdx];
+			float zy = simd_f32(zTileDy)[triIdx];
+
+			// Get dimension of bounding box bottom, mid & top segments
+			int bbWidth = simd_i32(bbTileSizeX)[triIdx];
+			int bbHeight = simd_i32(bbTileSizeY)[triIdx];
+			int tileRowIdx = simd_i32(bbBottomIdx)[triIdx];
+			int tileMidRowIdx = simd_i32(bbMidIdx)[triIdx];
+			int tileEndRowIdx = simd_i32(bbTopIdx)[triIdx];
+
+			if (bbWidth > BIG_TRIANGLE && bbHeight > BIG_TRIANGLE) // For big triangles we use a more expensive but tighter traversal algorithm
+			{
+#if PRECISE_COVERAGE != 0
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+#else
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+#endif
+			}
+			else
+			{
+#if PRECISE_COVERAGE != 0
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+#else
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+#endif
+			}
+
+			if (TEST_Z && cullResult == CullingResult::VISIBLE)
+				return CullingResult::VISIBLE;
+		}
+
+		return cullResult;
+	}
+
+	// Shared implementation behind the public RenderTriangles() / TestTriangles()
+	// entry points. TEST_Z == 0: rasterize occluder triangles into the masked
+	// hierarchical z-buffer. TEST_Z == 1: occlusion-test only, returning VISIBLE
+	// as soon as any triangle passes the depth test (no buffer writes on the
+	// early-out path). FAST_GATHER selects the optimized vertex fetch for the
+	// common tightly packed XYZW layout (stride 16, offsets 4/12).
+	template<int TEST_Z, int FAST_GATHER>
+	FORCE_INLINE CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout)
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		if (TEST_Z)
+			STATS_ADD(mStats.mOccludees.mNumProcessedTriangles, nTris);
+		else
+			STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris);
+
+#if PRECISE_COVERAGE != 0
+		// The precise-coverage path depends on round-to-nearest float->int
+		// conversions; save the caller's FP rounding mode and restore it on
+		// every exit path below.
+		int originalRoundingMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+#endif
+
+		// Ring buffer holding triangles produced by near/guard-band clipping,
+		// consumed by GatherTransformClip() on later iterations.
+		int clipHead = 0;
+		int clipTail = 0;
+		__m128 clipTriBuffer[MAX_CLIPPED * 3];
+		// Start fully culled; each rasterized batch AND:s its result in.
+		int cullResult = CullingResult::VIEW_CULLED;
+
+		const unsigned int *inTrisPtr = inTris;
+		int numLanes = SIMD_LANES;
+		int triIndex = 0;
+		while (triIndex < nTris || clipHead != clipTail)
+		{
+            __mw vtxX[3], vtxY[3], vtxW[3];
+            unsigned int triMask = SIMD_ALL_LANES_MASK;
+
+            GatherTransformClip<FAST_GATHER>( clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask );
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Project, transform to screen space and perform backface culling. Note 
+			// that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
+			// z = 1 is near. We must also use a greater than depth test, and in effect
+			// everything is reversed compared to regular z implementations.
+			//////////////////////////////////////////////////////////////////////////////
+
+			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
+
+#if PRECISE_COVERAGE != 0
+			__mwi ipVtxX[3], ipVtxY[3];
+			ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#else
+			ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#endif
+
+			// Perform backface test. 
+			__mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0]));
+			__mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1]));
+			__mw triArea = _mmw_sub_ps(triArea1, triArea2);
+			__mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps());
+
+#if PRECISE_COVERAGE != 0
+			triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#else
+			triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#endif
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Setup and rasterize a SIMD batch of triangles
+			//////////////////////////////////////////////////////////////////////////////
+#if PRECISE_COVERAGE != 0
+			cullResult &= RasterizeTriangleBatch<TEST_Z>(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor);
+#else
+			cullResult &= RasterizeTriangleBatch<TEST_Z>(pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor);
+#endif
+
+			// When only testing, any visible triangle lets us terminate early.
+			if (TEST_Z && cullResult == CullingResult::VISIBLE) {
+#if PRECISE_COVERAGE != 0
+				_MM_SET_ROUNDING_MODE(originalRoundingMode);
+#endif
+				return CullingResult::VISIBLE;
+			}
+		}
+
+#if PRECISE_COVERAGE != 0
+		_MM_SET_ROUNDING_MODE(originalRoundingMode);
+#endif
+		return (CullingResult)cullResult;
+	}
+
+	// Public occluder-rendering entry point. Chooses the fast vertex-gather
+	// specialization when the layout matches the standard packed XYZW format
+	// (stride 16, Y at offset 4, W at offset 12), then forwards to the
+	// templated implementation with TEST_Z disabled.
+	CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
+	{
+		const bool useFastGather = vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12;
+
+		CullingResult result = useFastGather ?
+			(CullingResult)RenderTriangles<0, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout) :
+			(CullingResult)RenderTriangles<0, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+
+#if MOC_RECORDER_ENABLE
+        RecordRenderTriangles( inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout, result );
+#endif
+		return result;
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Occlusion query functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	// Public occlusion-query entry point for triangle lists. Selects the fast
+	// vertex-gather specialization for the standard packed XYZW layout and
+	// forwards to the templated implementation with TEST_Z enabled, so no
+	// depth-buffer writes occur.
+	CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
+	{
+		const bool useFastGather = vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12;
+
+		CullingResult result = useFastGather ?
+			(CullingResult)RenderTriangles<1, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout) :
+			(CullingResult)RenderTriangles<1, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+
+#if MOC_RECORDER_ENABLE
+        {
+            std::lock_guard<std::mutex> lock( mRecorderMutex );
+            if( mRecorder != nullptr ) mRecorder->RecordTestTriangles( result, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout );
+        }
+#endif
+        return result;
+	}
+    
+    // Occlusion-tests an axis-aligned screen-space rectangle at the single
+    // conservative depth 1/wmin against the masked hierarchical z-buffer.
+    // Returns VIEW_CULLED if the tile-padded bounding box is degenerate after
+    // clamping to the screen, VISIBLE as soon as any overlapped subtile passes
+    // the conservative depth test, and OCCLUDED otherwise.
+    CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ) const override
+	{
+		STATS_ADD(mStats.mOccludees.mNumProcessedRectangles, 1);
+		assert(mMaskedHiZBuffer != nullptr);
+
+		// Padding/masking constants used to round pixel bounds out to whole
+		// tiles (for traversal) and subtiles (for per-lane rejection).
+		static const __m128i SIMD_TILE_PAD = _mm_setr_epi32(0, TILE_WIDTH, 0, TILE_HEIGHT);
+		static const __m128i SIMD_TILE_PAD_MASK = _mm_setr_epi32(~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1), ~(TILE_HEIGHT - 1), ~(TILE_HEIGHT - 1));
+		static const __m128i SIMD_SUB_TILE_PAD = _mm_setr_epi32(0, SUB_TILE_WIDTH, 0, SUB_TILE_HEIGHT);
+		static const __m128i SIMD_SUB_TILE_PAD_MASK = _mm_setr_epi32(~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_HEIGHT - 1), ~(SUB_TILE_HEIGHT - 1));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Compute screen space bounding box and guard for out of bounds
+		//////////////////////////////////////////////////////////////////////////////
+#if USE_D3D != 0
+		__m128  pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymax, ymin), mIHalfSize, mICenter);
+#else
+		__m128  pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymin, ymax), mIHalfSize, mICenter);
+#endif
+		__m128i pixelBBoxi = _mm_cvttps_epi32(pixelBBox);
+		pixelBBoxi = _mmx_max_epi32(_mm_setzero_si128(), _mmx_min_epi32(mIScreenSize, pixelBBoxi));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Pad bounding box to (32xN) tiles. Tile BB is used for looping / traversal
+		//////////////////////////////////////////////////////////////////////////////
+		__m128i tileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_TILE_PAD), SIMD_TILE_PAD_MASK);
+		int txMin = simd_i32(tileBBoxi)[0] >> TILE_WIDTH_SHIFT;
+		int txMax = simd_i32(tileBBoxi)[1] >> TILE_WIDTH_SHIFT;
+		int tileRowIdx = (simd_i32(tileBBoxi)[2] >> TILE_HEIGHT_SHIFT)*mTilesWidth;
+		int tileRowIdxEnd = (simd_i32(tileBBoxi)[3] >> TILE_HEIGHT_SHIFT)*mTilesWidth;
+
+		// Empty (clamped-away) rectangle: nothing on screen to test.
+		if (simd_i32(tileBBoxi)[0] == simd_i32(tileBBoxi)[1] || simd_i32(tileBBoxi)[2] == simd_i32(tileBBoxi)[3])
+        {
+#if MOC_RECORDER_ENABLE
+            {
+                std::lock_guard<std::mutex> lock( mRecorderMutex );
+                if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::VIEW_CULLED, xmin, ymin, xmax, ymax, wmin );
+            }
+#endif
+            return CullingResult::VIEW_CULLED;
+        }
+
+		///////////////////////////////////////////////////////////////////////////////
+		// Pad bounding box to (8x4) subtiles. Skip SIMD lanes outside the subtile BB
+		///////////////////////////////////////////////////////////////////////////////
+		__m128i subTileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_SUB_TILE_PAD), SIMD_SUB_TILE_PAD_MASK);
+		__mwi stxmin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[0] - 1); // - 1 to be able to use GT test
+		__mwi stymin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[2] - 1); // - 1 to be able to use GT test
+		__mwi stxmax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[1]);
+		__mwi stymax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[3]);
+
+		// Setup pixel coordinates used to discard lanes outside subtile BB
+		__mwi startPixelX = _mmw_add_epi32(SIMD_SUB_TILE_COL_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[0]));
+		__mwi pixelY = _mmw_add_epi32(SIMD_SUB_TILE_ROW_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[2]));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Compute z from w. Note that z is reversed order, 0 = far, 1 = near, which
+		// means we use a greater than test, so zMax is used to test for visibility.
+		//////////////////////////////////////////////////////////////////////////////
+		__mw zMax = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_set1_ps(wmin));
+
+		// Traverse all tiles overlapped by the rectangle, row by row.
+		for (;;)
+		{
+			__mwi pixelX = startPixelX;
+			for (int tx = txMin;;)
+			{
+				STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1);
+
+				int tileIdx = tileRowIdx + tx;
+				assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight);
+
+				// Fetch zMin from masked hierarchical Z buffer
+#if QUICK_MASK != 0
+				__mw zBuf = mMaskedHiZBuffer[tileIdx].mZMin[0];
+#else
+				// Resolve the two z layers conservatively using the coverage mask.
+				__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
+				__mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0))));
+				__mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32())));
+				__mw zBuf = _mmw_min_ps(zMin0, zMin1);
+#endif
+				// Perform conservative greater than test against hierarchical Z buffer (zMax >= zBuf means the subtile is visible)
+				__mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zMax, zBuf));	//zPass = zMax >= zBuf ? ~0 : 0
+
+				// Mask out lanes corresponding to subtiles outside the bounding box
+				__mwi bboxTestMin = _mmw_and_epi32(_mmw_cmpgt_epi32(pixelX, stxmin), _mmw_cmpgt_epi32(pixelY, stymin));
+				__mwi bboxTestMax = _mmw_and_epi32(_mmw_cmpgt_epi32(stxmax, pixelX), _mmw_cmpgt_epi32(stymax, pixelY));
+				__mwi boxMask = _mmw_and_epi32(bboxTestMin, bboxTestMax);
+				zPass = _mmw_and_epi32(zPass, boxMask);
+
+				// If not all tiles failed the conservative z test we can immediately terminate the test
+				if (!_mmw_testz_epi32(zPass, zPass))
+                {
+#if MOC_RECORDER_ENABLE
+                    {
+                        std::lock_guard<std::mutex> lock( mRecorderMutex );
+                        if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::VISIBLE, xmin, ymin, xmax, ymax, wmin );
+                    }
+#endif
+                    return CullingResult::VISIBLE;
+                }
+
+				if (++tx >= txMax)
+					break;
+				pixelX = _mmw_add_epi32(pixelX, _mmw_set1_epi32(TILE_WIDTH));
+			}
+
+			tileRowIdx += mTilesWidth;
+			if (tileRowIdx >= tileRowIdxEnd)
+				break;
+			pixelY = _mmw_add_epi32(pixelY, _mmw_set1_epi32(TILE_HEIGHT));
+		}
+		// Every overlapped subtile failed the depth test: the rect is hidden.
+#if MOC_RECORDER_ENABLE
+        {
+            std::lock_guard<std::mutex> lock( mRecorderMutex );
+            if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::OCCLUDED, xmin, ymin, xmax, ymax, wmin );
+        }
+#endif
+		return CullingResult::OCCLUDED;
+	}
+
+	// Shared implementation behind the public BinTriangles() entry point.
+	// Gathers, transforms and clips triangles, backface-culls them, and then
+	// appends the screen-space vertices of each surviving triangle to the
+	// triangle list (triLists) of every bin its pixel bounding box overlaps,
+	// for later per-bin rasterization. Writes nothing to the depth buffer.
+	template<bool FAST_GATHER>
+	FORCE_INLINE void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout)
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+#if PRECISE_COVERAGE != 0
+		// Precise coverage depends on round-to-nearest conversions; restore the
+		// caller's FP rounding mode before returning.
+		int originalRoundingMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+#endif
+
+		STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris);
+
+		// Ring buffer of triangles produced by clipping, drained by
+		// GatherTransformClip() on subsequent loop iterations.
+		int clipHead = 0;
+		int clipTail = 0;
+		__m128 clipTriBuffer[MAX_CLIPPED * 3];
+
+		const unsigned int *inTrisPtr = inTris;
+		int numLanes = SIMD_LANES;
+		int triIndex = 0;
+		while (triIndex < nTris || clipHead != clipTail)
+		{
+            unsigned int triMask = SIMD_ALL_LANES_MASK;
+            __mw vtxX[3], vtxY[3], vtxW[3];
+
+            GatherTransformClip<FAST_GATHER>( clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask );
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Project, transform to screen space and perform backface culling. Note 
+			// that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
+			// z = 1 is near. We must also use a greater than depth test, and in effect
+			// everything is reversed compared to regular z implementations.
+			//////////////////////////////////////////////////////////////////////////////
+
+			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
+
+#if PRECISE_COVERAGE != 0
+			__mwi ipVtxX[3], ipVtxY[3];
+			ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#else
+			ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#endif
+
+			// Perform backface test. 
+			__mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0]));
+			__mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1]));
+			__mw triArea = _mmw_sub_ps(triArea1, triArea2);
+			__mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps());
+
+#if PRECISE_COVERAGE != 0
+			triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#else
+			triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#endif
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Bin triangles
+			//////////////////////////////////////////////////////////////////////////////
+
+			unsigned int binWidth;
+			unsigned int binHeight;
+			ComputeBinWidthHeight(nBinsW, nBinsH, binWidth, binHeight);
+
+			// Compute pixel bounding box
+			__mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY;
+			ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, &mFullscreenScissor);
+
+			// Scatter each surviving SIMD lane (triangle) into all bins its
+			// bounding box touches.
+			while (triMask)
+			{
+				unsigned int triIdx = find_clear_lsb(&triMask);
+
+				// Clamp bounding box to bins
+				int startX = min(nBinsW-1, simd_i32(bbPixelMinX)[triIdx] / binWidth);
+				int startY = min(nBinsH-1, simd_i32(bbPixelMinY)[triIdx] / binHeight);
+				int endX = min(nBinsW, (simd_i32(bbPixelMaxX)[triIdx] + binWidth - 1) / binWidth);
+				int endY = min(nBinsH, (simd_i32(bbPixelMaxY)[triIdx] + binHeight - 1) / binHeight);
+
+				for (int y = startY; y < endY; ++y)
+				{
+					for (int x = startX; x < endX; ++x)
+					{
+						int binIdx = x + y * nBinsW;
+						unsigned int writeTriIdx = triLists[binIdx].mTriIdx;
+						// 9 floats per triangle: x, y, z for each of 3 vertices.
+						// With PRECISE_COVERAGE, x/y are stored as fixed-point ints.
+						for (int i = 0; i < 3; ++i)
+						{
+#if PRECISE_COVERAGE != 0
+							((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 0] = simd_i32(ipVtxX[i])[triIdx];
+							((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 1] = simd_i32(ipVtxY[i])[triIdx];
+#else
+							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 0] = simd_f32(pVtxX[i])[triIdx];
+							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 1] = simd_f32(pVtxY[i])[triIdx];
+#endif
+							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 2] = simd_f32(pVtxZ[i])[triIdx];
+						}
+						triLists[binIdx].mTriIdx++;
+					}
+				}
+			}
+		}
+#if PRECISE_COVERAGE != 0
+		_MM_SET_ROUNDING_MODE(originalRoundingMode);
+#endif
+	}
+
+	// Public binning entry point. Detects the standard packed XYZW vertex
+	// layout (stride 16, Y at offset 4, W at offset 12) and dispatches to the
+	// matching gather specialization of the templated implementation.
+	void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
+	{
+		const bool fastGather = vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12;
+		if (fastGather)
+			BinTriangles<true>(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+		else
+			BinTriangles<false>(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+	}
+
+    template<int FAST_GATHER>
+    void GatherTransformClip( int & clipHead, int & clipTail, int & numLanes, int nTris, int & triIndex, __mw * vtxX, __mw * vtxY, __mw * vtxW, const float * inVtx, const unsigned int * &inTrisPtr, const VertexLayout & vtxLayout, const float * modelToClipMatrix, __m128 * clipTriBuffer, unsigned int &triMask, ClipPlanes clipPlaneMask )
+    {
+        //////////////////////////////////////////////////////////////////////////////
+        // Assemble triangles from the index list 
+        //////////////////////////////////////////////////////////////////////////////
+        unsigned int triClipMask = SIMD_ALL_LANES_MASK;
+
+        if( clipHead != clipTail )
+        {
+            int clippedTris = clipHead > clipTail ? clipHead - clipTail : MAX_CLIPPED + clipHead - clipTail;
+            clippedTris = min( clippedTris, SIMD_LANES );
+
+#if CLIPPING_PRESERVES_ORDER != 0
+            // if preserving order, don't mix clipped and new triangles, handle the clip buffer fully
+            // and then continue gathering; this is not as efficient - ideally we want to gather
+            // at the end (if clip buffer has less than SIMD_LANES triangles) but that requires
+            // more modifications below - something to do in the future.
+            numLanes = 0;
+#else
+            // Fill out SIMD registers by fetching more triangles. 
+            numLanes = max( 0, min( SIMD_LANES - clippedTris, nTris - triIndex ) );
+#endif
+
+            if( numLanes > 0 ) {
+                if( FAST_GATHER )
+                    GatherVerticesFast( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes );
+                else
+                    GatherVertices( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout );
+
+                TransformVerts( vtxX, vtxY, vtxW, modelToClipMatrix );
+            }
+
+            for( int clipTri = numLanes; clipTri < numLanes + clippedTris; clipTri++ )
+            {
+                int triIdx = clipTail * 3;
+                for( int i = 0; i < 3; i++ )
+                {
+                    simd_f32( vtxX[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[0];
+                    simd_f32( vtxY[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[1];
+                    simd_f32( vtxW[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[2];
+                }
+                clipTail = ( clipTail + 1 ) & ( MAX_CLIPPED - 1 );
+            }
+
+            triIndex += numLanes;
+            inTrisPtr += numLanes * 3;
+
+            triMask = ( 1U << ( clippedTris + numLanes ) ) - 1;
+            triClipMask = ( 1U << numLanes ) - 1; // Don't re-clip already clipped triangles
+        }
+        else
+        {
+            numLanes = min( SIMD_LANES, nTris - triIndex );
+            triMask = ( 1U << numLanes ) - 1;
+            triClipMask = triMask;
+
+            if( FAST_GATHER )
+                GatherVerticesFast( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes );
+            else
+                GatherVertices( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout );
+
+            TransformVerts( vtxX, vtxY, vtxW, modelToClipMatrix );
+
+            triIndex += SIMD_LANES;
+            inTrisPtr += SIMD_LANES * 3;
+        }
+
+        //////////////////////////////////////////////////////////////////////////////
+        // Clip transformed triangles
+        //////////////////////////////////////////////////////////////////////////////
+
+        if( clipPlaneMask != ClipPlanes::CLIP_PLANE_NONE )
+            ClipTriangleAndAddToBuffer( vtxX, vtxY, vtxW, clipTriBuffer, clipHead, triMask, triClipMask, clipPlaneMask );
+    }
+
+	void RenderTrilist(const TriList &triList, const ScissorRect *scissor) override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		// Setup fullscreen scissor rect as default
+		scissor = scissor == nullptr ? &mFullscreenScissor : scissor;
+
+		for (unsigned int i = 0; i < triList.mTriIdx; i += SIMD_LANES)
+		{
+			//////////////////////////////////////////////////////////////////////////////
+			// Fetch triangle vertices
+			//////////////////////////////////////////////////////////////////////////////
+
+			unsigned int numLanes = min((unsigned int)SIMD_LANES, triList.mTriIdx - i);
+			unsigned int triMask = (1U << numLanes) - 1;
+
+			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
+#if PRECISE_COVERAGE != 0
+			__mwi ipVtxX[3], ipVtxY[3];
+			for (unsigned int l = 0; l < numLanes; ++l)
+			{
+				unsigned int triIdx = i + l;
+				for (int v = 0; v < 3; ++v)
+				{
+					simd_i32(ipVtxX[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 0];
+					simd_i32(ipVtxY[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 1];
+					simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2];
+				}
+			}
+
+			for (int v = 0; v < 3; ++v)
+			{
+				pVtxX[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[v]), _mmw_set1_ps(FP_INV));
+				pVtxY[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[v]), _mmw_set1_ps(FP_INV));
+			}
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Setup and rasterize a SIMD batch of triangles
+			//////////////////////////////////////////////////////////////////////////////
+
+			RasterizeTriangleBatch<false>(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, scissor);
+#else
+			for (unsigned int l = 0; l < numLanes; ++l)
+			{
+				unsigned int triIdx = i + l;
+				for (int v = 0; v < 3; ++v)
+				{
+					simd_f32(pVtxX[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 0];
+					simd_f32(pVtxY[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 1];
+					simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2];
+				}
+			}
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Setup and rasterize a SIMD batch of triangles
+			//////////////////////////////////////////////////////////////////////////////
+
+			RasterizeTriangleBatch<false>(pVtxX, pVtxY, pVtxZ, triMask, scissor);
+#endif
+
+		}
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Debugging and statistics
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCulling::Implementation GetImplementation() override
+	{
+		return gInstructionSet;
+	}
+
+	void ComputePixelDepthBuffer(float *depthData, bool flipY) override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+		for (int y = 0; y < mHeight; y++)
+		{
+			for (int x = 0; x < mWidth; x++)
+			{
+				// Compute 32xN tile index (SIMD value offset)
+				int tx = x / TILE_WIDTH;
+				int ty = y / TILE_HEIGHT;
+				int tileIdx = ty * mTilesWidth + tx;
+
+				// Compute 8x4 subtile index (SIMD lane offset)
+				int stx = (x % TILE_WIDTH) / SUB_TILE_WIDTH;
+				int sty = (y % TILE_HEIGHT) / SUB_TILE_HEIGHT;
+				int subTileIdx = sty * 4 + stx;
+
+				// Compute pixel index in subtile (bit index in 32-bit word)
+				int px = (x % SUB_TILE_WIDTH);
+				int py = (y % SUB_TILE_HEIGHT);
+				int bitIdx = py * 8 + px;
+
+				int pixelLayer = (simd_i32(mMaskedHiZBuffer[tileIdx].mMask)[subTileIdx] >> bitIdx) & 1;
+				float pixelDepth = simd_f32(mMaskedHiZBuffer[tileIdx].mZMin[pixelLayer])[subTileIdx];
+
+                if( flipY )
+                    depthData[( mHeight - y - 1 ) * mWidth + x] = pixelDepth;
+                else
+                    depthData[y * mWidth + x] = pixelDepth;
+			}
+		}
+	}
+
+	OcclusionCullingStatistics GetStatistics() override
+	{
+		return mStats;
+	}
+
+};

+ 6 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/PackageInfo.json

@@ -0,0 +1,6 @@
+{
+    "PackageName": "Masked Occlusion Culling",
+    "URL": "https://software.intel.com/content/www/us/en/develop/articles/masked-software-occlusion-culling.html",
+    "License": "Apache 2.0",
+    "LicenseFile": "LICENSE.txt"
+}

+ 19 - 5
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Culling.h

@@ -31,7 +31,6 @@
 #include <AzFramework/Visibility/IVisibilitySystem.h>
 
 #include <Atom/RPI.Public/View.h>
-
 #include <Atom/RHI/DrawList.h>
 
 #include <AtomCore/std/parallel/concurrency_checker.h>
@@ -97,6 +96,9 @@ namespace AZ
             };
             LodData m_lodData;
 
+            //! Flag indicating if the object is visible, i.e., was not culled out in the last frame
+            bool m_isVisible = true;
+
             void SetDebugName([[maybe_unused]] const AZ::Name& debugName)
             {
 #ifdef AZ_CULL_DEBUG_ENABLED
@@ -213,6 +215,21 @@ namespace AZ
             void Activate(const class Scene* parentScene);
             void Deactivate();
 
+            struct OcclusionPlane
+            {
+                // World space corners of the occlusion plane
+                Vector3 m_cornerBL;
+                Vector3 m_cornerTL;
+                Vector3 m_cornerTR;
+                Vector3 m_cornerBR;
+
+                Aabb m_aabb;
+            };
+            using OcclusionPlaneVector = AZStd::vector<OcclusionPlane>;
+
+            //! Sets a list of occlusion planes to be used during the culling process.
+            void SetOcclusionPlanes(const OcclusionPlaneVector& occlusionPlanes) { m_occlusionPlanes = occlusionPlanes; }
+
             //! Notifies the CullingScene that culling will begin for this frame.
             void BeginCulling(const AZStd::vector<ViewPtr>& views);
 
@@ -251,12 +268,9 @@ namespace AZ
 
             const Scene* m_parentScene = nullptr;
             AzFramework::IVisibilityScene* m_visScene = nullptr;
-
             CullingDebugContext m_debugCtx;
-
             AZStd::concurrency_checker m_cullDataConcurrencyCheck;
-
-            AZStd::mutex m_mutex;
+            OcclusionPlaneVector m_occlusionPlanes;
         };
         
 

+ 12 - 1
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/View.h

@@ -24,6 +24,8 @@
 #include <AzCore/std/containers/vector.h>
 #include <AzCore/Name/Name.h>
 
+class MaskedOcclusionCulling;
+
 namespace AZ
 {
     namespace  RHI
@@ -57,7 +59,7 @@ namespace AZ
             //! Only use this function to create a new view object. And force using smart pointer to manage view's life time
             static ViewPtr CreateView(const AZ::Name& name, UsageFlags usage);
 
-            ~View() = default;
+            ~View();
 
             void SetDrawListMask(const RHI::DrawListMask& drawListMask);
             RHI::DrawListMask GetDrawListMask() const { return m_drawListMask; }
@@ -126,6 +128,12 @@ namespace AZ
             //! Notifies consumers when the world to clip matrix has changed.
             void ConnectWorldToClipMatrixChangedHandler(MatrixChangedEvent::Handler& handler);
 
+            //! Prepare for view culling
+            void BeginCulling();
+
+            //! Returns the masked occlusion culling interface
+            MaskedOcclusionCulling* GetMaskedOcclusionCulling();
+
         private:
             View() = delete;
             View(const AZ::Name& name, UsageFlags usage);
@@ -193,6 +201,9 @@ namespace AZ
 
             MatrixChangedEvent m_onWorldToClipMatrixChange;
             MatrixChangedEvent m_onWorldToViewMatrixChange;
+
+            // Masked Occlusion Culling interface
+            MaskedOcclusionCulling* m_maskedOcclusionCulling = nullptr;
         };
 
         AZ_DEFINE_ENUM_BITWISE_OPERATORS(View::UsageFlags);

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Android.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Android.h"

+ 13 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/PAL_android.cmake

@@ -0,0 +1,13 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED FALSE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/platform_android_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Android.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Linux.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Linux.h"

+ 1 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/PAL_linux.cmake

@@ -10,3 +10,4 @@
 #
 
 set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED FALSE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/platform_linux_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Linux.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Mac.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Mac.h"

+ 1 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/PAL_mac.cmake

@@ -10,3 +10,4 @@
 #
 
 set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED TRUE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/platform_mac_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Mac.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Windows.h"

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Windows.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 1

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/PAL_windows.cmake

@@ -10,3 +10,17 @@
 #
 
 set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED TRUE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED TRUE)
+
+ly_add_source_properties(
+    SOURCES External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp
+    PROPERTY COMPILE_OPTIONS
+    VALUES /arch:AVX2 /W3
+)
+ly_add_source_properties(
+    SOURCES 
+        External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp
+        External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp
+    PROPERTY COMPILE_OPTIONS
+    VALUES /W3
+)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/platform_windows_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Windows.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_iOS.h"

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_iOS.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 13 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/PAL_ios.cmake

@@ -0,0 +1,13 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED FALSE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/platform_ios_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_iOS.h
+)

+ 203 - 41
Gems/Atom/RPI/Code/Source/RPI.Public/Culling.cpp

@@ -20,15 +20,20 @@
 
 #include <Atom/RHI/CpuProfiler.h>
 
+#include <AzCore/Math/MatrixUtils.h>
 #include <AzCore/Math/ShapeIntersection.h>
 #include <AzCore/Casting/numeric_cast.h>
-
 #include <AzCore/std/parallel/lock.h>
 #include <AzCore/Casting/numeric_cast.h>
 #include <AzCore/Debug/EventTrace.h>
 #include <AzCore/Debug/Timer.h>
 #include <AzCore/Jobs/JobFunction.h>
 #include <AzCore/Jobs/Job.h>
+#include <Atom_RPI_Traits_Platform.h>
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+#include <MaskedOcclusionCulling/MaskedOcclusionCulling.h>
+#endif
 
 //Enables more inner-loop profiling scopes (can create high overhead in RadTelemetry if there are many-many objects in a scene)
 //#define AZ_CULL_PROFILE_DETAILED
@@ -272,21 +277,26 @@ namespace AZ
         public:
             AZ_CLASS_ALLOCATOR(AddObjectsToViewJob, ThreadPoolAllocator, 0);
 
+            struct JobData
+            {
+                CullingDebugContext* m_debugCtx = nullptr;
+                const Scene* m_scene = nullptr;
+                View* m_view = nullptr;
+                Frustum m_frustum;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                MaskedOcclusionCulling* m_maskedOcclusionCulling = nullptr;
+#endif
+            };
+
         private:
-            CullingDebugContext* m_debugCtx;
-            const Scene* m_scene;
-            View* m_view;
-            Frustum m_frustum;
+            const AZStd::shared_ptr<JobData> m_jobData;
             CullingScene::WorkListType m_worklist;
 
         public:
-            AddObjectsToViewJob(CullingDebugContext& debugCtx, const Scene& scene, View& view, Frustum& frustum, CullingScene::WorkListType& worklist)
+            AddObjectsToViewJob(const AZStd::shared_ptr<AddObjectsToViewJob::JobData>& jobData, CullingScene::WorkListType& worklist)
                 : Job(true, nullptr)        //auto-deletes, no JobContext
-                , m_debugCtx(&debugCtx)
-                , m_scene(&scene)
-                , m_view(&view)
-                , m_frustum(frustum)                 //capture by value
-                , m_worklist(AZStd::move(worklist))  //capture by value
+                , m_jobData(jobData)
+                , m_worklist(worklist)
             {
             }
 
@@ -295,37 +305,50 @@ namespace AZ
             {
                 AZ_PROFILE_FUNCTION(Debug::ProfileCategory::AzRender);
 
-                const View::UsageFlags viewFlags = m_view->GetUsageFlags();
-                const RHI::DrawListMask drawListMask = m_view->GetDrawListMask();
+                const View::UsageFlags viewFlags = m_jobData->m_view->GetUsageFlags();
+                const RHI::DrawListMask drawListMask = m_jobData->m_view->GetDrawListMask();
                 uint32_t numDrawPackets = 0;
                 uint32_t numVisibleCullables = 0;
 
                 for (const AzFramework::IVisibilityScene::NodeData& nodeData : m_worklist)
                 {
                     //If a node is entirely contained within the frustum, then we can skip the fine grained culling.
-                    bool nodeIsContainedInFrustum = ShapeIntersection::Contains(m_frustum, nodeData.m_bounds);
+                    bool nodeIsContainedInFrustum = ShapeIntersection::Contains(m_jobData->m_frustum, nodeData.m_bounds);
 
 #ifdef AZ_CULL_PROFILE_VERBOSE
                     AZ_PROFILE_SCOPE_DYNAMIC(Debug::ProfileCategory::AzRender, "process node (view: %s, skip fine cull: %d",
                         m_view->GetName().GetCStr(), nodeIsContainedInFrustum ? 1 : 0);
 #endif
 
-                    if (nodeIsContainedInFrustum || !m_debugCtx->m_enableFrustumCulling)
+                    if (nodeIsContainedInFrustum || !m_jobData->m_debugCtx->m_enableFrustumCulling)
                     {
                         //Add all objects within this node to the view, without any extra culling
                         for (AzFramework::VisibilityEntry* visibleEntry : nodeData.m_entries)
                         {
-                            if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                             {
-                                Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
-                                if ((c->m_cullData.m_drawListMask & drawListMask).none() ||
-                                    c->m_cullData.m_hideFlags & viewFlags ||
-                                    c->m_cullData.m_scene != m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
+                                if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                                 {
-                                    continue;
+                                    Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
+
+                                    // reset visibility flag to false, update to true if all culling checks pass
+                                    c->m_isVisible = false;
+
+                                    if ((c->m_cullData.m_drawListMask & drawListMask).none() ||
+                                        c->m_cullData.m_hideFlags & viewFlags ||
+                                        c->m_cullData.m_scene != m_jobData->m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
+                                    {
+                                        continue;
+                                    }
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                                    if (TestOcclusionCulling(visibleEntry) == MaskedOcclusionCulling::CullingResult::VISIBLE)
+#endif
+                                    {
+                                        numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_jobData->m_view);
+                                        ++numVisibleCullables;
+                                        c->m_isVisible = true;
+                                    }
                                 }
-                                numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_view);
-                                ++numVisibleCullables;
                             }
                         }
                     }
@@ -337,68 +360,78 @@ namespace AZ
                             if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                             {
                                 Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
+
+                                // reset visibility flag to false, update to true if all culling checks pass
+                                c->m_isVisible = false;
+
                                 if ((c->m_cullData.m_drawListMask & drawListMask).none() ||
                                     c->m_cullData.m_hideFlags & viewFlags ||
-                                    c->m_cullData.m_scene != m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
+                                    c->m_cullData.m_scene != m_jobData->m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
                                 {
                                     continue;
                                 }
 
-                                IntersectResult res = ShapeIntersection::Classify(m_frustum, c->m_cullData.m_boundingSphere);
+                                IntersectResult res = ShapeIntersection::Classify(m_jobData->m_frustum, c->m_cullData.m_boundingSphere);
                                 if (res == IntersectResult::Exterior)
                                 {
                                     continue;
                                 }
-                                else if (res == IntersectResult::Interior || ShapeIntersection::Overlaps(m_frustum, c->m_cullData.m_boundingObb))
+                                else if (res == IntersectResult::Interior || ShapeIntersection::Overlaps(m_jobData->m_frustum, c->m_cullData.m_boundingObb))
                                 {
-                                    numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_view);
-                                    ++numVisibleCullables;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                                    if (TestOcclusionCulling(visibleEntry) == MaskedOcclusionCulling::CullingResult::VISIBLE)
+#endif
+                                    {
+                                        numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_jobData->m_view);
+                                        ++numVisibleCullables;
+                                        c->m_isVisible = true;
+                                    }
                                 }
                             }
                         }
                     }
 
-                    if (m_debugCtx->m_debugDraw && (m_view->GetName() == m_debugCtx->m_currentViewSelectionName))
+                    if (m_jobData->m_debugCtx->m_debugDraw && (m_jobData->m_view->GetName() == m_jobData->m_debugCtx->m_currentViewSelectionName))
                     {
                         AZ_PROFILE_SCOPE(Debug::ProfileCategory::AzRender, "debug draw culling");
 
-                        AuxGeomDrawPtr auxGeomPtr = AuxGeomFeatureProcessorInterface::GetDrawQueueForScene(m_scene);
+                        AuxGeomDrawPtr auxGeomPtr = AuxGeomFeatureProcessorInterface::GetDrawQueueForScene(m_jobData->m_scene);
                         if (auxGeomPtr)
                         {
                             //Draw the node bounds
                             // "Fully visible" nodes are nodes that are fully inside the frustum. "Partially visible" nodes intersect the edges of the frustum.
                             // Since the nodes of an octree have lots of overlapping boxes with coplanar edges, it's easier to view these separately, so
                             // we have a few debug booleans to toggle which ones to draw.
-                            if (nodeIsContainedInFrustum && m_debugCtx->m_drawFullyVisibleNodes)
+                            if (nodeIsContainedInFrustum && m_jobData->m_debugCtx->m_drawFullyVisibleNodes)
                             {
                                 auxGeomPtr->DrawAabb(nodeData.m_bounds, Colors::Lime, RPI::AuxGeomDraw::DrawStyle::Line, RPI::AuxGeomDraw::DepthTest::Off);
                             }
-                            else if (!nodeIsContainedInFrustum && m_debugCtx->m_drawPartiallyVisibleNodes)
+                            else if (!nodeIsContainedInFrustum && m_jobData->m_debugCtx->m_drawPartiallyVisibleNodes)
                             {
                                 auxGeomPtr->DrawAabb(nodeData.m_bounds, Colors::Yellow, RPI::AuxGeomDraw::DrawStyle::Line, RPI::AuxGeomDraw::DepthTest::Off);
                             }
 
                             //Draw bounds on individual objects
-                            if (m_debugCtx->m_drawBoundingBoxes || m_debugCtx->m_drawBoundingSpheres || m_debugCtx->m_drawLodRadii)
+                            if (m_jobData->m_debugCtx->m_drawBoundingBoxes || m_jobData->m_debugCtx->m_drawBoundingSpheres || m_jobData->m_debugCtx->m_drawLodRadii)
                             {
                                 for (AzFramework::VisibilityEntry* visibleEntry : nodeData.m_entries)
                                 {
                                     if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                                     {
                                         Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
-                                        if (m_debugCtx->m_drawBoundingBoxes)
+                                        if (m_jobData->m_debugCtx->m_drawBoundingBoxes)
                                         {
                                             auxGeomPtr->DrawObb(c->m_cullData.m_boundingObb, Matrix3x4::Identity(),
                                                 nodeIsContainedInFrustum ? Colors::Lime : Colors::Yellow, AuxGeomDraw::DrawStyle::Line);
                                         }
 
-                                        if (m_debugCtx->m_drawBoundingSpheres)
+                                        if (m_jobData->m_debugCtx->m_drawBoundingSpheres)
                                         {
                                             auxGeomPtr->DrawSphere(c->m_cullData.m_boundingSphere.GetCenter(), c->m_cullData.m_boundingSphere.GetRadius(),
                                                 Color(0.5f, 0.5f, 0.5f, 0.3f), AuxGeomDraw::DrawStyle::Shaded);
                                         }
 
-                                        if (m_debugCtx->m_drawLodRadii)
+                                        if (m_jobData->m_debugCtx->m_drawLodRadii)
                                         {
                                             auxGeomPtr->DrawSphere(c->m_cullData.m_boundingSphere.GetCenter(),
                                                 c->m_lodData.m_lodSelectionRadius,
@@ -411,9 +444,9 @@ namespace AZ
                     }
                 }
 
-                if (m_debugCtx->m_enableStats)
+                if (m_jobData->m_debugCtx->m_enableStats)
                 {
-                    CullingDebugContext::CullStats& cullStats = m_debugCtx->GetCullStatsForView(m_view);
+                    CullingDebugContext::CullStats& cullStats = m_jobData->m_debugCtx->GetCullStatsForView(m_jobData->m_view);
 
                     //no need for mutex here since these are all atomics
                     cullStats.m_numVisibleDrawPackets += numDrawPackets;
@@ -421,6 +454,63 @@ namespace AZ
                     ++cullStats.m_numJobs;
                 }
             }
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            MaskedOcclusionCulling::CullingResult TestOcclusionCulling(AzFramework::VisibilityEntry* visibleEntry)
+            {
+                if (!m_jobData->m_maskedOcclusionCulling)
+                {
+                    return MaskedOcclusionCulling::CullingResult::VISIBLE;
+                }
+
+                if (visibleEntry->m_boundingVolume.Contains(m_jobData->m_view->GetCameraTransform().GetTranslation()))
+                {
+                    // camera is inside bounding volume
+                    return MaskedOcclusionCulling::CullingResult::VISIBLE;
+                }
+
+                const Vector3& minBound = visibleEntry->m_boundingVolume.GetMin();
+                const Vector3& maxBound = visibleEntry->m_boundingVolume.GetMax();
+
+                // compute bounding volume corners
+                Vector4 corners[8];
+                corners[0] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), minBound.GetY(), minBound.GetZ(), 1.0f);
+                corners[1] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), minBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[2] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), minBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[3] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), minBound.GetY(), minBound.GetZ(), 1.0f);
+                corners[4] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), maxBound.GetY(), minBound.GetZ(), 1.0f);
+                corners[5] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), maxBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[6] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), maxBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[7] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), maxBound.GetY(), minBound.GetZ(), 1.0f);
+
+                // find min clip-space depth and NDC min/max
+                float minDepth = FLT_MAX;
+                float ndcMinX = FLT_MAX;
+                float ndcMinY = FLT_MAX;
+                float ndcMaxX = -FLT_MAX;
+                float ndcMaxY = -FLT_MAX;
+                for (uint32_t index = 0; index < 8; ++index)
+                {
+                    minDepth = AZStd::min(minDepth, corners[index].GetW());
+
+                    // convert to NDC
+                    corners[index] /= corners[index].GetW();
+
+                    ndcMinX = AZStd::min(ndcMinX, corners[index].GetX());
+                    ndcMinY = AZStd::min(ndcMinY, corners[index].GetY());
+                    ndcMaxX = AZStd::max(ndcMaxX, corners[index].GetX());
+                    ndcMaxY = AZStd::max(ndcMaxY, corners[index].GetY());
+                }
+
+                if (minDepth < 0.00000001f)
+                {
+                    return MaskedOcclusionCulling::VISIBLE;
+                }
+
+                // test against the occlusion buffer, which contains only the manually placed occlusion planes
+                return m_jobData->m_maskedOcclusionCulling->TestRect(ndcMinX, ndcMinY, ndcMaxX, ndcMaxY, minDepth);
+            }
+#endif
         };
 
         void CullingScene::ProcessCullables(const Scene& scene, View& view, AZ::Job& parentJob)
@@ -454,8 +544,67 @@ namespace AZ
                 cullStats.m_cameraViewToWorld = view.GetViewToWorldMatrix();
             }
 
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            // setup occlusion culling, if necessary
+            MaskedOcclusionCulling* maskedOcclusionCulling = m_occlusionPlanes.empty() ? nullptr : view.GetMaskedOcclusionCulling();
+            if (maskedOcclusionCulling)
+            {
+                // frustum cull occlusion planes
+                using VisibleOcclusionPlane = AZStd::pair<OcclusionPlane, float>;
+                AZStd::vector<VisibleOcclusionPlane> visibleOccluders;
+                for (const auto& occlusionPlane : m_occlusionPlanes)
+                {
+                    if (ShapeIntersection::Overlaps(frustum, occlusionPlane.m_aabb))
+                    {
+                        // occluder is visible, compute view space distance and add to list
+                        float depth = (view.GetWorldToViewMatrix() * occlusionPlane.m_aabb.GetMin()).GetZ();
+                        depth = AZStd::min(depth, (view.GetWorldToViewMatrix() * occlusionPlane.m_aabb.GetMax()).GetZ());
+
+                        visibleOccluders.push_back(AZStd::make_pair(occlusionPlane, depth));
+                    }
+                }
+
+                // sort the occlusion planes by view space distance, front-to-back
+                AZStd::sort(visibleOccluders.begin(), visibleOccluders.end(), [](const VisibleOcclusionPlane& LHS, const VisibleOcclusionPlane& RHS)
+                {
+                    return LHS.second > RHS.second;
+                });
+
+                for (const VisibleOcclusionPlane& occlusionPlane: visibleOccluders)
+                {
+                    // convert to clip-space
+                    Vector4 projectedBL = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerBL);
+                    Vector4 projectedTL = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerTL);
+                    Vector4 projectedTR = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerTR);
+                    Vector4 projectedBR = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerBR);
+
+                    // store to float array
+                    float verts[16];
+                    projectedBL.StoreToFloat4(&verts[0]);
+                    projectedTL.StoreToFloat4(&verts[4]);
+                    projectedTR.StoreToFloat4(&verts[8]);
+                    projectedBR.StoreToFloat4(&verts[12]);
+
+                    static uint32_t indices[6] = { 0, 1, 2, 2, 3, 0 };
+
+                    // render into the occlusion buffer, specifying BACKFACE_NONE so it functions as a double-sided occluder
+                    maskedOcclusionCulling->RenderTriangles((float*)verts, indices, 2, nullptr, MaskedOcclusionCulling::BACKFACE_NONE);
+                }
+            }
+#endif
+
             WorkListType worklist;
-            auto nodeVisitorLambda = [this, &scene, &view, &parentJob, &frustum, &worklist](const AzFramework::IVisibilityScene::NodeData& nodeData) -> void
+
+            AZStd::shared_ptr<AddObjectsToViewJob::JobData> jobData = AZStd::make_shared<AddObjectsToViewJob::JobData>();
+            jobData->m_debugCtx = &m_debugCtx;
+            jobData->m_scene = &scene;
+            jobData->m_view = &view;
+            jobData->m_frustum = frustum;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            jobData->m_maskedOcclusionCulling = maskedOcclusionCulling;
+#endif
+
+            auto nodeVisitorLambda = [this, jobData, &parentJob, &frustum, &worklist](const AzFramework::IVisibilityScene::NodeData& nodeData) -> void
             {
                 AZ_PROFILE_SCOPE(Debug::ProfileCategory::AzRender, "nodeVisitorLambda()");
                 AZ_Assert(nodeData.m_entries.size() > 0, "should not get called with 0 entries");
@@ -468,7 +617,7 @@ namespace AZ
                 if (worklist.size() == worklist.capacity())
                 {
                     //Kick off a job to process the (full) worklist
-                    AddObjectsToViewJob* job = aznew AddObjectsToViewJob(m_debugCtx, scene, view, frustum, worklist); //pool allocated (cheap), auto-deletes when job finishes
+                    AddObjectsToViewJob* job = aznew AddObjectsToViewJob(jobData, worklist); //pool allocated (cheap), auto-deletes when job finishes
                     worklist.clear();
                     parentJob.SetContinuation(job);
                     job->Start();
@@ -486,8 +635,16 @@ namespace AZ
 
             if (worklist.size() > 0)
             {
+                AZStd::shared_ptr<AddObjectsToViewJob::JobData> remainingJobData = AZStd::make_shared<AddObjectsToViewJob::JobData>();
+                remainingJobData->m_debugCtx = &m_debugCtx;
+                remainingJobData->m_scene = &scene;
+                remainingJobData->m_view = &view;
+                remainingJobData->m_frustum = frustum;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                remainingJobData->m_maskedOcclusionCulling = maskedOcclusionCulling;
+#endif
                 //Kick off a job to process any remaining workitems
-                AddObjectsToViewJob* job = aznew AddObjectsToViewJob(m_debugCtx, scene, view, frustum, worklist); //pool allocated (cheap), auto-deletes when job finishes
+                AddObjectsToViewJob* job = aznew AddObjectsToViewJob(remainingJobData, worklist); //pool allocated (cheap), auto-deletes when job finishes
                 parentJob.SetContinuation(job);
                 job->Start();
             }
@@ -576,6 +733,11 @@ namespace AZ
             m_debugCtx.ResetCullStats();
             m_debugCtx.m_numCullablesInScene = GetNumCullables();
 
+            for (auto& view : views)
+            {
+                view->BeginCulling();
+            }
+
             AuxGeomDrawPtr auxGeom;
             if (m_debugCtx.m_debugDraw)
             {

+ 38 - 1
Gems/Atom/RPI/Code/Source/RPI.Public/View.cpp

@@ -15,18 +15,28 @@
 #include <Atom/RPI.Public/RPISystemInterface.h>
 #include <Atom/RPI.Public/Shader/ShaderResourceGroup.h>
 #include <Atom/RPI.Public/Culling.h>
-
+#include <Atom/RPI.Public/RenderPipeline.h>
+#include <Atom/RPI.Public/Pass/Specific/SwapChainPass.h>
 #include <Atom/RHI/DrawListTagRegistry.h>
 
 #include <AzCore/Casting/lossy_cast.h>
 #include <AzCore/Component/ComponentApplicationBus.h>
 #include <AzCore/Math/MatrixUtils.h>
 #include <AzCore/Serialization/SerializeContext.h>
+#include <Atom_RPI_Traits_Platform.h>
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+#include <MaskedOcclusionCulling/MaskedOcclusionCulling.h>
+#endif
 
 namespace AZ
 {
     namespace RPI
     {
+        // fixed-size software occlusion culling buffer
+        const uint32_t MaskedSoftwareOcclusionCullingWidth = 1920;
+        const uint32_t MaskedSoftwareOcclusionCullingHeight = 1080;
+
         ViewPtr View::CreateView(const AZ::Name& name, UsageFlags usage)
         {
             View* view = aznew View(name, usage);
@@ -51,6 +61,21 @@ namespace AZ
             {
                 m_shaderResourceGroup = ShaderResourceGroup::Create(viewSrgAsset);
             }
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            m_maskedOcclusionCulling = MaskedOcclusionCulling::Create();
+            m_maskedOcclusionCulling->SetResolution(MaskedSoftwareOcclusionCullingWidth, MaskedSoftwareOcclusionCullingHeight);
+#endif
+        }
+
+        View::~View()
+        {
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            if (m_maskedOcclusionCulling)
+            {
+                MaskedOcclusionCulling::Destroy(m_maskedOcclusionCulling);
+                m_maskedOcclusionCulling = nullptr;
+            }
+#endif
         }
 
         void View::SetDrawListMask(const RHI::DrawListMask& drawListMask)
@@ -374,5 +399,17 @@ namespace AZ
             m_shaderResourceGroup->Compile();
             m_needBuildSrg = false;
         }
+
+        void View::BeginCulling()
+        {
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            m_maskedOcclusionCulling->ClearBuffer();
+#endif
+        }
+
+        MaskedOcclusionCulling* View::GetMaskedOcclusionCulling()
+        {
+            return m_maskedOcclusionCulling;
+        }
     } // namespace RPI
 } // namespace AZ

+ 18 - 0
Gems/Atom/RPI/Code/atom_rpi_masked_occlusion_files.cmake

@@ -0,0 +1,18 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    External/MaskedOcclusionCulling/MaskedOcclusionCulling.h
+    External/MaskedOcclusionCulling/MaskedOcclusionCullingCommon.inl
+    External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp
+    External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp
+    External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp
+)

+ 1 - 1
Gems/Atom/RPI/Code/atom_rpi_public_files.cmake

@@ -180,4 +180,4 @@ set(FILES
     Source/RPI.Public/GpuQuery/Query.cpp
     Source/RPI.Public/GpuQuery/QueryPool.cpp
     Source/RPI.Public/GpuQuery/TimestampQueryPool.cpp
-)
+)

+ 4 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/Module.cpp

@@ -25,6 +25,7 @@
 #include <Material/MaterialComponent.h>
 #include <Mesh/MeshComponent.h>
 #include <ReflectionProbe/ReflectionProbeComponent.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponent.h>
 #include <PostProcess/PostFxLayerComponent.h>
 #include <PostProcess/Bloom/BloomComponent.h>
 #include <PostProcess/DepthOfField/DepthOfFieldComponent.h>
@@ -57,6 +58,7 @@
 #include <Mesh/EditorMeshComponent.h>
 #include <Mesh/EditorMeshSystemComponent.h>
 #include <ReflectionProbe/EditorReflectionProbeComponent.h>
+#include <OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h>
 #include <PostProcess/EditorPostFxLayerComponent.h>
 #include <PostProcess/Bloom/EditorBloomComponent.h>
 #include <PostProcess/DepthOfField/EditorDepthOfFieldComponent.h>
@@ -117,6 +119,7 @@ namespace AZ
                         DeferredFogComponent::CreateDescriptor(),
                         SurfaceData::SurfaceDataMeshComponent::CreateDescriptor(),
                         AttachmentComponent::CreateDescriptor(),
+                        OcclusionCullingPlaneComponent::CreateDescriptor(),
 
 #ifdef ATOMLYINTEGRATION_FEATURE_COMMON_EDITOR
                         EditorAreaLightComponent::CreateDescriptor(),
@@ -149,6 +152,7 @@ namespace AZ
                         EditorDeferredFogComponent::CreateDescriptor(),
                         SurfaceData::EditorSurfaceDataMeshComponent::CreateDescriptor(),
                         EditorAttachmentComponent::CreateDescriptor(),
+                        EditorOcclusionCullingPlaneComponent::CreateDescriptor(),
 #endif
                     });
             }

+ 95 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.cpp

@@ -0,0 +1,95 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h>
+#include <AzFramework/StringFunc/StringFunc.h>
+#include <AzToolsFramework/API/ToolsApplicationAPI.h>
+#include <AzToolsFramework/Entity/EditorEntityInfoBus.h>
+#include <AzToolsFramework/API/EditorAssetSystemAPI.h>
+#include <AzCore/Component/Entity.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        void EditorOcclusionCullingPlaneComponent::Reflect(AZ::ReflectContext* context)
+        {
+            BaseClass::Reflect(context);
+
+            if (AZ::SerializeContext* serializeContext = azrtti_cast<AZ::SerializeContext*>(context))
+            {
+                serializeContext->Class<EditorOcclusionCullingPlaneComponent, BaseClass>()
+                    ->Version(1, ConvertToEditorRenderComponentAdapter<1>)
+                ;
+
+                if (AZ::EditContext* editContext = serializeContext->GetEditContext())
+                {
+                    editContext->Class<EditorOcclusionCullingPlaneComponent>(
+                        "Occlusion Culling Plane", "The OcclusionCullingPlane component is used to cull meshes that are inside the view frustum and behind the occlusion plane")
+                        ->ClassElement(AZ::Edit::ClassElements::EditorData, "")
+                            ->Attribute(AZ::Edit::Attributes::Category, "Atom")
+                            ->Attribute(AZ::Edit::Attributes::Icon, "Icons/Components/Component_Placeholder.svg")
+                            ->Attribute(AZ::Edit::Attributes::ViewportIcon, "Icons/Components/Viewport/Component_Placeholder.png")
+                            ->Attribute(AZ::Edit::Attributes::AppearsInAddComponentMenu, AZ_CRC("Game", 0x232b318c))
+                            ->Attribute(AZ::Edit::Attributes::AutoExpand, true)
+                        ;
+
+                    editContext->Class<OcclusionCullingPlaneComponentController>(
+                        "OcclusionCullingPlaneComponentController", "")
+                        ->ClassElement(AZ::Edit::ClassElements::EditorData, "")
+                            ->Attribute(AZ::Edit::Attributes::AutoExpand, true)
+                        ->DataElement(AZ::Edit::UIHandlers::Default, &OcclusionCullingPlaneComponentController::m_configuration, "Configuration", "")
+                            ->Attribute(AZ::Edit::Attributes::Visibility, AZ::Edit::PropertyVisibility::ShowChildrenOnly)
+                        ;
+
+                    editContext->Class<OcclusionCullingPlaneComponentConfig>(
+                        "OcclusionCullingPlaneComponentConfig", "")
+                        ->ClassElement(AZ::Edit::ClassElements::Group, "Settings")
+                            ->Attribute(AZ::Edit::Attributes::AutoExpand, true)
+                        ->DataElement(AZ::Edit::UIHandlers::CheckBox, &OcclusionCullingPlaneComponentConfig::m_showVisualization, "Show Visualization", "Show the occlusion culling plane visualization")
+                            ->Attribute(AZ::Edit::Attributes::ChangeNotify, Edit::PropertyRefreshLevels::ValuesOnly)
+                        ->DataElement(AZ::Edit::UIHandlers::CheckBox, &OcclusionCullingPlaneComponentConfig::m_transparentVisualization, "Transparent Visualization", "Sets the occlusion culling plane visualization as transparent")
+                            ->Attribute(AZ::Edit::Attributes::ChangeNotify, Edit::PropertyRefreshLevels::ValuesOnly)
+                        ;
+                }
+            }
+
+            if (auto behaviorContext = azrtti_cast<BehaviorContext*>(context))
+            {
+                behaviorContext->ConstantProperty("EditorOcclusionCullingPlaneComponentTypeId", BehaviorConstant(Uuid(EditorOcclusionCullingPlaneComponentTypeId)))
+                    ->Attribute(AZ::Script::Attributes::Module, "render")
+                    ->Attribute(AZ::Script::Attributes::Scope, AZ::Script::Attributes::ScopeFlags::Automation);
+            }
+        }
+
+        EditorOcclusionCullingPlaneComponent::EditorOcclusionCullingPlaneComponent()
+        {
+        }
+
+        EditorOcclusionCullingPlaneComponent::EditorOcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config)
+            : BaseClass(config)
+        {
+        }
+
+        void EditorOcclusionCullingPlaneComponent::Activate()
+        {
+            BaseClass::Activate();
+            AzFramework::EntityDebugDisplayEventBus::Handler::BusConnect(GetEntityId());
+        }
+
+        void EditorOcclusionCullingPlaneComponent::Deactivate()
+        {
+            AzFramework::EntityDebugDisplayEventBus::Handler::BusDisconnect();
+            BaseClass::Deactivate();
+        }
+    } // namespace Render
+} // namespace AZ

+ 43 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h

@@ -0,0 +1,43 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <AzFramework/Entity/EntityDebugDisplayBus.h>
+#include <AzToolsFramework/API/ComponentEntitySelectionBus.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponent.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+#include <Atom/Feature/Utils/EditorRenderComponentAdapter.h>
+
+namespace AZ
+{
+    namespace Render
+    {        
+        class EditorOcclusionCullingPlaneComponent final
+            : public EditorRenderComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponent, OcclusionCullingPlaneComponentConfig>
+            , private AzFramework::EntityDebugDisplayEventBus::Handler
+        {
+        public:
+            using BaseClass = EditorRenderComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponent, OcclusionCullingPlaneComponentConfig>;
+            AZ_EDITOR_COMPONENT(AZ::Render::EditorOcclusionCullingPlaneComponent, EditorOcclusionCullingPlaneComponentTypeId, BaseClass);
+
+            static void Reflect(AZ::ReflectContext* context);
+
+            EditorOcclusionCullingPlaneComponent();
+            EditorOcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config);
+
+            // AZ::Component overrides
+            void Activate() override;
+            void Deactivate() override;
+        };
+    } // namespace Render
+} // namespace AZ

+ 43 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.cpp

@@ -0,0 +1,43 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponent.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        OcclusionCullingPlaneComponent::OcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config)
+            : BaseClass(config)
+        {
+        }
+
+        void OcclusionCullingPlaneComponent::Reflect(AZ::ReflectContext* context)
+        {
+            BaseClass::Reflect(context);
+
+            if (auto serializeContext = azrtti_cast<AZ::SerializeContext*>(context))
+            {
+                serializeContext->Class<OcclusionCullingPlaneComponent, BaseClass>()
+                    ->Version(0)
+                    ;
+            }
+
+            if (auto behaviorContext = azrtti_cast<BehaviorContext*>(context))
+            {
+                behaviorContext->ConstantProperty("OcclusionCullingPlaneComponentTypeId", BehaviorConstant(Uuid(OcclusionCullingPlaneComponentTypeId)))
+                    ->Attribute(AZ::Script::Attributes::Module, "render")
+                    ->Attribute(AZ::Script::Attributes::Scope, AZ::Script::Attributes::ScopeFlags::Common);
+            }
+        }
+    } // namespace Render
+} // namespace AZ

+ 37 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.h

@@ -0,0 +1,37 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+#include <AzFramework/Components/ComponentAdapter.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! Runtime component that places an occlusion culling plane in the scene.
+        //! Thin ComponentAdapter wrapper: all behavior lives in
+        //! OcclusionCullingPlaneComponentController, configured by
+        //! OcclusionCullingPlaneComponentConfig.
+        class OcclusionCullingPlaneComponent final
+            : public AzFramework::Components::ComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponentConfig>
+        {
+        public:
+            using BaseClass = AzFramework::Components::ComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponentConfig>;
+            AZ_COMPONENT(AZ::Render::OcclusionCullingPlaneComponent, OcclusionCullingPlaneComponentTypeId, BaseClass);
+
+            OcclusionCullingPlaneComponent() = default;
+            OcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config);
+
+            static void Reflect(AZ::ReflectContext* context);
+        };
+
+    } // namespace Render
+} // namespace AZ

+ 22 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h

@@ -0,0 +1,22 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+namespace AZ
+{
+    namespace Render
+    {
+        // Type id of the runtime OcclusionCullingPlaneComponent (used by AZ_COMPONENT and scripting).
+        static constexpr const char* const OcclusionCullingPlaneComponentTypeId = "{F7537387-15A8-48F0-A1F3-D19C5886B886}";
+        // Type id of the editor counterpart component.
+        static constexpr const char* const EditorOcclusionCullingPlaneComponentTypeId = "{BE7CC17B-32EB-49B0-BAD9-D26E3A059012}";
+    } // namespace Render
+} // namespace AZ

+ 143 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.cpp

@@ -0,0 +1,143 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+
+#include <Atom/RPI.Public/Model/Model.h>
+#include <Atom/RPI.Public/Image/StreamingImage.h>
+#include <Atom/RPI.Public/Scene.h>
+
+#include <AzCore/Asset/AssetManager.h>
+#include <AzCore/Asset/AssetManagerBus.h>
+#include <AzCore/Debug/EventTrace.h>
+#include <AzCore/Serialization/SerializeContext.h>
+
+#include <AzFramework/Entity/EntityContextBus.h>
+#include <AzFramework/Entity/EntityContext.h>
+#include <AzFramework/Scene/Scene.h>
+
+#include <AzCore/RTTI/BehaviorContext.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        // Serializes the two visualization settings of the occlusion plane config.
+        void OcclusionCullingPlaneComponentConfig::Reflect(ReflectContext* context)
+        {
+            if (auto* serializeContext = azrtti_cast<SerializeContext*>(context))
+            {
+                serializeContext->Class<OcclusionCullingPlaneComponentConfig>()
+                    ->Version(0)
+                    ->Field("ShowVisualization", &OcclusionCullingPlaneComponentConfig::m_showVisualization)
+                    ->Field("TransparentVisualization", &OcclusionCullingPlaneComponentConfig::m_transparentVisualization)
+                    ;
+            }
+        }
+
+        // Reflects the controller and its embedded configuration so the
+        // ComponentAdapter can serialize the component as a single unit.
+        void OcclusionCullingPlaneComponentController::Reflect(ReflectContext* context)
+        {
+            OcclusionCullingPlaneComponentConfig::Reflect(context);
+
+            if (auto* serializeContext = azrtti_cast<SerializeContext*>(context))
+            {
+                serializeContext->Class<OcclusionCullingPlaneComponentController>()
+                    ->Version(0)
+                    ->Field("Configuration", &OcclusionCullingPlaneComponentController::m_configuration);
+            }
+        }
+
+        // Services this component activates after (soft ordering dependency).
+        void OcclusionCullingPlaneComponentController::GetDependentServices(AZ::ComponentDescriptor::DependencyArrayType& dependent)
+        {
+            dependent.push_back(AZ_CRC("TransformService", 0x8ee22c50));
+        }
+
+        // Service this component provides to the entity.
+        void OcclusionCullingPlaneComponentController::GetProvidedServices(AZ::ComponentDescriptor::DependencyArrayType& provided)
+        {
+            provided.push_back(AZ_CRC("OcclusionCullingPlaneService", 0x9123f33d));
+        }
+
+        // Declaring its own service incompatible limits an entity to a single
+        // occlusion culling plane component.
+        void OcclusionCullingPlaneComponentController::GetIncompatibleServices(AZ::ComponentDescriptor::DependencyArrayType& incompatible)
+        {
+            incompatible.push_back(AZ_CRC("OcclusionCullingPlaneService", 0x9123f33d));
+        }
+
+        // Services that must be present on the entity before this component
+        // can activate; the controller reads the world transform in Activate.
+        void OcclusionCullingPlaneComponentController::GetRequiredServices(AZ::ComponentDescriptor::DependencyArrayType& required)
+        {
+            // Use the precomputed two-argument AZ_CRC form for consistency with
+            // GetDependentServices above; it also lets the build validate the
+            // hash at compile time instead of computing it from the string.
+            required.push_back(AZ_CRC("TransformService", 0x8ee22c50));
+        }
+
+        // Construct the controller with an initial configuration (copied; the
+        // config can later be replaced via SetConfiguration).
+        OcclusionCullingPlaneComponentController::OcclusionCullingPlaneComponentController(const OcclusionCullingPlaneComponentConfig& config)
+            : m_configuration(config)
+        {
+        }
+
+        // Registers the occlusion plane with the feature processor for the
+        // entity's scene, using the entity's current world transform, and applies
+        // the configured visualization settings.
+        void OcclusionCullingPlaneComponentController::Activate(AZ::EntityId entityId)
+        {
+            m_entityId = entityId;
+
+            // Listen for transform changes so the plane can follow the entity.
+            TransformNotificationBus::Handler::BusConnect(m_entityId);
+
+            m_featureProcessor = RPI::Scene::GetFeatureProcessorForEntity<OcclusionCullingPlaneFeatureProcessorInterface>(entityId);
+            AZ_Assert(m_featureProcessor, "OcclusionCullingPlaneComponentController was unable to find a OcclusionCullingPlaneFeatureProcessor on the EntityContext provided.");
+
+            m_transformInterface = TransformBus::FindFirstHandler(entityId);
+            AZ_Assert(m_transformInterface, "Unable to attach to a TransformBus handler");
+            if (!m_transformInterface)
+            {
+                // NOTE(review): this early-out leaves TransformNotificationBus
+                // connected and m_handle unset; Deactivate still disconnects the
+                // bus, and RemoveOcclusionCullingPlane is called with the default
+                // handle — confirm the feature processor tolerates that.
+                return;
+            }
+
+            // add this occlusion plane to the feature processor
+            const AZ::Transform& transform = m_transformInterface->GetWorldTM();
+            m_handle = m_featureProcessor->AddOcclusionCullingPlane(transform);
+
+            // set visualization
+            m_featureProcessor->ShowVisualization(m_handle, m_configuration.m_showVisualization);
+            m_featureProcessor->SetTransparentVisualization(m_handle, m_configuration.m_transparentVisualization);
+        }
+
+        // Removes the occlusion plane from the feature processor and releases
+        // all bus connections and cached interface pointers.
+        void OcclusionCullingPlaneComponentController::Deactivate()
+        {
+            if (m_featureProcessor)
+            {
+                m_featureProcessor->RemoveOcclusionCullingPlane(m_handle);
+            }
+
+            // NOTE(review): no matching Data::AssetBus connect appears in
+            // Activate — this disconnect looks vestigial (the controller inherits
+            // Data::AssetBus::MultiHandler but no handler overrides are visible);
+            // harmless, but confirm whether the AssetBus inheritance is needed.
+            Data::AssetBus::MultiHandler::BusDisconnect();
+            TransformNotificationBus::Handler::BusDisconnect();
+
+            m_transformInterface = nullptr;
+            m_featureProcessor = nullptr;
+        }
+
+        // Replaces the stored configuration. Note: this only copies the config;
+        // visualization settings are pushed to the feature processor in Activate.
+        void OcclusionCullingPlaneComponentController::SetConfiguration(const OcclusionCullingPlaneComponentConfig& config)
+        {
+            m_configuration = config;
+        }
+
+        // Returns the current configuration (read-only reference into the controller).
+        const OcclusionCullingPlaneComponentConfig& OcclusionCullingPlaneComponentController::GetConfiguration() const
+        {
+            return m_configuration;
+        }
+
+        // TransformNotificationBus handler: keeps the registered occlusion plane
+        // in sync with the entity's world transform.
+        void OcclusionCullingPlaneComponentController::OnTransformChanged([[maybe_unused]] const AZ::Transform& local, const AZ::Transform& world)
+        {
+            if (!m_featureProcessor)
+            {
+                return;
+            }
+
+            m_featureProcessor->SetTransform(m_handle, world);
+        }
+    } // namespace Render
+} // namespace AZ

+ 81 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h

@@ -0,0 +1,81 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <AzCore/Asset/AssetCommon.h>
+#include <AzCore/Component/Component.h>
+#include <AzCore/Component/TransformBus.h>
+#include <Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h>
+#include <Atom/RPI.Public/Model/Model.h>
+#include <LmbrCentral/Shape/BoxShapeComponentBus.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! Serialized settings for an occlusion culling plane component.
+        class OcclusionCullingPlaneComponentConfig final
+            : public AZ::ComponentConfig
+        {
+        public:
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneComponentConfig, "{D0E107CA-5AFB-4675-BC97-94BCA5F248DB}", ComponentConfig);
+            AZ_CLASS_ALLOCATOR(OcclusionCullingPlaneComponentConfig, SystemAllocator, 0);
+            static void Reflect(AZ::ReflectContext* context);
+
+            // Draw the plane's visualization mesh (on by default).
+            bool m_showVisualization = true;
+            // Render the visualization with the transparent material variant.
+            bool m_transparentVisualization = false;
+
+            OcclusionCullingPlaneComponentConfig() = default;
+        };
+
+        //! Controller behind OcclusionCullingPlaneComponent: registers an
+        //! occlusion plane with the scene's feature processor on Activate and
+        //! keeps its transform synchronized with the owning entity.
+        class OcclusionCullingPlaneComponentController final
+            : public Data::AssetBus::MultiHandler
+            , private TransformNotificationBus::Handler
+        {
+        public:
+            friend class EditorOcclusionCullingPlaneComponent;
+
+            AZ_CLASS_ALLOCATOR(OcclusionCullingPlaneComponentController, AZ::SystemAllocator, 0);
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneComponentController, "{8EDA3C7D-5171-4843-9969-4D84DB13F221}");
+
+            static void Reflect(AZ::ReflectContext* context);
+            static void GetDependentServices(AZ::ComponentDescriptor::DependencyArrayType& dependent);
+            static void GetProvidedServices(AZ::ComponentDescriptor::DependencyArrayType& provided);
+            static void GetIncompatibleServices(AZ::ComponentDescriptor::DependencyArrayType& incompatible);
+            static void GetRequiredServices(AZ::ComponentDescriptor::DependencyArrayType& required);
+
+            OcclusionCullingPlaneComponentController() = default;
+            OcclusionCullingPlaneComponentController(const OcclusionCullingPlaneComponentConfig& config);
+
+            void Activate(AZ::EntityId entityId);
+            void Deactivate();
+            void SetConfiguration(const OcclusionCullingPlaneComponentConfig& config);
+            const OcclusionCullingPlaneComponentConfig& GetConfiguration() const;
+
+        private:
+
+            AZ_DISABLE_COPY(OcclusionCullingPlaneComponentController);
+
+            // TransformNotificationBus overrides
+            void OnTransformChanged(const AZ::Transform& local, const AZ::Transform& world) override;
+
+            // handle for this occlusion plane in the feature processor
+            OcclusionCullingPlaneHandle m_handle;
+
+            // Non-owning; looked up from the scene in Activate, cleared in Deactivate.
+            OcclusionCullingPlaneFeatureProcessorInterface* m_featureProcessor = nullptr;
+            // Non-owning transform interface of the owning entity.
+            TransformInterface* m_transformInterface = nullptr;
+            AZ::EntityId m_entityId;
+            OcclusionCullingPlaneComponentConfig m_configuration;
+        };
+    } // namespace Render
+} // namespace AZ

+ 2 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_editor_files.cmake

@@ -56,6 +56,8 @@ set(FILES
     Source/Mesh/EditorMeshSystemComponent.h
     Source/Mesh/MeshThumbnail.h
     Source/Mesh/MeshThumbnail.cpp
+    Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h
+    Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.cpp
     Source/PostProcess/EditorPostFxLayerComponent.cpp
     Source/PostProcess/EditorPostFxLayerComponent.h
     Source/PostProcess/Bloom/EditorBloomComponent.cpp

+ 4 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_files.cmake

@@ -72,6 +72,10 @@ set(FILES
     Source/Mesh/MeshComponent.cpp
     Source/Mesh/MeshComponentController.h
     Source/Mesh/MeshComponentController.cpp
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.cpp
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.cpp
     Source/PostProcess/PostFxLayerComponent.cpp
     Source/PostProcess/PostFxLayerComponent.h
     Source/PostProcess/PostFxLayerComponentConfig.cpp