Selaa lähdekoodia

Merge pull request #1110 from aws-lumberyard-dev/Atom/dmcdiar/ATOM-15517

[ATOM-15517] Software Occlusion Culling
dmcdiarmid-ly 4 vuotta sitten
vanhempi
commit
d19d2aff9d
56 muutettua tiedostoa jossa 5464 lisäystä ja 51 poistoa
  1. 22 0
      Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneTransparentVisualization.material
  2. 22 0
      Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneVisualization.material
  3. 3 0
      Gems/Atom/Feature/Common/Assets/Models/OcclusionCullingPlane.fbx
  4. 43 0
      Gems/Atom/Feature/Common/Code/Include/Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h
  5. 4 0
      Gems/Atom/Feature/Common/Code/Source/CommonSystemComponent.cpp
  6. 113 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.cpp
  7. 65 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.h
  8. 146 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.cpp
  9. 66 0
      Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h
  10. 4 0
      Gems/Atom/Feature/Common/Code/atom_feature_common_files.cmake
  11. 1 0
      Gems/Atom/Feature/Common/Code/atom_feature_common_public_files.cmake
  12. 15 2
      Gems/Atom/RPI/Code/CMakeLists.txt
  13. 98 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/CompilerSpecific.inl
  14. 181 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/LICENSE.txt
  15. 456 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp
  16. 592 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.h
  17. 243 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp
  18. 309 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp
  19. 2053 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingCommon.inl
  20. 6 0
      Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/PackageInfo.json
  21. 19 5
      Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Culling.h
  22. 12 1
      Gems/Atom/RPI/Code/Include/Atom/RPI.Public/View.h
  23. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Android.h
  24. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Platform.h
  25. 13 0
      Gems/Atom/RPI/Code/Source/Platform/Android/PAL_android.cmake
  26. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Android/platform_android_files.cmake
  27. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Linux.h
  28. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Platform.h
  29. 1 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/PAL_linux.cmake
  30. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Linux/platform_linux_files.cmake
  31. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Mac.h
  32. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Platform.h
  33. 1 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/PAL_mac.cmake
  34. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Mac/platform_mac_files.cmake
  35. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Platform.h
  36. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Windows.h
  37. 14 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/PAL_windows.cmake
  38. 15 0
      Gems/Atom/RPI/Code/Source/Platform/Windows/platform_windows_files.cmake
  39. 14 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_Platform.h
  40. 14 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_iOS.h
  41. 13 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/PAL_ios.cmake
  42. 15 0
      Gems/Atom/RPI/Code/Source/Platform/iOS/platform_ios_files.cmake
  43. 203 41
      Gems/Atom/RPI/Code/Source/RPI.Public/Culling.cpp
  44. 38 1
      Gems/Atom/RPI/Code/Source/RPI.Public/View.cpp
  45. 18 0
      Gems/Atom/RPI/Code/atom_rpi_masked_occlusion_files.cmake
  46. 1 1
      Gems/Atom/RPI/Code/atom_rpi_public_files.cmake
  47. 4 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/Module.cpp
  48. 95 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.cpp
  49. 43 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h
  50. 43 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.cpp
  51. 37 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.h
  52. 22 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h
  53. 143 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.cpp
  54. 81 0
      Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h
  55. 2 0
      Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_editor_files.cmake
  56. 4 0
      Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_files.cmake

+ 22 - 0
Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneTransparentVisualization.material

@@ -0,0 +1,22 @@
+{
+    "materialType": "Materials\\Types\\StandardPBR.materialtype",
+    "propertyLayoutVersion": 3,
+    "properties": {
+        "general": {
+            "enableShadows": false,
+            "enableDirectionalLights": false,
+            "enablePunctualLights": false,
+            "enableAreaLights": false,
+            "enableIBL":  true
+        },
+        "baseColor": {
+           "color": [ 0.0, 1.0, 0.0 ]
+        },
+        "opacity": {
+            "alphaSource": "None",
+            "doubleSided": true,
+            "factor": 0.25,
+            "mode": "TintedTransparent"
+        }
+    }
+}

+ 22 - 0
Gems/Atom/Feature/Common/Assets/Materials/OcclusionCullingPlane/OcclusionCullingPlaneVisualization.material

@@ -0,0 +1,22 @@
+{
+    "materialType": "Materials\\Types\\StandardPBR.materialtype",
+    "propertyLayoutVersion": 3,
+    "properties": {
+        "general": {
+            "enableShadows": false,
+            "enableDirectionalLights": false,
+            "enablePunctualLights": false,
+            "enableAreaLights": false,
+            "enableIBL":  true
+        },
+        "baseColor": {
+           "color": [ 0.0, 1.0, 0.0 ]
+        },
+        "opacity": {
+            "alphaSource": "None",
+            "doubleSided": true,
+            "factor": 1.0,
+            "mode": "TintedTransparent"
+        }
+    }
+}

+ 3 - 0
Gems/Atom/Feature/Common/Assets/Models/OcclusionCullingPlane.fbx

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75cdf73fcb9698a76a38294a1cf927a4fb41a34869e0429e1f02bf8d361a7258
+size 20400

+ 43 - 0
Gems/Atom/Feature/Common/Code/Include/Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h

@@ -0,0 +1,43 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <AzCore/base.h>
+#include <AzCore/Math/Transform.h>
+#include <Atom/RPI.Public/FeatureProcessor.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        class OcclusionCullingPlane;
+
+        using OcclusionCullingPlaneHandle = AZStd::shared_ptr<OcclusionCullingPlane>;
+
+        // OcclusionCullingPlaneFeatureProcessorInterface provides an interface to the feature processor for code outside of Atom
+        class OcclusionCullingPlaneFeatureProcessorInterface
+            : public RPI::FeatureProcessor
+        {
+        public:
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneFeatureProcessorInterface, "{50F6B45E-A622-44EC-B962-DE25FBD44095}");
+
+            // creates a new occlusion culling plane at the given world transform and returns a shared handle to it
+            virtual OcclusionCullingPlaneHandle AddOcclusionCullingPlane(const AZ::Transform& transform) = 0;
+            // removes the plane from the feature processor and resets the caller's handle to nullptr
+            virtual void RemoveOcclusionCullingPlane(OcclusionCullingPlaneHandle& handle) = 0;
+            // returns true if the handle refers to an occlusion culling plane (i.e., it is non-null)
+            virtual bool IsValidOcclusionCullingPlaneHandle(const OcclusionCullingPlaneHandle& occlusionCullingPlane) const = 0;
+            // updates the world transform of an existing occlusion culling plane
+            virtual void SetTransform(const OcclusionCullingPlaneHandle& occlusionCullingPlane, const AZ::Transform& transform) = 0;
+            // enables or disables the plane's participation in occlusion culling
+            virtual void SetEnabled(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool enabled) = 0;
+            // shows or hides the plane's visualization mesh
+            virtual void ShowVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool showVisualization) = 0;
+            // switches between the standard and transparent visualization materials
+            virtual void SetTransparentVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool transparentVisualization) = 0;
+        };
+    } // namespace Render
+} // namespace AZ

+ 4 - 0
Gems/Atom/Feature/Common/Code/Source/CommonSystemComponent.cpp

@@ -103,6 +103,7 @@
 #include <ReflectionScreenSpace/ReflectionScreenSpaceBlurPass.h>
 #include <ReflectionScreenSpace/ReflectionScreenSpaceBlurChildPass.h>
 #include <ReflectionScreenSpace/ReflectionCopyFrameBufferPass.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h>
 
 namespace AZ
 {
@@ -138,6 +139,7 @@ namespace AZ
             DiffuseProbeGridFeatureProcessor::Reflect(context);
             DiffuseGlobalIlluminationFeatureProcessor::Reflect(context);
             RayTracingFeatureProcessor::Reflect(context);
+            OcclusionCullingPlaneFeatureProcessor::Reflect(context);
 
             if (SerializeContext* serialize = azrtti_cast<SerializeContext*>(context))
             {
@@ -195,6 +197,7 @@ namespace AZ
             AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<DiffuseProbeGridFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<DiffuseGlobalIlluminationFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<RayTracingFeatureProcessor>();
+            AZ::RPI::FeatureProcessorFactory::Get()->RegisterFeatureProcessor<OcclusionCullingPlaneFeatureProcessor>();
 
             // Add SkyBox pass
             auto* passSystem = RPI::PassSystemInterface::Get();
@@ -301,6 +304,7 @@ namespace AZ
             AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<SkyBoxFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<TransformServiceFeatureProcessor>();
             AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<AuxGeomFeatureProcessor>();
+            AZ::RPI::FeatureProcessorFactory::Get()->UnregisterFeatureProcessor<OcclusionCullingPlaneFeatureProcessor>();
         }
 
         void CommonSystemComponent::LoadPassTemplateMappings()

+ 113 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.cpp

@@ -0,0 +1,113 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h>
+#include <AzCore/Math/Random.h>
+#include <Atom/RPI.Public/Scene.h>
+#include <Atom/RPI.Reflect/Asset/AssetUtils.h>
+#include <Atom/RPI.Reflect/Material/MaterialAsset.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        static const char* OcclusionCullingPlaneDrawListTag("occlusioncullingplanevisualization");
+
+        //! Disconnects from the asset bus and releases the visualization mesh.
+        OcclusionCullingPlane::~OcclusionCullingPlane()
+        {
+            Data::AssetBus::MultiHandler::BusDisconnect();
+
+            // m_meshFeatureProcessor is only assigned in Init(); guard against
+            // destroying a plane that was never initialized, which would
+            // otherwise dereference a null pointer here
+            if (m_meshFeatureProcessor)
+            {
+                m_meshFeatureProcessor->ReleaseMesh(m_visualizationMeshHandle);
+            }
+        }
+
+        //! Acquires the visualization mesh from the scene's MeshFeatureProcessor and
+        //! kicks off the (asynchronous) load of the visualization material.
+        void OcclusionCullingPlane::Init(RPI::Scene* scene)
+        {
+            AZ_Assert(scene, "OcclusionCullingPlane::Init called with a null Scene pointer");
+
+            m_meshFeatureProcessor = scene->GetFeatureProcessor<Render::MeshFeatureProcessorInterface>();
+
+            // load visualization plane model and material
+            m_visualizationModelAsset = AZ::RPI::AssetUtils::GetAssetByProductPath<AZ::RPI::ModelAsset>(
+                "Models/OcclusionCullingPlane.azmodel",
+                AZ::RPI::AssetUtils::TraceLevel::Assert);
+
+            // the visualization mesh is presentation-only: keep it out of reflection
+            // cubemaps and raytracing, and place it at the identity transform until
+            // SetTransform is called
+            m_visualizationMeshHandle = m_meshFeatureProcessor->AcquireMesh(m_visualizationModelAsset);
+            m_meshFeatureProcessor->SetExcludeFromReflectionCubeMaps(m_visualizationMeshHandle, true);
+            m_meshFeatureProcessor->SetRayTracingEnabled(m_visualizationMeshHandle, false);
+            m_meshFeatureProcessor->SetTransform(m_visualizationMeshHandle, AZ::Transform::CreateIdentity());
+
+            SetVisualizationMaterial();
+        }
+
+        //! Selects the opaque or transparent visualization material based on
+        //! m_transparentVisualization and queues it for loading. The material is
+        //! applied to the mesh in OnAssetReady once the load completes.
+        void OcclusionCullingPlane::SetVisualizationMaterial()
+        {
+            AZStd::string materialAssetPath;
+            if (m_transparentVisualization)
+            {
+                materialAssetPath = "Materials/OcclusionCullingPlane/OcclusionCullingPlaneTransparentVisualization.azmaterial";
+            }
+            else
+            {
+                materialAssetPath = "Materials/OcclusionCullingPlane/OcclusionCullingPlaneVisualization.azmaterial";
+            }
+
+            // connect to the asset bus for this asset Id so OnAssetReady/OnAssetError fire when the load finishes
+            RPI::AssetUtils::TraceLevel traceLevel = AZ::RPI::AssetUtils::TraceLevel::Assert;
+            m_visualizationMaterialAsset = AZ::RPI::AssetUtils::GetAssetByProductPath<AZ::RPI::MaterialAsset>(materialAssetPath.c_str(), traceLevel);
+            m_visualizationMaterialAsset.QueueLoad();
+            Data::AssetBus::MultiHandler::BusConnect(m_visualizationMaterialAsset.GetId());
+        }
+
+        //! AssetBus handler: applies the visualization material to the mesh once it
+        //! has finished loading. Ignores any other asset this handler may be
+        //! connected to (this is a MultiHandler).
+        void OcclusionCullingPlane::OnAssetReady(Data::Asset<Data::AssetData> asset)
+        {
+            if (m_visualizationMaterialAsset.GetId() == asset.GetId())
+            {
+                m_visualizationMaterialAsset = asset;
+                Data::AssetBus::MultiHandler::BusDisconnect(asset.GetId());
+
+                m_visualizationMaterial = AZ::RPI::Material::FindOrCreate(m_visualizationMaterialAsset);
+                m_meshFeatureProcessor->SetMaterialAssignmentMap(m_visualizationMeshHandle, m_visualizationMaterial);
+            }
+        }
+
+        //! AssetBus handler: reports a failed visualization asset load and stops
+        //! listening for that asset. The plane keeps whatever material it had.
+        void OcclusionCullingPlane::OnAssetError(Data::Asset<Data::AssetData> asset)
+        {
+            AZ_Error("OcclusionCullingPlane", false, "Failed to load OcclusionCullingPlane visualization asset %s", asset.ToString<AZStd::string>().c_str());
+            Data::AssetBus::MultiHandler::BusDisconnect(asset.GetId());
+        }
+
+        //! Stores the plane's world transform and keeps the visualization mesh in sync with it.
+        void OcclusionCullingPlane::SetTransform(const AZ::Transform& transform)
+        {
+            m_transform = transform;
+
+            // update visualization plane transform
+            m_meshFeatureProcessor->SetTransform(m_visualizationMeshHandle, transform);
+        }
+
+        //! Shows or hides the visualization mesh.
+        void OcclusionCullingPlane::ShowVisualization(bool showVisualization)
+        {
+            if (m_showVisualization != showVisualization)
+            {
+                // record the new state; without this assignment the change-detection
+                // above compares against a stale value and m_showVisualization
+                // desyncs from the mesh's actual visible state after the first call
+                m_showVisualization = showVisualization;
+
+                m_meshFeatureProcessor->SetVisible(m_visualizationMeshHandle, showVisualization);
+                SetVisualizationMaterial();
+            }
+        }
+
+        //! Switches the visualization between the standard and transparent materials,
+        //! re-triggering the material load only when the state actually changes.
+        void OcclusionCullingPlane::SetTransparentVisualization(bool transparentVisualization)
+        {
+            if (m_transparentVisualization != transparentVisualization)
+            {
+                m_transparentVisualization = transparentVisualization;
+                SetVisualizationMaterial();
+            }
+        }
+
+    } // namespace Render
+} // namespace AZ

+ 65 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlane.h

@@ -0,0 +1,65 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h>
+#include <Atom/Feature/Mesh/MeshFeatureProcessorInterface.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! This class represents an OcclusionCullingPlane which is used to cull meshes that are inside the view frustum
+        class OcclusionCullingPlane final
+            : public AZ::Data::AssetBus::MultiHandler
+        {
+        public:
+            OcclusionCullingPlane() = default;
+            ~OcclusionCullingPlane();
+
+            //! Acquires the visualization mesh and material from the given scene; must be called before any other method
+            void Init(RPI::Scene* scene);
+
+            //! Sets the plane's world transform and moves the visualization mesh with it
+            void SetTransform(const AZ::Transform& transform);
+            const AZ::Transform& GetTransform() const { return m_transform; }
+
+            //! Enables or disables this plane's participation in occlusion culling
+            void SetEnabled(bool enabled) { m_enabled = enabled; }
+            bool GetEnabled() const { return m_enabled; }
+
+            // enables or disables rendering of the visualization plane
+            void ShowVisualization(bool showVisualization);
+
+            // sets the visualization to transparent mode
+            void SetTransparentVisualization(bool transparentVisualization);
+
+        private:
+
+            // selects and queues the load of the visualization material matching m_transparentVisualization
+            void SetVisualizationMaterial();
+
+            // AZ::Data::AssetBus::Handler overrides...
+            void OnAssetReady(Data::Asset<Data::AssetData> asset) override;
+            void OnAssetError(Data::Asset<Data::AssetData> asset) override;
+
+            AZ::Transform m_transform;                  // world transform of the plane
+            bool m_enabled = true;                      // whether the plane participates in occlusion culling
+            bool m_showVisualization = true;            // whether the visualization mesh is rendered
+            bool m_transparentVisualization = false;    // whether the transparent visualization material is used
+
+            // visualization
+            AZ::Render::MeshFeatureProcessorInterface* m_meshFeatureProcessor = nullptr;   // cached from the scene in Init(); not owned
+            Data::Asset<RPI::ModelAsset> m_visualizationModelAsset;
+            Data::Asset<RPI::MaterialAsset> m_visualizationMaterialAsset;
+            Data::Instance<RPI::Material> m_visualizationMaterial;
+            AZ::Render::MeshFeatureProcessorInterface::MeshHandle m_visualizationMeshHandle;
+        };
+    } // namespace Render
+} // namespace AZ

+ 146 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.cpp

@@ -0,0 +1,146 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h>
+#include <AzCore/std/smart_ptr/make_shared.h>
+#include <AzCore/std/smart_ptr/intrusive_ptr.h>
+#include <Atom/RPI.Public/Scene.h>
+#include <Atom/RPI.Public/Culling.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! Registers this feature processor with the serialization system so it can
+        //! be created by the FeatureProcessorFactory.
+        void OcclusionCullingPlaneFeatureProcessor::Reflect(ReflectContext* context)
+        {
+            if (auto* serializeContext = azrtti_cast<SerializeContext*>(context))
+            {
+                serializeContext
+                    ->Class<OcclusionCullingPlaneFeatureProcessor, FeatureProcessor>()
+                    ->Version(0);
+            }
+        }
+
+        void OcclusionCullingPlaneFeatureProcessor::Activate()
+        {
+            // pre-size both lists to avoid reallocation for typical scene sizes
+            m_occlusionCullingPlanes.reserve(InitialOcclusionCullingPlanesAllocationSize);
+            m_rpiOcclusionPlanes.reserve(InitialOcclusionCullingPlanesAllocationSize);
+
+            // subscribe to scene notifications so OnBeginPrepareRender is called each frame
+            EnableSceneNotification();
+        }
+
+        void OcclusionCullingPlaneFeatureProcessor::Deactivate()
+        {
+            // warn about leaked handles: components are expected to call
+            // RemoveOcclusionCullingPlane before the feature processor shuts down
+            AZ_Warning("OcclusionCullingPlaneFeatureProcessor", m_occlusionCullingPlanes.size() == 0,
+                "Deactivating the OcclusionCullingPlaneFeatureProcessor, but there are still outstanding occlusion planes. Components\n"
+                "using OcclusionCullingPlaneHandles should free them before the OcclusionCullingPlaneFeatureProcessor is deactivated.\n"
+            );
+
+            DisableSceneNotification();
+        }
+
+        //! Rebuilds the RPI occlusion plane list (world-space corners + bounding AABB
+        //! per enabled plane) and hands it to the culling scene. Only runs when a
+        //! plane was added/removed/changed since the last frame.
+        void OcclusionCullingPlaneFeatureProcessor::OnBeginPrepareRender()
+        {
+            if (m_rpiListNeedsUpdate)
+            {
+                // rebuild the RPI occlusion list
+                m_rpiOcclusionPlanes.clear();
+
+                for (auto& occlusionCullingPlane : m_occlusionCullingPlanes)
+                {
+                    if (!occlusionCullingPlane->GetEnabled())
+                    {
+                        continue;
+                    }
+
+                    RPI::CullingScene::OcclusionPlane rpiOcclusionPlane;
+
+                    // unit plane corners in local space (plane lies in the local XZ plane)
+                    static const Vector3 BL = Vector3(-0.5f, 0.0f, -0.5f);
+                    static const Vector3 TL = Vector3(-0.5f, 0.0f,  0.5f);
+                    static const Vector3 TR = Vector3( 0.5f, 0.0f,  0.5f);
+                    static const Vector3 BR = Vector3( 0.5f, 0.0f, -0.5f);
+
+                    const AZ::Transform& transform = occlusionCullingPlane->GetTransform();
+
+                    // convert corners to world space
+                    rpiOcclusionPlane.m_cornerBL = transform.TransformPoint(BL);
+                    rpiOcclusionPlane.m_cornerTL = transform.TransformPoint(TL);
+                    rpiOcclusionPlane.m_cornerTR = transform.TransformPoint(TR);
+                    rpiOcclusionPlane.m_cornerBR = transform.TransformPoint(BR);
+
+                    // build a world space AABB from all four corners: for a plane
+                    // rotated off-axis the min/max of just the BL/TR corners does
+                    // not bound the TL/BR corners
+                    Aabb aabb = Aabb::CreateNull();
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerBL);
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerTL);
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerTR);
+                    aabb.AddPoint(rpiOcclusionPlane.m_cornerBR);
+                    rpiOcclusionPlane.m_aabb = aabb;
+
+                    m_rpiOcclusionPlanes.push_back(rpiOcclusionPlane);
+                }
+
+                GetParentScene()->GetCullingScene()->SetOcclusionPlanes(m_rpiOcclusionPlanes);
+
+                m_rpiListNeedsUpdate = false;
+            }
+        }
+
+        //! Creates, initializes, and registers a new occlusion culling plane at the
+        //! given world transform; the returned shared handle keeps the plane alive.
+        OcclusionCullingPlaneHandle OcclusionCullingPlaneFeatureProcessor::AddOcclusionCullingPlane(const AZ::Transform& transform)
+        {
+            AZStd::shared_ptr<OcclusionCullingPlane> occlusionCullingPlane = AZStd::make_shared<OcclusionCullingPlane>();
+            occlusionCullingPlane->Init(GetParentScene());
+            occlusionCullingPlane->SetTransform(transform);
+            m_occlusionCullingPlanes.push_back(occlusionCullingPlane);
+            // flag the RPI occlusion list for rebuild on the next OnBeginPrepareRender
+            m_rpiListNeedsUpdate = true;
+
+            return occlusionCullingPlane;
+        }
+
+        //! Unregisters the plane from the feature processor and resets the caller's
+        //! handle. The plane itself is destroyed once the last handle is released.
+        void OcclusionCullingPlaneFeatureProcessor::RemoveOcclusionCullingPlane(OcclusionCullingPlaneHandle& occlusionCullingPlane)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "RemoveOcclusionCullingPlane called with an invalid handle");
+
+            auto itEntry = AZStd::find_if(m_occlusionCullingPlanes.begin(), m_occlusionCullingPlanes.end(), [&](AZStd::shared_ptr<OcclusionCullingPlane> const& entry)
+            {
+                return (entry == occlusionCullingPlane);
+            });
+
+            AZ_Assert(itEntry != m_occlusionCullingPlanes.end(), "RemoveOcclusionCullingPlane called with an occlusion plane that is not in the occlusion plane list");
+
+            // guard the erase explicitly: AZ_Assert compiles out in release builds,
+            // and erasing end() would be undefined behavior
+            if (itEntry != m_occlusionCullingPlanes.end())
+            {
+                m_occlusionCullingPlanes.erase(itEntry);
+                m_rpiListNeedsUpdate = true;
+            }
+            occlusionCullingPlane = nullptr;
+        }
+
+        //! Moves an existing occlusion culling plane and flags the RPI list for rebuild.
+        void OcclusionCullingPlaneFeatureProcessor::SetTransform(const OcclusionCullingPlaneHandle& occlusionCullingPlane, const AZ::Transform& transform)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "SetTransform called with an invalid handle");
+            occlusionCullingPlane->SetTransform(transform);
+            m_rpiListNeedsUpdate = true;
+        }
+
+        //! Enables or disables a plane's participation in occlusion culling and
+        //! flags the RPI list for rebuild.
+        void OcclusionCullingPlaneFeatureProcessor::SetEnabled(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool enabled)
+        {
+            // message corrected to match the method name, consistent with the sibling methods
+            AZ_Assert(occlusionCullingPlane.get(), "SetEnabled called with an invalid handle");
+            occlusionCullingPlane->SetEnabled(enabled);
+            m_rpiListNeedsUpdate = true;
+        }
+
+        //! Shows or hides the plane's visualization mesh. Does not affect culling,
+        //! so the RPI occlusion list does not need a rebuild.
+        void OcclusionCullingPlaneFeatureProcessor::ShowVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool showVisualization)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "ShowVisualization called with an invalid handle");
+            occlusionCullingPlane->ShowVisualization(showVisualization);
+        }
+
+        //! Switches a plane's visualization material between standard and transparent.
+        //! Visual-only: no RPI occlusion list rebuild is required.
+        void OcclusionCullingPlaneFeatureProcessor::SetTransparentVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool transparentVisualization)
+        {
+            AZ_Assert(occlusionCullingPlane.get(), "SetTransparentVisualization called with an invalid handle");
+            occlusionCullingPlane->SetTransparentVisualization(transparentVisualization);
+        }
+    } // namespace Render
+} // namespace AZ

+ 66 - 0
Gems/Atom/Feature/Common/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h

@@ -0,0 +1,66 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlane.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! This class manages OcclusionCullingPlanes which are used to cull meshes that are inside the view frustum
+        class OcclusionCullingPlaneFeatureProcessor final
+            : public OcclusionCullingPlaneFeatureProcessorInterface
+        {
+        public:
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneFeatureProcessor, "{C3DE91D7-EF7A-4A82-A55F-E22BC52074EA}", OcclusionCullingPlaneFeatureProcessorInterface);
+
+            static void Reflect(AZ::ReflectContext* context);
+
+            OcclusionCullingPlaneFeatureProcessor() = default;
+            virtual ~OcclusionCullingPlaneFeatureProcessor() = default;
+
+            // OcclusionCullingPlaneFeatureProcessorInterface overrides
+            OcclusionCullingPlaneHandle AddOcclusionCullingPlane(const AZ::Transform& transform) override;
+            void RemoveOcclusionCullingPlane(OcclusionCullingPlaneHandle& handle) override;
+            bool IsValidOcclusionCullingPlaneHandle(const OcclusionCullingPlaneHandle& occlusionCullingPlane) const override { return (occlusionCullingPlane.get() != nullptr); }
+            void SetTransform(const OcclusionCullingPlaneHandle& occlusionCullingPlane, const AZ::Transform& transform) override;
+            void SetEnabled(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool enable) override;
+            void ShowVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool showVisualization) override;
+            void SetTransparentVisualization(const OcclusionCullingPlaneHandle& occlusionCullingPlane, bool transparentVisualization) override;
+
+            // FeatureProcessor overrides
+            void Activate() override;
+            void Deactivate() override;
+
+            // RPI::SceneNotificationBus overrides ...
+            // rebuilds and publishes the RPI occlusion plane list when m_rpiListNeedsUpdate is set
+            void OnBeginPrepareRender() override;
+
+            // retrieve the full list of occlusion planes
+            using OcclusionCullingPlaneVector = AZStd::vector<AZStd::shared_ptr<OcclusionCullingPlane>>;
+            OcclusionCullingPlaneVector& GetOcclusionCullingPlanes() { return m_occlusionCullingPlanes; }
+
+        private:
+            AZ_DISABLE_COPY_MOVE(OcclusionCullingPlaneFeatureProcessor);
+
+            // list of occlusion planes
+            // initial reserve size for both lists; planes beyond this simply trigger a reallocation
+            const size_t InitialOcclusionCullingPlanesAllocationSize = 64;
+            OcclusionCullingPlaneVector m_occlusionCullingPlanes;
+
+            // prebuilt list of RPI scene occlusion planes
+            RPI::CullingScene::OcclusionPlaneVector m_rpiOcclusionPlanes;
+            // set whenever a plane is added/removed/moved/enabled; consumed by OnBeginPrepareRender
+            bool m_rpiListNeedsUpdate = false;
+        };
+    } // namespace Render
+} // namespace AZ

+ 4 - 0
Gems/Atom/Feature/Common/Code/atom_feature_common_files.cmake

@@ -175,6 +175,10 @@ set(FILES
     Source/MorphTargets/MorphTargetComputePass.h
     Source/MorphTargets/MorphTargetDispatchItem.cpp
     Source/MorphTargets/MorphTargetDispatchItem.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessor.cpp
+    Source/OcclusionCullingPlane/OcclusionCullingPlane.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlane.cpp
     Source/PostProcess/PostProcessBase.cpp
     Source/PostProcess/PostProcessBase.h
     Source/PostProcess/PostProcessFeatureProcessor.cpp

+ 1 - 0
Gems/Atom/Feature/Common/Code/atom_feature_common_public_files.cmake

@@ -45,6 +45,7 @@ set(FILES
     Include/Atom/Feature/ParamMacros/StartParamFunctionsVirtual.inl
     Include/Atom/Feature/ParamMacros/StartParamMembers.inl
     Include/Atom/Feature/ParamMacros/StartParamSerializeContext.inl
+    Include/Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h
     Include/Atom/Feature/PostProcess/PostProcessFeatureProcessorInterface.h
     Include/Atom/Feature/PostProcess/PostProcessParams.inl
     Include/Atom/Feature/PostProcess/PostProcessSettings.inl

+ 15 - 2
Gems/Atom/RPI/Code/CMakeLists.txt

@@ -9,6 +9,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #
 
+ly_get_list_relative_pal_filename(pal_source_dir ${CMAKE_CURRENT_LIST_DIR}/Source/Platform/${PAL_PLATFORM_NAME})
+
+#for PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED and PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED
+include(${pal_source_dir}/PAL_${PAL_PLATFORM_NAME_LOWERCASE}.cmake)
+
+if(PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED)
+    set(MASKED_OCCLUSION_CULLING_FILES "atom_rpi_masked_occlusion_files.cmake")
+else()
+    set(MASKED_OCCLUSION_CULLING_FILES "")
+endif()
+
 ly_add_target(
     NAME Atom_RPI.Public STATIC
     NAMESPACE Gem
@@ -16,11 +27,15 @@ ly_add_target(
         atom_rpi_reflect_files.cmake
         atom_rpi_public_files.cmake
         ../Assets/atom_rpi_asset_files.cmake
+        ${pal_source_dir}/platform_${PAL_PLATFORM_NAME_LOWERCASE}_files.cmake
+        ${MASKED_OCCLUSION_CULLING_FILES}
     INCLUDE_DIRECTORIES
         PRIVATE
             Source
+            ${pal_source_dir}
         PUBLIC
             Include
+            External
     BUILD_DEPENDENCIES
         PRIVATE
             AZ::AtomCore
@@ -159,8 +174,6 @@ if(PAL_TRAIT_BUILD_HOST_TOOLS)
     ly_get_list_relative_pal_filename(pal_source_dir ${CMAKE_CURRENT_LIST_DIR}/Source/Platform/${PAL_PLATFORM_NAME})
     ly_get_list_relative_pal_filename(common_source_dir ${CMAKE_CURRENT_LIST_DIR}/Source/Platform/Common)
 
-    include(${pal_source_dir}/PAL_${PAL_PLATFORM_NAME_LOWERCASE}.cmake) #for PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED
-
     if(NOT PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED)
 
         # Create a stub

+ 98 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/CompilerSpecific.inl

@@ -0,0 +1,98 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common shared include file to hide compiler/os specific functions from the rest of the code. 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+	#define __MICROSOFT_COMPILER
+#endif
+
+#if defined(_WIN32)	&& (defined(_MSC_VER) || defined(__INTEL_COMPILER) || defined(__clang__)) // Windows: MSVC / Intel compiler / clang
+	#include <intrin.h>
+	#include <new.h>
+
+	#define FORCE_INLINE __forceinline
+
+	FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
+	{
+		unsigned long idx;
+		_BitScanForward(&idx, *mask);
+		*mask &= *mask - 1;
+		return idx;
+	}
+
+	FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
+	{
+		return _aligned_malloc(size, alignment);
+	}
+
+	FORCE_INLINE void aligned_free(void *ptr)
+	{
+		_aligned_free(ptr);
+	}
+
+#elif defined(__GNUG__)	|| defined(__clang__) // G++ or clang
+	#include <cpuid.h>
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__)
+	#include <malloc/malloc.h> // memalign
+#else
+	#include <malloc.h> // memalign
+#endif
+	#include <mm_malloc.h>
+	#include <immintrin.h>
+	#include <new>
+
+	#define FORCE_INLINE inline
+
+	FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
+	{
+		unsigned long idx;
+		idx = __builtin_ctzl(*mask);
+		*mask &= *mask - 1;
+		return idx;
+	}
+
+	FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
+	{
+		return memalign(alignment, size);
+	}
+
+	FORCE_INLINE void aligned_free(void *ptr)
+	{
+		free(ptr);
+	}
+
+	FORCE_INLINE void __cpuidex(int* cpuinfo, int function, int subfunction)
+	{
+		__cpuid_count(function, subfunction, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+	}
+
+	FORCE_INLINE unsigned long long _xgetbv(unsigned int index)
+	{
+		unsigned int eax, edx;
+		__asm__ __volatile__(
+			"xgetbv;"
+			: "=a" (eax), "=d"(edx)
+			: "c" (index)
+		);
+		return ((unsigned long long)edx << 32) | eax;
+	}
+
+#else
+	#error Unsupported compiler
+#endif

+ 181 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/LICENSE.txt

@@ -0,0 +1,181 @@
+ 
+Apache License
+ Version 2.0, January 2004
+
+ http://www.apache.org/licenses/ 
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and 
+distribution as defined by Sections 1 through 9 of this document. 
+
+"Licensor" shall mean the copyright owner or entity authorized by the copyright 
+owner that is granting the License. 
+
+"Legal Entity" shall mean the union of the acting entity and all other entities 
+that control, are controlled by, or are under common control with that entity. 
+For the purposes of this definition, "control" means (i) the power, direct or 
+indirect, to cause the direction or management of such entity, whether by 
+contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 
+outstanding shares, or (iii) beneficial ownership of such entity. 
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising 
+permissions granted by this License. 
+
+"Source" form shall mean the preferred form for making modifications, including 
+but not limited to software source code, documentation source, and configuration 
+files. 
+
+"Object" form shall mean any form resulting from mechanical transformation or 
+translation of a Source form, including but not limited to compiled object code, 
+generated documentation, and conversions to other media types. 
+
+"Work" shall mean the work of authorship, whether in Source or Object form, made 
+available under the License, as indicated by a copyright notice that is included 
+in or attached to the work (an example is provided in the Appendix below). 
+
+"Derivative Works" shall mean any work, whether in Source or Object form, that 
+is based on (or derived from) the Work and for which the editorial revisions, 
+annotations, elaborations, or other modifications represent, as a whole, an 
+original work of authorship. For the purposes of this License, Derivative Works 
+shall not include works that remain separable from, or merely link (or bind by 
+name) to the interfaces of, the Work and Derivative Works thereof. 
+
+"Contribution" shall mean any work of authorship, including the original version 
+of the Work and any modifications or additions to that Work or Derivative Works 
+thereof, that is intentionally submitted to Licensor for inclusion in the Work 
+by the copyright owner or by an individual or Legal Entity authorized to submit 
+on behalf of the copyright owner. For the purposes of this definition, 
+"submitted" means any form of electronic, verbal, or written communication sent 
+to the Licensor or its representatives, including but not limited to 
+communication on electronic mailing lists, source code control systems, and 
+issue tracking systems that are managed by, or on behalf of, the Licensor for 
+the purpose of discussing and improving the Work, but excluding communication 
+that is conspicuously marked or otherwise designated in writing by the copyright 
+owner as "Not a Contribution." 
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on behalf 
+of whom a Contribution has been received by Licensor and subsequently 
+incorporated within the Work. 
+
+2. Grant of Copyright License. Subject to the terms and conditions of this 
+License, each Contributor hereby grants to You a perpetual, worldwide, 
+non-exclusive, no-charge, royalty-free, irrevocable copyright license to 
+reproduce, prepare Derivative Works of, publicly display, publicly perform, 
+sublicense, and distribute the Work and such Derivative Works in Source or 
+Object form. 
+
+3. Grant of Patent License. Subject to the terms and conditions of this License, 
+each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, 
+no-charge, royalty-free, irrevocable (except as stated in this section) patent 
+license to make, have made, use, offer to sell, sell, import, and otherwise 
+transfer the Work, where such license applies only to those patent claims 
+licensable by such Contributor that are necessarily infringed by their 
+Contribution(s) alone or by combination of their Contribution(s) with the Work 
+to which such Contribution(s) was submitted. If You institute patent litigation 
+against any entity (including a cross-claim or counterclaim in a lawsuit) 
+alleging that the Work or a Contribution incorporated within the Work 
+constitutes direct or contributory patent infringement, then any patent licenses 
+granted to You under this License for that Work shall terminate as of the date 
+such litigation is filed. 
+
+4. Redistribution. You may reproduce and distribute copies of the Work or 
+Derivative Works thereof in any medium, with or without modifications, and in 
+Source or Object form, provided that You meet the following conditions: 
+  You must give any other recipients of the Work or Derivative Works a copy of 
+  this License; and 
+
+
+  You must cause any modified files to carry prominent notices stating that You 
+  changed the files; and 
+
+
+  You must retain, in the Source form of any Derivative Works that You 
+  distribute, all copyright, patent, trademark, and attribution notices from the 
+  Source form of the Work, excluding those notices that do not pertain to any 
+  part of the Derivative Works; and 
+
+
+  If the Work includes a "NOTICE" text file as part of its distribution, then 
+  any Derivative Works that You distribute must include a readable copy of the 
+  attribution notices contained within such NOTICE file, excluding those notices 
+  that do not pertain to any part of the Derivative Works, in at least one of 
+  the following places: within a NOTICE text file distributed as part of the 
+  Derivative Works; within the Source form or documentation, if provided along 
+  with the Derivative Works; or, within a display generated by the Derivative 
+  Works, if and wherever such third-party notices normally appear. The contents 
+  of the NOTICE file are for informational purposes only and do not modify the 
+  License. You may add Your own attribution notices within Derivative Works that 
+  You distribute, alongside or as an addendum to the NOTICE text from the Work, 
+  provided that such additional attribution notices cannot be construed as 
+  modifying the License.
+You may add Your own copyright statement to Your modifications and may provide 
+additional or different license terms and conditions for use, reproduction, or 
+distribution of Your modifications, or for any such Derivative Works as a whole, 
+provided Your use, reproduction, and distribution of the Work otherwise complies 
+with the conditions stated in this License. 
+
+5. Submission of Contributions. Unless You explicitly state otherwise, any 
+Contribution intentionally submitted for inclusion in the Work by You to the 
+Licensor shall be under the terms and conditions of this License, without any 
+additional terms or conditions. Notwithstanding the above, nothing herein shall 
+supersede or modify the terms of any separate license agreement you may have 
+executed with Licensor regarding such Contributions. 
+
+6. Trademarks. This License does not grant permission to use the trade names, 
+trademarks, service marks, or product names of the Licensor, except as required 
+for reasonable and customary use in describing the origin of the Work and 
+reproducing the content of the NOTICE file. 
+
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 
+writing, Licensor provides the Work (and each Contributor provides its 
+Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
+KIND, either express or implied, including, without limitation, any warranties 
+or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 
+PARTICULAR PURPOSE. You are solely responsible for determining the 
+appropriateness of using or redistributing the Work and assume any risks 
+associated with Your exercise of permissions under this License. 
+
+8. Limitation of Liability. In no event and under no legal theory, whether in 
+tort (including negligence), contract, or otherwise, unless required by 
+applicable law (such as deliberate and grossly negligent acts) or agreed to in 
+writing, shall any Contributor be liable to You for damages, including any 
+direct, indirect, special, incidental, or consequential damages of any character 
+arising as a result of this License or out of the use or inability to use the 
+Work (including but not limited to damages for loss of goodwill, work stoppage, 
+computer failure or malfunction, or any and all other commercial damages or 
+losses), even if such Contributor has been advised of the possibility of such 
+damages. 
+
+9. Accepting Warranty or Additional Liability. While redistributing the Work or 
+Derivative Works thereof, You may choose to offer, and charge a fee for, 
+acceptance of support, warranty, indemnity, or other liability obligations 
+and/or rights consistent with this License. However, in accepting such 
+obligations, You may act only on Your own behalf and on Your sole 
+responsibility, not on behalf of any other Contributor, and only if You agree to 
+indemnify, defend, and hold each Contributor harmless for any liability incurred 
+by, or claims asserted against, such Contributor by reason of your accepting any 
+such warranty or additional liability. 
+
+END OF TERMS AND CONDITIONS 
+
+APPENDIX: How to apply the Apache License to your work 
+
+To apply the Apache License to your work, attach the following boilerplate 
+notice, with the fields enclosed by brackets "[]" replaced with your own 
+identifying information. (Don't include the brackets!) The text should be 
+enclosed in the appropriate comment syntax for the file format. We also 
+recommend that a file or class name and description of purpose be included on 
+the same "printed page" as the copyright notice for easier identification within 
+third-party archives. 
+
+Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, 
+Version 2.0 (the "License"); you may not use this file except in compliance with 
+the License. You may obtain a copy of the License at 
+http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or 
+agreed to in writing, software distributed under the License is distributed on 
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
+or implied. See the License for the specific language governing permissions and 
+limitations under the License.

+ 456 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp

@@ -0,0 +1,456 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#include <vector>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include "MaskedOcclusionCulling.h"
+#include "CompilerSpecific.inl"
+
+#if MOC_RECORDER_ENABLE
+#include "FrameRecorder.h"
+#endif
+
+#if defined(__AVX__) || defined(__AVX2__)
+	// For performance reasons, the MaskedOcclusionCullingAVX2/512.cpp files should be compiled with VEX encoding for SSE instructions (to avoid 
+	// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, this file
+	// _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to use lowest supported target platform 
+	// (/arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
+	#error The MaskedOcclusionCulling.cpp should be compiled with lowest supported target platform, e.g. /arch:SSE2
+#endif
+
+static MaskedOcclusionCulling::Implementation DetectCPUFeatures(MaskedOcclusionCulling::pfnAlignedAlloc alignedAlloc, MaskedOcclusionCulling::pfnAlignedFree alignedFree)
+{
+	struct CpuInfo { int regs[4]; };
+
+	// Get regular CPUID values
+	int regs[4];
+	__cpuidex(regs, 0, 0);
+
+    //  MOCVectorAllocator<CpuInfo> mocalloc( alignedAlloc, alignedFree );
+    //  std::vector<CpuInfo, MOCVectorAllocator<CpuInfo>> cpuId( mocalloc ), cpuIdEx( mocalloc );
+    //  cpuId.resize( regs[0] );
+    size_t cpuIdCount = regs[0];
+    CpuInfo * cpuId = (CpuInfo*)alignedAlloc( 64, sizeof(CpuInfo) * cpuIdCount );
+    
+	for (size_t i = 0; i < cpuIdCount; ++i)
+		__cpuidex(cpuId[i].regs, (int)i, 0);
+
+	// Get extended CPUID values
+	__cpuidex(regs, 0x80000000, 0);
+
+    //cpuIdEx.resize(regs[0] - 0x80000000);
+    size_t cpuIdExCount = regs[0] - 0x80000000;
+    CpuInfo * cpuIdEx = (CpuInfo*)alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdExCount );
+
+    for (size_t i = 0; i < cpuIdExCount; ++i)
+		__cpuidex(cpuIdEx[i].regs, 0x80000000 + (int)i, 0);
+
+	#define TEST_BITS(A, B)            (((A) & (B)) == (B))
+	#define TEST_FMA_MOVE_OXSAVE       (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 12) | (1 << 22) | (1 << 27)))
+	#define TEST_LZCNT                 (cpuIdExCount >= 1 && TEST_BITS(cpuIdEx[1].regs[2], 0x20))
+	#define TEST_SSE41                 (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 19)))
+	#define TEST_XMM_YMM               (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 2) | (1 << 1)))
+	#define TEST_OPMASK_ZMM            (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 7) | (1 << 6) | (1 << 5)))
+	#define TEST_BMI1_BMI2_AVX2        (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 3) | (1 << 5) | (1 << 8)))
+	#define TEST_AVX512_F_BW_DQ        (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 16) | (1 << 17) | (1 << 30)))
+
+    MaskedOcclusionCulling::Implementation retVal = MaskedOcclusionCulling::SSE2;
+	if (TEST_FMA_MOVE_OXSAVE && TEST_LZCNT && TEST_SSE41)
+	{
+		if (TEST_XMM_YMM && TEST_OPMASK_ZMM && TEST_BMI1_BMI2_AVX2 && TEST_AVX512_F_BW_DQ)
+			retVal = MaskedOcclusionCulling::AVX512;
+		else if (TEST_XMM_YMM && TEST_BMI1_BMI2_AVX2)
+			retVal = MaskedOcclusionCulling::AVX2;
+	} 
+    else if (TEST_SSE41)
+		retVal = MaskedOcclusionCulling::SSE41;
+    alignedFree( cpuId );
+    alignedFree( cpuIdEx );
+    return retVal;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Utility functions (not directly related to the algorithm/rasterizer)
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void MaskedOcclusionCulling::TransformVertices(const float *mtx, const float *inVtx, float *xfVtx, unsigned int nVtx, const VertexLayout &vtxLayout)
+{
+	// This function pretty slow, about 10-20% slower than if the vertices are stored in aligned SOA form.
+	if (nVtx == 0)
+		return;
+
+	// Load matrix and swizzle out the z component. For post-multiplication (OGL), the matrix is assumed to be column 
+	// major, with one column per SSE register. For pre-multiplication (DX), the matrix is assumed to be row major.
+	__m128 mtxCol0 = _mm_loadu_ps(mtx);
+	__m128 mtxCol1 = _mm_loadu_ps(mtx + 4);
+	__m128 mtxCol2 = _mm_loadu_ps(mtx + 8);
+	__m128 mtxCol3 = _mm_loadu_ps(mtx + 12);
+
+	int stride = vtxLayout.mStride;
+	const char *vPtr = (const char *)inVtx;
+	float *outPtr = xfVtx;
+
+	// Iterate through all vertices and transform
+	for (unsigned int vtx = 0; vtx < nVtx; ++vtx)
+	{
+		__m128 xVal = _mm_load1_ps((float*)(vPtr));
+		__m128 yVal = _mm_load1_ps((float*)(vPtr + vtxLayout.mOffsetY));
+		__m128 zVal = _mm_load1_ps((float*)(vPtr + vtxLayout.mOffsetZ));
+
+		__m128 xform = _mm_add_ps(_mm_mul_ps(mtxCol0, xVal), _mm_add_ps(_mm_mul_ps(mtxCol1, yVal), _mm_add_ps(_mm_mul_ps(mtxCol2, zVal), mtxCol3)));
+		_mm_storeu_ps(outPtr, xform);
+		vPtr += stride;
+		outPtr += 4;
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Typedefs
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
+typedef MaskedOcclusionCulling::pfnAlignedFree  pfnAlignedFree;
+typedef MaskedOcclusionCulling::VertexLayout    VertexLayout;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common SSE2/SSE4.1 defines
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_LANES             4
+#define TILE_HEIGHT_SHIFT      2
+
+#define SIMD_LANE_IDX _mm_setr_epi32(0, 1, 2, 3)
+
+#define SIMD_SUB_TILE_COL_OFFSET _mm_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET _mm_setzero_si128()
+#define SIMD_SUB_TILE_COL_OFFSET_F _mm_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET_F _mm_setzero_ps()
+
+#define SIMD_LANE_YCOORD_I _mm_setr_epi32(128, 384, 640, 896)
+#define SIMD_LANE_YCOORD_F _mm_setr_ps(128.0f, 384.0f, 640.0f, 896.0f)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common SSE2/SSE4.1 functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef __m128 __mw;
+typedef __m128i __mwi;
+
+#define _mmw_set1_ps                _mm_set1_ps
+#define _mmw_setzero_ps             _mm_setzero_ps
+#define _mmw_and_ps                 _mm_and_ps
+#define _mmw_or_ps                  _mm_or_ps
+#define _mmw_xor_ps                 _mm_xor_ps
+#define _mmw_not_ps(a)              _mm_xor_ps((a), _mm_castsi128_ps(_mm_set1_epi32(~0)))
+#define _mmw_andnot_ps              _mm_andnot_ps
+#define _mmw_neg_ps(a)              _mm_xor_ps((a), _mm_set1_ps(-0.0f))
+#define _mmw_abs_ps(a)              _mm_and_ps((a), _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)))
+#define _mmw_add_ps                 _mm_add_ps
+#define _mmw_sub_ps                 _mm_sub_ps
+#define _mmw_mul_ps                 _mm_mul_ps
+#define _mmw_div_ps                 _mm_div_ps
+#define _mmw_min_ps                 _mm_min_ps
+#define _mmw_max_ps                 _mm_max_ps
+#define _mmw_movemask_ps            _mm_movemask_ps
+#define _mmw_cmpge_ps(a,b)          _mm_cmpge_ps(a, b)
+#define _mmw_cmpgt_ps(a,b)          _mm_cmpgt_ps(a, b)
+#define _mmw_cmpeq_ps(a,b)          _mm_cmpeq_ps(a, b)
+#define _mmw_fmadd_ps(a,b,c)        _mm_add_ps(_mm_mul_ps(a,b), c)
+#define _mmw_fmsub_ps(a,b,c)        _mm_sub_ps(_mm_mul_ps(a,b), c)
+#define _mmw_shuffle_ps             _mm_shuffle_ps
+#define _mmw_insertf32x4_ps(a,b,c)  (b)
+#define _mmw_cvtepi32_ps            _mm_cvtepi32_ps
+#define _mmw_blendv_epi32(a,b,c)    simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
+
+#define _mmw_set1_epi32             _mm_set1_epi32
+#define _mmw_setzero_epi32          _mm_setzero_si128
+#define _mmw_and_epi32              _mm_and_si128
+#define _mmw_or_epi32               _mm_or_si128
+#define _mmw_xor_epi32              _mm_xor_si128
+#define _mmw_not_epi32(a)           _mm_xor_si128((a), _mm_set1_epi32(~0))
+#define _mmw_andnot_epi32           _mm_andnot_si128
+#define _mmw_neg_epi32(a)           _mm_sub_epi32(_mm_set1_epi32(0), (a))
+#define _mmw_add_epi32              _mm_add_epi32
+#define _mmw_sub_epi32              _mm_sub_epi32
+#define _mmw_subs_epu16             _mm_subs_epu16
+#define _mmw_cmpeq_epi32            _mm_cmpeq_epi32
+#define _mmw_cmpgt_epi32            _mm_cmpgt_epi32
+#define _mmw_srai_epi32             _mm_srai_epi32
+#define _mmw_srli_epi32             _mm_srli_epi32
+#define _mmw_slli_epi32             _mm_slli_epi32
+#define _mmw_cvtps_epi32            _mm_cvtps_epi32
+#define _mmw_cvttps_epi32           _mm_cvttps_epi32
+
+#define _mmx_fmadd_ps               _mmw_fmadd_ps
+#define _mmx_max_epi32              _mmw_max_epi32
+#define _mmx_min_epi32              _mmw_min_epi32
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD casting functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
+template<> FORCE_INLINE __m128  simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A) { return A; }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
+
+#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
+	FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
+		union accessor { simd_type m_native; base_type m_array[elements]; }; \
+		is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
+		return acs->m_array; \
+	}
+
+MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
+MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized SSE input assembly function for general vertex gather 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+FORCE_INLINE void GatherVertices(__m128 *vtxX, __m128 *vtxY, __m128 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
+{
+	for (int lane = 0; lane < numLanes; lane++)
+	{
+		for (int i = 0; i < 3; i++)
+		{
+			char *vPtrX = (char *)inVtx + inTrisPtr[lane * 3 + i] * vtxLayout.mStride;
+			char *vPtrY = vPtrX + vtxLayout.mOffsetY;
+			char *vPtrW = vPtrX + vtxLayout.mOffsetW;
+
+			simd_f32(vtxX[i])[lane] = *((float*)vPtrX);
+			simd_f32(vtxY[i])[lane] = *((float*)vPtrY);
+			simd_f32(vtxW[i])[lane] = *((float*)vPtrW);
+		}
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SSE4.1 version
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace MaskedOcclusionCullingSSE41
+{
+	FORCE_INLINE __m128i _mmw_mullo_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); }
+	FORCE_INLINE __m128i _mmw_min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); }
+	FORCE_INLINE __m128i _mmw_max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); }
+	FORCE_INLINE __m128i _mmw_abs_epi32(const __m128i &a) { return _mm_abs_epi32(a); }
+	FORCE_INLINE __m128 _mmw_blendv_ps(const __m128 &a, const __m128 &b, const __m128 &c) { return _mm_blendv_ps(a, b, c); }
+	FORCE_INLINE int _mmw_testz_epi32(const __m128i &a, const __m128i &b) { return _mm_testz_si128(a, b); }
+	FORCE_INLINE __m128 _mmx_dp4_ps(const __m128 &a, const __m128 &b) { return _mm_dp_ps(a, b, 0xFF); }
+	FORCE_INLINE __m128 _mmw_floor_ps(const __m128 &a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+	FORCE_INLINE __m128 _mmw_ceil_ps(const __m128 &a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);	}
+	FORCE_INLINE __m128i _mmw_transpose_epi8(const __m128i &a)
+	{
+		const __m128i shuff = _mm_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
+		return _mm_shuffle_epi8(a, shuff);
+	}
+	FORCE_INLINE __m128i _mmw_sllv_ones(const __m128i &ishift)
+	{
+		__m128i shift = _mm_min_epi32(ishift, _mm_set1_epi32(32));
+
+		// Uses lookup tables and _mm_shuffle_epi8 to perform _mm_sllv_epi32(~0, shift)
+		const __m128i byteShiftLUT = _mm_setr_epi8((char)0xFF, (char)0xFE, (char)0xFC, (char)0xF8, (char)0xF0, (char)0xE0, (char)0xC0, (char)0x80, 0, 0, 0, 0, 0, 0, 0, 0);
+		const __m128i byteShiftOffset = _mm_setr_epi8(0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24);
+		const __m128i byteShiftShuffle = _mm_setr_epi8(0x0, 0x0, 0x0, 0x0, 0x4, 0x4, 0x4, 0x4, 0x8, 0x8, 0x8, 0x8, 0xC, 0xC, 0xC, 0xC);
+
+		__m128i byteShift = _mm_shuffle_epi8(shift, byteShiftShuffle);
+		byteShift = _mm_min_epi8(_mm_subs_epu8(byteShift, byteShiftOffset), _mm_set1_epi8(8));
+		__m128i retMask = _mm_shuffle_epi8(byteShiftLUT, byteShift);
+
+		return retMask;
+	}
+
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE41;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SSE2 version
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace MaskedOcclusionCullingSSE2
+{
+	FORCE_INLINE __m128i _mmw_mullo_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		// Do products for even / odd lanes & merge the result
+		__m128i even = _mm_and_si128(_mm_mul_epu32(a, b), _mm_setr_epi32(~0, 0, ~0, 0));
+		__m128i odd = _mm_slli_epi64(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 32);
+		return _mm_or_si128(even, odd);
+	}
+	FORCE_INLINE __m128i _mmw_min_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		__m128i cond = _mm_cmpgt_epi32(a, b);
+		return _mm_or_si128(_mm_andnot_si128(cond, a), _mm_and_si128(cond, b));
+	}
+	FORCE_INLINE __m128i _mmw_max_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		__m128i cond = _mm_cmpgt_epi32(b, a);
+		return _mm_or_si128(_mm_andnot_si128(cond, a), _mm_and_si128(cond, b));
+	}
+	FORCE_INLINE __m128i _mmw_abs_epi32(const __m128i &a)
+	{
+		__m128i mask = _mm_cmplt_epi32(a, _mm_setzero_si128());
+		return _mm_add_epi32(_mm_xor_si128(a, mask), _mm_srli_epi32(mask, 31));
+	}
+	FORCE_INLINE int _mmw_testz_epi32(const __m128i &a, const __m128i &b)
+	{ 
+		return _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(a, b), _mm_setzero_si128())) == 0xFFFF;
+	}
+	FORCE_INLINE __m128 _mmw_blendv_ps(const __m128 &a, const __m128 &b, const __m128 &c)
+	{	
+		__m128 cond = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(c), 31));
+		return _mm_or_ps(_mm_andnot_ps(cond, a), _mm_and_ps(cond, b));
+	}
+	FORCE_INLINE __m128 _mmx_dp4_ps(const __m128 &a, const __m128 &b)
+	{ 
+		// Product and two shuffle/adds pairs (similar to hadd_ps)
+		__m128 prod = _mm_mul_ps(a, b);
+		__m128 dp = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1)));
+		dp = _mm_add_ps(dp, _mm_shuffle_ps(dp, dp, _MM_SHUFFLE(0, 1, 2, 3)));
+		return dp;
+	}
+	FORCE_INLINE __m128 _mmw_floor_ps(const __m128 &a)
+	{ 
+		int originalMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
+		__m128 rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+		_MM_SET_ROUNDING_MODE(originalMode);
+		return rounded;
+	}
+	FORCE_INLINE __m128 _mmw_ceil_ps(const __m128 &a)
+	{ 
+		int originalMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
+		__m128 rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+		_MM_SET_ROUNDING_MODE(originalMode);
+		return rounded;
+	}
+	FORCE_INLINE __m128i _mmw_transpose_epi8(const __m128i &a)
+	{
+		// Perform transpose through two 16->8 bit pack and byte shifts
+		__m128i res = a;
+		const __m128i mask = _mm_setr_epi8(~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0);
+		res = _mm_packus_epi16(_mm_and_si128(res, mask), _mm_srli_epi16(res, 8));
+		res = _mm_packus_epi16(_mm_and_si128(res, mask), _mm_srli_epi16(res, 8));
+		return res;
+	}
+	FORCE_INLINE __m128i _mmw_sllv_ones(const __m128i &ishift)
+	{
+		__m128i shift = _mmw_min_epi32(ishift, _mm_set1_epi32(32));
+		
+		// Uses scalar approach to perform _mm_sllv_epi32(~0, shift)
+		static const unsigned int maskLUT[33] = {
+			~0U << 0, ~0U << 1, ~0U << 2 ,  ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6 , ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10 , ~0U << 11, ~0U << 12, ~0U << 13, ~0U << 14 , ~0U << 15,
+			~0U << 16, ~0U << 17, ~0U << 18 , ~0U << 19, ~0U << 20, ~0U << 21, ~0U << 22 , ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26 , ~0U << 27, ~0U << 28, ~0U << 29, ~0U << 30 , ~0U << 31,
+			0U };
+
+		__m128i retMask;
+		simd_i32(retMask)[0] = (int)maskLUT[simd_i32(shift)[0]];
+		simd_i32(retMask)[1] = (int)maskLUT[simd_i32(shift)[1]];
+		simd_i32(retMask)[2] = (int)maskLUT[simd_i32(shift)[2]];
+		simd_i32(retMask)[3] = (int)maskLUT[simd_i32(shift)[3]];
+		return retMask;
+	}
+
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE2;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Object construction and allocation
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+namespace MaskedOcclusionCullingAVX512
+{
+	extern MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
+}
+
+namespace MaskedOcclusionCullingAVX2
+{
+	extern MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
+}
+
+MaskedOcclusionCulling *MaskedOcclusionCulling::Create(Implementation RequestedSIMD)
+{
+	return Create(RequestedSIMD, aligned_alloc, aligned_free);
+}
+
+MaskedOcclusionCulling *MaskedOcclusionCulling::Create(Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+{
+	MaskedOcclusionCulling *object = nullptr;
+
+	MaskedOcclusionCulling::Implementation impl = DetectCPUFeatures(alignedAlloc, alignedFree);
+
+	if (RequestedSIMD < impl)
+		impl = RequestedSIMD;
+
+	// Return best supported version
+	if (object == nullptr && impl >= AVX512)
+		object = MaskedOcclusionCullingAVX512::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use AVX512 version
+	if (object == nullptr && impl >= AVX2)
+		object = MaskedOcclusionCullingAVX2::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use AVX2 version
+	if (object == nullptr && impl >= SSE41)
+		object = MaskedOcclusionCullingSSE41::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use SSE4.1 version
+	if (object == nullptr)
+		object = MaskedOcclusionCullingSSE2::CreateMaskedOcclusionCulling(alignedAlloc, alignedFree); // Use SSE2 (slow) version
+
+	return object;
+}
+
+void MaskedOcclusionCulling::Destroy(MaskedOcclusionCulling *moc)
+{
+	pfnAlignedFree alignedFreeCallback = moc->mAlignedFreeCallback;
+	moc->~MaskedOcclusionCulling();
+	alignedFreeCallback(moc);
+}

+ 592 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCulling.h

@@ -0,0 +1,592 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+/*!
+ *  \file MaskedOcclusionCulling.h
+ *  \brief Masked Occlusion Culling
+ * 
+ *  General information
+ *   - Input to all API functions are (x,y,w) clip-space coordinates (x positive left, y positive up, w positive away from camera).
+ *     We entirely skip the z component and instead compute it as 1 / w, see next bullet. For TestRect the input is NDC (x/w, y/w).
+ *   - We use a simple z = 1 / w transform, which is a bit faster than OGL/DX depth transforms. Thus, depth is REVERSED and z = 0 at
+ *     the far plane and z = inf at w = 0. We also have to use a GREATER depth function, which explains why all the conservative
+ *     tests will be reversed compared to what you might be used to (for example zMaxTri >= zMinBuffer is a visibility test)
+ *   - We support different layouts for vertex data (basic AoS and SoA), but note that it's beneficial to store the position data
+ *     as tightly in memory as possible to reduce cache misses. Big strides are bad, so it's beneficial to keep position as a separate
+ *     stream (rather than bundled with attributes) or to keep a copy of the position data for the occlusion culling system.
+ *   - The resolution width must be a multiple of 8 and height a multiple of 4.
+ *   - The hierarchical Z buffer is stored OpenGL-style with the y axis pointing up. This includes the scissor box.
+ *   - This code is only tested with Visual Studio 2015, but should hopefully be easy to port to other compilers.
+ */
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Defines used to configure the implementation
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QUICK_MASK
+/*!
+ * Configure the algorithm used for updating and merging hierarchical z buffer entries. If QUICK_MASK
+ * is defined to 1, use the algorithm from the paper "Masked Software Occlusion Culling", which has good
+ * balance between performance and low leakage. If QUICK_MASK is defined to 0, use the algorithm from
+ * "Masked Depth Culling for Graphics Hardware" which has less leakage, but also lower performance.
+ */
+#define QUICK_MASK                      1
+
+#endif
+
+#ifndef USE_D3D
+/*!
+ * Configures the library for use with Direct3D (default) or OpenGL rendering. This changes whether the 
+ * screen space Y axis points downwards (D3D) or upwards (OGL), and is primarily important in combination 
+ * with the PRECISE_COVERAGE define, where this is important to ensure correct rounding and tie-breaker
+ * behaviour. It also affects the ScissorRect screen space coordinates.
+ */
+#define USE_D3D                         1
+
+#endif
+
+#ifndef PRECISE_COVERAGE
+/*!
+ * Define PRECISE_COVERAGE to 1 to more closely match GPU rasterization rules. The increased precision comes
+ * at a cost of slightly lower performance.
+ */
+#define PRECISE_COVERAGE                1
+
+#endif
+
+#ifndef USE_AVX512
+/*!
+ * Define USE_AVX512 to 1 to enable experimental AVX-512 support. It's currently mostly untested and only
+ * validated on simple examples using Intel SDE. Older compilers may not support AVX-512 intrinsics.
+ */
+#define USE_AVX512                      0
+
+#endif
+
+#ifndef CLIPPING_PRESERVES_ORDER
+/*!
+ * Define CLIPPING_PRESERVES_ORDER to 1 to prevent clipping from reordering triangle rasterization
+ * order; This comes at a cost (approx 3-4%) but removes one source of temporal frame-to-frame instability.
+ */
+#define CLIPPING_PRESERVES_ORDER        1
+
+#endif
+
+#ifndef ENABLE_STATS
+/*!
+ * Define ENABLE_STATS to 1 to gather various statistics during occlusion culling. Can be used for profiling 
+ * and debugging. Note that enabling this function will reduce performance significantly.
+ */
+#define ENABLE_STATS                    0
+
+#endif
+
+#ifndef MOC_RECORDER_ENABLE
+/*!
+ * Define MOC_RECORDER_ENABLE to 1 to enable frame recorder (see FrameRecorder.h/cpp for details)
+ */
+#define MOC_RECORDER_ENABLE		        0
+
+#endif
+
+#if MOC_RECORDER_ENABLE
+#ifndef MOC_RECORDER_ENABLE_PLAYBACK
+/*!
+ * Define MOC_RECORDER_ENABLE_PLAYBACK to 1 to enable compilation of the playback code (not needed 
+   for recording)
+ */
+#define MOC_RECORDER_ENABLE_PLAYBACK    0
+#endif
+#endif
+
+
+#if MOC_RECORDER_ENABLE
+
+#include <mutex>
+
+class FrameRecorder;
+
+#endif // #if MOC_RECORDER_ENABLE
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Masked occlusion culling class
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class MaskedOcclusionCulling 
+{
+public:
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Memory management callback functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	typedef void *(*pfnAlignedAlloc)(size_t alignment, size_t size);
+	typedef void  (*pfnAlignedFree) (void *ptr);
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Enums
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	enum Implementation 
+	{
+		SSE2   = 0,
+		SSE41  = 1,
+		AVX2   = 2,
+		AVX512 = 3
+	};
+
+	enum BackfaceWinding
+	{
+		BACKFACE_NONE = 0,
+		BACKFACE_CW   = 1,
+		BACKFACE_CCW  = 2,
+	};
+
+	enum CullingResult
+	{
+		VISIBLE     = 0x0,
+		OCCLUDED    = 0x1,
+		VIEW_CULLED = 0x3
+	};
+
+	enum ClipPlanes
+	{
+		CLIP_PLANE_NONE   = 0x00,
+		CLIP_PLANE_NEAR   = 0x01,
+		CLIP_PLANE_LEFT   = 0x02,
+		CLIP_PLANE_RIGHT  = 0x04,
+		CLIP_PLANE_BOTTOM = 0x08,
+		CLIP_PLANE_TOP    = 0x10,
+		CLIP_PLANE_SIDES  = (CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP),
+		CLIP_PLANE_ALL    = (CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP | CLIP_PLANE_NEAR)
+	};
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Structs
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	/*!
+	 * Used to specify custom vertex layout. Memory offsets to y and w coordinates are set through 
+	 * mOffsetY and mOffsetW, and vertex stride is given by mStride. It's possible to configure both 
+	 * AoS and SoA layouts. Note that large strides may cause more cache misses and decrease 
+	 * performance. It is advisable to store position data as compactly in memory as possible.
+	 */
+	struct VertexLayout
+	{
+		VertexLayout() {}
+		VertexLayout(int stride, int offsetY, int offsetZW) :
+			mStride(stride), mOffsetY(offsetY), mOffsetW(offsetZW) {}
+
+		int mStride;      //!< byte stride between vertices
+		int mOffsetY;     //!< byte offset from X to Y coordinate
+		union {
+			int mOffsetZ; //!< byte offset from X to Z coordinate
+			int mOffsetW; //!< byte offset from X to W coordinate
+		};
+	};
+
+	/*!
+	 * Used to control scissoring during rasterization. Note that we only provide coarse scissor support. 
+	 * The scissor box x coordinates must be a multiple of 32, and the y coordinates a multiple of 8. 
+	 * Scissoring is mainly meant as a means of enabling binning (sort middle) rasterizers in case
+	 * application developers want to use that approach for multithreading.
+	 */
+	struct ScissorRect
+	{
+		ScissorRect() {}
+		ScissorRect(int minX, int minY, int maxX, int maxY) :
+			mMinX(minX), mMinY(minY), mMaxX(maxX), mMaxY(maxY) {}
+
+		int mMinX; //!< Screen space X coordinate for left side of scissor rect, inclusive and must be a multiple of 32
+		int mMinY; //!< Screen space Y coordinate for bottom side of scissor rect, inclusive and must be a multiple of 8
+		int mMaxX; //!< Screen space X coordinate for right side of scissor rect, <B>non</B> inclusive and must be a multiple of 32
+		int mMaxY; //!< Screen space Y coordinate for top side of scissor rect, <B>non</B> inclusive and must be a multiple of 8
+	};
+
+	/*!
+	 * Used to specify storage area for a binlist, containing triangles. This struct is used for binning 
+	 * and multithreading. The host application is responsible for allocating memory for the binlists.
+	 */
+	struct TriList
+	{
+		unsigned int mNumTriangles; //!< Maximum number of triangles that may be stored in mPtr
+		unsigned int mTriIdx;       //!< Index of next triangle to be written, clear before calling BinTriangles to start from the beginning of the list
+		float		 *mPtr;         //!< Scratchpad buffer allocated by the host application
+	};
+
+	/*!
+	 * Statistics that can be gathered during occluder rendering and visibility to aid debugging 
+	 * and profiling. Must be enabled by changing the ENABLE_STATS define.
+	 */
+	struct OcclusionCullingStatistics
+	{
+		struct
+		{
+			long long mNumProcessedTriangles;  //!< Number of occluder triangles processed in total
+			long long mNumRasterizedTriangles; //!< Number of occluder triangles passing view frustum and backface culling
+			long long mNumTilesTraversed;      //!< Number of tiles traversed by the rasterizer
+			long long mNumTilesUpdated;        //!< Number of tiles where the hierarchical z buffer was updated
+			long long mNumTilesMerged;         //!< Number of tiles where the hierarchical z buffer was merged
+		} mOccluders;
+
+		struct
+		{
+			long long mNumProcessedRectangles; //!< Number of rects processed (TestRect())
+			long long mNumProcessedTriangles;  //!< Number of occludee triangles processed (TestTriangles())
+			long long mNumRasterizedTriangles; //!< Number of occludee triangles passing view frustum and backface culling
+			long long mNumTilesTraversed;      //!< Number of tiles traversed by triangle & rect rasterizers
+		} mOccludees;
+	};
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	/*!
+	 * \brief Creates a new object with default state, no z buffer attached/allocated.
+	 */
+	static MaskedOcclusionCulling *Create(Implementation RequestedSIMD = AVX512);
+	
+	/*!
+	 * \brief Creates a new object with default state, no z buffer attached/allocated.
+	 * \param alignedAlloc Pointer to a callback function used when allocating memory
+	 * \param alignedFree Pointer to a callback function used when freeing memory
+	 */
+	static MaskedOcclusionCulling *Create(Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree);
+
+	/*!
+	 * \brief Destroys an object and frees the z buffer memory. Note that you cannot 
+	 * use the delete operator, and should rather use this function to free up memory.
+	 */
+	static void Destroy(MaskedOcclusionCulling *moc);
+
+	/*!
+	 * \brief Sets the resolution of the hierarchical depth buffer. This function will
+	 *        re-allocate the current depth buffer (if present). The contents of the
+	 *        buffer is undefined until ClearBuffer() is called.
+	 *
+	 * \param width The width of the buffer in pixels, must be a multiple of 8
+	 * \param height The height of the buffer in pixels, must be a multiple of 4
+	 */
+	virtual void SetResolution(unsigned int width, unsigned int height) = 0;
+
+	/*!
+	* \brief Gets the resolution of the hierarchical depth buffer. 
+	*
+	* \param width Output: The width of the buffer in pixels
+	* \param height Output: The height of the buffer in pixels
+	*/
+	virtual void GetResolution(unsigned int &width, unsigned int &height) const = 0;
+
+	/*!
+	 * \brief Returns the tile size for the current implementation.
+	 *
+	 * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param outBinWidth Output: The width of the single bin in pixels (except for the 
+	 *        rightmost bin width, which is extended to resolution width)
+	 * \param outBinHeight Output: The height of the single bin in pixels (except for the 
+	 *        bottommost bin height, which is extended to resolution height)
+	 */
+	virtual void ComputeBinWidthHeight(unsigned int nBinsW, unsigned int nBinsH, unsigned int & outBinWidth, unsigned int & outBinHeight) = 0;
+
+	/*!
+	 * \brief Sets the distance for the near clipping plane. Default is nearDist = 0.
+	 *
+	 * \param nearDist The distance to the near clipping plane, given as clip space w
+	 */
+	virtual void SetNearClipPlane(float nearDist) = 0;
+
+	/*!
+	* \brief Gets the distance for the near clipping plane. 
+	*/
+	virtual float GetNearClipPlane() const = 0;
+
+	/*!
+	 * \brief Clears the hierarchical depth buffer.
+	 */
+	virtual void ClearBuffer() = 0;
+
+	/*!
+	* \brief Merge a second hierarchical depth buffer into the main buffer.
+	*/
+	virtual void MergeBuffer(MaskedOcclusionCulling* BufferB) = 0;
+
+	/*! 
+	 * \brief Renders a mesh of occluder triangles and updates the hierarchical z buffer
+	 *        with conservative depth values.
+	 *
+	 * This function is optimized for vertex layouts with stride 16 and y and w
+	 * offsets of 4 and 12 bytes, respectively.
+	 *
+	 * \param inVtx Pointer to an array of input vertices, should point to the x component
+	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+	 *        in clip space. The memory layout can be changed using vtxLayout.
+	 * \param inTris Pointer to an array of vertex indices. Each triangle is created 
+	 *        from three indices consecutively fetched from the array.
+	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+	 *        entries)
+	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
+	 *        performing projection. If nullptr is passed the transform step will be skipped
+	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+	 *        and will not be rasterized. You may use BACKFACE_NONE to disable culling for
+	 *        double sided geometry
+	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+	 *        triangle clipper. Can be used as an optimization if your application can 
+	 *        determine (for example during culling) that a group of triangles does not 
+	 *        intersect a certain frustum plane. However, setting an incorrect mask may 
+	 *        cause out of bounds memory accesses.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible.
+	 * \return Will return VIEW_CULLED if all triangles are either outside the frustum or
+	 *         backface culled, returns VISIBLE otherwise.
+	 */
+	virtual CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)) = 0;
+
+	/*!
+	 * \brief Occlusion query for a rectangle with a given depth. The rectangle is given 
+	 *        in normalized device coordinates where (x,y) coordinates between [-1,1] map 
+	 *        to the visible screen area. The query uses a GREATER_EQUAL (reversed) depth 
+	 *        test meaning that depth values equal to the contents of the depth buffer are
+	 *        counted as visible.
+	 *
+	 * \param xmin NDC coordinate of the left side of the rectangle.
+	 * \param ymin NDC coordinate of the bottom side of the rectangle.
+	 * \param xmax NDC coordinate of the right side of the rectangle.
+	 * \param ymax NDC coordinate of the top side of the rectangle.
+	 * \param wmin Clip space W coordinate for the rectangle.
+	 * \return The query will return VISIBLE if the rectangle may be visible, OCCLUDED
+	 *         if the rectangle is occluded by a previously rendered  object, or VIEW_CULLED
+	 *         if the rectangle is outside the view frustum.
+	 */
+	virtual CullingResult TestRect(float xmin, float ymin, float xmax, float ymax, float wmin) const = 0;
+
+	/*!
+	 * \brief This function is similar to RenderTriangles(), but performs an occlusion
+	 *        query instead and does not update the hierarchical z buffer. The query uses 
+	 *        a GREATER_EQUAL (reversed) depth test meaning that depth values equal to the 
+	 *        contents of the depth buffer are counted as visible.
+	 *
+	 * This function is optimized for vertex layouts with stride 16 and y and w
+	 * offsets of 4 and 12 bytes, respectively.
+	 *
+	 * \param inVtx Pointer to an array of input vertices, should point to the x component
+	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+	 *        in clip space. The memory layout can be changed using vtxLayout.
+	 * \param inTris Pointer to an array of triangle indices. Each triangle is created 
+	 *        from three indices consecutively fetched from the array.
+	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+	 *        entries)
+	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
+	 *        performing projection. If nullptr is passed the transform step will be skipped
+	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+	 *        and will not be occlusion tested. You may use BACKFACE_NONE to disable culling
+	 *        for double sided geometry
+	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+	 *        triangle clipper. Can be used as an optimization if your application can
+	 *        determine (for example during culling) that a group of triangles does not
+	 *        intersect a certain frustum plane. However, setting an incorrect mask may
+	 *        cause out of bounds memory accesses.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible.
+	 * \return The query will return VISIBLE if the triangle mesh may be visible, OCCLUDED
+	 *         if the mesh is occluded by a previously rendered object, or VIEW_CULLED if all
+	 *         triangles are entirely outside the view frustum or backface culled.
+	 */
+	virtual CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)) = 0;
+
+	/*!
+	 * \brief Perform input assembly, clipping, projection, triangle setup, and write
+	 *        triangles to the screen space bins they overlap. This function can be used to
+	 *        distribute work for threading (See the CullingThreadpool class for an example)
+	 *
+	 * \param inVtx Pointer to an array of input vertices, should point to the x component
+	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+	 *        in clip space. The memory layout can be changed using vtxLayout.
+	 * \param inTris Pointer to an array of vertex indices. Each triangle is created
+	 *        from three indices consecutively fetched from the array.
+	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+	 *        entries)
+	 * \param triLists Pointer to an array of TriList objects with one TriList object per
+	 *        bin. If a triangle overlaps a bin, it will be written to the corresponding
+	 *        trilist. Note that this method appends the triangles to the current list, to
+	 *        start writing from the beginning of the list, set triList.mTriIdx = 0
+	 * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH
+	 *        rectangular bins.
+	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
+	 *        performing projection. If nullptr is passed the transform step will be skipped
+	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+	 *        triangle clipper. Can be used as an optimization if your application can
+	 *        determine (for example during culling) that a group of triangles does not
+	 *        intersect a certain frustum plane. However, setting an incorrect mask may
+	 *        cause out of bounds memory accesses.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible.
+	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+	 *        and will not be binned / rasterized. You may use BACKFACE_NONE to disable culling
+	 *        for double sided geometry
+	 */
+	virtual void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)) = 0;
+
+	/*!
+	 * \brief Renders all occluder triangles in a trilist. This function can be used in
+	 *        combination with BinTriangles() to create a threaded (binning) rasterizer. The
+	 *        bins can be processed independently by different threads without risking writing
+	 *        to overlapping memory regions.
+	 *
+	 * \param triLists A triangle list, filled using the BinTriangles() function that is to
+	 *        be rendered.
+	 * \param scissor A scissor box limiting the rendering region to the bin. The size of each
+	 *        bin must be a multiple of 32x8 pixels due to implementation constraints. For a
+	 *        render target with (width, height) resolution and (nBinsW, nBinsH) bins, the
+	 *        size of a bin is:
+	 *          binWidth = (width / nBinsW) - (width / nBinsW) % 32;
+	 *          binHeight = (height / nBinsH) - (height / nBinsH) % 8;
+	 *        The last row and column of tiles have a different size:
+	 *          lastColBinWidth = width - (nBinsW-1)*binWidth;
+	 *          lastRowBinHeight = height - (nBinsH-1)*binHeight;
+	 */
+	virtual void RenderTrilist(const TriList &triList, const ScissorRect *scissor) = 0;
+
+	/*!
+	 * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation.
+	 *        Intended for visualizing the hierarchical depth buffer for debugging. The 
+	 *        buffer is written in scanline order, from the top to bottom (D3D) or bottom to 
+	 *        top (OGL) of the surface. See the USE_D3D define.
+	 *
+	 * \param depthData Pointer to memory where the per-pixel depth data is written. Must
+	 *        hold storage for at least width*height elements as set by setResolution.
+	 */
+	virtual void ComputePixelDepthBuffer(float *depthData, bool flipY) = 0;
+	
+	/*!
+	 * \brief Fetch occlusion culling statistics, returns zeroes if ENABLE_STATS define is
+	 *        not defined. The statistics can be used for profiling or debugging.
+	 */
+	virtual OcclusionCullingStatistics GetStatistics() = 0;
+
+	/*!
+	 * \brief Returns the implementation (CPU instruction set) version of this object.
+	 */
+	virtual Implementation GetImplementation() = 0;
+
+	/*!
+	 * \brief Utility function for transforming vertices and outputting them to an (x,y,z,w)
+	 *        format suitable for the occluder rasterization and occludee testing functions.
+	 *
+	 * \param mtx Pointer to matrix data. The matrix should be column major for post 
+	 *        multiplication (OGL) and row major for pre-multiplication (DX). This is 
+	 *        consistent with OpenGL / DirectX behavior.
+	 * \param inVtx Pointer to an array of input vertices. The input vertices are given as
+	 *        (x,y,z) coordinates. The memory layout can be changed using vtxLayout.
+	 * \param xfVtx Pointer to an array to store transformed vertices. The transformed
+	 *        vertices are always stored as array of structs (AoS) (x,y,z,w) packed in memory.
+	 * \param nVtx Number of vertices to transform.
+	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 
+	 *        description). For best performance, it is advisable to store position data
+	 *        as compactly in memory as possible. Note that for this function, the
+	 *        w-component is assumed to be 1.0.
+	 */
+	static void TransformVertices(const float *mtx, const float *inVtx, float *xfVtx, unsigned int nVtx, const VertexLayout &vtxLayout = VertexLayout(12, 4, 8));
+
+	/*!
+	 * \brief Get used memory alloc/free callbacks.
+     */
+    void GetAllocFreeCallback( pfnAlignedAlloc & allocCallback, pfnAlignedFree & freeCallback ) { allocCallback = mAlignedAllocCallback, freeCallback = mAlignedFreeCallback; }
+
+#if MOC_RECORDER_ENABLE
+    /*!
+	 * \brief Start recording subsequent rasterization and testing calls using the FrameRecorder.
+     *        The function calls that are recorded are:
+     *         - ClearBuffer
+	 *         - RenderTriangles
+     *         - TestTriangles
+     *         - TestRect
+     *        All inputs and outputs are recorded, which can be used for correctness validation
+     *        and performance testing.
+     *
+	 * \param outputFilePath Pointer to name of the output file. 
+	 * \return 'true' if recording was started successfully, 'false' otherwise (file access error).
+	 */
+    bool RecorderStart( const char * outputFilePath ) const;
+
+    /*!
+	 * \brief Stop recording, flush output and release used memory.
+	 */
+    void RecorderStop( ) const;
+
+    /*!
+	 * \brief Manually record triangles. This is called automatically from MaskedOcclusionCulling::RenderTriangles 
+     *  if the recording is started, but not from BinTriangles/RenderTrilist (used in multithreaded codepath), in
+     *  which case it has to be called manually.
+     *
+     * \param inVtx Pointer to an array of input vertices, should point to the x component
+     *        of the first vertex. The input vertices are given as (x,y,w) coordinates
+     *        in clip space. The memory layout can be changed using vtxLayout.
+     * \param inTris Pointer to an array of triangle indices. Each triangle is created
+     *        from three indices consecutively fetched from the array.
+     * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
+     *        entries)
+     * \param modelToClipMatrix all vertices will be transformed by this matrix before
+     *        performing projection. If nullptr is passed the transform step will be skipped
+     * \param bfWinding Sets triangle winding order to consider backfacing, must be one
+     *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
+     *        and will not be occlusion tested. You may use BACKFACE_NONE to disable culling
+     *        for double sided geometry
+     * \param clipPlaneMask A mask indicating which clip planes should be considered by the
+     *        triangle clipper. Can be used as an optimization if your application can
+     *        determine (for example during culling) that a group of triangles does not
+     *        intersect a certain frustum plane. However, setting an incorrect mask may
+     *        cause out of bounds memory accesses.
+     * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
+     *        description). For best performance, it is advisable to store position data
+     *        as compactly in memory as possible.
+     * \param cullingResult cull result value expected to be returned by executing the
+     *        RenderTriangles call with recorded parameters.
+	 */
+    // Note: when using the binned/multithreaded codepath, the binned data must be merged back
+    // into the original layout; in that case, call this manually from your threadpool
+    // implementation (already added to CullingThreadpool).
+    // If recording is not enabled, calling this function will do nothing.
+    void RecordRenderTriangles( const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix = nullptr, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, BackfaceWinding bfWinding = BACKFACE_CW, const VertexLayout &vtxLayout = VertexLayout( 16, 4, 12 ), CullingResult cullingResult = (CullingResult)-1 );
+#endif // #if MOC_RECORDER_ENABLE
+
+protected:
+	pfnAlignedAlloc mAlignedAllocCallback;
+	pfnAlignedFree  mAlignedFreeCallback;
+
+	mutable OcclusionCullingStatistics mStats;
+
+#if MOC_RECORDER_ENABLE
+    mutable FrameRecorder * mRecorder;
+    mutable std::mutex mRecorderMutex;
+#endif // #if MOC_RECORDER_ENABLE
+
+	virtual ~MaskedOcclusionCulling() {}
+};

+ 243 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp

@@ -0,0 +1,243 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include "MaskedOcclusionCulling.h"
+#include "CompilerSpecific.inl"
+
+#if MOC_RECORDER_ENABLE
+#include "FrameRecorder.h"
+#endif
+
+#if defined(__MICROSOFT_COMPILER) && _MSC_VER < 1900
+	// If you remove/comment this error, the code will compile & use the SSE41 version instead. 
+	#error Versions older than Visual Studio 2015 are not supported due to compiler bug(s)
+#endif
+
+#if !defined(__MICROSOFT_COMPILER) || _MSC_VER >= 1900
+
+// For performance reasons, the MaskedOcclusionCullingAVX2.cpp file should be compiled with VEX encoding for SSE instructions (to avoid 
+// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, the SSE
+// version in MaskedOcclusionCulling.cpp _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to 
+// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
+#ifndef __AVX2__
+	#error For best performance, MaskedOcclusionCullingAVX2.cpp should be compiled with /arch:AVX2
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific defines and constants
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_LANES             8
+#define TILE_HEIGHT_SHIFT      3
+
+#define SIMD_LANE_IDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7)
+
+#define SIMD_SUB_TILE_COL_OFFSET _mm256_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET _mm256_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
+#define SIMD_SUB_TILE_COL_OFFSET_F _mm256_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET_F _mm256_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
+
+#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF)
+
+#define SIMD_LANE_YCOORD_I _mm256_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920)
+#define SIMD_LANE_YCOORD_F _mm256_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific typedefs and functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef __m256 __mw;
+typedef __m256i __mwi;
+
+#define _mmw_set1_ps                _mm256_set1_ps
+#define _mmw_setzero_ps             _mm256_setzero_ps
+#define _mmw_and_ps                 _mm256_and_ps
+#define _mmw_or_ps                  _mm256_or_ps
+#define _mmw_xor_ps                 _mm256_xor_ps
+#define _mmw_not_ps(a)              _mm256_xor_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(~0)))
+#define _mmw_andnot_ps              _mm256_andnot_ps
+#define _mmw_neg_ps(a)              _mm256_xor_ps((a), _mm256_set1_ps(-0.0f))
+#define _mmw_abs_ps(a)              _mm256_and_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)))
+#define _mmw_add_ps                 _mm256_add_ps
+#define _mmw_sub_ps                 _mm256_sub_ps
+#define _mmw_mul_ps                 _mm256_mul_ps
+#define _mmw_div_ps                 _mm256_div_ps
+#define _mmw_min_ps                 _mm256_min_ps
+#define _mmw_max_ps                 _mm256_max_ps
+#define _mmw_fmadd_ps               _mm256_fmadd_ps
+#define _mmw_fmsub_ps               _mm256_fmsub_ps
+#define _mmw_movemask_ps            _mm256_movemask_ps
+#define _mmw_blendv_ps              _mm256_blendv_ps
+#define _mmw_cmpge_ps(a,b)          _mm256_cmp_ps(a, b, _CMP_GE_OQ)
+#define _mmw_cmpgt_ps(a,b)          _mm256_cmp_ps(a, b, _CMP_GT_OQ)
+#define _mmw_cmpeq_ps(a,b)          _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
+#define _mmw_floor_ps(x)            _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
+#define _mmw_ceil_ps(x)             _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
+#define _mmw_shuffle_ps             _mm256_shuffle_ps
+#define _mmw_insertf32x4_ps         _mm256_insertf128_ps
+#define _mmw_cvtepi32_ps            _mm256_cvtepi32_ps
+#define _mmw_blendv_epi32(a,b,c)    simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
+
+#define _mmw_set1_epi32             _mm256_set1_epi32
+#define _mmw_setzero_epi32          _mm256_setzero_si256
+#define _mmw_and_epi32              _mm256_and_si256
+#define _mmw_or_epi32               _mm256_or_si256
+#define _mmw_xor_epi32              _mm256_xor_si256
+#define _mmw_not_epi32(a)           _mm256_xor_si256((a), _mm256_set1_epi32(~0))
+#define _mmw_andnot_epi32           _mm256_andnot_si256
+#define _mmw_neg_epi32(a)           _mm256_sub_epi32(_mm256_set1_epi32(0), (a))
+#define _mmw_add_epi32              _mm256_add_epi32
+#define _mmw_sub_epi32              _mm256_sub_epi32
+#define _mmw_min_epi32              _mm256_min_epi32
+#define _mmw_max_epi32              _mm256_max_epi32
+#define _mmw_subs_epu16             _mm256_subs_epu16
+#define _mmw_mullo_epi32            _mm256_mullo_epi32
+#define _mmw_cmpeq_epi32            _mm256_cmpeq_epi32
+#define _mmw_testz_epi32            _mm256_testz_si256
+#define _mmw_cmpgt_epi32            _mm256_cmpgt_epi32
+#define _mmw_srai_epi32             _mm256_srai_epi32
+#define _mmw_srli_epi32             _mm256_srli_epi32
+#define _mmw_slli_epi32             _mm256_slli_epi32
+#define _mmw_sllv_ones(x)           _mm256_sllv_epi32(SIMD_BITS_ONE, x)
+#define _mmw_transpose_epi8(x)      _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
+#define _mmw_abs_epi32              _mm256_abs_epi32
+#define _mmw_cvtps_epi32            _mm256_cvtps_epi32
+#define _mmw_cvttps_epi32           _mm256_cvttps_epi32
+
+#define _mmx_dp4_ps(a, b)           _mm_dp_ps(a, b, 0xFF)
+#define _mmx_fmadd_ps               _mm_fmadd_ps
+#define _mmx_max_epi32              _mm_max_epi32
+#define _mmx_min_epi32              _mm_min_epi32
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD casting functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// simd_cast<T>(x): uniform helper to reinterpret between same-width SIMD register types
+// (bit cast, zero cost) or splat a scalar into all lanes. Used by the width-generic
+// _mmw_* macros above so the common .inl code can be SIMD-width agnostic.
+template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
+template<> FORCE_INLINE __m128  simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A) { return A; }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(float A) { return _mm256_set1_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256i A) { return _mm256_castsi256_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256 A) { return A; }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(int A) { return _mm256_set1_epi32(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256 A) { return _mm256_castps_si256(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256i A) { return A; }
+
+// Generates simd_f32()/simd_i32() accessors that expose the individual lanes of a SIMD
+// register as a plain scalar array. A union is used for the reinterpretation to avoid
+// strict-aliasing violations from a direct pointer cast.
+#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
+	FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
+		union accessor { simd_type m_native; base_type m_array[elements]; }; \
+		is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
+		return acs->m_array; \
+	}
+
+MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
+MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
+
+MAKE_ACCESSOR(simd_f32, __m256, float, , 8)
+MAKE_ACCESSOR(simd_f32, __m256, float, const, 8)
+MAKE_ACCESSOR(simd_i32, __m256, int, , 8)
+MAKE_ACCESSOR(simd_i32, __m256i, int, const, 8)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized AVX input assembly function for general vertex gather 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
+
+// Gathers the three vertices of up to 8 triangles (one triangle per AVX2 lane) from an
+// indexed triangle list into SoA registers. Only (x, y, w) are fetched; the input memory
+// layout (stride in bytes, y/w byte offsets) is described by vtxLayout.
+FORCE_INLINE void GatherVertices(__m256 *vtxX, __m256 *vtxY, __m256 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
+{
+	assert(numLanes >= 1);
+
+	// Lane i reads the index triple starting at inTrisPtr[3*i].
+	const __m256i SIMD_TRI_IDX_OFFSET = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
+	// SIMD_LANE_MASK[n] has the low n lanes set to all-ones, the rest zero.
+	static const __m256i SIMD_LANE_MASK[9] = {
+		_mm256_setr_epi32( 0,  0,  0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0,  0,  0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0,  0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0,  0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0,  0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0,  0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0,  0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0,  0),
+		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0)
+	};
+
+	// Compute per-lane index list offset that guards against out of bounds memory accesses:
+	// lanes beyond numLanes get offset 0 and harmlessly re-read the first triangle.
+	__m256i safeTriIdxOffset = _mm256_and_si256(SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes]);
+
+	// Fetch triangle indices and scale them by the vertex stride to get byte offsets
+	// (the gathers below use scale 1, so vtxIdx holds byte offsets — implies mStride is in bytes).
+	__m256i vtxIdx[3];
+	vtxIdx[0] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 0, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[1] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 1, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[2] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 2, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
+
+	char *vPtr = (char *)inVtx;
+
+	// Fetch triangle vertices: x at the base offset, y and w at their layout byte offsets.
+	for (int i = 0; i < 3; i++)
+	{
+		vtxX[i] = _mm256_i32gather_ps((float *)vPtr, vtxIdx[i], 1);
+		vtxY[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetY), vtxIdx[i], 1);
+		vtxW[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetW), vtxIdx[i], 1);
+	}
+}
+
+// AVX2 implementation namespace. The SIMD-width-independent algorithm is textually
+// included below (MaskedOcclusionCullingCommon.inl) and compiles against the 8-lane
+// __mw/__mwi typedefs and _mmw_* macros defined above.
+namespace MaskedOcclusionCullingAVX2
+{
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX2;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	// Allocates 64-byte-aligned storage via the caller-supplied allocator and constructs
+	// the implementation in place. NOTE(review): placement new means the object must be
+	// destroyed and freed through the matching free callback, not plain delete — confirm
+	// the library's Destroy path does this.
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+#else
+
+// Fallback namespace, compiled when the toolchain cannot build the AVX2 path (see the
+// #if guard above). Returning nullptr signals that this instruction set is unavailable;
+// presumably the dispatcher then falls back to SSE — confirm callers handle nullptr.
+// NOTE(review): parameters are intentionally unused here and may trigger
+// unused-parameter warnings.
+namespace MaskedOcclusionCullingAVX2
+{
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		return nullptr;
+	}
+};
+
+#endif

+ 309 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp

@@ -0,0 +1,309 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include "MaskedOcclusionCulling.h"
+#include "CompilerSpecific.inl"
+
+#if MOC_RECORDER_ENABLE
+#include "FrameRecorder.h"
+#endif
+
+// Make sure compiler supports AVX-512 intrinsics: Visual Studio 2017 (Update 3) || Intel C++ Compiler 16.0 || Clang 4.0 || GCC 5.0
+#if USE_AVX512 != 0 && ((defined(_MSC_VER) && _MSC_VER >= 1911) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1600) || (defined(__clang__) && __clang_major__ >= 4) || (defined(__GNUC__) && __GNUC__ >= 5))
+
+// The MaskedOcclusionCullingAVX512.cpp file should be compiled with avx2/avx512 architecture options turned on in the compiler. However, the SSE
+// version in MaskedOcclusionCulling.cpp _must_ be compiled with SSE2 architecture to allow backwards compatibility. Best practice is to 
+// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
+#ifndef __AVX2__
+	#error For best performance, MaskedOcclusionCullingAVX512.cpp should be compiled with /arch:AVX2
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific defines and constants
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_LANES             16
+#define TILE_HEIGHT_SHIFT      4
+
+#define SIMD_LANE_IDX _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+
+#define SIMD_SUB_TILE_COL_OFFSET _mm512_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET _mm512_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3)
+#define SIMD_SUB_TILE_COL_OFFSET_F _mm512_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
+#define SIMD_SUB_TILE_ROW_OFFSET_F _mm512_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3)
+
+#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm512_set_epi32(0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400)
+
+#define SIMD_LANE_YCOORD_I _mm512_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920, 2176, 2432, 2688, 2944, 3200, 3456, 3712, 3968)
+#define SIMD_LANE_YCOORD_F _mm512_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f, 2176.0f, 2432.0f, 2688.0f, 2944.0f, 3200.0f, 3456.0f, 3712.0f, 3968.0f)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// AVX specific typedefs and functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef __m512 __mw;
+typedef __m512i __mwi;
+
+#define _mmw_set1_ps                _mm512_set1_ps
+#define _mmw_setzero_ps             _mm512_setzero_ps
+#define _mmw_and_ps                 _mm512_and_ps
+#define _mmw_or_ps                  _mm512_or_ps
+#define _mmw_xor_ps                 _mm512_xor_ps
+#define _mmw_not_ps(a)              _mm512_xor_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(~0)))
+#define _mmw_andnot_ps              _mm512_andnot_ps
+#define _mmw_neg_ps(a)              _mm512_xor_ps((a), _mm512_set1_ps(-0.0f))
+#define _mmw_abs_ps(a)              _mm512_and_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)))
+#define _mmw_add_ps                 _mm512_add_ps
+#define _mmw_sub_ps                 _mm512_sub_ps
+#define _mmw_mul_ps                 _mm512_mul_ps
+#define _mmw_div_ps                 _mm512_div_ps
+#define _mmw_min_ps                 _mm512_min_ps
+#define _mmw_max_ps                 _mm512_max_ps
+#define _mmw_fmadd_ps               _mm512_fmadd_ps
+#define _mmw_fmsub_ps               _mm512_fmsub_ps
+#define _mmw_shuffle_ps             _mm512_shuffle_ps
+#define _mmw_insertf32x4_ps         _mm512_insertf32x4
+#define _mmw_cvtepi32_ps            _mm512_cvtepi32_ps
+#define _mmw_blendv_epi32(a,b,c)    simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
+
+#define _mmw_set1_epi32             _mm512_set1_epi32
+#define _mmw_setzero_epi32          _mm512_setzero_si512
+#define _mmw_and_epi32              _mm512_and_si512
+#define _mmw_or_epi32               _mm512_or_si512
+#define _mmw_xor_epi32              _mm512_xor_si512
+#define _mmw_not_epi32(a)           _mm512_xor_si512((a), _mm512_set1_epi32(~0))
+#define _mmw_andnot_epi32           _mm512_andnot_si512
+#define _mmw_neg_epi32(a)           _mm512_sub_epi32(_mm512_set1_epi32(0), (a))
+#define _mmw_add_epi32              _mm512_add_epi32
+#define _mmw_sub_epi32              _mm512_sub_epi32
+#define _mmw_min_epi32              _mm512_min_epi32
+#define _mmw_max_epi32              _mm512_max_epi32
+#define _mmw_subs_epu16             _mm512_subs_epu16
+#define _mmw_mullo_epi32            _mm512_mullo_epi32
+#define _mmw_srai_epi32             _mm512_srai_epi32
+#define _mmw_srli_epi32             _mm512_srli_epi32
+#define _mmw_slli_epi32             _mm512_slli_epi32
+#define _mmw_sllv_ones(x)           _mm512_sllv_epi32(SIMD_BITS_ONE, x)
+#define _mmw_transpose_epi8(x)      _mm512_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
+#define _mmw_abs_epi32              _mm512_abs_epi32
+#define _mmw_cvtps_epi32            _mm512_cvtps_epi32
+#define _mmw_cvttps_epi32           _mm512_cvttps_epi32
+
+#define _mmx_dp4_ps(a, b)           _mm_dp_ps(a, b, 0xFF)
+#define _mmx_fmadd_ps               _mm_fmadd_ps
+#define _mmx_max_epi32              _mm_max_epi32
+#define _mmx_min_epi32              _mm_min_epi32
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD casting functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// simd_cast<T>(x): uniform helper to reinterpret between same-width SIMD register types
+// (bit cast, zero cost) or splat a scalar into all lanes. The AVX-512 build provides
+// 128/256/512-bit variants so the shared .inl code can remain width agnostic.
+template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
+template<> FORCE_INLINE __m128  simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
+template<> FORCE_INLINE __m128  simd_cast<__m128>(__m128 A) { return A; }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
+template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(float A) { return _mm256_set1_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256i A) { return _mm256_castsi256_ps(A); }
+template<> FORCE_INLINE __m256  simd_cast<__m256>(__m256 A) { return A; }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(int A) { return _mm256_set1_epi32(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256 A) { return _mm256_castps_si256(A); }
+template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256i A) { return A; }
+template<> FORCE_INLINE __m512  simd_cast<__m512>(float A) { return _mm512_set1_ps(A); }
+template<> FORCE_INLINE __m512  simd_cast<__m512>(__m512i A) { return _mm512_castsi512_ps(A); }
+template<> FORCE_INLINE __m512  simd_cast<__m512>(__m512 A) { return A; }
+template<> FORCE_INLINE __m512i simd_cast<__m512i>(int A) { return _mm512_set1_epi32(A); }
+template<> FORCE_INLINE __m512i simd_cast<__m512i>(__m512 A) { return _mm512_castps_si512(A); }
+template<> FORCE_INLINE __m512i simd_cast<__m512i>(__m512i A) { return A; }
+
+// Generates simd_f32()/simd_i32() accessors that expose the individual lanes of a SIMD
+// register as a plain scalar array. A union is used for the reinterpretation to avoid
+// strict-aliasing violations from a direct pointer cast.
+#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
+	FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
+		union accessor { simd_type m_native; base_type m_array[elements]; }; \
+		is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
+		return acs->m_array; \
+	}
+
+MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
+MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
+MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
+
+MAKE_ACCESSOR(simd_f32, __m256, float, , 8)
+MAKE_ACCESSOR(simd_f32, __m256, float, const, 8)
+MAKE_ACCESSOR(simd_i32, __m256i, int, , 8)
+MAKE_ACCESSOR(simd_i32, __m256i, int, const, 8)
+
+MAKE_ACCESSOR(simd_f32, __m512, float, , 16)
+MAKE_ACCESSOR(simd_f32, __m512, float, const, 16)
+MAKE_ACCESSOR(simd_i32, __m512i, int, , 16)
+MAKE_ACCESSOR(simd_i32, __m512i, int, const, 16)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialized AVX input assembly function for general vertex gather 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
+
+// Gathers the three vertices of up to 16 triangles (one triangle per AVX-512 lane) from
+// an indexed triangle list into SoA registers. Only (x, y, w) are fetched; the input
+// memory layout (stride in bytes, y/w byte offsets) is described by vtxLayout.
+// Note: AVX-512 gather intrinsics take (index, base) — the reverse argument order of the
+// AVX2 gathers in MaskedOcclusionCullingAVX2.cpp.
+FORCE_INLINE void GatherVertices(__m512 *vtxX, __m512 *vtxY, __m512 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
+{
+	assert(numLanes >= 1);
+
+	// Lane i reads the index triple starting at inTrisPtr[3*i].
+	const __m512i SIMD_TRI_IDX_OFFSET = _mm512_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45);
+	// SIMD_LANE_MASK[n] has the low n lanes set to all-ones, the rest zero.
+	static const __m512i SIMD_LANE_MASK[17] = {
+		_mm512_setr_epi32( 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,  0),
+		_mm512_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0)
+	};
+
+	// Compute per-lane index list offset that guards against out of bounds memory accesses:
+	// lanes beyond numLanes get offset 0 and harmlessly re-read the first triangle.
+	__m512i safeTriIdxOffset = _mm512_and_si512(SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes]);
+
+	// Fetch triangle indices and scale them by the vertex stride to get byte offsets
+	// (the gathers below use scale 1, so vtxIdx holds byte offsets — implies mStride is in bytes).
+	__m512i vtxIdx[3];
+	vtxIdx[0] = _mmw_mullo_epi32(_mm512_i32gather_epi32(safeTriIdxOffset, (const int*)inTrisPtr + 0, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[1] = _mmw_mullo_epi32(_mm512_i32gather_epi32(safeTriIdxOffset, (const int*)inTrisPtr + 1, 4), _mmw_set1_epi32(vtxLayout.mStride));
+	vtxIdx[2] = _mmw_mullo_epi32(_mm512_i32gather_epi32(safeTriIdxOffset, (const int*)inTrisPtr + 2, 4), _mmw_set1_epi32(vtxLayout.mStride));
+
+	char *vPtr = (char *)inVtx;
+
+	// Fetch triangle vertices: x at the base offset, y and w at their layout byte offsets.
+	for (int i = 0; i < 3; i++)
+	{
+		vtxX[i] = _mm512_i32gather_ps(vtxIdx[i], (float *)vPtr, 1);
+		vtxY[i] = _mm512_i32gather_ps(vtxIdx[i], (float *)(vPtr + vtxLayout.mOffsetY), 1);
+		vtxW[i] = _mm512_i32gather_ps(vtxIdx[i], (float *)(vPtr + vtxLayout.mOffsetW), 1);
+	}
+}
+
+namespace MaskedOcclusionCullingAVX512
+{
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Poorly implemented functions. TODO: fix common (maskedOcclusionCullingCommon.inl) code to improve perf
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	FORCE_INLINE __m512 _mmw_floor_ps(__m512 x)
+	{
+		return _mm512_roundscale_ps(x, 1); // 1 = floor
+	}
+
+	FORCE_INLINE __m512 _mmw_ceil_ps(__m512 x)
+	{
+		return _mm512_roundscale_ps(x, 2); // 2 = ceil
+	}
+
+	FORCE_INLINE __m512i _mmw_cmpeq_epi32(__m512i a, __m512i b)
+	{
+		__mmask16 mask = _mm512_cmpeq_epi32_mask(a, b);
+		return _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0));
+	}
+
+	FORCE_INLINE __m512i _mmw_cmpgt_epi32(__m512i a, __m512i b)
+	{
+		__mmask16 mask = _mm512_cmpgt_epi32_mask(a, b);
+		return _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0));
+	}
+
+	FORCE_INLINE bool _mmw_testz_epi32(__m512i a, __m512i b)
+	{
+		__mmask16 mask = _mm512_cmpeq_epi32_mask(_mm512_and_si512(a, b), _mm512_set1_epi32(0));
+		return mask == 0xFFFF;
+	}
+
+	FORCE_INLINE __m512 _mmw_cmpge_ps(__m512 a, __m512 b)
+	{
+		__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ);
+		return _mm512_castsi512_ps(_mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)));
+	}
+
+	FORCE_INLINE __m512 _mmw_cmpgt_ps(__m512 a, __m512 b)
+	{
+		__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ);
+		return _mm512_castsi512_ps(_mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)));
+	}
+
+	FORCE_INLINE __m512 _mmw_cmpeq_ps(__m512 a, __m512 b)
+	{
+		__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+		return _mm512_castsi512_ps(_mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)));
+	}
+
+	FORCE_INLINE __mmask16 _mmw_movemask_ps(const __m512 &a)
+	{
+		__mmask16 mask = _mm512_cmp_epi32_mask(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x80000000)), _mm512_set1_epi32(0), 4);	// a & 0x80000000 != 0
+		return mask;
+	}
+
+	FORCE_INLINE __m512 _mmw_blendv_ps(const __m512 &a, const __m512 &b, const __m512 &c)
+	{
+		__mmask16 mask = _mmw_movemask_ps(c);
+		return _mm512_mask_mov_ps(a, mask, b);
+	} 
+
+	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX512;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Include common algorithm implementation (general, SIMD independent code)
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	#include "MaskedOcclusionCullingCommon.inl"
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Utility function to create a new object using the allocator callbacks
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
+		new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
+		return object;
+	}
+};
+
+#else
+
+namespace MaskedOcclusionCullingAVX512
+{
+	typedef MaskedOcclusionCulling::pfnAlignedAlloc            pfnAlignedAlloc;
+	typedef MaskedOcclusionCulling::pfnAlignedFree             pfnAlignedFree;
+
+	MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
+	{
+		return nullptr;
+	}
+};
+
+#endif

+ 2053 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/MaskedOcclusionCullingCommon.inl

@@ -0,0 +1,2053 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright 2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations
+// under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common SIMD math utility functions
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T> FORCE_INLINE T max(const T &a, const T &b) { return a > b ? a : b; }
+template<typename T> FORCE_INLINE T min(const T &a, const T &b) { return a < b ? a : b; }
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Common defines and constants
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_ALL_LANES_MASK    ((1 << SIMD_LANES) - 1)
+
+// Tile dimensions are 32xN pixels. These values are not tweakable and the code must also be modified
+// to support different tile sizes as it is tightly coupled with the SSE/AVX register size
+#define TILE_WIDTH_SHIFT       5
+#define TILE_WIDTH             (1 << TILE_WIDTH_SHIFT)
+#define TILE_HEIGHT            (1 << TILE_HEIGHT_SHIFT)
+
+// Sub-tiles (used for updating the masked HiZ buffer) are 8x4 tiles, so there are 4x2 sub-tiles in a tile
+#define SUB_TILE_WIDTH          8
+#define SUB_TILE_HEIGHT         4
+
+// The number of fixed point bits used to represent vertex coordinates / edge slopes.
+#if PRECISE_COVERAGE != 0
+	#define FP_BITS             8
+	#define FP_HALF_PIXEL       (1 << (FP_BITS - 1))
+	#define FP_INV              (1.0f / (float)(1 << FP_BITS))
+#else
+	// Note that too low precision, without precise coverage, may cause overshoots / false coverage during rasterization.
+	// This is configured for 14 bits for AVX512 and 16 bits for SSE. Max tile slope delta is roughly 
+	// (screenWidth + 2*(GUARD_BAND_PIXEL_SIZE + 1)) * (2^FP_BITS * (TILE_HEIGHT + GUARD_BAND_PIXEL_SIZE + 1))  
+	// and must fit in 31 bits. With this config, max image resolution (width) is ~3272, so stay well clear of this limit. 
+	#define FP_BITS             (19 - TILE_HEIGHT_SHIFT)
+#endif
+
+// Tile dimensions in fixed point coordinates
+#define FP_TILE_HEIGHT_SHIFT    (FP_BITS + TILE_HEIGHT_SHIFT)
+#define FP_TILE_HEIGHT          (1 << FP_TILE_HEIGHT_SHIFT)
+
+// Maximum number of triangles that may be generated during clipping. We process SIMD_LANES triangles at a time and
+// clip against 5 planes, so the max should be 5*8 = 40 (we immediately draw the first clipped triangle).
+// This number must be a power of two.
+#define MAX_CLIPPED             (8*SIMD_LANES)
+#define MAX_CLIPPED_WRAP        (MAX_CLIPPED - 1)
+
+// Size of guard band in pixels. Clipping doesn't seem to be very expensive so we use a small guard band
+// to improve rasterization performance. It's not recommended to set the guard band to zero, as this may
+// cause leakage along the screen border due to precision/rounding.
+#define GUARD_BAND_PIXEL_SIZE   1.0f
+
+// We classify triangles as big if the bounding box is wider than this given threshold and use a tighter
+// but slightly more expensive traversal algorithm. This improves performance greatly for sliver triangles
+#define BIG_TRIANGLE            3
+
+// Only gather statistics if enabled.
+#if ENABLE_STATS != 0
+	#define STATS_ADD(var, val)     _InterlockedExchangeAdd64( &var, val )
+#else
+	#define STATS_ADD(var, val)
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD common defines (constant values)
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define SIMD_BITS_ONE       _mmw_set1_epi32(~0)
+#define SIMD_BITS_ZERO      _mmw_setzero_epi32()
+#define SIMD_TILE_WIDTH     _mmw_set1_epi32(TILE_WIDTH)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Vertex fetch utility function, need to be in global namespace due to template specialization
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<int N> FORCE_INLINE void VtxFetch4(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes)
+{
+	// Fetch 4 vectors (matching 1 sse part of the SIMD register), and continue to the next
+	const int ssePart = (SIMD_LANES / 4) - N;
+	for (int k = 0; k < 4; k++)
+	{
+		int lane = 4 * ssePart + k;
+		if (numLanes > lane)
+			v[k] = _mmw_insertf32x4_ps(v[k], _mm_loadu_ps(&inVtx[inTrisPtr[lane * 3 + triVtx] << 2]), ssePart);
+	}
+	VtxFetch4<N - 1>(v, inTrisPtr, triVtx, inVtx, numLanes);
+}
+
+template<> FORCE_INLINE void VtxFetch4<0>(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes) 
+{
+	// Workaround for unused parameter warning
+	(void)v; (void)inTrisPtr; (void)triVtx; (void)inVtx; (void)numLanes;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Private class containing the implementation
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class MaskedOcclusionCullingPrivate : public MaskedOcclusionCulling
+{
+public:
+	struct ZTile
+	{
+		__mw        mZMin[2];
+		__mwi       mMask;
+	};
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Member variables
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	__mw            mHalfWidth;
+	__mw            mHalfHeight;
+	__mw            mCenterX;
+	__mw            mCenterY;
+	__m128          mCSFrustumPlanes[5];
+	__m128          mIHalfSize;
+	__m128          mICenter;
+	__m128i         mIScreenSize;
+
+	float           mNearDist;
+	int             mWidth;
+	int             mHeight;
+	int             mTilesWidth;
+	int             mTilesHeight;
+
+	ZTile           *mMaskedHiZBuffer;
+	ScissorRect     mFullscreenScissor;
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Constructors and state handling
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCullingPrivate(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree) : mFullscreenScissor(0, 0, 0, 0)
+	{
+		mMaskedHiZBuffer = nullptr;
+		mAlignedAllocCallback = alignedAlloc;
+		mAlignedFreeCallback = alignedFree;
+#if MOC_RECORDER_ENABLE
+        mRecorder = nullptr;
+#endif
+
+		SetNearClipPlane(0.0f);
+		mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[1] = _mm_setr_ps(1.0f, 0.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f, 0.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[3] = _mm_setr_ps(0.0f, 1.0f, 1.0f, 0.0f);
+		mCSFrustumPlanes[4] = _mm_setr_ps(0.0f, -1.0f, 1.0f, 0.0f);
+
+		memset(&mStats, 0, sizeof(OcclusionCullingStatistics));
+
+		SetResolution(0, 0);
+	}
+
+	~MaskedOcclusionCullingPrivate() override
+	{
+		if (mMaskedHiZBuffer != nullptr)
+			mAlignedFreeCallback(mMaskedHiZBuffer);
+		mMaskedHiZBuffer = nullptr;
+
+#if MOC_RECORDER_ENABLE
+        assert( mRecorder == nullptr ); // forgot to call StopRecording()?
+#endif
+	}
+
+	void SetResolution(unsigned int width, unsigned int height) override
+	{
+		// Resolution must be a multiple of the subtile size
+		assert(width % SUB_TILE_WIDTH == 0 && height % SUB_TILE_HEIGHT == 0);
+#if PRECISE_COVERAGE == 0
+		// Test if combination of resolution & FP_BITS bits may cause 32-bit overflow. Note that the maximum resolution estimate
+		// is only an estimate (not conservative). It's advisable to stay well below the limit.
+		assert(width < ((1U << 31) - 1U) / ((1U << FP_BITS) * (TILE_HEIGHT + (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f))) - (2U * (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f)));
+#endif
+
+		// Delete current masked hierarchical Z buffer
+		if (mMaskedHiZBuffer != nullptr)
+			mAlignedFreeCallback(mMaskedHiZBuffer);
+		mMaskedHiZBuffer = nullptr;
+
+		// Setup various resolution dependent constant values
+		mWidth = (int)width;
+		mHeight = (int)height;
+		mTilesWidth = (int)(width + TILE_WIDTH - 1) >> TILE_WIDTH_SHIFT;
+		mTilesHeight = (int)(height + TILE_HEIGHT - 1) >> TILE_HEIGHT_SHIFT;
+		mCenterX = _mmw_set1_ps((float)mWidth  * 0.5f);
+		mCenterY = _mmw_set1_ps((float)mHeight * 0.5f);
+		mICenter = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f);
+		mHalfWidth = _mmw_set1_ps((float)mWidth  * 0.5f);
+#if USE_D3D != 0
+		mHalfHeight = _mmw_set1_ps((float)-mHeight * 0.5f);
+		mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)-mHeight * 0.5f, (float)-mHeight * 0.5f);
+#else
+		mHalfHeight = _mmw_set1_ps((float)mHeight * 0.5f);
+		mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f);
+#endif
+		mIScreenSize = _mm_setr_epi32(mWidth - 1, mWidth - 1, mHeight - 1, mHeight - 1);
+
+		// Setup a full screen scissor rectangle
+		mFullscreenScissor.mMinX = 0;
+		mFullscreenScissor.mMinY = 0;
+		mFullscreenScissor.mMaxX = mTilesWidth << TILE_WIDTH_SHIFT;
+		mFullscreenScissor.mMaxY = mTilesHeight << TILE_HEIGHT_SHIFT;
+
+		// Adjust clip planes to include a small guard band to avoid clipping leaks
+        if (mWidth > 0.0f && mHeight > 0.0f)
+        {
+            float guardBandWidth = (2.0f / (float)mWidth) * GUARD_BAND_PIXEL_SIZE;
+            float guardBandHeight = (2.0f / (float)mHeight) * GUARD_BAND_PIXEL_SIZE;
+            mCSFrustumPlanes[1] = _mm_setr_ps(1.0f - guardBandWidth, 0.0f, 1.0f, 0.0f);
+            mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f + guardBandWidth, 0.0f, 1.0f, 0.0f);
+            mCSFrustumPlanes[3] = _mm_setr_ps(0.0f, 1.0f - guardBandHeight, 1.0f, 0.0f);
+            mCSFrustumPlanes[4] = _mm_setr_ps(0.0f, -1.0f + guardBandHeight, 1.0f, 0.0f);
+        }
+
+		// Allocate masked hierarchical Z buffer (if zero size leave at nullptr)
+		if(mTilesWidth * mTilesHeight > 0)
+			mMaskedHiZBuffer = (ZTile *)mAlignedAllocCallback(64, sizeof(ZTile) * mTilesWidth * mTilesHeight);
+	}
+
+	void GetResolution(unsigned int &width, unsigned int &height) const override
+	{
+		width = mWidth;
+		height = mHeight;
+	}
+
+	void ComputeBinWidthHeight(unsigned int nBinsW, unsigned int nBinsH, unsigned int & outBinWidth, unsigned int & outBinHeight) override
+	{
+		outBinWidth = (mWidth / nBinsW) - ((mWidth / nBinsW) % TILE_WIDTH);
+		outBinHeight = (mHeight / nBinsH) - ((mHeight / nBinsH) % TILE_HEIGHT);
+	}
+
+    void SetNearClipPlane(float nearDist) override
+	{
+		// Setup the near frustum plane
+		mNearDist = nearDist;
+		mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, -nearDist);
+	}
+
+	float GetNearClipPlane() const override
+	{
+		return mNearDist;
+	}
+
+	void ClearBuffer() override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		// Iterate through all depth tiles and clear to default values
+		for (int i = 0; i < mTilesWidth * mTilesHeight; i++)
+		{
+			mMaskedHiZBuffer[i].mMask = _mmw_setzero_epi32();
+
+			// Clear z0 to beyond infinity to ensure we never merge with clear data
+			mMaskedHiZBuffer[i].mZMin[0] = _mmw_set1_ps(-1.0f);
+#if QUICK_MASK != 0
+			// Clear z1 to nearest depth value as it is pushed back on each update
+			mMaskedHiZBuffer[i].mZMin[1] = _mmw_set1_ps(FLT_MAX);
+#else
+			mMaskedHiZBuffer[i].mZMin[1] = _mmw_setzero_ps();
+#endif
+		}
+
+#if ENABLE_STATS != 0
+		memset(&mStats, 0, sizeof(OcclusionCullingStatistics));
+#endif
+
+#if MOC_RECORDER_ENABLE != 0
+        {
+            std::lock_guard<std::mutex> lock( mRecorderMutex );
+            if( mRecorder != nullptr ) mRecorder->RecordClearBuffer();
+        }
+#endif
+	}
+
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// MergeBuffer
+	// Utility Function merges another MOC buffer into the existing one
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	void MergeBuffer(MaskedOcclusionCulling* BufferB) override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		//// Iterate through all depth tiles and merge the 2 tiles
+		for (int i = 0; i < mTilesWidth * mTilesHeight; i++)
+		{
+			__mw *zMinB = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mZMin;
+			__mw *zMinA = mMaskedHiZBuffer[i].mZMin;
+			__mwi RastMaskB = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask;
+
+#if QUICK_MASK != 0
+			// z0 is cleared to beyond infinity (negative), so the sign bit of zMinB[0] indicates whether the source tile is still in a clear state
+			__mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31);
+			// Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state
+			sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO);
+			if (!_mmw_testz_epi32(sign0, sign0))
+			{
+				STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
+				zMinA[0] = _mmw_max_ps(zMinA[0], zMinB[0]);
+
+				__mwi rastMask = mMaskedHiZBuffer[i].mMask;
+				__mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
+				// Mask out all subtiles failing the depth test (don't update these subtiles)
+				deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zMinA[1], zMinA[0])), 31));
+				mMaskedHiZBuffer[i].mMask = _mmw_andnot_epi32(deadLane, rastMask);
+			}
+
+			// Set 32bit value to -1 if any pixels are set inside the coverage mask for a subtile
+			__mwi LiveTile = _mmw_cmpeq_epi32(RastMaskB, SIMD_BITS_ZERO);
+			// invert to have bits set for clear subtiles
+			__mwi t0inv = _mmw_not_epi32(LiveTile);
+			// VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear)
+			if (!_mmw_testz_epi32(t0inv, t0inv))
+			{
+				STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
+				UpdateTileQuick(i, RastMaskB, zMinB[1]);
+			}
+#else 
+			// z0 is cleared to beyond infinity (negative), so the sign bit of this buffer's zMin[0] indicates whether the tile is still in a clear state
+			__mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(mMaskedHiZBuffer[i].mZMin[0]), 31);
+			// sign1 becomes all ones for tiles that contain data (non-negative zMin[0])
+			sign1 = _mmw_cmpeq_epi32(sign1, SIMD_BITS_ZERO);
+
+			// Set 32bit value to -1 if any pixels are set inside the coverage mask for a subtile
+			__mwi LiveTile1 = _mmw_cmpeq_epi32(mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO);
+			// invert to have bits set for clear subtiles
+			__mwi t1inv = _mmw_not_epi32(LiveTile1);
+			// VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear)
+			if (_mmw_testz_epi32(sign1, sign1) && _mmw_testz_epi32(t1inv, t1inv))
+			{
+				mMaskedHiZBuffer[i].mMask = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask;
+				mMaskedHiZBuffer[i].mZMin[0] = zMinB[0];
+				mMaskedHiZBuffer[i].mZMin[1] = zMinB[1];
+			}
+			else
+			{
+				// z0 is cleared to beyond infinity (negative), so the sign bit of zMinB[0] indicates whether the source tile is still in a clear state
+				__mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31);
+				sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO);
+				// Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state
+				if (!_mmw_testz_epi32(sign0, sign0))
+				{
+					// build a mask for Zmin[0], full if the layer has been completed, or partial if tile is still partly filled.
+					// cant just use the completement of the mask, as tiles might not get updated by merge 
+					__mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[1]), 31);
+					__mwi LayerMask0 = _mmw_not_epi32(sign1);
+					__mwi LayerMask1 = _mmw_not_epi32(((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask);
+					__mwi rastMask = _mmw_or_epi32(LayerMask0, LayerMask1);
+
+					UpdateTileAccurate(i, rastMask, zMinB[0]);
+				}
+
+				// Set 32bit value to -1 if any pixels are set inside the coverage mask for a subtile
+				__mwi LiveTile = _mmw_cmpeq_epi32(((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO);
+				// invert to have bits set for clear subtiles
+				__mwi t0inv = _mmw_not_epi32(LiveTile);
+				// VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear)
+				if (!_mmw_testz_epi32(t0inv, t0inv))
+				{
+					UpdateTileAccurate(i, ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask, zMinB[1]);
+				}
+
+				//if (_mmw_testz_epi32(sign0, sign0) && _mmw_testz_epi32(t0inv, t0inv))
+				//	STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
+
+			}
+
+#endif
+		}
+	}
+
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Polygon clipping functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	FORCE_INLINE int ClipPolygon(__m128 *outVtx, __m128 *inVtx, const __m128 &plane, int n) const
+	{
+		__m128 p0 = inVtx[n - 1];
+		__m128 dist0 = _mmx_dp4_ps(p0, plane);
+
+		// Loop over all polygon edges and compute intersection with clip plane (if any)
+		int nout = 0;
+		for (int k = 0; k < n; k++)
+		{
+			__m128 p1 = inVtx[k];
+			__m128 dist1 = _mmx_dp4_ps(p1, plane);
+			int dist0Neg = _mm_movemask_ps(dist0);
+			if (!dist0Neg)	// dist0 > 0.0f
+				outVtx[nout++] = p0;
+
+			// Edge intersects the clip plane if dist0 and dist1 have opposing signs
+			if (_mm_movemask_ps(_mm_xor_ps(dist0, dist1)))
+			{
+				// Always clip from the positive side to avoid T-junctions
+				if (!dist0Neg)
+				{
+					__m128 t = _mm_div_ps(dist0, _mm_sub_ps(dist0, dist1));
+					outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p1, p0), t, p0);
+				}
+				else
+				{
+					__m128 t = _mm_div_ps(dist1, _mm_sub_ps(dist1, dist0));
+					outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p0, p1), t, p1);
+				}
+			}
+
+			dist0 = dist1;
+			p0 = p1;
+		}
+		return nout;
+	}
+
+	template<ClipPlanes CLIP_PLANE> void TestClipPlane(__mw *vtxX, __mw *vtxY, __mw *vtxW, unsigned int &straddleMask, unsigned int &triMask, ClipPlanes clipPlaneMask)
+	{
+		straddleMask = 0;
+		// Skip masked clip planes
+		if (!(clipPlaneMask & CLIP_PLANE))
+			return;
+
+		// Evaluate all 3 vertices against the frustum plane
+		__mw planeDp[3];
+		for (int i = 0; i < 3; ++i)
+		{
+			switch (CLIP_PLANE)
+			{
+			case ClipPlanes::CLIP_PLANE_LEFT:   planeDp[i] = _mmw_add_ps(vtxW[i], vtxX[i]); break;
+			case ClipPlanes::CLIP_PLANE_RIGHT:  planeDp[i] = _mmw_sub_ps(vtxW[i], vtxX[i]); break;
+			case ClipPlanes::CLIP_PLANE_BOTTOM: planeDp[i] = _mmw_add_ps(vtxW[i], vtxY[i]); break;
+			case ClipPlanes::CLIP_PLANE_TOP:    planeDp[i] = _mmw_sub_ps(vtxW[i], vtxY[i]); break;
+			case ClipPlanes::CLIP_PLANE_NEAR:   planeDp[i] = _mmw_sub_ps(vtxW[i], _mmw_set1_ps(mNearDist)); break;
+			}
+		}
+
+		// Look at FP sign and determine if tri is inside, outside or straddles the frustum plane
+		__mw inside = _mmw_andnot_ps(planeDp[0], _mmw_andnot_ps(planeDp[1], _mmw_not_ps(planeDp[2])));
+		__mw outside = _mmw_and_ps(planeDp[0], _mmw_and_ps(planeDp[1], planeDp[2]));
+		unsigned int inMask = (unsigned int)_mmw_movemask_ps(inside);
+		unsigned int outMask = (unsigned int)_mmw_movemask_ps(outside);
+		straddleMask = (~outMask) & (~inMask);
+		triMask &= ~outMask;
+	}
+
+	FORCE_INLINE void ClipTriangleAndAddToBuffer(__mw *vtxX, __mw *vtxY, __mw *vtxW, __m128 *clippedTrisBuffer, int &clipWriteIdx, unsigned int &triMask, unsigned int triClipMask, ClipPlanes clipPlaneMask)
+	{
+		if (!triClipMask)
+			return;
+
+		// Inside test all 3 triangle vertices against all active frustum planes
+		unsigned int straddleMask[5];
+		TestClipPlane<ClipPlanes::CLIP_PLANE_NEAR>(vtxX, vtxY, vtxW, straddleMask[0], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_LEFT>(vtxX, vtxY, vtxW, straddleMask[1], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_RIGHT>(vtxX, vtxY, vtxW, straddleMask[2], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_BOTTOM>(vtxX, vtxY, vtxW, straddleMask[3], triMask, clipPlaneMask);
+		TestClipPlane<ClipPlanes::CLIP_PLANE_TOP>(vtxX, vtxY, vtxW, straddleMask[4], triMask, clipPlaneMask);
+
+        // Clip triangle against straddling planes and add to the clipped triangle buffer
+		__m128 vtxBuf[2][8];
+
+#if CLIPPING_PRESERVES_ORDER != 0
+		unsigned int clipMask = triClipMask & triMask;
+		unsigned int clipAndStraddleMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & clipMask;
+        // no clipping needed after all - early out
+        if (clipAndStraddleMask == 0)
+			return;
+		while( clipMask )
+		{
+			// Find and setup next triangle to clip
+			unsigned int triIdx = find_clear_lsb(&clipMask);
+			unsigned int triBit = (1U << triIdx);
+			assert(triIdx < SIMD_LANES);
+
+			int bufIdx = 0;
+			int nClippedVerts = 3;
+			for (int i = 0; i < 3; i++)
+				vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f);
+
+			// Clip triangle with straddling planes. 
+			for (int i = 0; i < 5; ++i)
+			{
+				if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i))) // <- second part maybe not needed?
+				{
+					nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts);
+					bufIdx ^= 1;
+				}
+			}
+
+			if (nClippedVerts >= 3)
+			{
+                // Write all triangles into the clip buffer and process them next loop iteration
+				clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
+				clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][1];
+				clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][2];
+				clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
+				for (int i = 2; i < nClippedVerts - 1; i++)
+				{
+					clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
+					clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i];
+					clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1];
+					clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
+				}
+			}
+		}
+        // since all triangles were copied to clip buffer for next iteration, skip further processing
+		triMask = 0;
+#else
+		unsigned int clipMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & (triClipMask & triMask);
+		while (clipMask)
+		{
+			// Find and setup next triangle to clip
+			unsigned int triIdx = find_clear_lsb(&clipMask);
+			unsigned int triBit = (1U << triIdx);
+			assert(triIdx < SIMD_LANES);
+
+			int bufIdx = 0;
+			int nClippedVerts = 3;
+			for (int i = 0; i < 3; i++)
+				vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f);
+
+			// Clip triangle with straddling planes. 
+			for (int i = 0; i < 5; ++i)
+			{
+				if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i)))
+				{
+					nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts);
+					bufIdx ^= 1;
+				}
+			}
+
+			if (nClippedVerts >= 3)
+			{
+				// Write the first triangle back into the list of currently processed triangles
+				for (int i = 0; i < 3; i++)
+				{
+					simd_f32(vtxX[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[0];
+					simd_f32(vtxY[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[1];
+					simd_f32(vtxW[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[2];
+				}
+				// Write the remaining triangles into the clip buffer and process them next loop iteration
+				for (int i = 2; i < nClippedVerts - 1; i++)
+				{
+					clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
+					clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i];
+					clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1];
+					clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
+				}
+			}
+			else // Kill triangles that were removed by clipping
+				triMask &= ~triBit;
+		}
+#endif
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Vertex transform & projection
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	FORCE_INLINE void TransformVerts(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *modelToClipMatrix)
+	{
+		if (modelToClipMatrix != nullptr)
+		{
+			for (int i = 0; i < 3; ++i)
+			{
+				__mw tmpX, tmpY, tmpW;
+				tmpX = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[0]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[4]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[8]), _mmw_set1_ps(modelToClipMatrix[12]))));
+				tmpY = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[1]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[5]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[9]), _mmw_set1_ps(modelToClipMatrix[13]))));
+				tmpW = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[3]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[7]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[11]), _mmw_set1_ps(modelToClipMatrix[15]))));
+				vtxX[i] = tmpX;	vtxY[i] = tmpY;	vtxW[i] = tmpW;
+			}
+		}
+	}
+
+#if PRECISE_COVERAGE != 0
+	// Projects clip-space vertices to screen space and snaps them to a fixed-point grid
+	// with FP_BITS fractional bits (precise-coverage rasterizer path). Per vertex it
+	// produces: ipVtxX/ipVtxY = snapped fixed-point integer coordinates, pVtxX/pVtxY =
+	// the same snapped coordinates converted back to float (so triangle setup and
+	// coverage computation agree exactly on vertex positions), and pVtxZ = 1/w, the
+	// depth value used throughout the masked hierarchical z-buffer. For D3D the output
+	// vertex order is reversed, which flips the triangle winding so both APIs end up
+	// with the rasterizer's expected front-face convention.
+	FORCE_INLINE void ProjectVertices(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW)
+	{
+#if USE_D3D != 0
+		static const int vertexOrder[] = {2, 1, 0};
+#else
+		static const int vertexOrder[] = {0, 1, 2};
+#endif
+
+		// Project vertices and transform to screen space. Snap to sub-pixel coordinates with FP_BITS precision.
+		for (int i = 0; i < 3; i++)
+		{
+			// Read in input order (i), write in API-dependent order (idx).
+			int idx = vertexOrder[i];
+			__mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]);
+			__mw screenX = _mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX);
+			__mw screenY = _mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY);
+			ipVtxX[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenX, _mmw_set1_ps(float(1 << FP_BITS))));
+			ipVtxY[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenY, _mmw_set1_ps(float(1 << FP_BITS))));
+			// Round-trip through the fixed-point representation so the float coords match the snapped ones exactly.
+			pVtxX[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[idx]), _mmw_set1_ps(FP_INV));
+			pVtxY[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[idx]), _mmw_set1_ps(FP_INV));
+			pVtxZ[idx] = rcpW;
+		}
+	}
+#else
+	// Projects clip-space vertices to screen space at whole-pixel precision (the
+	// non-PRECISE_COVERAGE path). Per vertex it produces pVtxX/pVtxY = rounded screen
+	// coordinates and pVtxZ = 1/w, the depth value used by the masked hierarchical
+	// z-buffer. For D3D the output vertex order is reversed, which flips the triangle
+	// winding to match the rasterizer's expected front-face convention.
+	FORCE_INLINE void ProjectVertices(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW)
+	{
+#if USE_D3D != 0
+		static const int vertexOrder[] = {2, 1, 0};
+#else
+		static const int vertexOrder[] = {0, 1, 2};
+#endif
+		// Project vertices and transform to screen space. Round to nearest integer pixel coordinate
+		for (int i = 0; i < 3; i++)
+		{
+			// Read in input order (i), write in API-dependent order (idx).
+			int idx = vertexOrder[i];
+			__mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]);
+
+			// The rounding modes are set to match HW rasterization with OpenGL. In practice our samples are placed
+			// in the (1,0) corner of each pixel, while HW rasterizer uses (0.5, 0.5). We get (1,0) because of the 
+			// floor used when interpolating along triangle edges. The rounding modes match an offset of (0.5, -0.5)
+			pVtxX[idx] = _mmw_ceil_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX));
+			pVtxY[idx] = _mmw_floor_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY));
+			pVtxZ[idx] = rcpW;
+		}
+	}
+#endif
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Common SSE/AVX input assembly functions, note that there are specialized gathers for the general case in the SSE/AVX specific files
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	// Gathers and transposes triangle vertex data from an indexed (x, y, z, w)-packed
+	// vertex buffer into SoA SIMD registers: vtxX/vtxY/vtxW receive the 1st, 2nd and
+	// 4th float of each referenced vertex respectively (one register per triangle
+	// corner i = 0..2). The 3rd float (z) is never read -- NOTE(review): this fast
+	// path appears to assume depth is later derived from w; confirm against callers.
+	FORCE_INLINE void GatherVerticesFast(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes)
+	{
+		// This function assumes that the vertex layout is four packed x, y, z, w-values.
+		// Since the layout is known we can get some additional performance by using a 
+		// more optimized gather strategy.
+		assert(numLanes >= 1);
+
+		// Gather vertices 
+		__mw v[4], swz[4];
+		for (int i = 0; i < 3; i++)
+		{
+			// Load 4 (x,y,z,w) vectors per SSE part of the SIMD register (so 4 vectors for SSE, 8 vectors for AVX)
+			// this fetch uses templates to unroll the loop
+			VtxFetch4<SIMD_LANES / 4>(v, inTrisPtr, i, inVtx, numLanes);
+
+			// Transpose each individual SSE part of the SSE/AVX register (similar to _MM_TRANSPOSE4_PS)
+			// 0x44 interleaves the low halves {x, y} of a vertex pair, 0xEE the high halves {z, w}.
+			swz[0] = _mmw_shuffle_ps(v[0], v[1], 0x44);
+			swz[2] = _mmw_shuffle_ps(v[0], v[1], 0xEE);
+			swz[1] = _mmw_shuffle_ps(v[2], v[3], 0x44);
+			swz[3] = _mmw_shuffle_ps(v[2], v[3], 0xEE);
+
+			// 0x88 picks the even elements (x), 0xDD the odd elements (y resp. w).
+			vtxX[i] = _mmw_shuffle_ps(swz[0], swz[1], 0x88);
+			vtxY[i] = _mmw_shuffle_ps(swz[0], swz[1], 0xDD);
+			vtxW[i] = _mmw_shuffle_ps(swz[2], swz[3], 0xDD);
+		}
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Rasterization functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	// Computes a per-lane (per-triangle) screen-space bounding box from the projected
+	// vertices, expands it to whole-tile granularity, and clips it to the scissor
+	// rectangle. The max side is grown by one full tile before masking so it stays a
+	// conservative (exclusive) upper bound after the align-down. The scissor bounds
+	// are applied as-is and are presumably already tile-aligned -- TODO confirm at the
+	// call sites.
+	FORCE_INLINE void ComputeBoundingBox(__mwi &bbminX, __mwi &bbminY, __mwi &bbmaxX, __mwi &bbmaxY, const __mw *vX, const __mw *vY, const ScissorRect *scissor)
+	{
+		// Masks that align a coordinate down to the containing tile boundary (TILE_WIDTH/HEIGHT are powers of two).
+		static const __mwi SIMD_PAD_W_MASK = _mmw_set1_epi32(~(TILE_WIDTH - 1));
+		static const __mwi SIMD_PAD_H_MASK = _mmw_set1_epi32(~(TILE_HEIGHT - 1));
+
+		// Find Min/Max vertices (truncating float -> int conversion)
+		bbminX = _mmw_cvttps_epi32(_mmw_min_ps(vX[0], _mmw_min_ps(vX[1], vX[2])));
+		bbminY = _mmw_cvttps_epi32(_mmw_min_ps(vY[0], _mmw_min_ps(vY[1], vY[2])));
+		bbmaxX = _mmw_cvttps_epi32(_mmw_max_ps(vX[0], _mmw_max_ps(vX[1], vX[2])));
+		bbmaxY = _mmw_cvttps_epi32(_mmw_max_ps(vY[0], _mmw_max_ps(vY[1], vY[2])));
+
+		// Clamp to tile boundaries
+		bbminX = _mmw_and_epi32(bbminX, SIMD_PAD_W_MASK);
+		bbmaxX = _mmw_and_epi32(_mmw_add_epi32(bbmaxX, _mmw_set1_epi32(TILE_WIDTH)), SIMD_PAD_W_MASK);
+		bbminY = _mmw_and_epi32(bbminY, SIMD_PAD_H_MASK);
+		bbmaxY = _mmw_and_epi32(_mmw_add_epi32(bbmaxY, _mmw_set1_epi32(TILE_HEIGHT)), SIMD_PAD_H_MASK);
+
+		// Clip to scissor
+		bbminX = _mmw_max_epi32(bbminX, _mmw_set1_epi32(scissor->mMinX));
+		bbmaxX = _mmw_min_epi32(bbmaxX, _mmw_set1_epi32(scissor->mMaxX));
+		bbminY = _mmw_max_epi32(bbminY, _mmw_set1_epi32(scissor->mMinY));
+		bbmaxY = _mmw_min_epi32(bbmaxY, _mmw_set1_epi32(scissor->mMaxY));
+	}
+
+#if PRECISE_COVERAGE != 0
+	// Fixed-point variant (PRECISE_COVERAGE). Per SIMD lane, rotates the triangle
+	// (v0,v1,v2) -> (v1,v2,v0) wherever the mask's sign bit is set; after two passes
+	// v0 is the vertex with the lowest y while the winding order is preserved. The
+	// rotate condition is sign(y1-y0) | sign(y2-y0) | (y2 == y0), i.e. rotate while
+	// another vertex lies below v0, with the equality term presumably canonicalizing
+	// ties on flat edges -- NOTE(review): behavior for degenerate/flat triangles
+	// follows the upstream library; do not "simplify" the mask.
+	FORCE_INLINE void SortVertices(__mwi *vX, __mwi *vY)
+	{
+		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+		for (int i = 0; i < 2; i++)
+		{
+			__mwi ey1 = _mmw_sub_epi32(vY[1], vY[0]);
+			__mwi ey2 = _mmw_sub_epi32(vY[2], vY[0]);
+			__mwi swapMask = _mmw_or_epi32(_mmw_or_epi32(ey1, ey2), _mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO));
+			__mwi sX, sY;
+			// blendv picks the second operand where the mask sign bit is set, so this is a 3-way rotate.
+			sX = _mmw_blendv_epi32(vX[2], vX[0], swapMask);
+			vX[0] = _mmw_blendv_epi32(vX[0], vX[1], swapMask);
+			vX[1] = _mmw_blendv_epi32(vX[1], vX[2], swapMask);
+			vX[2] = sX;
+			sY = _mmw_blendv_epi32(vY[2], vY[0], swapMask);
+			vY[0] = _mmw_blendv_epi32(vY[0], vY[1], swapMask);
+			vY[1] = _mmw_blendv_epi32(vY[1], vY[2], swapMask);
+			vY[2] = sY;
+		}
+	}
+
+	// Fixed-point variant (PRECISE_COVERAGE). Makes every surviving triangle CCW for
+	// the rasterizer and returns the lane mask of front-facing triangles according to
+	// bfWinding. ccwMask has the sign bit set in lanes whose triangle is already CCW.
+	FORCE_INLINE int CullBackfaces(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding)
+	{
+		// Reverse vertex order if non cw faces are considered front facing (rasterizer code requires CCW order)
+		if (!(bfWinding & BACKFACE_CW))
+		{
+			// Swap v0 and v2 in the CW lanes only (blendv keeps the original where ccwMask is set),
+			// updating both the fixed-point and the float copies of the coordinates.
+			__mw tmpX, tmpY, tmpZ;
+			__mwi itmpX, itmpY;
+			itmpX = _mmw_blendv_epi32(ipVtxX[2], ipVtxX[0], simd_cast<__mwi>(ccwMask));
+			itmpY = _mmw_blendv_epi32(ipVtxY[2], ipVtxY[0], simd_cast<__mwi>(ccwMask));
+			tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask);
+			tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask);
+			tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask);
+			ipVtxX[2] = _mmw_blendv_epi32(ipVtxX[0], ipVtxX[2], simd_cast<__mwi>(ccwMask));
+			ipVtxY[2] = _mmw_blendv_epi32(ipVtxY[0], ipVtxY[2], simd_cast<__mwi>(ccwMask));
+			pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask);
+			pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask);
+			pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask);
+			ipVtxX[0] = itmpX;
+			ipVtxY[0] = itmpY;
+			pVtxX[0] = tmpX;
+			pVtxY[0] = tmpY;
+			pVtxZ[0] = tmpZ;
+		}
+
+		// Return a lane mask with all front faces set
+		return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask));
+	}
+#else
+	// Float variant. Per SIMD lane, rotates the triangle (v0,v1,v2) -> (v1,v2,v0)
+	// wherever the mask's sign bit is set; after two passes v0 is the vertex with the
+	// lowest y while the winding order is preserved. NOTE(review): the equality term
+	// compares the raw float bits of (y2-y0) against integer zero, which matches
+	// +0.0f but not -0.0f -- this mirrors the upstream library; do not "fix" it here.
+	FORCE_INLINE void SortVertices(__mw *vX, __mw *vY)
+	{
+		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+		for (int i = 0; i < 2; i++)
+		{
+			__mw ey1 = _mmw_sub_ps(vY[1], vY[0]);
+			__mw ey2 = _mmw_sub_ps(vY[2], vY[0]);
+			__mw swapMask = _mmw_or_ps(_mmw_or_ps(ey1, ey2), simd_cast<__mw>(_mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO)));
+			__mw sX, sY;
+			// blendv picks the second operand where the mask sign bit is set, so this is a 3-way rotate.
+			sX = _mmw_blendv_ps(vX[2], vX[0], swapMask);
+			vX[0] = _mmw_blendv_ps(vX[0], vX[1], swapMask);
+			vX[1] = _mmw_blendv_ps(vX[1], vX[2], swapMask);
+			vX[2] = sX;
+			sY = _mmw_blendv_ps(vY[2], vY[0], swapMask);
+			vY[0] = _mmw_blendv_ps(vY[0], vY[1], swapMask);
+			vY[1] = _mmw_blendv_ps(vY[1], vY[2], swapMask);
+			vY[2] = sY;
+		}
+	}
+
+	// Float variant. Makes every surviving triangle CCW for the rasterizer and returns
+	// the lane mask of front-facing triangles according to bfWinding. ccwMask has the
+	// sign bit set in lanes whose triangle is already CCW.
+	FORCE_INLINE int CullBackfaces(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding)
+	{
+		// Reverse vertex order if non cw faces are considered front facing (rasterizer code requires CCW order)
+		if (!(bfWinding & BACKFACE_CW))
+		{
+			// Swap v0 and v2 in the CW lanes only (blendv keeps the original where ccwMask is set).
+			__mw tmpX, tmpY, tmpZ;
+			tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask);
+			tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask);
+			tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask);
+			pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask);
+			pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask);
+			pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask);
+			pVtxX[0] = tmpX;
+			pVtxY[0] = tmpY;
+			pVtxZ[0] = tmpZ;
+		}
+
+		// Return a lane mask with all front faces set
+		return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask));
+	}
+#endif
+
+	// Derives the screen-space depth-plane gradients (zPixelDx, zPixelDy) for
+	// z(x, y) = z0 + dx*x + dy*y from the three projected vertices, using edge
+	// vectors relative to v0 and d = 1 / edge determinant (twice the signed triangle
+	// area). Degenerate triangles yield a divide toward infinity here; callers are
+	// expected to have culled zero-area triangles -- NOTE(review): verify upstream.
+	FORCE_INLINE void ComputeDepthPlane(const __mw *pVtxX, const __mw *pVtxY, const __mw *pVtxZ, __mw &zPixelDx, __mw &zPixelDy) const
+	{
+		// Setup z(x,y) = z0 + dx*x + dy*y screen space depth plane equation
+		__mw x2 = _mmw_sub_ps(pVtxX[2], pVtxX[0]);
+		__mw x1 = _mmw_sub_ps(pVtxX[1], pVtxX[0]);
+		__mw y1 = _mmw_sub_ps(pVtxY[1], pVtxY[0]);
+		__mw y2 = _mmw_sub_ps(pVtxY[2], pVtxY[0]);
+		__mw z1 = _mmw_sub_ps(pVtxZ[1], pVtxZ[0]);
+		__mw z2 = _mmw_sub_ps(pVtxZ[2], pVtxZ[0]);
+		__mw d = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_fmsub_ps(x1, y2, _mmw_mul_ps(y1, x2)));
+		zPixelDx = _mmw_mul_ps(_mmw_fmsub_ps(z1, y2, _mmw_mul_ps(y1, z2)), d);
+		zPixelDy = _mmw_mul_ps(_mmw_fmsub_ps(x1, z2, _mmw_mul_ps(z1, x2)), d);
+	}
+
+	// Merges one triangle's coverage and interpolated depth (zTriv, per 8x4 subtile)
+	// into the two-layer masked hierarchical z-buffer entry for tileIdx, using the
+	// fast heuristic (QUICK_MASK path). Depth is 1/w, so a LARGER value is CLOSER to
+	// the observer; zMin[0] is the conservative reference layer, zMin[1] the working
+	// layer being accumulated, and mMask flags which pixels belong to layer 1.
+	FORCE_INLINE void UpdateTileQuick(int tileIdx, const __mwi &coverage, const __mw &zTriv)
+	{
+		// Update heuristic used in the paper "Masked Software Occlusion Culling", 
+		// good balance between performance and accuracy
+		STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1);
+		assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight);
+
+		__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
+		__mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin;
+
+		// Swizzle coverage mask to 8x4 subtiles and test if any subtiles are not covered at all
+		__mwi rastMask = coverage;
+		__mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
+
+		// Mask out all subtiles failing the depth test (don't update these subtiles)
+		// (arithmetic shift by 31 broadcasts the sign of zTriv - zMin[0]: all-ones where the triangle lies behind layer 0)
+		deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zTriv, zMin[0])), 31));
+		rastMask = _mmw_andnot_epi32(deadLane, rastMask);
+
+		// Use distance heuristic to discard layer 1 if incoming triangle is significantly nearer to observer
+		// than the buffer contents. See Section 3.2 in "Masked Software Occlusion Culling"
+		__mwi coveredLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ONE);
+		__mw diff = _mmw_fmsub_ps(zMin[1], _mmw_set1_ps(2.0f), _mmw_add_ps(zTriv, zMin[0]));
+		__mwi discardLayerMask = _mmw_andnot_epi32(deadLane, _mmw_or_epi32(_mmw_srai_epi32(simd_cast<__mwi>(diff), 31), coveredLane));
+
+		// Update the mask with incoming triangle coverage
+		mask = _mmw_or_epi32(_mmw_andnot_epi32(discardLayerMask, mask), rastMask);
+
+		__mwi maskFull = _mmw_cmpeq_epi32(mask, SIMD_BITS_ONE);
+
+		// Compute new value for zMin[1]. This has one of four outcomes: zMin[1] = min(zMin[1], zTriv),  zMin[1] = zTriv, 
+		// zMin[1] = FLT_MAX or unchanged, depending on if the layer is updated, discarded, fully covered, or not updated
+		__mw opA = _mmw_blendv_ps(zTriv, zMin[1], simd_cast<__mw>(deadLane));
+		__mw opB = _mmw_blendv_ps(zMin[1], zTriv, simd_cast<__mw>(discardLayerMask));
+		__mw z1min = _mmw_min_ps(opA, opB);
+		zMin[1] = _mmw_blendv_ps(z1min, _mmw_set1_ps(FLT_MAX), simd_cast<__mw>(maskFull));
+
+		// Propagate zMin[1] back to zMin[0] if tile was fully covered, and update the mask
+		zMin[0] = _mmw_blendv_ps(zMin[0], z1min, simd_cast<__mw>(maskFull));
+		mMaskedHiZBuffer[tileIdx].mMask = _mmw_andnot_epi32(maskFull, mask);
+	}
+
+	// Merges one triangle's coverage and interpolated depth (zTriv, per 8x4 subtile)
+	// into the two-layer masked hierarchical z-buffer entry for tileIdx, using the
+	// more exact per-pixel merging heuristic (QUICK_MASK == 0 path). Depth is 1/w,
+	// so a LARGER value is CLOSER. Compared to UpdateTileQuick, this tests each
+	// pixel against the layer it actually belongs to and merges the triangle into
+	// whichever layer minimizes the depth discontinuity.
+	FORCE_INLINE void UpdateTileAccurate(int tileIdx, const __mwi &coverage, const __mw &zTriv)
+	{
+		assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight);
+
+		__mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin;
+		__mwi &mask = mMaskedHiZBuffer[tileIdx].mMask;
+
+		// Swizzle coverage mask to 8x4 subtiles
+		__mwi rastMask = coverage;
+
+		// Perform individual depth tests with layer 0 & 1 and mask out all failing pixels 
+		// (srai by 31 broadcasts the sign bit: all-ones where zTriv is in front of the layer's depth)
+		__mw sdist0 = _mmw_sub_ps(zMin[0], zTriv);
+		__mw sdist1 = _mmw_sub_ps(zMin[1], zTriv);
+		__mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(sdist0), 31);
+		__mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(sdist1), 31);
+		__mwi triMask = _mmw_and_epi32(rastMask, _mmw_or_epi32(_mmw_andnot_epi32(mask, sign0), _mmw_and_epi32(mask, sign1)));
+
+		// Early out if no pixels survived the depth test (this test is more accurate than
+		// the early culling test in TraverseScanline())
+		__mwi t0 = _mmw_cmpeq_epi32(triMask, SIMD_BITS_ZERO);
+		__mwi t0inv = _mmw_not_epi32(t0);
+		if (_mmw_testz_epi32(t0inv, t0inv))
+			return;
+
+		STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1);
+
+		// In fully-dead subtiles, substitute layer 0's depth so the heuristic below stays well-defined.
+		__mw zTri = _mmw_blendv_ps(zTriv, zMin[0], simd_cast<__mw>(t0));
+
+		// Test if incoming triangle completely overwrites layer 0 or 1
+		__mwi layerMask0 = _mmw_andnot_epi32(triMask, _mmw_not_epi32(mask));
+		__mwi layerMask1 = _mmw_andnot_epi32(triMask, mask);
+		__mwi lm0 = _mmw_cmpeq_epi32(layerMask0, SIMD_BITS_ZERO);
+		__mwi lm1 = _mmw_cmpeq_epi32(layerMask1, SIMD_BITS_ZERO);
+		__mw z0 = _mmw_blendv_ps(zMin[0], zTri, simd_cast<__mw>(lm0));
+		__mw z1 = _mmw_blendv_ps(zMin[1], zTri, simd_cast<__mw>(lm1));
+
+		// Compute distances used for merging heuristic
+		__mw d0 = _mmw_abs_ps(sdist0);
+		__mw d1 = _mmw_abs_ps(sdist1);
+		__mw d2 = _mmw_abs_ps(_mmw_sub_ps(z0, z1));
+
+		// Find minimum distance (only the sign bits of these float differences are consumed below)
+		__mwi c01 = simd_cast<__mwi>(_mmw_sub_ps(d0, d1));
+		__mwi c02 = simd_cast<__mwi>(_mmw_sub_ps(d0, d2));
+		__mwi c12 = simd_cast<__mwi>(_mmw_sub_ps(d1, d2));
+		// Two tests indicating which layer the incoming triangle will merge with or 
+		// overwrite. d0min indicates that the triangle will overwrite layer 0, and 
+		// d1min flags that the triangle will overwrite layer 1.
+		__mwi d0min = _mmw_or_epi32(_mmw_and_epi32(c01, c02), _mmw_or_epi32(lm0, t0));
+		__mwi d1min = _mmw_andnot_epi32(d0min, _mmw_or_epi32(c12, lm1));
+
+		///////////////////////////////////////////////////////////////////////////////
+		// Update depth buffer entry. NOTE: we always merge into layer 0, so if the 
+		// triangle should be merged with layer 1, we first swap layer 0 & 1 and then
+		// merge into layer 0.
+		///////////////////////////////////////////////////////////////////////////////
+
+		// Update mask based on which layer the triangle overwrites or was merged into
+		__mw inner = _mmw_blendv_ps(simd_cast<__mw>(triMask), simd_cast<__mw>(layerMask1), simd_cast<__mw>(d0min));
+		mask = simd_cast<__mwi>(_mmw_blendv_ps(inner, simd_cast<__mw>(layerMask0), simd_cast<__mw>(d1min)));
+
+		// Update the zMin[0] value. There are four outcomes: overwrite with layer 1,
+		// merge with layer 1, merge with zTri or overwrite with layer 1 and then merge
+		// with zTri.
+		__mw e0 = _mmw_blendv_ps(z0, z1, simd_cast<__mw>(d1min));
+		__mw e1 = _mmw_blendv_ps(z1, zTri, simd_cast<__mw>(_mmw_or_epi32(d1min, d0min)));
+		zMin[0] = _mmw_min_ps(e0, e1);
+
+		// Update the zMin[1] value. There are three outcomes: keep current value,
+		// overwrite with zTri, or overwrite with z1
+		__mw z1t = _mmw_blendv_ps(zTri, z1, simd_cast<__mw>(d0min));
+		zMin[1] = _mmw_blendv_ps(z1t, z0, simd_cast<__mw>(d1min));
+	}
+
+	// Traverses one horizontal row of 32xN tiles, from leftOffset to rightOffset
+	// (tile units relative to tileIdx, the row's first tile index). Template params:
+	// TEST_Z selects occlusion-query mode (return VISIBLE as soon as any subtile
+	// passes) vs. occluder rendering (update the HiZ buffer, return VISIBLE at the
+	// end); NRIGHT/NLEFT are how many right/left triangle edges bound the coverage
+	// on this row (2 on the row containing the middle vertex, otherwise 1).
+	// events[] holds the fixed-point per-scanline edge crossings, rightEvent /
+	// leftEvent index into it, zTriMin/zTriMax clamp the interpolated depth, and
+	// iz0 / zx are the depth interpolant at the row start and its per-tile x step.
+	template<int TEST_Z, int NRIGHT, int NLEFT>
+	FORCE_INLINE int TraverseScanline(int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, const __mwi *events, const __mw &zTriMin, const __mw &zTriMax, const __mw &iz0, float zx)
+	{
+		// Floor edge events to integer pixel coordinates (shift out fixed point bits)
+		int eventOffset = leftOffset << TILE_WIDTH_SHIFT;
+		__mwi right[NRIGHT], left[NLEFT];
+		for (int i = 0; i < NRIGHT; ++i)
+			right[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[rightEvent + i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO);
+		for (int i = 0; i < NLEFT; ++i)
+			left[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[leftEvent - i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO);
+
+		__mw z0 = _mmw_add_ps(iz0, _mmw_set1_ps(zx*leftOffset));
+		int tileIdxEnd = tileIdx + rightOffset;
+		tileIdx += leftOffset;
+		for (;;)
+		{
+			if (TEST_Z)
+				STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1);
+			else
+				STATS_ADD(mStats.mOccluders.mNumTilesTraversed, 1);
+
+			// Perform a coarse test to quickly discard occluded tiles
+#if QUICK_MASK != 0
+			// Only use the reference layer (layer 0) to cull as it is always conservative
+			__mw zMinBuf = mMaskedHiZBuffer[tileIdx].mZMin[0];
+#else
+			// Compute zMin for the overlapped layers 
+			__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
+			__mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0))));
+			__mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32())));
+			__mw zMinBuf = _mmw_min_ps(zMin0, zMin1);
+#endif
+			// All sign bits set means every subtile already occludes the triangle (depth is 1/w: larger = closer).
+			__mw dist0 = _mmw_sub_ps(zTriMax, zMinBuf);
+			if (_mmw_movemask_ps(dist0) != SIMD_ALL_LANES_MASK)
+			{
+				// Compute coverage mask for entire 32xN using shift operations
+				__mwi accumulatedMask = _mmw_sllv_ones(left[0]);
+				for (int i = 1; i < NLEFT; ++i)
+					accumulatedMask = _mmw_and_epi32(accumulatedMask, _mmw_sllv_ones(left[i]));
+				for (int i = 0; i < NRIGHT; ++i)
+					accumulatedMask = _mmw_andnot_epi32(_mmw_sllv_ones(right[i]), accumulatedMask);
+
+				if (TEST_Z)
+				{
+					// Perform a conservative visibility test (test zMax against buffer for each covered 8x4 subtile)
+					__mw zSubTileMax = _mmw_min_ps(z0, zTriMax);
+					__mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zSubTileMax, zMinBuf));
+
+					__mwi rastMask = _mmw_transpose_epi8(accumulatedMask);
+					__mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
+					zPass = _mmw_andnot_epi32(deadLane, zPass);
+
+					if (!_mmw_testz_epi32(zPass, zPass))
+						return CullingResult::VISIBLE;
+				}
+				else
+				{
+					// Compute interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry
+					__mw zSubTileMin = _mmw_max_ps(z0, zTriMin);
+#if QUICK_MASK != 0
+					UpdateTileQuick(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin);
+#else 
+					UpdateTileAccurate(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin);
+#endif
+				}
+			}
+
+			// Update buffer address, interpolate z and edge events
+			tileIdx++;
+			if (tileIdx >= tileIdxEnd)
+				break;
+			z0 = _mmw_add_ps(z0, _mmw_set1_ps(zx));
+			for (int i = 0; i < NRIGHT; ++i)
+				right[i] = _mmw_subs_epu16(right[i], SIMD_TILE_WIDTH);	// Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits)
+			for (int i = 0; i < NLEFT; ++i)
+				left[i] = _mmw_subs_epu16(left[i], SIMD_TILE_WIDTH);
+		}
+
+		return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE;
+	}
+
+
+	// Rasterizes a single triangle -- SIMD lane triIdx of the batch set up by
+	// RasterizeTriangleBatch -- over the masked hierarchical z-buffer. The triangle
+	// is split at the middle vertex into a bottom half (bounded by edges 0 and 2),
+	// one middle tile row (where all three edges matter) and a top half. Template
+	// params: TEST_Z = occlusion query (early out VISIBLE) vs. occluder rendering;
+	// TIGHT_TRAVERSAL = track per-row start/end tiles for large triangles instead of
+	// scanning the whole bounding-box width; MID_VTX_RIGHT = the middle vertex lies
+	// on the right side, which selects which edge indices bound the top half.
+	// The PRECISE_COVERAGE variant additionally carries exact fixed-point remainder
+	// bookkeeping so edge events never drift from the true rational slope.
+	template<int TEST_Z, int TIGHT_TRAVERSAL, int MID_VTX_RIGHT>
+#if PRECISE_COVERAGE != 0
+	FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mw *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy, const __mwi *edgeY, const __mwi *absEdgeX, const __mwi *slopeSign, const __mwi *eventStartRemainder, const __mwi *slopeTileRemainder)
+#else
+	FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mwi *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy)
+#endif
+	{
+		if (TEST_Z)
+			STATS_ADD(mStats.mOccludees.mNumRasterizedTriangles, 1);
+		else
+			STATS_ADD(mStats.mOccluders.mNumRasterizedTriangles, 1);
+
+		int cullResult;
+
+#if PRECISE_COVERAGE != 0
+		#define LEFT_EDGE_BIAS -1
+		#define RIGHT_EDGE_BIAS 1
+		#define UPDATE_TILE_EVENTS_Y(i) \
+				triEventRemainder[i] = _mmw_sub_epi32(triEventRemainder[i], triSlopeTileRemainder[i]); \
+				__mwi overflow##i = _mmw_srai_epi32(triEventRemainder[i], 31); \
+				triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow##i, triEdgeY[i])); \
+				triEvent[i] = _mmw_add_epi32(triEvent[i], _mmw_add_epi32(triSlopeTileDelta[i], _mmw_and_epi32(overflow##i, triSlopeSign[i])))
+
+		// Broadcast lane triIdx of each per-triangle quantity to all SIMD lanes (one lane per scanline).
+		__mwi triEvent[3], triSlopeSign[3], triSlopeTileDelta[3], triEdgeY[3], triSlopeTileRemainder[3], triEventRemainder[3];
+		for (int i = 0; i < 3; ++i)
+		{
+			triSlopeSign[i] = _mmw_set1_epi32(simd_i32(slopeSign[i])[triIdx]);
+			triSlopeTileDelta[i] = _mmw_set1_epi32(simd_i32(slopeTileDelta[i])[triIdx]);
+			triEdgeY[i] = _mmw_set1_epi32(simd_i32(edgeY[i])[triIdx]);
+			triSlopeTileRemainder[i] = _mmw_set1_epi32(simd_i32(slopeTileRemainder[i])[triIdx]);
+
+			__mw triSlope = _mmw_set1_ps(simd_f32(slope[i])[triIdx]);
+			__mwi triAbsEdgeX = _mmw_set1_epi32(simd_i32(absEdgeX[i])[triIdx]);
+			__mwi triStartRemainder = _mmw_set1_epi32(simd_i32(eventStartRemainder[i])[triIdx]);
+			__mwi triEventStart = _mmw_set1_epi32(simd_i32(eventStart[i])[triIdx]);
+
+			// Exact per-scanline edge stepping: carry the division remainder so repeated
+			// integer stepping matches the true rational slope without accumulation error.
+			__mwi scanlineDelta = _mmw_cvttps_epi32(_mmw_mul_ps(triSlope, SIMD_LANE_YCOORD_F));
+			__mwi scanlineSlopeRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(triAbsEdgeX, SIMD_LANE_YCOORD_I), _mmw_mullo_epi32(_mmw_abs_epi32(scanlineDelta), triEdgeY[i]));
+
+			triEventRemainder[i] = _mmw_sub_epi32(triStartRemainder, scanlineSlopeRemainder);
+			__mwi overflow = _mmw_srai_epi32(triEventRemainder[i], 31);
+			triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow, triEdgeY[i]));
+			triEvent[i] = _mmw_add_epi32(_mmw_add_epi32(triEventStart, scanlineDelta), _mmw_and_epi32(overflow, triSlopeSign[i]));
+		}
+
+#else
+		#define LEFT_EDGE_BIAS 0
+		#define RIGHT_EDGE_BIAS 0
+		#define UPDATE_TILE_EVENTS_Y(i)		triEvent[i] = _mmw_add_epi32(triEvent[i], triSlopeTileDelta[i]);
+
+		// Get deltas used to increment edge events each time we traverse one scanline of tiles
+		__mwi triSlopeTileDelta[3];
+		triSlopeTileDelta[0] = _mmw_set1_epi32(simd_i32(slopeTileDelta[0])[triIdx]);
+		triSlopeTileDelta[1] = _mmw_set1_epi32(simd_i32(slopeTileDelta[1])[triIdx]);
+		triSlopeTileDelta[2] = _mmw_set1_epi32(simd_i32(slopeTileDelta[2])[triIdx]);
+
+		// Setup edge events for first batch of SIMD_LANES scanlines
+		__mwi triEvent[3];
+		triEvent[0] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[0])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[0])[triIdx])));
+		triEvent[1] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[1])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[1])[triIdx])));
+		triEvent[2] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[2])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[2])[triIdx])));
+#endif
+
+		// For big triangles track start & end tile for each scanline and only traverse the valid region
+		int startDelta, endDelta, topDelta, startEvent, endEvent, topEvent;
+		if (TIGHT_TRAVERSAL)
+		{
+			startDelta = simd_i32(slopeTileDelta[2])[triIdx] + LEFT_EDGE_BIAS;
+			endDelta = simd_i32(slopeTileDelta[0])[triIdx] + RIGHT_EDGE_BIAS;
+			topDelta = simd_i32(slopeTileDelta[1])[triIdx] + (MID_VTX_RIGHT ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS);
+
+			// Compute conservative bounds for the edge events over a 32xN tile
+			startEvent = simd_i32(eventStart[2])[triIdx] + min(0, startDelta);
+			endEvent = simd_i32(eventStart[0])[triIdx] + max(0, endDelta) + (TILE_WIDTH << FP_BITS);
+			if (MID_VTX_RIGHT)
+				topEvent = simd_i32(eventStart[1])[triIdx] + max(0, topDelta) + (TILE_WIDTH << FP_BITS);
+			else
+				topEvent = simd_i32(eventStart[1])[triIdx] + min(0, topDelta);
+		}
+
+		// The bounding box starts at or below the middle-vertex row: bottom half first.
+		if (tileRowIdx <= tileMidRowIdx)
+		{
+			int tileStopIdx = min(tileEndRowIdx, tileMidRowIdx);
+			// Traverse the bottom half of the triangle
+			while (tileRowIdx < tileStopIdx)
+			{
+				int start = 0, end = bbWidth;
+				if (TIGHT_TRAVERSAL)
+				{
+					// Compute tighter start and endpoints to avoid traversing empty space
+					start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+					end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+					startEvent += startDelta;
+					endEvent += endDelta;
+				}
+
+				// Traverse the scanline and update the masked hierarchical z buffer
+				cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
+
+				if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+					return CullingResult::VISIBLE;
+
+				// move to the next scanline of tiles, update edge events and interpolate z
+				tileRowIdx += mTilesWidth;
+				z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+				UPDATE_TILE_EVENTS_Y(0);
+				UPDATE_TILE_EVENTS_Y(2);
+			}
+
+			// Traverse the middle scanline of tiles. We must consider all three edges only in this region
+			if (tileRowIdx < tileEndRowIdx)
+			{
+				int start = 0, end = bbWidth;
+				if (TIGHT_TRAVERSAL)
+				{
+					// Compute tighter start and endpoints to avoid traversing lots of empty space
+					start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+					end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+
+					// Switch the traversal start / end to account for the upper side edge
+					endEvent = MID_VTX_RIGHT ? topEvent : endEvent;
+					endDelta = MID_VTX_RIGHT ? topDelta : endDelta;
+					startEvent = MID_VTX_RIGHT ? startEvent : topEvent;
+					startDelta = MID_VTX_RIGHT ? startDelta : topDelta;
+					startEvent += startDelta;
+					endEvent += endDelta;
+				}
+
+				// Traverse the scanline and update the masked hierarchical z buffer. 
+				if (MID_VTX_RIGHT)
+					cullResult = TraverseScanline<TEST_Z, 2, 1>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
+				else
+					cullResult = TraverseScanline<TEST_Z, 1, 2>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
+
+				if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+					return CullingResult::VISIBLE;
+
+				tileRowIdx += mTilesWidth;
+			}
+
+			// Traverse the top half of the triangle
+			if (tileRowIdx < tileEndRowIdx)
+			{
+				// move to the next scanline of tiles, update edge events and interpolate z
+				z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+				int i0 = MID_VTX_RIGHT + 0;
+				int i1 = MID_VTX_RIGHT + 1;
+				UPDATE_TILE_EVENTS_Y(i0);
+				UPDATE_TILE_EVENTS_Y(i1);
+				for (;;)
+				{
+					int start = 0, end = bbWidth;
+					if (TIGHT_TRAVERSAL)
+					{
+						// Compute tighter start and endpoints to avoid traversing lots of empty space
+						start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						startEvent += startDelta;
+						endEvent += endDelta;
+					}
+
+					// Traverse the scanline and update the masked hierarchical z buffer
+					cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx);
+
+					if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+						return CullingResult::VISIBLE;
+
+					// move to the next scanline of tiles, update edge events and interpolate z
+					tileRowIdx += mTilesWidth;
+					if (tileRowIdx >= tileEndRowIdx)
+						break;
+					z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+					UPDATE_TILE_EVENTS_Y(i0);
+					UPDATE_TILE_EVENTS_Y(i1);
+				}
+			}
+		}
+		else
+		{
+			// The bounding box starts above the middle-vertex row: only the top half overlaps it.
+			if (TIGHT_TRAVERSAL)
+			{
+				// For large triangles, switch the traversal start / end to account for the upper side edge
+				endEvent = MID_VTX_RIGHT ? topEvent : endEvent;
+				endDelta = MID_VTX_RIGHT ? topDelta : endDelta;
+				startEvent = MID_VTX_RIGHT ? startEvent : topEvent;
+				startDelta = MID_VTX_RIGHT ? startDelta : topDelta;
+			}
+
+			// Traverse the top half of the triangle
+			if (tileRowIdx < tileEndRowIdx)
+			{
+				int i0 = MID_VTX_RIGHT + 0;
+				int i1 = MID_VTX_RIGHT + 1;
+				for (;;)
+				{
+					int start = 0, end = bbWidth;
+					if (TIGHT_TRAVERSAL)
+					{
+						// Compute tighter start and endpoints to avoid traversing lots of empty space
+						start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
+						startEvent += startDelta;
+						endEvent += endDelta;
+					}
+
+					// Traverse the scanline and update the masked hierarchical z buffer
+					cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx);
+
+					if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
+						return CullingResult::VISIBLE;
+
+					// move to the next scanline of tiles, update edge events and interpolate z
+					tileRowIdx += mTilesWidth;
+					if (tileRowIdx >= tileEndRowIdx)
+						break;
+					z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
+					UPDATE_TILE_EVENTS_Y(i0);
+					UPDATE_TILE_EVENTS_Y(i1);
+				}
+			}
+		}
+
+		return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE;
+	}
+
+	template<bool TEST_Z>
+#if PRECISE_COVERAGE != 0
+	FORCE_INLINE int RasterizeTriangleBatch(__mwi ipVtxX[3], __mwi ipVtxY[3], __mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor)
+#else
+	FORCE_INLINE int RasterizeTriangleBatch(__mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor)
+#endif
+	{
+		int cullResult = CullingResult::VIEW_CULLED;
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Compute bounding box and clamp to tile coordinates
+		//////////////////////////////////////////////////////////////////////////////
+
+		__mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY;
+		ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, scissor);
+
+		// Clamp bounding box to tiles (it's already padded in computeBoundingBox)
+		__mwi bbTileMinX = _mmw_srai_epi32(bbPixelMinX, TILE_WIDTH_SHIFT);
+		__mwi bbTileMinY = _mmw_srai_epi32(bbPixelMinY, TILE_HEIGHT_SHIFT);
+		__mwi bbTileMaxX = _mmw_srai_epi32(bbPixelMaxX, TILE_WIDTH_SHIFT);
+		__mwi bbTileMaxY = _mmw_srai_epi32(bbPixelMaxY, TILE_HEIGHT_SHIFT);
+		__mwi bbTileSizeX = _mmw_sub_epi32(bbTileMaxX, bbTileMinX);
+		__mwi bbTileSizeY = _mmw_sub_epi32(bbTileMaxY, bbTileMinY);
+
+		// Cull triangles with zero bounding box
+		__mwi bboxSign = _mmw_or_epi32(_mmw_sub_epi32(bbTileSizeX, _mmw_set1_epi32(1)), _mmw_sub_epi32(bbTileSizeY, _mmw_set1_epi32(1)));
+		triMask &= ~_mmw_movemask_ps(simd_cast<__mw>(bboxSign)) & SIMD_ALL_LANES_MASK;
+		if (triMask == 0x0)
+			return cullResult;
+
+		if (!TEST_Z)
+			cullResult = CullingResult::VISIBLE;
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Set up screen space depth plane
+		//////////////////////////////////////////////////////////////////////////////
+
+		__mw zPixelDx, zPixelDy;
+		ComputeDepthPlane(pVtxX, pVtxY, pVtxZ, zPixelDx, zPixelDy);
+
+		// Compute z value at min corner of bounding box. Offset to make sure z is conservative for all 8x4 subtiles
+		__mw bbMinXV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinX), pVtxX[0]);
+		__mw bbMinYV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinY), pVtxY[0]);
+		__mw zPlaneOffset = _mmw_fmadd_ps(zPixelDx, bbMinXV0, _mmw_fmadd_ps(zPixelDy, bbMinYV0, pVtxZ[0]));
+		__mw zTileDx = _mmw_mul_ps(zPixelDx, _mmw_set1_ps((float)TILE_WIDTH));
+		__mw zTileDy = _mmw_mul_ps(zPixelDy, _mmw_set1_ps((float)TILE_HEIGHT));
+		if (TEST_Z)
+		{
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH))));
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT))));
+		}
+		else
+		{
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH))));
+			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT))));
+		}
+
+		// Compute Zmin and Zmax for the triangle (used to narrow the range for difficult tiles)
+		__mw zMin = _mmw_min_ps(pVtxZ[0], _mmw_min_ps(pVtxZ[1], pVtxZ[2]));
+		__mw zMax = _mmw_max_ps(pVtxZ[0], _mmw_max_ps(pVtxZ[1], pVtxZ[2]));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Sort vertices (v0 has lowest Y, and the rest is in winding order) and
+		// compute edges. Also find the middle vertex and compute tile
+		//////////////////////////////////////////////////////////////////////////////
+
+#if PRECISE_COVERAGE != 0
+
+		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
+		SortVertices(ipVtxX, ipVtxY);
+
+		// Compute edges
+		__mwi edgeX[3] = { _mmw_sub_epi32(ipVtxX[1], ipVtxX[0]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[1]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[0]) };
+		__mwi edgeY[3] = { _mmw_sub_epi32(ipVtxY[1], ipVtxY[0]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[1]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[0]) };
+
+		// Classify if the middle vertex is on the left or right and compute its position
+		int midVtxRight = ~_mmw_movemask_ps(simd_cast<__mw>(edgeY[1]));
+		__mwi midPixelX = _mmw_blendv_epi32(ipVtxX[1], ipVtxX[2], edgeY[1]);
+		__mwi midPixelY = _mmw_blendv_epi32(ipVtxY[1], ipVtxY[2], edgeY[1]);
+		__mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(midPixelY, SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT + FP_BITS);
+		__mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY));
+
+		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of 
+		// the edge originating from the middle vertex.
+		__mwi xDiffi[2], yDiffi[2];
+		xDiffi[0] = _mmw_sub_epi32(ipVtxX[0], _mmw_slli_epi32(bbPixelMinX, FP_BITS));
+		xDiffi[1] = _mmw_sub_epi32(midPixelX, _mmw_slli_epi32(bbPixelMinX, FP_BITS));
+		yDiffi[0] = _mmw_sub_epi32(ipVtxY[0], _mmw_slli_epi32(bbPixelMinY, FP_BITS));
+		yDiffi[1] = _mmw_sub_epi32(midPixelY, _mmw_slli_epi32(bbMidTileY, FP_BITS + TILE_HEIGHT_SHIFT));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Edge slope setup - Note we do not conform to DX/GL rasterization rules
+		//////////////////////////////////////////////////////////////////////////////
+
+		// Potentially flip edge to ensure that all edges have positive Y slope.
+		edgeX[1] = _mmw_blendv_epi32(edgeX[1], _mmw_neg_epi32(edgeX[1]), edgeY[1]);
+		edgeY[1] = _mmw_abs_epi32(edgeY[1]);
+
+		// Compute floating point slopes
+		__mw slope[3];
+		slope[0] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[0]), _mmw_cvtepi32_ps(edgeY[0]));
+		slope[1] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[1]), _mmw_cvtepi32_ps(edgeY[1]));
+		slope[2] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[2]), _mmw_cvtepi32_ps(edgeY[2]));
+
+		// Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
+		// width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that 
+		// vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
+		__mw  horizontalSlopeDelta = _mmw_set1_ps(2.0f * ((float)mWidth + 2.0f*(GUARD_BAND_PIXEL_SIZE + 1.0f)));
+		__mwi horizontalSlope0 = _mmw_cmpeq_epi32(edgeY[0], _mmw_setzero_epi32());
+		__mwi horizontalSlope1 = _mmw_cmpeq_epi32(edgeY[1], _mmw_setzero_epi32());
+		slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, simd_cast<__mw>(horizontalSlope0));
+		slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), simd_cast<__mw>(horizontalSlope1));
+
+		__mwi vy[3] = { yDiffi[0], yDiffi[1], yDiffi[0] };
+		__mwi offset0 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[0], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS)));
+		__mwi offset1 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[1], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS)));
+		vy[0] = _mmw_blendv_epi32(yDiffi[0], offset0, horizontalSlope0);
+		vy[1] = _mmw_blendv_epi32(yDiffi[1], offset1, horizontalSlope1);
+
+		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of 
+		// the edge originating from the middle vertex.
+		__mwi slopeSign[3], absEdgeX[3];
+		__mwi slopeTileDelta[3], eventStartRemainder[3], slopeTileRemainder[3], eventStart[3];
+		for (int i = 0; i < 3; i++)
+		{
+			// Common, compute slope sign (used to propagate the remainder term when overflowing) is postive or negative x-direction
+			slopeSign[i] = _mmw_blendv_epi32(_mmw_set1_epi32(1), _mmw_set1_epi32(-1), edgeX[i]);
+			absEdgeX[i] = _mmw_abs_epi32(edgeX[i]);
+
+			// Delta and error term for one vertical tile step. The exact delta is exactDelta = edgeX / edgeY, due to limited precision we 
+			// repersent the delta as delta = qoutient + remainder / edgeY, where quotient = int(edgeX / edgeY). In this case, since we step 
+			// one tile of scanlines at a time, the slope is computed for a tile-sized step.
+			slopeTileDelta[i] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_set1_ps(FP_TILE_HEIGHT)));
+			slopeTileRemainder[i] = _mmw_sub_epi32(_mmw_slli_epi32(absEdgeX[i], FP_TILE_HEIGHT_SHIFT), _mmw_mullo_epi32(_mmw_abs_epi32(slopeTileDelta[i]), edgeY[i]));
+
+			// Jump to bottom scanline of tile row, this is the bottom of the bounding box, or the middle vertex of the triangle.
+			// The jump can be in both positive and negative y-direction due to clipping / offscreen vertices.
+			__mwi tileStartDir = _mmw_blendv_epi32(slopeSign[i], _mmw_neg_epi32(slopeSign[i]), vy[i]);
+			__mwi tieBreaker = _mmw_blendv_epi32(_mmw_set1_epi32(0), _mmw_set1_epi32(1), tileStartDir);
+			__mwi tileStartSlope = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_cvtepi32_ps(_mmw_neg_epi32(vy[i]))));
+			__mwi tileStartRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(absEdgeX[i], _mmw_abs_epi32(vy[i])), _mmw_mullo_epi32(_mmw_abs_epi32(tileStartSlope), edgeY[i]));
+			
+			eventStartRemainder[i] = _mmw_sub_epi32(tileStartRemainder, tieBreaker);
+			__mwi overflow = _mmw_srai_epi32(eventStartRemainder[i], 31);
+			eventStartRemainder[i] = _mmw_add_epi32(eventStartRemainder[i], _mmw_and_epi32(overflow, edgeY[i]));
+			eventStartRemainder[i] = _mmw_blendv_epi32(eventStartRemainder[i], _mmw_sub_epi32(_mmw_sub_epi32(edgeY[i], eventStartRemainder[i]), _mmw_set1_epi32(1)), vy[i]);
+			
+			//eventStart[i] = xDiffi[i & 1] + tileStartSlope + (overflow & tileStartDir) + _mmw_set1_epi32(FP_HALF_PIXEL - 1) + tieBreaker;
+			eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(xDiffi[i & 1], tileStartSlope), _mmw_and_epi32(overflow, tileStartDir));
+			eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(eventStart[i], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), tieBreaker);
+		}
+
+#else // PRECISE_COVERAGE
+
+		SortVertices(pVtxX, pVtxY);
+
+		// Compute edges
+		__mw edgeX[3] = { _mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxX[2], pVtxX[1]), _mmw_sub_ps(pVtxX[2], pVtxX[0]) };
+		__mw edgeY[3] = { _mmw_sub_ps(pVtxY[1], pVtxY[0]), _mmw_sub_ps(pVtxY[2], pVtxY[1]), _mmw_sub_ps(pVtxY[2], pVtxY[0]) };
+
+		// Classify if the middle vertex is on the left or right and compute its position
+		int midVtxRight = ~_mmw_movemask_ps(edgeY[1]);
+		__mw midPixelX = _mmw_blendv_ps(pVtxX[1], pVtxX[2], edgeY[1]);
+		__mw midPixelY = _mmw_blendv_ps(pVtxY[1], pVtxY[2], edgeY[1]);
+		__mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(_mmw_cvttps_epi32(midPixelY), SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT);
+		__mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Edge slope setup - Note we do not conform to DX/GL rasterization rules
+		//////////////////////////////////////////////////////////////////////////////
+
+		// Compute floating point slopes
+		__mw slope[3];
+		slope[0] = _mmw_div_ps(edgeX[0], edgeY[0]);
+		slope[1] = _mmw_div_ps(edgeX[1], edgeY[1]);
+		slope[2] = _mmw_div_ps(edgeX[2], edgeY[2]);
+
+		// Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
+		// width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that 
+		// vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
+		__mw horizontalSlopeDelta = _mmw_set1_ps((float)mWidth + 2.0f*(GUARD_BAND_PIXEL_SIZE + 1.0f));
+		slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, _mmw_cmpeq_ps(edgeY[0], _mmw_setzero_ps()));
+		slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), _mmw_cmpeq_ps(edgeY[1], _mmw_setzero_ps()));
+
+		// Convert floaing point slopes to fixed point
+		__mwi slopeFP[3];
+		slopeFP[0] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[0], _mmw_set1_ps(1 << FP_BITS)));
+		slopeFP[1] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[1], _mmw_set1_ps(1 << FP_BITS)));
+		slopeFP[2] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[2], _mmw_set1_ps(1 << FP_BITS)));
+
+		// Fan out edge slopes to avoid (rare) cracks at vertices. We increase right facing slopes 
+		// by 1 LSB, which results in overshooting vertices slightly, increasing triangle coverage. 
+		// e0 is always right facing, e1 depends on if the middle vertex is on the left or right
+		slopeFP[0] = _mmw_add_epi32(slopeFP[0], _mmw_set1_epi32(1));
+		slopeFP[1] = _mmw_add_epi32(slopeFP[1], _mmw_srli_epi32(_mmw_not_epi32(simd_cast<__mwi>(edgeY[1])), 31));
+
+		// Compute slope deltas for an SIMD_LANES scanline step (tile height)
+		__mwi slopeTileDelta[3];
+		slopeTileDelta[0] = _mmw_slli_epi32(slopeFP[0], TILE_HEIGHT_SHIFT);
+		slopeTileDelta[1] = _mmw_slli_epi32(slopeFP[1], TILE_HEIGHT_SHIFT);
+		slopeTileDelta[2] = _mmw_slli_epi32(slopeFP[2], TILE_HEIGHT_SHIFT);
+
+		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of 
+		// the edge originating from the middle vertex.
+		__mwi xDiffi[2], yDiffi[2];
+		xDiffi[0] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(pVtxX[0]), bbPixelMinX), FP_BITS);
+		xDiffi[1] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(midPixelX), bbPixelMinX), FP_BITS);
+		yDiffi[0] = _mmw_sub_epi32(_mmw_cvttps_epi32(pVtxY[0]), bbPixelMinY);
+		yDiffi[1] = _mmw_sub_epi32(_mmw_cvttps_epi32(midPixelY), _mmw_slli_epi32(bbMidTileY, TILE_HEIGHT_SHIFT));
+
+		__mwi eventStart[3];
+		eventStart[0] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[0], yDiffi[0]));
+		eventStart[1] = _mmw_sub_epi32(xDiffi[1], _mmw_mullo_epi32(slopeFP[1], yDiffi[1]));
+		eventStart[2] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[2], yDiffi[0]));
+#endif
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Split bounding box into bottom - middle - top region.
+		//////////////////////////////////////////////////////////////////////////////
+
+		__mwi bbBottomIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(bbTileMinY, _mmw_set1_epi32(mTilesWidth)));
+		__mwi bbTopIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(_mmw_add_epi32(bbTileMinY, bbTileSizeY), _mmw_set1_epi32(mTilesWidth)));
+		__mwi bbMidIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(midTileY, _mmw_set1_epi32(mTilesWidth)));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Loop over non-culled triangle and change SIMD axis to per-pixel
+		//////////////////////////////////////////////////////////////////////////////
+		while (triMask)
+		{
+			unsigned int triIdx = find_clear_lsb(&triMask);
+			int triMidVtxRight = (midVtxRight >> triIdx) & 1;
+
+			// Get Triangle Zmin zMax
+			__mw zTriMax = _mmw_set1_ps(simd_f32(zMax)[triIdx]);
+			__mw zTriMin = _mmw_set1_ps(simd_f32(zMin)[triIdx]);
+
+			// Setup Zmin value for first set of 8x4 subtiles
+			__mw z0 = _mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDx)[triIdx]), SIMD_SUB_TILE_COL_OFFSET_F,
+				_mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDy)[triIdx]), SIMD_SUB_TILE_ROW_OFFSET_F, _mmw_set1_ps(simd_f32(zPlaneOffset)[triIdx])));
+			float zx = simd_f32(zTileDx)[triIdx];
+			float zy = simd_f32(zTileDy)[triIdx];
+
+			// Get dimension of bounding box bottom, mid & top segments
+			int bbWidth = simd_i32(bbTileSizeX)[triIdx];
+			int bbHeight = simd_i32(bbTileSizeY)[triIdx];
+			int tileRowIdx = simd_i32(bbBottomIdx)[triIdx];
+			int tileMidRowIdx = simd_i32(bbMidIdx)[triIdx];
+			int tileEndRowIdx = simd_i32(bbTopIdx)[triIdx];
+
+			if (bbWidth > BIG_TRIANGLE && bbHeight > BIG_TRIANGLE) // For big triangles we use a more expensive but tighter traversal algorithm
+			{
+#if PRECISE_COVERAGE != 0
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+#else
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 1, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+#endif
+			}
+			else
+			{
+#if PRECISE_COVERAGE != 0
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
+#else
+				if (triMidVtxRight)
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+				else
+					cullResult &= RasterizeTriangle<TEST_Z, 0, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
+#endif
+			}
+
+			if (TEST_Z && cullResult == CullingResult::VISIBLE)
+				return CullingResult::VISIBLE;
+		}
+
+		return cullResult;
+	}
+
+	// Shared implementation behind the public RenderTriangles() / TestTriangles()
+	// entry points. TEST_Z == 0: rasterize occluder triangles into the masked
+	// hierarchical z-buffer. TEST_Z == 1: occlusion-test only, returning VISIBLE
+	// as soon as any triangle passes the depth test (no buffer writes on the
+	// early-out path). FAST_GATHER selects the optimized vertex fetch for the
+	// common tightly packed XYZW layout (stride 16, offsets 4/12).
+	template<int TEST_Z, int FAST_GATHER>
+	FORCE_INLINE CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout)
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		if (TEST_Z)
+			STATS_ADD(mStats.mOccludees.mNumProcessedTriangles, nTris);
+		else
+			STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris);
+
+#if PRECISE_COVERAGE != 0
+		// The precise-coverage path depends on round-to-nearest float->int
+		// conversions; save the caller's FP rounding mode and restore it on
+		// every exit path below.
+		int originalRoundingMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+#endif
+
+		// Ring buffer holding triangles produced by near/guard-band clipping,
+		// consumed by GatherTransformClip() on later iterations.
+		int clipHead = 0;
+		int clipTail = 0;
+		__m128 clipTriBuffer[MAX_CLIPPED * 3];
+		// Start fully culled; each rasterized batch AND:s its result in.
+		int cullResult = CullingResult::VIEW_CULLED;
+
+		const unsigned int *inTrisPtr = inTris;
+		int numLanes = SIMD_LANES;
+		int triIndex = 0;
+		while (triIndex < nTris || clipHead != clipTail)
+		{
+            __mw vtxX[3], vtxY[3], vtxW[3];
+            unsigned int triMask = SIMD_ALL_LANES_MASK;
+
+            GatherTransformClip<FAST_GATHER>( clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask );
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Project, transform to screen space and perform backface culling. Note 
+			// that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
+			// z = 1 is near. We must also use a greater than depth test, and in effect
+			// everything is reversed compared to regular z implementations.
+			//////////////////////////////////////////////////////////////////////////////
+
+			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
+
+#if PRECISE_COVERAGE != 0
+			__mwi ipVtxX[3], ipVtxY[3];
+			ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#else
+			ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#endif
+
+			// Perform backface test. 
+			__mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0]));
+			__mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1]));
+			__mw triArea = _mmw_sub_ps(triArea1, triArea2);
+			__mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps());
+
+#if PRECISE_COVERAGE != 0
+			triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#else
+			triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#endif
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Setup and rasterize a SIMD batch of triangles
+			//////////////////////////////////////////////////////////////////////////////
+#if PRECISE_COVERAGE != 0
+			cullResult &= RasterizeTriangleBatch<TEST_Z>(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor);
+#else
+			cullResult &= RasterizeTriangleBatch<TEST_Z>(pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor);
+#endif
+
+			// When only testing, any visible triangle lets us terminate early.
+			if (TEST_Z && cullResult == CullingResult::VISIBLE) {
+#if PRECISE_COVERAGE != 0
+				_MM_SET_ROUNDING_MODE(originalRoundingMode);
+#endif
+				return CullingResult::VISIBLE;
+			}
+		}
+
+#if PRECISE_COVERAGE != 0
+		_MM_SET_ROUNDING_MODE(originalRoundingMode);
+#endif
+		return (CullingResult)cullResult;
+	}
+
+	// Public occluder-rendering entry point. Chooses the fast vertex-gather
+	// specialization when the layout matches the standard packed XYZW format
+	// (stride 16, Y at offset 4, W at offset 12), then forwards to the
+	// templated implementation with TEST_Z disabled.
+	CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
+	{
+		const bool useFastGather = vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12;
+
+		CullingResult result = useFastGather ?
+			(CullingResult)RenderTriangles<0, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout) :
+			(CullingResult)RenderTriangles<0, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+
+#if MOC_RECORDER_ENABLE
+        RecordRenderTriangles( inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout, result );
+#endif
+		return result;
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Occlusion query functions
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	// Public occlusion-query entry point for triangle lists. Selects the fast
+	// vertex-gather specialization for the standard packed XYZW layout and
+	// forwards to the templated implementation with TEST_Z enabled, so no
+	// depth-buffer writes occur.
+	CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
+	{
+		const bool useFastGather = vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12;
+
+		CullingResult result = useFastGather ?
+			(CullingResult)RenderTriangles<1, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout) :
+			(CullingResult)RenderTriangles<1, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+
+#if MOC_RECORDER_ENABLE
+        {
+            std::lock_guard<std::mutex> lock( mRecorderMutex );
+            if( mRecorder != nullptr ) mRecorder->RecordTestTriangles( result, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout );
+        }
+#endif
+        return result;
+	}
+    
+    // Occlusion-tests an axis-aligned screen-space rectangle at the single
+    // conservative depth 1/wmin against the masked hierarchical z-buffer.
+    // Returns VIEW_CULLED if the tile-padded bounding box is degenerate after
+    // clamping to the screen, VISIBLE as soon as any overlapped subtile passes
+    // the conservative depth test, and OCCLUDED otherwise.
+    CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ) const override
+	{
+		STATS_ADD(mStats.mOccludees.mNumProcessedRectangles, 1);
+		assert(mMaskedHiZBuffer != nullptr);
+
+		// Padding/masking constants used to round pixel bounds out to whole
+		// tiles (for traversal) and subtiles (for per-lane rejection).
+		static const __m128i SIMD_TILE_PAD = _mm_setr_epi32(0, TILE_WIDTH, 0, TILE_HEIGHT);
+		static const __m128i SIMD_TILE_PAD_MASK = _mm_setr_epi32(~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1), ~(TILE_HEIGHT - 1), ~(TILE_HEIGHT - 1));
+		static const __m128i SIMD_SUB_TILE_PAD = _mm_setr_epi32(0, SUB_TILE_WIDTH, 0, SUB_TILE_HEIGHT);
+		static const __m128i SIMD_SUB_TILE_PAD_MASK = _mm_setr_epi32(~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_HEIGHT - 1), ~(SUB_TILE_HEIGHT - 1));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Compute screen space bounding box and guard for out of bounds
+		//////////////////////////////////////////////////////////////////////////////
+#if USE_D3D != 0
+		__m128  pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymax, ymin), mIHalfSize, mICenter);
+#else
+		__m128  pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymin, ymax), mIHalfSize, mICenter);
+#endif
+		__m128i pixelBBoxi = _mm_cvttps_epi32(pixelBBox);
+		pixelBBoxi = _mmx_max_epi32(_mm_setzero_si128(), _mmx_min_epi32(mIScreenSize, pixelBBoxi));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Pad bounding box to (32xN) tiles. Tile BB is used for looping / traversal
+		//////////////////////////////////////////////////////////////////////////////
+		__m128i tileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_TILE_PAD), SIMD_TILE_PAD_MASK);
+		int txMin = simd_i32(tileBBoxi)[0] >> TILE_WIDTH_SHIFT;
+		int txMax = simd_i32(tileBBoxi)[1] >> TILE_WIDTH_SHIFT;
+		int tileRowIdx = (simd_i32(tileBBoxi)[2] >> TILE_HEIGHT_SHIFT)*mTilesWidth;
+		int tileRowIdxEnd = (simd_i32(tileBBoxi)[3] >> TILE_HEIGHT_SHIFT)*mTilesWidth;
+
+		// Empty (clamped-away) rectangle: nothing on screen to test.
+		if (simd_i32(tileBBoxi)[0] == simd_i32(tileBBoxi)[1] || simd_i32(tileBBoxi)[2] == simd_i32(tileBBoxi)[3])
+        {
+#if MOC_RECORDER_ENABLE
+            {
+                std::lock_guard<std::mutex> lock( mRecorderMutex );
+                if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::VIEW_CULLED, xmin, ymin, xmax, ymax, wmin );
+            }
+#endif
+            return CullingResult::VIEW_CULLED;
+        }
+
+		///////////////////////////////////////////////////////////////////////////////
+		// Pad bounding box to (8x4) subtiles. Skip SIMD lanes outside the subtile BB
+		///////////////////////////////////////////////////////////////////////////////
+		__m128i subTileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_SUB_TILE_PAD), SIMD_SUB_TILE_PAD_MASK);
+		__mwi stxmin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[0] - 1); // - 1 to be able to use GT test
+		__mwi stymin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[2] - 1); // - 1 to be able to use GT test
+		__mwi stxmax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[1]);
+		__mwi stymax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[3]);
+
+		// Setup pixel coordinates used to discard lanes outside subtile BB
+		__mwi startPixelX = _mmw_add_epi32(SIMD_SUB_TILE_COL_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[0]));
+		__mwi pixelY = _mmw_add_epi32(SIMD_SUB_TILE_ROW_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[2]));
+
+		//////////////////////////////////////////////////////////////////////////////
+		// Compute z from w. Note that z is reversed order, 0 = far, 1 = near, which
+		// means we use a greater than test, so zMax is used to test for visibility.
+		//////////////////////////////////////////////////////////////////////////////
+		__mw zMax = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_set1_ps(wmin));
+
+		// Traverse all tiles overlapped by the rectangle, row by row.
+		for (;;)
+		{
+			__mwi pixelX = startPixelX;
+			for (int tx = txMin;;)
+			{
+				STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1);
+
+				int tileIdx = tileRowIdx + tx;
+				assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight);
+
+				// Fetch zMin from masked hierarchical Z buffer
+#if QUICK_MASK != 0
+				__mw zBuf = mMaskedHiZBuffer[tileIdx].mZMin[0];
+#else
+				// Resolve the two z layers conservatively using the coverage mask.
+				__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
+				__mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0))));
+				__mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32())));
+				__mw zBuf = _mmw_min_ps(zMin0, zMin1);
+#endif
+				// Perform conservative greater than test against hierarchical Z buffer (zMax >= zBuf means the subtile is visible)
+				__mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zMax, zBuf));	//zPass = zMax >= zBuf ? ~0 : 0
+
+				// Mask out lanes corresponding to subtiles outside the bounding box
+				__mwi bboxTestMin = _mmw_and_epi32(_mmw_cmpgt_epi32(pixelX, stxmin), _mmw_cmpgt_epi32(pixelY, stymin));
+				__mwi bboxTestMax = _mmw_and_epi32(_mmw_cmpgt_epi32(stxmax, pixelX), _mmw_cmpgt_epi32(stymax, pixelY));
+				__mwi boxMask = _mmw_and_epi32(bboxTestMin, bboxTestMax);
+				zPass = _mmw_and_epi32(zPass, boxMask);
+
+				// If not all tiles failed the conservative z test we can immediately terminate the test
+				if (!_mmw_testz_epi32(zPass, zPass))
+                {
+#if MOC_RECORDER_ENABLE
+                    {
+                        std::lock_guard<std::mutex> lock( mRecorderMutex );
+                        if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::VISIBLE, xmin, ymin, xmax, ymax, wmin );
+                    }
+#endif
+                    return CullingResult::VISIBLE;
+                }
+
+				if (++tx >= txMax)
+					break;
+				pixelX = _mmw_add_epi32(pixelX, _mmw_set1_epi32(TILE_WIDTH));
+			}
+
+			tileRowIdx += mTilesWidth;
+			if (tileRowIdx >= tileRowIdxEnd)
+				break;
+			pixelY = _mmw_add_epi32(pixelY, _mmw_set1_epi32(TILE_HEIGHT));
+		}
+		// Every overlapped subtile failed the depth test: the rect is hidden.
+#if MOC_RECORDER_ENABLE
+        {
+            std::lock_guard<std::mutex> lock( mRecorderMutex );
+            if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::OCCLUDED, xmin, ymin, xmax, ymax, wmin );
+        }
+#endif
+		return CullingResult::OCCLUDED;
+	}
+
+	// Shared implementation behind the public BinTriangles() entry point.
+	// Gathers, transforms and clips triangles, backface-culls them, and then
+	// appends the screen-space vertices of each surviving triangle to the
+	// triangle list (triLists) of every bin its pixel bounding box overlaps,
+	// for later per-bin rasterization. Writes nothing to the depth buffer.
+	template<bool FAST_GATHER>
+	FORCE_INLINE void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout)
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+#if PRECISE_COVERAGE != 0
+		// Precise coverage depends on round-to-nearest conversions; restore the
+		// caller's FP rounding mode before returning.
+		int originalRoundingMode = _MM_GET_ROUNDING_MODE();
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+#endif
+
+		STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris);
+
+		// Ring buffer of triangles produced by clipping, drained by
+		// GatherTransformClip() on subsequent loop iterations.
+		int clipHead = 0;
+		int clipTail = 0;
+		__m128 clipTriBuffer[MAX_CLIPPED * 3];
+
+		const unsigned int *inTrisPtr = inTris;
+		int numLanes = SIMD_LANES;
+		int triIndex = 0;
+		while (triIndex < nTris || clipHead != clipTail)
+		{
+            unsigned int triMask = SIMD_ALL_LANES_MASK;
+            __mw vtxX[3], vtxY[3], vtxW[3];
+
+            GatherTransformClip<FAST_GATHER>( clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask );
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Project, transform to screen space and perform backface culling. Note 
+			// that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
+			// z = 1 is near. We must also use a greater than depth test, and in effect
+			// everything is reversed compared to regular z implementations.
+			//////////////////////////////////////////////////////////////////////////////
+
+			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
+
+#if PRECISE_COVERAGE != 0
+			__mwi ipVtxX[3], ipVtxY[3];
+			ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#else
+			ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
+#endif
+
+			// Perform backface test. 
+			__mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0]));
+			__mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1]));
+			__mw triArea = _mmw_sub_ps(triArea1, triArea2);
+			__mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps());
+
+#if PRECISE_COVERAGE != 0
+			triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#else
+			triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
+#endif
+
+			if (triMask == 0x0)
+				continue;
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Bin triangles
+			//////////////////////////////////////////////////////////////////////////////
+
+			unsigned int binWidth;
+			unsigned int binHeight;
+			ComputeBinWidthHeight(nBinsW, nBinsH, binWidth, binHeight);
+
+			// Compute pixel bounding box
+			__mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY;
+			ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, &mFullscreenScissor);
+
+			// Scatter each surviving SIMD lane (triangle) into all bins its
+			// bounding box touches.
+			while (triMask)
+			{
+				unsigned int triIdx = find_clear_lsb(&triMask);
+
+				// Clamp bounding box to bins
+				int startX = min(nBinsW-1, simd_i32(bbPixelMinX)[triIdx] / binWidth);
+				int startY = min(nBinsH-1, simd_i32(bbPixelMinY)[triIdx] / binHeight);
+				int endX = min(nBinsW, (simd_i32(bbPixelMaxX)[triIdx] + binWidth - 1) / binWidth);
+				int endY = min(nBinsH, (simd_i32(bbPixelMaxY)[triIdx] + binHeight - 1) / binHeight);
+
+				for (int y = startY; y < endY; ++y)
+				{
+					for (int x = startX; x < endX; ++x)
+					{
+						int binIdx = x + y * nBinsW;
+						unsigned int writeTriIdx = triLists[binIdx].mTriIdx;
+						// 9 floats per triangle: x, y, z for each of 3 vertices.
+						// With PRECISE_COVERAGE, x/y are stored as fixed-point ints.
+						for (int i = 0; i < 3; ++i)
+						{
+#if PRECISE_COVERAGE != 0
+							((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 0] = simd_i32(ipVtxX[i])[triIdx];
+							((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 1] = simd_i32(ipVtxY[i])[triIdx];
+#else
+							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 0] = simd_f32(pVtxX[i])[triIdx];
+							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 1] = simd_f32(pVtxY[i])[triIdx];
+#endif
+							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 2] = simd_f32(pVtxZ[i])[triIdx];
+						}
+						triLists[binIdx].mTriIdx++;
+					}
+				}
+			}
+		}
+#if PRECISE_COVERAGE != 0
+		_MM_SET_ROUNDING_MODE(originalRoundingMode);
+#endif
+	}
+
+	// Public binning entry point. Detects the standard packed XYZW vertex
+	// layout (stride 16, Y at offset 4, W at offset 12) and dispatches to the
+	// matching gather specialization of the templated implementation.
+	void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
+	{
+		const bool fastGather = vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12;
+		if (fastGather)
+			BinTriangles<true>(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+		else
+			BinTriangles<false>(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
+	}
+
+    template<int FAST_GATHER>
+    void GatherTransformClip( int & clipHead, int & clipTail, int & numLanes, int nTris, int & triIndex, __mw * vtxX, __mw * vtxY, __mw * vtxW, const float * inVtx, const unsigned int * &inTrisPtr, const VertexLayout & vtxLayout, const float * modelToClipMatrix, __m128 * clipTriBuffer, unsigned int &triMask, ClipPlanes clipPlaneMask )
+    {
+        //////////////////////////////////////////////////////////////////////////////
+        // Assemble triangles from the index list 
+        //////////////////////////////////////////////////////////////////////////////
+        unsigned int triClipMask = SIMD_ALL_LANES_MASK;
+
+        if( clipHead != clipTail )
+        {
+            int clippedTris = clipHead > clipTail ? clipHead - clipTail : MAX_CLIPPED + clipHead - clipTail;
+            clippedTris = min( clippedTris, SIMD_LANES );
+
+#if CLIPPING_PRESERVES_ORDER != 0
+            // if preserving order, don't mix clipped and new triangles, handle the clip buffer fully
+            // and then continue gathering; this is not as efficient - ideally we want to gather
+            // at the end (if clip buffer has less than SIMD_LANES triangles) but that requires
+            // more modifications below - something to do in the future.
+            numLanes = 0;
+#else
+            // Fill out SIMD registers by fetching more triangles. 
+            numLanes = max( 0, min( SIMD_LANES - clippedTris, nTris - triIndex ) );
+#endif
+
+            if( numLanes > 0 ) {
+                if( FAST_GATHER )
+                    GatherVerticesFast( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes );
+                else
+                    GatherVertices( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout );
+
+                TransformVerts( vtxX, vtxY, vtxW, modelToClipMatrix );
+            }
+
+            for( int clipTri = numLanes; clipTri < numLanes + clippedTris; clipTri++ )
+            {
+                int triIdx = clipTail * 3;
+                for( int i = 0; i < 3; i++ )
+                {
+                    simd_f32( vtxX[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[0];
+                    simd_f32( vtxY[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[1];
+                    simd_f32( vtxW[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[2];
+                }
+                clipTail = ( clipTail + 1 ) & ( MAX_CLIPPED - 1 );
+            }
+
+            triIndex += numLanes;
+            inTrisPtr += numLanes * 3;
+
+            triMask = ( 1U << ( clippedTris + numLanes ) ) - 1;
+            triClipMask = ( 1U << numLanes ) - 1; // Don't re-clip already clipped triangles
+        }
+        else
+        {
+            numLanes = min( SIMD_LANES, nTris - triIndex );
+            triMask = ( 1U << numLanes ) - 1;
+            triClipMask = triMask;
+
+            if( FAST_GATHER )
+                GatherVerticesFast( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes );
+            else
+                GatherVertices( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout );
+
+            TransformVerts( vtxX, vtxY, vtxW, modelToClipMatrix );
+
+            triIndex += SIMD_LANES;
+            inTrisPtr += SIMD_LANES * 3;
+        }
+
+        //////////////////////////////////////////////////////////////////////////////
+        // Clip transformed triangles
+        //////////////////////////////////////////////////////////////////////////////
+
+        if( clipPlaneMask != ClipPlanes::CLIP_PLANE_NONE )
+            ClipTriangleAndAddToBuffer( vtxX, vtxY, vtxW, clipTriBuffer, clipHead, triMask, triClipMask, clipPlaneMask );
+    }
+
+	void RenderTrilist(const TriList &triList, const ScissorRect *scissor) override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+
+		// Setup fullscreen scissor rect as default
+		scissor = scissor == nullptr ? &mFullscreenScissor : scissor;
+
+		for (unsigned int i = 0; i < triList.mTriIdx; i += SIMD_LANES)
+		{
+			//////////////////////////////////////////////////////////////////////////////
+			// Fetch triangle vertices
+			//////////////////////////////////////////////////////////////////////////////
+
+			unsigned int numLanes = min((unsigned int)SIMD_LANES, triList.mTriIdx - i);
+			unsigned int triMask = (1U << numLanes) - 1;
+
+			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
+#if PRECISE_COVERAGE != 0
+			__mwi ipVtxX[3], ipVtxY[3];
+			for (unsigned int l = 0; l < numLanes; ++l)
+			{
+				unsigned int triIdx = i + l;
+				for (int v = 0; v < 3; ++v)
+				{
+					simd_i32(ipVtxX[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 0];
+					simd_i32(ipVtxY[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 1];
+					simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2];
+				}
+			}
+
+			for (int v = 0; v < 3; ++v)
+			{
+				pVtxX[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[v]), _mmw_set1_ps(FP_INV));
+				pVtxY[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[v]), _mmw_set1_ps(FP_INV));
+			}
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Setup and rasterize a SIMD batch of triangles
+			//////////////////////////////////////////////////////////////////////////////
+
+			RasterizeTriangleBatch<false>(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, scissor);
+#else
+			for (unsigned int l = 0; l < numLanes; ++l)
+			{
+				unsigned int triIdx = i + l;
+				for (int v = 0; v < 3; ++v)
+				{
+					simd_f32(pVtxX[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 0];
+					simd_f32(pVtxY[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 1];
+					simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2];
+				}
+			}
+
+			//////////////////////////////////////////////////////////////////////////////
+			// Setup and rasterize a SIMD batch of triangles
+			//////////////////////////////////////////////////////////////////////////////
+
+			RasterizeTriangleBatch<false>(pVtxX, pVtxY, pVtxZ, triMask, scissor);
+#endif
+
+		}
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Debugging and statistics
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	MaskedOcclusionCulling::Implementation GetImplementation() override
+	{
+		return gInstructionSet;
+	}
+
+	void ComputePixelDepthBuffer(float *depthData, bool flipY) override
+	{
+		assert(mMaskedHiZBuffer != nullptr);
+		for (int y = 0; y < mHeight; y++)
+		{
+			for (int x = 0; x < mWidth; x++)
+			{
+				// Compute 32xN tile index (SIMD value offset)
+				int tx = x / TILE_WIDTH;
+				int ty = y / TILE_HEIGHT;
+				int tileIdx = ty * mTilesWidth + tx;
+
+				// Compute 8x4 subtile index (SIMD lane offset)
+				int stx = (x % TILE_WIDTH) / SUB_TILE_WIDTH;
+				int sty = (y % TILE_HEIGHT) / SUB_TILE_HEIGHT;
+				int subTileIdx = sty * 4 + stx;
+
+				// Compute pixel index in subtile (bit index in 32-bit word)
+				int px = (x % SUB_TILE_WIDTH);
+				int py = (y % SUB_TILE_HEIGHT);
+				int bitIdx = py * 8 + px;
+
+				int pixelLayer = (simd_i32(mMaskedHiZBuffer[tileIdx].mMask)[subTileIdx] >> bitIdx) & 1;
+				float pixelDepth = simd_f32(mMaskedHiZBuffer[tileIdx].mZMin[pixelLayer])[subTileIdx];
+
+                if( flipY )
+                    depthData[( mHeight - y - 1 ) * mWidth + x] = pixelDepth;
+                else
+                    depthData[y * mWidth + x] = pixelDepth;
+			}
+		}
+	}
+
+	OcclusionCullingStatistics GetStatistics() override
+	{
+		return mStats;
+	}
+
+};

+ 6 - 0
Gems/Atom/RPI/Code/External/MaskedOcclusionCulling/PackageInfo.json

@@ -0,0 +1,6 @@
+{
+    "PackageName": "Masked Occlusion Culling",
+    "URL": "https://software.intel.com/content/www/us/en/develop/articles/masked-software-occlusion-culling.html",
+    "License": "Apache 2.0",
+    "LicenseFile": "LICENSE.txt"
+}

+ 19 - 5
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/Culling.h

@@ -31,7 +31,6 @@
 #include <AzFramework/Visibility/IVisibilitySystem.h>
 
 #include <Atom/RPI.Public/View.h>
-
 #include <Atom/RHI/DrawList.h>
 
 #include <AtomCore/std/parallel/concurrency_checker.h>
@@ -97,6 +96,9 @@ namespace AZ
             };
             LodData m_lodData;
 
+            //! Flag indicating if the object is visible, i.e., was not culled out in the last frame
+            bool m_isVisible = true;
+
             void SetDebugName([[maybe_unused]] const AZ::Name& debugName)
             {
 #ifdef AZ_CULL_DEBUG_ENABLED
@@ -213,6 +215,21 @@ namespace AZ
             void Activate(const class Scene* parentScene);
             void Deactivate();
 
+            struct OcclusionPlane
+            {
+                // World space corners of the occlusion plane
+                Vector3 m_cornerBL;
+                Vector3 m_cornerTL;
+                Vector3 m_cornerTR;
+                Vector3 m_cornerBR;
+
+                Aabb m_aabb;
+            };
+            using OcclusionPlaneVector = AZStd::vector<OcclusionPlane>;
+
+            //! Sets a list of occlusion planes to be used during the culling process.
+            void SetOcclusionPlanes(const OcclusionPlaneVector& occlusionPlanes) { m_occlusionPlanes = occlusionPlanes; }
+
             //! Notifies the CullingScene that culling will begin for this frame.
             void BeginCulling(const AZStd::vector<ViewPtr>& views);
 
@@ -251,12 +268,9 @@ namespace AZ
 
             const Scene* m_parentScene = nullptr;
             AzFramework::IVisibilityScene* m_visScene = nullptr;
-
             CullingDebugContext m_debugCtx;
-
             AZStd::concurrency_checker m_cullDataConcurrencyCheck;
-
-            AZStd::mutex m_mutex;
+            OcclusionPlaneVector m_occlusionPlanes;
         };
         
 

+ 12 - 1
Gems/Atom/RPI/Code/Include/Atom/RPI.Public/View.h

@@ -24,6 +24,8 @@
 #include <AzCore/std/containers/vector.h>
 #include <AzCore/Name/Name.h>
 
+class MaskedOcclusionCulling;
+
 namespace AZ
 {
     namespace  RHI
@@ -57,7 +59,7 @@ namespace AZ
             //! Only use this function to create a new view object. And force using smart pointer to manage view's life time
             static ViewPtr CreateView(const AZ::Name& name, UsageFlags usage);
 
-            ~View() = default;
+            ~View();
 
             void SetDrawListMask(const RHI::DrawListMask& drawListMask);
             RHI::DrawListMask GetDrawListMask() const { return m_drawListMask; }
@@ -126,6 +128,12 @@ namespace AZ
             //! Notifies consumers when the world to clip matrix has changed.
             void ConnectWorldToClipMatrixChangedHandler(MatrixChangedEvent::Handler& handler);
 
+            //! Prepare for view culling
+            void BeginCulling();
+
+            //! Returns the masked occlusion culling interface
+            MaskedOcclusionCulling* GetMaskedOcclusionCulling();
+
         private:
             View() = delete;
             View(const AZ::Name& name, UsageFlags usage);
@@ -193,6 +201,9 @@ namespace AZ
 
             MatrixChangedEvent m_onWorldToClipMatrixChange;
             MatrixChangedEvent m_onWorldToViewMatrixChange;
+
+            // Masked Occlusion Culling interface
+            MaskedOcclusionCulling* m_maskedOcclusionCulling = nullptr;
         };
 
         AZ_DEFINE_ENUM_BITWISE_OPERATORS(View::UsageFlags);

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Android.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Android.h"

+ 13 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/PAL_android.cmake

@@ -0,0 +1,13 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED FALSE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Android/platform_android_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Android.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Linux.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Linux.h"

+ 1 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/PAL_linux.cmake

@@ -10,3 +10,4 @@
 #
 
 set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED FALSE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Linux/platform_linux_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Linux.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Mac.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Mac.h"

+ 1 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/PAL_mac.cmake

@@ -10,3 +10,4 @@
 #
 
 set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED TRUE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Mac/platform_mac_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Mac.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_Windows.h"

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/Atom_RPI_Traits_Windows.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 1

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/PAL_windows.cmake

@@ -10,3 +10,17 @@
 #
 
 set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED TRUE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED TRUE)
+
+ly_add_source_properties(
+    SOURCES External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp
+    PROPERTY COMPILE_OPTIONS
+    VALUES /arch:AVX2 /W3
+)
+ly_add_source_properties(
+    SOURCES 
+        External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp
+        External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp
+    PROPERTY COMPILE_OPTIONS
+    VALUES /W3
+)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/Windows/platform_windows_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_Windows.h
+)

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_Platform.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#include "Atom_RPI_Traits_iOS.h"

+ 14 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/Atom_RPI_Traits_iOS.h

@@ -0,0 +1,14 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+#pragma once
+
+#define AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED 0

+ 13 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/PAL_ios.cmake

@@ -0,0 +1,13 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set (PAL_TRAIT_BUILD_ATOM_RPI_ASSETS_SUPPORTED FALSE)
+set (PAL_TRAIT_BUILD_ATOM_RPI_MASKED_OCCLUSION_CULLING_SUPPORTED FALSE)

+ 15 - 0
Gems/Atom/RPI/Code/Source/Platform/iOS/platform_ios_files.cmake

@@ -0,0 +1,15 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    Atom_RPI_Traits_Platform.h
+    Atom_RPI_Traits_iOS.h
+)

+ 203 - 41
Gems/Atom/RPI/Code/Source/RPI.Public/Culling.cpp

@@ -20,15 +20,20 @@
 
 #include <Atom/RHI/CpuProfiler.h>
 
+#include <AzCore/Math/MatrixUtils.h>
 #include <AzCore/Math/ShapeIntersection.h>
 #include <AzCore/Casting/numeric_cast.h>
-
 #include <AzCore/std/parallel/lock.h>
 #include <AzCore/Casting/numeric_cast.h>
 #include <AzCore/Debug/EventTrace.h>
 #include <AzCore/Debug/Timer.h>
 #include <AzCore/Jobs/JobFunction.h>
 #include <AzCore/Jobs/Job.h>
+#include <Atom_RPI_Traits_Platform.h>
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+#include <MaskedOcclusionCulling/MaskedOcclusionCulling.h>
+#endif
 
 //Enables more inner-loop profiling scopes (can create high overhead in RadTelemetry if there are many-many objects in a scene)
 //#define AZ_CULL_PROFILE_DETAILED
@@ -272,21 +277,26 @@ namespace AZ
         public:
             AZ_CLASS_ALLOCATOR(AddObjectsToViewJob, ThreadPoolAllocator, 0);
 
+            struct JobData
+            {
+                CullingDebugContext* m_debugCtx = nullptr;
+                const Scene* m_scene = nullptr;
+                View* m_view = nullptr;
+                Frustum m_frustum;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                MaskedOcclusionCulling* m_maskedOcclusionCulling = nullptr;
+#endif
+            };
+
         private:
-            CullingDebugContext* m_debugCtx;
-            const Scene* m_scene;
-            View* m_view;
-            Frustum m_frustum;
+            const AZStd::shared_ptr<JobData> m_jobData;
             CullingScene::WorkListType m_worklist;
 
         public:
-            AddObjectsToViewJob(CullingDebugContext& debugCtx, const Scene& scene, View& view, Frustum& frustum, CullingScene::WorkListType& worklist)
+            AddObjectsToViewJob(const AZStd::shared_ptr<AddObjectsToViewJob::JobData>& jobData, CullingScene::WorkListType& worklist)
                 : Job(true, nullptr)        //auto-deletes, no JobContext
-                , m_debugCtx(&debugCtx)
-                , m_scene(&scene)
-                , m_view(&view)
-                , m_frustum(frustum)                 //capture by value
-                , m_worklist(AZStd::move(worklist))  //capture by value
+                , m_jobData(jobData)
+                , m_worklist(worklist)
             {
             }
 
@@ -295,37 +305,50 @@ namespace AZ
             {
                 AZ_PROFILE_FUNCTION(Debug::ProfileCategory::AzRender);
 
-                const View::UsageFlags viewFlags = m_view->GetUsageFlags();
-                const RHI::DrawListMask drawListMask = m_view->GetDrawListMask();
+                const View::UsageFlags viewFlags = m_jobData->m_view->GetUsageFlags();
+                const RHI::DrawListMask drawListMask = m_jobData->m_view->GetDrawListMask();
                 uint32_t numDrawPackets = 0;
                 uint32_t numVisibleCullables = 0;
 
                 for (const AzFramework::IVisibilityScene::NodeData& nodeData : m_worklist)
                 {
                     //If a node is entirely contained within the frustum, then we can skip the fine grained culling.
-                    bool nodeIsContainedInFrustum = ShapeIntersection::Contains(m_frustum, nodeData.m_bounds);
+                    bool nodeIsContainedInFrustum = ShapeIntersection::Contains(m_jobData->m_frustum, nodeData.m_bounds);
 
 #ifdef AZ_CULL_PROFILE_VERBOSE
                     AZ_PROFILE_SCOPE_DYNAMIC(Debug::ProfileCategory::AzRender, "process node (view: %s, skip fine cull: %d",
                         m_view->GetName().GetCStr(), nodeIsContainedInFrustum ? 1 : 0);
 #endif
 
-                    if (nodeIsContainedInFrustum || !m_debugCtx->m_enableFrustumCulling)
+                    if (nodeIsContainedInFrustum || !m_jobData->m_debugCtx->m_enableFrustumCulling)
                     {
                         //Add all objects within this node to the view, without any extra culling
                         for (AzFramework::VisibilityEntry* visibleEntry : nodeData.m_entries)
                         {
-                            if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                             {
-                                Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
-                                if ((c->m_cullData.m_drawListMask & drawListMask).none() ||
-                                    c->m_cullData.m_hideFlags & viewFlags ||
-                                    c->m_cullData.m_scene != m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
+                                if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                                 {
-                                    continue;
+                                    Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
+
+                                    // reset visibility flag to false, update to true if all culling checks pass
+                                    c->m_isVisible = false;
+
+                                    if ((c->m_cullData.m_drawListMask & drawListMask).none() ||
+                                        c->m_cullData.m_hideFlags & viewFlags ||
+                                        c->m_cullData.m_scene != m_jobData->m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
+                                    {
+                                        continue;
+                                    }
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                                    if (TestOcclusionCulling(visibleEntry) == MaskedOcclusionCulling::CullingResult::VISIBLE)
+#endif
+                                    {
+                                        numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_jobData->m_view);
+                                        ++numVisibleCullables;
+                                        c->m_isVisible = true;
+                                    }
                                 }
-                                numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_view);
-                                ++numVisibleCullables;
                             }
                         }
                     }
@@ -337,68 +360,78 @@ namespace AZ
                             if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                             {
                                 Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
+
+                                // reset visibility flag to false, update to true if all culling checks pass
+                                c->m_isVisible = false;
+
                                 if ((c->m_cullData.m_drawListMask & drawListMask).none() ||
                                     c->m_cullData.m_hideFlags & viewFlags ||
-                                    c->m_cullData.m_scene != m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
+                                    c->m_cullData.m_scene != m_jobData->m_scene)       //[GFX_TODO][ATOM-13796] once the IVisibilitySystem supports multiple octree scenes, remove this
                                 {
                                     continue;
                                 }
 
-                                IntersectResult res = ShapeIntersection::Classify(m_frustum, c->m_cullData.m_boundingSphere);
+                                IntersectResult res = ShapeIntersection::Classify(m_jobData->m_frustum, c->m_cullData.m_boundingSphere);
                                 if (res == IntersectResult::Exterior)
                                 {
                                     continue;
                                 }
-                                else if (res == IntersectResult::Interior || ShapeIntersection::Overlaps(m_frustum, c->m_cullData.m_boundingObb))
+                                else if (res == IntersectResult::Interior || ShapeIntersection::Overlaps(m_jobData->m_frustum, c->m_cullData.m_boundingObb))
                                 {
-                                    numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_view);
-                                    ++numVisibleCullables;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                                    if (TestOcclusionCulling(visibleEntry) == MaskedOcclusionCulling::CullingResult::VISIBLE)
+#endif
+                                    {
+                                        numDrawPackets += AddLodDataToView(c->m_cullData.m_boundingSphere.GetCenter(), c->m_lodData, *m_jobData->m_view);
+                                        ++numVisibleCullables;
+                                        c->m_isVisible = true;
+                                    }
                                 }
                             }
                         }
                     }
 
-                    if (m_debugCtx->m_debugDraw && (m_view->GetName() == m_debugCtx->m_currentViewSelectionName))
+                    if (m_jobData->m_debugCtx->m_debugDraw && (m_jobData->m_view->GetName() == m_jobData->m_debugCtx->m_currentViewSelectionName))
                     {
                         AZ_PROFILE_SCOPE(Debug::ProfileCategory::AzRender, "debug draw culling");
 
-                        AuxGeomDrawPtr auxGeomPtr = AuxGeomFeatureProcessorInterface::GetDrawQueueForScene(m_scene);
+                        AuxGeomDrawPtr auxGeomPtr = AuxGeomFeatureProcessorInterface::GetDrawQueueForScene(m_jobData->m_scene);
                         if (auxGeomPtr)
                         {
                             //Draw the node bounds
                             // "Fully visible" nodes are nodes that are fully inside the frustum. "Partially visible" nodes intersect the edges of the frustum.
                             // Since the nodes of an octree have lots of overlapping boxes with coplanar edges, it's easier to view these separately, so
                             // we have a few debug booleans to toggle which ones to draw.
-                            if (nodeIsContainedInFrustum && m_debugCtx->m_drawFullyVisibleNodes)
+                            if (nodeIsContainedInFrustum && m_jobData->m_debugCtx->m_drawFullyVisibleNodes)
                             {
                                 auxGeomPtr->DrawAabb(nodeData.m_bounds, Colors::Lime, RPI::AuxGeomDraw::DrawStyle::Line, RPI::AuxGeomDraw::DepthTest::Off);
                             }
-                            else if (!nodeIsContainedInFrustum && m_debugCtx->m_drawPartiallyVisibleNodes)
+                            else if (!nodeIsContainedInFrustum && m_jobData->m_debugCtx->m_drawPartiallyVisibleNodes)
                             {
                                 auxGeomPtr->DrawAabb(nodeData.m_bounds, Colors::Yellow, RPI::AuxGeomDraw::DrawStyle::Line, RPI::AuxGeomDraw::DepthTest::Off);
                             }
 
                             //Draw bounds on individual objects
-                            if (m_debugCtx->m_drawBoundingBoxes || m_debugCtx->m_drawBoundingSpheres || m_debugCtx->m_drawLodRadii)
+                            if (m_jobData->m_debugCtx->m_drawBoundingBoxes || m_jobData->m_debugCtx->m_drawBoundingSpheres || m_jobData->m_debugCtx->m_drawLodRadii)
                             {
                                 for (AzFramework::VisibilityEntry* visibleEntry : nodeData.m_entries)
                                 {
                                     if (visibleEntry->m_typeFlags & AzFramework::VisibilityEntry::TYPE_RPI_Cullable)
                                     {
                                         Cullable* c = static_cast<Cullable*>(visibleEntry->m_userData);
-                                        if (m_debugCtx->m_drawBoundingBoxes)
+                                        if (m_jobData->m_debugCtx->m_drawBoundingBoxes)
                                         {
                                             auxGeomPtr->DrawObb(c->m_cullData.m_boundingObb, Matrix3x4::Identity(),
                                                 nodeIsContainedInFrustum ? Colors::Lime : Colors::Yellow, AuxGeomDraw::DrawStyle::Line);
                                         }
 
-                                        if (m_debugCtx->m_drawBoundingSpheres)
+                                        if (m_jobData->m_debugCtx->m_drawBoundingSpheres)
                                         {
                                             auxGeomPtr->DrawSphere(c->m_cullData.m_boundingSphere.GetCenter(), c->m_cullData.m_boundingSphere.GetRadius(),
                                                 Color(0.5f, 0.5f, 0.5f, 0.3f), AuxGeomDraw::DrawStyle::Shaded);
                                         }
 
-                                        if (m_debugCtx->m_drawLodRadii)
+                                        if (m_jobData->m_debugCtx->m_drawLodRadii)
                                         {
                                             auxGeomPtr->DrawSphere(c->m_cullData.m_boundingSphere.GetCenter(),
                                                 c->m_lodData.m_lodSelectionRadius,
@@ -411,9 +444,9 @@ namespace AZ
                     }
                 }
 
-                if (m_debugCtx->m_enableStats)
+                if (m_jobData->m_debugCtx->m_enableStats)
                 {
-                    CullingDebugContext::CullStats& cullStats = m_debugCtx->GetCullStatsForView(m_view);
+                    CullingDebugContext::CullStats& cullStats = m_jobData->m_debugCtx->GetCullStatsForView(m_jobData->m_view);
 
                     //no need for mutex here since these are all atomics
                     cullStats.m_numVisibleDrawPackets += numDrawPackets;
@@ -421,6 +454,63 @@ namespace AZ
                     ++cullStats.m_numJobs;
                 }
             }
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            MaskedOcclusionCulling::CullingResult TestOcclusionCulling(AzFramework::VisibilityEntry* visibleEntry)
+            {
+                if (!m_jobData->m_maskedOcclusionCulling)
+                {
+                    return MaskedOcclusionCulling::CullingResult::VISIBLE;
+                }
+
+                if (visibleEntry->m_boundingVolume.Contains(m_jobData->m_view->GetCameraTransform().GetTranslation()))
+                {
+                    // camera is inside bounding volume
+                    return MaskedOcclusionCulling::CullingResult::VISIBLE;
+                }
+
+                const Vector3& minBound = visibleEntry->m_boundingVolume.GetMin();
+                const Vector3& maxBound = visibleEntry->m_boundingVolume.GetMax();
+
+                // compute bounding volume corners
+                Vector4 corners[8];
+                corners[0] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), minBound.GetY(), minBound.GetZ(), 1.0f);
+                corners[1] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), minBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[2] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), minBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[3] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), minBound.GetY(), minBound.GetZ(), 1.0f);
+                corners[4] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), maxBound.GetY(), minBound.GetZ(), 1.0f);
+                corners[5] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(minBound.GetX(), maxBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[6] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), maxBound.GetY(), maxBound.GetZ(), 1.0f);
+                corners[7] = m_jobData->m_view->GetWorldToClipMatrix() * Vector4(maxBound.GetX(), maxBound.GetY(), minBound.GetZ(), 1.0f);
+
+                // find min clip-space depth and NDC min/max
+                float minDepth = FLT_MAX;
+                float ndcMinX = FLT_MAX;
+                float ndcMinY = FLT_MAX;
+                float ndcMaxX = -FLT_MAX;
+                float ndcMaxY = -FLT_MAX;
+                for (uint32_t index = 0; index < 8; ++index)
+                {
+                    minDepth = AZStd::min(minDepth, corners[index].GetW());
+
+                    // convert to NDC
+                    corners[index] /= corners[index].GetW();
+
+                    ndcMinX = AZStd::min(ndcMinX, corners[index].GetX());
+                    ndcMinY = AZStd::min(ndcMinY, corners[index].GetY());
+                    ndcMaxX = AZStd::max(ndcMaxX, corners[index].GetX());
+                    ndcMaxY = AZStd::max(ndcMaxY, corners[index].GetY());
+                }
+
+                if (minDepth < 0.00000001f)
+                {
+                    return MaskedOcclusionCulling::VISIBLE;
+                }
+
+                // test against the occlusion buffer, which contains only the manually placed occlusion planes
+                return m_jobData->m_maskedOcclusionCulling->TestRect(ndcMinX, ndcMinY, ndcMaxX, ndcMaxY, minDepth);
+            }
+#endif
         };
 
         void CullingScene::ProcessCullables(const Scene& scene, View& view, AZ::Job& parentJob)
@@ -454,8 +544,67 @@ namespace AZ
                 cullStats.m_cameraViewToWorld = view.GetViewToWorldMatrix();
             }
 
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            // setup occlusion culling, if necessary
+            MaskedOcclusionCulling* maskedOcclusionCulling = m_occlusionPlanes.empty() ? nullptr : view.GetMaskedOcclusionCulling();
+            if (maskedOcclusionCulling)
+            {
+                // frustum cull occlusion planes
+                using VisibleOcclusionPlane = AZStd::pair<OcclusionPlane, float>;
+                AZStd::vector<VisibleOcclusionPlane> visibleOccluders;
+                for (const auto& occlusionPlane : m_occlusionPlanes)
+                {
+                    if (ShapeIntersection::Overlaps(frustum, occlusionPlane.m_aabb))
+                    {
+                        // occluder is visible, compute view space distance and add to list
+                        float depth = (view.GetWorldToViewMatrix() * occlusionPlane.m_aabb.GetMin()).GetZ();
+                        depth = AZStd::min(depth, (view.GetWorldToViewMatrix() * occlusionPlane.m_aabb.GetMax()).GetZ());
+
+                        visibleOccluders.push_back(AZStd::make_pair(occlusionPlane, depth));
+                    }
+                }
+
+                // sort the occlusion planes by view space distance, front-to-back
+                AZStd::sort(visibleOccluders.begin(), visibleOccluders.end(), [](const VisibleOcclusionPlane& LHS, const VisibleOcclusionPlane& RHS)
+                {
+                    return LHS.second > RHS.second;
+                });
+
+                for (const VisibleOcclusionPlane& occlusionPlane: visibleOccluders)
+                {
+                    // convert to clip-space
+                    Vector4 projectedBL = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerBL);
+                    Vector4 projectedTL = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerTL);
+                    Vector4 projectedTR = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerTR);
+                    Vector4 projectedBR = view.GetWorldToClipMatrix() * Vector4(occlusionPlane.first.m_cornerBR);
+
+                    // store to float array
+                    float verts[16];
+                    projectedBL.StoreToFloat4(&verts[0]);
+                    projectedTL.StoreToFloat4(&verts[4]);
+                    projectedTR.StoreToFloat4(&verts[8]);
+                    projectedBR.StoreToFloat4(&verts[12]);
+
+                    static uint32_t indices[6] = { 0, 1, 2, 2, 3, 0 };
+
+                    // render into the occlusion buffer, specifying BACKFACE_NONE so it functions as a double-sided occluder
+                    maskedOcclusionCulling->RenderTriangles((float*)verts, indices, 2, nullptr, MaskedOcclusionCulling::BACKFACE_NONE);
+                }
+            }
+#endif
+
             WorkListType worklist;
-            auto nodeVisitorLambda = [this, &scene, &view, &parentJob, &frustum, &worklist](const AzFramework::IVisibilityScene::NodeData& nodeData) -> void
+
+            AZStd::shared_ptr<AddObjectsToViewJob::JobData> jobData = AZStd::make_shared<AddObjectsToViewJob::JobData>();
+            jobData->m_debugCtx = &m_debugCtx;
+            jobData->m_scene = &scene;
+            jobData->m_view = &view;
+            jobData->m_frustum = frustum;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            jobData->m_maskedOcclusionCulling = maskedOcclusionCulling;
+#endif
+
+            auto nodeVisitorLambda = [this, jobData, &parentJob, &frustum, &worklist](const AzFramework::IVisibilityScene::NodeData& nodeData) -> void
             {
                 AZ_PROFILE_SCOPE(Debug::ProfileCategory::AzRender, "nodeVisitorLambda()");
                 AZ_Assert(nodeData.m_entries.size() > 0, "should not get called with 0 entries");
@@ -468,7 +617,7 @@ namespace AZ
                 if (worklist.size() == worklist.capacity())
                 {
                     //Kick off a job to process the (full) worklist
-                    AddObjectsToViewJob* job = aznew AddObjectsToViewJob(m_debugCtx, scene, view, frustum, worklist); //pool allocated (cheap), auto-deletes when job finishes
+                    AddObjectsToViewJob* job = aznew AddObjectsToViewJob(jobData, worklist); //pool allocated (cheap), auto-deletes when job finishes
                     worklist.clear();
                     parentJob.SetContinuation(job);
                     job->Start();
@@ -486,8 +635,16 @@ namespace AZ
 
             if (worklist.size() > 0)
             {
+                AZStd::shared_ptr<AddObjectsToViewJob::JobData> remainingJobData = AZStd::make_shared<AddObjectsToViewJob::JobData>();
+                remainingJobData->m_debugCtx = &m_debugCtx;
+                remainingJobData->m_scene = &scene;
+                remainingJobData->m_view = &view;
+                remainingJobData->m_frustum = frustum;
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+                remainingJobData->m_maskedOcclusionCulling = maskedOcclusionCulling;
+#endif
                 //Kick off a job to process any remaining workitems
-                AddObjectsToViewJob* job = aznew AddObjectsToViewJob(m_debugCtx, scene, view, frustum, worklist); //pool allocated (cheap), auto-deletes when job finishes
+                AddObjectsToViewJob* job = aznew AddObjectsToViewJob(remainingJobData, worklist); //pool allocated (cheap), auto-deletes when job finishes
                 parentJob.SetContinuation(job);
                 job->Start();
             }
@@ -576,6 +733,11 @@ namespace AZ
             m_debugCtx.ResetCullStats();
             m_debugCtx.m_numCullablesInScene = GetNumCullables();
 
+            for (auto& view : views)
+            {
+                view->BeginCulling();
+            }
+
             AuxGeomDrawPtr auxGeom;
             if (m_debugCtx.m_debugDraw)
             {

+ 38 - 1
Gems/Atom/RPI/Code/Source/RPI.Public/View.cpp

@@ -15,18 +15,28 @@
 #include <Atom/RPI.Public/RPISystemInterface.h>
 #include <Atom/RPI.Public/Shader/ShaderResourceGroup.h>
 #include <Atom/RPI.Public/Culling.h>
-
+#include <Atom/RPI.Public/RenderPipeline.h>
+#include <Atom/RPI.Public/Pass/Specific/SwapChainPass.h>
 #include <Atom/RHI/DrawListTagRegistry.h>
 
 #include <AzCore/Casting/lossy_cast.h>
 #include <AzCore/Component/ComponentApplicationBus.h>
 #include <AzCore/Math/MatrixUtils.h>
 #include <AzCore/Serialization/SerializeContext.h>
+#include <Atom_RPI_Traits_Platform.h>
+
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+#include <MaskedOcclusionCulling/MaskedOcclusionCulling.h>
+#endif
 
 namespace AZ
 {
     namespace RPI
     {
+        // fixed-size software occlusion culling buffer
+        const uint32_t MaskedSoftwareOcclusionCullingWidth = 1920;
+        const uint32_t MaskedSoftwareOcclusionCullingHeight = 1080;
+
         ViewPtr View::CreateView(const AZ::Name& name, UsageFlags usage)
         {
             View* view = aznew View(name, usage);
@@ -51,6 +61,21 @@ namespace AZ
             {
                 m_shaderResourceGroup = ShaderResourceGroup::Create(viewSrgAsset);
             }
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            m_maskedOcclusionCulling = MaskedOcclusionCulling::Create();
+            m_maskedOcclusionCulling->SetResolution(MaskedSoftwareOcclusionCullingWidth, MaskedSoftwareOcclusionCullingHeight);
+#endif
+        }
+
+        View::~View()
+        {
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            if (m_maskedOcclusionCulling)
+            {
+                MaskedOcclusionCulling::Destroy(m_maskedOcclusionCulling);
+                m_maskedOcclusionCulling = nullptr;
+            }
+#endif
         }
 
         void View::SetDrawListMask(const RHI::DrawListMask& drawListMask)
@@ -374,5 +399,17 @@ namespace AZ
             m_shaderResourceGroup->Compile();
             m_needBuildSrg = false;
         }
+
+        void View::BeginCulling()
+        {
+#if AZ_TRAIT_MASKED_OCCLUSION_CULLING_SUPPORTED
+            m_maskedOcclusionCulling->ClearBuffer();
+#endif
+        }
+
+        MaskedOcclusionCulling* View::GetMaskedOcclusionCulling()
+        {
+            return m_maskedOcclusionCulling;
+        }
     } // namespace RPI
 } // namespace AZ

+ 18 - 0
Gems/Atom/RPI/Code/atom_rpi_masked_occlusion_files.cmake

@@ -0,0 +1,18 @@
+#
+# All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+# its licensors.
+#
+# For complete copyright and license terms please see the LICENSE at the root of this
+# distribution (the "License"). All use of this software is governed by the License,
+# or, if provided, by the license below or the license accompanying this file. Do not
+# remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+
+set(FILES
+    External/MaskedOcclusionCulling/MaskedOcclusionCulling.h
+    External/MaskedOcclusionCulling/MaskedOcclusionCullingCommon.inl
+    External/MaskedOcclusionCulling/MaskedOcclusionCulling.cpp
+    External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX2.cpp
+    External/MaskedOcclusionCulling/MaskedOcclusionCullingAVX512.cpp
+)

+ 1 - 1
Gems/Atom/RPI/Code/atom_rpi_public_files.cmake

@@ -180,4 +180,4 @@ set(FILES
     Source/RPI.Public/GpuQuery/Query.cpp
     Source/RPI.Public/GpuQuery/QueryPool.cpp
     Source/RPI.Public/GpuQuery/TimestampQueryPool.cpp
-)
+)

+ 4 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/Module.cpp

@@ -25,6 +25,7 @@
 #include <Material/MaterialComponent.h>
 #include <Mesh/MeshComponent.h>
 #include <ReflectionProbe/ReflectionProbeComponent.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponent.h>
 #include <PostProcess/PostFxLayerComponent.h>
 #include <PostProcess/Bloom/BloomComponent.h>
 #include <PostProcess/DepthOfField/DepthOfFieldComponent.h>
@@ -57,6 +58,7 @@
 #include <Mesh/EditorMeshComponent.h>
 #include <Mesh/EditorMeshSystemComponent.h>
 #include <ReflectionProbe/EditorReflectionProbeComponent.h>
+#include <OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h>
 #include <PostProcess/EditorPostFxLayerComponent.h>
 #include <PostProcess/Bloom/EditorBloomComponent.h>
 #include <PostProcess/DepthOfField/EditorDepthOfFieldComponent.h>
@@ -117,6 +119,7 @@ namespace AZ
                         DeferredFogComponent::CreateDescriptor(),
                         SurfaceData::SurfaceDataMeshComponent::CreateDescriptor(),
                         AttachmentComponent::CreateDescriptor(),
+                        OcclusionCullingPlaneComponent::CreateDescriptor(),
 
 #ifdef ATOMLYINTEGRATION_FEATURE_COMMON_EDITOR
                         EditorAreaLightComponent::CreateDescriptor(),
@@ -149,6 +152,7 @@ namespace AZ
                         EditorDeferredFogComponent::CreateDescriptor(),
                         SurfaceData::EditorSurfaceDataMeshComponent::CreateDescriptor(),
                         EditorAttachmentComponent::CreateDescriptor(),
+                        EditorOcclusionCullingPlaneComponent::CreateDescriptor(),
 #endif
                     });
             }

+ 95 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.cpp

@@ -0,0 +1,95 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h>
+#include <AzFramework/StringFunc/StringFunc.h>
+#include <AzToolsFramework/API/ToolsApplicationAPI.h>
+#include <AzToolsFramework/Entity/EditorEntityInfoBus.h>
+#include <AzToolsFramework/API/EditorAssetSystemAPI.h>
+#include <AzCore/Component/Entity.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        void EditorOcclusionCullingPlaneComponent::Reflect(AZ::ReflectContext* context)
+        {
+            BaseClass::Reflect(context);
+
+            if (AZ::SerializeContext* serializeContext = azrtti_cast<AZ::SerializeContext*>(context))
+            {
+                serializeContext->Class<EditorOcclusionCullingPlaneComponent, BaseClass>()
+                    ->Version(1, ConvertToEditorRenderComponentAdapter<1>)
+                ;
+
+                if (AZ::EditContext* editContext = serializeContext->GetEditContext())
+                {
+                    editContext->Class<EditorOcclusionCullingPlaneComponent>(
+                        "Occlusion Culling Plane", "The OcclusionCullingPlane component is used to cull meshes that are inside the view frustum and behind the occlusion plane")
+                        ->ClassElement(AZ::Edit::ClassElements::EditorData, "")
+                            ->Attribute(AZ::Edit::Attributes::Category, "Atom")
+                            ->Attribute(AZ::Edit::Attributes::Icon, "Icons/Components/Component_Placeholder.svg")
+                            ->Attribute(AZ::Edit::Attributes::ViewportIcon, "Icons/Components/Viewport/Component_Placeholder.png")
+                            ->Attribute(AZ::Edit::Attributes::AppearsInAddComponentMenu, AZ_CRC("Game", 0x232b318c))
+                            ->Attribute(AZ::Edit::Attributes::AutoExpand, true)
+                        ;
+
+                    editContext->Class<OcclusionCullingPlaneComponentController>(
+                        "OcclusionCullingPlaneComponentController", "")
+                        ->ClassElement(AZ::Edit::ClassElements::EditorData, "")
+                            ->Attribute(AZ::Edit::Attributes::AutoExpand, true)
+                        ->DataElement(AZ::Edit::UIHandlers::Default, &OcclusionCullingPlaneComponentController::m_configuration, "Configuration", "")
+                            ->Attribute(AZ::Edit::Attributes::Visibility, AZ::Edit::PropertyVisibility::ShowChildrenOnly)
+                        ;
+
+                    editContext->Class<OcclusionCullingPlaneComponentConfig>(
+                        "OcclusionCullingPlaneComponentConfig", "")
+                        ->ClassElement(AZ::Edit::ClassElements::Group, "Settings")
+                            ->Attribute(AZ::Edit::Attributes::AutoExpand, true)
+                        ->DataElement(AZ::Edit::UIHandlers::CheckBox, &OcclusionCullingPlaneComponentConfig::m_showVisualization, "Show Visualization", "Show the occlusion culling plane visualization")
+                            ->Attribute(AZ::Edit::Attributes::ChangeNotify, Edit::PropertyRefreshLevels::ValuesOnly)
+                        ->DataElement(AZ::Edit::UIHandlers::CheckBox, &OcclusionCullingPlaneComponentConfig::m_transparentVisualization, "Transparent Visualization", "Sets the occlusion culling plane visualization as transparent")
+                            ->Attribute(AZ::Edit::Attributes::ChangeNotify, Edit::PropertyRefreshLevels::ValuesOnly)
+                        ;
+                }
+            }
+
+            if (auto behaviorContext = azrtti_cast<BehaviorContext*>(context))
+            {
+                behaviorContext->ConstantProperty("EditorOcclusionCullingPlaneComponentTypeId", BehaviorConstant(Uuid(EditorOcclusionCullingPlaneComponentTypeId)))
+                    ->Attribute(AZ::Script::Attributes::Module, "render")
+                    ->Attribute(AZ::Script::Attributes::Scope, AZ::Script::Attributes::ScopeFlags::Automation);
+            }
+        }
+
+        EditorOcclusionCullingPlaneComponent::EditorOcclusionCullingPlaneComponent()
+        {
+        }
+
+        EditorOcclusionCullingPlaneComponent::EditorOcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config)
+            : BaseClass(config)
+        {
+        }
+
+        void EditorOcclusionCullingPlaneComponent::Activate()
+        {
+            BaseClass::Activate();
+            AzFramework::EntityDebugDisplayEventBus::Handler::BusConnect(GetEntityId());
+        }
+
+        void EditorOcclusionCullingPlaneComponent::Deactivate()
+        {
+            AzFramework::EntityDebugDisplayEventBus::Handler::BusDisconnect();
+            BaseClass::Deactivate();
+        }
+    } // namespace Render
+} // namespace AZ

+ 43 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h

@@ -0,0 +1,43 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <AzFramework/Entity/EntityDebugDisplayBus.h>
+#include <AzToolsFramework/API/ComponentEntitySelectionBus.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponent.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+#include <Atom/Feature/Utils/EditorRenderComponentAdapter.h>
+
+namespace AZ
+{
+    namespace Render
+    {        
+        class EditorOcclusionCullingPlaneComponent final
+            : public EditorRenderComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponent, OcclusionCullingPlaneComponentConfig>
+            , private AzFramework::EntityDebugDisplayEventBus::Handler
+        {
+        public:
+            using BaseClass = EditorRenderComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponent, OcclusionCullingPlaneComponentConfig>;
+            AZ_EDITOR_COMPONENT(AZ::Render::EditorOcclusionCullingPlaneComponent, EditorOcclusionCullingPlaneComponentTypeId, BaseClass);
+
+            static void Reflect(AZ::ReflectContext* context);
+
+            EditorOcclusionCullingPlaneComponent();
+            EditorOcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config);
+
+            // AZ::Component overrides
+            void Activate() override;
+            void Deactivate() override;
+        };
+    } // namespace Render
+} // namespace AZ

+ 43 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.cpp

@@ -0,0 +1,43 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponent.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        OcclusionCullingPlaneComponent::OcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config)
+            : BaseClass(config)
+        {
+        }
+
+        void OcclusionCullingPlaneComponent::Reflect(AZ::ReflectContext* context)
+        {
+            BaseClass::Reflect(context);
+
+            if (auto serializeContext = azrtti_cast<AZ::SerializeContext*>(context))
+            {
+                serializeContext->Class<OcclusionCullingPlaneComponent, BaseClass>()
+                    ->Version(0)
+                    ;
+            }
+
+            if (auto behaviorContext = azrtti_cast<BehaviorContext*>(context))
+            {
+                behaviorContext->ConstantProperty("OcclusionCullingPlaneComponentTypeId", BehaviorConstant(Uuid(OcclusionCullingPlaneComponentTypeId)))
+                    ->Attribute(AZ::Script::Attributes::Module, "render")
+                    ->Attribute(AZ::Script::Attributes::Scope, AZ::Script::Attributes::ScopeFlags::Common);
+            }
+        }
+    } // namespace Render
+} // namespace AZ

+ 37 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.h

@@ -0,0 +1,37 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+#include <AzFramework/Components/ComponentAdapter.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! Runtime component that places an occlusion culling plane in the scene.
+        //! Thin ComponentAdapter wrapper: all behavior lives in
+        //! OcclusionCullingPlaneComponentController, configured by
+        //! OcclusionCullingPlaneComponentConfig.
+        class OcclusionCullingPlaneComponent final
+            : public AzFramework::Components::ComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponentConfig>
+        {
+        public:
+            using BaseClass = AzFramework::Components::ComponentAdapter<OcclusionCullingPlaneComponentController, OcclusionCullingPlaneComponentConfig>;
+            AZ_COMPONENT(AZ::Render::OcclusionCullingPlaneComponent, OcclusionCullingPlaneComponentTypeId, BaseClass);
+
+            OcclusionCullingPlaneComponent() = default;
+            OcclusionCullingPlaneComponent(const OcclusionCullingPlaneComponentConfig& config);
+
+            static void Reflect(AZ::ReflectContext* context);
+        };
+
+    } // namespace Render
+} // namespace AZ

+ 22 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h

@@ -0,0 +1,22 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+namespace AZ
+{
+    namespace Render
+    {
+        // Type id of the runtime OcclusionCullingPlaneComponent (used by AZ_COMPONENT and scripting).
+        static constexpr const char* const OcclusionCullingPlaneComponentTypeId = "{F7537387-15A8-48F0-A1F3-D19C5886B886}";
+        // Type id of the editor counterpart component.
+        static constexpr const char* const EditorOcclusionCullingPlaneComponentTypeId = "{BE7CC17B-32EB-49B0-BAD9-D26E3A059012}";
+    } // namespace Render
+} // namespace AZ

+ 143 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.cpp

@@ -0,0 +1,143 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+
+#include <Atom/RPI.Public/Model/Model.h>
+#include <Atom/RPI.Public/Image/StreamingImage.h>
+#include <Atom/RPI.Public/Scene.h>
+
+#include <AzCore/Asset/AssetManager.h>
+#include <AzCore/Asset/AssetManagerBus.h>
+#include <AzCore/Debug/EventTrace.h>
+#include <AzCore/Serialization/SerializeContext.h>
+
+#include <AzFramework/Entity/EntityContextBus.h>
+#include <AzFramework/Entity/EntityContext.h>
+#include <AzFramework/Scene/Scene.h>
+
+#include <AzCore/RTTI/BehaviorContext.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        // Serializes the two visualization settings of the occlusion plane config.
+        void OcclusionCullingPlaneComponentConfig::Reflect(ReflectContext* context)
+        {
+            if (auto* serializeContext = azrtti_cast<SerializeContext*>(context))
+            {
+                serializeContext->Class<OcclusionCullingPlaneComponentConfig>()
+                    ->Version(0)
+                    ->Field("ShowVisualization", &OcclusionCullingPlaneComponentConfig::m_showVisualization)
+                    ->Field("TransparentVisualization", &OcclusionCullingPlaneComponentConfig::m_transparentVisualization)
+                    ;
+            }
+        }
+
+        // Reflects the controller and its embedded configuration so the
+        // ComponentAdapter can serialize the component as a single unit.
+        void OcclusionCullingPlaneComponentController::Reflect(ReflectContext* context)
+        {
+            OcclusionCullingPlaneComponentConfig::Reflect(context);
+
+            if (auto* serializeContext = azrtti_cast<SerializeContext*>(context))
+            {
+                serializeContext->Class<OcclusionCullingPlaneComponentController>()
+                    ->Version(0)
+                    ->Field("Configuration", &OcclusionCullingPlaneComponentController::m_configuration);
+            }
+        }
+
+        // Services this component activates after (soft ordering dependency).
+        void OcclusionCullingPlaneComponentController::GetDependentServices(AZ::ComponentDescriptor::DependencyArrayType& dependent)
+        {
+            dependent.push_back(AZ_CRC("TransformService", 0x8ee22c50));
+        }
+
+        // Service this component provides to the entity.
+        void OcclusionCullingPlaneComponentController::GetProvidedServices(AZ::ComponentDescriptor::DependencyArrayType& provided)
+        {
+            provided.push_back(AZ_CRC("OcclusionCullingPlaneService", 0x9123f33d));
+        }
+
+        // Declaring its own service incompatible limits an entity to a single
+        // occlusion culling plane component.
+        void OcclusionCullingPlaneComponentController::GetIncompatibleServices(AZ::ComponentDescriptor::DependencyArrayType& incompatible)
+        {
+            incompatible.push_back(AZ_CRC("OcclusionCullingPlaneService", 0x9123f33d));
+        }
+
+        // Services that must be present on the entity before this component
+        // can activate; the controller reads the world transform in Activate.
+        void OcclusionCullingPlaneComponentController::GetRequiredServices(AZ::ComponentDescriptor::DependencyArrayType& required)
+        {
+            // Use the precomputed two-argument AZ_CRC form for consistency with
+            // GetDependentServices above; it also lets the build validate the
+            // hash at compile time instead of computing it from the string.
+            required.push_back(AZ_CRC("TransformService", 0x8ee22c50));
+        }
+
+        // Construct the controller with an initial configuration (copied; the
+        // config can later be replaced via SetConfiguration).
+        OcclusionCullingPlaneComponentController::OcclusionCullingPlaneComponentController(const OcclusionCullingPlaneComponentConfig& config)
+            : m_configuration(config)
+        {
+        }
+
+        // Registers the occlusion plane with the feature processor for the
+        // entity's scene, using the entity's current world transform, and applies
+        // the configured visualization settings.
+        void OcclusionCullingPlaneComponentController::Activate(AZ::EntityId entityId)
+        {
+            m_entityId = entityId;
+
+            // Listen for transform changes so the plane can follow the entity.
+            TransformNotificationBus::Handler::BusConnect(m_entityId);
+
+            m_featureProcessor = RPI::Scene::GetFeatureProcessorForEntity<OcclusionCullingPlaneFeatureProcessorInterface>(entityId);
+            AZ_Assert(m_featureProcessor, "OcclusionCullingPlaneComponentController was unable to find a OcclusionCullingPlaneFeatureProcessor on the EntityContext provided.");
+
+            m_transformInterface = TransformBus::FindFirstHandler(entityId);
+            AZ_Assert(m_transformInterface, "Unable to attach to a TransformBus handler");
+            if (!m_transformInterface)
+            {
+                // NOTE(review): this early-out leaves TransformNotificationBus
+                // connected and m_handle unset; Deactivate still disconnects the
+                // bus, and RemoveOcclusionCullingPlane is called with the default
+                // handle — confirm the feature processor tolerates that.
+                return;
+            }
+
+            // add this occlusion plane to the feature processor
+            const AZ::Transform& transform = m_transformInterface->GetWorldTM();
+            m_handle = m_featureProcessor->AddOcclusionCullingPlane(transform);
+
+            // set visualization
+            m_featureProcessor->ShowVisualization(m_handle, m_configuration.m_showVisualization);
+            m_featureProcessor->SetTransparentVisualization(m_handle, m_configuration.m_transparentVisualization);
+        }
+
+        // Removes the occlusion plane from the feature processor and releases
+        // all bus connections and cached interface pointers.
+        void OcclusionCullingPlaneComponentController::Deactivate()
+        {
+            if (m_featureProcessor)
+            {
+                m_featureProcessor->RemoveOcclusionCullingPlane(m_handle);
+            }
+
+            // NOTE(review): no matching Data::AssetBus connect appears in
+            // Activate — this disconnect looks vestigial (the controller inherits
+            // Data::AssetBus::MultiHandler but no handler overrides are visible);
+            // harmless, but confirm whether the AssetBus inheritance is needed.
+            Data::AssetBus::MultiHandler::BusDisconnect();
+            TransformNotificationBus::Handler::BusDisconnect();
+
+            m_transformInterface = nullptr;
+            m_featureProcessor = nullptr;
+        }
+
+        // Replaces the stored configuration. Note: this only copies the config;
+        // visualization settings are pushed to the feature processor in Activate.
+        void OcclusionCullingPlaneComponentController::SetConfiguration(const OcclusionCullingPlaneComponentConfig& config)
+        {
+            m_configuration = config;
+        }
+
+        // Returns the current configuration (read-only reference into the controller).
+        const OcclusionCullingPlaneComponentConfig& OcclusionCullingPlaneComponentController::GetConfiguration() const
+        {
+            return m_configuration;
+        }
+
+        // TransformNotificationBus handler: keeps the registered occlusion plane
+        // in sync with the entity's world transform.
+        void OcclusionCullingPlaneComponentController::OnTransformChanged([[maybe_unused]] const AZ::Transform& local, const AZ::Transform& world)
+        {
+            if (!m_featureProcessor)
+            {
+                return;
+            }
+
+            m_featureProcessor->SetTransform(m_handle, world);
+        }
+    } // namespace Render
+} // namespace AZ

+ 81 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h

@@ -0,0 +1,81 @@
+/*
+* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
+* its licensors.
+*
+* For complete copyright and license terms please see the LICENSE at the root of this
+* distribution (the "License"). All use of this software is governed by the License,
+* or, if provided, by the license below or the license accompanying this file. Do not
+* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*
+*/
+
+#pragma once
+
+#include <AzCore/Asset/AssetCommon.h>
+#include <AzCore/Component/Component.h>
+#include <AzCore/Component/TransformBus.h>
+#include <Atom/Feature/OcclusionCullingPlane/OcclusionCullingPlaneFeatureProcessorInterface.h>
+#include <Atom/RPI.Public/Model/Model.h>
+#include <LmbrCentral/Shape/BoxShapeComponentBus.h>
+#include <OcclusionCullingPlane/OcclusionCullingPlaneComponentConstants.h>
+
+namespace AZ
+{
+    namespace Render
+    {
+        //! Serialized settings for an occlusion culling plane component.
+        class OcclusionCullingPlaneComponentConfig final
+            : public AZ::ComponentConfig
+        {
+        public:
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneComponentConfig, "{D0E107CA-5AFB-4675-BC97-94BCA5F248DB}", ComponentConfig);
+            AZ_CLASS_ALLOCATOR(OcclusionCullingPlaneComponentConfig, SystemAllocator, 0);
+            static void Reflect(AZ::ReflectContext* context);
+
+            // Draw the plane's visualization mesh (on by default).
+            bool m_showVisualization = true;
+            // Render the visualization with the transparent material variant.
+            bool m_transparentVisualization = false;
+
+            OcclusionCullingPlaneComponentConfig() = default;
+        };
+
+        //! Controller behind OcclusionCullingPlaneComponent: registers an
+        //! occlusion plane with the scene's feature processor on Activate and
+        //! keeps its transform synchronized with the owning entity.
+        class OcclusionCullingPlaneComponentController final
+            : public Data::AssetBus::MultiHandler
+            , private TransformNotificationBus::Handler
+        {
+        public:
+            friend class EditorOcclusionCullingPlaneComponent;
+
+            AZ_CLASS_ALLOCATOR(OcclusionCullingPlaneComponentController, AZ::SystemAllocator, 0);
+            AZ_RTTI(AZ::Render::OcclusionCullingPlaneComponentController, "{8EDA3C7D-5171-4843-9969-4D84DB13F221}");
+
+            static void Reflect(AZ::ReflectContext* context);
+            static void GetDependentServices(AZ::ComponentDescriptor::DependencyArrayType& dependent);
+            static void GetProvidedServices(AZ::ComponentDescriptor::DependencyArrayType& provided);
+            static void GetIncompatibleServices(AZ::ComponentDescriptor::DependencyArrayType& incompatible);
+            static void GetRequiredServices(AZ::ComponentDescriptor::DependencyArrayType& required);
+
+            OcclusionCullingPlaneComponentController() = default;
+            OcclusionCullingPlaneComponentController(const OcclusionCullingPlaneComponentConfig& config);
+
+            void Activate(AZ::EntityId entityId);
+            void Deactivate();
+            void SetConfiguration(const OcclusionCullingPlaneComponentConfig& config);
+            const OcclusionCullingPlaneComponentConfig& GetConfiguration() const;
+
+        private:
+
+            AZ_DISABLE_COPY(OcclusionCullingPlaneComponentController);
+
+            // TransformNotificationBus overrides
+            void OnTransformChanged(const AZ::Transform& local, const AZ::Transform& world) override;
+
+            // handle for this occlusion plane in the feature processor
+            OcclusionCullingPlaneHandle m_handle;
+
+            // Non-owning; looked up from the scene in Activate, cleared in Deactivate.
+            OcclusionCullingPlaneFeatureProcessorInterface* m_featureProcessor = nullptr;
+            // Non-owning transform interface of the owning entity.
+            TransformInterface* m_transformInterface = nullptr;
+            AZ::EntityId m_entityId;
+            OcclusionCullingPlaneComponentConfig m_configuration;
+        };
+    } // namespace Render
+} // namespace AZ

+ 2 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_editor_files.cmake

@@ -56,6 +56,8 @@ set(FILES
     Source/Mesh/EditorMeshSystemComponent.h
     Source/Mesh/MeshThumbnail.h
     Source/Mesh/MeshThumbnail.cpp
+    Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.h
+    Source/OcclusionCullingPlane/EditorOcclusionCullingPlaneComponent.cpp
     Source/PostProcess/EditorPostFxLayerComponent.cpp
     Source/PostProcess/EditorPostFxLayerComponent.h
     Source/PostProcess/Bloom/EditorBloomComponent.cpp

+ 4 - 0
Gems/AtomLyIntegration/CommonFeatures/Code/atomlyintegration_commonfeatures_files.cmake

@@ -72,6 +72,10 @@ set(FILES
     Source/Mesh/MeshComponent.cpp
     Source/Mesh/MeshComponentController.h
     Source/Mesh/MeshComponentController.cpp
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponent.cpp
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.h
+    Source/OcclusionCullingPlane/OcclusionCullingPlaneComponentController.cpp
     Source/PostProcess/PostFxLayerComponent.cpp
     Source/PostProcess/PostFxLayerComponent.h
     Source/PostProcess/PostFxLayerComponentConfig.cpp