Browse Source

Option to enable threaded occlusion rendering. May actually result in worse performance, so disabled by default. Report number of rendered occluders accurately (taking into account occluders rejected by being occluded themselves, or by running out of triangles.) Closes #970.

Lasse Öörni 10 years ago
parent
commit
60d946eb64

+ 1 - 1
Docs/Reference.dox

@@ -909,7 +909,7 @@ Additionally there are 2D drawable components defined by the \ref Urho2D "Urho2D
 
 The following techniques will be used to reduce the amount of CPU and GPU work when rendering. By default they are all on:
 
-- Software rasterized occlusion: after the octree has been queried for visible objects, the objects that are marked as occluders are rendered on the CPU to a small hierarchical-depth buffer, and it will be used to test the non-occluders for visibility. Use \ref Renderer::SetMaxOccluderTriangles "SetMaxOccluderTriangles()" and \ref Renderer::SetOccluderSizeThreshold "SetOccluderSizeThreshold()" to configure the occlusion rendering.
+- Software rasterized occlusion: after the octree has been queried for visible objects, the objects that are marked as occluders are rendered on the CPU to a small hierarchical-depth buffer, and it will be used to test the non-occluders for visibility. Use \ref Renderer::SetMaxOccluderTriangles "SetMaxOccluderTriangles()" and \ref Renderer::SetOccluderSizeThreshold "SetOccluderSizeThreshold()" to configure the occlusion rendering. Occlusion testing will always be multithreaded; however, occlusion rendering is by default single-threaded, to allow rejecting subsequent occluders while rendering front-to-back. Use \ref Renderer::SetThreadedOcclusion "SetThreadedOcclusion()" to enable threading also in rendering; however, this can actually perform worse in e.g. terrain scenes where terrain patches act as occluders.
 
 - Hardware instancing: rendering operations with the same geometry, material and light will be grouped together and performed as one draw call if supported. Note that even when instancing is not available, they still benefit from the grouping, as render state only needs to be checked & set once before rendering each group, reducing the CPU cost.
 

+ 2 - 0
Source/Urho3D/AngelScript/GraphicsAPI.cpp

@@ -1728,6 +1728,8 @@ static void RegisterRenderer(asIScriptEngine* engine)
     engine->RegisterObjectMethod("Renderer", "int get_occlusionBufferSize() const", asMETHOD(Renderer, GetOcclusionBufferSize), asCALL_THISCALL);
     engine->RegisterObjectMethod("Renderer", "void set_occluderSizeThreshold(float)", asMETHOD(Renderer, SetOccluderSizeThreshold), asCALL_THISCALL);
     engine->RegisterObjectMethod("Renderer", "float get_occluderSizeThreshold() const", asMETHOD(Renderer, GetOccluderSizeThreshold), asCALL_THISCALL);
+    engine->RegisterObjectMethod("Renderer", "void set_threadedOcclusion(bool)", asMETHOD(Renderer, SetThreadedOcclusion), asCALL_THISCALL);
+    engine->RegisterObjectMethod("Renderer", "bool get_threadedOcclusion() const", asMETHOD(Renderer, GetThreadedOcclusion), asCALL_THISCALL);
     engine->RegisterObjectMethod("Renderer", "void set_mobileShadowBiasMul(float)", asMETHOD(Renderer, SetMobileShadowBiasMul), asCALL_THISCALL);
     engine->RegisterObjectMethod("Renderer", "float get_mobileShadowBiasMul() const", asMETHOD(Renderer, GetMobileShadowBiasMul), asCALL_THISCALL);
     engine->RegisterObjectMethod("Renderer", "void set_mobileShadowBiasAdd(float)", asMETHOD(Renderer, SetMobileShadowBiasAdd), asCALL_THISCALL);

+ 1 - 1
Source/Urho3D/Graphics/CustomGeometry.cpp

@@ -196,7 +196,7 @@ bool CustomGeometry::DrawOcclusion(OcclusionBuffer* buffer)
             continue;
 
         // Draw and check for running out of triangles
-        success = buffer->Draw(node_->GetWorldTransform(), vertexData, vertexSize, geometry->GetVertexStart(),
+        success = buffer->AddTriangles(node_->GetWorldTransform(), vertexData, vertexSize, geometry->GetVertexStart(),
             geometry->GetVertexCount());
 
         if (!success)

+ 217 - 106
Source/Urho3D/Graphics/OcclusionBuffer.cpp

@@ -22,6 +22,8 @@
 
 #include "../Precompiled.h"
 
+#include "../Core/WorkQueue.h"
+#include "../Core/Profiler.h"
 #include "../Graphics/Camera.h"
 #include "../Graphics/OcclusionBuffer.h"
 #include "../IO/Log.h"
@@ -38,9 +40,15 @@ static const unsigned CLIPMASK_Y_NEG = 0x8;
 static const unsigned CLIPMASK_Z_POS = 0x10;
 static const unsigned CLIPMASK_Z_NEG = 0x20;
 
+void DrawOcclusionBatchWork(const WorkItem* item, unsigned threadIndex) // Work item callback: rasterizes one stored OcclusionBatch
+{
+    OcclusionBuffer* buffer = reinterpret_cast<OcclusionBuffer*>(item->aux_); // aux_ is set to the owning OcclusionBuffer in DrawTriangles()
+    OcclusionBatch& batch = *reinterpret_cast<OcclusionBatch*>(item->start_); // start_ is set to the address of the batch in DrawTriangles()
+    buffer->DrawBatch(batch, threadIndex); // threadIndex selects which per-thread depth buffer is written
+}
+
 OcclusionBuffer::OcclusionBuffer(Context* context) :
     Object(context),
-    buffer_(0),
     width_(0),
     height_(0),
     numTriangles_(0),
@@ -57,7 +65,7 @@ OcclusionBuffer::~OcclusionBuffer()
 {
 }
 
-bool OcclusionBuffer::SetSize(int width, int height)
+bool OcclusionBuffer::SetSize(int width, int height, bool threaded)
 {
     // Force the height to an even amount of pixels for better mip generation
     if (height & 1)
@@ -78,9 +86,18 @@ bool OcclusionBuffer::SetSize(int width, int height)
     width_ = width;
     height_ = height;
 
-    // Reserve extra memory in case 3D clipping is not exact
-    fullBuffer_ = new int[width * (height + 2) + 2];
-    buffer_ = fullBuffer_.Get() + width + 1;
+    // Build work buffers for threading
+    unsigned numThreadBuffers = threaded ? GetSubsystem<WorkQueue>()->GetNumThreads() + 1 : 1;
+    buffers_.Resize(numThreadBuffers);
+    for (unsigned i = 0; i < numThreadBuffers; ++i)
+    {
+        // Reserve extra memory in case 3D clipping is not exact
+        OcclusionBufferData& buffer = buffers_[i];
+        buffer.dataWithSafety_ = new int[width * (height + 2) + 2];
+        buffer.data_ = buffer.dataWithSafety_.Get() + width + 1;
+        buffer.used_ = false;
+    }
+
     mipBuffers_.Clear();
 
     // Build buffers for mip levels
@@ -96,7 +113,7 @@ bool OcclusionBuffer::SetSize(int width, int height)
     }
 
     URHO3D_LOGDEBUG("Set occlusion buffer size " + String(width_) + "x" + String(height_) + " with " +
-             String(mipBuffers_.Size()) + " mip levels");
+             String(mipBuffers_.Size()) + " mip levels and " + String(numThreadBuffers) + " thread buffers");
 
     CalculateViewport();
     return true;
@@ -136,123 +153,98 @@ void OcclusionBuffer::SetCullMode(CullMode mode)
 void OcclusionBuffer::Reset()
 {
     numTriangles_ = 0;
+    batches_.Clear();
 }
 
 void OcclusionBuffer::Clear()
 {
-    if (!buffer_)
-        return;
-
     Reset();
 
-    int* dest = buffer_;
-    int count = width_ * height_;
-    int fillValue = (int)OCCLUSION_Z_SCALE;
-
-    while (count--)
-        *dest++ = fillValue;
+    // Only clear the main thread buffer. Rest are cleared on-demand when drawing the first batch
+    ClearBuffer(0);
+    for (unsigned i = 1; i < buffers_.Size(); ++i)
+        buffers_[i].used_ = false;
 
     depthHierarchyDirty_ = true;
 }
 
-bool OcclusionBuffer::Draw(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, unsigned vertexStart,
+bool OcclusionBuffer::AddTriangles(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, unsigned vertexStart,
     unsigned vertexCount)
 {
-    const unsigned char* srcData = ((const unsigned char*)vertexData) + vertexStart * vertexSize;
-
-    Matrix4 modelViewProj = viewProj_ * model;
-    depthHierarchyDirty_ = true;
-
-    // Theoretical max. amount of vertices if each of the 6 clipping planes doubles the triangle count
-    Vector4 vertices[64 * 3];
-
-    // 16-bit indices
-    unsigned index = 0;
-    while (index + 2 < vertexCount)
-    {
-        if (numTriangles_ >= maxTriangles_)
-            return false;
-
-        const Vector3& v0 = *((const Vector3*)(&srcData[index * vertexSize]));
-        const Vector3& v1 = *((const Vector3*)(&srcData[(index + 1) * vertexSize]));
-        const Vector3& v2 = *((const Vector3*)(&srcData[(index + 2) * vertexSize]));
-
-        vertices[0] = ModelTransform(modelViewProj, v0);
-        vertices[1] = ModelTransform(modelViewProj, v1);
-        vertices[2] = ModelTransform(modelViewProj, v2);
-        DrawTriangle(vertices);
-
-        index += 3;
-    }
-
-    return true;
+    batches_.Resize(batches_.Size() + 1);
+    OcclusionBatch& batch = batches_.Back();
+
+    batch.model_ = model;
+    batch.vertexData_ = vertexData;
+    batch.vertexSize_ = vertexSize;
+    batch.indexData_ = 0;
+    batch.indexSize_ = 0;
+    batch.drawStart_ = vertexStart;
+    batch.drawCount_ = vertexCount;
+
+    numTriangles_ += vertexCount / 3;
+    return numTriangles_ <= maxTriangles_;
 }
 
-bool OcclusionBuffer::Draw(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, const void* indexData,
+bool OcclusionBuffer::AddTriangles(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, const void* indexData,
     unsigned indexSize, unsigned indexStart, unsigned indexCount)
 {
-    const unsigned char* srcData = (const unsigned char*)vertexData;
-
-    Matrix4 modelViewProj = viewProj_ * model;
-    depthHierarchyDirty_ = true;
-
-    // Theoretical max. amount of vertices if each of the 6 clipping planes doubles the triangle count
-    Vector4 vertices[64 * 3];
+    batches_.Resize(batches_.Size() + 1);
+    OcclusionBatch& batch = batches_.Back();
+
+    batch.model_ = model;
+    batch.vertexData_ = vertexData;
+    batch.vertexSize_ = vertexSize;
+    batch.indexData_ = indexData;
+    batch.indexSize_ = indexSize;
+    batch.drawStart_ = indexStart;
+    batch.drawCount_ = indexCount;
+
+    numTriangles_ += indexCount / 3;
+    return numTriangles_ <= maxTriangles_;
+}
 
-    // 16-bit indices
-    if (indexSize == sizeof(unsigned short))
+void OcclusionBuffer::DrawTriangles()
+{
+    if (buffers_.Size() == 1)
     {
-        const unsigned short* indices = ((const unsigned short*)indexData) + indexStart;
-        const unsigned short* indicesEnd = indices + indexCount;
-
-        while (indices < indicesEnd)
-        {
-            if (numTriangles_ >= maxTriangles_)
-                return false;
+        // Not threaded
+        for (Vector<OcclusionBatch>::Iterator i = batches_.Begin(); i != batches_.End(); ++i)
+            DrawBatch(*i, 0);
 
-            const Vector3& v0 = *((const Vector3*)(&srcData[indices[0] * vertexSize]));
-            const Vector3& v1 = *((const Vector3*)(&srcData[indices[1] * vertexSize]));
-            const Vector3& v2 = *((const Vector3*)(&srcData[indices[2] * vertexSize]));
-
-            vertices[0] = ModelTransform(modelViewProj, v0);
-            vertices[1] = ModelTransform(modelViewProj, v1);
-            vertices[2] = ModelTransform(modelViewProj, v2);
-            DrawTriangle(vertices);
-
-            indices += 3;
-        }
+        depthHierarchyDirty_ = true;
     }
-    else
+    else if (buffers_.Size() > 1)
     {
-        const unsigned* indices = ((const unsigned*)indexData) + indexStart;
-        const unsigned* indicesEnd = indices + indexCount;
+        // Threaded
+        WorkQueue* queue = GetSubsystem<WorkQueue>();
 
-        while (indices < indicesEnd)
+        for (Vector<OcclusionBatch>::Iterator i = batches_.Begin(); i != batches_.End(); ++i)
         {
-            if (numTriangles_ >= maxTriangles_)
-                return false;
+            SharedPtr<WorkItem> item = queue->GetFreeItem();
+            item->priority_ = M_MAX_UNSIGNED;
+            item->workFunction_ = DrawOcclusionBatchWork;
+            item->aux_ = this;
+            item->start_ = &(*i);
+            queue->AddWorkItem(item);
+        }
 
-            const Vector3& v0 = *((const Vector3*)(&srcData[indices[0] * vertexSize]));
-            const Vector3& v1 = *((const Vector3*)(&srcData[indices[1] * vertexSize]));
-            const Vector3& v2 = *((const Vector3*)(&srcData[indices[2] * vertexSize]));
+        queue->Complete(M_MAX_UNSIGNED);
 
-            vertices[0] = ModelTransform(modelViewProj, v0);
-            vertices[1] = ModelTransform(modelViewProj, v1);
-            vertices[2] = ModelTransform(modelViewProj, v2);
-            DrawTriangle(vertices);
-
-            indices += 3;
-        }
+        MergeBuffers();
+        depthHierarchyDirty_ = true;
     }
 
-    return true;
+    batches_.Clear();
 }
 
 void OcclusionBuffer::BuildDepthHierarchy()
 {
-    if (!buffer_)
+    if (buffers_.Empty() || !depthHierarchyDirty_)
         return;
 
+    URHO3D_PROFILE(BuildDepthHierarchy);
+
     // Build the first mip level from the pixel-level data
     int width = (width_ + 1) / 2;
     int height = (height_ + 1) / 2;
@@ -260,7 +252,7 @@ void OcclusionBuffer::BuildDepthHierarchy()
     {
         for (int y = 0; y < height; ++y)
         {
-            int* src = buffer_ + (y * 2) * width_;
+            int* src = buffers_[0].data_ + (y * 2) * width_;
             DepthValue* dest = mipBuffers_[0].Get() + y * width;
             DepthValue* end = dest + width;
 
@@ -350,7 +342,7 @@ void OcclusionBuffer::ResetUseTimer()
 
 bool OcclusionBuffer::IsVisible(const BoundingBox& worldSpaceBox) const
 {
-    if (!buffer_)
+    if (buffers_.Empty())
         return true;
 
     // Transform corners to projection space
@@ -455,8 +447,8 @@ bool OcclusionBuffer::IsVisible(const BoundingBox& worldSpaceBox) const
     }
 
     // If no conclusive result, finally check the pixel-level data
-    int* row = buffer_ + rect.top_ * width_;
-    int* endRow = buffer_ + rect.bottom_ * width_;
+    int* row = buffers_[0].data_ + rect.top_ * width_;
+    int* endRow = buffers_[0].data_ + rect.bottom_ * width_;
     while (row <= endRow)
     {
         int* src = row + rect.left_;
@@ -478,6 +470,86 @@ unsigned OcclusionBuffer::GetUseTimer()
     return useTimer_.GetMSec(false);
 }
 
+
+void OcclusionBuffer::DrawBatch(const OcclusionBatch& batch, unsigned threadIndex) // Rasterize one submitted batch into buffers_[threadIndex]
+{
+    // Buffers other than index 0 are cleared lazily on first use; Clear() only clears buffer 0 and resets the other buffers' used_ flags
+    if (threadIndex > 0 && !buffers_[threadIndex].used_)
+    {
+        ClearBuffer(threadIndex);
+        buffers_[threadIndex].used_ = true;
+    }
+
+    Matrix4 modelViewProj = viewProj_ * batch.model_;
+
+    // Theoretical max. amount of vertices if each of the 6 clipping planes doubles the triangle count
+    Vector4 vertices[64 * 3];
+
+    if (!batch.indexData_) // Null index data means non-indexed geometry; drawStart_/drawCount_ then refer to vertices
+    {
+        const unsigned char* srcData = ((const unsigned char*)batch.vertexData_) + batch.drawStart_ * batch.vertexSize_;
+
+        unsigned index = 0;
+        while (index + 2 < batch.drawCount_) // Three vertices per triangle; a trailing incomplete triangle is not drawn
+        {
+            const Vector3& v0 = *((const Vector3*)(&srcData[index * batch.vertexSize_])); // Reads a Vector3 from the start of each vertexSize_-byte vertex
+            const Vector3& v1 = *((const Vector3*)(&srcData[(index + 1) * batch.vertexSize_]));
+            const Vector3& v2 = *((const Vector3*)(&srcData[(index + 2) * batch.vertexSize_]));
+
+            vertices[0] = ModelTransform(modelViewProj, v0);
+            vertices[1] = ModelTransform(modelViewProj, v1);
+            vertices[2] = ModelTransform(modelViewProj, v2);
+            DrawTriangle(vertices, threadIndex);
+
+            index += 3;
+        }
+    }
+    else
+    {
+        const unsigned char* srcData = (const unsigned char*)batch.vertexData_; // Indexed path: drawStart_ offsets the index array, not the vertex data
+
+        // 16-bit indices
+        if (batch.indexSize_ == sizeof(unsigned short))
+        {
+            const unsigned short* indices = ((const unsigned short*)batch.indexData_) + batch.drawStart_;
+            const unsigned short* indicesEnd = indices + batch.drawCount_;
+
+            while (indices < indicesEnd) // NOTE(review): steps by 3 without checking drawCount_ is a multiple of 3; if it is not, the last iteration reads past indicesEnd - confirm callers guarantee this
+            {
+                const Vector3& v0 = *((const Vector3*)(&srcData[indices[0] * batch.vertexSize_]));
+                const Vector3& v1 = *((const Vector3*)(&srcData[indices[1] * batch.vertexSize_]));
+                const Vector3& v2 = *((const Vector3*)(&srcData[indices[2] * batch.vertexSize_]));
+
+                vertices[0] = ModelTransform(modelViewProj, v0);
+                vertices[1] = ModelTransform(modelViewProj, v1);
+                vertices[2] = ModelTransform(modelViewProj, v2);
+                DrawTriangle(vertices, threadIndex);
+
+                indices += 3;
+            }
+        }
+        else // Any other index size is read as unsigned
+        {
+            const unsigned* indices = ((const unsigned*)batch.indexData_) + batch.drawStart_;
+            const unsigned* indicesEnd = indices + batch.drawCount_;
+
+            while (indices < indicesEnd) // NOTE(review): same multiple-of-3 assumption as the 16-bit loop above
+            {
+                const Vector3& v0 = *((const Vector3*)(&srcData[indices[0] * batch.vertexSize_]));
+                const Vector3& v1 = *((const Vector3*)(&srcData[indices[1] * batch.vertexSize_]));
+                const Vector3& v2 = *((const Vector3*)(&srcData[indices[2] * batch.vertexSize_]));
+
+                vertices[0] = ModelTransform(modelViewProj, v0);
+                vertices[1] = ModelTransform(modelViewProj, v1);
+                vertices[2] = ModelTransform(modelViewProj, v2);
+                DrawTriangle(vertices, threadIndex);
+
+                indices += 3;
+            }
+        }
+    }
+}
+
 inline Vector4 OcclusionBuffer::ModelTransform(const Matrix4& transform, const Vector3& vertex) const
 {
     return Vector4(
@@ -524,7 +596,7 @@ void OcclusionBuffer::CalculateViewport()
     projOffsetScaleY_ = projection_.m11_ * scaleY_;
 }
 
-void OcclusionBuffer::DrawTriangle(Vector4* vertices)
+void OcclusionBuffer::DrawTriangle(Vector4* vertices, unsigned threadIndex)
 {
     unsigned clipMask = 0;
     unsigned andClipMask = 0;
@@ -571,7 +643,7 @@ void OcclusionBuffer::DrawTriangle(Vector4* vertices)
         bool clockwise = SignedArea(projected[0], projected[1], projected[2]) < 0.0f;
         if (cullMode_ == CULL_NONE || (cullMode_ == CULL_CCW && clockwise) || (cullMode_ == CULL_CW && !clockwise))
         {
-            DrawTriangle2D(projected, clockwise);
+            DrawTriangle2D(projected, clockwise, threadIndex);
             drawOk = true;
         }
     }
@@ -609,7 +681,7 @@ void OcclusionBuffer::DrawTriangle(Vector4* vertices)
                 bool clockwise = SignedArea(projected[0], projected[1], projected[2]) < 0.0f;
                 if (cullMode_ == CULL_NONE || (cullMode_ == CULL_CCW && clockwise) || (cullMode_ == CULL_CW && !clockwise))
                 {
-                    DrawTriangle2D(projected, clockwise);
+                    DrawTriangle2D(projected, clockwise, threadIndex);
                     drawOk = true;
                 }
             }
@@ -750,7 +822,7 @@ struct Edge
     int invZStep_;
 };
 
-void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise)
+void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise, unsigned threadIndex)
 {
     int top, middle, bottom;
     bool middleIsRight;
@@ -826,11 +898,13 @@ void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise)
     Edge topToBottom(gradients, vertices[top], vertices[bottom], topY);
     Edge middleToBottom(gradients, vertices[middle], vertices[bottom], middleY);
 
+    int* bufferData = buffers_[threadIndex].data_;
+
     if (middleIsRight)
     {
         // Top half
-        int* row = buffer_ + topY * width_;
-        int* endRow = buffer_ + middleY * width_;
+        int* row = bufferData + topY * width_;
+        int* endRow = bufferData + middleY * width_;
         while (row < endRow)
         {
             int invZ = topToBottom.invZ_;
@@ -851,8 +925,8 @@ void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise)
         }
 
         // Bottom half
-        row = buffer_ + middleY * width_;
-        endRow = buffer_ + bottomY * width_;
+        row = bufferData + middleY * width_;
+        endRow = bufferData + bottomY * width_;
         while (row < endRow)
         {
             int invZ = topToBottom.invZ_;
@@ -875,8 +949,8 @@ void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise)
     else
     {
         // Top half
-        int* row = buffer_ + topY * width_;
-        int* endRow = buffer_ + middleY * width_;
+        int* row = bufferData + topY * width_;
+        int* endRow = bufferData + middleY * width_;
         while (row < endRow)
         {
             int invZ = topToMiddle.invZ_;
@@ -897,8 +971,8 @@ void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise)
         }
 
         // Bottom half
-        row = buffer_ + middleY * width_;
-        endRow = buffer_ + bottomY * width_;
+        row = bufferData + middleY * width_;
+        endRow = bufferData + bottomY * width_;
         while (row < endRow)
         {
             int invZ = middleToBottom.invZ_;
@@ -920,4 +994,41 @@ void OcclusionBuffer::DrawTriangle2D(const Vector3* vertices, bool clockwise)
     }
 }
 
+void OcclusionBuffer::MergeBuffers() // Fold every used worker buffer into buffers_[0], keeping the smaller depth value per pixel
+{
+    URHO3D_PROFILE(MergeBuffers);
+
+    for (unsigned i = 1; i < buffers_.Size(); ++i) // Buffer 0 is the merge destination, so start from 1
+    {
+        if (!buffers_[i].used_) // used_ is only set once DrawBatch() has cleared and drawn into the buffer; unused ones hold stale data
+            continue;
+
+        int* src = buffers_[i].data_;
+        int* dest = buffers_[0].data_;
+        int count = width_ * height_; // Only the visible pixels are merged, not the safety padding
+
+        while (count--)
+        {
+            // If thread buffer's depth value is closer, overwrite the original
+            if (*src < *dest)
+                *dest = *src;
+            ++src;
+            ++dest;
+        }
+    }
+}
+
+void OcclusionBuffer::ClearBuffer(unsigned threadIndex) // Fill the visible pixels of one per-thread depth buffer with the initial depth value
+{
+    if (threadIndex >= buffers_.Size()) // Silently ignore out-of-range indices (also covers the case where no buffers exist yet)
+        return;
+
+    int* dest = buffers_[threadIndex].data_;
+    int count = width_ * height_;
+    int fillValue = (int)OCCLUSION_Z_SCALE; // Presumably the farthest depth, since MergeBuffers() treats smaller values as closer - confirm against OCCLUSION_Z_SCALE's definition
+
+    while (count--)
+        *dest++ = fillValue;
+}
+
 }

+ 59 - 17
Source/Urho3D/Graphics/OcclusionBuffer.h

@@ -39,7 +39,7 @@ class VertexBuffer;
 struct Edge;
 struct Gradients;
 
-/// Occlusion hierarchy depth range.
+/// Occlusion hierarchy depth value.
 struct DepthValue
 {
     /// Minimum value.
@@ -48,6 +48,36 @@ struct DepthValue
     int max_;
 };
 
+/// Per-thread occlusion buffer data. The buffer at index 0 is the one visibility tests read and worker buffers are merged into.
+struct OcclusionBufferData
+{
+    /// Owning allocation for the depth values, including the extra safety padding around the visible pixels.
+    SharedArrayPtr<int> dataWithSafety_;
+    /// Pointer to the first visible pixel inside dataWithSafety_ (offset by width + 1 elements in SetSize()).
+    int* data_;
+    /// Whether the buffer has been cleared and drawn into since the last Clear(). Only checked for worker buffers (index > 0).
+    bool used_;
+};
+
+/// Stored occlusion render job. Filled in by AddTriangles() and rasterized later by DrawBatch().
+struct OcclusionBatch
+{
+    /// Model matrix.
+    Matrix3x4 model_;
+    /// Vertex data pointer. Stored as a raw pointer and read when the batch is drawn, so it must remain valid until then.
+    const void* vertexData_;
+    /// Vertex size in bytes. Used as the stride between consecutive vertex positions.
+    unsigned vertexSize_;
+    /// Index data pointer. Null if using non-indexed geometry. Same lifetime requirement as the vertex data.
+    const void* indexData_;
+    /// Index size in bytes. sizeof(unsigned short) selects 16-bit reads, any other value is read as unsigned. Zero for non-indexed geometry.
+    unsigned indexSize_;
+    /// Draw start. First index for indexed geometry, otherwise first vertex.
+    unsigned drawStart_;
+    /// Index or vertex count.
+    unsigned drawCount_;
+};
+
 static const int OCCLUSION_MIN_SIZE = 8;
 static const int OCCLUSION_DEFAULT_MAX_TRIANGLES = 5000;
 static const float OCCLUSION_RELATIVE_BIAS = 0.00001f;
@@ -66,8 +96,8 @@ public:
     /// Destruct.
     virtual ~OcclusionBuffer();
 
-    /// Set occlusion buffer size.
-    bool SetSize(int width, int height);
+    /// Set occlusion buffer size and whether to reserve multiple buffers for threading optimization.
+    bool SetSize(int width, int height, bool threaded);
     /// Set camera view to render from.
     void SetView(Camera* camera);
     /// Set maximum triangles to render.
@@ -78,18 +108,20 @@ public:
     void Reset();
     /// Clear the buffer.
     void Clear();
-    /// Draw a triangle mesh to the buffer using non-indexed geometry.
-    bool Draw(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, unsigned vertexStart, unsigned vertexCount);
-    /// Draw a triangle mesh to the buffer using indexed geometry.
-    bool Draw(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, const void* indexData, unsigned indexSize,
+    /// Submit a triangle mesh to the buffer using non-indexed geometry. Return true if did not overflow the allowed triangle count.
+    bool AddTriangles(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, unsigned vertexStart, unsigned vertexCount);
+    /// Submit a triangle mesh to the buffer using indexed geometry. Return true if did not overflow the allowed triangle count.
+    bool AddTriangles(const Matrix3x4& model, const void* vertexData, unsigned vertexSize, const void* indexData, unsigned indexSize,
         unsigned indexStart, unsigned indexCount);
+    /// Draw submitted batches. Uses worker threads if enabled during SetSize().
+    void DrawTriangles();
     /// Build reduced size mip levels.
     void BuildDepthHierarchy();
     /// Reset last used timer.
     void ResetUseTimer();
 
     /// Return highest level depth values.
-    int* GetBuffer() const { return buffer_; }
+    int* GetBuffer() const { return buffers_.Size() ? buffers_[0].data_ : (int*)0; }
 
     /// Return view transform matrix.
     const Matrix3x4& GetView() const { return view_; }
@@ -112,11 +144,17 @@ public:
     /// Return culling mode.
     CullMode GetCullMode() const { return cullMode_; }
 
+    /// Return whether more than one work buffer exists, i.e. whether occluder rendering uses worker threads.
+    bool IsThreaded() const { return buffers_.Size() > 1; }
+
     /// Test a bounding box for visibility. For best performance, build depth hierarchy first.
     bool IsVisible(const BoundingBox& worldSpaceBox) const;
     /// Return time since last use in milliseconds.
     unsigned GetUseTimer();
 
+    /// Draw a batch. Called internally.
+    void DrawBatch(const OcclusionBatch& batch, unsigned threadIndex);
+
 private:
     /// Apply modelview transform to vertex.
     inline Vector4 ModelTransform(const Matrix4& transform, const Vector3& vertex) const;
@@ -129,14 +167,22 @@ private:
     /// Calculate viewport transform.
     void CalculateViewport();
     /// Draw a triangle.
-    void DrawTriangle(Vector4* vertices);
+    void DrawTriangle(Vector4* vertices, unsigned threadIndex);
     /// Clip vertices against a plane.
     void ClipVertices(const Vector4& plane, Vector4* vertices, bool* triangles, unsigned& numTriangles);
     /// Draw a clipped triangle.
-    void DrawTriangle2D(const Vector3* vertices, bool clockwise);
-
-    /// Highest level depth buffer.
-    int* buffer_;
+    void DrawTriangle2D(const Vector3* vertices, bool clockwise, unsigned threadIndex);
+    /// Clear a thread work buffer.
+    void ClearBuffer(unsigned threadIndex);
+    /// Merge thread work buffers into the first buffer.
+    void MergeBuffers();
+
+    /// Highest-level buffer data per thread.
+    Vector<OcclusionBufferData> buffers_;
+    /// Reduced size depth buffers.
+    Vector<SharedArrayPtr<DepthValue> > mipBuffers_;
+    /// Submitted render jobs.
+    PODVector<OcclusionBatch> batches_;
     /// Buffer width.
     int width_;
     /// Buffer height.
@@ -175,10 +221,6 @@ private:
     float projOffsetScaleX_;
     /// Combined Y projection and viewport transform.
     float projOffsetScaleY_;
-    /// Highest level buffer with safety padding.
-    SharedArrayPtr<int> fullBuffer_;
-    /// Reduced size depth buffers.
-    Vector<SharedArrayPtr<DepthValue> > mipBuffers_;
 };
 
 }

+ 12 - 2
Source/Urho3D/Graphics/Renderer.cpp

@@ -280,6 +280,7 @@ Renderer::Renderer(Context* context) :
     drawShadows_(true),
     reuseShadowMaps_(true),
     dynamicInstancing_(true),
+    threadedOcclusion_(false),
     shadersDirty_(true),
     initialized_(false),
     resetViews_(false)
@@ -473,6 +474,15 @@ void Renderer::SetOccluderSizeThreshold(float screenSize)
     occluderSizeThreshold_ = Max(screenSize, 0.0f);
 }
 
+void Renderer::SetThreadedOcclusion(bool enable) // Enable or disable threaded occluder rendering; does nothing if the setting is unchanged
+{
+    if (enable != threadedOcclusion_)
+    {
+        threadedOcclusion_ = enable;
+        occlusionBuffers_.Clear(); // Drops the existing occlusion buffers; presumably so they are recreated and sized with the new threaded flag - confirm in GetOcclusionBuffer()
+    }
+}
+
 void Renderer::ReloadShaders()
 {
     shadersDirty_ = true;
@@ -556,7 +566,7 @@ unsigned Renderer::GetNumOccluders(bool allViews) const
         if (!view)
             continue;
 
-        numOccluders += view->GetOccluders().Size();
+        numOccluders += view->GetNumActiveOccluders();
     }
 
     return numOccluders;
@@ -1047,7 +1057,7 @@ OcclusionBuffer* Renderer::GetOcclusionBuffer(Camera* camera)
     int height = (int)((float)occlusionBufferSize_ / camera->GetAspectRatio() + 0.5f);
 
     OcclusionBuffer* buffer = occlusionBuffers_[numOcclusionBuffers_++];
-    buffer->SetSize(width, height);
+    buffer->SetSize(width, height, threadedOcclusion_);
     buffer->SetView(camera);
     buffer->ResetUseTimer();
 

+ 7 - 0
Source/Urho3D/Graphics/Renderer.h

@@ -200,6 +200,8 @@ public:
     void SetOcclusionBufferSize(int size);
     /// Set required screen size (1.0 = full screen) for occluders.
     void SetOccluderSizeThreshold(float screenSize);
+    /// Set whether to thread occluder rendering. Default false.
+    void SetThreadedOcclusion(bool enable);
     /// Set shadow depth bias multiplier for mobile platforms (OpenGL ES.) No effect on desktops. Default 2.
     void SetMobileShadowBiasMul(float mul);
     /// Set shadow depth bias addition for mobile platforms (OpenGL ES.)  No effect on desktops. Default 0.0001.
@@ -266,6 +268,9 @@ public:
     /// Return occluder screen size threshold.
     float GetOccluderSizeThreshold() const { return occluderSizeThreshold_; }
 
+    /// Return whether occlusion rendering is threaded.
+    bool GetThreadedOcclusion() const { return threadedOcclusion_; }
+
     /// Return shadow depth bias multiplier for mobile platforms.
     float GetMobileShadowBiasMul() const { return mobileShadowBiasMul_; }
 
@@ -511,6 +516,8 @@ private:
     bool reuseShadowMaps_;
     /// Dynamic instancing flag.
     bool dynamicInstancing_;
+    /// Threaded occlusion rendering flag.
+    bool threadedOcclusion_;
     /// Shaders need reloading flag.
     bool shadersDirty_;
     /// Initialized flag.

+ 1 - 1
Source/Urho3D/Graphics/StaticModel.cpp

@@ -221,7 +221,7 @@ bool StaticModel::DrawOcclusion(OcclusionBuffer* buffer)
         unsigned indexCount = geometry->GetIndexCount();
 
         // Draw and check for running out of triangles
-        if (!buffer->Draw(node_->GetWorldTransform(), vertexData, vertexSize, indexData, indexSize, indexStart, indexCount))
+        if (!buffer->AddTriangles(node_->GetWorldTransform(), vertexData, vertexSize, indexData, indexSize, indexStart, indexCount))
             return false;
     }
 

+ 1 - 1
Source/Urho3D/Graphics/StaticModelGroup.cpp

@@ -257,7 +257,7 @@ bool StaticModelGroup::DrawOcclusion(OcclusionBuffer* buffer)
             unsigned indexCount = geometry->GetIndexCount();
 
             // Draw and check for running out of triangles
-            if (!buffer->Draw(worldTransforms_[i], vertexData, vertexSize, indexData, indexSize, indexStart, indexCount))
+            if (!buffer->AddTriangles(worldTransforms_[i], vertexData, vertexSize, indexData, indexSize, indexStart, indexCount))
                 return false;
         }
     }

+ 1 - 1
Source/Urho3D/Graphics/TerrainPatch.cpp

@@ -204,7 +204,7 @@ bool TerrainPatch::DrawOcclusion(OcclusionBuffer* buffer)
         return true;
 
     // Draw and check for running out of triangles
-    return buffer->Draw(node_->GetWorldTransform(), vertexData, vertexSize, indexData, indexSize, occlusionGeometry_->GetIndexStart(),
+    return buffer->AddTriangles(node_->GetWorldTransform(), vertexData, vertexSize, indexData, indexSize, occlusionGeometry_->GetIndexStart(),
         occlusionGeometry_->GetIndexCount());
 }
 

+ 32 - 9
Source/Urho3D/Graphics/View.cpp

@@ -560,6 +560,7 @@ void View::Update(const FrameInfo& frame)
     lights_.Clear();
     zones_.Clear();
     occluders_.Clear();
+    activeOccluders_ = 0;
     vertexLightQueues_.Clear();
     for (HashMap<unsigned, BatchQueue>::Iterator i = batchQueues_.Begin(); i != batchQueues_.End(); ++i)
         i->second_.Clear(maxSortedInstances);
@@ -2154,22 +2155,44 @@ void View::DrawOccluders(OcclusionBuffer* buffer, const PODVector<Drawable*>& oc
 {
     buffer->SetMaxTriangles((unsigned)maxOccluderTriangles_);
     buffer->Clear();
+    
+    if (!buffer->IsThreaded())
+    {
+        // If not threaded, draw occluders one by one and test the next occluder against already rasterized depth
+        for (unsigned i = 0; i < occluders.Size(); ++i)
+        {
+            Drawable* occluder = occluders[i];
+            if (i > 0)
+            {
+                // For subsequent occluders, do a test against the pixel-level occlusion buffer to see if rendering is necessary
+                if (!buffer->IsVisible(occluder->GetWorldBoundingBox()))
+                    continue;
+            }
 
-    for (unsigned i = 0; i < occluders.Size(); ++i)
+            // Check for running out of triangles
+            ++activeOccluders_;
+            bool success = occluder->DrawOcclusion(buffer);
+            // Draw triangles submitted by this occluder
+            buffer->DrawTriangles();
+            if (!success)
+                break;
+        }
+    }
+    else
     {
-        Drawable* occluder = occluders[i];
-        if (i > 0)
+        // In threaded mode submit all triangles first, then render (cannot test in this case)
+        for (unsigned i = 0; i < occluders.Size(); ++i)
         {
-            // For subsequent occluders, do a test against the pixel-level occlusion buffer to see if rendering is necessary
-            if (!buffer->IsVisible(occluder->GetWorldBoundingBox()))
-                continue;
+            // Check for running out of triangles
+            ++activeOccluders_;
+            if (!occluders[i]->DrawOcclusion(buffer))
+                break;
         }
 
-        // Check for running out of triangles
-        if (!occluder->DrawOcclusion(buffer))
-            break;
+        buffer->DrawTriangles();
     }
 
+    // Finally build the depth mip levels
     buffer->BuildDepthHierarchy();
 }
 

+ 5 - 0
Source/Urho3D/Graphics/View.h

@@ -168,6 +168,9 @@ public:
     /// Return the last used software occlusion buffer.
     OcclusionBuffer* GetOcclusionBuffer() const { return occlusionBuffer_; }
 
+    /// Return number of occluders that were actually rendered. Occluders may be rejected if running out of triangles or if behind other occluders.
+    unsigned GetNumActiveOccluders() const { return activeOccluders_; }
+
     /// Return the source view that was already prepared. Used when viewports specify the same culling camera.
     View* GetSourceView() const;
 
@@ -379,6 +382,8 @@ private:
     PODVector<Drawable*> occluders_;
     /// Lights.
     PODVector<Light*> lights_;
+    /// Number of active occluders.
+    unsigned activeOccluders_;
 
     /// Drawables that limit their maximum light count.
     HashSet<Drawable*> maxLightsDrawables_;

+ 3 - 0
Source/Urho3D/LuaScript/pkgs/Graphics/Renderer.pkg

@@ -23,6 +23,7 @@ class Renderer
     void SetMaxOccluderTriangles(int triangles);
     void SetOcclusionBufferSize(int size);
     void SetOccluderSizeThreshold(float screenSize);
+    void SetThreadedOcclusion(bool enable);
     void SetMobileShadowBiasMul(float mul);
     void SetMobileShadowBiasAdd(float add);
     void ReloadShaders();
@@ -47,6 +48,7 @@ class Renderer
     int GetMaxOccluderTriangles() const;
     int GetOcclusionBufferSize() const;
     float GetOccluderSizeThreshold() const;
+    bool GetThreadedOcclusion() const;
     float GetMobileShadowBiasMul() const;
     float GetMobileShadowBiasAdd() const;
     unsigned GetNumViews() const;
@@ -82,6 +84,7 @@ class Renderer
     tolua_property__get_set int maxOccluderTriangles;
     tolua_property__get_set int occlusionBufferSize;
     tolua_property__get_set float occluderSizeThreshold;
+    tolua_property__get_set bool threadedOcclusion;
     tolua_property__get_set float mobileShadowBiasMul;
     tolua_property__get_set float mobileShadowBiasAdd;
     tolua_readonly tolua_property__get_set unsigned numViews;