소스 검색

Optimize clustering. Still some bugs

Panagiotis Christopoulos Charitos 7 년 전
부모
커밋
2cfa2f7aad

+ 4 - 3
shaders/Common.glsl

@@ -60,9 +60,10 @@ const U32 UBO_MAX_SIZE = 16384u;
 #define ALPHA_LOCATION 2
 
 // Passes
-#define PASS_GB_FS 0
-#define PASS_SM 1
-#define PASS_EZ 2
+#define PASS_GB 0
+#define PASS_FS 1
+#define PASS_SM 2
+#define PASS_EZ 3
 
 // Other
 #if defined(ANKI_BACKEND_VULKAN) && ANKI_BACKEND_MAJOR >= 1 && ANKI_BACKEND_MINOR >= 1

+ 5 - 5
shaders/GBufferCommonFrag.glsl

@@ -11,7 +11,7 @@
 //
 // Input
 //
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 layout(location = 0) in highp Vec2 in_uv;
 layout(location = 1) in mediump Vec3 in_normal;
 layout(location = 2) in mediump Vec4 in_tangent;
@@ -25,12 +25,12 @@ layout(location = 6) in mediump Vec3 in_normalTangentSpace; // Parallax
 #	if VELOCITY
 layout(location = 7) in mediump Vec2 in_velocity; // Velocity
 #	endif
-#endif // PASS == PASS_GB_FS
+#endif // PASS == PASS_GB
 
 //
 // Output
 //
-#if PASS == PASS_GB_FS || PASS == PASS_EZ
+#if PASS == PASS_GB || PASS == PASS_EZ
 layout(location = 0) out Vec4 out_gbuffer0;
 layout(location = 1) out Vec4 out_gbuffer1;
 layout(location = 2) out Vec4 out_gbuffer2;
@@ -40,7 +40,7 @@ layout(location = 3) out Vec2 out_gbuffer3;
 //
 // Functions
 //
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 // Do normal mapping
 Vec3 readNormalFromTexture(sampler2D map, highp Vec2 texCoords)
 {
@@ -161,4 +161,4 @@ void writeRts(Vec3 diffColor,
 	g.m_velocity = velocity;
 	writeGBuffer(g, out_gbuffer0, out_gbuffer1, out_gbuffer2, out_gbuffer3);
 }
-#endif // PASS == PASS_GB_FS
+#endif // PASS == PASS_GB

+ 10 - 10
shaders/GBufferCommonVert.glsl

@@ -21,7 +21,7 @@ layout(ANKI_SS_BINDING(0, 0), row_major) readonly buffer ss00_
 // Input
 //
 layout(location = POSITION_LOCATION) in highp Vec3 in_position;
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 layout(location = TEXTURE_COORDINATE_LOCATION) in highp Vec2 in_uv;
 layout(location = NORMAL_LOCATION) in mediump Vec3 in_normal;
 layout(location = TANGENT_LOCATION) in mediump Vec4 in_tangent;
@@ -40,7 +40,7 @@ out gl_PerVertex
 	Vec4 gl_Position;
 };
 
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 layout(location = 0) out highp Vec2 out_uv;
 layout(location = 1) out mediump Vec3 out_normal;
 layout(location = 2) out mediump Vec4 out_tangent;
@@ -60,7 +60,7 @@ layout(location = 7) out mediump Vec2 out_velocity; // Velocity
 // Globals
 //
 Vec3 g_position = in_position;
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 highp Vec2 g_uv = in_uv;
 mediump Vec3 g_normal = in_normal;
 mediump Vec4 g_tangent = in_tangent;
@@ -71,7 +71,7 @@ mediump Vec4 g_tangent = in_tangent;
 //
 
 // Common store function
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 void positionUvNormalTangent(Mat4 mvp, Mat3 rotationMat)
 {
 	out_uv = g_uv;
@@ -85,10 +85,10 @@ void positionUvNormalTangent(Mat4 mvp, Mat3 rotationMat)
 	out_bitangent = cross(out_normal, out_tangent.xyz) * out_tangent.w;
 #	endif
 }
-#endif // PASS == PASS_GB_FS
+#endif // PASS == PASS_GB
 
 // Store stuff for parallax mapping
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 void parallax(Mat4 modelViewMat)
 {
 	Vec3 n = in_normal;
@@ -104,7 +104,7 @@ void parallax(Mat4 modelViewMat)
 	out_eyeTangentSpace = invTbn * viewPos;
 	out_normalTangentSpace = invTbn * n;
 }
-#endif // PASS == PASS_GB_FS
+#endif // PASS == PASS_GB
 
 /// Will compute new position, normal and tangent
 #if BONES
@@ -121,7 +121,7 @@ void skinning()
 			F32 boneWeight = in_boneWeights[i];
 
 			position += (u_boneTransforms[boneIdx] * Vec4(g_position * boneWeight, 1.0)).xyz;
-#	if PASS == PASS_GB_FS
+#	if PASS == PASS_GB
 			normal += (u_boneTransforms[boneIdx] * Vec4(g_normal * boneWeight, 0.0)).xyz;
 			tangent += (u_boneTransforms[boneIdx] * Vec4(g_tangent.xyz * boneWeight, 0.0)).xyz;
 #	endif
@@ -129,14 +129,14 @@ void skinning()
 	}
 
 	g_position = position;
-#	if PASS == PASS_GB_FS
+#	if PASS == PASS_GB
 	g_tangent.xyz = tangent;
 	g_normal = normal;
 #	endif
 }
 #endif
 
-#if VELOCITY && PASS == PASS_GB_FS
+#if VELOCITY && PASS == PASS_GB
 void velocity(Mat4 prevMvp)
 {
 	Vec4 v4 = prevMvp * Vec4(g_position, 1.0);

+ 3 - 3
shaders/GBufferGeneric.glslp

@@ -5,7 +5,7 @@
 
 #pragma anki mutator instanced INSTANCE_COUNT 1 2 4 8 16 32 64
 #pragma anki mutator LOD 0 1 2
-#pragma anki mutator PASS 0 1 2
+#pragma anki mutator PASS 0 1 2 3
 #pragma anki mutator DIFFUSE_TEX 0 1
 #pragma anki mutator SPECULAR_TEX 0 1
 #pragma anki mutator ROUGHNESS_TEX 0 1
@@ -45,7 +45,7 @@ void main()
 	skinning();
 #endif
 
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 	positionUvNormalTangent(mvp, rotationMat);
 
 #	if PARALLAX
@@ -66,7 +66,7 @@ void main()
 
 void main()
 {
-#if PASS == PASS_GB_FS
+#if PASS == PASS_GB
 #	if heightTex_DEFINED
 	Vec2 uv = computeTextureCoordParallax(heightTex, in_uv, heightMapScale);
 #	else

+ 35 - 0
src/anki/collision/Tests.h

@@ -6,6 +6,7 @@
 #pragma once
 
 #include <anki/collision/Common.h>
+#include <anki/math/Vec4.h>
 
 namespace anki
 {
@@ -13,6 +14,40 @@ namespace anki
 /// @addtogroup collision
 /// @{
 
+/// Conservative cone vs sphere overlap test. See https://bartwronski.com/2017/04/13/cull-that-cone/
+/// @param coneOrigin The point the cone starts from. Its w must be zero.
+/// @param coneDir The axis of the cone, pointing away from coneOrigin. Its w must be zero. NOTE(review): the math
+///                uses V.dot(coneDir) as a length, so this is presumably expected to be normalized.
+/// @param coneLength The extent of the cone along coneDir.
+/// @param coneAngle The angle that is passed straight to cos() and sin(). NOTE(review): the referenced article
+///                  works with the half angle of the cone, confirm that the callers agree.
+/// @param sphereCenter The center of the sphere. Its w must be zero.
+/// @param sphereRadius The radius of the sphere.
+/// @return False if one of the angle, front or back tests rejects the sphere, true otherwise.
+inline Bool testConeVsSphere(const Vec4& coneOrigin,
+	const Vec4& coneDir,
+	F32 coneLength,
+	F32 coneAngle,
+	const Vec4& sphereCenter,
+	F32 sphereRadius)
+{
+	ANKI_ASSERT(coneOrigin.w() == 0.0f && sphereCenter.w() == 0.0f && coneDir.w() == 0.0f);
+	const Vec4 V = sphereCenter - coneOrigin;
+	const F32 VlenSq = V.dot(V);
+	const F32 V1len = V.dot(coneDir); // Signed distance of the sphere center along the cone axis
+
+	// Squared distance of the sphere center from the cone axis. Rounding can push it slightly below zero when the
+	// center lies (almost) on the axis, clamp it so that sqrt() never gets a negative argument
+	const F32 V2lenSq = VlenSq - V1len * V1len;
+	const F32 V2len = sqrt((V2lenSq > 0.0f) ? V2lenSq : 0.0f);
+	const F32 distanceClosestPoint = cos(coneAngle) * V2len - V1len * sin(coneAngle);
+
+	const Bool angleCull = distanceClosestPoint > sphereRadius; // Too far from the slanted side of the cone
+	const Bool frontCull = V1len > sphereRadius + coneLength; // Beyond the far end of the cone
+	const Bool backCull = V1len < -sphereRadius; // Behind the cone origin
+	return !(angleCull || frontCull || backCull);
+}
+
 /// Test if two collision shapes collide.
 Bool testCollisionShapes(const CollisionShape& a, const CollisionShape& b);
 /// @}

+ 1 - 1
src/anki/core/Config.cpp

@@ -18,7 +18,7 @@ Config::Config()
 	newOption("r.clusterSizeX", 32);
 	newOption("r.clusterSizeY", 26);
 	newOption("r.clusterSizeZ", 32);
-	newOption("r.avgObjectsPerCluster", 8);
+	newOption("r.avgObjectsPerCluster", 16);
 
 	newOption("r.shadowMapping.enabled", true);
 	newOption("r.shadowMapping.resolution", 512);

+ 13 - 9
src/anki/gr/Enums.h

@@ -575,21 +575,25 @@ enum class BufferUsageBit : U64
 
 	INDEX = 1 << 24,
 	VERTEX = 1 << 25,
-	INDIRECT = 1 << 26,
 
-	FILL = 1 << 27,
-	BUFFER_UPLOAD_SOURCE = 1 << 28,
-	BUFFER_UPLOAD_DESTINATION = 1 << 29, ///< Destination of buffer upload.
-	TEXTURE_UPLOAD_SOURCE = 1 << 30, ///< Source for texture upload.
-	QUERY_RESULT = 1u << 31u, ///< Destination to store query results.
+	INDIRECT_COMPUTE = 1 << 26,
+	INDIRECT_GRAPHICS = 1 << 27,
+	INDIRECT_ALL = INDIRECT_COMPUTE | INDIRECT_GRAPHICS,
+
+	FILL = 1 << 28,
+	BUFFER_UPLOAD_SOURCE = 1 << 29,
+	BUFFER_UPLOAD_DESTINATION = 1 << 30, ///< Destination of buffer upload.
+	TEXTURE_UPLOAD_SOURCE = 1ull << 31ull, ///< Source for texture upload.
+	QUERY_RESULT = 1ull << 32ull, ///< Destination to store query results.
 	TRANSFER_ALL_READ = BUFFER_UPLOAD_SOURCE | TEXTURE_UPLOAD_SOURCE,
 	TRANSFER_ALL_WRITE = FILL | BUFFER_UPLOAD_DESTINATION | QUERY_RESULT,
 	TRANSFER_ALL = TRANSFER_ALL_READ | TRANSFER_ALL_WRITE,
 
 	// Derived
-	ALL_GRAPHICS = UNIFORM_ALL_GRAPHICS | STORAGE_ALL_GRAPHICS | TEXTURE_ALL_GRAPHICS | INDEX | VERTEX | INDIRECT,
-	ALL_COMPUTE = UNIFORM_COMPUTE | STORAGE_COMPUTE_READ_WRITE | TEXTURE_COMPUTE | INDIRECT,
-	ALL_READ = UNIFORM_ALL | STORAGE_ALL_READ | TEXTURE_ALL | INDEX | VERTEX | INDIRECT | TRANSFER_ALL_READ,
+	ALL_GRAPHICS =
+		UNIFORM_ALL_GRAPHICS | STORAGE_ALL_GRAPHICS | TEXTURE_ALL_GRAPHICS | INDEX | VERTEX | INDIRECT_GRAPHICS,
+	ALL_COMPUTE = UNIFORM_COMPUTE | STORAGE_COMPUTE_READ_WRITE | TEXTURE_COMPUTE | INDIRECT_COMPUTE,
+	ALL_READ = UNIFORM_ALL | STORAGE_ALL_READ | TEXTURE_ALL | INDEX | VERTEX | INDIRECT_ALL | TRANSFER_ALL_READ,
 	ALL_WRITE = STORAGE_ALL_WRITE | TRANSFER_ALL_WRITE,
 };
 ANKI_ENUM_ALLOW_NUMERIC_OPERATIONS(BufferUsageBit, inline)

+ 2 - 1
src/anki/gr/RenderGraph.cpp

@@ -1163,7 +1163,8 @@ StringAuto RenderGraph::bufferUsageToStr(StackAllocator<U8>& alloc, BufferUsageB
 	ANKI_BUFF_USAGE(TEXTURE_COMPUTE);
 	ANKI_BUFF_USAGE(INDEX);
 	ANKI_BUFF_USAGE(VERTEX);
-	ANKI_BUFF_USAGE(INDIRECT);
+	ANKI_BUFF_USAGE(INDIRECT_COMPUTE);
+	ANKI_BUFF_USAGE(INDIRECT_GRAPHICS);
 	ANKI_BUFF_USAGE(FILL);
 	ANKI_BUFF_USAGE(BUFFER_UPLOAD_SOURCE);
 	ANKI_BUFF_USAGE(BUFFER_UPLOAD_DESTINATION);

+ 1 - 1
src/anki/gr/gl/CommandBuffer.cpp

@@ -1352,7 +1352,7 @@ void CommandBuffer::setBufferBarrier(
 		d |= GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT;
 	}
 
-	if(!!(all & BufferUsageBit::INDIRECT))
+	if(!!(all & BufferUsageBit::INDIRECT_ALL))
 	{
 		d |= GL_COMMAND_BARRIER_BIT;
 	}

+ 2 - 2
src/anki/gr/vulkan/BufferImpl.cpp

@@ -209,7 +209,7 @@ VkPipelineStageFlags BufferImpl::computePplineStage(BufferUsageBit usage)
 					 | VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT;
 	}
 
-	if(!!(usage & BufferUsageBit::INDIRECT))
+	if(!!(usage & BufferUsageBit::INDIRECT_ALL))
 	{
 		stageMask |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
 	}
@@ -272,7 +272,7 @@ VkAccessFlags BufferImpl::computeAccessMask(BufferUsageBit usage)
 		mask |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
 	}
 
-	if(!!(usage & BufferUsageBit::INDIRECT))
+	if(!!(usage & BufferUsageBit::INDIRECT_ALL))
 	{
 		mask |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
 	}

+ 2 - 2
src/anki/gr/vulkan/CommandBufferImpl.inl.h

@@ -280,7 +280,7 @@ inline void CommandBufferImpl::drawArraysIndirect(
 	m_state.setPrimitiveTopology(topology);
 	drawcallCommon();
 	const BufferImpl& impl = static_cast<const BufferImpl&>(*buff);
-	ANKI_ASSERT(impl.usageValid(BufferUsageBit::INDIRECT));
+	ANKI_ASSERT(impl.usageValid(BufferUsageBit::INDIRECT_GRAPHICS));
 	ANKI_ASSERT((offset % 4) == 0);
 	ANKI_ASSERT((offset + sizeof(DrawArraysIndirectInfo) * drawCount) <= impl.getSize());
 
@@ -294,7 +294,7 @@ inline void CommandBufferImpl::drawElementsIndirect(
 	m_state.setPrimitiveTopology(topology);
 	drawcallCommon();
 	const BufferImpl& impl = static_cast<const BufferImpl&>(*buff);
-	ANKI_ASSERT(impl.usageValid(BufferUsageBit::INDIRECT));
+	ANKI_ASSERT(impl.usageValid(BufferUsageBit::INDIRECT_GRAPHICS));
 	ANKI_ASSERT((offset % 4) == 0);
 	ANKI_ASSERT((offset + sizeof(DrawElementsIndirectInfo) * drawCount) <= impl.getSize());
 

+ 1 - 1
src/anki/gr/vulkan/Common.cpp

@@ -278,7 +278,7 @@ VkBufferUsageFlags convertBufferUsageBit(BufferUsageBit usageMask)
 		out |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
 	}
 
-	if(!!(usageMask & BufferUsageBit::INDIRECT))
+	if(!!(usageMask & BufferUsageBit::INDIRECT_ALL))
 	{
 		out |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
 	}

+ 1 - 0
src/anki/math/Vec4.h

@@ -6,6 +6,7 @@
 #pragma once
 
 #include <anki/math/CommonIncludes.h>
+#include <anki/math/Vec.h>
 
 namespace anki
 {

+ 191 - 86
src/anki/renderer/ClusterBin.cpp

@@ -9,6 +9,7 @@
 #include <anki/collision/Sphere.h>
 #include <anki/collision/Functions.h>
 #include <anki/collision/Tests.h>
+#include <anki/collision/Plane.h>
 #include <anki/util/ThreadHive.h>
 #include <anki/core/Config.h>
 #include <anki/core/Trace.h>
@@ -18,7 +19,6 @@ namespace anki
 
 static const U32 TYPED_OBJECT_COUNT = 4; // Point, spot, decal & probe
 static const F32 INVALID_TEXTURE_INDEX = -1.0;
-static const U32 MAX_TYPED_OBJECTS_PER_CLUSTER = 64;
 
 /// Get a view space point.
 static Vec4 unproject(const F32 zVspace, const Vec2& ndc, const Vec4& unprojParams)
@@ -32,20 +32,18 @@ static Vec4 unproject(const F32 zVspace, const Vec2& ndc, const Vec4& unprojPara
 	return view * zVspace;
 }
 
-/// https://bartwronski.com/2017/04/13/cull-that-cone/
-static Bool testConeVsSphere(
-	const Vec4& coneOrigin, const Vec4& coneDir, F32 coneLength, F32 coneAngle, const Sphere& sphere)
+template<typename TShape>
+static Bool insideClusterFrustum(const Array<Plane, 4>& planeArr, const TShape& shape)
 {
-	ANKI_ASSERT(coneOrigin.w() == 0.0f && sphere.getCenter().w() == 0.0f && coneDir.w() == 0.0f);
-	const Vec4 V = sphere.getCenter() - coneOrigin;
-	const F32 VlenSq = V.dot(V);
-	const F32 V1len = V.dot(coneDir);
-	const F32 distanceClosestPoint = cos(coneAngle) * sqrt(VlenSq - V1len * V1len) - V1len * sin(coneAngle);
-
-	const Bool angleCull = distanceClosestPoint > sphere.getRadius();
-	const Bool frontCull = V1len > sphere.getRadius() + coneLength;
-	const Bool backCull = V1len < -sphere.getRadius();
-	return !(angleCull || frontCull || backCull);
+	for(const Plane& plane : planeArr)
+	{
+		if(shape.testPlane(plane) < 0.0f)
+		{
+			return false;
+		}
+	}
+
+	return true;
 }
 
 /// Bin context.
@@ -227,7 +225,8 @@ void ClusterBin::binTile(U32 tileIdx, BinCtx& ctx)
 	const U tileY = tileIdx / m_clusterCounts[0];
 
 	// Compute the tile's cluster edges in view space
-	Vec4* clusterEdgesVSpace = &m_clusterEdges[tileIdx * (m_clusterCounts[2] + 1) * 4];
+	WeakArray<Vec4> clusterEdgesVSpace(
+		&m_clusterEdges[tileIdx * (m_clusterCounts[2] + 1) * 4], (m_clusterCounts[2] + 1) * 4);
 	if(ctx.m_clusterEdgesDirty)
 	{
 		const Vec2 tileSize = 2.0f / Vec2(m_clusterCounts[0], m_clusterCounts[1]);
@@ -258,11 +257,20 @@ void ClusterBin::binTile(U32 tileIdx, BinCtx& ctx)
 		clusterEdgesWSpace[idx + 3] = (ctx.m_in->m_renderQueue->m_cameraTransform * clusterEdgesVSpace[idx + 3]).xyz0();
 	}
 
-	// For all clusters in the tile
+	// Compute the tile frustum
+	Array<Plane, 4> frustumPlanes;
+	frustumPlanes[0].setFrom3Points(clusterEdgesWSpace[0], clusterEdgesWSpace[1], clusterEdgesWSpace[5]);
+	frustumPlanes[1].setFrom3Points(clusterEdgesWSpace[2], clusterEdgesWSpace[1], clusterEdgesWSpace[5]);
+	frustumPlanes[2].setFrom3Points(clusterEdgesWSpace[2], clusterEdgesWSpace[3], clusterEdgesWSpace[7]);
+	frustumPlanes[3].setFrom3Points(clusterEdgesWSpace[7], clusterEdgesWSpace[3], clusterEdgesWSpace[0]);
+
+	// Compute the cluster AABBs and spheres
+	DynamicArrayAuto<Aabb> clusterBoxes(ctx.m_in->m_tempAlloc);
+	clusterBoxes.create(m_clusterCounts[2]);
+	DynamicArrayAuto<Sphere> clusterSpheres(ctx.m_in->m_tempAlloc);
+	clusterSpheres.create(m_clusterCounts[2]);
 	for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
 	{
-		const U clusterIdx = clusterZ * (m_clusterCounts[0] * m_clusterCounts[1]) + tileY * m_clusterCounts[0] + tileX;
-
 		// Compute an AABB and a sphere that contains the cluster
 		Vec4 aabbMin(MAX_F32, MAX_F32, MAX_F32, 0.0f);
 		Vec4 aabbMax(MIN_F32, MIN_F32, MIN_F32, 0.0f);
@@ -272,104 +280,205 @@ void ClusterBin::binTile(U32 tileIdx, BinCtx& ctx)
 			aabbMax = aabbMax.max(clusterEdgesWSpace[clusterZ * 4 + i]);
 		}
 
-		const Aabb clusterBox(aabbMin, aabbMax);
+		clusterBoxes[clusterZ] = Aabb(aabbMin, aabbMax);
 
 		const Vec4 sphereCenter = (aabbMin + aabbMax) / 2.0f;
-		const Sphere clusterSphere(sphereCenter, (aabbMin - sphereCenter).getLength());
+		clusterSpheres[clusterZ] = Sphere(sphereCenter, (aabbMin - sphereCenter).getLength());
+	}
+
+	// Allocate temp indices for each cluster
+	DynamicArrayAuto<U32> indices(ctx.m_in->m_tempAlloc);
+	const U32 avgIndicesPerCluster = m_indexCount / m_totalClusterCount;
+	indices.create(m_clusterCounts[2] * avgIndicesPerCluster);
 
-		// Bin decals
-		Array<U32, MAX_TYPED_OBJECTS_PER_CLUSTER> objectIndices;
-		U32* pObjectIndex = &objectIndices[0];
-		const U32* pObjectIndexEnd = &objectIndices[0] + objectIndices.getSize();
-		(void)pObjectIndexEnd;
+	DynamicArrayAuto<U32*> pIndices(ctx.m_in->m_tempAlloc);
+	pIndices.create(m_clusterCounts[2]);
+	DynamicArrayAuto<U32*> pCounts(ctx.m_in->m_tempAlloc);
+	pCounts.create(m_clusterCounts[2]);
+	for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+	{
+		pIndices[clusterZ] = &indices[clusterZ * avgIndicesPerCluster];
+	}
+
+	// Decals
+	{
+		for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+		{
+			pCounts[clusterZ] = pIndices[clusterZ];
+			*pCounts[clusterZ] = 0;
+			++pIndices[clusterZ];
+		}
 
-		U idx = 0;
 		Obb decalBox;
-		U32* pObjectCount = pObjectIndex;
-		++pObjectIndex;
-		for(const DecalQueueElement& decal : ctx.m_in->m_renderQueue->m_decals)
+		for(U i = 0; i < ctx.m_in->m_renderQueue->m_decals.getSize(); ++i)
 		{
+			const DecalQueueElement& decal = ctx.m_in->m_renderQueue->m_decals[i];
 			decalBox.setCenter(decal.m_obbCenter.xyz0());
 			decalBox.setRotation(Mat3x4(decal.m_obbRotation));
 			decalBox.setExtend(decal.m_obbExtend.xyz0());
-			if(testCollisionShapes(decalBox, clusterBox))
+
+			if(!insideClusterFrustum(frustumPlanes, decalBox))
 			{
-				ANKI_ASSERT(pObjectIndex < pObjectIndexEnd);
-				*pObjectIndex = idx;
-				++pObjectIndex;
+				continue;
 			}
 
-			++idx;
+			for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+			{
+				if(!testCollisionShapes(decalBox, clusterBoxes[clusterZ]))
+				{
+					continue;
+				}
+
+				const U32 count = pIndices[clusterZ] - &indices[clusterZ * avgIndicesPerCluster];
+				if(ANKI_UNLIKELY(count + 3 >= avgIndicesPerCluster))
+				{
+					ANKI_R_LOGW("Out of cluster indices. Increase r.avgObjectsPerCluster");
+					continue;
+				}
+
+				*pIndices[clusterZ] = i;
+				*pCounts[clusterZ] += 1;
+				++pIndices[clusterZ];
+			}
 		}
+	}
 
-		*pObjectCount = pObjectIndex - pObjectCount - 1;
+	// Point lights
+	{
+		for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+		{
+			pCounts[clusterZ] = pIndices[clusterZ];
+			*pCounts[clusterZ] = 0;
+			++pIndices[clusterZ];
+		}
 
-		// Bin the point lights
-		idx = 0;
 		Sphere lightSphere;
-		pObjectCount = pObjectIndex;
-		++pObjectIndex;
-		for(const PointLightQueueElement& plight : ctx.m_in->m_renderQueue->m_pointLights)
+		for(U i = 0; i < ctx.m_in->m_renderQueue->m_pointLights.getSize(); ++i)
 		{
+			const PointLightQueueElement& plight = ctx.m_in->m_renderQueue->m_pointLights[i];
 			lightSphere.setCenter(plight.m_worldPosition.xyz0());
 			lightSphere.setRadius(plight.m_radius);
-			if(testCollisionShapes(lightSphere, clusterBox))
+
+			if(!insideClusterFrustum(frustumPlanes, lightSphere))
 			{
-				ANKI_ASSERT(pObjectIndex < pObjectIndexEnd);
-				*pObjectIndex = idx;
-				++pObjectIndex;
+				continue;
 			}
 
-			++idx;
+			for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+			{
+				if(!testCollisionShapes(lightSphere, clusterBoxes[clusterZ]))
+				{
+					continue;
+				}
+
+				const U32 count = pIndices[clusterZ] - &indices[clusterZ * avgIndicesPerCluster];
+				if(ANKI_UNLIKELY(count + 2 >= avgIndicesPerCluster))
+				{
+					ANKI_R_LOGW("Out of cluster indices. Increase r.avgObjectsPerCluster");
+					continue;
+				}
+
+				*pIndices[clusterZ] = i;
+				*pCounts[clusterZ] += 1;
+				++pIndices[clusterZ];
+			}
 		}
+	}
 
-		*pObjectCount = pObjectIndex - pObjectCount - 1;
+	// Spot lights
+	{
+		for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+		{
+			pCounts[clusterZ] = pIndices[clusterZ];
+			*pCounts[clusterZ] = 0;
+			++pIndices[clusterZ];
+		}
 
-		// Bin the spot lights
-		idx = 0;
-		pObjectCount = pObjectIndex;
-		++pObjectIndex;
-		for(const SpotLightQueueElement& slight : ctx.m_in->m_renderQueue->m_spotLights)
+		PerspectiveFrustum slightFrustum;
+		for(U i = 0; i < ctx.m_in->m_renderQueue->m_spotLights.getSize(); ++i)
 		{
-			if(testConeVsSphere(slight.m_worldTransform.getTranslationPart(),
-				   -slight.m_worldTransform.getZAxis(),
-				   slight.m_distance,
-				   slight.m_outerAngle,
-				   clusterSphere))
+			const SpotLightQueueElement& slight = ctx.m_in->m_renderQueue->m_spotLights[i];
+			slightFrustum.setAll(slight.m_outerAngle, slight.m_outerAngle, 0.01f, slight.m_distance);
+			slightFrustum.transform(Transform(slight.m_worldTransform));
+
+			if(!insideClusterFrustum(frustumPlanes, slightFrustum))
 			{
-				ANKI_ASSERT(pObjectIndex < pObjectIndexEnd);
-				*pObjectIndex = idx;
-				++pObjectIndex;
+				continue;
 			}
 
-			++idx;
+			for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+			{
+				if(!testConeVsSphere(slight.m_worldTransform.getTranslationPart().xyz0(),
+					   -slight.m_worldTransform.getZAxis(),
+					   slight.m_distance,
+					   slight.m_outerAngle,
+					   clusterSpheres[clusterZ].getCenter(),
+					   clusterSpheres[clusterZ].getRadius()))
+				{
+					continue;
+				}
+
+				const U32 count = pIndices[clusterZ] - &indices[clusterZ * avgIndicesPerCluster];
+				if(ANKI_UNLIKELY(count + 1 >= avgIndicesPerCluster))
+				{
+					ANKI_R_LOGW("Out of cluster indices. Increase r.avgObjectsPerCluster");
+					continue;
+				}
+
+				*pIndices[clusterZ] = i;
+				*pCounts[clusterZ] += 1;
+				++pIndices[clusterZ];
+			}
 		}
+	}
 
-		*pObjectCount = pObjectIndex - pObjectCount - 1;
+	// Probes
+	{
+		for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+		{
+			pCounts[clusterZ] = pIndices[clusterZ];
+			*pCounts[clusterZ] = 0;
+			++pIndices[clusterZ];
+		}
 
-		// Bin probes
-		idx = 0;
 		Aabb probeBox;
-		pObjectCount = pObjectIndex;
-		++pObjectIndex;
-		for(const ReflectionProbeQueueElement& probe : ctx.m_in->m_renderQueue->m_reflectionProbes)
+		for(U i = 0; i < ctx.m_in->m_renderQueue->m_reflectionProbes.getSize(); ++i)
 		{
+			const ReflectionProbeQueueElement& probe = ctx.m_in->m_renderQueue->m_reflectionProbes[i];
 			probeBox.setMin(probe.m_aabbMin);
 			probeBox.setMax(probe.m_aabbMax);
-			if(testCollisionShapes(probeBox, clusterBox))
+
+			if(!insideClusterFrustum(frustumPlanes, probeBox))
 			{
-				ANKI_ASSERT(pObjectIndex < pObjectIndexEnd);
-				*pObjectIndex = idx;
-				++pObjectIndex;
+				continue;
 			}
 
-			++idx;
+			for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+			{
+				if(!testCollisionShapes(probeBox, clusterBoxes[clusterZ]))
+				{
+					continue;
+				}
+
+				const U32 count = pIndices[clusterZ] - &indices[clusterZ * avgIndicesPerCluster];
+				if(ANKI_UNLIKELY(count >= avgIndicesPerCluster))
+				{
+					ANKI_R_LOGW("Out of cluster indices. Increase r.avgObjectsPerCluster");
+					continue;
+				}
+
+				*pIndices[clusterZ] = i;
+				*pCounts[clusterZ] += 1;
+				++pIndices[clusterZ];
+			}
 		}
+	}
 
-		*pObjectCount = pObjectIndex - pObjectCount - 1;
-
-		// Allocate and store indices for the cluster
-		U indexCount = pObjectIndex - &objectIndices[0];
+	// Upload the indices for all clusters of the tile
+	for(U clusterZ = 0; clusterZ < m_clusterCounts[2]; ++clusterZ)
+	{
+		const U indexCount = pIndices[clusterZ] - &indices[clusterZ * avgIndicesPerCluster];
+		ANKI_ASSERT(indexCount <= avgIndicesPerCluster);
 		ANKI_ASSERT(indexCount >= TYPED_OBJECT_COUNT);
 
 		U firstIndex;
@@ -378,17 +487,11 @@ void ClusterBin::binTile(U32 tileIdx, BinCtx& ctx)
 			// Have some objects to bin
 
 			firstIndex = ctx.m_allocatedIndexCount.fetchAdd(indexCount);
+			ANKI_ASSERT(firstIndex + indexCount <= ctx.m_lightIds.getSize());
 
-			if(firstIndex + indexCount <= ctx.m_lightIds.getSize())
-			{
-				memcpy(&ctx.m_lightIds[firstIndex], &objectIndices[0], sizeof(objectIndices[0]) * indexCount);
-			}
-			else
-			{
-				ANKI_R_LOGW("Out of cluster indices. Increase r.avgObjectsPerCluster");
-				firstIndex = 0;
-				indexCount = TYPED_OBJECT_COUNT;
-			}
+			memcpy(&ctx.m_lightIds[firstIndex],
+				&indices[clusterZ * avgIndicesPerCluster],
+				sizeof(ctx.m_lightIds[firstIndex]) * indexCount);
 		}
 		else
 		{
@@ -397,7 +500,9 @@ void ClusterBin::binTile(U32 tileIdx, BinCtx& ctx)
 		}
 
 		// Write the cluster
-		ctx.m_clusters[clusterIdx] = firstIndex;
+		const U clusterIndex =
+			clusterZ * (m_clusterCounts[0] * m_clusterCounts[1]) + tileY * m_clusterCounts[0] + tileX;
+		ctx.m_clusters[clusterIndex] = firstIndex;
 	}
 }
 

+ 1 - 1
src/anki/renderer/Dbg.cpp

@@ -76,7 +76,7 @@ void Dbg::run(RenderPassWorkContext& rgraphCtx, const RenderingContext& ctx)
 	dctx.m_cameraTransform = ctx.m_renderQueue->m_viewMatrix.getInverse();
 	dctx.m_stagingGpuAllocator = &m_r->getStagingGpuMemoryManager();
 	dctx.m_commandBuffer = cmdb;
-	dctx.m_key = RenderingKey(Pass::GB_FS, 0, 1, false, false);
+	dctx.m_key = RenderingKey(Pass::FS, 0, 1, false, false);
 	dctx.m_debugDraw = true;
 	dctx.m_debugDrawFlags = m_debugDrawFlags;
 

+ 2 - 2
src/anki/renderer/ForwardShading.cpp

@@ -134,7 +134,7 @@ void ForwardShading::run(RenderingContext& ctx, RenderPassWorkContext& rgraphCtx
 		cmdb->setDepthWrite(false);
 
 		// Start drawing
-		m_r->getSceneDrawer().drawRange(Pass::GB_FS,
+		m_r->getSceneDrawer().drawRange(Pass::FS,
 			ctx.m_matrices.m_view,
 			ctx.m_matrices.m_viewProjectionJitter,
 			ctx.m_prevMatrices.m_viewProjectionJitter,
@@ -181,7 +181,7 @@ void ForwardShading::populateRenderGraph(RenderingContext& ctx)
 
 	if(ctx.m_renderQueue->m_lensFlares.getSize())
 	{
-		pass.newDependency({m_r->getLensFlare().getIndirectDrawBuffer(), BufferUsageBit::INDIRECT});
+		pass.newDependency({m_r->getLensFlare().getIndirectDrawBuffer(), BufferUsageBit::INDIRECT_GRAPHICS});
 	}
 }
 

+ 1 - 1
src/anki/renderer/GBuffer.cpp

@@ -125,7 +125,7 @@ void GBuffer::runInThread(const RenderingContext& ctx, RenderPassWorkContext& rg
 		cmdb->setDepthCompareOperation(CompareOperation::LESS_EQUAL);
 
 		ANKI_ASSERT(colorStart < colorEnd && colorEnd <= I32(ctx.m_renderQueue->m_renderables.getSize()));
-		m_r->getSceneDrawer().drawRange(Pass::GB_FS,
+		m_r->getSceneDrawer().drawRange(Pass::GB,
 			ctx.m_matrices.m_view,
 			ctx.m_matrices.m_viewProjectionJitter,
 			ctx.m_matrices.m_jitter * ctx.m_prevMatrices.m_viewProjection,

+ 1 - 1
src/anki/renderer/Indirect.cpp

@@ -347,7 +347,7 @@ void Indirect::runGBuffer(CommandBufferPtr& cmdb)
 
 		if(!rqueue.m_renderables.isEmpty())
 		{
-			m_r->getSceneDrawer().drawRange(Pass::GB_FS,
+			m_r->getSceneDrawer().drawRange(Pass::GB,
 				rqueue.m_viewMatrix,
 				rqueue.m_viewProjectionMatrix,
 				Mat4::getIdentity(), // Don't care about prev mats

+ 1 - 1
src/anki/renderer/LensFlare.cpp

@@ -69,7 +69,7 @@ Error LensFlare::initOcclusion(const ConfigSet& config)
 	GrManager& gr = getGrManager();
 
 	m_indirectBuff = gr.newBuffer(BufferInitInfo(m_maxFlares * sizeof(DrawArraysIndirectInfo),
-		BufferUsageBit::INDIRECT | BufferUsageBit::STORAGE_COMPUTE_WRITE,
+		BufferUsageBit::INDIRECT_GRAPHICS | BufferUsageBit::STORAGE_COMPUTE_WRITE,
 		BufferMapAccessBit::NONE,
 		"LensFlares"));
 

+ 2 - 2
src/anki/resource/ModelResource.cpp

@@ -45,7 +45,7 @@ void ModelPatch::getRenderingDataSub(
 	// Vertex attributes
 	U32 positionBinding = MAX_U32;
 	{
-		if(key.m_pass == Pass::GB_FS)
+		if(key.m_pass == Pass::GB || key.m_pass == Pass::FS)
 		{
 			// All attributes
 
@@ -84,7 +84,7 @@ void ModelPatch::getRenderingDataSub(
 
 	// Vertex buffers
 	{
-		if(key.m_pass == Pass::GB_FS)
+		if(key.m_pass == Pass::GB || key.m_pass == Pass::FS)
 		{
 			// All attributes
 

+ 1 - 1
src/anki/resource/ParticleEmitterResource.cpp

@@ -145,7 +145,7 @@ void ParticleEmitterResource::getRenderingInfo(U lod, ShaderProgramPtr& prog) co
 {
 	lod = min<U>(lod, m_lodCount - 1);
 
-	RenderingKey key(Pass::GB_FS, lod, 1, false, false);
+	RenderingKey key(Pass::FS, lod, 1, false, false);
 	const MaterialVariant& variant = m_material->getOrCreateVariant(key);
 	prog = variant.getShaderProgram();
 }

+ 3 - 2
src/anki/resource/RenderingKey.h

@@ -14,7 +14,8 @@ namespace anki
 /// The AnKi passes visible to materials.
 enum class Pass : U8
 {
-	GB_FS, ///< GBuffer or forward shading.
+	GB, ///< GBuffer.
+	FS, ///< Forward shading.
 	SM, ///< Shadow mapping.
 	EZ, ///< Early Z.
 	COUNT
@@ -42,7 +43,7 @@ public:
 	}
 
 	RenderingKey()
-		: RenderingKey(Pass::GB_FS, 0, 1, false, false)
+		: RenderingKey(Pass::GB, 0, 1, false, false)
 	{
 	}
 

+ 2 - 2
src/anki/scene/ModelNode.cpp

@@ -96,7 +96,7 @@ void ModelPatchNode::drawCallback(RenderQueueDrawContext& ctx, ConstWeakArray<vo
 	}
 
 	ModelRenderingInfo modelInf;
-	ctx.m_key.m_velocity = moved;
+	ctx.m_key.m_velocity = moved && ctx.m_key.m_pass == Pass::GB;
 	self.m_modelPatch->getRenderingDataSub(ctx.m_key, WeakArray<U8>(), modelInf);
 
 	// Program
@@ -306,7 +306,7 @@ void ModelNode::drawCallback(RenderQueueDrawContext& ctx, ConstWeakArray<void*>
 			cmdb->bindStorageBuffer(0, 0, token.m_buffer, token.m_offset, token.m_range);
 		}
 
-		ctx.m_key.m_velocity = moved;
+		ctx.m_key.m_velocity = moved && ctx.m_key.m_pass == Pass::GB;
 		ModelRenderingInfo modelInf;
 		patch->getRenderingDataSub(ctx.m_key, WeakArray<U8>(), modelInf);