Panagiotis Christopoulos Charitos committed 2 years ago
parent
commit 37af33e088

+ 2 - 0
AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h

@@ -20,6 +20,7 @@ class GpuVisibleTransientMemoryAllocation
 public:
 	Buffer* m_buffer = nullptr;
 	PtrSize m_offset = kMaxPtrSize;
+	PtrSize m_size = 0;
 };
 
 /// GPU only transient memory. Used for temporary allocations. Allocations will get reset after each frame.
@@ -33,6 +34,7 @@ public:
 	{
 		GpuVisibleTransientMemoryAllocation out;
 		m_pool.allocate(size, out.m_offset, out.m_buffer);
+		out.m_size = size;
 		return out;
 	}
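
With the new m_size field an allocation now fully describes its sub-range of the shared pool buffer, which is what RenderGraph::importBuffer() needs. A minimal usage sketch (mirroring GpuVisibility.cpp further down; count and rgraph are placeholders):

	const GpuVisibleTransientMemoryAllocation alloc =
		GpuVisibleTransientMemoryPool::getSingleton().allocate(count * sizeof(DrawIndexedIndirectInfo));

	// The imported handle covers only [m_offset, m_offset + m_size) of the shared pool buffer.
	const BufferHandle handle =
		rgraph.importBuffer(BufferPtr(alloc.m_buffer), BufferUsageBit::kNone, alloc.m_offset, alloc.m_size);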
 

+ 8 - 6
AnKi/Gr/RenderGraph.cpp

@@ -41,7 +41,7 @@ public:
 };
 
 /// Same as RT but for buffers.
-class RenderGraph::Buffer
+class RenderGraph::BufferRange
 {
 public:
 	BufferUsageBit m_usage;
@@ -177,7 +177,7 @@ public:
 	BitSet<kMaxRenderGraphPasses, U64> m_passIsInBatch{false};
 	DynamicArray<Batch, MemoryPoolPtrWrapper<StackMemoryPool>> m_batches;
 	DynamicArray<RT, MemoryPoolPtrWrapper<StackMemoryPool>> m_rts;
-	DynamicArray<Buffer, MemoryPoolPtrWrapper<StackMemoryPool>> m_buffers;
+	DynamicArray<BufferRange, MemoryPoolPtrWrapper<StackMemoryPool>> m_buffers;
 	DynamicArray<AS, MemoryPoolPtrWrapper<StackMemoryPool>> m_as;
 
 	DynamicArray<CommandBufferPtr, MemoryPoolPtrWrapper<StackMemoryPool>> m_graphicsCmdbs;
@@ -350,7 +350,7 @@ void RenderGraph::reset()
 		rt.m_texture.reset(nullptr);
 	}
 
-	for(Buffer& buff : m_ctx->m_buffers)
+	for(BufferRange& buff : m_ctx->m_buffers)
 	{
 		buff.m_buffer.reset(nullptr);
 	}
@@ -1246,10 +1246,12 @@ TexturePtr RenderGraph::getTexture(RenderTargetHandle handle) const
 	return m_ctx->m_rts[handle.m_idx].m_texture;
 }
 
-BufferPtr RenderGraph::getBuffer(BufferHandle handle) const
+void RenderGraph::getCachedBuffer(BufferHandle handle, Buffer*& buff, PtrSize& offset, PtrSize& range) const
 {
-	ANKI_ASSERT(m_ctx->m_buffers[handle.m_idx].m_buffer.isCreated());
-	return m_ctx->m_buffers[handle.m_idx].m_buffer;
+	const BufferRange& record = m_ctx->m_buffers[handle.m_idx];
+	buff = record.m_buffer.get();
+	offset = record.m_offset;
+	range = record.m_range;
 }
 
 AccelerationStructurePtr RenderGraph::getAs(AccelerationStructureHandle handle) const

+ 11 - 9
AnKi/Gr/RenderGraph.h

@@ -121,7 +121,7 @@ public:
 	U32 m_currentSecondLevelCommandBufferIndex ANKI_DEBUG_CODE(= 0);
 	U32 m_secondLevelCommandBufferCount ANKI_DEBUG_CODE(= 0);
 
-	void getBufferState(BufferHandle handle, BufferPtr& buff) const;
+	void getBufferState(BufferHandle handle, Buffer*& buff, PtrSize& offset, PtrSize& range) const;
 
 	void getRenderTargetState(RenderTargetHandle handle, const TextureSubresourceInfo& subresource,
 							  TexturePtr& tex) const;
@@ -206,17 +206,19 @@ public:
 	/// Convenience method.
 	void bindStorageBuffer(U32 set, U32 binding, BufferHandle handle)
 	{
-		BufferPtr buff;
-		getBufferState(handle, buff);
-		m_commandBuffer->bindStorageBuffer(set, binding, buff, 0, kMaxPtrSize);
+		Buffer* buff;
+		PtrSize offset, range;
+		getBufferState(handle, buff, offset, range);
+		m_commandBuffer->bindStorageBuffer(set, binding, BufferPtr(buff), offset, range);
 	}
 
 	/// Convenience method.
 	void bindUniformBuffer(U32 set, U32 binding, BufferHandle handle)
 	{
-		BufferPtr buff;
-		getBufferState(handle, buff);
-		m_commandBuffer->bindUniformBuffer(set, binding, buff, 0, kMaxPtrSize);
+		Buffer* buff;
+		PtrSize offset, range;
+		getBufferState(handle, buff, offset, range);
+		m_commandBuffer->bindUniformBuffer(set, binding, BufferPtr(buff), offset, range);
 	}
 
 	/// Convenience method.
@@ -685,7 +687,7 @@ private:
 	class Pass;
 	class Batch;
 	class RT;
-	class Buffer;
+	class BufferRange;
 	class AS;
 	class TextureBarrier;
 	class BufferBarrier;
@@ -764,7 +766,7 @@ private:
 	/// @}
 
 	TexturePtr getTexture(RenderTargetHandle handle) const;
-	BufferPtr getBuffer(BufferHandle handle) const;
+	void getCachedBuffer(BufferHandle handle, Buffer*& buff, PtrSize& offset, PtrSize& range) const;
 	AccelerationStructurePtr getAs(AccelerationStructureHandle handle) const;
 };
 /// @}
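
Code that queries a buffer manually inside a pass work callback needs the same three out-parameters as the convenience helpers above. A sketch of the migration, assuming a hypothetical handle m_someBufferHandle captured by the pass:

	pass.setWork([this](RenderPassWorkContext& rpass) {
		Buffer* buff;
		PtrSize offset, range;
		rpass.getBufferState(m_someBufferHandle, buff, offset, range);

		// offset/range now describe the imported sub-range instead of 0/kMaxPtrSize.
		rpass.m_commandBuffer->bindStorageBuffer(0, 0, BufferPtr(buff), offset, range);
	});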

+ 3 - 2
AnKi/Gr/RenderGraph.inl.h

@@ -12,9 +12,10 @@ inline void RenderPassWorkContext::bindAccelerationStructure(U32 set, U32 bindin
 	m_commandBuffer->bindAccelerationStructure(set, binding, m_rgraph->getAs(handle));
 }
 
-inline void RenderPassWorkContext::getBufferState(BufferHandle handle, BufferPtr& buff) const
+inline void RenderPassWorkContext::getBufferState(BufferHandle handle, Buffer*& buff, PtrSize& offset,
+												  PtrSize& range) const
 {
-	buff = m_rgraph->getBuffer(handle);
+	m_rgraph->getCachedBuffer(handle, buff, offset, range);
 }
 
 inline void RenderPassWorkContext::getRenderTargetState(RenderTargetHandle handle,

+ 112 - 1
AnKi/Renderer/GpuVisibility.cpp

@@ -5,16 +5,127 @@
 
 #include <AnKi/Renderer/GpuVisibility.h>
 #include <AnKi/Renderer/Renderer.h>
+#include <AnKi/Renderer/HiZ.h>
 #include <AnKi/Scene/RenderStateBucket.h>
+#include <AnKi/Scene/ContiguousArrayAllocator.h>
 #include <AnKi/Core/GpuMemory/GpuVisibleTransientMemoryPool.h>
+#include <AnKi/Core/GpuMemory/RebarTransientMemoryPool.h>
+#include <AnKi/Core/GpuMemory/GpuSceneBuffer.h>
+#include <AnKi/Collision/Functions.h>
 
 namespace anki {
 
+Error GpuVisibility::init()
+{
+	ANKI_CHECK(loadShaderProgram("ShaderBinaries/GpuVisibility.ankiprogbin", m_prog, m_grProg));
+
+	return Error::kNone;
+}
+
 void GpuVisibility::populateRenderGraph(RenderingContext& ctx)
 {
+	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
+
+	U32 activeBucketCount = 0;
+	U32 aabbCount = 0;
+	RenderStateBucketContainer::getSingleton().iterateBuckets(RenderingTechnique::kGBuffer,
+															  [&](const RenderStateInfo&, U32 userCount) {
+																  ++activeBucketCount;
+																  aabbCount += userCount;
+															  });
+
+	ANKI_ASSERT(aabbCount
+				== AllGpuSceneContiguousArrays::getSingleton().getElementCount(
+					GpuSceneContiguousArrayType::kRenderableBoundingVolumesGBuffer));
+	aabbCount = AllGpuSceneContiguousArrays::getSingleton().getElementCount(
+		GpuSceneContiguousArrayType::kRenderableBoundingVolumesGBuffer);
+
 	// Allocate memory for the indirect commands
+	const GpuVisibleTransientMemoryAllocation indirectCalls =
+		GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(DrawIndexedIndirectInfo));
+	const GpuVisibleTransientMemoryAllocation instanceRateRenderables =
+		GpuVisibleTransientMemoryPool::getSingleton().allocate(aabbCount * sizeof(GpuSceneRenderable));
+
+	// Allocate and fill the offsets to the atomic counters for each bucket
+	RebarAllocation atomicsAlloc;
+	U32* atomics = static_cast<U32*>(
+		RebarTransientMemoryPool::getSingleton().allocateFrame(activeBucketCount * sizeof(U32), atomicsAlloc));
+	U32 count = 0;
+	activeBucketCount = 0;
+	RenderStateBucketContainer::getSingleton().iterateBuckets(RenderingTechnique::kGBuffer,
+															  [&](const RenderStateInfo&, U32 userCount) {
+																  atomics[activeBucketCount] = count;
+																  count += userCount;
+																  ++activeBucketCount;
+															  });
+
+	// Import buffers
+	m_runCtx.m_instanceRateRenderables =
+		rgraph.importBuffer(BufferPtr(instanceRateRenderables.m_buffer), BufferUsageBit::kNone,
+							instanceRateRenderables.m_offset, instanceRateRenderables.m_size);
+	m_runCtx.m_drawIndexedIndirects = rgraph.importBuffer(BufferPtr(indirectCalls.m_buffer), BufferUsageBit::kNone,
+														  indirectCalls.m_offset, indirectCalls.m_size);
+	m_runCtx.m_drawIndirectOffsets =
+		rgraph.importBuffer(RebarTransientMemoryPool::getSingleton().getBuffer(), BufferUsageBit::kNone,
+							atomicsAlloc.m_offset, atomicsAlloc.m_range);
+
+	// Create the renderpass
+	constexpr BufferUsageBit bufferUsage = BufferUsageBit::kStorageComputeRead;
+	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("GPU occlusion GBuffer");
+
+	pass.newBufferDependency(getRenderer().getGpuSceneBufferHandle(), bufferUsage);
+	pass.newTextureDependency(getRenderer().getHiZ().getHiZRt(), TextureUsageBit::kSampledCompute);
+	pass.newBufferDependency(m_runCtx.m_instanceRateRenderables, bufferUsage);
+	pass.newBufferDependency(m_runCtx.m_drawIndexedIndirects, bufferUsage);
+	pass.newBufferDependency(m_runCtx.m_drawIndirectOffsets, bufferUsage);
+
+	pass.setWork([this, &ctx](RenderPassWorkContext& rpass) {
+		CommandBufferPtr& cmdb = rpass.m_commandBuffer;
+
+		cmdb->bindShaderProgram(m_grProg);
+
+		cmdb->bindStorageBuffer(0, 0, GpuSceneBuffer::getSingleton().getBuffer(),
+								AllGpuSceneContiguousArrays::getSingleton().getArrayBase(
+									GpuSceneContiguousArrayType::kRenderableBoundingVolumesGBuffer),
+								AllGpuSceneContiguousArrays::getSingleton().getElementCount(
+									GpuSceneContiguousArrayType::kRenderableBoundingVolumesGBuffer)
+									* sizeof(GpuSceneRenderableAabb));
+
+		cmdb->bindStorageBuffer(
+			0, 1, GpuSceneBuffer::getSingleton().getBuffer(),
+			AllGpuSceneContiguousArrays::getSingleton().getArrayBase(GpuSceneContiguousArrayType::kRenderables),
+			AllGpuSceneContiguousArrays::getSingleton().getElementCount(GpuSceneContiguousArrayType::kRenderables)
+				* sizeof(GpuSceneRenderable));
+
+		cmdb->bindStorageBuffer(0, 2, GpuSceneBuffer::getSingleton().getBuffer(), 0, kMaxPtrSize);
+
+		rpass.bindColorTexture(0, 3, getRenderer().getHiZ().getHiZRt());
+
+		rpass.bindStorageBuffer(0, 4, m_runCtx.m_instanceRateRenderables);
+		rpass.bindStorageBuffer(0, 5, m_runCtx.m_drawIndexedIndirects);
+		rpass.bindStorageBuffer(0, 6, m_runCtx.m_drawIndirectOffsets);
+
+		struct Uniforms
+		{
+			Vec4 m_clipPlanes[6u];
+
+			UVec3 m_padding;
+			U32 m_aabbCount;
+		} unis;
+
+		Array<Plane, 6> planes;
+		extractClipPlanes(ctx.m_matrices.m_viewProjection, planes);
+		for(U32 i = 0; i < 6; ++i)
+		{
+			unis.m_clipPlanes[i] = Vec4(planes[i].getNormal().xyz(), planes[i].getOffset());
+		}
+
+		unis.m_aabbCount = AllGpuSceneContiguousArrays::getSingleton().getElementCount(
+			GpuSceneContiguousArrayType::kRenderableBoundingVolumesGBuffer);
+		cmdb->setPushConstants(&unis, sizeof(unis));
 
-	// TODO
+		dispatchPPCompute(cmdb, 64, 1, 1, unis.m_aabbCount, 1, 1);
+	});
 }
 
 } // end namespace anki
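
The final dispatch runs one thread per AABB against the shader's [numthreads(64, 1, 1)]; dispatchPPCompute presumably rounds the element count up to whole workgroups. Roughly the equivalent manual dispatch (a sketch, not the helper's actual implementation):

	const U32 workgroupSizeX = 64; // must match [numthreads(64, 1, 1)] in GpuVisibility.ankiprog
	const U32 groupCountX = (unis.m_aabbCount + workgroupSizeX - 1) / workgroupSizeX; // ceil division
	cmdb->dispatchCompute(groupCountX, 1, 1);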

+ 11 - 4
AnKi/Renderer/GpuVisibility.h

@@ -16,15 +16,22 @@ namespace anki {
 class GpuVisibility : public RendererObject
 {
 public:
-	Error init()
-	{
-		return Error::kNone;
-	}
+	Error init();
 
 	/// Populate the rendergraph.
 	void populateRenderGraph(RenderingContext& ctx);
 
 private:
+	ShaderProgramResourcePtr m_prog;
+	ShaderProgramPtr m_grProg;
+
+	class
+	{
+	public:
+		BufferHandle m_instanceRateRenderables;
+		BufferHandle m_drawIndexedIndirects;
+		BufferHandle m_drawIndirectOffsets;
+	} m_runCtx;
 };
 /// @}
 

+ 5 - 0
AnKi/Renderer/HiZ.h

@@ -20,6 +20,11 @@ public:
 
 	void populateRenderGraph(RenderingContext& ctx);
 
+	const RenderTargetHandle& getHiZRt() const
+	{
+		return m_runCtx.m_hiZRt;
+	}
+
 private:
 	RenderTargetDescription m_hiZRtDescr;
 

+ 5 - 0
AnKi/Renderer/Renderer.cpp

@@ -45,6 +45,7 @@
 #include <AnKi/Renderer/VrsSriGeneration.h>
 #include <AnKi/Renderer/PackVisibleClusteredObjects.h>
 #include <AnKi/Renderer/HiZ.h>
+#include <AnKi/Renderer/GpuVisibility.h>
 
 namespace anki {
 
@@ -254,6 +255,9 @@ Error Renderer::initInternal(UVec2 swapchainResolution)
 	m_hiZ.reset(newInstance<HiZ>(RendererMemoryPool::getSingleton()));
 	ANKI_CHECK(m_hiZ->init());
 
+	m_gpuVisibility.reset(newInstance<GpuVisibility>(RendererMemoryPool::getSingleton()));
+	ANKI_CHECK(m_gpuVisibility->init());
+
 	// Init samplers
 	{
 		SamplerInitInfo sinit("NearestNearestClamp");
@@ -352,6 +356,7 @@ Error Renderer::populateRenderGraph(RenderingContext& ctx)
 	// Populate render graph. WARNING Watch the order
 	m_hiZ->populateRenderGraph(ctx);
 	gpuSceneCopy(ctx);
+	m_gpuVisibility->populateRenderGraph(ctx);
 	m_packVisibleClustererObjects->populateRenderGraph(ctx);
 	m_genericCompute->populateRenderGraph(ctx);
 	m_clusterBinning->populateRenderGraph(ctx);

+ 10 - 0
AnKi/Renderer/RendererObject.cpp

@@ -70,4 +70,14 @@ void RendererObject::registerDebugRenderTarget(CString rtName)
 	getRenderer().registerDebugRenderTarget(this, rtName);
 }
 
+Error RendererObject::loadShaderProgram(CString filename, ShaderProgramResourcePtr& rsrc, ShaderProgramPtr& grProg)
+{
+	ANKI_CHECK(ResourceManager::getSingleton().loadResource(filename, rsrc));
+	const ShaderProgramResourceVariant* variant;
+	rsrc->getOrCreateVariant(variant);
+	grProg = variant->getProgram();
+
+	return Error::kNone;
+}
+
 } // end namespace anki

+ 1 - 0
AnKi/Renderer/RendererObject.defs.h

@@ -33,3 +33,4 @@ ANKI_RENDERER_OBJECT_DEF(IndirectDiffuse, indirectDiffuse)
 ANKI_RENDERER_OBJECT_DEF(VrsSriGeneration, vrsSriGeneration)
 ANKI_RENDERER_OBJECT_DEF(PackVisibleClusteredObjects, packVisibleClustererObjects)
 ANKI_RENDERER_OBJECT_DEF(HiZ, hiZ)
+ANKI_RENDERER_OBJECT_DEF(GpuVisibility, gpuVisibility)

+ 2 - 0
AnKi/Renderer/RendererObject.h

@@ -102,6 +102,8 @@ protected:
 	}
 
 	void registerDebugRenderTarget(CString rtName);
+
+	static Error loadShaderProgram(CString filename, ShaderProgramResourcePtr& rsrc, ShaderProgramPtr& grProg);
 };
 /// @}
 

+ 5 - 6
AnKi/Scene/Components/ModelComponent.cpp

@@ -10,6 +10,7 @@
 #include <AnKi/Scene/Components/SkinComponent.h>
 #include <AnKi/Resource/ModelResource.h>
 #include <AnKi/Resource/ResourceManager.h>
+#include <AnKi/Shaders/Include/GpuSceneFunctions.h>
 
 namespace anki {
 
@@ -311,15 +312,13 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 		const U32 modelPatchCount = m_model->getModelPatches().getSize();
 		for(U32 i = 0; i < modelPatchCount; ++i)
 		{
-			GpuSceneRenderableAabb gpuVolume;
-			gpuVolume.m_aabbMin = m_spatial.getAabbWorldSpace().getMin().xyz();
-			gpuVolume.m_aabbMax = m_spatial.getAabbWorldSpace().getMax().xyz();
-			gpuVolume.m_renderableIndex = m_patchInfos[i].m_gpuSceneIndexRenderable.get();
-
 			for(RenderingTechnique t :
 				EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(m_patchInfos[i].m_techniques))
 			{
-				gpuVolume.m_renderStateBucket = m_patchInfos[i].m_renderStateBucketIndices[t].get();
+				const GpuSceneRenderableAabb gpuVolume = initGpuSceneRenderableAabb(
+					m_spatial.getAabbWorldSpace().getMin().xyz(), m_spatial.getAabbWorldSpace().getMax().xyz(),
+					m_patchInfos[i].m_gpuSceneIndexRenderable.get(),
+					m_patchInfos[i].m_renderStateBucketIndices[t].get());
 
 				GpuSceneMicroPatcher::getSingleton().newCopy(
 					*info.m_framePool, m_patchInfos[i].m_gpuSceneIndexRenderableAabbs[t].getOffsetInGpuScene(),

+ 6 - 6
AnKi/Scene/Components/ParticleEmitterComponent.cpp

@@ -14,6 +14,7 @@
 #include <AnKi/Physics/PhysicsWorld.h>
 #include <AnKi/Math.h>
 #include <AnKi/Renderer/RenderQueue.h>
+#include <AnKi/Shaders/Include/GpuSceneFunctions.h>
 
 namespace anki {
 
@@ -381,12 +382,11 @@ Error ParticleEmitterComponent::update(SceneComponentUpdateInfo& info, Bool& upd
 	for(RenderingTechnique t : EnumBitsIterable<RenderingTechnique, RenderingTechniqueBit>(
 			m_particleEmitterResource->getMaterial()->getRenderingTechniques()))
 	{
-		GpuSceneRenderableAabb aabb;
-		aabb.m_aabbMin = m_spatial.getAabbWorldSpace().getMin().xyz();
-		aabb.m_aabbMax = m_spatial.getAabbWorldSpace().getMax().xyz();
-		aabb.m_renderableIndex = m_gpuSceneIndexRenderable.get();
-		aabb.m_renderStateBucket = m_renderStateBuckets[t].get();
-		patcher.newCopy(*info.m_framePool, m_gpuSceneIndexAabbs[t].getOffsetInGpuScene(), aabb);
+		const GpuSceneRenderableAabb gpuVolume = initGpuSceneRenderableAabb(
+			m_spatial.getAabbWorldSpace().getMin().xyz(), m_spatial.getAabbWorldSpace().getMax().xyz(),
+			m_gpuSceneIndexRenderable.get(), m_renderStateBuckets[t].get());
+
+		patcher.newCopy(*info.m_framePool, m_gpuSceneIndexAabbs[t].getOffsetInGpuScene(), gpuVolume);
 	}
 
 	m_resourceUpdated = false;

+ 2 - 2
AnKi/Scene/ContiguousArrayAllocator.h

@@ -211,8 +211,8 @@ private:
 		sizeof(GpuSceneDecal),
 		sizeof(GpuSceneFogDensityVolume),
 		sizeof(GpuSceneRenderable),
-		sizeof(GpuSceneRenderable),
-		sizeof(GpuSceneRenderable),
+		sizeof(GpuSceneRenderableAabb),
+		sizeof(GpuSceneRenderableAabb),
 		sizeof(GpuSceneRenderableAabb)};
 
 	AllGpuSceneContiguousArrays();

+ 1 - 1
AnKi/Scene/RenderStateBucket.h

@@ -94,7 +94,7 @@ public:
 		{
 			if(b.m_userCount > 0)
 			{
-				func(static_cast<const RenderStateInfo&>(b));
+				func(static_cast<const RenderStateInfo&>(b), b.m_userCount);
 			}
 		}
 	}

+ 17 - 15
AnKi/Shaders/GpuVisibility.ankiprog

@@ -8,26 +8,26 @@
 #include <AnKi/Shaders/Include/GpuSceneTypes.h>
 #include <AnKi/Shaders/CollisionFunctions.hlsl>
 
+// Buffers that point to the GPU scene
 [[vk::binding(0)]] StructuredBuffer<GpuSceneRenderableAabb> g_aabbs;
 [[vk::binding(1)]] StructuredBuffer<GpuSceneRenderable> g_renderables;
+[[vk::binding(2)]] ByteAddressBuffer g_gpuScene;
 
-[[vk::binding(2)]] Texture2D<U32> g_hiZTex;
+[[vk::binding(3)]] Texture2D<U32> g_hiZTex;
 
 // These 2 have the same size
-[[vk::binding(3)]] RWStructuredBuffer<GpuSceneRenderable> g_instanceRateRenderables;
-[[vk::binding(4)]] RWStructuredBuffer<DrawIndexedIndirectInfo> g_drawIndexedIndirects;
+[[vk::binding(4)]] RWStructuredBuffer<GpuSceneRenderable> g_instanceRateRenderables;
+[[vk::binding(5)]] RWStructuredBuffer<DrawIndexedIndirectInfo> g_drawIndexedIndirects;
 
 // Index pointing to the above arrays. Its size is equal to the number of render state buckets
-[[vk::binding(5)]] RWStructuredBuffer<U32> g_drawIndirectOffset;
-
-[[vk::binding(6)]] ByteAddressBuffer g_gpuScene;
+[[vk::binding(6)]] RWStructuredBuffer<U32> g_drawIndirectOffset;
 
 struct Uniforms
 {
 	Vec4 m_clipPlanes[6u];
 
 	UVec3 m_padding;
-	U32 m_renderableCount;
+	U32 m_aabbCount;
 };
 
 [[vk::push_constant]] ConstantBuffer<Uniforms> g_unis;
@@ -35,7 +35,7 @@ struct Uniforms
 [numthreads(64, 1, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	const U32 aabbIdx = svDispatchThreadId.x;
-	if(aabbIdx >= g_unis.m_renderableCount)
+	if(aabbIdx >= g_unis.m_aabbCount)
 	{
 		return;
 	}
@@ -44,15 +44,14 @@ struct Uniforms
 
 	// Frustum test
 	//
-	Bool behindAtLeastOnePlane = false;
+	F32 minPlaneDistance = 0.0f;
 	[unroll] for(U32 i = 0; i < 6; ++i)
 	{
-		const Bool behind =
-			testPlaneAabb(g_unis.m_clipPlanes[i].xyz, g_unis.m_clipPlanes[i].w, aabb.m_aabbMin, aabb.m_aabbMax) < 0.0;
-		behindAtLeastOnePlane = behindAtLeastOnePlane || behind;
+		const F32 d = testPlanePoint(g_unis.m_clipPlanes[i].xyz, g_unis.m_clipPlanes[i].w, aabb.m_sphereCenter);
+		minPlaneDistance = min(minPlaneDistance, d);
 	}
 
-	if(behindAtLeastOnePlane)
+	if(minPlaneDistance <= aabb.m_negativeSphereRadius)
 	{
 		return;
 	}
@@ -64,10 +63,13 @@ struct Uniforms
 
 	// Add the drawcall
 	//
+	const U32 renderStateBucket = aabb.m_renderableIndexAndRenderStateBucket & ((1u << 12u) - 1u);
+	const U32 renderableIdx = aabb.m_renderableIndexAndRenderStateBucket >> 12u;
+
 	U32 indirectIdx;
-	InterlockedAdd(g_drawIndirectOffset[aabb.m_renderStateBucket], 1, indirectIdx);
+	InterlockedAdd(g_drawIndirectOffset[renderStateBucket], 1, indirectIdx);
 
-	const GpuSceneRenderable renderableIn = g_renderables[aabb.m_renderableIndex];
+	const GpuSceneRenderable renderableIn = g_renderables[renderableIdx];
 	const GpuSceneMeshLod meshLod =
 		g_gpuScene.Load<GpuSceneMeshLod>(renderableIn.m_geometryOffset + sizeof(GpuSceneMeshLod) * lod);
 

+ 3 - 3
AnKi/Shaders/Include/Common.h

@@ -19,7 +19,6 @@
 
 #	define ANKI_BEGIN_NAMESPACE namespace anki {
 #	define ANKI_END_NAMESPACE }
-#	define ANKI_SHADER_FUNC_INLINE inline
 
 #	define ANKI_ARRAY(type, size, name) Array<type, U32(size)> name
 
@@ -46,7 +45,8 @@ ANKI_END_NAMESPACE
 
 #	define ANKI_BEGIN_NAMESPACE
 #	define ANKI_END_NAMESPACE
-#	define ANKI_SHADER_FUNC_INLINE
+#	define inline
+#	define ANKI_ASSERT(x)
 
 #	define ANKI_ARRAY(type, size, name) type name[(U32)size]
 
@@ -386,7 +386,7 @@ constexpr F32 kPi = 3.14159265358979323846f;
 
 #	define ANKI_BEGIN_NAMESPACE
 #	define ANKI_END_NAMESPACE
-#	define ANKI_SHADER_FUNC_INLINE
+#	define inline
 
 #	define ANKI_SHADER_STATIC_ASSERT(cond_)
 

+ 24 - 2
AnKi/Shaders/Include/GpuSceneFunctions.h

@@ -9,7 +9,7 @@
 
 ANKI_BEGIN_NAMESPACE
 
-ANKI_SHADER_FUNC_INLINE GpuSceneRenderablePacked packGpuSceneRenderable(GpuSceneRenderable x)
+inline GpuSceneRenderablePacked packGpuSceneRenderable(GpuSceneRenderable x)
 {
 	GpuSceneRenderablePacked o;
 	o[0] = x.m_worldTransformsOffset;
@@ -19,7 +19,7 @@ ANKI_SHADER_FUNC_INLINE GpuSceneRenderablePacked packGpuSceneRenderable(GpuScene
 	return o;
 }
 
-ANKI_SHADER_FUNC_INLINE GpuSceneRenderable unpackGpuSceneRenderable(GpuSceneRenderablePacked x)
+inline GpuSceneRenderable unpackGpuSceneRenderable(GpuSceneRenderablePacked x)
 {
 	GpuSceneRenderable o;
 	o.m_worldTransformsOffset = x[0];
@@ -29,4 +29,26 @@ ANKI_SHADER_FUNC_INLINE GpuSceneRenderable unpackGpuSceneRenderable(GpuSceneRend
 	return o;
 }
 
+inline GpuSceneRenderableAabb initGpuSceneRenderableAabb(Vec3 aabbMin, Vec3 aabbMax, U32 renderableIndex,
+														 U32 renderStateBucket)
+{
+	GpuSceneRenderableAabb gpuVolume;
+
+	gpuVolume.m_sphereCenter = (aabbMin + aabbMax) * 0.5f;
+	gpuVolume.m_aabbExtend = aabbMax - gpuVolume.m_sphereCenter;
+#if defined(__cplusplus)
+	gpuVolume.m_negativeSphereRadius = -gpuVolume.m_aabbExtend.getLength();
+#else
+	gpuVolume.m_negativeSphereRadius = -length(gpuVolume.m_aabbExtend);
+#endif
+
+	ANKI_ASSERT(renderableIndex <= (1u << 20u) - 1u);
+	gpuVolume.m_renderableIndexAndRenderStateBucket = renderableIndex << 12u;
+
+	ANKI_ASSERT(renderStateBucket <= (1u << 12u) - 1u);
+	gpuVolume.m_renderableIndexAndRenderStateBucket |= renderStateBucket;
+
+	return gpuVolume;
+}
+
 ANKI_END_NAMESPACE
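
For reference, the 20/12-bit split packed by initGpuSceneRenderableAabb() round-trips like this (a standalone illustration with arbitrary values, matching the unpack in GpuVisibility.ankiprog):

	const U32 renderableIndex = 1234; // must fit in 20 bits
	const U32 renderStateBucket = 7; // must fit in 12 bits
	const U32 packed = (renderableIndex << 12u) | renderStateBucket;

	const U32 unpackedBucket = packed & ((1u << 12u) - 1u); // == 7
	const U32 unpackedRenderableIndex = packed >> 12u; // == 1234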

+ 6 - 4
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -24,12 +24,14 @@ typedef UVec4 GpuSceneRenderablePacked;
 /// Used in visibility testing.
 struct GpuSceneRenderableAabb
 {
-	Vec3 m_aabbMin;
-	U32 m_renderableIndex; ///< Points to a GpuSceneRenderable
+	Vec3 m_sphereCenter;
+	F32 m_negativeSphereRadius;
 
-	Vec3 m_aabbMax;
-	U32 m_renderStateBucket;
+	Vec3 m_aabbExtend;
+	/// The high 20 bits point to a GpuSceneRenderable. The low 12 bits are the render state bucket index.
+	U32 m_renderableIndexAndRenderStateBucket;
 };
+static_assert(sizeof(GpuSceneRenderableAabb) == sizeof(Vec4) * 2);
 
 struct GpuSceneMeshLod
 {