
Have a 32bit atomics path in cluster binning

Panagiotis Christopoulos Charitos 4 years ago
parent
commit
902f46ea49

+ 2 - 1
AnKi/Config.h.cmake

@@ -220,7 +220,8 @@ namespace anki {
 	extern "C" void android_main(android_app* app) \
 	{ \
 		anki::g_androidApp = app; \
-		char* argv[] = {"androidapp"}; \
+		char arr[] = "androidapp"; \
+		char* argv[] = {arr}; \
 		myMain(1, argv); \
 	}
 #else
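The change above fixes a C++ conformance issue: a string literal has type `const char[]`, so using one to initialize a `char*` element is ill-formed (newer compilers reject it outright), while `myMain` expects a mutable `char*[]`. The fix copies the literal into a mutable array first. A minimal standalone sketch of the same pattern (the `myMain` stub here is hypothetical, not the engine's):

```cpp
#include <cstdio>

// Stand-in for the engine's entry point (hypothetical stub).
int myMain(int argc, char* argv[])
{
	std::printf("%d %s\n", argc, argv[0]);
	return 0;
}

int main()
{
	// char* argv[] = {"androidapp"}; // ill-formed since C++11: a string
	//                                // literal is const char[], not char*
	char arr[] = "androidapp"; // mutable copy of the literal
	char* argv[] = {arr};      // OK: arr decays to char*
	return myMain(1, argv);
}
```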

+ 5 - 1
AnKi/Gr/Common.h

@@ -153,9 +153,13 @@ public:
 
 	/// RT.
 	Bool m_rayTracingEnabled = false;
+
+	/// 64 bit atomics.
+	Bool m_64bitAtomics = false;
 };
 ANKI_END_PACKED_STRUCT
-static_assert(sizeof(GpuDeviceCapabilities) == sizeof(PtrSize) * 4 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool),
+static_assert(sizeof(GpuDeviceCapabilities)
+				  == sizeof(PtrSize) * 4 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 2,
 			  "Should be packed");
 
 /// Bindless related info.
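`GpuDeviceCapabilities` is a packed struct whose size is pinned by a `static_assert`, so the new `Bool` member must also be reflected in the expected-size formula, hence `sizeof(Bool) * 2`. A condensed sketch of the pattern with stand-in type aliases (assuming, as the formula implies, that `Bool` is one byte; `#pragma pack` stands in for `ANKI_BEGIN_PACKED_STRUCT`):

```cpp
#include <cstdint>

// Stand-ins for AnKi's fixed-width aliases.
using PtrSize = std::uintptr_t;
using U32 = std::uint32_t;
using U8 = std::uint8_t;
using Bool = bool;

#pragma pack(push, 1) // stand-in for ANKI_BEGIN_PACKED_STRUCT
struct Caps
{
	PtrSize m_ptrs[4];
	U32 m_u32s[5];
	U8 m_u8s[3];
	Bool m_rayTracingEnabled;
	Bool m_64bitAtomics; // the new member: +1 Bool in the expected size
};
#pragma pack(pop)

// The assert must track every member, or it fires on the next build.
static_assert(sizeof(Caps) == sizeof(PtrSize) * 4 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 2,
			  "Should be packed");
```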

+ 1 - 0
AnKi/Gr/ConfigDefs.h

@@ -10,6 +10,7 @@ ANKI_CONFIG_OPTION(gr_vsync, 0, 0, 1)
 ANKI_CONFIG_OPTION(gr_maxBindlessTextures, 256, 8, 1024)
 ANKI_CONFIG_OPTION(gr_maxBindlessImages, 32, 8, 1024)
 ANKI_CONFIG_OPTION(gr_rayTracing, 0, 0, 1, "Try enabling ray tracing")
+ANKI_CONFIG_OPTION(gr_64bitAtomics, 1, 0, 1)
 
 // Vulkan
 ANKI_CONFIG_OPTION(gr_diskShaderCacheMaxSize, 128_MB, 1_MB, 1_GB)

+ 7 - 5
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -662,7 +662,8 @@ Error GrManagerImpl::initDevice(const GrManagerInitInfo& init)
 				m_extensions |= VulkanExtensions::KHR_SHADER_FLOAT16_INT8;
 				extensionsToEnable[extensionsToEnableCount++] = extensionName.cstr();
 			}
-			else if(extensionName == VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME)
+			else if(extensionName == VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME
+					&& init.m_config->getBool("gr_64bitAtomics"))
 			{
 				m_extensions |= VulkanExtensions::KHR_SHADER_ATOMIC_INT64;
 				extensionsToEnable[extensionsToEnableCount++] = extensionName.cstr();
@@ -744,8 +745,7 @@ Error GrManagerImpl::initDevice(const GrManagerInitInfo& init)
 	// Buffer address
 	if(!(m_extensions & VulkanExtensions::KHR_BUFFER_DEVICE_ADDRESS))
 	{
-		ANKI_VK_LOGE(VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME " is not supported");
-		return Error::FUNCTION_FAILED;
+		ANKI_VK_LOGW(VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME " is not supported");
 	}
 	else
 	{
@@ -881,11 +881,13 @@ Error GrManagerImpl::initDevice(const GrManagerInitInfo& init)
 	// 64bit atomics
 	if(!(m_extensions & VulkanExtensions::KHR_SHADER_ATOMIC_INT64))
 	{
-		ANKI_VK_LOGE(VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME " is not supported");
-		return Error::FUNCTION_FAILED;
+		ANKI_VK_LOGW(VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME " is not supported or disabled");
+		m_capabilities.m_64bitAtomics = false;
 	}
 	else
 	{
+		m_capabilities.m_64bitAtomics = true;
+
 		m_atomicInt64Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR;
 
 		VkPhysicalDeviceFeatures2 features = {};
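Two things change in the init flow: `VK_KHR_shader_atomic_int64` is now requested only when the driver reports it *and* the `gr_64bitAtomics` option allows it, and a missing extension is downgraded from a fatal error to a warning because the renderer can now fall back to 32-bit atomics (the buffer-device-address check gets the same error-to-warning treatment). A condensed sketch of the gate-then-record pattern; the `Capabilities` struct and plain `fprintf` logging are illustrative stand-ins, not AnKi's:

```cpp
#include <cstring>
#include <cstdio>

struct Capabilities
{
	bool m_64bitAtomics = false;
};

// Enable the extension only if the driver exposes it AND the user has
// not disabled it via gr_64bitAtomics.
bool wantAtomicInt64(const char* reportedExt, bool configAllows)
{
	return std::strcmp(reportedExt, "VK_KHR_shader_atomic_int64") == 0 && configAllows;
}

void resolveCapability(bool extensionEnabled, Capabilities& caps)
{
	if(!extensionEnabled)
	{
		// Downgraded from a fatal error to a warning: the renderer now
		// has a 32-bit fallback, so init can continue.
		std::fprintf(stderr, "VK_KHR_shader_atomic_int64 is not supported or disabled\n");
		caps.m_64bitAtomics = false;
	}
	else
	{
		caps.m_64bitAtomics = true;
	}
}
```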

+ 2 - 0
AnKi/Renderer/ClusterBinning.cpp

@@ -40,6 +40,8 @@ Error ClusterBinning::init(const ConfigSet& config)
 	variantInitInfo.addConstant("RENDERING_SIZE",
 								UVec2(m_r->getInternalResolution().x(), m_r->getInternalResolution().y()));
 
+	variantInitInfo.addMutation("SUPPORTS_64BIT_ATOMICS", m_r->getGrManager().getDeviceCapabilities().m_64bitAtomics);
+
 	const ShaderProgramResourceVariant* variant;
 	m_prog->getOrCreateVariant(variantInitInfo, variant);
 	m_grProg = variant->getProgram();
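On the CPU side the capability bit is forwarded to the shader as the `SUPPORTS_64BIT_ATOMICS` mutation, so the 64- vs 32-bit decision is baked in once when the program variant is created, rather than branched per dispatch. A hedged sketch of mutation-keyed variant caching; this is illustrative only, AnKi's actual `ShaderProgramResource` internals differ:

```cpp
#include <cstdint>
#include <unordered_map>

struct Program { /* compiled SPIR-V, pipeline state, ... */ };

class ShaderResource
{
public:
	// The key mirrors the SUPPORTS_64BIT_ATOMICS mutator value (0 or 1).
	const Program& getOrCreateVariant(std::uint32_t supports64bitAtomics)
	{
		auto it = m_variants.find(supports64bitAtomics);
		if(it == m_variants.end())
		{
			// Compile the source with the mutator baked in as a
			// preprocessor value; cache the resulting program.
			it = m_variants.emplace(supports64bitAtomics, Program{}).first;
		}
		return it->second;
	}

private:
	std::unordered_map<std::uint32_t, Program> m_variants;
};
```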

+ 84 - 20
AnKi/Shaders/ClusterBinning.ankiprog

@@ -3,6 +3,9 @@
 // Code licensed under the BSD License.
 // http://www.anki3d.org/LICENSE
 
+// For those platforms that don't support 64bit atomics try to do the atomics in 32bit
+#pragma anki mutator SUPPORTS_64BIT_ATOMICS 0 1
+
 ANKI_SPECIALIZATION_CONSTANT_U32(TILE_SIZE, 0u);
 ANKI_SPECIALIZATION_CONSTANT_U32(TILE_COUNT_X, 1u);
 ANKI_SPECIALIZATION_CONSTANT_U32(TILE_COUNT_Y, 2u);
@@ -23,11 +26,22 @@ layout(set = 0, binding = 0, scalar) uniform b_unis
 	ClusteredShadingUniforms u_unis;
 };
 
-layout(set = 0, binding = 1, scalar) writeonly buffer b_clusters
+layout(set = 0, binding = 1, scalar) writeonly buffer b_clusters64
+{
+	Cluster u_clusters64[];
+};
+
+layout(set = 0, binding = 1, scalar) writeonly buffer b_clusters32
 {
-	Cluster u_clusters[];
+	Cluster32 u_clusters32[];
 };
 
+#if SUPPORTS_64BIT_ATOMICS
+#	define u_clusters u_clusters64
+#else
+#	define u_clusters u_clusters32
+#endif
+
 layout(set = 0, binding = 2, scalar) uniform b_pointLights
 {
 	PointLight u_pointLights[MAX_VISIBLE_POINT_LIGHTS];
@@ -68,10 +82,60 @@ UVec2 SAMPLE_LOCATIONS[SAMPLE_COUNT] = UVec2[](LOCATION(-2, -6), LOCATION(6, -2)
 
 // A mask per tile of this workgroup for the clusterer object being processed by this workgroup
 const U32 TILES_PER_WORKGROUP = WORKGROUP_SIZE / SAMPLE_COUNT;
+
+#if SUPPORTS_64BIT_ATOMICS
 shared U64 s_tileMasks[TILES_PER_WORKGROUP];
+#else
+shared U32 s_tileMasks[TILES_PER_WORKGROUP][2u];
+#endif
 
 // A mask for each Z split for a specific clusterer object
+#if SUPPORTS_64BIT_ATOMICS
 shared U64 s_zSplitMasks[Z_SPLIT_COUNT];
+#else
+shared U32 s_zSplitMasks[Z_SPLIT_COUNT][2u];
+#endif
+
+#if SUPPORTS_64BIT_ATOMICS
+#	define atomicOr2x32_64(dest, src) atomicOr(dest, src)
+
+#	define atomicOr2x32_2x32(dest, src) atomicOr(dest, src)
+
+#	define atomicOr32_2X32(dest, src) atomicOr(dest, U32(src))
+
+#	define zero2x32(dest) \
+		do \
+		{ \
+			dest = 0ul; \
+		} while(false)
+
+#	define isZero2x32(src) (src == 0ul)
+#else // !SUPPORTS_64BIT_ATOMICS
+#	define atomicOr2x32_64(dest, src) \
+		do \
+		{ \
+			atomicOr(dest[0u], U32(src)); \
+			atomicOr(dest[1u], U32(src >> 32ul)); \
+		} while(false)
+
+#	define atomicOr2x32_2x32(dest, src) \
+		do \
+		{ \
+			atomicOr(dest[0u], src[0u]); \
+			atomicOr(dest[1u], src[1u]); \
+		} while(false)
+
+#	define atomicOr32_2X32(dest, src) atomicOr(dest, src[1u])
+
+#	define zero2x32(dest) \
+		do \
+		{ \
+			dest[0u] = 0u; \
+			dest[1u] = 0u; \
+		} while(false)
+
+#	define isZero2x32(src) ((src[0u] | src[1u]) == 0u)
+#endif
 
 Bool isPointLight()
 {
@@ -132,12 +196,12 @@ void main()
 	const Vec3 rayDir = normalize(farWorldPos - rayOrigin);
 
 	// Zero shared memory
-	s_tileMasks[localTileIdx] = 0ul;
+	zero2x32(s_tileMasks[localTileIdx]);
 	const U32 splitsPerInvocation = max(1u, Z_SPLIT_COUNT / WORKGROUP_SIZE);
 	for(U32 i = gl_LocalInvocationIndex * splitsPerInvocation;
 		i < (gl_LocalInvocationIndex + 1u) * splitsPerInvocation && i < Z_SPLIT_COUNT; ++i)
 	{
-		s_zSplitMasks[i] = 0ul;
+		zero2x32(s_zSplitMasks[i]);
 	}
 	memoryBarrierShared();
 	barrier();
@@ -239,7 +303,7 @@ void main()
 	{
 		// Set the tile
 		const U64 mask = 1ul << U64(objectArrayIdx);
-		atomicOr(s_tileMasks[localTileIdx], mask);
+		atomicOr2x32_64(s_tileMasks[localTileIdx], mask);
 
 		// Compute and set the Z splits
 		const Vec3 hitpointA = rayDir * t0 + rayOrigin;
@@ -267,7 +331,7 @@ void main()
 			clamp(I32(maxDistFromNearPlane * u_unis.m_zSplitCountOverFrustumLength), 0, I32(Z_SPLIT_COUNT) - 1);
 		for(I32 i = startZSplit; i <= endZSplit; ++i)
 		{
-			atomicOr(s_zSplitMasks[i], mask);
+			atomicOr2x32_64(s_zSplitMasks[i], mask);
 		}
 	}
 
@@ -276,31 +340,31 @@ void main()
 	barrier();
 
 	// First sample writes the tile
-	if(sampleIdx == 0u && s_tileMasks[localTileIdx] != 0ul)
+	if(sampleIdx == 0u && !isZero2x32(s_tileMasks[localTileIdx]))
 	{
 		if(isPointLight())
 		{
-			atomicOr(u_clusters[tileIdx].m_pointLightsMask, s_tileMasks[localTileIdx]);
+			atomicOr2x32_2x32(u_clusters[tileIdx].m_pointLightsMask, s_tileMasks[localTileIdx]);
 		}
 		else if(isSpotLight())
 		{
-			atomicOr(u_clusters[tileIdx].m_spotLightsMask, s_tileMasks[localTileIdx]);
+			atomicOr2x32_2x32(u_clusters[tileIdx].m_spotLightsMask, s_tileMasks[localTileIdx]);
 		}
 		else if(isDecal())
 		{
-			atomicOr(u_clusters[tileIdx].m_decalsMask, s_tileMasks[localTileIdx]);
+			atomicOr2x32_2x32(u_clusters[tileIdx].m_decalsMask, s_tileMasks[localTileIdx]);
 		}
 		else if(isFogVolume())
 		{
-			atomicOr(u_clusters[tileIdx].m_fogDensityVolumesMask, U32(s_tileMasks[localTileIdx]));
+			atomicOr32_2X32(u_clusters[tileIdx].m_fogDensityVolumesMask, s_tileMasks[localTileIdx]);
 		}
 		else if(isReflectionProbe())
 		{
-			atomicOr(u_clusters[tileIdx].m_reflectionProbesMask, U32(s_tileMasks[localTileIdx]));
+			atomicOr32_2X32(u_clusters[tileIdx].m_reflectionProbesMask, s_tileMasks[localTileIdx]);
 		}
 		else
 		{
-			atomicOr(u_clusters[tileIdx].m_giProbesMask, U32(s_tileMasks[localTileIdx]));
+			atomicOr32_2X32(u_clusters[tileIdx].m_giProbesMask, s_tileMasks[localTileIdx]);
 		}
 	}
 
@@ -308,31 +372,31 @@ void main()
 	for(U32 i = gl_LocalInvocationIndex * splitsPerInvocation;
 		i < (gl_LocalInvocationIndex + 1u) * splitsPerInvocation && i < Z_SPLIT_COUNT; ++i)
 	{
-		if(s_zSplitMasks[i] != 0ul)
+		if(!isZero2x32(s_zSplitMasks[i]))
 		{
 			if(isPointLight())
 			{
-				atomicOr(u_clusters[TILE_COUNT + i].m_pointLightsMask, s_zSplitMasks[i]);
+				atomicOr2x32_2x32(u_clusters[TILE_COUNT + i].m_pointLightsMask, s_zSplitMasks[i]);
 			}
 			else if(isSpotLight())
 			{
-				atomicOr(u_clusters[TILE_COUNT + i].m_spotLightsMask, s_zSplitMasks[i]);
+				atomicOr2x32_2x32(u_clusters[TILE_COUNT + i].m_spotLightsMask, s_zSplitMasks[i]);
 			}
 			else if(isDecal())
 			{
-				atomicOr(u_clusters[TILE_COUNT + i].m_decalsMask, s_zSplitMasks[i]);
+				atomicOr2x32_2x32(u_clusters[TILE_COUNT + i].m_decalsMask, s_zSplitMasks[i]);
 			}
 			else if(isFogVolume())
 			{
-				atomicOr(u_clusters[TILE_COUNT + i].m_fogDensityVolumesMask, U32(s_zSplitMasks[i]));
+				atomicOr32_2X32(u_clusters[TILE_COUNT + i].m_fogDensityVolumesMask, s_zSplitMasks[i]);
 			}
 			else if(isReflectionProbe())
 			{
-				atomicOr(u_clusters[TILE_COUNT + i].m_reflectionProbesMask, U32(s_zSplitMasks[i]));
+				atomicOr32_2X32(u_clusters[TILE_COUNT + i].m_reflectionProbesMask, s_zSplitMasks[i]);
 			}
 			else
 			{
-				atomicOr(u_clusters[TILE_COUNT + i].m_giProbesMask, U32(s_zSplitMasks[i]));
+				atomicOr32_2X32(u_clusters[TILE_COUNT + i].m_giProbesMask, s_zSplitMasks[i]);
 			}
 		}
 	}
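The fallback is sound because OR is a per-bit operation: bit i of the result depends only on bit i of the operands, so a 64-bit `atomicOr` can be split into two independent 32-bit `atomicOr`s on the low and high words without losing any bits (this would not hold for atomic add, where carries cross the word boundary). The pair of 32-bit ORs is not atomic as a unit, but that is harmless here: producers only ever set bits, and consumers read only after the `memoryBarrierShared()`/`barrier()` pair. A runnable C++ model of `atomicOr2x32_64` using `std::atomic`:

```cpp
#include <atomic>
#include <cstdint>
#include <cassert>

// C++ model of the shader's atomicOr2x32_64: OR a 64-bit mask into two
// independently atomic 32-bit words. Safe for OR because no bit depends
// on any other bit; it would be wrong for add (carries cross words).
void atomicOr2x32(std::atomic<std::uint32_t> dest[2], std::uint64_t src)
{
	dest[0].fetch_or(static_cast<std::uint32_t>(src), std::memory_order_relaxed);       // low half
	dest[1].fetch_or(static_cast<std::uint32_t>(src >> 32), std::memory_order_relaxed); // high half
}

int main()
{
	std::atomic<std::uint32_t> mask[2] = {0u, 0u};
	atomicOr2x32(mask, 1ull << 5);  // sets a low-word bit
	atomicOr2x32(mask, 1ull << 40); // sets a high-word bit
	const std::uint64_t combined = (std::uint64_t(mask[1].load()) << 32) | mask[0].load();
	assert(combined == ((1ull << 5) | (1ull << 40)));
	return 0;
}
```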

+ 15 - 0
AnKi/Shaders/Include/ClusteredShadingTypes.h

@@ -218,4 +218,19 @@ struct Cluster
 const U32 _ANKI_SIZEOF_Cluster = 5u * ANKI_SIZEOF(U64);
 ANKI_SHADER_STATIC_ASSERT(sizeof(Cluster) == _ANKI_SIZEOF_Cluster);
 
+/// An alternative representation of Cluster that doesn't contain 64bit values
+struct Cluster32
+{
+	U32 m_pointLightsMask[2u];
+	U32 m_spotLightsMask[2u];
+	U32 m_decalsMask[2u];
+	U32 m_fogDensityVolumesMask;
+	U32 m_reflectionProbesMask;
+	U32 m_giProbesMask;
+	U32 m_padding; ///< Add some padding to be 100% sure nothing will break.
+};
+
+const U32 _ANKI_SIZEOF_Cluster32 = _ANKI_SIZEOF_Cluster;
+ANKI_SHADER_STATIC_ASSERT(sizeof(Cluster32) == _ANKI_SIZEOF_Cluster32);
+
 ANKI_END_NAMESPACE
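`Cluster32` only works because it occupies exactly the same bytes as `Cluster` (the assert reuses `_ANKI_SIZEOF_Cluster` for exactly this reason): the shader declares `b_clusters64` and `b_clusters32` on the same `binding = 1`, so the two blocks are just 64-bit and 2x32-bit views of one buffer. On the little-endian layouts Vulkan implementations use for storage buffers, a `U64` and a `U32[2]` pair alias with the low word first, matching the `U32(src)` / `src >> 32ul` split in the shader macros. A small C++ sketch of that equivalence (host-side stand-ins, with most members elided):

```cpp
#include <cstdint>
#include <cstring>

// Stand-ins mirroring the two GPU-side views of the same buffer bytes.
struct Cluster
{
	std::uint64_t m_pointLightsMask;
	// ... the real struct has more 64- and 32-bit masks; elided here.
};

struct Cluster32
{
	std::uint32_t m_pointLightsMask[2]; // [0] = low word, [1] = high word
};

// Mirrors the ANKI_SHADER_STATIC_ASSERT above.
static_assert(sizeof(Cluster) == sizeof(Cluster32), "Views must alias");

// On a little-endian layout the two views agree word-for-word.
bool viewsAgree(std::uint64_t mask)
{
	Cluster a{mask};
	Cluster32 b;
	std::memcpy(&b, &a, sizeof(b));
	return b.m_pointLightsMask[0] == static_cast<std::uint32_t>(mask)
		   && b.m_pointLightsMask[1] == static_cast<std::uint32_t>(mask >> 32);
}
```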