4 年之前 · 0aca702730
--- a/AnKi/Renderer/DepthDownscale.cpp
+++ b/AnKi/Renderer/DepthDownscale.cpp
@@ -7,14 +7,26 @@
 
				 #include <AnKi/Renderer/Renderer.h>
			
 
				 #include <AnKi/Renderer/GBuffer.h>
			
 
				 
			
 
				+#if ANKI_COMPILER_GCC_COMPATIBLE
			
 
				+#	pragma GCC diagnostic push
			
 
				+#	pragma GCC diagnostic ignored "-Wunused-function"
			
 
				+#	pragma GCC diagnostic ignored "-Wignored-qualifiers"
			
 
				+#endif
			
 
				+#define A_CPU
			
 
				+#include <ThirdParty/FidelityFX/ffx_a.h>
			
 
				+#include <ThirdParty/FidelityFX/ffx_spd.h>
			
 
				+#if ANKI_COMPILER_GCC_COMPATIBLE
			
 
				+#	pragma GCC diagnostic pop
			
 
				+#endif
			
 
				+
			
 
				 namespace anki
			
 
				 {
			
 
				 
			
 
				 DepthDownscale::~DepthDownscale()
			
 
				 {
			
 
				-	if(m_copyToBuff.m_buffAddr)
			
 
				+	if(m_clientBufferAddr)
			
 
				 	{
			
 
				-		m_copyToBuff.m_buff->unmap();
			
 
				+		m_clientBuffer->unmap();
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -25,46 +37,71 @@ Error DepthDownscale::initInternal(const ConfigSet&)
 
				 
			
 
				 	m_mipCount = computeMaxMipmapCount2d(width, height, HIERARCHICAL_Z_MIN_HEIGHT);
			
 
				 
			
 
				-	const U32 lastMipWidth = width >> (m_mipCount - 1);
			
 
				-	const U32 lastMipHeight = height >> (m_mipCount - 1);
			
 
				+	m_lastMipSize.x() = width >> (m_mipCount - 1);
			
 
				+	m_lastMipSize.y() = height >> (m_mipCount - 1);
			
 
				+
			
 
				+	ANKI_R_LOGI("Initializing HiZ. Mip count %u, last mip size %ux%u", m_mipCount, m_lastMipSize.x(),
			
 
				+				m_lastMipSize.y());
			
 
				 
			
 
				-	ANKI_R_LOGI("Initializing HiZ. Mip count %u, last mip size %ux%u", m_mipCount, lastMipWidth, lastMipHeight);
			
 
				+	const Bool supportsReductionSampler = getGrManager().getDeviceCapabilities().m_samplingFilterMinMax;
			
 
				 
			
 
				 	// Create RT descr
			
 
				-	TextureInitInfo texInit = m_r->create2DRenderTargetInitInfo(
			
 
				-		width, height, Format::R32_SFLOAT, TextureUsageBit::ALL_SAMPLED | TextureUsageBit::IMAGE_COMPUTE_WRITE, "HiZ");
			
 
				-	texInit.m_mipmapCount = U8(m_mipCount);
			
 
				-	texInit.m_initialUsage = TextureUsageBit::SAMPLED_FRAGMENT;
			
 
				-	m_hizTex = m_r->createAndClearRenderTarget(texInit);
			
 
				+	{
			
 
				+		TextureInitInfo texInit = m_r->create2DRenderTargetInitInfo(
			
 
				+			width, height, Format::R32_SFLOAT, TextureUsageBit::ALL_SAMPLED | TextureUsageBit::IMAGE_COMPUTE_WRITE,
			
 
				+			"HiZ");
			
 
				+		texInit.m_mipmapCount = U8(m_mipCount);
			
 
				+		texInit.m_initialUsage = TextureUsageBit::SAMPLED_FRAGMENT;
			
 
				+		m_hizTex = m_r->createAndClearRenderTarget(texInit);
			
 
				+	}
			
 
				 
			
 
				 	// Progs
			
 
				-	ANKI_CHECK(getResourceManager().loadResource("Shaders/DepthDownscale.ankiprog", m_prog));
			
 
				+	{
			
 
				+		ANKI_CHECK(getResourceManager().loadResource("Shaders/DepthDownscale.ankiprog", m_prog));
			
 
				+
			
 
				+		ShaderProgramResourceVariantInitInfo variantInitInfo(m_prog);
			
 
				+		variantInitInfo.addMutation("SAMPLE_RESOLVE_TYPE", 2);
			
 
				+		variantInitInfo.addMutation("WAVE_OPERATIONS", 0);
			
 
				+		variantInitInfo.addMutation("REDUCTION_SAMPLER", supportsReductionSampler);
			
 
				 
			
 
				-	ShaderProgramResourceVariantInitInfo variantInitInfo(m_prog);
			
 
				-	variantInitInfo.addMutation("SAMPLE_RESOLVE_TYPE", 2);
			
 
				+		const ShaderProgramResourceVariant* variant;
			
 
				+		m_prog->getOrCreateVariant(variantInitInfo, variant);
			
 
				+		m_grProg = variant->getProgram();
			
 
				+	}
			
 
				 
			
 
				-	const ShaderProgramResourceVariant* variant;
			
 
				-	m_prog->getOrCreateVariant(variantInitInfo, variant);
			
 
				-	m_grProg = variant->getProgram();
			
 
				+	// Reduction sampler
			
 
				+	if(supportsReductionSampler)
			
 
				+	{
			
 
				+		SamplerInitInfo sinit("HiZReduction");
			
 
				+		sinit.m_addressing = SamplingAddressing::CLAMP;
			
 
				+		sinit.m_mipmapFilter = SamplingFilter::MAX;
			
 
				+		sinit.m_minMagFilter = SamplingFilter::MAX;
			
 
				+		m_reductionSampler = getGrManager().newSampler(sinit);
			
 
				+	}
			
 
				 
			
 
				-	// Copy to buffer
			
 
				+	// Counter buffer
			
 
				 	{
			
 
				-		m_copyToBuff.m_lastMipWidth = lastMipWidth;
			
 
				-		m_copyToBuff.m_lastMipHeight = lastMipHeight;
			
 
				+		BufferInitInfo buffInit("HiZCounterBuffer");
			
 
				+		buffInit.m_size = sizeof(U32);
			
 
				+		buffInit.m_usage = BufferUsageBit::STORAGE_COMPUTE_WRITE | BufferUsageBit::TRANSFER_DESTINATION;
			
 
				+		m_counterBuffer = getGrManager().newBuffer(buffInit);
			
 
				+	}
			
 
				 
			
 
				+	// Copy to buffer
			
 
				+	{
			
 
				 		// Create buffer
			
 
				 		BufferInitInfo buffInit("HiZ Client");
			
 
				 		buffInit.m_mapAccess = BufferMapAccessBit::READ;
			
 
				-		buffInit.m_size = PtrSize(lastMipHeight) * PtrSize(lastMipWidth) * sizeof(F32);
			
 
				+		buffInit.m_size = PtrSize(m_lastMipSize.y()) * PtrSize(m_lastMipSize.x()) * sizeof(F32);
			
 
				 		buffInit.m_usage = BufferUsageBit::STORAGE_COMPUTE_WRITE;
			
 
				-		m_copyToBuff.m_buff = getGrManager().newBuffer(buffInit);
			
 
				+		m_clientBuffer = getGrManager().newBuffer(buffInit);
			
 
				 
			
 
				-		m_copyToBuff.m_buffAddr = m_copyToBuff.m_buff->map(0, buffInit.m_size, BufferMapAccessBit::READ);
			
 
				+		m_clientBufferAddr = m_clientBuffer->map(0, buffInit.m_size, BufferMapAccessBit::READ);
			
 
				 
			
 
				 		// Fill the buffer with 1.0f
			
 
				-		for(U32 i = 0; i < lastMipHeight * lastMipWidth; ++i)
			
 
				+		for(U32 i = 0; i < m_lastMipSize.x() * m_lastMipSize.y(); ++i)
			
 
				 		{
			
 
				-			static_cast<F32*>(m_copyToBuff.m_buffAddr)[i] = 1.0f;
			
 
				+			static_cast<F32*>(m_clientBufferAddr)[i] = 1.0f;
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -75,7 +112,7 @@ Error DepthDownscale::init(const ConfigSet& cfg)
 
				 {
			
 
				 	ANKI_R_LOGI("Initializing depth downscale passes");
			
 
				 
			
 
				-	Error err = initInternal(cfg);
			
 
				+	const Error err = initInternal(cfg);
			
 
				 	if(err)
			
 
				 	{
			
 
				 		ANKI_R_LOGE("Failed to initialize depth downscale passes");
			
@@ -104,105 +141,96 @@ void DepthDownscale::populateRenderGraph(RenderingContext& ctx)
 
				 {
			
 
				 	RenderGraphDescription& rgraph = ctx.m_renderGraphDescr;
			
 
				 
			
 
				-	static const Array<CString, 5> passNames = {"HiZ #0", "HiZ #1", "HiZ #2", "HiZ #3", "HiZ #4"};
			
 
				+	ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass("HiZ");
			
 
				+
			
 
				+	pass.newDependency(RenderPassDependency(m_r->getGBuffer().getDepthRt(), TextureUsageBit::SAMPLED_COMPUTE,
			
 
				+											TextureSubresourceInfo(DepthStencilAspectBit::DEPTH)));
			
 
				 
			
 
				-	// Every pass can do MIPS_WRITTEN_PER_PASS mips
			
 
				-	U32 firstMipToWrite = 0;
			
 
				-	for(U32 i = 0; i < m_mipCount; i += MIPS_WRITTEN_PER_PASS)
			
 
				+	for(U32 mip = 0; mip < m_mipCount; ++mip)
			
 
				 	{
			
 
				-		const U mipsToFill = (i + 1 < m_mipCount) ? MIPS_WRITTEN_PER_PASS : 1;
			
 
				+		TextureSubresourceInfo subresource;
			
 
				+		subresource.m_firstMipmap = mip;
			
 
				+		pass.newDependency(RenderPassDependency(m_runCtx.m_hizRt, TextureUsageBit::IMAGE_COMPUTE_WRITE, subresource));
			
 
				+	}
			
 
				 
			
 
				-		ComputeRenderPassDescription& pass = rgraph.newComputeRenderPass(passNames[i / MIPS_WRITTEN_PER_PASS]);
			
 
				+	pass.setWork([this](RenderPassWorkContext& rgraphCtx) {
			
 
				+		CommandBufferPtr& cmdb = rgraphCtx.m_commandBuffer;
			
 
				 
			
 
				-		if(i == 0)
			
 
				-		{
			
 
				-			pass.newDependency({m_r->getGBuffer().getDepthRt(), TextureUsageBit::SAMPLED_COMPUTE,
			
 
				-								TextureSubresourceInfo(DepthStencilAspectBit::DEPTH)});
			
 
				-		}
			
 
				-		else
			
 
				+		// Zero the counter buffer before everything else
			
 
				+		if(ANKI_UNLIKELY(!m_counterBufferZeroed))
			
 
				 		{
			
 
				-			TextureSubresourceInfo subresource;
			
 
				-			subresource.m_firstMipmap = i - 1;
			
 
				-
			
 
				-			pass.newDependency({m_runCtx.m_hizRt, TextureUsageBit::SAMPLED_COMPUTE, subresource});
			
 
				-		}
			
 
				+			m_counterBufferZeroed = true;
			
 
				 
			
 
				-		TextureSubresourceInfo subresource;
			
 
				-		subresource.m_firstMipmap = i;
			
 
				-		pass.newDependency({m_runCtx.m_hizRt, TextureUsageBit::IMAGE_COMPUTE_WRITE, subresource});
			
 
				+			cmdb->fillBuffer(m_counterBuffer, 0, MAX_PTR_SIZE, 0);
			
 
				 
			
 
				-		if(mipsToFill == MIPS_WRITTEN_PER_PASS)
			
 
				-		{
			
 
				-			subresource.m_firstMipmap = i + 1;
			
 
				-			pass.newDependency({m_runCtx.m_hizRt, TextureUsageBit::IMAGE_COMPUTE_WRITE, subresource});
			
 
				+			cmdb->setBufferBarrier(m_counterBuffer, BufferUsageBit::TRANSFER_DESTINATION,
			
 
				+								   BufferUsageBit::STORAGE_COMPUTE_WRITE, 0, MAX_PTR_SIZE);
			
 
				 		}
			
 
				 
			
 
				-		pass.setWork([this, firstMipToWrite](RenderPassWorkContext& rgraphCtx) { run(firstMipToWrite, rgraphCtx); });
			
 
				-		firstMipToWrite += MIPS_WRITTEN_PER_PASS;
			
 
				-	}
			
 
				-}
			
 
				+		cmdb->bindShaderProgram(m_grProg);
			
 
				 
			
 
				-void DepthDownscale::run(U32 mipToWrite, RenderPassWorkContext& rgraphCtx)
			
 
				-{
			
 
				-	CommandBufferPtr& cmdb = rgraphCtx.m_commandBuffer;
			
 
				+		varAU2(dispatchThreadGroupCountXY);
			
 
				+		varAU2(workGroupOffset); // needed if Left and Top are not 0,0
			
 
				+		varAU2(numWorkGroupsAndMips);
			
 
				+		varAU4(rectInfo) = initAU4(0, 0, m_r->getInternalResolution().x(), m_r->getInternalResolution().y());
			
 
				+		SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo);
			
 
				+		SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, m_mipCount);
			
 
				 
			
 
				-	const U32 level = mipToWrite;
			
 
				-	const U32 mipsToFill = (level + 1 < m_mipCount) ? MIPS_WRITTEN_PER_PASS : 1;
			
 
				-	const U32 copyToClientLevel = (level + mipsToFill == m_mipCount) ? mipsToFill - 1 : MAX_U32;
			
 
				+		class PC
			
 
				+		{
			
 
				+		public:
			
 
				+			U32 m_workgroupCount;
			
 
				+			U32 m_mipmapCount;
			
 
				+			Vec2 m_srcTexSizeOverOne;
			
 
				+			U32 m_lastMipWidth;
			
 
				+			U32 m_padding[3u];
			
 
				+		} pc;
			
 
				+		pc.m_workgroupCount = numWorkGroupsAndMips[0];
			
 
				+		pc.m_mipmapCount = numWorkGroupsAndMips[1];
			
 
				+		pc.m_srcTexSizeOverOne = 1.0f / Vec2(m_r->getInternalResolution());
			
 
				+		pc.m_lastMipWidth = m_lastMipSize.x();
			
 
				+
			
 
				+		cmdb->setPushConstants(&pc, sizeof(pc));
			
 
				+
			
 
				+		constexpr U32 maxMipsSpdCanProduce = 12;
			
 
				+		for(U32 mip = 0; mip < maxMipsSpdCanProduce; ++mip)
			
 
				+		{
			
 
				+			TextureSubresourceInfo subresource;
			
 
				+			if(mip < m_mipCount)
			
 
				+			{
			
 
				+				subresource.m_firstMipmap = mip;
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				subresource.m_firstMipmap = 0;
			
 
				+			}
			
 
				+
			
 
				+			rgraphCtx.bindImage(0, 0, m_runCtx.m_hizRt, subresource, mip);
			
 
				+		}
			
 
				 
			
 
				-	const U32 level0Width = m_r->getInternalResolution().x() >> (level + 1);
			
 
				-	const U32 level0Height = m_r->getInternalResolution().y() >> (level + 1);
			
 
				-	const U32 level1Width = level0Width >> 1;
			
 
				-	const U32 level1Height = level0Height >> 1;
			
 
				+		if(m_mipCount >= 5)
			
 
				+		{
			
 
				+			TextureSubresourceInfo subresource;
			
 
				+			subresource.m_firstMipmap = 4;
			
 
				+			rgraphCtx.bindImage(0, 1, m_runCtx.m_hizRt, subresource);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			// Bind something random
			
 
				+			TextureSubresourceInfo subresource;
			
 
				+			subresource.m_firstMipmap = 0;
			
 
				+			rgraphCtx.bindImage(0, 1, m_runCtx.m_hizRt, subresource);
			
 
				+		}
			
 
				 
			
 
				-	cmdb->bindShaderProgram(m_grProg);
			
 
				+		cmdb->bindStorageBuffer(0, 2, m_counterBuffer, 0, MAX_PTR_SIZE);
			
 
				+		cmdb->bindStorageBuffer(0, 3, m_clientBuffer, 0, MAX_PTR_SIZE);
			
 
				 
			
 
				-	// Uniforms
			
 
				-	struct PushConsts
			
 
				-	{
			
 
				-		UVec2 m_level0WriteImgSize;
			
 
				-		UVec2 m_level1WriteImgSize;
			
 
				-		U32 m_copyToClientLevel;
			
 
				-		U32 m_writeLevel1;
			
 
				-		U32 m_padding0;
			
 
				-		U32 m_padding1;
			
 
				-	} regs;
			
 
				-
			
 
				-	regs.m_level0WriteImgSize = UVec2(level0Width, level0Height);
			
 
				-	regs.m_level1WriteImgSize = UVec2(level1Width, level1Height);
			
 
				-	regs.m_copyToClientLevel = copyToClientLevel;
			
 
				-	regs.m_writeLevel1 = mipsToFill == MIPS_WRITTEN_PER_PASS;
			
 
				-	cmdb->setPushConstants(&regs, sizeof(regs));
			
 
				-
			
 
				-	cmdb->bindSampler(0, 0, m_r->getSamplers().m_nearestNearestClamp);
			
 
				-
			
 
				-	// Bind input texure
			
 
				-	if(level == 0)
			
 
				-	{
			
 
				-		rgraphCtx.bindTexture(0, 1, m_r->getGBuffer().getDepthRt(),
			
 
				+		cmdb->bindSampler(0, 4, (doesSamplerReduction()) ? m_reductionSampler : m_r->getSamplers().m_trilinearClamp);
			
 
				+		rgraphCtx.bindTexture(0, 5, m_r->getGBuffer().getDepthRt(),
			
 
				 							  TextureSubresourceInfo(DepthStencilAspectBit::DEPTH));
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		TextureSubresourceInfo subresource;
			
 
				-		subresource.m_firstMipmap = level - 1;
			
 
				-		rgraphCtx.bindTexture(0, 1, m_runCtx.m_hizRt, subresource);
			
 
				-	}
			
 
				-
			
 
				-	// 1st level
			
 
				-	TextureSubresourceInfo subresource;
			
 
				-	subresource.m_firstMipmap = level;
			
 
				-	rgraphCtx.bindImage(0, 2, m_runCtx.m_hizRt, subresource);
			
 
				-
			
 
				-	// 2nd level
			
 
				-	subresource.m_firstMipmap = (mipsToFill == MIPS_WRITTEN_PER_PASS) ? level + 1 : level; // Bind the next or the same
			
 
				-	rgraphCtx.bindImage(0, 3, m_runCtx.m_hizRt, subresource);
			
 
				-
			
 
				-	// Client buffer
			
 
				-	cmdb->bindStorageBuffer(0, 4, m_copyToBuff.m_buff, 0, m_copyToBuff.m_buff->getSize());
			
 
				 
			
 
				-	// Done
			
 
				-	dispatchPPCompute(cmdb, 8, 8, level0Width, level0Height);
			
 
				+		cmdb->dispatchCompute(dispatchThreadGroupCountXY[0], dispatchThreadGroupCountXY[1], 1);
			
 
				+	});
			
 
				 }
			
 
				 
			
 
				 } // end namespace anki
			
--- a/AnKi/Renderer/DepthDownscale.h
+++ b/AnKi/Renderer/DepthDownscale.h
@@ -47,20 +47,29 @@ public:
 
				 
			
 
				 	void getClientDepthMapInfo(F32*& depthValues, U32& width, U32& height) const
			
 
				 	{
			
 
				-		width = m_copyToBuff.m_lastMipWidth;
			
 
				-		height = m_copyToBuff.m_lastMipHeight;
			
 
				-		ANKI_ASSERT(m_copyToBuff.m_buffAddr);
			
 
				-		m_copyToBuff.m_buff->invalidate(0, MAX_PTR_SIZE);
			
 
				-		depthValues = static_cast<F32*>(m_copyToBuff.m_buffAddr);
			
 
				+		width = m_lastMipSize.x();
			
 
				+		height = m_lastMipSize.y();
			
 
				+		ANKI_ASSERT(m_clientBuffer);
			
 
				+		m_clientBuffer->invalidate(0, MAX_PTR_SIZE);
			
 
				+		depthValues = static_cast<F32*>(m_clientBufferAddr);
			
 
				 	}
			
 
				 
			
 
				 private:
			
 
				-	static const U32 MIPS_WRITTEN_PER_PASS = 2;
			
 
				-
			
 
				 	TexturePtr m_hizTex;
			
 
				 	Bool m_hizTexImportedOnce = false;
			
 
				+
			
 
				 	ShaderProgramResourcePtr m_prog;
			
 
				 	ShaderProgramPtr m_grProg;
			
 
				+
			
 
				+	BufferPtr m_counterBuffer;
			
 
				+	Bool m_counterBufferZeroed = false;
			
 
				+
			
 
				+	SamplerPtr m_reductionSampler;
			
 
				+
			
 
				+	BufferPtr m_clientBuffer;
			
 
				+	void* m_clientBufferAddr = nullptr;
			
 
				+
			
 
				+	UVec2 m_lastMipSize;
			
 
				 	U32 m_mipCount = 0;
			
 
				 
			
 
				 	class
			
@@ -69,17 +78,12 @@ private:
 
				 		RenderTargetHandle m_hizRt;
			
 
				 	} m_runCtx; ///< Run context.
			
 
				 
			
 
				-	class
			
 
				-	{
			
 
				-	public:
			
 
				-		BufferPtr m_buff;
			
 
				-		void* m_buffAddr = nullptr;
			
 
				-		U32 m_lastMipWidth = MAX_U32, m_lastMipHeight = MAX_U32;
			
 
				-	} m_copyToBuff; ///< Copy to buffer members.
			
 
				-
			
 
				 	ANKI_USE_RESULT Error initInternal(const ConfigSet& cfg);
			
 
				 
			
 
				-	void run(U32 mipToWrite, RenderPassWorkContext& rgraphCtx);
			
 
				+	Bool doesSamplerReduction() const
			
 
				+	{
			
 
				+		return m_reductionSampler.isCreated();
			
 
				+	}
			
 
				 };
			
 
				 /// @}
			
 
				 
			
--- a/AnKi/Resource/Stb.cpp
+++ b/AnKi/Resource/Stb.cpp
@@ -15,6 +15,7 @@
 
				 #	pragma GCC diagnostic ignored "-Wtype-limits"
			
 
				 #	pragma GCC diagnostic ignored "-Wmissing-field-initializers"
			
 
				 #	pragma GCC diagnostic ignored "-Wsign-compare"
			
 
				+#	pragma GCC diagnostic ignored "-Wunused-but-set-variable"
			
 
				 #endif
			
 
				 
			
 
				 #include <AnKi/Resource/Stb.h>
			
--- a/AnKi/Shaders/DepthDownscale.ankiprog
+++ b/AnKi/Shaders/DepthDownscale.ankiprog
@@ -4,6 +4,8 @@
 
				 // http://www.anki3d.org/LICENSE
			
 
				 
			
 
				 #pragma anki mutator SAMPLE_RESOLVE_TYPE 0 1 2 // 0: average, 1: min, 2: max
			
 
				+#pragma anki mutator WAVE_OPERATIONS 0 1
			
 
				+#pragma anki mutator REDUCTION_SAMPLER 0 1
			
 
				 #define AVG 0
			
 
				 #define MIN 1
			
 
				 #define MAX 2
			
@@ -11,95 +13,133 @@
 
				 #pragma anki start comp
			
 
				 #include <AnKi/Shaders/Common.glsl>
			
 
				 
			
 
				-layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
			
 
				+layout(local_size_x = 256) in;
			
 
				 
			
 
				-layout(push_constant, std430) uniform pc_
			
 
				+layout(push_constant, std430) uniform b_pc
			
 
				 {
			
 
				-	UVec2 u_level0WriteImgSize;
			
 
				-	UVec2 u_level1WriteImgSize;
			
 
				-	U32 u_copyToClientLevel;
			
 
				-	U32 u_writeLevel1;
			
 
				-	U32 u_padding0;
			
 
				-	U32 u_padding1;
			
 
				+	U32 u_workgroupCount;
			
 
				+	U32 u_mipmapCount;
			
 
				+	Vec2 u_srcTexSizeOverOne;
			
 
				+	U32 u_lastMipWidth;
			
 
				+	U32 u_padding[3u];
			
 
				 };
			
 
				 
			
 
				-layout(set = 0, binding = 0) uniform sampler u_nearestAnyClampSampler;
			
 
				-layout(set = 0, binding = 1) uniform texture2D u_readTex;
			
 
				-layout(set = 0, binding = 2) writeonly uniform image2D u_level0WriteImg;
			
 
				-layout(set = 0, binding = 3) writeonly uniform image2D u_level1WriteImg;
			
 
				+layout(set = 0, binding = 0) uniform image2D u_dstImages[12u];
			
 
				+layout(set = 0, binding = 1) coherent uniform image2D u_dstImage5;
			
 
				 
			
 
				-layout(std430, set = 0, binding = 4) writeonly buffer s1_
			
 
				+layout(set = 0, binding = 2) coherent buffer b_atomic
			
 
				+{
			
 
				+	U32 u_spdCounter;
			
 
				+};
			
 
				+
			
 
				+layout(std430, set = 0, binding = 3) writeonly buffer b_cb
			
 
				 {
			
 
				 	F32 u_clientBuf[];
			
 
				 };
			
 
				 
			
 
				-shared F32 s_depths[gl_WorkGroupSize.y][gl_WorkGroupSize.x];
			
 
				+layout(set = 0, binding = 4) uniform sampler u_reductionSampler; ///< Special sampler that can do min/max reduction.
			
 
				+layout(set = 0, binding = 5) uniform texture2D u_srcTex;
			
 
				 
			
 
				-// Resolve depths into one value
			
 
				-F32 resolveDepths(Vec4 depths)
			
 
				+// Include SPD
			
 
				+#define A_GPU 1
			
 
				+#define A_GLSL 1
			
 
				+#include <ThirdParty/FidelityFX/ffx_a.h>
			
 
				+
			
 
				+shared AU1 s_spdCounter;
			
 
				+shared AF1 s_spdIntermediateR[16][16];
			
 
				+
			
 
				+F32 reduce(Vec4 depths)
			
 
				 {
			
 
				 #if SAMPLE_RESOLVE_TYPE == MIN
			
 
				-	Vec2 mind2 = min(depths.xy, depths.zw);
			
 
				-	const F32 depth = min(mind2.x, mind2.y);
			
 
				+	return min(depths.x, min(depths.y, min(depths.z, depths.w)));
			
 
				 #elif SAMPLE_RESOLVE_TYPE == MAX
			
 
				-	Vec2 max2 = max(depths.xy, depths.zw);
			
 
				-	const F32 depth = max(max2.x, max2.y);
			
 
				+	return max(depths.x, max(depths.y, max(depths.z, depths.w)));
			
 
				 #elif SAMPLE_RESOLVE_TYPE == AVG
			
 
				-	const F32 depth = dot(depths, Vec4(1.0 / 4.0));
			
 
				+	return (depths.x + depths.y + depths.z + depths.w) * 0.25;
			
 
				 #else
			
 
				 #	error See file
			
 
				 #endif
			
 
				-
			
 
				-	return depth;
			
 
				 }
			
 
				 
			
 
				-void main()
			
 
				+AF4 SpdLoadSourceImage(AU2 p, AU1 slice)
			
 
				 {
			
 
				-	// Read depth
			
 
				-	const Vec2 readUv = (Vec2(gl_GlobalInvocationID.xy) + 0.5) / Vec2(u_level0WriteImgSize);
			
 
				-	Vec4 depths = textureGather(sampler2D(u_readTex, u_nearestAnyClampSampler), readUv, 0);
			
 
				+	const AF2 textureCoord = Vec2(p) * u_srcTexSizeOverOne + u_srcTexSizeOverOne;
			
 
				+#if REDUCTION_SAMPLER == 1
			
 
				+	return AF4(textureLod(u_srcTex, u_reductionSampler, textureCoord, 0.0).r, 0.0, 0.0, 0.0);
			
 
				+#else
			
 
				+	const Vec4 depths = textureGather(sampler2D(u_srcTex, u_reductionSampler), textureCoord, 0);
			
 
				+	return AF4(reduce(depths), 0.0, 0.0, 0.0);
			
 
				+#endif
			
 
				+}
			
 
				 
			
 
				-	// Resolve & store the 1st level
			
 
				-	F32 depth = resolveDepths(depths);
			
 
				-	s_depths[gl_LocalInvocationID.y][gl_LocalInvocationID.x] = depth;
			
 
				+AF4 SpdLoad(AU2 p, AU1 slice)
			
 
				+{
			
 
				+	return AF4(imageLoad(u_dstImage5, IVec2(p)).r, 0.0, 0.0, 0.0);
			
 
				+}
			
 
				 
			
 
				-	if(all(lessThan(gl_GlobalInvocationID.xy, u_level0WriteImgSize)))
			
 
				+void SpdStore(AU2 p, AF4 value, AU1 mip, AU1 slice)
			
 
				+{
			
 
				+	if(mip == 5u)
			
 
				 	{
			
 
				-		imageStore(u_level0WriteImg, IVec2(gl_GlobalInvocationID.xy), Vec4(depth));
			
 
				-
			
 
				-		if(u_copyToClientLevel == 0u)
			
 
				-		{
			
 
				-			const U32 idx = gl_GlobalInvocationID.y * u_level0WriteImgSize.x + gl_GlobalInvocationID.x;
			
 
				-			u_clientBuf[idx] = depth;
			
 
				-		}
			
 
				+		imageStore(u_dstImage5, IVec2(p), Vec4(value.x, 0.0, 0.0, 0.0));
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		imageStore(u_dstImages[mip], IVec2(p), Vec4(value.x, 0.0, 0.0, 0.0));
			
 
				 	}
			
 
				 
			
 
				-	// Sync
			
 
				-	memoryBarrierShared();
			
 
				-	barrier();
			
 
				-
			
 
				-	// Resolve 2nd level
			
 
				-	if(u_writeLevel1 == 1u && all(equal(gl_LocalInvocationID.xy & UVec2(1u), UVec2(0u))))
			
 
				+	// Store the last mip to the buffer as well
			
 
				+	if(mip == u_mipmapCount - 1u)
			
 
				 	{
			
 
				-		depths.x = depth;
			
 
				-		depths.y = s_depths[gl_LocalInvocationID.y + 0u][gl_LocalInvocationID.x + 1u];
			
 
				-		depths.z = s_depths[gl_LocalInvocationID.y + 1u][gl_LocalInvocationID.x + 1u];
			
 
				-		depths.w = s_depths[gl_LocalInvocationID.y + 1u][gl_LocalInvocationID.x + 0u];
			
 
				-
			
 
				-		depth = resolveDepths(depths);
			
 
				-
			
 
				-		const UVec2 writeUv = gl_GlobalInvocationID.xy >> 1u;
			
 
				-		if(all(lessThan(writeUv, u_level1WriteImgSize)))
			
 
				-		{
			
 
				-			imageStore(u_level1WriteImg, IVec2(writeUv), Vec4(depth));
			
 
				-
			
 
				-			if(u_copyToClientLevel == 1u)
			
 
				-			{
			
 
				-				const U32 idx = writeUv.y * u_level1WriteImgSize.x + writeUv.x;
			
 
				-				u_clientBuf[idx] = depth;
			
 
				-			}
			
 
				-		}
			
 
				+		const U32 idx = p.y * u_lastMipWidth + p.x;
			
 
				+		u_clientBuf[idx] = value.x;
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				+void SpdIncreaseAtomicCounter(AU1 slice)
			
 
				+{
			
 
				+	s_spdCounter = atomicAdd(u_spdCounter, 1u);
			
 
				+}
			
 
				+
			
 
				+AU1 SpdGetAtomicCounter()
			
 
				+{
			
 
				+	return s_spdCounter;
			
 
				+}
			
 
				+
			
 
				+void SpdResetAtomicCounter(AU1 slice)
			
 
				+{
			
 
				+	u_spdCounter = 0u;
			
 
				+}
			
 
				+
			
 
				+AF4 SpdLoadIntermediate(AU1 x, AU1 y)
			
 
				+{
			
 
				+	return AF4(s_spdIntermediateR[x][y], 0.0, 0.0, 0.0);
			
 
				+}
			
 
				+
			
 
				+void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value)
			
 
				+{
			
 
				+	s_spdIntermediateR[x][y] = value.x;
			
 
				+}
			
 
				+
			
 
				+AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3)
			
 
				+{
			
 
				+	return AF4(reduce(Vec4(v0.x, v1.x, v2.x, v3.x)), 0.0, 0.0, 0.0);
			
 
				+}
			
 
				+
			
 
				+#define SPD_LINEAR_SAMPLER 1
			
 
				+
			
 
				+#if WAVE_OPERATIONS == 0
			
 
				+#	define SPD_NO_WAVE_OPERATIONS 1
			
 
				+#endif
			
 
				+
			
 
				+#include <ThirdParty/FidelityFX/ffx_spd.h>
			
 
				+
			
 
				+void main()
			
 
				+{
			
 
				+	const U32 slice = 0u;
			
 
				+	const UVec2 offset = UVec2(0u);
			
 
				+	SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), AU1(u_mipmapCount), AU1(u_workgroupCount),
			
 
				+				  slice, offset);
			
 
				+}
			
 
				+
			
 
				 #pragma anki end
			
--- a/ThirdParty/FidelityFX/ffx_spd.h
+++ b/ThirdParty/FidelityFX/ffx_spd.h
@@ -0,0 +1,1248 @@
 
				+//_____________________________________________________________/\_______________________________________________________________
			
 
				+//==============================================================================================================================
			
 
				+//
			
 
				+//                                         [FFX SPD] Single Pass Downsampler 2.0
			
 
				+//
			
 
				+//==============================================================================================================================
			
 
				+// LICENSE
			
 
				+// =======
			
 
				+// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved.
			
 
				+// -------
			
 
				+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
			
 
				+// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
			
 
				+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
			
 
				+// permit persons to whom the Software is furnished to do so, subject to the following conditions:
			
 
				+// -------
			
 
				+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
			
 
				+// Software.
			
 
				+// -------
			
 
				+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
			
 
				+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS
			
 
				+// OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
			
 
				+// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
			
 
				+//
			
 
				+//------------------------------------------------------------------------------------------------------------------------------
			
 
				+// CHANGELIST v2.0
			
 
				+// ===============
			
 
				+// - Added support for cube and array textures. SpdDownsample and SpdDownsampleH shader functions now take the
			
 
				+//   index of the texture slice as an additional parameter. For a regular texture use 0.
			
 
				+// - Added support for updating only a sub-rectangle of the texture. An additional, optional parameter
			
 
				+//   workGroupOffset was added to the shader functions SpdDownsample and SpdDownsampleH.
			
 
				+// - Added C function SpdSetup that helps to setup constants to be passed as a constant buffer.
			
 
				+// - The global atomic counter is automatically reset to 0 by the shader at the end, so you do not need to clear it
			
 
				+//   before every use, just once after creation.
			
 
				+//
			
 
				+//------------------------------------------------------------------------------------------------------------------------------
			
 
				+// INTEGRATION SUMMARY FOR CPU
			
 
				+// ===========================
			
 
				+// // you need to provide as constants:
			
 
				+// // number of mip levels to be computed (maximum is 12)
			
 
				+// // number of total thread groups: ((widthInPixels+63)>>6) * ((heightInPixels+63)>>6)
			
 
				+// // workGroupOffset -> by default 0, if you only downsample a rectangle within the source texture use the SpdSetup
			
 
				+// // function to calculate the correct offset
			
 
				+// ...
			
 
				+// // Dispatch the shader such that each thread group works on a 64x64 sub-tile of the source image
			
 
				+// // for Cube Textures or Texture2DArray, use the z dimension
			
 
				+// vkCmdDispatch(cmdBuf,(widthInPixels+63)>>6,(heightInPixels+63)>>6, slices);
			
 
				+
			
 
				+// // you can also use the SpdSetup function:
			
 
				+// //on top of your cpp file:
			
 
				+// #define A_CPU
			
 
				+// #include "ffx_a.h"
			
 
				+// #include "ffx_spd.h"
			
 
				+// // before your dispatch call, use SpdSetup function to get your constants
			
 
				+// varAU2(dispatchThreadGroupCountXY); // output variable
			
 
				+// varAU2(workGroupOffset);  // output variable, these constants are required if Left and Top are not 0,0
			
 
				+// varAU2(numWorkGroupsAndMips); // output variable
			
 
				+// // input information about your source texture:
			
 
				+// // left and top of the rectangle within your texture you want to downsample
			
 
				+// // width and height of the rectangle you want to downsample
			
 
				+// // if complete source texture should get downsampled: left = 0, top = 0, width = sourceTexture.width,
			
 
				+// // height = sourceTexture.height
			
 
				+// varAU4(rectInfo) = initAU4(0, 0, m_Texture.GetWidth(), m_Texture.GetHeight()); // left, top, width, height
			
 
				+// SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo);
			
 
				+// ...
			
 
				+// // constants:
			
 
				+// data.numWorkGroupsPerSlice = numWorkGroupsAndMips[0];
			
 
				+// data.mips = numWorkGroupsAndMips[1];
			
 
				+// data.workGroupOffset[0] = workGroupOffset[0];
			
 
				+// data.workGroupOffset[1] = workGroupOffset[1];
			
 
				+// ...
			
 
				+// uint32_t dispatchX = dispatchThreadGroupCountXY[0];
			
 
				+// uint32_t dispatchY = dispatchThreadGroupCountXY[1];
			
 
				+// uint32_t dispatchZ = m_CubeTexture.GetArraySize(); // slices - for 2D Texture this is 1, for cube texture 6
			
 
				+// vkCmdDispatch(cmd_buf, dispatchX, dispatchY, dispatchZ);
			
 
				+
			
 
				+//------------------------------------------------------------------------------------------------------------------------------
			
 
				+// INTEGRATION SUMMARY FOR GPU
			
 
				+// ===========================
			
 
				+
			
 
				+// [SAMPLER] - if you want to use a sampler with linear filtering for loading the source image
			
 
				+// follow additionally the instructions marked with [SAMPLER]
			
 
				+// add following define:
			
 
				+// #define SPD_LINEAR_SAMPLER
			
 
				+// this is recommended, as using one sample() with linear filter to reduce 2x2 is faster
			
 
				+// than 4x load() plus manual averaging
			
 
				+
			
 
				+// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
			
 
				+// // Note: If you use SRGB format for UAV load() and store() (if it's supported), you need to convert to and from
			
 
				+// linear space
			
 
				+// // when using UAV load() and store()
			
 
				+// // approximate conversion to linear (load function): x*x
			
 
				+// // approximate conversion from linear (store function): sqrt()
			
 
				+// // or use more accurate functions from ffx_a.h: AFromSrgbF1(value) and AToSrgbF1(value)
			
 
				+// // Recommendation: use UNORM format instead of SRGB for UAV access, and SRGB for SRV access
			
 
				+// // look in the sample app to see how it's done
			
 
				+
			
 
				+// // source image
			
 
				+// // if cube texture use image2DArray / Texture2DArray and adapt your load/store/sample calls
			
 
				+// GLSL: layout(set=0,binding=0,rgba16f)uniform image2D imgSrc;
			
 
				+// [SAMPLER]: layout(set=0,binding=0)uniform texture2D imgSrc;
			
 
				+// HLSL: [[vk::binding(0)]] Texture2D<float4> imgSrc :register(u0);
			
 
				+
			
 
				+// // destination -> 12 is the maximum number of mips supported by SPD
			
 
				+// GLSL: layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12];
			
 
				+// HLSL: [[vk::binding(1)]] globallycoherent RWTexture2D<float4> imgDst[12] :register(u1);
			
 
				+
			
 
				+// // global atomic counter - MUST be initialized to 0
			
 
				+// // SPD resets the counter back after each run by calling SpdResetAtomicCounter(slice)
			
 
				+// // if you have more than 1 slice (== if you downsample a cube texture or a texture2Darray)
			
 
				+// // you have an array of counters: counter[6] -> if you have 6 slices for example
			
 
				+// // GLSL:
			
 
				+// layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer
			
 
				+// {
			
 
				+//    uint counter;
			
 
				+// } spdGlobalAtomic;
			
 
				+// // HLSL:
			
 
				+// struct SpdGlobalAtomicBuffer
			
 
				+// {
			
 
				+//    uint counter;
			
 
				+// };
			
 
				+// [[vk::binding(2)]] globallycoherent RWStructuredBuffer<SpdGlobalAtomicBuffer> spdGlobalAtomic;
			
 
				+
			
 
				+// // [SAMPLER] add sampler
			
 
				+// GLSL: layout(set=0, binding=3) uniform sampler srcSampler;
			
 
				+// HLSL: [[vk::binding(3)]] SamplerState srcSampler :register(s0);
			
 
				+
			
 
				+// // constants - either push constant or constant buffer
			
 
				+// // or calculate within shader
			
 
				+// // [SAMPLER] when using sampler add inverse source image size
			
 
				+// // GLSL:
			
 
				+// layout(push_constant) uniform SpdConstants {
			
 
				+//    uint mips; // needed to opt out earlier if mips are < 12
			
 
				+//    uint numWorkGroups; // number of total thread groups, so numWorkGroupsX * numWorkGroupsY * 1
			
 
				+//                        // it is important to NOT take the number of slices (z dimension) into account here
			
 
				+//                        // as each slice has its own counter!
			
 
				+//    vec2 workGroupOffset; // optional - use SpdSetup() function to calculate correct workgroup offset
			
 
				+// } spdConstants;
			
 
				+// // HLSL:
			
 
				+// [[vk::push_constant]]
			
 
				+// cbuffer spdConstants {
			
 
				+//    uint mips;
			
 
				+//    uint numWorkGroups;
			
 
				+//    float2 workGroupOffset; // optional
			
 
				+// };
			
 
				+
			
 
				+// ...
			
 
				+// // Setup pre-portability-header defines (sets up GLSL/HLSL path, etc)
			
 
				+// #define A_GPU 1
			
 
				+// #define A_GLSL 1 // or // #define A_HLSL 1
			
 
				+
			
 
				+// // if you want to use PACKED version
			
 
				+// // recommended if bpc <= 16bit
			
 
				+// #define A_HALF
			
 
				+
			
 
				+// ...
			
 
				+// // Include the portability header (or copy it in without an include).
			
 
				+// #include "ffx_a.h"
			
 
				+// ...
			
 
				+
			
 
				+// // Define LDS variables
			
 
				+// shared AF4 spdIntermediate[16][16]; // HLSL: groupshared
			
 
				+// shared AU1 spdCounter; // HLSL: groupshared
			
 
				+// // PACKED version
			
 
				+// shared AH4 spdIntermediate[16][16]; // HLSL: groupshared
			
 
				+// // Note: You can also use
			
 
				+// shared AF1 spdIntermediateR[16][16];
			
 
				+// shared AF1 spdIntermediateG[16][16];
			
 
				+// shared AF1 spdIntermediateB[16][16];
			
 
				+// shared AF1 spdIntermediateA[16][16];
			
 
				+// // or for Packed version:
			
 
				+// shared AH2 spdIntermediateRG[16][16];
			
 
				+// shared AH2 spdIntermediateBA[16][16];
			
 
				+// // This is potentially faster
			
 
				+// // Adapt your load and store functions accordingly
			
 
				+
			
 
				+// // if subgroup operations are not supported / can't use SM6.0
			
 
				+// #define SPD_NO_WAVE_OPERATIONS
			
 
				+
			
 
				+// // Define the fetch function(s) and the reduction function
			
 
				+// // if non-power-of-2 textures, add border controls to the load and store functions
			
 
				+// // to make sure the borders of the mip level look as you want it
			
 
				+// // if you don't add border controls you'll read zeros past the border
			
 
				+// // if you load with a sampler, this is obv. handled by your sampler :)
			
 
				+// // this is also the place where you need to do color space transformation if needed
			
 
				+// // E.g. if your texture format is SRGB/UNORM and you use the UAV load and store functions
			
 
				+// // no automatic to/from linear conversions are happening
			
 
				+// // there is to/from linear conversions when using a sampler and render target approach
			
 
				+// // conversion to linear (load function): x*x
			
 
				+// // conversion from linear (store function): sqrt()
			
 
				+
			
 
				+// AU1 slice parameter is for Cube textures and texture2DArray
			
 
				+// if downsampling Texture2D you can ignore this parameter, otherwise use it to access correct slice
			
 
				+// // Load from source image
			
 
				+// GLSL: AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, p);}
			
 
				+// HLSL: AF4 SpdLoadSourceImage(ASU2 tex, AU1 slice){return imgSrc[tex];}
			
 
				+// [SAMPLER] don't forget to add the define #SPD_LINEAR_SAMPLER :)
			
 
				+// GLSL:
			
 
				+// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
			
 
				+//    AF2 textureCoord = p * invInputSize + invInputSize;
			
 
				+//    return texture(sampler2D(imgSrc, srcSampler), textureCoord);
			
 
				+// }
			
 
				+// HLSL:
			
 
				+// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){
			
 
				+//    AF2 textureCoord = p * invInputSize + invInputSize;
			
 
				+//    return imgSrc.SampleLevel(srcSampler, textureCoord, 0);
			
 
				+// }
			
 
				+
			
 
				+// // SpdLoad() takes a 32-bit signed integer 2D coordinate and loads color.
			
 
				+// // Loads the 5th mip level, each value is computed by a different thread group
			
 
				+// // last thread group will access all its elements and compute the subsequent mips
			
 
				+// // reminder: if non-power-of-2 textures, add border controls if you do not want to read zeros past the border
			
 
				+// GLSL: AF4 SpdLoad(ASU2 p, AU1 slice){return imageLoad(imgDst[5],p);}
			
 
				+// HLSL: AF4 SpdLoad(ASU2 tex, AU1 slice){return imgDst[5][tex];}
			
 
				+
			
 
				+// Define the store function
			
 
				+// GLSL: void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, value);}
			
 
				+// HLSL: void SpdStore(ASU2 pix, AF4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = value;}
			
 
				+
			
 
				+// // Define the atomic counter increase function
			
 
				+// // each slice only reads and stores to its specific slice counter
			
 
				+// // so, if you have several slices it's
			
 
				+// // InterlockedAdd(spdGlobalAtomic[0].counter[slice], 1, spdCounter);
			
 
				+// // GLSL:
			
 
				+// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
			
 
				+// AU1 SpdGetAtomicCounter() {return spdCounter;}
			
 
				+// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter[slice] = 0;}
			
 
				+// // HLSL:
			
 
				+// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);}
			
 
				+// AU1 SpdGetAtomicCounter(){return spdCounter;}
			
 
				+// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter[slice] = 0;}
			
 
				+
			
 
				+// // Define the LDS load and store functions
			
 
				+// // GLSL:
			
 
				+// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
			
 
				+// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}
			
 
				+// // HLSL:
			
 
				+// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];}
			
 
				+// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;}
			
 
				+
			
 
				+// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value
			
 
				+// Example below: computes the average value
			
 
				+// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;}
			
 
				+
			
 
				+// // PACKED VERSION
			
 
				+// Load from source image
			
 
				+// GLSL: AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){return AH4(imageLoad(imgSrc, p));}
			
 
				+// HLSL: AH4 SpdLoadSourceImageH(ASU2 tex, AU1 slice){return AH4(imgSrc[tex]);}
			
 
				+// [SAMPLER]
			
 
				+// GLSL:
			
 
				+// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){
			
 
				+//    AF2 textureCoord = p * invInputSize + invInputSize;
			
 
				+//    return AH4(texture(sampler2D(imgSrc, srcSampler), textureCoord));
			
 
				+// }
			
 
				+// HLSL:
			
 
				+// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){
			
 
				+//    AF2 textureCoord = p * invInputSize + invInputSize;
			
 
				+//    return AH4(imgSrc.SampleLevel(srcSampler, textureCoord, 0));
			
 
				+// }
			
 
				+
			
 
				+// // SpdLoadH() takes a 32-bit signed integer 2D coordinate and loads color.
			
 
				+// // Loads the 5th mip level, each value is computed by a different thread group
			
 
				+// // last thread group will access all its elements and compute the subsequent mips
			
 
				+// GLSL: AH4 SpdLoadH(ASU2 p, AU1 slice){return AH4(imageLoad(imgDst[5],p));}
			
 
				+// HLSL: AH4 SpdLoadH(ASU2 tex, AU1 slice){return AH4(imgDst[5][tex]);}
			
 
				+
			
 
				+// Define the store function
			
 
				+// GLSL: void SpdStoreH(ASU2 p, AH4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, AF4(value));}
			
 
				+// HLSL: void SpdStoreH(ASU2 pix, AH4 value, AU1 index, AU1 slice){imgDst[index][pix] = AF4(value);}
			
 
				+
			
 
				+// // Define the atomic counter increase function
			
 
				+// // GLSL:
			
 
				+// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);}
			
 
				+// AU1 SpdGetAtomicCounter() {return spdCounter;}
			
 
				+// // HLSL:
			
 
				+// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);}
			
 
				+// AU1 SpdGetAtomicCounter(){return spdCounter;}
			
 
				+
			
 
				+// // Define the LDS load and store functions
			
 
				+// // GLSL:
			
 
				+// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];}
			
 
				+// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;}
			
 
				+// // HLSL:
			
 
				+// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];}
			
 
				+// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;}
			
 
				+
			
 
				+// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value
			
 
				+// Example below: computes the average value
			
 
				+// AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3){return (v0+v1+v2+v3)*AH1(0.25);}
			
 
				+
			
 
				+// //
			
 
				+
			
 
				+// // If you only use PACKED version
			
 
				+// #define SPD_PACKED_ONLY
			
 
				+
			
 
				+// // Include this SPD (single pass downsampler) header file (or copy it in without an include).
			
 
				+// #include "ffx_spd.h"
			
 
				+// ...
			
 
				+
			
 
				+// // Example in shader integration
			
 
				+// // GLSL:
			
 
				+// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
			
 
				+// void main(){
			
 
				+//  // Call the downsampling function
			
 
				+// // gl_WorkGroupID.z should be 0 if you only downsample a Texture2D!
			
 
				+//  SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
			
 
				+//    AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z));
			
 
				+//
			
 
				+// // PACKED:
			
 
				+//  SpdDownsampleH(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex),
			
 
				+//    AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(gl_WorkGroupID.z));
			
 
				+// ...
			
 
				+// // HLSL:
			
 
				+// [numthreads(256,1,1)]
			
 
				+// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex) {
			
 
				+//  SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex),
			
 
				+//    AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z));
			
 
				+//
			
 
				+// // PACKED:
			
 
				+//  SpdDownsampleH(AU2(WorkGroupId.xy), AU1(LocalThreadIndex),
			
 
				+//    AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z));
			
 
				+// ...
			
 
				+
			
 
				+//
			
 
				+//------------------------------------------------------------------------------------------------------------------------------
			
 
				+
			
 
				+//==============================================================================================================================
			
 
				+//                                                     SPD Setup
			
 
				+//==============================================================================================================================
			
 
				+#ifdef A_CPU
			
 
				+A_STATIC void SpdSetup(outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
			
 
				+					   outAU2 workGroupOffset, // GPU side: pass in as constant
			
 
				+					   outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant
			
 
				+					   inAU4 rectInfo, // left, top, width, height
			
 
				+					   ASU1 mips // optional: if -1, calculate based on rect width and height
			
 
				+)
			
 
				+{ // Fills the three outputs from rectInfo; all tile maths below is in units of 64 texels
			
 
				+	workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
			
 
				+	workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top
			
 
				+
			
 
				+	AU1 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
			
 
				+	AU1 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height
			
 
				+
			
 
				+	dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0]; // tile columns from first to last, inclusive
			
 
				+	dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1]; // tile rows from first to last, inclusive
			
 
				+
			
 
				+	numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]); // total workgroups = columns * rows
			
 
				+
			
 
				+	if(mips >= 0) // a non-negative value is taken as an explicit mip count
			
 
				+	{
			
 
				+		numWorkGroupsAndMips[1] = AU1(mips);
			
 
				+	}
			
 
				+	else
			
 
				+	{ // calculate based on rect width and height
			
 
				+		AU1 resolution = AMaxU1(rectInfo[2], rectInfo[3]);
			
 
				+		numWorkGroupsAndMips[1] = AU1((AMinF1(AFloorF1(ALog2F1(AF1(resolution))), AF1(12)))); // presumably min(floor(log2(resolution)), 12); helpers live in ffx_a.h - TODO confirm
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+A_STATIC void SpdSetup(outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
			
 
				+					   outAU2 workGroupOffset, // GPU side: pass in as constant
			
 
				+					   outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant
			
 
				+					   inAU4 rectInfo // left, top, width, height
			
 
				+)
			
 
				+{ // Convenience overload: forwards to the five-argument SpdSetup
			
 
				+	SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1); // -1 takes the branch that derives the mip count from the rect size
			
 
				+}
			
 
				+#endif // #ifdef A_CPU
			
 
				+//==============================================================================================================================
			
 
				+//                                                     NON-PACKED VERSION
			
 
				+//==============================================================================================================================
			
 
				+#ifdef A_GPU
			
 
				+#	ifdef SPD_PACKED_ONLY
			
 
				+// Avoid compiler error
			
 
				+AF4 SpdLoadSourceImage(ASU2 p, AU1 slice) // SPD_PACKED_ONLY stub: ignores its arguments
			
 
				+{
			
 
				+	return AF4(0.0, 0.0, 0.0, 0.0);
			
 
				+}
			
 
				+AF4 SpdLoad(ASU2 p, AU1 slice) // SPD_PACKED_ONLY stub: ignores its arguments
			
 
				+{
			
 
				+	return AF4(0.0, 0.0, 0.0, 0.0);
			
 
				+}
			
 
				+void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice) // SPD_PACKED_ONLY stub: intentionally empty
			
 
				+{
			
 
				+}
			
 
				+AF4 SpdLoadIntermediate(AU1 x, AU1 y) // SPD_PACKED_ONLY stub: ignores its arguments
			
 
				+{
			
 
				+	return AF4(0.0, 0.0, 0.0, 0.0);
			
 
				+}
			
 
				+void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) // SPD_PACKED_ONLY stub: intentionally empty
			
 
				+{
			
 
				+}
			
 
				+AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) // SPD_PACKED_ONLY stub: ignores its arguments
			
 
				+{
			
 
				+	return AF4(0.0, 0.0, 0.0, 0.0);
			
 
				+}
			
 
				+#	endif // #ifdef SPD_PACKED_ONLY
			
 
				+
			
 
				+//_____________________________________________________________/\_______________________________________________________________
			
 
				+#	if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
			
 
				+#		extension GL_KHR_shader_subgroup_quad : require
			
 
				+#	endif
			
 
				+
			
 
				+void SpdWorkgroupShuffleBarrier() // Issues the barrier call that matches the shading language being compiled
			
 
				+{
			
 
				+#	ifdef A_GLSL
			
 
				+	barrier();
			
 
				+#	endif
			
 
				+#	ifdef A_HLSL
			
 
				+	GroupMemoryBarrierWithGroupSync();
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+// Only last active workgroup should proceed
			
 
				+bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex, AU1 slice) // true unless the counter read equals numWorkGroups - 1
			
 
				+{
			
 
				+	// global atomic counter
			
 
				+	if(localInvocationIndex == 0u) // a single invocation per workgroup bumps the counter
			
 
				+	{
			
 
				+		SpdIncreaseAtomicCounter(slice);
			
 
				+	}
			
 
				+	SpdWorkgroupShuffleBarrier(); // every invocation waits here before the counter value is read
			
 
				+	return (SpdGetAtomicCounter() != (numWorkGroups - 1u));
			
 
				+}
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+// User defined: AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3);
			
 
				+
			
 
				+AF4 SpdReduceQuad(AF4 v) // Combines v with values from three other lanes via SpdReduce4 when wave operations are compiled in
			
 
				+{
			
 
				+#	if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
			
 
				+	AF4 v0 = v;
			
 
				+	AF4 v1 = subgroupQuadSwapHorizontal(v);
			
 
				+	AF4 v2 = subgroupQuadSwapVertical(v);
			
 
				+	AF4 v3 = subgroupQuadSwapDiagonal(v);
			
 
				+	return SpdReduce4(v0, v1, v2, v3);
			
 
				+#	elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
			
 
				+	// requires SM6.0
			
 
				+	AU1 quad = WaveGetLaneIndex() & (~0x3); // lane index with the low two bits cleared
			
 
				+	AF4 v0 = v; // NOTE(review): v1..v3 come from fixed lanes quad|1..3, so only the lane equal to quad combines four distinct values
			
 
				+	AF4 v1 = WaveReadLaneAt(v, quad | 1);
			
 
				+	AF4 v2 = WaveReadLaneAt(v, quad | 2);
			
 
				+	AF4 v3 = WaveReadLaneAt(v, quad | 3);
			
 
				+	return SpdReduce4(v0, v1, v2, v3);
			
 
				+/*
			
 
				+// if SM6.0 is not available, you can use the AMD shader intrinsics
			
 
				+// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
			
 
				+// https://gpuopen.com/amd-gpu-services-ags-library/
			
 
				+// works for DX11
			
 
				+AF4 v0 = v;
			
 
				+AF4 v1;
			
 
				+v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+AF4 v2;
			
 
				+v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+AF4 v3;
			
 
				+v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+return SpdReduce4(v0, v1, v2, v3);
			
 
				+*/
			
 
				+#	endif
			
 
				+	return v; // only reached when neither wave-operation branch above is compiled in
			
 
				+}
			
 
				+
			
 
				+AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3) // Loads four entries via SpdLoadIntermediate and combines them with SpdReduce4
			
 
				+{
			
 
				+	AF4 v0 = SpdLoadIntermediate(i0.x, i0.y);
			
 
				+	AF4 v1 = SpdLoadIntermediate(i1.x, i1.y);
			
 
				+	AF4 v2 = SpdLoadIntermediate(i2.x, i2.y);
			
 
				+	AF4 v3 = SpdLoadIntermediate(i3.x, i3.y);
			
 
				+	return SpdReduce4(v0, v1, v2, v3);
			
 
				+}
			
 
				+
			
 
				+AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) // Loads four values via SpdLoad and combines them with SpdReduce4
			
 
				+{
			
 
				+	AF4 v0 = SpdLoad(i0, slice);
			
 
				+	AF4 v1 = SpdLoad(i1, slice);
			
 
				+	AF4 v2 = SpdLoad(i2, slice);
			
 
				+	AF4 v3 = SpdLoad(i3, slice);
			
 
				+	return SpdReduce4(v0, v1, v2, v3);
			
 
				+}
			
 
				+
			
 
				+AF4 SpdReduceLoad4(AU2 base, AU1 slice) // Overload: reduces the four coordinates base + {(0,0), (0,1), (1,0), (1,1)}
			
 
				+{
			
 
				+	return SpdReduceLoad4(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)), AU2(base + AU2(1, 1)),
			
 
				+						  slice);
			
 
				+}
			
 
				+
			
 
				+AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) // Loads four values via SpdLoadSourceImage and combines them with SpdReduce4
			
 
				+{
			
 
				+	AF4 v0 = SpdLoadSourceImage(i0, slice);
			
 
				+	AF4 v1 = SpdLoadSourceImage(i1, slice);
			
 
				+	AF4 v2 = SpdLoadSourceImage(i2, slice);
			
 
				+	AF4 v3 = SpdLoadSourceImage(i3, slice);
			
 
				+	return SpdReduce4(v0, v1, v2, v3);
			
 
				+}
			
 
				+
			
 
				+AF4 SpdReduceLoadSourceImage(AU2 base, AU1 slice) // Produces one reduced value for the source footprint that starts at base
			
 
				+{
			
 
				+#	ifdef SPD_LINEAR_SAMPLER
			
 
				+	return SpdLoadSourceImage(base, slice); // a single load stands in for the four-load reduce below
			
 
				+#	else
			
 
				+	return SpdReduceLoadSourceImage4(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)),
			
 
				+									 AU2(base + AU2(1, 1)), slice);
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) // Stores mip index 0, then mip index 1 via SpdReduceQuad
			
 
				+{
			
 
				+	AF4 v[4]; // one value per stored texel; each invocation handles four of them
			
 
				+
			
 
				+	AU2 tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u); // source coordinate inside this workgroup's 64-texel-wide region
			
 
				+	AU2 pix = AU2(workGroupID.xy * 32u) + AU2(x, y); // destination coordinate at half that scale
			
 
				+	v[0] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[0], 0u, slice);
			
 
				+
			
 
				+	tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u); // same again, shifted 32 source texels / 16 destination texels in x
			
 
				+	pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y);
			
 
				+	v[1] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[1], 0u, slice);
			
 
				+
			
 
				+	tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u + 32u); // shifted in y
			
 
				+	pix = AU2(workGroupID.xy * 32u) + AU2(x, y + 16u);
			
 
				+	v[2] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[2], 0u, slice);
			
 
				+
			
 
				+	tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u + 32u); // shifted in both x and y
			
 
				+	pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y + 16u);
			
 
				+	v[3] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[3], 0u, slice);
			
 
				+
			
 
				+	if(mip <= 1u) // nothing past the mip-index-0 stores is wanted
			
 
				+		return;
			
 
				+
			
 
				+	v[0] = SpdReduceQuad(v[0]);
			
 
				+	v[1] = SpdReduceQuad(v[1]);
			
 
				+	v[2] = SpdReduceQuad(v[2]);
			
 
				+	v[3] = SpdReduceQuad(v[3]);
			
 
				+
			
 
				+	if((localInvocationIndex % 4u) == 0u) // one invocation in every four stores mip index 1 and the intermediate values
			
 
				+	{
			
 
				+		SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u, y / 2u), v[0], 1u, slice);
			
 
				+		SpdStoreIntermediate(x / 2u, y / 2u, v[0]);
			
 
				+
			
 
				+		SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u + 8u, y / 2u), v[1], 1u, slice);
			
 
				+		SpdStoreIntermediate(x / 2u + 8u, y / 2u, v[1]);
			
 
				+
			
 
				+		SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u, y / 2u + 8u), v[2], 1u, slice);
			
 
				+		SpdStoreIntermediate(x / 2u, y / 2u + 8u, v[2]);
			
 
				+
			
 
				+		SpdStore(AU2(workGroupID.xy * 16u) + AU2(x / 2u + 8u, y / 2u + 8u), v[3], 1u, slice);
			
 
				+		SpdStoreIntermediate(x / 2u + 8u, y / 2u + 8u, v[3]);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) // Stores mip index 0, then mip index 1 via the intermediate store/load callbacks
			
 
				+{
			
 
				+	AF4 v[4]; // one value per stored texel; each invocation handles four of them
			
 
				+
			
 
				+	AU2 tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u); // source coordinate inside this workgroup's 64-texel-wide region
			
 
				+	AU2 pix = AU2(workGroupID.xy * 32u) + AU2(x, y); // destination coordinate at half that scale
			
 
				+	v[0] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[0], 0u, slice);
			
 
				+
			
 
				+	tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u); // same again, shifted 32 source texels / 16 destination texels in x
			
 
				+	pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y);
			
 
				+	v[1] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[1], 0u, slice);
			
 
				+
			
 
				+	tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u, y * 2u + 32u); // shifted in y
			
 
				+	pix = AU2(workGroupID.xy * 32u) + AU2(x, y + 16u);
			
 
				+	v[2] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[2], 0u, slice);
			
 
				+
			
 
				+	tex = AU2(workGroupID.xy * 64u) + AU2(x * 2u + 32u, y * 2u + 32u); // shifted in both x and y
			
 
				+	pix = AU2(workGroupID.xy * 32u) + AU2(x + 16u, y + 16u);
			
 
				+	v[3] = SpdReduceLoadSourceImage(tex, slice);
			
 
				+	SpdStore(pix, v[3], 0u, slice);
			
 
				+
			
 
				+	if(mip <= 1u) // nothing past the mip-index-0 stores is wanted
			
 
				+		return;
			
 
				+
			
 
				+	for(AU1 i = 0u; i < 4u; i++) // one pass per entry of v
			
 
				+	{
			
 
				+		SpdStoreIntermediate(x, y, v[i]); // every invocation publishes its value for pass i
			
 
				+		SpdWorkgroupShuffleBarrier();
			
 
				+		if(localInvocationIndex < 64u) // only the first 64 invocations reduce and store
			
 
				+		{
			
 
				+			v[i] = SpdReduceIntermediate(AU2(x * 2u + 0u, y * 2u + 0u), AU2(x * 2u + 1u, y * 2u + 0u),
			
 
				+										 AU2(x * 2u + 0u, y * 2u + 1u), AU2(x * 2u + 1u, y * 2u + 1u));
			
 
				+			SpdStore(AU2(workGroupID.xy * 16u) + AU2(x + (i % 2u) * 8u, y + (i / 2u) * 8u), v[i], 1u, slice); // i picks the 8-texel x/y offset of the destination
			
 
				+		}
			
 
				+		SpdWorkgroupShuffleBarrier(); // barrier before the next pass overwrites the intermediate values
			
 
				+	}
			
 
				+
			
 
				+	if(localInvocationIndex < 64u) // the same 64 invocations write their four reduced values back at offsets 0 and 8
			
 
				+	{
			
 
				+		SpdStoreIntermediate(x + 0u, y + 0u, v[0]);
			
 
				+		SpdStoreIntermediate(x + 8u, y + 0u, v[1]);
			
 
				+		SpdStoreIntermediate(x + 0u, y + 8u, v[2]);
			
 
				+		SpdStoreIntermediate(x + 8u, y + 8u, v[3]);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Compile-time dispatch for mips 0 and 1: SPD_NO_WAVE_OPERATIONS selects the _LDS variant, otherwise the _Intrinsics one.
			
 
				+#	ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
			
 
				+#	else
			
 
				+	SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Reduces four intermediate values, stores the result at level `mip` and republishes it to intermediate storage.
			
 
				+#	ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 64u) // only the first 64 invocations take part in this path
			
 
				+	{
			
 
				+		AF4 v = SpdReduceIntermediate(AU2(x * 2u + 0u, y * 2u + 0u), AU2(x * 2u + 1u, y * 2u + 0u),
			
 
				+									  AU2(x * 2u + 0u, y * 2u + 1u), AU2(x * 2u + 1u, y * 2u + 1u));
			
 
				+		SpdStore(AU2(workGroupID.xy * 8u) + AU2(x, y), v, mip, slice);
			
 
				+		// store to LDS, try to reduce bank conflicts
			
 
				+		// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
			
 
				+		// ...
			
 
				+		// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
			
 
				+		SpdStoreIntermediate(x * 2u + y % 2u, y * 2u, v); // odd y shifts the column by one, as drawn above
			
 
				+	}
			
 
				+#	else
			
 
				+	AF4 v = SpdLoadIntermediate(x, y); // every invocation loads its own value, then reduces through SpdReduceQuad
			
 
				+	v = SpdReduceQuad(v);
			
 
				+	// quad index 0 stores result
			
 
				+	if(localInvocationIndex % 4u == 0u)
			
 
				+	{
			
 
				+		SpdStore(AU2(workGroupID.xy * 8u) + AU2(x / 2u, y / 2u), v, mip, slice);
			
 
				+		SpdStoreIntermediate(x + (y / 2u) % 2u, y, v); // same staggered column shift as the barrier path
			
 
				+	}
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Next reduction step: reads the staggered values written by SpdDownsampleMip_2, stores level `mip`, republishes.
			
 
				+#	ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 16u) // only the first 16 invocations take part in this path
			
 
				+	{
			
 
				+		// x 0 x 0
			
 
				+		// 0 0 0 0
			
 
				+		// 0 x 0 x
			
 
				+		// 0 0 0 0
			
 
				+		AF4 v = SpdReduceIntermediate(AU2(x * 4u + 0u + 0u, y * 4u + 0u), AU2(x * 4u + 2u + 0u, y * 4u + 0u),
			
 
				+									  AU2(x * 4u + 0u + 1u, y * 4u + 2u), AU2(x * 4u + 2u + 1u, y * 4u + 2u));
			
 
				+		SpdStore(AU2(workGroupID.xy * 4u) + AU2(x, y), v, mip, slice);
			
 
				+		// store to LDS
			
 
				+		// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
			
 
				+		// ...
			
 
				+		// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
			
 
				+		// ...
			
 
				+		// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
			
 
				+		// ...
			
 
				+		SpdStoreIntermediate(x * 4u + y, y * 4u, v); // column is shifted by y, as drawn above
			
 
				+	}
			
 
				+#	else
			
 
				+	if(localInvocationIndex < 64u)
			
 
				+	{
			
 
				+		AF4 v = SpdLoadIntermediate(x * 2u + y % 2u, y * 2u); // same index pattern the barrier path of SpdDownsampleMip_2 writes
			
 
				+		v = SpdReduceQuad(v);
			
 
				+		// quad index 0 stores result
			
 
				+		if(localInvocationIndex % 4u == 0u)
			
 
				+		{
			
 
				+			SpdStore(AU2(workGroupID.xy * 4u) + AU2(x / 2u, y / 2u), v, mip, slice);
			
 
				+			SpdStoreIntermediate(x * 2u + y / 2u, y * 2u, v);
			
 
				+		}
			
 
				+	}
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Next reduction step: stores level `mip` and packs the results into row 0 of intermediate storage.
			
 
				+#	ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 4u) // only the first 4 invocations take part in this path
			
 
				+	{
			
 
				+		// x 0 0 0 x 0 0 0
			
 
				+		// ...
			
 
				+		// 0 x 0 0 0 x 0 0
			
 
				+		AF4 v = SpdReduceIntermediate(
			
 
				+			AU2(x * 8u + 0u + 0u + y * 2u, y * 8u + 0u), AU2(x * 8u + 4u + 0u + y * 2u, y * 8u + 0u),
			
 
				+			AU2(x * 8u + 0u + 1u + y * 2u, y * 8u + 4u), AU2(x * 8u + 4u + 1u + y * 2u, y * 8u + 4u));
			
 
				+		SpdStore(AU2(workGroupID.xy * 2u) + AU2(x, y), v, mip, slice);
			
 
				+		// store to LDS
			
 
				+		// x x x x 0 ...
			
 
				+		// 0 ...
			
 
				+		SpdStoreIntermediate(x + y * 2u, 0u, v); // column index is x + 2*y, row is always 0
			
 
				+	}
			
 
				+#	else
			
 
				+	if(localInvocationIndex < 16u)
			
 
				+	{
			
 
				+		AF4 v = SpdLoadIntermediate(x * 4u + y, y * 4u); // same index pattern the barrier path of SpdDownsampleMip_3 writes
			
 
				+		v = SpdReduceQuad(v);
			
 
				+		// quad index 0 stores result
			
 
				+		if(localInvocationIndex % 4u == 0u)
			
 
				+		{
			
 
				+			SpdStore(AU2(workGroupID.xy * 2u) + AU2(x / 2u, y / 2u), v, mip, slice);
			
 
				+			SpdStoreIntermediate(x / 2u + y, 0u, v);
			
 
				+		}
			
 
				+	}
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_5(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Last step of a four-level group: reduces the four values in row 0 of intermediate storage to one texel at level `mip`.
			
 
				+#	ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 1u) // a single invocation does the whole reduction in this path
			
 
				+	{
			
 
				+		// x x x x 0 ...
			
 
				+		// 0 ...
			
 
				+		AF4 v = SpdReduceIntermediate(AU2(0, 0), AU2(1, 0), AU2(2, 0), AU2(3, 0));
			
 
				+		SpdStore(AU2(workGroupID.xy), v, mip, slice);
			
 
				+	}
			
 
				+#	else
			
 
				+	if(localInvocationIndex < 4u)
			
 
				+	{
			
 
				+		AF4 v = SpdLoadIntermediate(localInvocationIndex, 0u); // invocations 0..3 each load one of the four row-0 values
			
 
				+		v = SpdReduceQuad(v);
			
 
				+		// quad index 0 stores result
			
 
				+		if(localInvocationIndex % 4u == 0u)
			
 
				+		{
			
 
				+			SpdStore(AU2(workGroupID.xy), v, mip, slice); // nothing is republished; this is the final level of the group
			
 
				+		}
			
 
				+	}
			
 
				+#	endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips, AU1 slice)
			
 
				+{ // Produces four mip-6 values per invocation via SpdReduceLoad4 and, if requested, one mip-7 value from them.
			
 
				+	AU2 tex = AU2(x * 4u + 0u, y * 4u + 0u);
			
 
				+	AU2 pix = AU2(x * 2u + 0u, y * 2u + 0u);
			
 
				+	AF4 v0 = SpdReduceLoad4(tex, slice);
			
 
				+	SpdStore(pix, v0, 6u, slice);
			
 
				+
			
 
				+	tex = AU2(x * 4u + 2u, y * 4u + 0u);
			
 
				+	pix = AU2(x * 2u + 1u, y * 2u + 0u);
			
 
				+	AF4 v1 = SpdReduceLoad4(tex, slice);
			
 
				+	SpdStore(pix, v1, 6u, slice);
			
 
				+
			
 
				+	tex = AU2(x * 4u + 0u, y * 4u + 2u);
			
 
				+	pix = AU2(x * 2u + 0u, y * 2u + 1u);
			
 
				+	AF4 v2 = SpdReduceLoad4(tex, slice);
			
 
				+	SpdStore(pix, v2, 6u, slice);
			
 
				+
			
 
				+	tex = AU2(x * 4u + 2u, y * 4u + 2u);
			
 
				+	pix = AU2(x * 2u + 1u, y * 2u + 1u);
			
 
				+	AF4 v3 = SpdReduceLoad4(tex, slice);
			
 
				+	SpdStore(pix, v3, 6u, slice);
			
 
				+	// Mip 7 is only written when `mips` is greater than 7.
			
 
				+	if(mips <= 7u)
			
 
				+		return;
			
 
				+	// no barrier needed, working on values only from the same thread
			
 
				+
			
 
				+	AF4 v = SpdReduce4(v0, v1, v2, v3);
			
 
				+	SpdStore(AU2(x, y), v, 7u, slice);
			
 
				+	SpdStoreIntermediate(x, y, v); // published for the SpdDownsampleNextFour call that follows in SpdDownsample
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
			
 
				+{ // Runs up to four reduction steps for levels baseMip..baseMip+3; returns as soon as `mips` does not exceed the next level.
			
 
				+	if(mips <= baseMip)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier(); // each step is preceded by a barrier call before it reads intermediate storage
			
 
				+	SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);
			
 
				+
			
 
				+	if(mips <= baseMip + 1u)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier();
			
 
				+	SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1u, slice);
			
 
				+
			
 
				+	if(mips <= baseMip + 2u)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier();
			
 
				+	SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2u, slice);
			
 
				+
			
 
				+	if(mips <= baseMip + 3u)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier();
			
 
				+	SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3u, slice);
			
 
				+}
			
 
				+
			
 
				+void SpdDownsample(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice)
			
 
				+{ // Entry point: derives (x, y) from the invocation index and runs the reduction chain for up to `mips` levels.
			
 
				+	AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64u);
			
 
				+	AU1 x = sub_xy.x + 8u * ((localInvocationIndex >> 6u) % 2u); // bit 6 of the index adds 8 to x
			
 
				+	AU1 y = sub_xy.y + 8u * ((localInvocationIndex >> 7u)); // bits 7 and up add multiples of 8 to y
			
 
				+	SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
			
 
				+	// Up to four further levels, starting at mip 2.
			
 
				+	SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2u, mips, slice);
			
 
				+	// Mips 6 and above are only produced when more than 6 were requested.
			
 
				+	if(mips <= 6u)
			
 
				+		return;
			
 
				+	// Invocations for which SpdExitWorkgroup() reports true stop here.
			
 
				+	if(SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
			
 
				+		return;
			
 
				+
			
 
				+	SpdResetAtomicCounter(slice);
			
 
				+
			
 
				+	// After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
			
 
				+	SpdDownsampleMips_6_7(x, y, mips, slice);
			
 
				+	// Remaining levels start at mip 8; the workgroup ID is fixed to (0, 0).
			
 
				+	SpdDownsampleNextFour(x, y, AU2(0, 0), localInvocationIndex, 8u, mips, slice);
			
 
				+}
			
 
				+
			
 
				+void SpdDownsample(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice,
			
 
				+				   AU2 workGroupOffset)
			
 
				+{ // Overload that adds `workGroupOffset` to the workgroup ID and forwards to the five-argument SpdDownsample.
			
 
				+	SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
			
 
				+}
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+//==============================================================================================================================
			
 
				+//                                                       PACKED VERSION
			
 
				+//==============================================================================================================================
			
 
				+
			
 
				+#	ifdef A_HALF
			
 
				+
			
 
				+#		ifdef A_GLSL
			
 
				+#			extension GL_EXT_shader_subgroup_extended_types_float16 : require
			
 
				+#		endif
			
 
				+
			
 
				+AH4 SpdReduceQuadH(AH4 v)
			
 
				+{ // Half-precision quad reduction: combines this lane's value with three other lanes' values through SpdReduce4H.
			
 
				+#		if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
			
 
				+	AH4 v0 = v;
			
 
				+	AH4 v1 = subgroupQuadSwapHorizontal(v);
			
 
				+	AH4 v2 = subgroupQuadSwapVertical(v);
			
 
				+	AH4 v3 = subgroupQuadSwapDiagonal(v);
			
 
				+	return SpdReduce4H(v0, v1, v2, v3);
			
 
				+#		elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
			
 
				+	// requires SM6.0
			
 
				+	AU1 quad = WaveGetLaneIndex() & (~0x3); // lane index with the two low bits cleared
			
 
				+	AH4 v0 = v;
			
 
				+	AH4 v1 = WaveReadLaneAt(v, quad | 1);
			
 
				+	AH4 v2 = WaveReadLaneAt(v, quad | 2);
			
 
				+	AH4 v3 = WaveReadLaneAt(v, quad | 3);
			
 
				+	return SpdReduce4H(v0, v1, v2, v3);
			
 
				+/*
			
 
				+// if SM6.0 is not available, you can use the AMD shader intrinsics
			
 
				+// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
			
 
				+// https://gpuopen.com/amd-gpu-services-ags-library/
			
 
				+// works for DX11
			
 
				+AH4 v0 = v;
			
 
				+AH4 v1;
			
 
				+v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
			
 
				+AH4 v2;
			
 
				+v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
			
 
				+AH4 v3;
			
 
				+v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
			
 
				+return SpdReduce4H(v0, v1, v2, v3);
			
 
				+*/
			
 
				+#		endif
			
 
				+	return AH4(0.0, 0.0, 0.0, 0.0); // reached only when neither wave branch above is compiled in
			
 
				+}
			
 
				+
			
 
				+AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3)
			
 
				+{ // Loads four values from intermediate storage at the given coordinates and reduces them with SpdReduce4H.
			
 
				+	AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
			
 
				+	AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
			
 
				+	AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
			
 
				+	AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
			
 
				+	return SpdReduce4H(v0, v1, v2, v3);
			
 
				+}
			
 
				+
			
 
				+AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
			
 
				+{ // Loads four values through SpdLoadH at the given coordinates of `slice` and reduces them with SpdReduce4H.
			
 
				+	AH4 v0 = SpdLoadH(ASU2(i0), slice);
			
 
				+	AH4 v1 = SpdLoadH(ASU2(i1), slice);
			
 
				+	AH4 v2 = SpdLoadH(ASU2(i2), slice);
			
 
				+	AH4 v3 = SpdLoadH(ASU2(i3), slice);
			
 
				+	return SpdReduce4H(v0, v1, v2, v3);
			
 
				+}
			
 
				+
			
 
				+AH4 SpdReduceLoad4H(AU2 base, AU1 slice)
			
 
				+{ // Reduces the 2x2 block whose corner is `base`: offsets (0,0), (0,1), (1,0) and (1,1).
			
 
				+	return SpdReduceLoad4H(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)), AU2(base + AU2(1, 1)),
			
 
				+						   slice);
			
 
				+}
			
 
				+
			
 
				+AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice)
			
 
				+{ // Loads four values through SpdLoadSourceImageH at the given coordinates of `slice` and reduces them with SpdReduce4H.
			
 
				+	AH4 v0 = SpdLoadSourceImageH(ASU2(i0), slice);
			
 
				+	AH4 v1 = SpdLoadSourceImageH(ASU2(i1), slice);
			
 
				+	AH4 v2 = SpdLoadSourceImageH(ASU2(i2), slice);
			
 
				+	AH4 v3 = SpdLoadSourceImageH(ASU2(i3), slice);
			
 
				+	return SpdReduce4H(v0, v1, v2, v3);
			
 
				+}
			
 
				+
			
 
				+AH4 SpdReduceLoadSourceImageH(AU2 base, AU1 slice)
			
 
				+{ // Returns one reduced source value for the 2x2 block at `base`; the method depends on SPD_LINEAR_SAMPLER.
			
 
				+#		ifdef SPD_LINEAR_SAMPLER
			
 
				+	return SpdLoadSourceImageH(ASU2(base), slice); // single load; presumably the sampler performs the reduction - confirm in SpdLoadSourceImageH
			
 
				+#		else
			
 
				+	return SpdReduceLoadSourceImage4H(AU2(base + AU2(0, 0)), AU2(base + AU2(0, 1)), AU2(base + AU2(1, 0)),
			
 
				+									  AU2(base + AU2(1, 1)), slice);
			
 
				+#		endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
			
 
				+{ // Half-precision, quad-reduction path for mips 0 and 1: uses SpdReduceQuadH instead of barriers.
			
 
				+	AH4 v[4];
			
 
				+	// Each invocation produces four mip-0 values; source offsets are 32 apart, destination offsets 16 apart.
			
 
				+	ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
			
 
				+	ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
			
 
				+	v[0] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[0], 0, slice);
			
 
				+
			
 
				+	tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
			
 
				+	pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
			
 
				+	v[1] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[1], 0, slice);
			
 
				+
			
 
				+	tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
			
 
				+	pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
			
 
				+	v[2] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[2], 0, slice);
			
 
				+
			
 
				+	tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
			
 
				+	pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
			
 
				+	v[3] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[3], 0, slice);
			
 
				+	// Mip 1 is only written when `mips` is greater than 1.
			
 
				+	if(mips <= 1)
			
 
				+		return;
			
 
				+	// Reduce each of the four values across the quad.
			
 
				+	v[0] = SpdReduceQuadH(v[0]);
			
 
				+	v[1] = SpdReduceQuadH(v[1]);
			
 
				+	v[2] = SpdReduceQuadH(v[2]);
			
 
				+	v[3] = SpdReduceQuadH(v[3]);
			
 
				+	// Only every fourth invocation stores the mip-1 results and publishes them to intermediate storage.
			
 
				+	if((localInvocationIndex % 4) == 0)
			
 
				+	{
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2, y / 2), v[0], 1, slice);
			
 
				+		SpdStoreIntermediateH(x / 2, y / 2, v[0]);
			
 
				+
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2 + 8, y / 2), v[1], 1, slice);
			
 
				+		SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);
			
 
				+
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2, y / 2 + 8), v[2], 1, slice);
			
 
				+		SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);
			
 
				+
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
			
 
				+		SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
			
 
				+{ // Half-precision, barrier-based path for mips 0 and 1: values are exchanged via intermediate storage.
			
 
				+	AH4 v[4];
			
 
				+	// Each invocation produces four mip-0 values; source offsets are 32 apart, destination offsets 16 apart.
			
 
				+	ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2);
			
 
				+	ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y);
			
 
				+	v[0] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[0], 0, slice);
			
 
				+
			
 
				+	tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2);
			
 
				+	pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y);
			
 
				+	v[1] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[1], 0, slice);
			
 
				+
			
 
				+	tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32);
			
 
				+	pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16);
			
 
				+	v[2] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[2], 0, slice);
			
 
				+
			
 
				+	tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32);
			
 
				+	pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16);
			
 
				+	v[3] = SpdReduceLoadSourceImageH(tex, slice);
			
 
				+	SpdStoreH(pix, v[3], 0, slice);
			
 
				+	// Mip 1 is only written when `mips` is greater than 1.
			
 
				+	if(mips <= 1)
			
 
				+		return;
			
 
				+	// One value per iteration: publish v[i], sync, let the first 64 invocations reduce 2x2 and store mip 1, sync again.
			
 
				+	for(int i = 0; i < 4; i++)
			
 
				+	{
			
 
				+		SpdStoreIntermediateH(x, y, v[i]);
			
 
				+		SpdWorkgroupShuffleBarrier();
			
 
				+		if(localInvocationIndex < 64u)
			
 
				+		{
			
 
				+			v[i] = SpdReduceIntermediateH(AU2(x * 2 + 0, y * 2 + 0), AU2(x * 2 + 1, y * 2 + 0),
			
 
				+										  AU2(x * 2 + 0, y * 2 + 1), AU2(x * 2 + 1, y * 2 + 1));
			
 
				+			SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
			
 
				+		}
			
 
				+		SpdWorkgroupShuffleBarrier();
			
 
				+	}
			
 
				+	// The first 64 invocations republish their four reduced values at offsets of 8 in x and y.
			
 
				+	if(localInvocationIndex < 64u)
			
 
				+	{
			
 
				+		SpdStoreIntermediateH(x + 0, y + 0, v[0]);
			
 
				+		SpdStoreIntermediateH(x + 8, y + 0, v[1]);
			
 
				+		SpdStoreIntermediateH(x + 0, y + 8, v[2]);
			
 
				+		SpdStoreIntermediateH(x + 8, y + 8, v[3]);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice)
			
 
				+{ // Compile-time dispatch for mips 0 and 1 (half precision): SPD_NO_WAVE_OPERATIONS selects _LDSH, otherwise _IntrinsicsH.
			
 
				+#		ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
			
 
				+#		else
			
 
				+	SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
			
 
				+#		endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Half-precision step: reduces four intermediate values, stores level `mip`, republishes to intermediate storage.
			
 
				+#		ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 64u) // only the first 64 invocations take part in this path
			
 
				+	{
			
 
				+		AH4 v = SpdReduceIntermediateH(AU2(x * 2 + 0, y * 2 + 0), AU2(x * 2 + 1, y * 2 + 0), AU2(x * 2 + 0, y * 2 + 1),
			
 
				+									   AU2(x * 2 + 1, y * 2 + 1));
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice);
			
 
				+		// store to LDS, try to reduce bank conflicts
			
 
				+		// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
			
 
				+		// ...
			
 
				+		// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
			
 
				+		SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v); // odd y shifts the column by one, as drawn above
			
 
				+	}
			
 
				+#		else
			
 
				+	AH4 v = SpdLoadIntermediateH(x, y); // every invocation loads its own value, then reduces across its quad
			
 
				+	v = SpdReduceQuadH(v);
			
 
				+	// quad index 0 stores result
			
 
				+	if(localInvocationIndex % 4 == 0)
			
 
				+	{
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x / 2, y / 2), v, mip, slice);
			
 
				+		SpdStoreIntermediateH(x + (y / 2) % 2, y, v); // same staggered column shift as the barrier path
			
 
				+	}
			
 
				+#		endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Half-precision step: reads the staggered values written by SpdDownsampleMip_2H, stores level `mip`, republishes.
			
 
				+#		ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 16) // only the first 16 invocations take part in this path
			
 
				+	{
			
 
				+		// x 0 x 0
			
 
				+		// 0 0 0 0
			
 
				+		// 0 x 0 x
			
 
				+		// 0 0 0 0
			
 
				+		AH4 v = SpdReduceIntermediateH(AU2(x * 4 + 0 + 0, y * 4 + 0), AU2(x * 4 + 2 + 0, y * 4 + 0),
			
 
				+									   AU2(x * 4 + 0 + 1, y * 4 + 2), AU2(x * 4 + 2 + 1, y * 4 + 2));
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice);
			
 
				+		// store to LDS
			
 
				+		// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
			
 
				+		// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
			
 
				+		// ...
			
 
				+		// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
			
 
				+		// ...
			
 
				+		// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
			
 
				+		// ...
			
 
				+		SpdStoreIntermediateH(x * 4 + y, y * 4, v); // column is shifted by y, as drawn above
			
 
				+	}
			
 
				+#		else
			
 
				+	if(localInvocationIndex < 64u)
			
 
				+	{
			
 
				+		AH4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2); // same index pattern the barrier path of SpdDownsampleMip_2H writes
			
 
				+		v = SpdReduceQuadH(v);
			
 
				+		// quad index 0 stores result
			
 
				+		if(localInvocationIndex % 4 == 0)
			
 
				+		{
			
 
				+			SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x / 2, y / 2), v, mip, slice);
			
 
				+			SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
			
 
				+		}
			
 
				+	}
			
 
				+#		endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Half-precision step: stores level `mip` and packs the results into row 0 of intermediate storage.
			
 
				+#		ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 4) // only the first 4 invocations take part in this path
			
 
				+	{
			
 
				+		// x 0 0 0 x 0 0 0
			
 
				+		// ...
			
 
				+		// 0 x 0 0 0 x 0 0
			
 
				+		AH4 v = SpdReduceIntermediateH(AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
			
 
				+									   AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
			
 
				+		SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice);
			
 
				+		// store to LDS
			
 
				+		// x x x x 0 ...
			
 
				+		// 0 ...
			
 
				+		SpdStoreIntermediateH(x + y * 2, 0, v); // column index is x + 2*y, row is always 0
			
 
				+	}
			
 
				+#		else
			
 
				+	if(localInvocationIndex < 16)
			
 
				+	{
			
 
				+		AH4 v = SpdLoadIntermediateH(x * 4 + y, y * 4); // same index pattern the barrier path of SpdDownsampleMip_3H writes
			
 
				+		v = SpdReduceQuadH(v);
			
 
				+		// quad index 0 stores result
			
 
				+		if(localInvocationIndex % 4 == 0)
			
 
				+		{
			
 
				+			SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x / 2, y / 2), v, mip, slice);
			
 
				+			SpdStoreIntermediateH(x / 2 + y, 0, v);
			
 
				+		}
			
 
				+	}
			
 
				+#		endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMip_5H(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice)
			
 
				+{ // Half-precision final step of a four-level group: reduces the four row-0 intermediate values to one texel at level `mip`.
			
 
				+#		ifdef SPD_NO_WAVE_OPERATIONS
			
 
				+	if(localInvocationIndex < 1) // a single invocation does the whole reduction in this path
			
 
				+	{
			
 
				+		// x x x x 0 ...
			
 
				+		// 0 ...
			
 
				+		AH4 v = SpdReduceIntermediateH(AU2(0, 0), AU2(1, 0), AU2(2, 0), AU2(3, 0));
			
 
				+		SpdStoreH(ASU2(workGroupID.xy), v, mip, slice);
			
 
				+	}
			
 
				+#		else
			
 
				+	if(localInvocationIndex < 4)
			
 
				+	{
			
 
				+		AH4 v = SpdLoadIntermediateH(localInvocationIndex, 0); // invocations 0..3 each load one of the four row-0 values
			
 
				+		v = SpdReduceQuadH(v);
			
 
				+		// quad index 0 stores result
			
 
				+		if(localInvocationIndex % 4 == 0)
			
 
				+		{
			
 
				+			SpdStoreH(ASU2(workGroupID.xy), v, mip, slice); // nothing is republished; this is the final level of the group
			
 
				+		}
			
 
				+	}
			
 
				+#		endif
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips, AU1 slice)
			
 
				+{ // Half-precision: produces four mip-6 values per invocation via SpdReduceLoad4H and, if requested, one mip-7 value.
			
 
				+	ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0);
			
 
				+	ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0);
			
 
				+	AH4 v0 = SpdReduceLoad4H(tex, slice);
			
 
				+	SpdStoreH(pix, v0, 6, slice);
			
 
				+
			
 
				+	tex = ASU2(x * 4 + 2, y * 4 + 0);
			
 
				+	pix = ASU2(x * 2 + 1, y * 2 + 0);
			
 
				+	AH4 v1 = SpdReduceLoad4H(tex, slice);
			
 
				+	SpdStoreH(pix, v1, 6, slice);
			
 
				+
			
 
				+	tex = ASU2(x * 4 + 0, y * 4 + 2);
			
 
				+	pix = ASU2(x * 2 + 0, y * 2 + 1);
			
 
				+	AH4 v2 = SpdReduceLoad4H(tex, slice);
			
 
				+	SpdStoreH(pix, v2, 6, slice);
			
 
				+
			
 
				+	tex = ASU2(x * 4 + 2, y * 4 + 2);
			
 
				+	pix = ASU2(x * 2 + 1, y * 2 + 1);
			
 
				+	AH4 v3 = SpdReduceLoad4H(tex, slice);
			
 
				+	SpdStoreH(pix, v3, 6, slice);
			
 
				+	// Mip 7 is only written when `mips` is at least 8.
			
 
				+	if(mips < 8)
			
 
				+		return;
			
 
				+	// no barrier needed, working on values only from the same thread
			
 
				+
			
 
				+	AH4 v = SpdReduce4H(v0, v1, v2, v3);
			
 
				+	SpdStoreH(ASU2(x, y), v, 7, slice);
			
 
				+	SpdStoreIntermediateH(x, y, v); // published for the SpdDownsampleNextFourH call that follows in SpdDownsampleH
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice)
			
 
				+{ // Half-precision: runs up to four reduction steps for levels baseMip..baseMip+3, returning once `mips` is reached.
			
 
				+	if(mips <= baseMip)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier(); // each step is preceded by a barrier call before it reads intermediate storage
			
 
				+	SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);
			
 
				+
			
 
				+	if(mips <= baseMip + 1)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier();
			
 
				+	SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
			
 
				+
			
 
				+	if(mips <= baseMip + 2)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier();
			
 
				+	SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
			
 
				+
			
 
				+	if(mips <= baseMip + 3)
			
 
				+		return;
			
 
				+	SpdWorkgroupShuffleBarrier();
			
 
				+	SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleH(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice)
			
 
				+{ // Half-precision entry point: derives (x, y) from the invocation index and runs the reduction chain for up to `mips` levels.
			
 
				+	AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64);
			
 
				+	AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); // bit 6 of the index adds 8 to x
			
 
				+	AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); // bits 7 and up add multiples of 8 to y
			
 
				+
			
 
				+	SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);
			
 
				+	// Up to four further levels, starting at mip 2.
			
 
				+	SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
			
 
				+	// Mips 6 and above are only produced when at least 7 were requested.
			
 
				+	if(mips < 7)
			
 
				+		return;
			
 
				+	// Invocations for which SpdExitWorkgroup() reports true stop here.
			
 
				+	if(SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
			
 
				+		return;
			
 
				+
			
 
				+	SpdResetAtomicCounter(slice);
			
 
				+
			
 
				+	// After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
			
 
				+	SpdDownsampleMips_6_7H(x, y, mips, slice);
			
 
				+	// Remaining levels start at mip 8; the workgroup ID is fixed to (0, 0).
			
 
				+	SpdDownsampleNextFourH(x, y, AU2(0, 0), localInvocationIndex, 8, mips, slice);
			
 
				+}
			
 
				+
			
 
				+void SpdDownsampleH(AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 numWorkGroups, AU1 slice,
			
 
				+					AU2 workGroupOffset)
			
 
				+{ // Overload that adds `workGroupOffset` to the workgroup ID and forwards to the five-argument SpdDownsampleH.
			
 
				+	SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
			
 
				+}
			
 
				+
			
 
				+#	endif // #ifdef A_HALF
			
 
				+#endif // #ifdef A_GPU