11 tháng trước cách đây · a1e768c508
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -2751,7 +2751,7 @@ Error Image::compress_from_channels(CompressMode p_mode, UsedChannels p_channels
 
				 
			
 
				 			case COMPRESS_S3TC: {
			
 
				 				// BC3 is unsupported currently.
			
 
				-				if ((p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) {
			
 
				+				if ((p_channels == USED_CHANNELS_R || p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) {
			
 
				 					Error result = _image_compress_bc_rd_func(this, p_channels);
			
 
				 
			
 
				 					// If the image was compressed successfully, we return here. If not, we fall back to the default compression scheme.
			
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -2900,8 +2900,8 @@
 
				 		</member>
			
 
				 		<member name="rendering/textures/vram_compression/compress_with_gpu" type="bool" setter="" getter="" default="true">
			
 
				 			If [code]true[/code], the texture importer will utilize the GPU for compressing textures, improving the import time of large images.
			
 
				-			[b]Note:[/b] This setting requires either Vulkan or D3D12 available as a rendering backend.
			
 
				-			[b]Note:[/b] Currently this only affects BC1 and BC6H compression, which are used on Desktop and Console for fully opaque and HDR images respectively.
			
 
				+			[b]Note:[/b] This setting only works on devices that support Vulkan, D3D12, or Metal as a rendering backend.
			
 
				+			[b]Note:[/b] Currently this only affects certain compressed formats (BC1, BC4, and BC6), all of which are exclusive to desktop platforms and consoles.
			
 
				 		</member>
			
 
				 		<member name="rendering/textures/vram_compression/import_etc2_astc" type="bool" setter="" getter="" default="false">
			
 
				 			If [code]true[/code], the texture importer will import VRAM-compressed textures using the Ericsson Texture Compression 2 algorithm for lower quality textures and normal maps and Adaptable Scalable Texture Compression algorithm for high quality textures (in 4×4 block size).
			
--- a/modules/betsy/SCsub
+++ b/modules/betsy/SCsub
@@ -7,6 +7,7 @@ Import("env_modules")
 
				 env_betsy = env_modules.Clone()
			
 
				 env_betsy.GLSL_HEADER("bc6h.glsl")
			
 
				 env_betsy.GLSL_HEADER("bc1.glsl")
			
 
				+env_betsy.GLSL_HEADER("bc4.glsl")
			
 
				 env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"])
			
 
				 
			
 
				 # Thirdparty source files
			
--- a/modules/betsy/bc4.glsl
+++ b/modules/betsy/bc4.glsl
@@ -0,0 +1,151 @@
 
				+#[versions]
			
 
				+
			
 
				+unsigned = "";
			
 
				+signed = "#define SNORM";
			
 
				+
			
 
				+#[compute]
			
 
				+#version 450
			
 
				+
			
 
				+#include "CrossPlatformSettings_piece_all.glsl"
			
 
				+#include "UavCrossPlatform_piece_all.glsl"
			
 
				+
			
 
				+#VERSION_DEFINES
			
 
				+
			
 
				+shared float2 g_minMaxValues[4u * 4u * 4u];
			
 
				+shared uint2 g_mask[4u * 4u];
			
 
				+
			
 
				+layout(binding = 0) uniform sampler2D srcTex;
			
 
				+layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
			
 
				+
			
 
				+layout(push_constant, std430) uniform Params { // Push constants for the BC4 pass; same field layout as BC4PushConstant in image_compress_betsy.h.
			
 
				+	uint p_channelIdx; // Source channel to encode: 0 reads .x, 1 reads .y, any other value reads .w (see main()).
			
 
				+	uint p_padding[3]; // Not read by this shader; brings the block to four uints.
			
 
				+}
			
 
				+params;
			
 
				+
			
 
				+layout(local_size_x = 4, //
			
 
				+		local_size_y = 4, //
			
 
				+		local_size_z = 4) in;
			
 
				+
			
 
				+/// Each block is 16 pixels
			
 
				+/// Each thread works on 4 pixels
			
 
				+/// Therefore each block needs 4 threads, generating 8 masks
			
 
				+/// At the end these 8 masks get merged into 2 and results written to output
			
 
				+///
			
 
				+/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
			
 
				+///
			
 
				+/// A: It's a sweet spot.
			
 
				+///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
			
 
				+///  - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
			
 
				+///    overhead, and also more LDS usage which reduces occupancy.
			
 
				+///  - Long threads (e.g. 1 thread per block) miss parallelism opportunities
			
 
				+void main() { // Encodes one 4x4 texel block per (global y, global z) pair into two 32-bit words; the 4 invocations along local x share a block.
			
 
				+	float minVal, maxVal;
			
 
				+	float4 srcPixel;
			
 
				+
			
 
				+	const uint blockThreadId = gl_LocalInvocationID.x; // 0..3: selects the y offset inside the block that this invocation loads.
			
 
				+
			
 
				+	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u; // Block coordinate times 4 = first texel of the block.
			
 
				+
			
 
				+	for (uint i = 0u; i < 4u; ++i) {
			
 
				+		const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i, blockThreadId);
			
 
				+
			
 
				+		const float4 value = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyzw;
			
 
				+		srcPixel[i] = params.p_channelIdx == 0 ? value.x : (params.p_channelIdx == 1 ? value.y : value.w); // Channel 0 -> x, 1 -> y, anything else -> w.
			
 
				+		srcPixel[i] *= 255.0f; // Scale by 255 (presumably normalized [0, 1] samples to byte range; confirm for the SNORM variant).
			
 
				+	}
			
 
				+
			
 
				+	minVal = min3(srcPixel.x, srcPixel.y, srcPixel.z);
			
 
				+	maxVal = max3(srcPixel.x, srcPixel.y, srcPixel.z);
			
 
				+	minVal = min(minVal, srcPixel.w);
			
 
				+	maxVal = max(maxVal, srcPixel.w);
			
 
				+
			
 
				+	const uint minMaxIdxBase = (gl_LocalInvocationID.z << 4u) + (gl_LocalInvocationID.y << 2u); // z * 16 + y * 4: first of the 4 min/max slots owned by this block.
			
 
				+	const uint maskIdxBase = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y; // z * 4 + y: the single mask slot owned by this block.
			
 
				+
			
 
				+	g_minMaxValues[minMaxIdxBase + blockThreadId] = float2(minVal, maxVal);
			
 
				+	g_mask[maskIdxBase] = uint2(0u, 0u); // All 4 invocations of the block store the same zero; cleared before the atomicOr merge below.
			
 
				+
			
 
				+	memoryBarrierShared();
			
 
				+	barrier(); // Shared-memory writes above must be complete before they are read back below.
			
 
				+
			
 
				+	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
			
 
				+	for (uint i = 0u; i < 4u; ++i) {
			
 
				+		minVal = min(g_minMaxValues[minMaxIdxBase + i].x, minVal);
			
 
				+		maxVal = max(g_minMaxValues[minMaxIdxBase + i].y, maxVal);
			
 
				+	}
			
 
				+
			
 
				+	// determine bias and emit color indices
			
 
				+	// given the choice of maxVal/minVal, these indices are optimal:
			
 
				+	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
			
 
				+	float dist = maxVal - minVal;
			
 
				+	float dist4 = dist * 4.0f;
			
 
				+	float dist2 = dist * 2.0f;
			
 
				+	float bias = (dist < 8) ? (dist - 1) : (trunc(dist * 0.5f) + 2);
			
 
				+	bias -= minVal * 7;
			
 
				+
			
 
				+	uint mask0 = 0u, mask1 = 0u; // Low and high 32-bit words of this invocation's contribution to the block.
			
 
				+
			
 
				+	for (uint i = 0u; i < 4u; ++i) {
			
 
				+		float a = srcPixel[i] * 7.0f + bias;
			
 
				+
			
 
				+		int ind = 0;
			
 
				+
			
 
				+		// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
			
 
				+		if (a >= dist4) {
			
 
				+			ind = 4;
			
 
				+			a -= dist4;
			
 
				+		}
			
 
				+
			
 
				+		if (a >= dist2) {
			
 
				+			ind += 2;
			
 
				+			a -= dist2;
			
 
				+		}
			
 
				+
			
 
				+		if (a >= dist)
			
 
				+			ind += 1;
			
 
				+
			
 
				+		// turn linear scale into DXT index (0/1 are extremal pts)
			
 
				+		ind = -ind & 7;
			
 
				+		ind ^= (2 > ind) ? 1 : 0;
			
 
				+
			
 
				+		// write index
			
 
				+		const uint bits = 16u + ((blockThreadId << 2u) + i) * 3u; // Bit offset in the 64-bit block: 16 endpoint bits, then 3 bits per texel.
			
 
				+		if (bits < 32u) {
			
 
				+			mask0 |= uint(ind) << bits;
			
 
				+			if (bits + 3u > 32u) { // The 3-bit index straddles the word boundary; carry its upper bits into mask1.
			
 
				+				mask1 |= uint(ind) >> (32u - bits);
			
 
				+			}
			
 
				+		} else {
			
 
				+			mask1 |= uint(ind) << (bits - 32u);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (mask0 != 0u)
			
 
				+		atomicOr(g_mask[maskIdxBase].x, mask0);
			
 
				+	if (mask1 != 0u)
			
 
				+		atomicOr(g_mask[maskIdxBase].y, mask1);
			
 
				+
			
 
				+	memoryBarrierShared();
			
 
				+	barrier(); // Wait until every invocation has merged its bits into g_mask.
			
 
				+
			
 
				+	if (blockThreadId == 0u) { // A single invocation per block performs the store.
			
 
				+		// Save data
			
 
				+		uint2 outputBytes;
			
 
				+
			
 
				+#ifdef SNORM
			
 
				+		outputBytes.x =
			
 
				+				packSnorm4x8(float4(maxVal * (1.0f / 255.0f) * 2.0f - 1.0f,
			
 
				+						minVal * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f));
			
 
				+#else
			
 
				+		outputBytes.x = packUnorm4x8(
			
 
				+				float4(maxVal * (1.0f / 255.0f), minVal * (1.0f / 255.0f), 0.0f, 0.0f));
			
 
				+#endif
			
 
				+
			
 
				+		outputBytes.x |= g_mask[maskIdxBase].x; // Endpoints sit in the low 16 bits (the two zero components leave the rest clear); indices start at bit 16.
			
 
				+		outputBytes.y = g_mask[maskIdxBase].y;
			
 
				+
			
 
				+		uint2 dstUV = gl_GlobalInvocationID.yz; // One destination texel per source block.
			
 
				+		imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
			
 
				+	}
			
 
				+}
			
--- a/modules/betsy/image_compress_betsy.cpp
+++ b/modules/betsy/image_compress_betsy.cpp
@@ -35,6 +35,7 @@
 
				 #include "betsy_bc1.h"
			
 
				 
			
 
				 #include "bc1.glsl.gen.h"
			
 
				+#include "bc4.glsl.gen.h"
			
 
				 #include "bc6h.glsl.gen.h"
			
 
				 
			
 
				 static Mutex betsy_mutex;
			
@@ -165,6 +166,10 @@ static String get_shader_name(BetsyFormat p_format) {
 
				 		case BETSY_FORMAT_BC3:
			
 
				 			return "BC3";
			
 
				 
			
 
				+		case BETSY_FORMAT_BC4_SIGNED:
			
 
				+		case BETSY_FORMAT_BC4_UNSIGNED:
			
 
				+			return "BC4";
			
 
				+
			
 
				 		case BETSY_FORMAT_BC6_SIGNED:
			
 
				 		case BETSY_FORMAT_BC6_UNSIGNED:
			
 
				 			return "BC6";
			
@@ -202,6 +207,12 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
 
				 			dest_format = Image::FORMAT_DXT1;
			
 
				 			break;
			
 
				 
			
 
				+		case BETSY_FORMAT_BC4_UNSIGNED:
			
 
				+			version = "unsigned";
			
 
				+			dst_rd_format = RD::DATA_FORMAT_R32G32_UINT;
			
 
				+			dest_format = Image::FORMAT_RGTC_R;
			
 
				+			break;
			
 
				+
			
 
				 		case BETSY_FORMAT_BC6_SIGNED:
			
 
				 			version = "signed";
			
 
				 			dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT;
			
@@ -235,8 +246,13 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
 
				 				err = source->parse_versions_from_text(bc1_shader_glsl);
			
 
				 				break;
			
 
				 
			
 
				-			case BETSY_FORMAT_BC6_UNSIGNED:
			
 
				+			case BETSY_FORMAT_BC4_SIGNED:
			
 
				+			case BETSY_FORMAT_BC4_UNSIGNED:
			
 
				+				err = source->parse_versions_from_text(bc4_shader_glsl);
			
 
				+				break;
			
 
				+
			
 
				 			case BETSY_FORMAT_BC6_SIGNED:
			
 
				+			case BETSY_FORMAT_BC6_UNSIGNED:
			
 
				 				err = source->parse_versions_from_text(bc6h_shader_glsl);
			
 
				 				break;
			
 
				 
			
@@ -430,26 +446,45 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
 
				 		compress_rd->compute_list_bind_compute_pipeline(compute_list, shader.pipeline);
			
 
				 		compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
			
 
				 
			
 
				-		if (dest_format == Image::FORMAT_BPTC_RGBFU || dest_format == Image::FORMAT_BPTC_RGBF) {
			
 
				-			BC6PushConstant push_constant;
			
 
				-			push_constant.sizeX = 1.0f / width;
			
 
				-			push_constant.sizeY = 1.0f / height;
			
 
				-			push_constant.padding[0] = 0;
			
 
				-			push_constant.padding[1] = 0;
			
 
				-
			
 
				-			compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
			
 
				-
			
 
				-		} else {
			
 
				-			BC1PushConstant push_constant;
			
 
				-			push_constant.num_refines = 2;
			
 
				-			push_constant.padding[0] = 0;
			
 
				-			push_constant.padding[1] = 0;
			
 
				-			push_constant.padding[2] = 0;
			
 
				-
			
 
				-			compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
			
 
				+		switch (dest_format) {
			
 
				+			case Image::FORMAT_BPTC_RGBFU:
			
 
				+			case Image::FORMAT_BPTC_RGBF: {
			
 
				+				BC6PushConstant push_constant;
			
 
				+				push_constant.sizeX = 1.0f / width;
			
 
				+				push_constant.sizeY = 1.0f / height;
			
 
				+				push_constant.padding[0] = 0;
			
 
				+				push_constant.padding[1] = 0;
			
 
				+
			
 
				+				compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
			
 
				+				compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
			
 
				+			} break;
			
 
				+
			
 
				+			case Image::FORMAT_DXT1: {
			
 
				+				BC1PushConstant push_constant;
			
 
				+				push_constant.num_refines = 2;
			
 
				+				push_constant.padding[0] = 0;
			
 
				+				push_constant.padding[1] = 0;
			
 
				+				push_constant.padding[2] = 0;
			
 
				+
			
 
				+				compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
			
 
				+				compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
			
 
				+			} break;
			
 
				+
			
 
				+			case Image::FORMAT_RGTC_R: {
			
 
				+				BC4PushConstant push_constant;
			
 
				+				push_constant.channel_idx = 0;
			
 
				+				push_constant.padding[0] = 0;
			
 
				+				push_constant.padding[1] = 0;
			
 
				+				push_constant.padding[2] = 0;
			
 
				+
			
 
				+				compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant));
			
 
				+				compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16);
			
 
				+			} break;
			
 
				+
			
 
				+			default: {
			
 
				+			} break;
			
 
				 		}
			
 
				 
			
 
				-		compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
			
 
				 		compress_rd->compute_list_end();
			
 
				 
			
 
				 		compress_rd->submit();
			
@@ -511,13 +546,14 @@ Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) {
 
				 
			
 
				 	switch (p_channels) {
			
 
				 		case Image::USED_CHANNELS_RGB:
			
 
				-			result = betsy->compress(BETSY_FORMAT_BC1_DITHER, r_img);
			
 
				-			break;
			
 
				-
			
 
				 		case Image::USED_CHANNELS_L:
			
 
				 			result = betsy->compress(BETSY_FORMAT_BC1, r_img);
			
 
				 			break;
			
 
				 
			
 
				+		case Image::USED_CHANNELS_R:
			
 
				+			result = betsy->compress(BETSY_FORMAT_BC4_UNSIGNED, r_img);
			
 
				+			break;
			
 
				+
			
 
				 		default:
			
 
				 			break;
			
 
				 	}
			
--- a/modules/betsy/image_compress_betsy.h
+++ b/modules/betsy/image_compress_betsy.h
@@ -50,6 +50,8 @@ enum BetsyFormat {
 
				 	BETSY_FORMAT_BC1,
			
 
				 	BETSY_FORMAT_BC1_DITHER,
			
 
				 	BETSY_FORMAT_BC3,
			
 
				+	BETSY_FORMAT_BC4_SIGNED,
			
 
				+	BETSY_FORMAT_BC4_UNSIGNED,
			
 
				 	BETSY_FORMAT_BC6_SIGNED,
			
 
				 	BETSY_FORMAT_BC6_UNSIGNED,
			
 
				 };
			
@@ -65,6 +67,11 @@ struct BC1PushConstant {
 
				 	uint32_t padding[3];
			
 
				 };
			
 
				 
			
 
				+struct BC4PushConstant { // Push-constant payload for the BC4 shader; field order matches the Params block in bc4.glsl.
			
 
				+	uint32_t channel_idx; // Source channel selector consumed by the shader as p_channelIdx.
			
 
				+	uint32_t padding[3]; // Filler so the struct spans four 32-bit words, like the other push-constant structs here.
			
 
				+};
			
 
				+
			
 
				 void free_device();
			
 
				 
			
 
				 Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels);
			
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -78,7 +78,7 @@ fix build with our own copy of zstd (patch in `patches`).
 
				 
			
 
				 Files extracted from upstream source:
			
 
				 
			
 
				-- `bc6h.glsl`, `bc1.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`.
			
 
				+- `bc6h.glsl`, `bc1.glsl`, `bc4.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`.
			
 
				 - `LICENSE.md`