1 year ago · 98a40e8533
--- a/3rdparty/astc-encoder/include/astcenc.h
+++ b/3rdparty/astc-encoder/include/astcenc.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2020-2023 Arm Limited
			
 
				+// Copyright 2020-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -215,6 +215,8 @@ enum astcenc_error {
 
				 	ASTCENC_ERR_BAD_CONTEXT,
			
 
				 	/** @brief The call failed due to unimplemented functionality. */
			
 
				 	ASTCENC_ERR_NOT_IMPLEMENTED,
			
 
				+	/** @brief The call failed due to an out-of-spec decode mode flag set. */
			
 
				+	ASTCENC_ERR_BAD_DECODE_MODE,
			
 
				 #if defined(ASTCENC_DIAGNOSTICS)
			
 
				 	/** @brief The call failed due to an issue with diagnostic tracing. */
			
 
				 	ASTCENC_ERR_DTRACE_FAILURE,
			
@@ -302,6 +304,11 @@ enum astcenc_type
 
				 	ASTCENC_TYPE_F32 = 2
			
 
				 };
			
 
				 
			
 
				+/**
			
 
				+ * @brief Function pointer type for compression progress reporting callback.
			
 
				+ */
			
 
				+extern "C" typedef void (*astcenc_progress_callback)(float);
			
 
				+
			
 
				 /**
			
 
				  * @brief Enable normal map compression.
			
 
				  *
			
@@ -312,6 +319,19 @@ enum astcenc_type
 
				  */
			
 
				 static const unsigned int ASTCENC_FLG_MAP_NORMAL          = 1 << 0;
			
 
				 
			
 
				+/**
			
 
				+ * @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
			
 
				+ *
			
 
				+ * The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
			
 
				+ * flag during compression will allow the compressor to use the correct rounding when selecting
			
 
				+ * encodings. This will improve the compressed image quality if your application is using the
			
 
				+ * decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
			
 
				+ *
			
 
				+ * Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
			
 
				+ * this setting.
			
 
				+ */
			
 
				+static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8        = 1 << 1;
			
 
				+
			
 
				 /**
			
 
				  * @brief Enable alpha weighting.
			
 
				  *
			
@@ -378,6 +398,7 @@ static const unsigned int ASTCENC_ALL_FLAGS =
 
				                               ASTCENC_FLG_MAP_RGBM |
			
 
				                               ASTCENC_FLG_USE_ALPHA_WEIGHT |
			
 
				                               ASTCENC_FLG_USE_PERCEPTUAL |
			
 
				+                              ASTCENC_FLG_USE_DECODE_UNORM8 |
			
 
				                               ASTCENC_FLG_DECOMPRESS_ONLY |
			
 
				                               ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
			
 
				 
			
@@ -550,6 +571,16 @@ struct astcenc_config
 
				 	 */
			
 
				 	float tune_search_mode0_enable;
			
 
				 
			
 
				+	/**
			
 
				+	 * @brief The progress callback, can be @c nullptr.
			
 
				+	 *
			
 
				+	 * If this is specified the codec will periodically report progress for
			
 
				+	 * compression as a percentage between 0 and 100. The callback is called from one
			
 
				+	 * of the compressor threads, so doing significant work in the callback will
			
 
				+	 * reduce compression performance.
			
 
				+	 */
			
 
				+	astcenc_progress_callback progress_callback;
			
 
				+
			
 
				 #if defined(ASTCENC_DIAGNOSTICS)
			
 
				 	/**
			
 
				 	 * @brief The path to save the diagnostic trace data to.
			
--- a/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_color_unquantize.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2021 Arm Limited
			
 
				+// Copyright 2011-2023 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -894,32 +894,55 @@ void unpack_color_endpoints(
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	vint4 ldr_scale(257);
			
 
				-	vint4 hdr_scale(1);
			
 
				-	vint4 output_scale = ldr_scale;
			
 
				+	// Handle endpoint errors and expansion
			
 
				 
			
 
				-	// An LDR profile image
			
 
				-	if ((decode_mode == ASTCENC_PRF_LDR) ||
			
 
				-	    (decode_mode == ASTCENC_PRF_LDR_SRGB))
			
 
				+	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
			
 
				+	if (decode_mode == ASTCENC_PRF_LDR)
			
 
				 	{
			
 
				-		// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
			
 
				-		if (rgb_hdr == true)
			
 
				+		// Error color - HDR endpoint in an LDR encoding
			
 
				+		if (rgb_hdr || alpha_hdr)
			
 
				 		{
			
 
				-			output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
			
 
				-			output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
			
 
				-			output_scale = hdr_scale;
			
 
				+			output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
			
 
				+			output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
			
 
				+			rgb_hdr = false;
			
 
				+			alpha_hdr = false;
			
 
				+		}
			
 
				 
			
 
				+		output0 = output0 * 257;
			
 
				+		output1 = output1 * 257;
			
 
				+	}
			
 
				+	// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
			
 
				+	//  - RGB = shift left by 8 bits and OR with 0x80
			
 
				+	//  - A = replication
			
 
				+	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
			
 
				+	{
			
 
				+		// Error color - HDR endpoint in an LDR encoding
			
 
				+		if (rgb_hdr || alpha_hdr)
			
 
				+		{
			
 
				+			output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
			
 
				+			output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
			
 
				 			rgb_hdr = false;
			
 
				 			alpha_hdr = false;
			
 
				 		}
			
 
				+
			
 
				+		vmask4 mask(true, true, true, false);
			
 
				+
			
 
				+		vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
			
 
				+		vint4 output0a = output0 * 257;
			
 
				+		output0 = select(output0a, output0rgb, mask);
			
 
				+
			
 
				+		vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
			
 
				+		vint4 output1a = output1 * 257;
			
 
				+		output1 = select(output1a, output1rgb, mask);
			
 
				 	}
			
 
				-	// An HDR profile image
			
 
				+	// An HDR profile decode, but may be using linear LDR endpoints
			
 
				+	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
			
 
				+	// HDR endpoints are already 16-bit
			
 
				 	else
			
 
				 	{
			
 
				 		vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
			
 
				-		output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
			
 
				+		vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
			
 
				+		output0 = output0 * output_scale;
			
 
				+		output1 = output1 * output_scale;
			
 
				 	}
			
 
				-
			
 
				-	output0 = output0 * output_scale;
			
 
				-	output1 = output1 * output_scale;
			
 
				 }
			
--- a/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_compress_symbolic.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2023 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -247,7 +247,7 @@ static bool realign_weights_decimated(
 
				 		}
			
 
				 
			
 
				 		// Create an unquantized weight grid for this decimation level
			
 
				-		alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
			
 
				+		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
			
 
				 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
			
 
				 		{
			
 
				 			vint unquant_value(dec_weights_uquant + we_idx);
			
@@ -467,7 +467,7 @@ static float compress_symbolic_block_for_partition_1plane(
 
				 
			
 
				 		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
			
 
				 
			
 
				-		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
			
 
				+		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 		// Generate the optimized set of weights for the weight mode
			
 
				 		compute_quantized_weights_for_decimation(
			
@@ -830,7 +830,7 @@ static float compress_symbolic_block_for_partition_2planes(
 
				 		unsigned int decimation_mode = bm.decimation_mode;
			
 
				 		const auto& di = bsd.get_decimation_info(decimation_mode);
			
 
				 
			
 
				-		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
			
 
				+		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 		// Generate the optimized set of weights for the mode
			
 
				 		compute_quantized_weights_for_decimation(
			
--- a/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_decompress_symbolic.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2023 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -27,15 +27,15 @@
 
				 /**
			
 
				  * @brief Compute the integer linear interpolation of two color endpoints.
			
 
				  *
			
 
				- * @param decode_mode   The ASTC profile (linear or sRGB)
			
 
				+ * @param u8_mask       The mask for lanes using decode_unorm8 rather than decode_f16.
			
 
				  * @param color0        The endpoint0 color.
			
 
				  * @param color1        The endpoint1 color.
			
 
				- * @param weights        The interpolation weight (between 0 and 64).
			
 
				+ * @param weights       The interpolation weight (between 0 and 64).
			
 
				  *
			
 
				  * @return The interpolated color.
			
 
				  */
			
 
				 static vint4 lerp_color_int(
			
 
				-	astcenc_profile decode_mode,
			
 
				+	vmask4 u8_mask,
			
 
				 	vint4 color0,
			
 
				 	vint4 color1,
			
 
				 	vint4 weights
			
@@ -43,24 +43,18 @@ static vint4 lerp_color_int(
 
				 	vint4 weight1 = weights;
			
 
				 	vint4 weight0 = vint4(64) - weight1;
			
 
				 
			
 
				-	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
			
 
				-	{
			
 
				-		color0 = asr<8>(color0);
			
 
				-		color1 = asr<8>(color1);
			
 
				-	}
			
 
				-
			
 
				 	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
			
 
				 	color = asr<6>(color);
			
 
				 
			
 
				-	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
			
 
				-	{
			
 
				-		color = color * vint4(257);
			
 
				-	}
			
 
				+	// For decode_unorm8 values force the codec to bit replicate. This allows the
			
 
				+	// rest of the codec to assume the full 0xFFFF range for everything and ignore
			
 
				+	// the decode_mode setting
			
 
				+	vint4 color_u8 = asr<8>(color) * vint4(257);
			
 
				+	color = select(color, color_u8, u8_mask);
			
 
				 
			
 
				 	return color;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 /**
			
 
				  * @brief Convert integer color value into a float value for the decoder.
			
 
				  *
			
@@ -229,12 +223,13 @@ void decompress_symbolic_block(
 
				 		{
			
 
				 			vint4 colori(scb.constant_color);
			
 
				 
			
 
				-			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
			
 
				-			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
			
 
				-			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
			
 
				-			{
			
 
				-				colori = asr<8>(colori) * 257;
			
 
				-			}
			
 
				+			// Determine the UNORM8 rounding on the decode
			
 
				+			vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
			
 
				+
			
 
				+			// The real decoder would just use the top 8 bits, but we rescale
			
 
				+			// into a 16-bit value that rounds correctly.
			
 
				+			vint4 colori_u8 = asr<8>(colori) * 257;
			
 
				+			colori = select(colori, colori_u8, u8_mask);
			
 
				 
			
 
				 			vint4 colorf16 = unorm16_to_sf16(colori);
			
 
				 			color = float16_to_float(colorf16);
			
@@ -289,6 +284,8 @@ void decompress_symbolic_block(
 
				 	int plane2_component = scb.plane2_component;
			
 
				 	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
			
 
				 
			
 
				+	vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
			
 
				+
			
 
				 	for (int i = 0; i < partition_count; i++)
			
 
				 	{
			
 
				 		// Decode the color endpoints for this partition
			
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
 
				 		{
			
 
				 			int tix = pi.texels_of_partition[i][j];
			
 
				 			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
			
 
				-			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
			
 
				+			vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
			
 
				 			vfloat4 colorf = decode_texel(color, lns_mask);
			
 
				 
			
 
				 			blk.data_r[tix] = colorf.lane<0>();
			
@@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
 
				 	                       rgb_lns, a_lns,
			
 
				 	                       ep0, ep1);
			
 
				 
			
 
				+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
			
 
				+
			
 
				 	// Unpack and compute error for each texel in the partition
			
 
				 	unsigned int texel_count = bsd.texel_count;
			
 
				 	for (unsigned int i = 0; i < texel_count; i++)
			
 
				 	{
			
 
				 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
			
 
				-		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
			
 
				+		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
			
 
				 
			
 
				 		vfloat4 color = int_to_float(colori);
			
 
				 		vfloat4 oldColor = blk.texel(i);
			
@@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
 
				 	int plane1_weights[BLOCK_MAX_TEXELS];
			
 
				 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
			
 
				 
			
 
				+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
			
 
				+
			
 
				 	vfloat4 summa = vfloat4::zero();
			
 
				 	for (unsigned int i = 0; i < partition_count; i++)
			
 
				 	{
			
@@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
 
				 		for (unsigned int j = 0; j < texel_count; j++)
			
 
				 		{
			
 
				 			unsigned int tix = pi.texels_of_partition[i][j];
			
 
				-			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
			
 
				+			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
			
 
				 			                              vint4(plane1_weights[tix]));
			
 
				 
			
 
				 			vfloat4 color = int_to_float(colori);
			
@@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
 
				 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
			
 
				 
			
 
				 	// Unquantize and undecimate the weights
			
 
				-	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
			
 
				 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
			
 
				 
			
 
				 	// Decode the color endpoints for this partition
			
@@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
 
				 	                       rgb_lns, a_lns,
			
 
				 	                       ep0, ep1);
			
 
				 
			
 
				-
			
 
				-	// Pre-shift sRGB so things round correctly
			
 
				-	if (config.profile == ASTCENC_PRF_LDR_SRGB)
			
 
				-	{
			
 
				-		ep0 = asr<8>(ep0);
			
 
				-		ep1 = asr<8>(ep1);
			
 
				-	}
			
 
				+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
			
 
				 
			
 
				 	// Unpack and compute error for each texel in the partition
			
 
				 	vfloatacc summav = vfloatacc::zero();
			
 
				 
			
 
				 	vint lane_id = vint::lane_id();
			
 
				-	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
			
 
				 
			
 
				 	unsigned int texel_count = bsd.texel_count;
			
 
				 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
			
@@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
 
				 		vint ep0_b = vint(ep0.lane<2>()) * weight0;
			
 
				 		vint ep0_a = vint(ep0.lane<3>()) * weight0;
			
 
				 
			
 
				-		// Shift so things round correctly
			
 
				-		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
			
 
				-		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
			
 
				-		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
			
 
				-		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
			
 
				+		// Combine contributions
			
 
				+		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
			
 
				+		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
			
 
				+		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
			
 
				+		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
			
 
				+
			
 
				+		// If using a U8 decode mode bit replicate top 8 bits
			
 
				+		// so rest of codec can assume 0xFFFF max range everywhere
			
 
				+		vint colori_r8 = asr<8>(colori_r) * vint(257);
			
 
				+		colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
			
 
				+
			
 
				+		vint colori_g8 = asr<8>(colori_g) * vint(257);
			
 
				+		colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
			
 
				+
			
 
				+		vint colori_b8 = asr<8>(colori_b) * vint(257);
			
 
				+		colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
			
 
				+
			
 
				+		vint colori_a8 = asr<8>(colori_a) * vint(257);
			
 
				+		colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
			
 
				 
			
 
				 		// Compute color diff
			
 
				 		vfloat color_r = int_to_float(colori_r);
			
--- a/3rdparty/astc-encoder/source/astcenc_entry.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_entry.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2023 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -217,11 +217,13 @@ static astcenc_error validate_block_size(
 
				 /**
			
 
				  * @brief Validate flags.
			
 
				  *
			
 
				- * @param flags   The flags to check.
			
 
				+ * @param profile   The profile to check.
			
 
				+ * @param flags     The flags to check.
			
 
				  *
			
 
				  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
			
 
				  */
			
 
				 static astcenc_error validate_flags(
			
 
				+	astcenc_profile profile,
			
 
				 	unsigned int flags
			
 
				 ) {
			
 
				 	// Flags field must not contain any unknown flag bits
			
@@ -239,6 +241,14 @@ static astcenc_error validate_flags(
 
				 		return ASTCENC_ERR_BAD_FLAGS;
			
 
				 	}
			
 
				 
			
 
				+	// Decode_unorm8 must only be used with an LDR profile
			
 
				+	bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
			
 
				+	bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
			
 
				+	if (is_unorm8 && is_hdr)
			
 
				+	{
			
 
				+		return ASTCENC_ERR_BAD_DECODE_MODE;
			
 
				+	}
			
 
				+
			
 
				 	return ASTCENC_SUCCESS;
			
 
				 }
			
 
				 
			
@@ -364,7 +374,7 @@ static astcenc_error validate_config(
 
				 		return status;
			
 
				 	}
			
 
				 
			
 
				-	status = validate_flags(config.flags);
			
 
				+	status = validate_flags(config.profile, config.flags);
			
 
				 	if (status != ASTCENC_SUCCESS)
			
 
				 	{
			
 
				 		return status;
			
@@ -591,7 +601,7 @@ astcenc_error astcenc_config_init(
 
				 	}
			
 
				 
			
 
				 	// Flags field must not contain any unknown flag bits
			
 
				-	status = validate_flags(flags);
			
 
				+	status = validate_flags(profile, flags);
			
 
				 	if (status != ASTCENC_SUCCESS)
			
 
				 	{
			
 
				 		return status;
			
@@ -689,6 +699,12 @@ astcenc_error astcenc_context_alloc(
 
				 	}
			
 
				 
			
 
				 	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
			
 
				+	if (!ctx->bsd)
			
 
				+	{
			
 
				+		delete ctxo;
			
 
				+		return ASTCENC_ERR_OUT_OF_MEM;
			
 
				+	}
			
 
				+
			
 
				 	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
			
 
				 	init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
			
 
				 	                           can_omit_modes,
			
@@ -698,7 +714,7 @@ astcenc_error astcenc_context_alloc(
 
				 
			
 
				 #if !defined(ASTCENC_DECOMPRESS_ONLY)
			
 
				 	// Do setup only needed by compression
			
 
				-	if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
			
 
				+	if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
			
 
				 	{
			
 
				 		// Turn a dB limit into a per-texel error for faster use later
			
 
				 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
			
@@ -712,7 +728,7 @@ astcenc_error astcenc_context_alloc(
 
				 
			
 
				 		size_t worksize = sizeof(compression_working_buffers) * thread_count;
			
 
				 		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
			
 
				-		static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
			
 
				+		static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
			
 
				 		              "compression_working_buffers size must be multiple of vector alignment");
			
 
				 		if (!ctx->working_buffers)
			
 
				 		{
			
@@ -802,6 +818,8 @@ static void compress_image(
 
				 	int row_blocks = xblocks;
			
 
				 	int plane_blocks = xblocks * yblocks;
			
 
				 
			
 
				+	blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
			
 
				+
			
 
				 	// Populate the block channel weights
			
 
				 	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
			
 
				 	                             ctx.config.cw_g_weight,
			
@@ -812,7 +830,7 @@ static void compress_image(
 
				 	auto& temp_buffers = ctx.working_buffers[thread_index];
			
 
				 
			
 
				 	// Only the first thread actually runs the initializer
			
 
				-	ctxo.manage_compress.init(block_count);
			
 
				+	ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
			
 
				 
			
 
				 	// Determine if we can use an optimized load function
			
 
				 	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
			
@@ -1137,6 +1155,7 @@ astcenc_error astcenc_decompress_image(
 
				 	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
			
 
				 	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
			
 
				 	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
			
 
				+	unsigned int block_count = zblocks * yblocks * xblocks;
			
 
				 
			
 
				 	int row_blocks = xblocks;
			
 
				 	int plane_blocks = xblocks * yblocks;
			
@@ -1148,9 +1167,12 @@ astcenc_error astcenc_decompress_image(
 
				 		return ASTCENC_ERR_OUT_OF_MEM;
			
 
				 	}
			
 
				 
			
 
				-	image_block blk;
			
 
				+	image_block blk {};
			
 
				 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
			
 
				 
			
 
				+	// Decode mode inferred from the output data type
			
 
				+	blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
			
 
				+
			
 
				 	// If context thread count is one then implicitly reset
			
 
				 	if (ctx->thread_count == 1)
			
 
				 	{
			
@@ -1158,7 +1180,7 @@ astcenc_error astcenc_decompress_image(
 
				 	}
			
 
				 
			
 
				 	// Only the first thread actually runs the initializer
			
 
				-	ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
			
 
				+	ctxo->manage_decompress.init(block_count, nullptr);
			
 
				 
			
 
				 	// All threads run this processing loop until there is no work remaining
			
 
				 	while (true)
			
@@ -1356,6 +1378,8 @@ const char* astcenc_get_error_string(
 
				 		return "ASTCENC_ERR_BAD_CONTEXT";
			
 
				 	case ASTCENC_ERR_NOT_IMPLEMENTED:
			
 
				 		return "ASTCENC_ERR_NOT_IMPLEMENTED";
			
 
				+	case ASTCENC_ERR_BAD_DECODE_MODE:
			
 
				+		return "ASTCENC_ERR_BAD_DECODE_MODE";
			
 
				 #if defined(ASTCENC_DIAGNOSTICS)
			
 
				 	case ASTCENC_ERR_DTRACE_FAILURE:
			
 
				 		return "ASTCENC_ERR_DTRACE_FAILURE";
			
--- a/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_ideal_endpoints_and_weights.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2023 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -873,7 +873,7 @@ void compute_ideal_weights_for_decimation(
 
				 	}
			
 
				 
			
 
				 	// Otherwise compute an estimate and perform single refinement iteration
			
 
				-	alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	// Compute an initial average for each decimated weight
			
 
				 	bool constant_wes = ei.is_constant_weight_error_scale;
			
@@ -1171,7 +1171,7 @@ void recompute_ideal_colors_1plane(
 
				 	promise(total_texel_count > 0);
			
 
				 	promise(partition_count > 0);
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
			
 
				 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
			
 
				 	{
			
 
				 		vint unquant_value(dec_weights_uquant + i);
			
@@ -1179,7 +1179,7 @@ void recompute_ideal_colors_1plane(
 
				 		storea(unquant_valuef, dec_weight + i);
			
 
				 	}
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
			
 
				 	float* undec_weight_ref;
			
 
				 	if (di.max_texel_weight_count == 1)
			
 
				 	{
			
@@ -1394,8 +1394,8 @@ void recompute_ideal_colors_2planes(
 
				 	promise(total_texel_count > 0);
			
 
				 	promise(weight_count > 0);
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
			
 
				-	alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
			
 
				+	ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
			
 
				+	ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
			
 
				 
			
 
				 	assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
			
 
				 
			
@@ -1410,8 +1410,8 @@ void recompute_ideal_colors_2planes(
 
				 		storea(unquant_value2f, dec_weight_plane2 + i);
			
 
				 	}
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
			
 
				-	alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	float* undec_weight_plane1_ref;
			
 
				 	float* undec_weight_plane2_ref;
			
--- a/3rdparty/astc-encoder/source/astcenc_image.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_image.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2022 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
 
				 	vfloat4 data,
			
 
				 	const astcenc_swizzle& swz
			
 
				 ) {
			
 
				-	alignas(16) float datas[6];
			
 
				+	ASTCENC_ALIGNAS float datas[6];
			
 
				 
			
 
				 	storea(data, datas);
			
 
				 	datas[ASTCENC_SWZ_0] = 0.0f;
			
--- a/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_integer_sequence.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2021 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -464,10 +464,10 @@ static inline void write_bits(
 
				 }
			
 
				 
			
 
				 /**
			
 
				- * @brief Read up to 8 bits at an arbitrary bit offset.
			
 
				+ * @brief Read up to 16 bits from two bytes.
			
 
				  *
			
 
				- * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
			
 
				- * span two separate bytes in memory.
			
 
				+ * This function reads a packed N-bit field from two bytes in memory. The stored value must exist
			
 
				+ * within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory.
			
 
				  *
			
 
				  * @param         bitcount    The number of bits to read.
			
 
				  * @param         bitoffset   The bit offset to read from, between 0 and 7.
			
--- a/3rdparty/astc-encoder/source/astcenc_internal.h
+++ b/3rdparty/astc-encoder/source/astcenc_internal.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2023 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -29,6 +29,7 @@
 
				 	#include <cstdio>
			
 
				 #endif
			
 
				 #include <cstdlib>
			
 
				+#include <limits>
			
 
				 
			
 
				 #include "astcenc.h"
			
 
				 #include "astcenc_mathlib.h"
			
@@ -325,10 +326,10 @@ struct partition_info
 
				 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
			
 
				 
			
 
				 	/** @brief The partition of each texel in the block. */
			
 
				-	uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	/** @brief The list of texels in each partition. */
			
 
				-	uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -366,40 +367,40 @@ struct decimation_info
 
				 	 * @brief The number of weights that contribute to each texel.
			
 
				 	 * Value is between 1 and 4.
			
 
				 	 */
			
 
				-	uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The weight index of the N weights that are interpolated for each texel.
			
 
				 	 * Stored transposed to improve vectorization.
			
 
				 	 */
			
 
				-	uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
			
 
				 	 * Value is between 0 and 16, stored transposed to improve vectorization.
			
 
				 	 */
			
 
				-	uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
			
 
				 	 * Value is between 0 and 1, stored transposed to improve vectorization.
			
 
				 	 */
			
 
				-	alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	/** @brief The number of texels that each stored weight contributes to. */
			
 
				-	uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The list of texels that use a specific weight index.
			
 
				 	 * Stored transposed to improve vectorization.
			
 
				 	 */
			
 
				-	uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The bilinear contribution to the N texels that use each weight.
			
 
				 	 * Value is between 0 and 1, stored transposed to improve vectorization.
			
 
				 	 */
			
 
				-	alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The bilinear contribution to the Nth texel that uses each weight.
			
@@ -579,7 +580,7 @@ struct block_size_descriptor
 
				 	decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
			
 
				 
			
 
				 	/** @brief The active decimation tables, stored in low indices. */
			
 
				-	alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
			
 
				+	ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
			
 
				 
			
 
				 	/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
			
 
				 	uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
			
@@ -731,7 +732,11 @@ struct block_size_descriptor
 
				  *
			
 
				  * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
			
 
				  * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
			
 
				- * data is stored as direct UNORM data, HDR data is stored as LNS data.
			
 
				+ * data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
			
 
				+ * elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
			
 
				+ * 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
			
 
				+ * two elements above the last real data element). The overspill values are never written to memory,
			
 
				+ * and would be benign, but the padding avoids hitting undefined behavior.
			
 
				  *
			
 
				  * The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during
			
 
				  * decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
			
@@ -739,16 +744,16 @@ struct block_size_descriptor
 
				 struct image_block
			
 
				 {
			
 
				 	/** @brief The input (compress) or output (decompress) data for the red color component. */
			
 
				-	alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
			
 
				 
			
 
				 	/** @brief The input (compress) or output (decompress) data for the green color component. */
			
 
				-	alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
			
 
				 
			
 
				 	/** @brief The input (compress) or output (decompress) data for the blue color component. */
			
 
				-	alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
			
 
				 
			
 
				 	/** @brief The input (compress) or output (decompress) data for the alpha color component. */
			
 
				-	alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
			
 
				 
			
 
				 	/** @brief The number of texels in the block. */
			
 
				 	uint8_t texel_count;
			
@@ -771,6 +776,9 @@ struct image_block
 
				 	/** @brief Is this grayscale block where R == G == B for all texels? */
			
 
				 	bool grayscale;
			
 
				 
			
 
				+	/** @brief Is the eventual decode using decode_unorm8 rounding? */
			
 
				+	bool decode_unorm8;
			
 
				+
			
 
				 	/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
			
 
				 	uint8_t rgb_lns[BLOCK_MAX_TEXELS];
			
 
				 
			
@@ -897,10 +905,10 @@ struct endpoints_and_weights
 
				 	endpoints ep;
			
 
				 
			
 
				 	/** @brief The ideal weight for each texel; may be undecimated or decimated. */
			
 
				-	alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];
			
 
				 
			
 
				 	/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
			
 
				-	alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
			
 
				+	ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -930,7 +938,7 @@ struct encoding_choice_errors
 
				 /**
			
 
				  * @brief Preallocated working buffers, allocated per thread during context creation.
			
 
				  */
			
 
				-struct alignas(ASTCENC_VECALIGN) compression_working_buffers
			
 
				+struct ASTCENC_ALIGNAS compression_working_buffers
			
 
				 {
			
 
				 	/** @brief Ideal endpoints and weights for plane 1. */
			
 
				 	endpoints_and_weights ei1;
			
@@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
 
				 	 *
			
 
				 	 * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
			
 
				 	 */
			
 
				-	alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief Decimated quantized weight values in the unquantized 0-64 range.
			
 
				 	 *
			
 
				 	 * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
			
 
				 	 */
			
 
				-	uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	/** @brief Error of the best encoding combination for each block mode. */
			
 
				-	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
			
 
				+	ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
			
 
				 
			
 
				 	/** @brief The best color quant for each block mode. */
			
 
				 	uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
			
@@ -1107,7 +1115,7 @@ struct symbolic_compressed_block
 
				 	 *
			
 
				 	 * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
			
 
				 	 */
			
 
				-	uint8_t weights[BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief Get the weight quantization used by this block mode.
			
@@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates(
 
				   Functionality for managing images and image related data.
			
 
				 ============================================================================ */
			
 
				 
			
 
				+/**
			
 
				+ * @brief Get a vector mask indicating lanes decompressing into a UNORM8 value.
			
 
				+ *
			
 
				+ * @param decode_mode   The color profile for LDR_SRGB settings.
			
 
				+ * @param blk           The image block for output image bitness settings.
			
 
				+ *
			
 
				+ * @return The component mask vector.
			
 
				+ */
			
 
				+static inline vmask4 get_u8_component_mask(
			
 
				+	astcenc_profile decode_mode,
			
 
				+	const image_block& blk
			
 
				+) {
			
 
				+	vmask4 u8_mask(false);
			
 
				+	// Decode mode writing to a unorm8 output value
			
 
				+	if (blk.decode_unorm8)
			
 
				+	{
			
 
				+		u8_mask = vmask4(true);
			
 
				+	}
			
 
				+	// SRGB writing to a unorm8 RGB value
			
 
				+	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
			
 
				+	{
			
 
				+		u8_mask = vmask4(true, true, true, false);
			
 
				+	}
			
 
				+
			
 
				+	return u8_mask;
			
 
				+}
			
 
				+
			
 
				 /**
			
 
				  * @brief Setup computation of regional averages in an image.
			
 
				  *
			
@@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints(
 
				  *
			
 
				  * Endpoints must be unscrambled and converted into the 0-255 range before calling this functions.
			
 
				  *
			
 
				- * @param      decode_mode   The decode mode (LDR, HDR).
			
 
				+ * @param      decode_mode   The decode mode (LDR, HDR, etc).
			
 
				  * @param      format        The color endpoint mode used.
			
 
				  * @param      input         The raw array of encoded input integers. The length of this array
			
 
				  *                           depends on @c format; it can be safely assumed to be large enough.
			
@@ -2142,10 +2177,11 @@ Platform-specific functions.
 
				 /**
			
 
				  * @brief Allocate an aligned memory buffer.
			
 
				  *
			
 
				- * Allocated memory must be freed by aligned_free;
			
 
				+ * Allocated memory must be freed by aligned_free.
			
 
				  *
			
 
				  * @param size    The desired buffer size.
			
 
				- * @param align   The desired buffer alignment; must be 2^N.
			
 
				+ * @param align   The desired buffer alignment; must be 2^N, may be increased
			
 
				+ *                by the implementation to a minimum allowable alignment.
			
 
				  *
			
 
				  * @return The memory buffer pointer or nullptr on allocation failure.
			
 
				  */
			
@@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align)
 
				 	void* ptr;
			
 
				 	int error = 0;
			
 
				 
			
 
				+	// Don't allow this to under-align a type
			
 
				+	size_t min_align = astc::max(alignof(T), sizeof(void*));
			
 
				+	size_t real_align = astc::max(min_align, align);
			
 
				+
			
 
				 #if defined(_WIN32)
			
 
				-	ptr = _aligned_malloc(size, align);
			
 
				+	ptr = _aligned_malloc(size, real_align);
			
 
				 #else
			
 
				-	error = posix_memalign(&ptr, align, size);
			
 
				+	error = posix_memalign(&ptr, real_align, size);
			
 
				 #endif
			
 
				 
			
 
				 	if (error || (!ptr))
			
--- a/3rdparty/astc-encoder/source/astcenc_internal_entry.h
+++ b/3rdparty/astc-encoder/source/astcenc_internal_entry.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2022 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -118,6 +118,18 @@ private:
 
				 	/** @brief Number of tasks that need to be processed. */
			
 
				 	unsigned int m_task_count;
			
 
				 
			
 
				+	/** @brief Progress callback (optional). */
			
 
				+	astcenc_progress_callback m_callback;
			
 
				+
			
 
				+	/** @brief Lock used for callback synchronization. */
			
 
				+	std::mutex m_callback_lock;
			
 
				+
			
 
				+	/** @brief Minimum progress before making a callback. */
			
 
				+	float m_callback_min_diff;
			
 
				+
			
 
				+	/** @brief Last progress callback value. */
			
 
				+	float m_callback_last_value;
			
 
				+
			
 
				 public:
			
 
				 	/** @brief Create a new ParallelManager. */
			
 
				 	ParallelManager()
			
@@ -138,6 +150,9 @@ public:
 
				 		m_start_count = 0;
			
 
				 		m_done_count = 0;
			
 
				 		m_task_count = 0;
			
 
				+		m_callback = nullptr;
			
 
				+		m_callback_last_value = 0.0f;
			
 
				+		m_callback_min_diff = 1.0f;
			
 
				 	}
			
 
				 
			
 
				 	/**
			
@@ -166,14 +181,20 @@ public:
 
				 	 * initialization. Other threads will block and wait for it to complete.
			
 
				 	 *
			
 
				 	 * @param task_count   Total number of tasks needing processing.
			
 
				+	 * @param callback     Function pointer for progress status callbacks.
			
 
				 	 */
			
 
				-	void init(unsigned int task_count)
			
 
				+	void init(unsigned int task_count, astcenc_progress_callback callback)
			
 
				 	{
			
 
				 		std::lock_guard<std::mutex> lck(m_lock);
			
 
				 		if (!m_init_done)
			
 
				 		{
			
 
				+			m_callback = callback;
			
 
				 			m_task_count = task_count;
			
 
				 			m_init_done = true;
			
 
				+
			
 
				+			// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
			
 
				+			float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
			
 
				+			m_callback_min_diff = astc::max(min_diff, 1.0f);
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -212,12 +233,49 @@ public:
 
				 	{
			
 
				 		// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
			
 
				 		// update here and the wait() for other threads
			
 
				-		std::unique_lock<std::mutex> lck(m_lock);
			
 
				-		this->m_done_count += count;
			
 
				-		if (m_done_count == m_task_count)
			
 
				+		unsigned int local_count;
			
 
				+		float local_last_value;
			
 
				 		{
			
 
				-			lck.unlock();
			
 
				-			m_complete.notify_all();
			
 
				+			std::unique_lock<std::mutex> lck(m_lock);
			
 
				+			m_done_count += count;
			
 
				+			local_count = m_done_count;
			
 
				+			local_last_value = m_callback_last_value;
			
 
				+
			
 
				+			if (m_done_count == m_task_count)
			
 
				+			{
			
 
				+				// Ensure the progress bar hits 100%
			
 
				+				if (m_callback)
			
 
				+				{
			
 
				+					std::unique_lock<std::mutex> cblck(m_callback_lock);
			
 
				+					m_callback(100.0f);
			
 
				+					m_callback_last_value = 100.0f;
			
 
				+				}
			
 
				+
			
 
				+				lck.unlock();
			
 
				+				m_complete.notify_all();
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		// Process progress callback if we have one
			
 
				+		if (m_callback)
			
 
				+		{
			
 
				+			// Initial lockless test - have we progressed enough to emit?
			
 
				+			float num = static_cast<float>(local_count);
			
 
				+			float den = static_cast<float>(m_task_count);
			
 
				+			float this_value =  (num / den) * 100.0f;
			
 
				+			bool report_test = (this_value - local_last_value) > m_callback_min_diff;
			
 
				+
			
 
				+			// Recheck under lock, because another thread might report first
			
 
				+			if (report_test)
			
 
				+			{
			
 
				+				std::unique_lock<std::mutex> cblck(m_callback_lock);
			
 
				+				bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
			
 
				+				if (report_retest)
			
 
				+				{
			
 
				+					m_callback(this_value);
			
 
				+					m_callback_last_value = this_value;
			
 
				+				}
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/3rdparty/astc-encoder/source/astcenc_mathlib.h
+++ b/3rdparty/astc-encoder/source/astcenc_mathlib.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2021 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -73,10 +73,22 @@
 
				   #endif
			
 
				 #endif
			
 
				 
			
 
				+// Force vector-sized SIMD alignment
			
 
				 #if ASTCENC_AVX
			
 
				   #define ASTCENC_VECALIGN 32
			
 
				-#else
			
 
				+#elif ASTCENC_SSE || ASTCENC_NEON
			
 
				   #define ASTCENC_VECALIGN 16
			
 
				+// Use default alignment for non-SIMD builds
			
 
				+#else
			
 
				+  #define ASTCENC_VECALIGN 0
			
 
				+#endif
			
 
				+
			
 
				+// C++11 states that alignas(0) should be ignored but GCC doesn't do
			
 
				+// this on some versions, so work around it and avoid emitting alignas(0)
			
 
				+#if ASTCENC_VECALIGN > 0
			
 
				+	#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
			
 
				+#else
			
 
				+	#define ASTCENC_ALIGNAS
			
 
				 #endif
			
 
				 
			
 
				 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
			
--- a/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_mathlib_softfloat.cpp
@@ -15,13 +15,13 @@
 
				 // under the License.
			
 
				 // ----------------------------------------------------------------------------
			
 
				 
			
 
				-#include "astcenc_mathlib.h"
			
 
				-
			
 
				 /**
			
 
				  * @brief Soft-float library for IEEE-754.
			
 
				  */
			
 
				 #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
			
 
				 
			
 
				+#include "astcenc_mathlib.h"
			
 
				+
			
 
				 /*	sized soft-float types. These are mapped to the sized integer
			
 
				     types of C99, instead of C's floating-point types; this is because
			
 
				     the library needs to maintain exact, bit-level control on all
			
--- a/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_symbolic_physical.cpp
@@ -330,12 +330,14 @@ void physical_to_symbolic(
 
				 				return;
			
 
				 			}
			
 
				 
			
 
				+			// Low values span 3 bytes so need two read_bits calls
			
 
				 			int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
			
 
				-			int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
			
 
				+			int vx_high_s = read_bits(13, 25, pcb);
			
 
				 			int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
			
 
				-			int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
			
 
				+			int vx_high_t = read_bits(13, 51, pcb);
			
 
				 
			
 
				-			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
			
 
				+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
			
 
				+			               vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
			
 
				 
			
 
				 			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
			
 
				 			{
			
@@ -350,12 +352,14 @@ void physical_to_symbolic(
 
				 			int vx_high_s = read_bits(9, 19, pcb);
			
 
				 			int vx_low_t = read_bits(9, 28, pcb);
			
 
				 			int vx_high_t = read_bits(9, 37, pcb);
			
 
				-			int vx_low_p = read_bits(9, 46, pcb);
			
 
				-			int vx_high_p = read_bits(9, 55, pcb);
			
 
				+			int vx_low_r = read_bits(9, 46, pcb);
			
 
				+			int vx_high_r = read_bits(9, 55, pcb);
			
 
				 
			
 
				-			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
			
 
				+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
			
 
				+			               vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
			
 
				+			               vx_low_r == 0x1FF && vx_high_r == 0x1FF;
			
 
				 
			
 
				-			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
			
 
				+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
			
 
				 			{
			
 
				 				scb.block_type = SYM_BTYPE_ERROR;
			
 
				 				return;
			
@@ -470,8 +474,7 @@ void physical_to_symbolic(
 
				 				bitpos += 2;
			
 
				 			}
			
 
				 		}
			
 
				-		scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
			
 
				-		                                            (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
			
 
				+		scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
			
 
				 	}
			
 
				 
			
 
				 	for (int i = 0; i < partition_count; i++)
			
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_avx2_8.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2019-2022 Arm Limited
			
 
				+// Copyright 2019-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -1170,7 +1170,7 @@ ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 ma
 
				  */
			
 
				 ASTCENC_SIMD_INLINE void print(vint8 a)
			
 
				 {
			
 
				-	alignas(ASTCENC_VECALIGN) int v[8];
			
 
				+	alignas(32) int v[8];
			
 
				 	storea(a, v);
			
 
				 	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
			
 
				 	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
			
@@ -1181,7 +1181,7 @@ ASTCENC_SIMD_INLINE void print(vint8 a)
 
				  */
			
 
				 ASTCENC_SIMD_INLINE void printx(vint8 a)
			
 
				 {
			
 
				-	alignas(ASTCENC_VECALIGN) int v[8];
			
 
				+	alignas(32) int v[8];
			
 
				 	storea(a, v);
			
 
				 	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
			
 
				 	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
			
@@ -1192,7 +1192,7 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
 
				  */
			
 
				 ASTCENC_SIMD_INLINE void print(vfloat8 a)
			
 
				 {
			
 
				-	alignas(ASTCENC_VECALIGN) float v[8];
			
 
				+	alignas(32) float v[8];
			
 
				 	storea(a, v);
			
 
				 	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
			
 
				 	       static_cast<double>(v[0]), static_cast<double>(v[1]),
			
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_common_4.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2020-2021 Arm Limited
			
 
				+// Copyright 2020-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -383,7 +383,7 @@ static ASTCENC_SIMD_INLINE void bit_transfer_signed(
 
				  */
			
 
				 ASTCENC_SIMD_INLINE void print(vint4 a)
			
 
				 {
			
 
				-	alignas(16) int v[4];
			
 
				+	ASTCENC_ALIGNAS int v[4];
			
 
				 	storea(a, v);
			
 
				 	printf("v4_i32:\n  %8d %8d %8d %8d\n",
			
 
				 	       v[0], v[1], v[2], v[3]);
			
@@ -394,7 +394,7 @@ ASTCENC_SIMD_INLINE void print(vint4 a)
 
				  */
			
 
				 ASTCENC_SIMD_INLINE void printx(vint4 a)
			
 
				 {
			
 
				-	alignas(16) int v[4];
			
 
				+	ASTCENC_ALIGNAS int v[4];
			
 
				 	storea(a, v);
			
 
				 	printf("v4_i32:\n  %08x %08x %08x %08x\n",
			
 
				 	       v[0], v[1], v[2], v[3]);
			
@@ -405,7 +405,7 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
 
				  */
			
 
				 ASTCENC_SIMD_INLINE void print(vfloat4 a)
			
 
				 {
			
 
				-	alignas(16) float v[4];
			
 
				+	ASTCENC_ALIGNAS float v[4];
			
 
				 	storea(a, v);
			
 
				 	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
			
 
				 	       static_cast<double>(v[0]), static_cast<double>(v[1]),
			
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_neon_4.h
@@ -359,9 +359,9 @@ struct vmask4
 
				 	/**
			
 
				 	 * @brief Get the scalar from a single lane.
			
 
				 	 */
			
 
				-	template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
			
 
				+	template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
			
 
				 	{
			
 
				-		return vgetq_lane_u32(m, l);
			
 
				+		return vgetq_lane_u32(m, l) != 0;
			
 
				 	}
			
 
				 
			
 
				 	/**
			
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_none_4.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2019-2022 Arm Limited
			
 
				+// Copyright 2019-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -351,6 +351,13 @@ struct vmask4
 
				 		m[3] = d == false ? 0 : -1;
			
 
				 	}
			
 
				 
			
 
				+	/**
			
 
				+	 * @brief Get the boolean state of a single lane; true if the lane is set.
			
 
				+	 */
			
 
				+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
			
 
				+	{
			
 
				+		return m[l] != 0;
			
 
				+	}
			
 
				 
			
 
				 	/**
			
 
				 	 * @brief The vector ...
			
@@ -549,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
 
				  */
			
 
				 template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
			
 
				 {
			
 
				-	return vint4(a.m[0] << s,
			
 
				-	             a.m[1] << s,
			
 
				-	             a.m[2] << s,
			
 
				-	             a.m[3] << s);
			
 
				+	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
			
 
				+	unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
			
 
				+	unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
			
 
				+	unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
			
 
				+	unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
			
 
				+
			
 
				+	return vint4(static_cast<int>(as0),
			
 
				+	             static_cast<int>(as1),
			
 
				+	             static_cast<int>(as2),
			
 
				+	             static_cast<int>(as3));
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -560,6 +573,7 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
 
				  */
			
 
				 template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
			
 
				 {
			
 
				+	// Cast to unsigned so the right shift is logical (zero-fill), not sign-extending
			
 
				 	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
			
 
				 	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
			
 
				 	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
			
--- a/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h
+++ b/3rdparty/astc-encoder/source/astcenc_vecmathlib_sse_4.h
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2019-2022 Arm Limited
			
 
				+// Copyright 2019-2023 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -379,9 +379,9 @@ struct vmask4
 
				 	/**
			
 
				 	 * @brief Get the scalar value of a single lane.
			
 
				 	 */
			
 
				-	template <int l> ASTCENC_SIMD_INLINE float lane() const
			
 
				+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
			
 
				 	{
			
 
				-		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
			
 
				+		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
			
 
				 	}
			
 
				 
			
 
				 	/**
			
--- a/3rdparty/astc-encoder/source/astcenc_weight_align.cpp
+++ b/3rdparty/astc-encoder/source/astcenc_weight_align.cpp
@@ -1,6 +1,6 @@
 
				 // SPDX-License-Identifier: Apache-2.0
			
 
				 // ----------------------------------------------------------------------------
			
 
				-// Copyright 2011-2023 Arm Limited
			
 
				+// Copyright 2011-2024 Arm Limited
			
 
				 //
			
 
				 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
			
 
				 // use this file except in compliance with the License. You may obtain a copy
			
@@ -60,8 +60,8 @@ static const uint8_t steps_for_quant_level[12] {
 
				 	2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
			
 
				 };
			
 
				 
			
 
				-alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
			
 
				-alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
			
 
				+ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
			
 
				+ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
			
 
				 
			
 
				 #if defined(ASTCENC_DIAGNOSTICS)
			
 
				 	static bool print_once { true };
			
@@ -99,7 +99,7 @@ static void compute_angular_offsets(
 
				 	promise(weight_count > 0);
			
 
				 	promise(max_angular_steps > 0);
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
			
 
				+	ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
			
 
				 
			
 
				 	// Precompute isample; arrays are always allocated 64 elements long
			
 
				 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
			
@@ -242,16 +242,16 @@ static void compute_angular_endpoints_for_quant_levels(
 
				 	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
			
 
				 	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
			
 
				+	ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
			
 
				 
			
 
				 	compute_angular_offsets(weight_count, dec_weight_ideal_value,
			
 
				 	                        max_angular_steps, angular_offsets);
			
 
				 
			
 
				-	alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS];
			
 
				-	alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
			
 
				-	alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
			
 
				-	alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
			
 
				-	alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
			
 
				+	ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
			
 
				+	ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
			
 
				+	ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
			
 
				+	ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
			
 
				+	ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
			
 
				 
			
 
				 	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
			
 
				 	                                  max_angular_steps, max_quant_steps,