
Update astcenc to the upstream 5.3.0 release

This is mostly a maintenance update that brings the compressor in line
with the recently published Khronos Data Format Specification 1.4
release, which clarified some ambiguities in the specification. This
update also brings minor codec optimizations, bug fixes, and image
quality improvements.

The biggest improvement for Godot is that builds using MSVC cl.exe will
now correctly default to the SSE2-optimized backend rather than the
reference C backend. This makes compression more than 3 times faster.
Builds using other compilers (GCC, LLVM/Clang) were not affected by the
underlying issue, and so see no performance uplift.
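
For context, the fix amounts to the backend auto-detection in astcenc_mathlib.h also accepting the MSVC architecture macros, since cl.exe does not define __SSE2__ on x64. A simplified sketch of the relevant detection (the real header, shown in the diff below, also covers SSE4.1, AVX2, NEON, and SVE):

// Simplified sketch of the SSE level auto-detection in astcenc_mathlib.h.
// GCC and Clang define __SSE2__ for x86-64 targets, but MSVC cl.exe does
// not, so the extra _M_AMD64 check is what stops MSVC builds falling back
// to the reference C path. _M_ARM64EC is excluded because ARM64EC builds
// also define the x64 macros while actually targeting Arm.
#ifndef ASTCENC_SSE
  #if defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
    #define ASTCENC_SSE 20
  #else
    #define ASTCENC_SSE 0
  #endif
#endif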
Peter Harris 4 months ago
parent
commit
75ce42d463

+ 1 - 1
COPYRIGHT.txt

@@ -180,7 +180,7 @@ License: BSD-3-clause
 
 Files: thirdparty/astcenc/*
 Comment: Arm ASTC Encoder
-Copyright: 2011-2024, Arm Limited
+Copyright: 2011-2025, Arm Limited
 License: Apache-2.0
 
 Files: thirdparty/basis_universal/*

+ 1 - 1
thirdparty/README.md

@@ -50,7 +50,7 @@ Files extracted from upstream source:
 ## astcenc
 
 - Upstream: https://github.com/ARM-software/astc-encoder
-- Version: 4.8.0 (0d6c9047c5ad19640e2d60fdb8f11a16675e7938, 2024)
+- Version: 5.3.0 (bf32abd05eccaf3042170b2a85cebdf0bfee5873, 2025)
 - License: Apache 2.0
 
 Files extracted from upstream source:

+ 15 - 1
thirdparty/astcenc/astcenc.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2024 Arm Limited
+// Copyright 2020-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -784,6 +784,20 @@ ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
 ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
 	astcenc_context* context);
 
+/**
+ * @brief Cancel any pending compression operation.
+ *
+ * The caller must behave as if the compression completed normally, even though the data will be
+ * undefined. They are still responsible for synchronizing threads in the worker thread pool, and
+ * must call reset before starting another compression.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
+	astcenc_context* context);
+
 /**
  * @brief Decompress an image.
  *
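
A rough usage sketch for the new entry point follows. The worker-thread wiring is hypothetical; astcenc_compress_cancel and astcenc_compress_reset are the real API calls, and the caller-side responsibilities match the documentation above.

#include <thread>
#include <vector>

#include "astcenc.h"

// Hypothetical helper: abandon an in-flight compression and prepare the
// context for reuse. Assumes the workers are running astcenc_compress_image()
// on the shared context.
void cancel_and_reset(astcenc_context* context, std::vector<std::thread>& workers)
{
	// Stop the codec handing out new work; data already written is undefined
	astcenc_compress_cancel(context);

	// The caller still owns thread synchronization, exactly as if the
	// compression had completed normally
	for (std::thread& worker : workers)
	{
		worker.join();
	}

	// A reset is required before starting another compression
	astcenc_compress_reset(context);
}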

+ 51 - 51
thirdparty/astcenc/astcenc_averages_and_directions.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -50,7 +50,7 @@ static void compute_partition_averages_rgb(
 	vfloat4 averages[BLOCK_MAX_PARTITIONS]
 ) {
 	unsigned int partition_count = pi.partition_count;
-	unsigned int texel_count = blk.texel_count;
+	size_t texel_count = blk.texel_count;
 	promise(texel_count > 0);
 
 	// For 1 partition just use the precomputed mean
@@ -64,11 +64,11 @@ static void compute_partition_averages_rgb(
 		vfloatacc pp_avg_rgb[3] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -100,11 +100,11 @@ static void compute_partition_averages_rgb(
 		vfloatacc pp_avg_rgb[2][3] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -145,11 +145,11 @@ static void compute_partition_averages_rgb(
 		vfloatacc pp_avg_rgb[3][3] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -221,7 +221,7 @@ static void compute_partition_averages_rgba(
 	vfloat4 averages[BLOCK_MAX_PARTITIONS]
 ) {
 	unsigned int partition_count = pi.partition_count;
-	unsigned int texel_count = blk.texel_count;
+	size_t texel_count = blk.texel_count;
 	promise(texel_count > 0);
 
 	// For 1 partition just use the precomputed mean
@@ -235,11 +235,11 @@ static void compute_partition_averages_rgba(
 		vfloat4 pp_avg_rgba[4] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -275,11 +275,11 @@ static void compute_partition_averages_rgba(
 		vfloat4 pp_avg_rgba[2][4] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -326,11 +326,11 @@ static void compute_partition_averages_rgba(
 		vfloat4 pp_avg_rgba[3][4] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -390,17 +390,17 @@ void compute_avgs_and_dirs_4_comp(
 	const image_block& blk,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	// Pre-compute partition_averages
 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
 	compute_partition_averages_rgba(pi, blk, partition_averages);
 
-	for (int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		vfloat4 average = partition_averages[partition];
@@ -411,7 +411,7 @@ void compute_avgs_and_dirs_4_comp(
 		vfloat4 sum_zp = vfloat4::zero();
 		vfloat4 sum_wp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 			vfloat4 texel_datum = blk.texel(iwt);
@@ -509,13 +509,13 @@ void compute_avgs_and_dirs_3_comp(
 		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
 	}
 
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		vfloat4 average = partition_averages[partition];
@@ -525,7 +525,7 @@ void compute_avgs_and_dirs_3_comp(
 		vfloat4 sum_yp = vfloat4::zero();
 		vfloat4 sum_zp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 
@@ -570,17 +570,17 @@ void compute_avgs_and_dirs_3_comp_rgb(
 	const image_block& blk,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	// Pre-compute partition_averages
 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
 	compute_partition_averages_rgb(pi, blk, partition_averages);
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		vfloat4 average = partition_averages[partition];
@@ -590,7 +590,7 @@ void compute_avgs_and_dirs_3_comp_rgb(
 		vfloat4 sum_yp = vfloat4::zero();
 		vfloat4 sum_zp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 
@@ -664,20 +664,20 @@ void compute_avgs_and_dirs_2_comp(
 		data_vg = blk.data_b;
 	}
 
-	unsigned int partition_count = pt.partition_count;
+	size_t partition_count = pt.partition_count;
 	promise(partition_count > 0);
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
-		unsigned int texel_count = pt.partition_texel_count[partition];
+		size_t texel_count = pt.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		// Only compute a partition mean if more than one partition
 		if (partition_count > 1)
 		{
 			average = vfloat4::zero();
-			for (unsigned int i = 0; i < texel_count; i++)
+			for (size_t i = 0; i < texel_count; i++)
 			{
 				unsigned int iwt = texel_indexes[i];
 				average += vfloat2(data_vr[iwt], data_vg[iwt]);
@@ -691,7 +691,7 @@ void compute_avgs_and_dirs_2_comp(
 		vfloat4 sum_xp = vfloat4::zero();
 		vfloat4 sum_yp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
@@ -729,20 +729,20 @@ void compute_error_squared_rgba(
 	float& uncor_error,
 	float& samec_error
 ) {
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	vfloatacc uncor_errorsumv = vfloatacc::zero();
 	vfloatacc samec_errorsumv = vfloatacc::zero();
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
 
 		processed_line4 l_uncor = uncor_plines[partition];
 		processed_line4 l_samec = samec_plines[partition];
 
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		// Vectorize some useful scalar inputs
@@ -775,15 +775,15 @@ void compute_error_squared_rgba(
 		// array to extend the last value. This means min/max are not impacted, but we need to mask
 		// out the dummy values when we compute the line weighting.
 		vint lane_ids = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
-			vmask mask = lane_ids < vint(texel_count);
-			vint texel_idxs(texel_indexes + i);
+			vmask mask = lane_ids < vint_from_size(texel_count);
+			const uint8_t* texel_idxs = texel_indexes + i;
 
-			vfloat data_r = gatherf(blk.data_r, texel_idxs);
-			vfloat data_g = gatherf(blk.data_g, texel_idxs);
-			vfloat data_b = gatherf(blk.data_b, texel_idxs);
-			vfloat data_a = gatherf(blk.data_a, texel_idxs);
+			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
+			vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
 
 			vfloat uncor_param = (data_r * l_uncor_bs0)
 			                   + (data_g * l_uncor_bs1)
@@ -847,17 +847,17 @@ void compute_error_squared_rgb(
 	float& uncor_error,
 	float& samec_error
 ) {
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	vfloatacc uncor_errorsumv = vfloatacc::zero();
 	vfloatacc samec_errorsumv = vfloatacc::zero();
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		partition_lines3& pl = plines[partition];
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		processed_line3 l_uncor = pl.uncor_pline;
@@ -889,14 +889,14 @@ void compute_error_squared_rgb(
 		// to extend the last value. This means min/max are not impacted, but we need to mask
 		// out the dummy values when we compute the line weighting.
 		vint lane_ids = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
-			vmask mask = lane_ids < vint(texel_count);
-			vint texel_idxs(texel_indexes + i);
+			vmask mask = lane_ids < vint_from_size(texel_count);
+			const uint8_t* texel_idxs = texel_indexes + i;
 
-			vfloat data_r = gatherf(blk.data_r, texel_idxs);
-			vfloat data_g = gatherf(blk.data_g, texel_idxs);
-			vfloat data_b = gatherf(blk.data_b, texel_idxs);
+			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
 
 			vfloat uncor_param = (data_r * l_uncor_bs0)
 			                   + (data_g * l_uncor_bs1)

+ 13 - 13
thirdparty/astcenc/astcenc_block_sizes.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -384,12 +384,12 @@ static void init_decimation_info_2d(
 	}
 
 	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
-	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
-	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+	size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+	for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
 	{
 		di.texel_weight_count[i] = 0;
 
-		for (unsigned int j = 0; j < 4; j++)
+		for (size_t j = 0; j < 4; j++)
 		{
 			di.texel_weight_contribs_float_tr[j][i] = 0;
 			di.texel_weights_tr[j][i] = 0;
@@ -402,12 +402,12 @@ static void init_decimation_info_2d(
 	unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
 
-	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
-	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+	size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+	for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
 	{
 		di.weight_texel_count[i] = 0;
 
-		for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
+		for (size_t j = 0; j < max_texel_count_of_weight; j++)
 		{
 			di.weight_texels_tr[j][i] = last_texel;
 			di.weights_texel_contribs_tr[j][i] = 0.0f;
@@ -640,12 +640,12 @@ static void init_decimation_info_3d(
 	}
 
 	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
-	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
-	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+	size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+	for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
 	{
 		di.texel_weight_count[i] = 0;
 
-		for (unsigned int j = 0; j < 4; j++)
+		for (size_t j = 0; j < 4; j++)
 		{
 			di.texel_weight_contribs_float_tr[j][i] = 0;
 			di.texel_weights_tr[j][i] = 0;
@@ -658,12 +658,12 @@ static void init_decimation_info_3d(
 	int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
 
-	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
-	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+	size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+	for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
 	{
 		di.weight_texel_count[i] = 0;
 
-		for (int j = 0; j < max_texel_count_of_weight; j++)
+		for (size_t j = 0; j < max_texel_count_of_weight; j++)
 		{
 			di.weight_texels_tr[j][i] = last_texel;
 			di.weights_texel_contribs_tr[j][i] = 0.0f;

+ 2 - 9
thirdparty/astcenc/astcenc_color_unquantize.cpp

@@ -925,15 +925,8 @@ void unpack_color_endpoints(
 			alpha_hdr = false;
 		}
 
-		vmask4 mask(true, true, true, false);
-
-		vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
-		vint4 output0a = output0 * 257;
-		output0 = select(output0a, output0rgb, mask);
-
-		vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
-		vint4 output1a = output1 * 257;
-		output1 = select(output1a, output1rgb, mask);
+		output0 = lsl<8>(output0) | vint4(0x80);
+		output1 = lsl<8>(output1) | vint4(0x80);
 	}
 	// An HDR profile decode, but may be using linear LDR endpoints
 	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication

+ 2 - 2
thirdparty/astcenc/astcenc_compress_symbolic.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1280,7 +1280,7 @@ void compress_block(
 		1.0f
 	};
 
-	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
+	const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
 
 	// Only enable MODE0 fast path if enabled
 	// Never enable for 3D blocks as no "always" block modes are available

+ 11 - 20
thirdparty/astcenc/astcenc_decompress_symbolic.cpp

@@ -98,19 +98,14 @@ void unpack_weights(
 	if (!is_dual_plane)
 	{
 		// Build full 64-entry weight lookup table
-		vint4 tab0 = vint4::load(scb.weights +  0);
-		vint4 tab1 = vint4::load(scb.weights + 16);
-		vint4 tab2 = vint4::load(scb.weights + 32);
-		vint4 tab3 = vint4::load(scb.weights + 48);
-
-		vint tab0p, tab1p, tab2p, tab3p;
-		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
+		vtable_64x8 table;
+		vtable_prepare(table, scb.weights);
 
 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint summed_value(8);
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
@@ -118,7 +113,7 @@ void unpack_weights(
 				vint texel_weights(di.texel_weights_tr[j] + i);
 				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
 
-				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
+				summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
 			}
 
 			store(lsr<4>(summed_value), weights_plane1 + i);
@@ -128,16 +123,12 @@ void unpack_weights(
 	{
 		// Build a 32-entry weight lookup table per plane
 		// Plane 1
-		vint4 tab0_plane1 = vint4::load(scb.weights +  0);
-		vint4 tab1_plane1 = vint4::load(scb.weights + 16);
-		vint tab0_plane1p, tab1_plane1p;
-		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
+		vtable_32x8 tab_plane1;
+		vtable_prepare(tab_plane1, scb.weights);
 
 		// Plane 2
-		vint4 tab0_plane2 = vint4::load(scb.weights + 32);
-		vint4 tab1_plane2 = vint4::load(scb.weights + 48);
-		vint tab0_plane2p, tab1_plane2p;
-		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
+		vtable_32x8 tab_plane2;
+		vtable_prepare(tab_plane2, scb.weights + 32);
 
 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
@@ -145,7 +136,7 @@ void unpack_weights(
 			vint sum_plane2(8);
 
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
@@ -153,8 +144,8 @@ void unpack_weights(
 				vint texel_weights(di.texel_weights_tr[j] + i);
 				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
 
-				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
-				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
+				sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
+				sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
 			}
 
 			store(lsr<4>(sum_plane1), weights_plane1 + i);

+ 24 - 1
thirdparty/astcenc/astcenc_entry.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1123,6 +1123,29 @@ astcenc_error astcenc_compress_reset(
 #endif
 }
 
+/* See header for documentation. */
+astcenc_error astcenc_compress_cancel(
+	astcenc_context* ctxo
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
+	{
+		return ASTCENC_ERR_BAD_CONTEXT;
+	}
+
+	// Cancel compression before cancelling avg. This avoids the race condition
+	// where cancelling them in the other order could see a compression worker
+	// starting to process even though some of the avg data is undefined.
+	ctxo->manage_compress.cancel();
+	ctxo->manage_avg.cancel();
+	return ASTCENC_SUCCESS;
+#endif
+}
+
 /* See header for documentation. */
 astcenc_error astcenc_decompress_image(
 	astcenc_context* ctxo,

+ 4 - 4
thirdparty/astcenc/astcenc_find_best_partitioning.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -226,7 +226,7 @@ static void kmeans_update(
 
 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
 
-	// Find the center-of-gravity in each cluster
+	// Find the center of gravity in each cluster
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
 		uint8_t partition = partition_of_texel[i];
@@ -425,8 +425,8 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
 	}
 
 	// Create a running sum from the histogram array
-	// Cells store previous values only; i.e. exclude self after sum
-	unsigned int sum = 0;
+	// Indices store previous values only; i.e. exclude self after sum
+	uint16_t sum = 0;
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
 		uint16_t cnt = mscount[i];

+ 40 - 51
thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp

@@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
 	unsigned int index
 ) {
 	// Load the bilinear filter texel weight indexes in the decimated grid
-	vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-	vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
-	vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
-	vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
+	const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+	const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
+	const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
+	const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
 
 	// Load the bilinear filter weights from the decimated grid
-	vfloat weight_val0 = gatherf(weights, weight_idx0);
-	vfloat weight_val1 = gatherf(weights, weight_idx1);
-	vfloat weight_val2 = gatherf(weights, weight_idx2);
-	vfloat weight_val3 = gatherf(weights, weight_idx3);
+	vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+	vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
+	vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
+	vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
 
 	// Load the weight contribution factors for each decimated weight
 	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
 	unsigned int index
 ) {
 	// Load the bilinear filter texel weight indexes in the decimated grid
-	vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-	vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+	const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+	const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
 
 	// Load the bilinear filter weights from the decimated grid
-	vfloat weight_val0 = gatherf(weights, weight_idx0);
-	vfloat weight_val1 = gatherf(weights, weight_idx1);
+	vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+	vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
 
 	// Load the weight contribution factors for each decimated weight
 	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -195,8 +195,8 @@ static void compute_ideal_colors_and_weights_1_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -333,8 +333,8 @@ static void compute_ideal_colors_and_weights_2_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -500,8 +500,8 @@ static void compute_ideal_colors_and_weights_3_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -598,8 +598,8 @@ static void compute_ideal_colors_and_weights_4_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -853,12 +853,6 @@ void compute_ideal_weights_for_decimation(
 	promise(texel_count > 0);
 	promise(weight_count > 0);
 
-	// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
-	// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
-	// arrays always contain space for 64 elements
-	unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
-	storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
-
 	// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
 	// zero-initialized SIMD over-fetch region
 	if (is_direct)
@@ -873,7 +867,6 @@ void compute_ideal_weights_for_decimation(
 	}
 
 	// Otherwise compute an estimate and perform single refinement iteration
-	ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
 
 	// Compute an initial average for each decimated weight
 	bool constant_wes = ei.is_constant_weight_error_scale;
@@ -889,23 +882,23 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
 		{
-			vint texel(di.weight_texels_tr[j] + i);
+			const uint8_t* texel = di.weight_texels_tr[j] + i;
 			vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
 
 			if (!constant_wes)
 			{
-				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+				weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 			}
 
 			vfloat contrib_weight = weight * weight_error_scale;
 
 			weight_weight += contrib_weight;
-			initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+			initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
 		}
 
 		storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
@@ -914,6 +907,7 @@ void compute_ideal_weights_for_decimation(
 	// Populate the interpolated weight grid based on the initial average
 	// Process SIMD-width texel coordinates at at time while we can. Safe to
 	// over-process full SIMD vectors - the tail is zeroed.
+	ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
 	if (di.max_texel_weight_count <= 2)
 	{
 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -947,22 +941,22 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
 		{
-			vint texel(di.weight_texels_tr[j] + i);
+			const uint8_t* texel = di.weight_texels_tr[j] + i;
 			vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
 
 			if (!constant_wes)
 			{
- 				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+				weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 			}
 
 			vfloat scale = weight_error_scale * contrib_weight;
-			vfloat old_weight = gatherf(infilled_weights, texel);
-			vfloat ideal_weight = gatherf(ei.weights, texel);
+			vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
+			vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
 
 			error_change0 += contrib_weight * scale;
 			error_change1 += (old_weight - ideal_weight) * scale;
@@ -1023,9 +1017,8 @@ void compute_quantized_weights_for_decimation(
 	// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
 	if (get_quant_level(quant_level) <= 16)
 	{
-		vint4 tab0 = vint4::load(qat.quant_to_unquant);
-		vint tab0p;
-		vtable_prepare(tab0, tab0p);
+		vtable_16x8 table;
+		vtable_prepare(table, qat.quant_to_unquant);
 
 		for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 		{
@@ -1038,8 +1031,8 @@ void compute_quantized_weights_for_decimation(
 			vint weightl = float_to_int(ix1);
 			vint weighth = min(weightl + vint(1), steps_m1);
 
-			vint ixli = vtable_8bt_32bi(tab0p, weightl);
-			vint ixhi = vtable_8bt_32bi(tab0p, weighth);
+			vint ixli = vtable_lookup_32bit(table, weightl);
+			vint ixhi = vtable_lookup_32bit(table, weighth);
 
 			vfloat ixl = int_to_float(ixli);
 			vfloat ixh = int_to_float(ixhi);
@@ -1050,16 +1043,13 @@ void compute_quantized_weights_for_decimation(
 
 			// Invert the weight-scaling that was done initially
 			storea(ixl * rscalev + low_boundv, weight_set_out + i);
-			vint scn = pack_low_bytes(weight);
-			store_nbytes(scn, quantized_weight_set + i);
+			pack_and_store_low_bytes(weight, quantized_weight_set + i);
 		}
 	}
 	else
 	{
-		vint4 tab0 = vint4::load(qat.quant_to_unquant +  0);
-		vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
-		vint tab0p, tab1p;
-		vtable_prepare(tab0, tab1, tab0p, tab1p);
+		vtable_32x8 table;
+		vtable_prepare(table, qat.quant_to_unquant);
 
 		for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 		{
@@ -1072,8 +1062,8 @@ void compute_quantized_weights_for_decimation(
 			vint weightl = float_to_int(ix1);
 			vint weighth = min(weightl + vint(1), steps_m1);
 
-			vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
-			vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
+			vint ixli = vtable_lookup_32bit(table, weightl);
+			vint ixhi = vtable_lookup_32bit(table, weighth);
 
 			vfloat ixl = int_to_float(ixli);
 			vfloat ixh = int_to_float(ixhi);
@@ -1084,8 +1074,7 @@ void compute_quantized_weights_for_decimation(
 
 			// Invert the weight-scaling that was done initially
 			storea(ixl * rscalev + low_boundv, weight_set_out + i);
-			vint scn = pack_low_bytes(weight);
-			store_nbytes(scn, quantized_weight_set + i);
+			pack_and_store_low_bytes(weight, quantized_weight_set + i);
 		}
 	}
 }

+ 4 - 10
thirdparty/astcenc/astcenc_internal.h

@@ -1583,19 +1583,13 @@ static inline vmask4 get_u8_component_mask(
 	astcenc_profile decode_mode,
 	const image_block& blk
 ) {
-	vmask4 u8_mask(false);
-	// Decode mode writing to a unorm8 output value
-	if (blk.decode_unorm8)
+	// Decode mode or sRGB forces writing to unorm8 output value
+	if (blk.decode_unorm8 || decode_mode == ASTCENC_PRF_LDR_SRGB)
 	{
-		u8_mask = vmask4(true);
-	}
-	// SRGB writing to a unorm8 RGB value
-	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-	{
-		u8_mask = vmask4(true, true, true, false);
+		return vmask4(true);
 	}
 
-	return u8_mask;
+	return vmask4(false);
 }
 
 /**

+ 26 - 11
thirdparty/astcenc/astcenc_internal_entry.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -100,6 +100,9 @@ private:
 	/** @brief Lock used for critical section and condition synchronization. */
 	std::mutex m_lock;
 
+	/** @brief True if the current operation is cancelled. */
+	std::atomic<bool> m_is_cancelled;
+
 	/** @brief True if the stage init() step has been executed. */
 	bool m_init_done;
 
@@ -147,6 +150,7 @@ public:
 	{
 		m_init_done = false;
 		m_term_done = false;
+		m_is_cancelled = false;
 		m_start_count = 0;
 		m_done_count = 0;
 		m_task_count = 0;
@@ -155,6 +159,16 @@ public:
 		m_callback_min_diff = 1.0f;
 	}
 
+	/**
+	 * @brief Clear the tracker and stop new tasks being assigned.
+	 *
+	 * Note, all in-flight tasks in a worker will still complete normally.
+	 */
+	void cancel()
+	{
+		m_is_cancelled = true;
+	}
+
 	/**
 	 * @brief Trigger the pipeline stage init step.
 	 *
@@ -211,7 +225,7 @@ public:
 	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
 	{
 		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
-		if (base >= m_task_count)
+		if (m_is_cancelled || base >= m_task_count)
 		{
 			count = 0;
 			return 0;
@@ -241,16 +255,17 @@ public:
 			local_count = m_done_count;
 			local_last_value = m_callback_last_value;
 
-			if (m_done_count == m_task_count)
+			// Ensure the progress bar hits 100%
+			if (m_callback && m_done_count == m_task_count)
 			{
-				// Ensure the progress bar hits 100%
-				if (m_callback)
-				{
-					std::unique_lock<std::mutex> cblck(m_callback_lock);
-					m_callback(100.0f);
-					m_callback_last_value = 100.0f;
-				}
+				std::unique_lock<std::mutex> cblck(m_callback_lock);
+				m_callback(100.0f);
+				m_callback_last_value = 100.0f;
+			}
 
+			// Notify if nothing left to do
+			if (m_is_cancelled || m_done_count == m_task_count)
+			{
 				lck.unlock();
 				m_complete.notify_all();
 			}
@@ -285,7 +300,7 @@ public:
 	void wait()
 	{
 		std::unique_lock<std::mutex> lck(m_lock);
-		m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
+		m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
 	}
 
 	/**
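
The cancellation support above is a small extension of the existing task tracker: an atomic flag that get_task_assignment() checks before handing out work, and that wait() treats as equivalent to completion. A stripped-down sketch of that pattern (not the real ParallelManager, which also handles init/term phases and progress callbacks):

#include <atomic>
#include <condition_variable>
#include <mutex>

// Stripped-down sketch of a cancel-aware task tracker
class task_tracker
{
public:
	explicit task_tracker(unsigned int task_count)
		: m_task_count(task_count) { }

	// Stop assigning new tasks; in-flight tasks still complete normally,
	// and their completion is what releases any thread blocked in wait()
	void cancel()
	{
		m_is_cancelled = true;
	}

	// Hand out up to granule tasks; returns a zero count once cancelled or drained
	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
	{
		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
		if (m_is_cancelled || base >= m_task_count)
		{
			count = 0;
			return 0;
		}

		count = granule < (m_task_count - base) ? granule : (m_task_count - base);
		return base;
	}

	// Mark tasks as finished, waking waiters when cancelled or all done
	void complete_tasks(unsigned int count)
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_done_count += count;
		if (m_is_cancelled || m_done_count == m_task_count)
		{
			lck.unlock();
			m_complete.notify_all();
		}
	}

	// Block until every task completed or the operation was cancelled
	void wait()
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
	}

private:
	std::atomic<bool> m_is_cancelled { false };
	std::atomic<unsigned int> m_start_count { 0 };
	unsigned int m_done_count { 0 };
	unsigned int m_task_count;
	std::mutex m_lock;
	std::condition_variable m_complete;
};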

+ 22 - 5
thirdparty/astcenc/astcenc_mathlib.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -48,7 +48,7 @@
     #define ASTCENC_SSE 42
   #elif defined(__SSE4_1__)
     #define ASTCENC_SSE 41
-  #elif defined(__SSE2__)
+  #elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
     #define ASTCENC_SSE 20
   #else
     #define ASTCENC_SSE 0
@@ -58,25 +58,42 @@
 #ifndef ASTCENC_AVX
   #if defined(__AVX2__)
     #define ASTCENC_AVX 2
+    #define ASTCENC_X86_GATHERS 1
   #elif defined(__AVX__)
     #define ASTCENC_AVX 1
+    #define ASTCENC_X86_GATHERS 1
   #else
     #define ASTCENC_AVX 0
   #endif
 #endif
 
 #ifndef ASTCENC_NEON
-  #if defined(__aarch64__)
+  #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     #define ASTCENC_NEON 1
   #else
     #define ASTCENC_NEON 0
   #endif
 #endif
 
+#ifndef ASTCENC_SVE
+  #if defined(__ARM_FEATURE_SVE)
+    #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+      #define ASTCENC_SVE 8
+    // Auto-detected SVE can only assume vector width of 4 is available, but
+    // must also allow for hardware being longer and so all use of intrinsics
+    // must explicitly use predicate masks to limit to 4-wide.
+    #else
+      #define ASTCENC_SVE 4
+    #endif
+    #else
+    #define ASTCENC_SVE 0
+  #endif
+#endif
+
 // Force vector-sized SIMD alignment
-#if ASTCENC_AVX
+#if ASTCENC_AVX || ASTCENC_SVE == 8
   #define ASTCENC_VECALIGN 32
-#elif ASTCENC_SSE || ASTCENC_NEON
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
   #define ASTCENC_VECALIGN 16
 // Use default alignment for non-SIMD builds
 #else
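
To confirm which backend a particular build actually selected (for example, to verify the MSVC fix), the detection macros above can be printed from any translation unit that already includes astcenc_mathlib.h. The diagnostic itself is an illustrative throwaway, not part of the codec:

#include <cstdio>

// Assumes this is compiled with the same flags as the codec and that
// astcenc_mathlib.h (which defines these macros) has already been included.
static void print_selected_backend()
{
	std::printf("ASTCENC_SSE      = %d\n", ASTCENC_SSE);
	std::printf("ASTCENC_AVX      = %d\n", ASTCENC_AVX);
	std::printf("ASTCENC_NEON     = %d\n", ASTCENC_NEON);
	std::printf("ASTCENC_SVE      = %d\n", ASTCENC_SVE);
	std::printf("ASTCENC_VECALIGN = %d\n", ASTCENC_VECALIGN);
}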

+ 3 - 3
thirdparty/astcenc/astcenc_partition_tables.cpp

@@ -304,9 +304,9 @@ static bool generate_one_partition_info_entry(
 	// Fill loop tail so we can overfetch later
 	for (unsigned int i = 0; i < partition_count; i++)
 	{
-		int ptex_count = counts[i];
-		int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
-		for (int j = ptex_count; j < ptex_count_simd; j++)
+		size_t ptex_count = counts[i];
+		size_t ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
+		for (size_t j = ptex_count; j < ptex_count_simd; j++)
 		{
 			pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
 		}

+ 16 - 13
thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
 	vint lane_ids = vint::lane_id();
 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		vint tix(texel_indexes + i);
+		const uint8_t* tix = texel_indexes + i;
 
 		vmask mask = lane_ids < vint(texel_count);
 		lane_ids += vint(ASTCENC_SIMD_WIDTH);
 
 		// Compute the error that arises from just ditching alpha
-		vfloat data_a = gatherf(blk.data_a, tix);
+		vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
 		vfloat alpha_diff = data_a - default_a;
 		alpha_diff = alpha_diff * alpha_diff;
 
 		haccumulate(a_drop_errv, alpha_diff, mask);
 
-		vfloat data_r = gatherf(blk.data_r, tix);
-		vfloat data_g = gatherf(blk.data_g, tix);
-		vfloat data_b = gatherf(blk.data_b, tix);
+		vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
+		vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
+		vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);
 
 		// Compute uncorrelated error
 		vfloat param = data_r * uncor_bs0
@@ -1135,13 +1135,13 @@ unsigned int compute_ideal_endpoint_formats(
 	vfloat clear_error(ERROR_CALC_DEFAULT);
 	vint clear_quant(0);
 
-	unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
+	size_t packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
 	storea(clear_error, errors_of_best_combination + packed_start_block_mode);
 	store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
 	store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
 
 	// Ensure that last iteration overstep contains data that will never be picked
-	unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
+	size_t packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
 	storea(clear_error, errors_of_best_combination + packed_end_block_mode);
 	store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
 	store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
@@ -1292,9 +1292,12 @@ unsigned int compute_ideal_endpoint_formats(
 		vint vbest_error_index(-1);
 		vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
 
-		start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
-		vint lane_ids = vint::lane_id() + vint(start_block_mode);
-		for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
+		// TODO: This should use size_t for the inputs of start/end_block_mode
+		// to avoid some of this type conversion, but that propagates and will
+		// need a bigger PR to fix
+		size_t start_mode = round_down_to_simd_multiple_vla(start_block_mode);
+		vint lane_ids = vint::lane_id() + vint_from_size(start_mode);
+		for (size_t j = start_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
 		{
 			vfloat err = vfloat(errors_of_best_combination + j);
 			vmask mask = err < vbest_ep_error;
@@ -1306,8 +1309,8 @@ unsigned int compute_ideal_endpoint_formats(
 		// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
 		vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
 		vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
-		vbest_error_index = hmin(vbest_error_index);
-		int best_error_index = vbest_error_index.lane<0>();
+
+		int best_error_index = hmin_s(vbest_error_index);
 
 		best_error_weights[i] = best_error_index;
 

+ 78 - 40
thirdparty/astcenc/astcenc_vecmathlib.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2025 Arm Limited
 // Copyright 2008 Jose Fonseca
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
@@ -42,11 +42,12 @@
  *
  * With the current implementation ISA support is provided for:
  *
- *     * 1-wide for scalar reference.
- *     * 4-wide for Armv8-A NEON.
- *     * 4-wide for x86-64 SSE2.
- *     * 4-wide for x86-64 SSE4.1.
- *     * 8-wide for x86-64 AVX2.
+ *     * 1-wide for scalar reference
+ *     * 4-wide for Armv8-A NEON
+ *     * 4-wide for x86-64 SSE2
+ *     * 4-wide for x86-64 SSE4.1
+ *     * 8-wide for Armv8-A SVE
+ *     * 8-wide for x86-64 AVX2
  */
 
 #ifndef ASTC_VECMATHLIB_H_INCLUDED
@@ -54,7 +55,14 @@
 
 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 	#include <immintrin.h>
-#elif ASTCENC_NEON != 0
+#endif
+
+#if ASTCENC_SVE != 0
+	#include <arm_sve.h>
+	#include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
 	#include <arm_neon.h>
 #endif
 
@@ -69,8 +77,10 @@
 	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
 #endif
 
+template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
+
 #if ASTCENC_AVX >= 2
-	/* If we have AVX2 expose 8-wide VLA. */
+	// If we have AVX2 expose 8-wide VLA.
 	#include "astcenc_vecmathlib_sse_4.h"
 	#include "astcenc_vecmathlib_common_4.h"
 	#include "astcenc_vecmathlib_avx2_8.h"
@@ -88,11 +98,16 @@
 	using vint = vint8;
 	using vmask = vmask8;
 
+	using vtable_16x8 = vtable8_16x8;
+	using vtable_32x8 = vtable8_32x8;
+	using vtable_64x8 = vtable8_64x8;
+
 	constexpr auto loada = vfloat8::loada;
 	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
 
 #elif ASTCENC_SSE >= 20
-	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
+	// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
 	#include "astcenc_vecmathlib_sse_4.h"
 	#include "astcenc_vecmathlib_common_4.h"
 
@@ -103,11 +118,48 @@
 	using vint = vint4;
 	using vmask = vmask4;
 
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
+
+#elif ASTCENC_SVE == 8
+	// Check the compiler is configured with fixed-length 256-bit SVE.
+	#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
+		#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
+	#endif
+
+	// If we have SVE configured as 8-wide, expose 8-wide VLA.
+	#include "astcenc_vecmathlib_neon_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+	#include "astcenc_vecmathlib_sve_8.h"
+
+	#define ASTCENC_SIMD_WIDTH 8
+
+	using vfloat = vfloat8;
+
+	#if defined(ASTCENC_NO_INVARIANCE)
+		using vfloatacc = vfloat8;
+	#else
+		using vfloatacc = vfloat4;
+	#endif
+
+	using vint = vint8;
+	using vmask = vmask8;
+
+	using vtable_16x8 = vtable8_16x8;
+	using vtable_32x8 = vtable8_32x8;
+	using vtable_64x8 = vtable8_64x8;
+
+	constexpr auto loada = vfloat8::loada;
+	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
 
 #elif ASTCENC_NEON > 0
-	/* If we have NEON expose 4-wide VLA. */
+	// If we have NEON expose 4-wide VLA.
 	#include "astcenc_vecmathlib_neon_4.h"
 	#include "astcenc_vecmathlib_common_4.h"
 
@@ -118,8 +170,13 @@
 	using vint = vint4;
 	using vmask = vmask4;
 
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 
 #else
 	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
@@ -150,34 +207,15 @@
 	using vint = vint4;
 	using vmask = vmask4;
 
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 #endif
 
-/**
- * @brief Round a count down to the largest multiple of 8.
- *
- * @param count   The unrounded value.
- *
- * @return The rounded value.
- */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
-{
-	return count & static_cast<unsigned int>(~(8 - 1));
-}
-
-/**
- * @brief Round a count down to the largest multiple of 4.
- *
- * @param count   The unrounded value.
- *
- * @return The rounded value.
- */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
-{
-	return count & static_cast<unsigned int>(~(4 - 1));
-}
-
 /**
  * @brief Round a count down to the largest multiple of the SIMD width.
  *
@@ -187,9 +225,9 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int coun
  *
  * @return The rounded value.
  */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
+ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
 {
-	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
+	return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
 }
 
 /**
@@ -201,9 +239,9 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int co
  *
  * @return The rounded value.
  */
-ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
+ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
 {
-	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
+	size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
 	return multiples * ASTCENC_SIMD_WIDTH;
 }
 
@@ -239,8 +277,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
 ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
 {
 	vfloat z = atan(abs(y / x));
-	vmask xmask = vmask(float_as_int(x).m);
-	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+	vmask xmask = x < vfloat::zero();
+	return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
 }
 
 /*

+ 163 - 157
thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2024 Arm Limited
+// Copyright 2019-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -54,7 +54,7 @@ struct vfloat8
 	ASTCENC_SIMD_INLINE vfloat8() = default;
 
 	/**
-	 * @brief Construct from 4 values loaded from an unaligned address.
+	 * @brief Construct from 8 values loaded from an unaligned address.
 	 *
 	 * Consider using loada() which is better with vectors if data is aligned
 	 * to vector length.
@@ -74,18 +74,6 @@ struct vfloat8
 		m = _mm256_set1_ps(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vfloat8(
-		float a, float b, float c, float d,
-		float e, float f, float g, float h)
-	{
-		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -94,20 +82,6 @@ struct vfloat8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar value of a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256_f32[l];
-	#else
-		union { __m256 m; float f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -132,14 +106,6 @@ struct vfloat8
 		return vfloat8(_mm256_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
-	{
-		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief The vector ...
 	 */
@@ -183,25 +149,13 @@ struct vint8
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint8(int a)
 	{
 		m = _mm256_set1_epi32(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vint8(
-		int a, int b, int c, int d,
-		int e, int f, int g, int h)
-	{
-		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -210,20 +164,6 @@ struct vint8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar from a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE int lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256i_i32[l];
-	#else
-		union { __m256i m; int f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -518,31 +458,54 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
  */
 ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 {
-	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+	// Build min within groups of 2, then 4, then 8
+	__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+	m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+	m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
 
-	__m256i r = astcenc_mm256_set_m128i(m, m);
-	vint8 vmin(r);
+	vint8 vmin(m);
 	return vmin;
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmin(a).m);
+}
+
 /**
  * @brief Return the horizontal maximum of a vector.
  */
 ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 {
-	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+	// Build max within groups of 2, then 4, then 8
+	__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+	m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+	m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
 
-	__m256i r = astcenc_mm256_set_m128i(m, m);
-	vint8 vmax(r);
+	vint8 vmax(m);
 	return vmax;
 }
 
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmax(a).m);
+}
+
+/**
+ * @brief Generate a vint8 from a size_t.
+ */
+ ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+ {
+	assert(a <= std::numeric_limits<int>::max());
+	return vint8(static_cast<int>(a));
+ }
+
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */
@@ -570,18 +533,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
 	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
-{
-	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
 {
 	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
 	                               0, 0, 0, 0, 28, 24, 20, 16,
@@ -593,7 +548,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
 	__m128i b = _mm_unpacklo_epi32(a0, a1);
 
 	__m256i r = astcenc_mm256_set_m128i(b, b);
-	return vint8(r);
+
+	store_nbytes(vint8(r), p);
 }
 
 /**
@@ -606,7 +562,7 @@ ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
 }
 
 // ============================================================================
-// vfloat4 operators and functions
+// vfloat8 operators and functions
 // ============================================================================
 
 /**
@@ -674,7 +630,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
 	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
 }
 
-
 /**
  * @brief Overload: scalar by vector division.
  */
@@ -683,7 +638,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
 	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
 }
 
-
 /**
  * @brief Overload: vector by vector equality.
  */
@@ -786,19 +740,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
 	return a;
 }
 
-/**
- * @brief Return a clamped value between 0.0f and max.
- *
- * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
-{
-	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
-	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
-	return a;
-}
-
 /**
  * @brief Return a clamped value between 0.0f and 1.0f.
  *
@@ -857,7 +798,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
 {
-	return hmin(a).lane<0>();
+	return _mm256_cvtss_f32(hmin(a).m);
 }
 
 /**
@@ -887,7 +828,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
 {
-	return hmax(a).lane<0>();
+	return _mm256_cvtss_f32(hmax(a).m);
 }
 
 /**
@@ -909,14 +850,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
 	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
-{
-	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
-}
-
 /**
  * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
  *
@@ -979,6 +912,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
 	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_X86_GATHERS == 0
+	// Perform manual gather using scalar loads in two separate dependency chains,
+	// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
+	// into a bunch of memory-operand inserts on 128-bit halves then merges late,
+	// which performs significantly worse in tests.
+	__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
+	__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
+	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
+	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
+	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
+	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
+	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
+	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
+
+	return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
+#else
+	vint8 inds(indices);
+	return gatherf(base, inds);
+#endif
+}
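The dependency-chain comment above is the heart of this path: the eight scalar broadcasts feed two independent blend chains (m0 and m1) that only meet in the final blend, roughly halving the serial latency compared with one long chain. The same idea reduced to a scalar sum, shown only as an illustration:

#include <cstdint>

// Illustrative only: two independent accumulators merged late shorten the
// critical path in the same way the m0/m1 blend chains above do.
static inline float gather_sum_two_chains(const float* base, const uint8_t* idx)
{
	float acc0 = base[idx[0]] + base[idx[2]] + base[idx[4]] + base[idx[6]];
	float acc1 = base[idx[1]] + base[idx[3]] + base[idx[5]] + base[idx[7]];
	return acc0 + acc1; // merge the two chains at the end
}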
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1045,98 +1005,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
 	return vfloat8(_mm256_castsi256_ps(a.m));
 }
 
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable8_16x8 {
+	vint8 t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable8_32x8 {
+	vint8 t0;
+	vint8 t1;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable8_64x8 {
+	vint8 t0;
+	vint8 t1;
+	vint8 t2;
+	vint8 t3;
+};
+
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
-{
-	// AVX2 duplicates the table within each 128-bit lane
-	__m128i t0n = t0.m;
-	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_16x8& table,
+	const uint8_t* data
+) {
+	// AVX2 tables duplicate table entries in each 128-bit half-register
+	vint4 d0 = vint4::load(data);
+
+	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
-{
-	// AVX2 duplicates the table within each 128-bit lane
-	__m128i t0n = t0.m;
-	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_32x8& table,
+	const uint8_t* data
+) {
+	// AVX2 tables duplicate table entries in each 128-bit half-register
+	vint4 d0 = vint4::load(data);
+	vint4 d1 = vint4::load(data + 16);
 
-	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
+	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+
+	// XOR chain the high rows to allow table emulation
+	table.t1 = table.t1 ^ table.t0;
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
-{
-	// AVX2 duplicates the table within each 128-bit lane
-	__m128i t0n = t0.m;
-	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
-
-	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
+	vtable8_64x8& table,
+	const uint8_t* data
+) {
+	// AVX2 tables duplicate table entries in each 128-bit half-register
+	vint4 d0 = vint4::load(data);
+	vint4 d1 = vint4::load(data + 16);
+	vint4 d2 = vint4::load(data + 32);
+	vint4 d3 = vint4::load(data + 48);
 
-	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
-	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));
+	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+	table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
+	table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));
 
-	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
-	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
+	// XOR chain the high rows to allow table emulation
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_16x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_32x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_64x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t2.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t3.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 
 	return vint8(result);
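The XOR chaining applied in vtable_prepare() is what makes these stacked shuffles correct: _mm256_shuffle_epi8 uses only the low four bits of an in-range index byte and returns zero once the repeatedly decremented index byte has its MSB set, so the redundant reads from rows below the target cancel pairwise and only the intended row's byte survives. A scalar model of the 64-entry case, for illustration only:

#include <cstdint>

// Models _mm256_shuffle_epi8 on one 16-byte row: a negative (MSB-set) index
// byte yields zero, otherwise only the low four bits select a byte.
static inline uint8_t shuffle_row_model(const uint8_t row[16], int i)
{
	return (i < 0) ? 0 : row[i & 15];
}

// Scalar model of vtable_prepare() plus vtable_lookup_32bit() for a 64-entry
// table; the real code does this for 32 index bytes at a time.
static inline uint8_t vtable64_model(const uint8_t table[64], int idx)
{
	// Prepare: XOR-chain each row with the row below it
	uint8_t r[4][16];
	for (int i = 0; i < 16; i++)
	{
		r[0][i] = table[i];
		r[1][i] = table[16 + i] ^ table[i];
		r[2][i] = table[32 + i] ^ table[16 + i];
		r[3][i] = table[48 + i] ^ table[32 + i];
	}

	// Lookup: rows below the target read the same column and cancel through
	// the XOR chain; rows above it contribute zero once the index is negative.
	uint8_t result = shuffle_row_model(r[0], idx);
	result ^= shuffle_row_model(r[1], idx - 16);
	result ^= shuffle_row_model(r[2], idx - 32);
	result ^= shuffle_row_model(r[3], idx - 48);
	return result; // equals table[idx] for idx in [0, 63]
}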
@@ -1146,7 +1148,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
  * @brief Return a vector of interleaved RGBA data.
  *
  * Input vectors have the value stored in the bottom 8 bits of each lane,
- * with high  bits set to zero.
+ * with high bits set to zero.
  *
  * Output vector stores a single RGBA texel packed in each lane.
  */
@@ -1183,8 +1185,12 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
 {
 	alignas(32) int v[8];
 	storea(a, v);
+
+	unsigned int uv[8];
+	std::memcpy(uv, v, sizeof(int) * 8);
+
 	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
-	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+		uv[0], uv[1], uv[2], uv[3], uv[4], uv[5], uv[6], uv[7]);
 }
 
 /**

+ 32 - 34
thirdparty/astcenc/astcenc_vecmathlib_common_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2024 Arm Limited
+// Copyright 2020-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -31,26 +31,7 @@
 #endif
 
 #include <cstdio>
-
-// ============================================================================
-// vmask4 operators and functions
-// ============================================================================
-
-/**
- * @brief True if any lanes are enabled, false otherwise.
- */
-ASTCENC_SIMD_INLINE bool any(vmask4 a)
-{
-	return mask(a) != 0;
-}
-
-/**
- * @brief True if all lanes are enabled, false otherwise.
- */
-ASTCENC_SIMD_INLINE bool all(vmask4 a)
-{
-	return mask(a) == 0xF;
-}
+#include <limits>
 
 // ============================================================================
 // vint4 operators and functions
@@ -129,6 +110,31 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
 	return a.lane<0>() + a.lane<1>() + a.lane<2>();
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Generate a vint4 from a size_t.
+ */
+ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
+{
+	assert(a <= std::numeric_limits<int>::max());
+	return vint4(static_cast<int>(a));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+	return hmax(a).lane<0>();
+}
+
 // ============================================================================
 // vfloat4 operators and functions
 // ============================================================================
@@ -222,18 +228,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
 	return min(max(a, minv), maxv);
 }
 
-/**
- * @brief Return the clamped value between 0.0f and max.
- *
- * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
-{
-	// Do not reorder - second operand will return if either is NaN
-	return min(max(a, vfloat4::zero()), maxv);
-}
-
 /**
  * @brief Return the clamped value between 0.0f and 1.0f.
  *
@@ -396,8 +390,12 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
 {
 	ASTCENC_ALIGNAS int v[4];
 	storea(a, v);
+
+	unsigned int uv[4];
+	std::memcpy(uv, v, sizeof(int) * 4);
+
 	printf("v4_i32:\n  %08x %08x %08x %08x\n",
-	       v[0], v[1], v[2], v[3]);
+		uv[0], uv[1], uv[2], uv[3]);
 }
 
 /**

+ 120 - 98
thirdparty/astcenc/astcenc_vecmathlib_neon_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -115,7 +115,7 @@ struct vfloat4
 	 */
 	static ASTCENC_SIMD_INLINE vfloat4 zero()
 	{
-		return vfloat4(vdupq_n_f32(0.0f));
+		return vfloat4(0.0f);
 	}
 
 	/**
@@ -134,15 +134,6 @@ struct vfloat4
 		return vfloat4(vld1q_f32(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
-		return vfloat4(vld1q_f32(data));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -203,16 +194,21 @@ struct vint4
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
 	{
-		// Cast is safe - NEON loads are allowed to be unaligned
-		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
-		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#if ASTCENC_SVE == 0
+	// Cast is safe - NEON loads are allowed to be unaligned
+	uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
+	uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
+	m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#else
+	svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
+	m = svget_neonq(data);
+#endif
 	}
 
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -420,6 +416,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return vaddvq_u32(vshlq_u32(tmp, shift));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return vmaxvq_u32(a.m) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return vminvq_u32(a.m) != 0;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -570,15 +582,6 @@ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 	return vint4(vmaxvq_s32(a.m));
 }
 
-/**
- * @brief Return the horizontal sum of a vector.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
-	return vget_lane_s32(vpadd_s32(t, t), 0);
-}
-
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */
@@ -612,31 +615,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 }
 
 /**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	alignas(16) int vals[4];
-	vals[0] = base[idx[0]];
-	vals[1] = base[idx[1]];
-	vals[2] = base[idx[2]];
-	vals[3] = base[idx[3]];
-	return vint4(vals);
-}
-
-/**
- * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ * @brief Pack and store low 8 bits of each vector lane.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
 {
 	alignas(16) uint8_t shuf[16] {
 		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
 	};
 	uint8x16_t idx = vld1q_u8(shuf);
 	int8x16_t av = vreinterpretq_s8_s32(a.m);
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	store_nbytes(a, data);
 }
 
 /**
@@ -814,21 +803,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
-	uint32x4_t mask = vcgeq_u32(cond.m, msb);
-	return vfloat4(vbslq_f32(mask, b.m, a.m));
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
+#if ASTCENC_SVE == 0
 	alignas(16) int idx[4];
 	storea(indices, idx);
 	alignas(16) float vals[4];
@@ -837,8 +817,32 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	vals[2] = base[idx[2]];
 	vals[3] = base[idx[3]];
 	return vfloat4(vals);
+#else
+	svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_SVE == 0
+	alignas(16) float vals[4];
+	vals[0] = base[indices[0]];
+	vals[1] = base[indices[1]];
+	vals[2] = base[indices[2]];
+	vals[3] = base[indices[3]];
+	return vfloat4(vals);
+#else
+	svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
+}
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -950,87 +954,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(vreinterpretq_f32_s32(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
+struct vtable4_16x8 {
+	uint8x16_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	uint8x16x2_t t01;
+};
 
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	uint8x16x4_t t0123;
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.t0 = vld1q_u8(data);
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.t01 = uint8x16x2_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16)
+	};
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	int8x16_t table {
-		vreinterpretq_s8_s32(t0.m)
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.t0123 = uint8x16x4_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16),
+		vld1q_u8(data + 32),
+		vld1q_u8(data + 48)
 	};
+}
 
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	int8x16x2_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m)
-	};
-
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	int8x16x4_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m),
-		vreinterpretq_s8_s32(t2.m),
-		vreinterpretq_s8_s32(t3.m)
-	};
-
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
 }
 
 /**

+ 111 - 102
thirdparty/astcenc/astcenc_vecmathlib_none_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2024 Arm Limited
+// Copyright 2019-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -139,14 +139,6 @@ struct vfloat4
 		return vfloat4(p);
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -233,7 +225,7 @@ struct vint4
 	/**
 	 * @brief Construct from 4 scalar values replicated across all lanes.
 	 *
-	 * Consider using vint4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -354,7 +346,7 @@ struct vmask4
 	/**
 	 * @brief Get the scalar value of a single lane.
 	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
 	{
 		return m[l] != 0;
 	}
@@ -420,10 +412,26 @@ ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
  */
 ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 {
-	return ((a.m[0] >> 31) & 0x1) |
-	       ((a.m[1] >> 30) & 0x2) |
-	       ((a.m[2] >> 29) & 0x4) |
-	       ((a.m[3] >> 28) & 0x8);
+	return (a.m[0] & 0x1) |
+	       (a.m[1] & 0x2) |
+	       (a.m[2] & 0x4) |
+	       (a.m[3] & 0x8);
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
 }
 
 // ============================================================================
@@ -638,14 +646,6 @@ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 	return vint4(std::max(b, c));
 }
 
-/**
- * @brief Return the horizontal sum of vector lanes as a scalar.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	return a.m[0] + a.m[1] + a.m[2] + a.m[3];
-}
-
 /**
  * @brief Store a vector to an aligned memory address.
  */
@@ -684,29 +684,23 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	return vint4(base[indices.m[0]],
-	             base[indices.m[1]],
-	             base[indices.m[2]],
-	             base[indices.m[3]]);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 	int b0 = a.m[0] & 0xFF;
 	int b1 = a.m[1] & 0xFF;
 	int b2 = a.m[2] & 0xFF;
 	int b3 = a.m[3] & 0xFF;
 
+#if !defined(ASTCENC_BIG_ENDIAN)
 	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
-	return vint4(b, 0, 0, 0);
+#else
+	int b = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
+#endif
+	a = vint4(b, 0, 0, 0);
+	store_nbytes(a, p);
 }
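Both branches target the same memory layout, namely the low byte of lane 0 at p[0] through lane 3 at p[3]; only the order in which the 32-bit word is assembled changes, so that the host's native store serialises it correctly. An endian-agnostic scalar equivalent, shown only for illustration:

#include <cstdint>

// Illustrative only: writing each lane's low byte directly produces the same
// four bytes as either branch above, regardless of host byte order.
static inline void pack_low_bytes_model(const int v[4], uint8_t* p)
{
	for (int i = 0; i < 4; i++)
	{
		p[i] = static_cast<uint8_t>(v[i] & 0xFF);
	}
}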
 
 /**
@@ -934,17 +928,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
-	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
-	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
-	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
@@ -956,6 +939,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	               base[indices.m[3]]);
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	return vfloat4(base[indices[0]],
+	               base[indices[1]],
+	               base[indices[2]],
+	               base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1080,84 +1075,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
 	return r;
 }
 
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	const uint8_t* data;
+};
+
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	uint8_t table[16];
-
-	std::memcpy(table +  0, t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
-
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	uint8_t table[32];
-
-	std::memcpy(table +  0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	uint8_t table[64];
-
-	std::memcpy(table +  0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
 /**
@@ -1170,7 +1175,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3
  */
 ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
 {
+#if !defined(ASTCENC_BIG_ENDIAN)
 	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+#else
+	return a + lsl<8>(b) + lsl<16>(g) + lsl<24>(r);
+#endif
 }
 
 /**

+ 155 - 130
thirdparty/astcenc/astcenc_vecmathlib_sse_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -142,14 +142,6 @@ struct vfloat4
 		return vfloat4(_mm_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(_mm_set_ps(3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -229,7 +221,7 @@ struct vint4
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -436,6 +428,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return static_cast<unsigned int>(_mm_movemask_ps(a.m));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -598,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
  */
 ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
 {
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /*
@@ -608,25 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
  */
 ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 {
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
-}
-
-/**
- * @brief Return the horizontal sum of a vector as a scalar.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	// Add top and bottom halves, lane 1/0
-	__m128i fold = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(a.m),
-	                                              _mm_castsi128_ps(a.m)));
-	__m128i t = _mm_add_epi32(a.m, fold);
-
-	// Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
-	t = _mm_add_epi32(t, _mm_shuffle_epi32(t, 0x55));
-
-	return _mm_cvtsi128_si32(t);
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /**
@@ -663,32 +655,20 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-#if ASTCENC_AVX >= 2
-	return vint4(_mm_i32gather_epi32(base, indices.m, 4));
-#else
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
-#endif
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 #if ASTCENC_SSE >= 41
 	__m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
-	return vint4(_mm_shuffle_epi8(a.m, shuf));
+	a = vint4(_mm_shuffle_epi8(a.m, shuf));
+	store_nbytes(a, p);
 #else
 	__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
 	__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
-	return vint4(_mm_unpacklo_epi16(va, vb));
+	a = vint4(_mm_unpacklo_epi16(va, vb));
+	store_nbytes(a, p);
 #endif
 }
 
@@ -899,25 +879,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 #endif
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-#if ASTCENC_SSE >= 41
-	return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
-#else
-	__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
-	return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)));
-#endif
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
-#if ASTCENC_AVX >= 2
+#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
 	return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
 #else
 	alignas(16) int idx[4];
@@ -926,6 +893,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 #endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	// Experimentally, in this particular use case (byte indices in memory),
+	// using 4 separate scalar loads is appreciably faster than using gathers
+	// even if they're available, on every x86 uArch tried, so always do the
+	// separate loads even when ASTCENC_X86_GATHERS is enabled.
+	//
+	// Tested on:
+	//   - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
+	//   - AMD Zen 2, Zen 4
+	return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1054,136 +1038,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(_mm_castsi128_ps(v.m));
 }
 
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+	vint4 t2;
+	vint4 t3;
+#else
+	const uint8_t* data;
+#endif
+};
+
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+#if ASTCENC_SSE >= 41
+	table.t0 = vint4::load(data);
+#else
+	table.data = data;
+#endif
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+
+	table.t1 = table.t1 ^ table.t0;
 #else
-	t0p = t0;
-	t1p = t1;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-	t2p = t1 ^ t2;
-	t3p = t2 ^ t3;
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+	table.t2 = vint4::load(data + 32);
+	table.t3 = vint4::load(data + 48);
+
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 #else
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	return vint4(result);
 #else
-	uint8_t table[16];
-
-	std::memcpy(table +  0, &t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 
 	return vint4(result);
 #else
-	uint8_t table[32];
-
-	std::memcpy(table +  0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t2.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t3.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm_xor_si128(result, result2);
 
 	return vint4(result);
 #else
-	uint8_t table[64];
-
-	std::memcpy(table +  0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
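Across all backends the reworked interface is the same: callers build an opaque per-ISA table struct once from the raw byte table and then reuse it for lookups, instead of passing four pre-split vint4 registers per call. A minimal usage sketch, assuming astcenc_vecmathlib.h is included; quant_table is a made-up name, not an identifier from this diff:

// Illustrative only.
static vint4 remap_indices(const uint8_t quant_table[64], vint4 idx)
{
	vtable4_64x8 table;
	vtable_prepare(table, quant_table);     // one-time setup per table
	return vtable_lookup_32bit(table, idx); // per-lane byte lookup
}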
 
@@ -1307,7 +1328,11 @@ ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
  */
 ASTCENC_SIMD_INLINE int popcount(uint64_t v)
 {
+#if !defined(__x86_64__) && !defined(_M_AMD64)
+	return static_cast<int>(__builtin_popcountll(v));
+#else
 	return static_cast<int>(_mm_popcnt_u64(v));
+#endif
 }
 
 #endif // ASTCENC_POPCNT >= 1

+ 1101 - 0
thirdparty/astcenc/astcenc_vecmathlib_sve_8.h

@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 8x32-bit vectors, implemented using SVE.
+ *
+ * This module implements 8-wide 32-bit float, int, and mask vectors for Arm
+ * SVE.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ */
+
+#ifndef ASTC_VECMATHLIB_SVE_8_H_INCLUDED
+#define ASTC_VECMATHLIB_SVE_8_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+typedef svbool_t svbool_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint8_t svuint8_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint16_t svuint16_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint32_t svuint32_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svint32_t svint32_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svfloat32_t svfloat32_8_t __attribute__((arm_sve_vector_bits(256)));
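These typedefs pin the scalable SVE types to a fixed 256-bit width, which is only valid when the translation unit is built for a known vector length (for example -msve-vector-bits=256 on GCC and Clang, which defines __ARM_FEATURE_SVE_BITS). A guard along these lines would make that assumption explicit; it is a sketch, not part of the upstream header:

#if defined(__ARM_FEATURE_SVE_BITS)
	static_assert(__ARM_FEATURE_SVE_BITS == 256,
	              "The 8-wide SVE backend assumes a fixed 256-bit SVE vector length");
#endif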
+
+// ============================================================================
+// vfloat8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide floats.
+ */
+struct vfloat8
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
+	{
+		m = svld1_f32(svptrue_b32(), p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
+	{
+		m = svdup_f32(a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(svfloat32_8_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 zero()
+	{
+		return vfloat8(0.0f);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
+	{
+		return vfloat8(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 32B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
+	{
+		return vfloat8(p);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	svfloat32_8_t m;
+};
+
+// ============================================================================
+// vint8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide ints.
+ */
+struct vint8
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
+	{
+		m = svld1_s32(svptrue_b32(), p);
+	}
+
+	/**
+	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
+	{
+		// Load 8-bit values and expand to 32-bits
+		m = svld1ub_s32(svptrue_b32(), p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(int a)
+	{
+		m = svdup_s32(a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(svint32_8_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 zero()
+	{
+		return vint8(0);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
+	{
+		return vint8(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from unaligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
+	{
+		svuint8_8_t data = svld1_u8(svptrue_b8(), p);
+		return vint8(svreinterpret_s32_u8(data));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 32B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
+	{
+		return vint8(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 lane_id()
+	{
+		return vint8(svindex_s32(0, 1));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	 svint32_8_t m;
+};
+
+// ============================================================================
+// vmask8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide control plane masks.
+ */
+struct vmask8
+{
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(svbool_8_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
+	{
+		m = svdup_b32(a);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	svbool_8_t m;
+};
+
+// ============================================================================
+// vmask8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
+{
+	return vmask8(svorr_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
+{
+	return vmask8(svand_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
+{
+	return vmask8(sveor_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
+{
+	return vmask8(svnot_z(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a 8-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
+{
+	alignas(32) const int shifta[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
+	svint32_8_t template_vals = svld1_s32(svptrue_b32(), shifta);
+	svint32_8_t active_vals = svsel_s32(a.m, template_vals, svdup_s32(0));
+	return static_cast<unsigned int>(svaddv_s32(svptrue_b32(), active_vals));
+}
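SVE has no direct movemask equivalent, so the function above weights each active lane by its bit value and horizontally adds the survivors. The same computation in scalar form, for illustration only:

// Illustrative scalar model of mask(): svsel keeps the weight of active lanes
// (zero otherwise) and svaddv sums them into the final bitmask.
static inline unsigned int mask8_model(const bool lane_active[8])
{
	static const int weights[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
	int sum = 0;
	for (int i = 0; i < 8; i++)
	{
		sum += lane_active[i] ? weights[i] : 0;
	}
	return static_cast<unsigned int>(sum);
}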
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask8 a)
+{
+	return svptest_any(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask8 a)
+{
+	return !svptest_any(svptrue_b32(), (~a).m);
+}
+
+// ============================================================================
+// vint8 operators and functions
+// ============================================================================
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
+{
+	return vint8(svadd_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
+{
+	return vint8(svsub_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
+{
+	return vint8(svmul_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
+{
+	return vint8(svnot_s32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
+{
+	return vint8(svorr_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
+{
+	return vint8(svand_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
+{
+	return vint8(sveor_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
+{
+	return vmask8(svcmpeq_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
+{
+	return vmask8(svcmpne_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
+{
+	return vmask8(svcmplt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
+{
+	return vmask8(svcmpgt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
+{
+	return vint8(svlsl_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
+{
+	return vint8(svasr_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
+{
+	svuint32_8_t r = svreinterpret_u32_s32(a.m);
+	r = svlsr_n_u32_x(svptrue_b32(), r, s);
+	return vint8(svreinterpret_s32_u32(r));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
+{
+	return vint8(svmin_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
+{
+	return vint8(svmax_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
+{
+	return vint8(svminv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+	return svminv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
+{
+	return vint8(svmaxv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+	return svmaxv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Generate a vint8 from a size_t.
+ */
+ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+{
+	assert(a <= std::numeric_limits<int>::max());
+	return vint8(static_cast<int>(a));
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
+{
+	svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
+{
+	svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
+{
+	svuint8_8_t r = svreinterpret_u8_s32(a.m);
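+	// SV_VL8 limits the predicate to the first 8 byte lanes of the register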
+	svst1_u8(svptrue_pat_b8(SV_VL8), p, r);
+}
+
+/**
+ * @brief Pack and store the low 8 bits of each lane to an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
+{
+	svuint32_8_t data = svreinterpret_u32_s32(v.m);
+	svst1b_u32(svptrue_b32(), p, data);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
+{
+	return vint8(svsel_s32(cond.m, b.m, a.m));
+}
+
+// ============================================================================
+// vfloat8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svadd_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svsub_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svmul_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
+{
+	return vfloat8(svmul_f32_x(svptrue_b32(), a.m, svdup_f32(b)));
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
+{
+	return vfloat8(svmul_f32_x(svptrue_b32(), svdup_f32(a), b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
+{
+	return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, svdup_f32(b)));
+}
+
+/**
+ * @brief Overload: scalar by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
+{
+	return vfloat8(svdiv_f32_x(svptrue_b32(), svdup_f32(a), b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpeq_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpne_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmplt_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpgt_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmple_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpge_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svminnm_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
+{
+	return min(a, vfloat8(b));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svmaxnm_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
+{
+	return max(a, vfloat8(b));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c minv nor @c maxv is a NaN. If @c a is NaN
+ * then @c minv will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clamp(float minv, float maxv, vfloat8 a)
+{
+	return min(max(a, minv), maxv);
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
+{
+	return clamp(0.0f, 1.0f, a);
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
+{
+	return vfloat8(svabs_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ */
+ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
+{
+	return vfloat8(svrintn_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
+{
+	return vfloat8(svminnmv_f32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
+{
+	return svminnmv_f32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
+{
+	return vfloat8(svmaxnmv_f32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
+{
+	return svmaxnmv_f32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal sum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
+{
+	// Can't use svaddv - it's not invariant
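+	// (svaddv is a recursive pairwise reduction, so its rounding order
+	// differs from the folded 4-wide sum that keeps results bit-exact
+	// across SIMD widths)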
+	vfloat4 lo(svget_neonq_f32(a.m));
+	vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4)));
+	return hadd_s(lo) + hadd_s(hi);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
+{
+	return vfloat8(svsel_f32(cond.m, b.m, a.m));
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
+ *
+ * This is invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
+{
+	vfloat4 lo(svget_neonq_f32(a.m));
+	haccumulate(accum, lo);
+
+	vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4)));
+	haccumulate(accum, hi);
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector.
+ *
+ * This is NOT invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
+{
+	accum += a;
+}
+
+/**
+ * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
+ *
+ * This is invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
+{
+	a = select(vfloat8::zero(), a, m);
+	haccumulate(accum, a);
+}
+
+/**
+ * @brief Accumulate masked lane-wise sums for a vector.
+ *
+ * This is NOT invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
+{
+	accum.m = svadd_f32_m(m.m, accum.m, a.m);
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
+{
+	return vfloat8(svsqrt_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array.
+ */
+ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
+{
+	return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory.
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
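+	// svld1ub zero-extends each 8-bit index into a 32-bit gather offset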
+	svint32_t offsets = svld1ub_s32(svptrue_b32(), indices);
+	return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets));
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
+{
+	svst1_f32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
+{
+	svst1_f32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Return an integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
+{
+	return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return an integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
+{
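+	// Adding 0.5 before the truncating convert rounds to nearest for the
+	// non-negative values this codec feeds in (ties round upwards)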
+	a = a + vfloat8(0.5f);
+	return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a float value for an integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
+{
+	return vfloat8(svcvt_f32_s32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
+{
+	return vint8(svreinterpret_s32_f32(a.m));
+}
+
+/**
+ * @brief Return an integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
+{
+	return vfloat8(svreinterpret_f32_s32(a.m));
+}
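+
+// Illustrative sketch, not part of the upstream header: the reinterpret pair
+// above enables branch-free IEEE 754 bit tricks, for example an absolute
+// value computed by clearing the sign bit in the integer domain.
+ASTCENC_SIMD_INLINE vfloat8 example_abs_via_bits(vfloat8 a)
+{
+	// Clearing bit 31 of every lane leaves the bit pattern of |a|
+	return int_as_float(float_as_int(a) & vint8(0x7FFFFFFF));
+}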
+
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable8_16x8 {
+	svuint8_8_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable8_32x8 {
+	svuint8_8_t t0;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable8_64x8 {
+	svuint8_8_t t0;
+	svuint8_8_t t1;
+};
+
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_16x8& table,
+	const uint8_t* data
+) {
+	// Top half of register will be zeros
+	table.t0 = svld1_u8(svptrue_pat_b8(SV_VL16), data);
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_32x8& table,
+	const uint8_t* data
+) {
+	table.t0 = svld1_u8(svptrue_b8(), data);
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_64x8& table,
+	const uint8_t* data
+) {
+	table.t0 = svld1_u8(svptrue_b8(), data);
+	table.t1 = svld1_u8(svptrue_b8(), data + 32);
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_16x8& tbl,
+	vint8 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
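+	// (OR-ing with 0xFFFFFF00 pushes the three high bytes of every 32-bit
+	// index out of range, so svtbl returns zero for them and only the low
+	// byte of each lane selects a table entry)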
+	svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+	svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+
+	svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
+	return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_32x8& tbl,
+	vint8 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+	svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+
+	svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
+	return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ *
+ * Future: SVE2 can directly do svtbl2_u8() for a two register table.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_64x8& tbl,
+	vint8 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	svint32_8_t idxm = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+
+	svuint8_8_t idxm8 = svreinterpret_u8_s32(idxm);
+	svuint8_8_t t0_lookup = svtbl_u8(tbl.t0, idxm8);
+
+	idxm8 = svsub_u8_x(svptrue_b8(), idxm8, svdup_u8(32));
+	svuint8_8_t t1_lookup = svtbl_u8(tbl.t1, idxm8);
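+	// Lanes with indices 0-31 hit t0 and read zero from t1 (the subtract
+	// wraps them out of range); lanes with 32-63 read zero from t0 and hit
+	// t1, so OR-ing the two partial lookups merges the results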
+
+	svuint8_8_t result = svorr_u8_x(svptrue_b8(), t0_lookup, t1_lookup);
+	return vint8(svreinterpret_s32_u8(result));
+}
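+
+// Illustrative usage sketch, not part of the upstream header: prepare a
+// 64-entry byte table once and gather one byte per 32-bit lane. The helper
+// name and table contents are hypothetical.
+ASTCENC_SIMD_INLINE vint8 example_table_gather(const uint8_t table_data[64], vint8 idx)
+{
+	vtable8_64x8 tbl;
+	vtable_prepare(tbl, table_data);
+	return vtable_lookup_32bit(tbl, idx);
+}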
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
+{
+	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
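+
+// Illustrative usage sketch, not part of the upstream header: pack eight
+// decoded texels and write them to an RGBA8 buffer with a single unaligned
+// 32-byte store. The helper name is an assumption for illustration.
+ASTCENC_SIMD_INLINE void example_write_rgba8(uint8_t* dst, vint8 r, vint8 g, vint8 b, vint8 a)
+{
+	vint8 rgba = interleave_rgba8(r, g, b, a);
+	store(rgba, reinterpret_cast<int*>(dst));
+}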
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of the vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
+{
+	svst1_s32(mask.m, reinterpret_cast<int32_t*>(base), data.m);
+}
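+
+// Illustrative usage sketch, not part of the upstream header: build a mask
+// covering the first count lanes and store only those lanes (vint8::lane_id()
+// is assumed available here, since the generic code relies on vint::lane_id()).
+ASTCENC_SIMD_INLINE void example_store_partial(uint8_t* dst, vint8 data, int count)
+{
+	vmask8 mask = vint8::lane_id() < vint8(count);
+	store_lanes_masked(dst, data, mask);
+}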
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint8 a)
+{
+	alignas(32) int v[8];
+	storea(a, v);
+	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
+	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint8 a)
+{
+	alignas(32) int v[8];
+	storea(a, v);
+	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat8 a)
+{
+	alignas(32) float v[8];
+	storea(a, v);
+	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
+	       static_cast<double>(v[0]), static_cast<double>(v[1]),
+	       static_cast<double>(v[2]), static_cast<double>(v[3]),
+	       static_cast<double>(v[4]), static_cast<double>(v[5]),
+	       static_cast<double>(v[6]), static_cast<double>(v[7]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask8 a)
+{
+	print(select(vint8(0), vint8(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_SVE_8_H_INCLUDED

+ 37 - 20
thirdparty/astcenc/astcenc_weight_align.cpp

@@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>
 
 static constexpr unsigned int ANGULAR_STEPS { 32 };
 
@@ -104,14 +105,17 @@ static void compute_angular_offsets(
 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		// Add 2^23 and interpreting bits extracts round-to-nearest int
-		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
-		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		// Ideal weight can be outside [0, 1] range, so clamp to fit table
+		vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
+
+		// Convert a weight to a sincos table index
+		vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
+		vint isample = float_to_int_rtn(sample);
 		storea(isample, isamplev + i);
 	}
 
 	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
-	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+	vfloat mult(1.0f / (2.0f * astc::PI));
 
 	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
 	{
@@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);
 
-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
+
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+
+	vint lane_id = vint::lane_id();
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vmask active = lane_id < vint(weight_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, select(min_weight, weights, active));
+		max_weight = max(max_weight, select(max_weight, weights, active));
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
 
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);
 
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
@@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight(
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;
 
-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
 
-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);