
Update astcenc to the upstream 5.3.0 release

This is mostly a maintenance update that brings the compressor in line
with the recently published Khronos Data Format Specification 1.4
release, which clarified some ambiguities in the specification. This
update also brings minor codec optimizations, bug fixes, and image
quality improvements.

The biggest improvement for Godot is that builds using MSVC cl.exe will
now correctly default to the SSE2-optimized backend rather than the
reference C backend. This makes compression more than 3 times faster.
Builds using other compilers (GCC, LLVM/Clang) were not affected by the
underlying issue, and so see no performance uplift.
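
For context, the fix amounts to the backend auto-detection in astcenc_mathlib.h also accepting the MSVC architecture macros, since cl.exe does not define __SSE2__ on x64. A simplified sketch of the relevant detection (the real header, shown in the diff below, also covers SSE4.1, AVX2, NEON, and SVE):

// Simplified sketch of the SSE level auto-detection in astcenc_mathlib.h.
// GCC and Clang define __SSE2__ for x86-64 targets, but MSVC cl.exe does
// not, so the extra _M_AMD64 check is what stops MSVC builds falling back
// to the reference C path. _M_ARM64EC is excluded because ARM64EC builds
// also define the x64 macros while actually targeting Arm.
#ifndef ASTCENC_SSE
  #if defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
    #define ASTCENC_SSE 20
  #else
    #define ASTCENC_SSE 0
  #endif
#endif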
Peter Harris 4 months ago
parent
commit
75ce42d463

+ 1 - 1
COPYRIGHT.txt

@@ -180,7 +180,7 @@ License: BSD-3-clause
 
 Files: thirdparty/astcenc/*
 Comment: Arm ASTC Encoder
-Copyright: 2011-2024, Arm Limited
+Copyright: 2011-2025, Arm Limited
 License: Apache-2.0
 
 Files: thirdparty/basis_universal/*

+ 1 - 1
thirdparty/README.md

@@ -50,7 +50,7 @@ Files extracted from upstream source:
 ## astcenc
 
 - Upstream: https://github.com/ARM-software/astc-encoder
-- Version: 4.8.0 (0d6c9047c5ad19640e2d60fdb8f11a16675e7938, 2024)
+- Version: 5.3.0 (bf32abd05eccaf3042170b2a85cebdf0bfee5873, 2025)
 - License: Apache 2.0
 
 Files extracted from upstream source:

+ 15 - 1
thirdparty/astcenc/astcenc.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2024 Arm Limited
+// Copyright 2020-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -784,6 +784,20 @@ ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
 ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
 	astcenc_context* context);
 
+/**
+ * @brief Cancel any pending compression operation.
+ *
+ * The caller must behave as if the compression completed normally, even though the data will be
+ * undefined. They are still responsible for synchronizing threads in the worker thread pool, and
+ * must call reset before starting another compression.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
+	astcenc_context* context);
+
 /**
  * @brief Decompress an image.
  *
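
A rough usage sketch for the new entry point follows. The worker-thread wiring is hypothetical; astcenc_compress_cancel and astcenc_compress_reset are the real API calls, and the caller-side responsibilities match the documentation above.

#include <thread>
#include <vector>

#include "astcenc.h"

// Hypothetical helper: abandon an in-flight compression and prepare the
// context for reuse. Assumes the workers are running astcenc_compress_image()
// on the shared context.
void cancel_and_reset(astcenc_context* context, std::vector<std::thread>& workers)
{
	// Stop the codec handing out new work; data already written is undefined
	astcenc_compress_cancel(context);

	// The caller still owns thread synchronization, exactly as if the
	// compression had completed normally
	for (std::thread& worker : workers)
	{
		worker.join();
	}

	// A reset is required before starting another compression
	astcenc_compress_reset(context);
}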

+ 51 - 51
thirdparty/astcenc/astcenc_averages_and_directions.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -50,7 +50,7 @@ static void compute_partition_averages_rgb(
 	vfloat4 averages[BLOCK_MAX_PARTITIONS]
 ) {
 	unsigned int partition_count = pi.partition_count;
-	unsigned int texel_count = blk.texel_count;
+	size_t texel_count = blk.texel_count;
 	promise(texel_count > 0);
 
 	// For 1 partition just use the precomputed mean
@@ -64,11 +64,11 @@ static void compute_partition_averages_rgb(
 		vfloatacc pp_avg_rgb[3] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -100,11 +100,11 @@ static void compute_partition_averages_rgb(
 		vfloatacc pp_avg_rgb[2][3] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -145,11 +145,11 @@ static void compute_partition_averages_rgb(
 		vfloatacc pp_avg_rgb[3][3] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -221,7 +221,7 @@ static void compute_partition_averages_rgba(
 	vfloat4 averages[BLOCK_MAX_PARTITIONS]
 ) {
 	unsigned int partition_count = pi.partition_count;
-	unsigned int texel_count = blk.texel_count;
+	size_t texel_count = blk.texel_count;
 	promise(texel_count > 0);
 
 	// For 1 partition just use the precomputed mean
@@ -235,11 +235,11 @@ static void compute_partition_averages_rgba(
 		vfloat4 pp_avg_rgba[4] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -275,11 +275,11 @@ static void compute_partition_averages_rgba(
 		vfloat4 pp_avg_rgba[2][4] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -326,11 +326,11 @@ static void compute_partition_averages_rgba(
 		vfloat4 pp_avg_rgba[3][4] {};
 
 		vint lane_id = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -390,17 +390,17 @@ void compute_avgs_and_dirs_4_comp(
 	const image_block& blk,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	// Pre-compute partition_averages
 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
 	compute_partition_averages_rgba(pi, blk, partition_averages);
 
-	for (int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		vfloat4 average = partition_averages[partition];
@@ -411,7 +411,7 @@ void compute_avgs_and_dirs_4_comp(
 		vfloat4 sum_zp = vfloat4::zero();
 		vfloat4 sum_wp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 			vfloat4 texel_datum = blk.texel(iwt);
@@ -509,13 +509,13 @@ void compute_avgs_and_dirs_3_comp(
 		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
 	}
 
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		vfloat4 average = partition_averages[partition];
@@ -525,7 +525,7 @@ void compute_avgs_and_dirs_3_comp(
 		vfloat4 sum_yp = vfloat4::zero();
 		vfloat4 sum_zp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 
@@ -570,17 +570,17 @@ void compute_avgs_and_dirs_3_comp_rgb(
 	const image_block& blk,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	// Pre-compute partition_averages
 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
 	compute_partition_averages_rgb(pi, blk, partition_averages);
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		vfloat4 average = partition_averages[partition];
@@ -590,7 +590,7 @@ void compute_avgs_and_dirs_3_comp_rgb(
 		vfloat4 sum_yp = vfloat4::zero();
 		vfloat4 sum_zp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 
@@ -664,20 +664,20 @@ void compute_avgs_and_dirs_2_comp(
 		data_vg = blk.data_b;
 	}
 
-	unsigned int partition_count = pt.partition_count;
+	size_t partition_count = pt.partition_count;
 	promise(partition_count > 0);
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
-		unsigned int texel_count = pt.partition_texel_count[partition];
+		size_t texel_count = pt.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		// Only compute a partition mean if more than one partition
 		if (partition_count > 1)
 		{
 			average = vfloat4::zero();
-			for (unsigned int i = 0; i < texel_count; i++)
+			for (size_t i = 0; i < texel_count; i++)
 			{
 				unsigned int iwt = texel_indexes[i];
 				average += vfloat2(data_vr[iwt], data_vg[iwt]);
@@ -691,7 +691,7 @@ void compute_avgs_and_dirs_2_comp(
 		vfloat4 sum_xp = vfloat4::zero();
 		vfloat4 sum_yp = vfloat4::zero();
 
-		for (unsigned int i = 0; i < texel_count; i++)
+		for (size_t i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
 			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
@@ -729,20 +729,20 @@ void compute_error_squared_rgba(
 	float& uncor_error,
 	float& samec_error
 ) {
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	vfloatacc uncor_errorsumv = vfloatacc::zero();
 	vfloatacc samec_errorsumv = vfloatacc::zero();
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
 
 		processed_line4 l_uncor = uncor_plines[partition];
 		processed_line4 l_samec = samec_plines[partition];
 
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		// Vectorize some useful scalar inputs
@@ -775,15 +775,15 @@ void compute_error_squared_rgba(
 		// array to extend the last value. This means min/max are not impacted, but we need to mask
 		// out the dummy values when we compute the line weighting.
 		vint lane_ids = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
-			vmask mask = lane_ids < vint(texel_count);
-			vint texel_idxs(texel_indexes + i);
+			vmask mask = lane_ids < vint_from_size(texel_count);
+			const uint8_t* texel_idxs = texel_indexes + i;
 
-			vfloat data_r = gatherf(blk.data_r, texel_idxs);
-			vfloat data_g = gatherf(blk.data_g, texel_idxs);
-			vfloat data_b = gatherf(blk.data_b, texel_idxs);
-			vfloat data_a = gatherf(blk.data_a, texel_idxs);
+			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
+			vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
 
 			vfloat uncor_param = (data_r * l_uncor_bs0)
 			                   + (data_g * l_uncor_bs1)
@@ -847,17 +847,17 @@ void compute_error_squared_rgb(
 	float& uncor_error,
 	float& samec_error
 ) {
-	unsigned int partition_count = pi.partition_count;
+	size_t partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	vfloatacc uncor_errorsumv = vfloatacc::zero();
 	vfloatacc samec_errorsumv = vfloatacc::zero();
 
-	for (unsigned int partition = 0; partition < partition_count; partition++)
+	for (size_t partition = 0; partition < partition_count; partition++)
 	{
 		partition_lines3& pl = plines[partition];
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-		unsigned int texel_count = pi.partition_texel_count[partition];
+		size_t texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
 		processed_line3 l_uncor = pl.uncor_pline;
@@ -889,14 +889,14 @@ void compute_error_squared_rgb(
 		// to extend the last value. This means min/max are not impacted, but we need to mask
 		// out the dummy values when we compute the line weighting.
 		vint lane_ids = vint::lane_id();
-		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
-			vmask mask = lane_ids < vint(texel_count);
-			vint texel_idxs(texel_indexes + i);
+			vmask mask = lane_ids < vint_from_size(texel_count);
+			const uint8_t* texel_idxs = texel_indexes + i;
 
-			vfloat data_r = gatherf(blk.data_r, texel_idxs);
-			vfloat data_g = gatherf(blk.data_g, texel_idxs);
-			vfloat data_b = gatherf(blk.data_b, texel_idxs);
+			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
 
 			vfloat uncor_param = (data_r * l_uncor_bs0)
 			                   + (data_g * l_uncor_bs1)

+ 13 - 13
thirdparty/astcenc/astcenc_block_sizes.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -384,12 +384,12 @@ static void init_decimation_info_2d(
 	}
 
 	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
-	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
-	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+	size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+	for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
 	{
 		di.texel_weight_count[i] = 0;
 
-		for (unsigned int j = 0; j < 4; j++)
+		for (size_t j = 0; j < 4; j++)
 		{
 			di.texel_weight_contribs_float_tr[j][i] = 0;
 			di.texel_weights_tr[j][i] = 0;
@@ -402,12 +402,12 @@ static void init_decimation_info_2d(
 	unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
 
-	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
-	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+	size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+	for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
 	{
 		di.weight_texel_count[i] = 0;
 
-		for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
+		for (size_t j = 0; j < max_texel_count_of_weight; j++)
 		{
 			di.weight_texels_tr[j][i] = last_texel;
 			di.weights_texel_contribs_tr[j][i] = 0.0f;
@@ -640,12 +640,12 @@ static void init_decimation_info_3d(
 	}
 
 	// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
-	unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
-	for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
+	size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
+	for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
 	{
 		di.texel_weight_count[i] = 0;
 
-		for (unsigned int j = 0; j < 4; j++)
+		for (size_t j = 0; j < 4; j++)
 		{
 			di.texel_weight_contribs_float_tr[j][i] = 0;
 			di.texel_weights_tr[j][i] = 0;
@@ -658,12 +658,12 @@ static void init_decimation_info_3d(
 	int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
 	uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
 
-	unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
-	for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
+	size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
+	for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
 	{
 		di.weight_texel_count[i] = 0;
 
-		for (int j = 0; j < max_texel_count_of_weight; j++)
+		for (size_t j = 0; j < max_texel_count_of_weight; j++)
 		{
 			di.weight_texels_tr[j][i] = last_texel;
 			di.weights_texel_contribs_tr[j][i] = 0.0f;

+ 2 - 9
thirdparty/astcenc/astcenc_color_unquantize.cpp

@@ -925,15 +925,8 @@ void unpack_color_endpoints(
 			alpha_hdr = false;
 		}
 
-		vmask4 mask(true, true, true, false);
-
-		vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
-		vint4 output0a = output0 * 257;
-		output0 = select(output0a, output0rgb, mask);
-
-		vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
-		vint4 output1a = output1 * 257;
-		output1 = select(output1a, output1rgb, mask);
+		output0 = lsl<8>(output0) | vint4(0x80);
+		output1 = lsl<8>(output1) | vint4(0x80);
 	}
 	// An HDR profile decode, but may be using linear LDR endpoints
 	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication

+ 2 - 2
thirdparty/astcenc/astcenc_compress_symbolic.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1280,7 +1280,7 @@ void compress_block(
 		1.0f
 	};
 
-	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
+	const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
 
 	// Only enable MODE0 fast path if enabled
 	// Never enable for 3D blocks as no "always" block modes are available

+ 11 - 20
thirdparty/astcenc/astcenc_decompress_symbolic.cpp

@@ -98,19 +98,14 @@ void unpack_weights(
 	if (!is_dual_plane)
 	{
 		// Build full 64-entry weight lookup table
-		vint4 tab0 = vint4::load(scb.weights +  0);
-		vint4 tab1 = vint4::load(scb.weights + 16);
-		vint4 tab2 = vint4::load(scb.weights + 32);
-		vint4 tab3 = vint4::load(scb.weights + 48);
-
-		vint tab0p, tab1p, tab2p, tab3p;
-		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
+		vtable_64x8 table;
+		vtable_prepare(table, scb.weights);
 
 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
 			vint summed_value(8);
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
@@ -118,7 +113,7 @@ void unpack_weights(
 				vint texel_weights(di.texel_weights_tr[j] + i);
 				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
 
-				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
+				summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
 			}
 
 			store(lsr<4>(summed_value), weights_plane1 + i);
@@ -128,16 +123,12 @@ void unpack_weights(
 	{
 		// Build a 32-entry weight lookup table per plane
 		// Plane 1
-		vint4 tab0_plane1 = vint4::load(scb.weights +  0);
-		vint4 tab1_plane1 = vint4::load(scb.weights + 16);
-		vint tab0_plane1p, tab1_plane1p;
-		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
+		vtable_32x8 tab_plane1;
+		vtable_prepare(tab_plane1, scb.weights);
 
 		// Plane 2
-		vint4 tab0_plane2 = vint4::load(scb.weights + 32);
-		vint4 tab1_plane2 = vint4::load(scb.weights + 48);
-		vint tab0_plane2p, tab1_plane2p;
-		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
+		vtable_32x8 tab_plane2;
+		vtable_prepare(tab_plane2, scb.weights + 32);
 
 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
@@ -145,7 +136,7 @@ void unpack_weights(
 			vint sum_plane2(8);
 
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
@@ -153,8 +144,8 @@ void unpack_weights(
 				vint texel_weights(di.texel_weights_tr[j] + i);
 				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
 
-				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
-				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
+				sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
+				sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
 			}
 
 			store(lsr<4>(sum_plane1), weights_plane1 + i);

+ 24 - 1
thirdparty/astcenc/astcenc_entry.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1123,6 +1123,29 @@ astcenc_error astcenc_compress_reset(
 #endif
 }
 
+/* See header for documentation. */
+astcenc_error astcenc_compress_cancel(
+	astcenc_context* ctxo
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
+	{
+		return ASTCENC_ERR_BAD_CONTEXT;
+	}
+
+	// Cancel compression before cancelling avg. This avoids the race condition
+	// where cancelling them in the other order could see a compression worker
+	// starting to process even though some of the avg data is undefined.
+	ctxo->manage_compress.cancel();
+	ctxo->manage_avg.cancel();
+	return ASTCENC_SUCCESS;
+#endif
+}
+
 /* See header for documentation. */
 astcenc_error astcenc_decompress_image(
 	astcenc_context* ctxo,

+ 4 - 4
thirdparty/astcenc/astcenc_find_best_partitioning.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -226,7 +226,7 @@ static void kmeans_update(
 
 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
 
-	// Find the center-of-gravity in each cluster
+	// Find the center of gravity in each cluster
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
 		uint8_t partition = partition_of_texel[i];
@@ -425,8 +425,8 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
 	}
 
 	// Create a running sum from the histogram array
-	// Cells store previous values only; i.e. exclude self after sum
-	unsigned int sum = 0;
+	// Indices store previous values only; i.e. exclude self after sum
+	uint16_t sum = 0;
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
 		uint16_t cnt = mscount[i];

+ 40 - 51
thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp

@@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
 	unsigned int index
 ) {
 	// Load the bilinear filter texel weight indexes in the decimated grid
-	vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-	vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
-	vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
-	vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
+	const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+	const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
+	const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
+	const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
 
 	// Load the bilinear filter weights from the decimated grid
-	vfloat weight_val0 = gatherf(weights, weight_idx0);
-	vfloat weight_val1 = gatherf(weights, weight_idx1);
-	vfloat weight_val2 = gatherf(weights, weight_idx2);
-	vfloat weight_val3 = gatherf(weights, weight_idx3);
+	vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+	vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
+	vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
+	vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
 
 	// Load the weight contribution factors for each decimated weight
 	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
 	unsigned int index
 ) {
 	// Load the bilinear filter texel weight indexes in the decimated grid
-	vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-	vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+	const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+	const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
 
 	// Load the bilinear filter weights from the decimated grid
-	vfloat weight_val0 = gatherf(weights, weight_idx0);
-	vfloat weight_val1 = gatherf(weights, weight_idx1);
+	vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+	vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
 
 	// Load the weight contribution factors for each decimated weight
 	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -195,8 +195,8 @@ static void compute_ideal_colors_and_weights_1_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -333,8 +333,8 @@ static void compute_ideal_colors_and_weights_2_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -500,8 +500,8 @@ static void compute_ideal_colors_and_weights_3_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -598,8 +598,8 @@ static void compute_ideal_colors_and_weights_4_comp(
 	}
 
 	// Zero initialize any SIMD over-fetch
-	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
-	for (unsigned int i = texel_count; i < texel_count_simd; i++)
+	size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
+	for (size_t i = texel_count; i < texel_count_simd; i++)
 	{
 		ei.weights[i] = 0.0f;
 		ei.weight_error_scale[i] = 0.0f;
@@ -853,12 +853,6 @@ void compute_ideal_weights_for_decimation(
 	promise(texel_count > 0);
 	promise(weight_count > 0);
 
-	// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
-	// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
-	// arrays always contain space for 64 elements
-	unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
-	storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
-
 	// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
 	// zero-initialized SIMD over-fetch region
 	if (is_direct)
@@ -873,7 +867,6 @@ void compute_ideal_weights_for_decimation(
 	}
 
 	// Otherwise compute an estimate and perform single refinement iteration
-	ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
 
 	// Compute an initial average for each decimated weight
 	bool constant_wes = ei.is_constant_weight_error_scale;
@@ -889,23 +882,23 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
 		{
-			vint texel(di.weight_texels_tr[j] + i);
+			const uint8_t* texel = di.weight_texels_tr[j] + i;
 			vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
 
 			if (!constant_wes)
 			{
-				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+				weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 			}
 
 			vfloat contrib_weight = weight * weight_error_scale;
 
 			weight_weight += contrib_weight;
-			initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+			initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
 		}
 
 		storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
@@ -914,6 +907,7 @@ void compute_ideal_weights_for_decimation(
 	// Populate the interpolated weight grid based on the initial average
 	// Process SIMD-width texel coordinates at at time while we can. Safe to
 	// over-process full SIMD vectors - the tail is zeroed.
+	ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
 	if (di.max_texel_weight_count <= 2)
 	{
 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -947,22 +941,22 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
 		{
-			vint texel(di.weight_texels_tr[j] + i);
+			const uint8_t* texel = di.weight_texels_tr[j] + i;
 			vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
 
 			if (!constant_wes)
 			{
- 				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+				weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
 			}
 
 			vfloat scale = weight_error_scale * contrib_weight;
-			vfloat old_weight = gatherf(infilled_weights, texel);
-			vfloat ideal_weight = gatherf(ei.weights, texel);
+			vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
+			vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
 
 			error_change0 += contrib_weight * scale;
 			error_change1 += (old_weight - ideal_weight) * scale;
@@ -1023,9 +1017,8 @@ void compute_quantized_weights_for_decimation(
 	// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
 	if (get_quant_level(quant_level) <= 16)
 	{
-		vint4 tab0 = vint4::load(qat.quant_to_unquant);
-		vint tab0p;
-		vtable_prepare(tab0, tab0p);
+		vtable_16x8 table;
+		vtable_prepare(table, qat.quant_to_unquant);
 
 		for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 		{
@@ -1038,8 +1031,8 @@ void compute_quantized_weights_for_decimation(
 			vint weightl = float_to_int(ix1);
 			vint weighth = min(weightl + vint(1), steps_m1);
 
-			vint ixli = vtable_8bt_32bi(tab0p, weightl);
-			vint ixhi = vtable_8bt_32bi(tab0p, weighth);
+			vint ixli = vtable_lookup_32bit(table, weightl);
+			vint ixhi = vtable_lookup_32bit(table, weighth);
 
 			vfloat ixl = int_to_float(ixli);
 			vfloat ixh = int_to_float(ixhi);
@@ -1050,16 +1043,13 @@ void compute_quantized_weights_for_decimation(
 
 			// Invert the weight-scaling that was done initially
 			storea(ixl * rscalev + low_boundv, weight_set_out + i);
-			vint scn = pack_low_bytes(weight);
-			store_nbytes(scn, quantized_weight_set + i);
+			pack_and_store_low_bytes(weight, quantized_weight_set + i);
 		}
 	}
 	else
 	{
-		vint4 tab0 = vint4::load(qat.quant_to_unquant +  0);
-		vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
-		vint tab0p, tab1p;
-		vtable_prepare(tab0, tab1, tab0p, tab1p);
+		vtable_32x8 table;
+		vtable_prepare(table, qat.quant_to_unquant);
 
 		for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 		{
@@ -1072,8 +1062,8 @@ void compute_quantized_weights_for_decimation(
 			vint weightl = float_to_int(ix1);
 			vint weighth = min(weightl + vint(1), steps_m1);
 
-			vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
-			vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
+			vint ixli = vtable_lookup_32bit(table, weightl);
+			vint ixhi = vtable_lookup_32bit(table, weighth);
 
 			vfloat ixl = int_to_float(ixli);
 			vfloat ixh = int_to_float(ixhi);
@@ -1084,8 +1074,7 @@ void compute_quantized_weights_for_decimation(
 
 			// Invert the weight-scaling that was done initially
 			storea(ixl * rscalev + low_boundv, weight_set_out + i);
-			vint scn = pack_low_bytes(weight);
-			store_nbytes(scn, quantized_weight_set + i);
+			pack_and_store_low_bytes(weight, quantized_weight_set + i);
 		}
 	}
 }

+ 4 - 10
thirdparty/astcenc/astcenc_internal.h

@@ -1583,19 +1583,13 @@ static inline vmask4 get_u8_component_mask(
 	astcenc_profile decode_mode,
 	const image_block& blk
 ) {
-	vmask4 u8_mask(false);
-	// Decode mode writing to a unorm8 output value
-	if (blk.decode_unorm8)
+	// Decode mode or sRGB forces writing to unorm8 output value
+	if (blk.decode_unorm8 || decode_mode == ASTCENC_PRF_LDR_SRGB)
 	{
-		u8_mask = vmask4(true);
-	}
-	// SRGB writing to a unorm8 RGB value
-	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-	{
-		u8_mask = vmask4(true, true, true, false);
+		return vmask4(true);
 	}
 
-	return u8_mask;
+	return vmask4(false);
 }
 
 /**

+ 26 - 11
thirdparty/astcenc/astcenc_internal_entry.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -100,6 +100,9 @@ private:
 	/** @brief Lock used for critical section and condition synchronization. */
 	std::mutex m_lock;
 
+	/** @brief True if the current operation is cancelled. */
+	std::atomic<bool> m_is_cancelled;
+
 	/** @brief True if the stage init() step has been executed. */
 	bool m_init_done;
 
@@ -147,6 +150,7 @@ public:
 	{
 		m_init_done = false;
 		m_term_done = false;
+		m_is_cancelled = false;
 		m_start_count = 0;
 		m_done_count = 0;
 		m_task_count = 0;
@@ -155,6 +159,16 @@ public:
 		m_callback_min_diff = 1.0f;
 	}
 
+	/**
+	 * @brief Clear the tracker and stop new tasks being assigned.
+	 *
+	 * Note, all in-flight tasks in a worker will still complete normally.
+	 */
+	void cancel()
+	{
+		m_is_cancelled = true;
+	}
+
 	/**
 	 * @brief Trigger the pipeline stage init step.
 	 *
@@ -211,7 +225,7 @@ public:
 	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
 	{
 		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
-		if (base >= m_task_count)
+		if (m_is_cancelled || base >= m_task_count)
 		{
 			count = 0;
 			return 0;
@@ -241,16 +255,17 @@ public:
 			local_count = m_done_count;
 			local_last_value = m_callback_last_value;
 
-			if (m_done_count == m_task_count)
+			// Ensure the progress bar hits 100%
+			if (m_callback && m_done_count == m_task_count)
 			{
-				// Ensure the progress bar hits 100%
-				if (m_callback)
-				{
-					std::unique_lock<std::mutex> cblck(m_callback_lock);
-					m_callback(100.0f);
-					m_callback_last_value = 100.0f;
-				}
+				std::unique_lock<std::mutex> cblck(m_callback_lock);
+				m_callback(100.0f);
+				m_callback_last_value = 100.0f;
+			}
 
+			// Notify if nothing left to do
+			if (m_is_cancelled || m_done_count == m_task_count)
+			{
 				lck.unlock();
 				m_complete.notify_all();
 			}
@@ -285,7 +300,7 @@ public:
 	void wait()
 	{
 		std::unique_lock<std::mutex> lck(m_lock);
-		m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
+		m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
 	}
 
 	/**
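
The cancellation support above is a small extension of the existing task tracker: an atomic flag that get_task_assignment() checks before handing out work, and that wait() treats as equivalent to completion. A stripped-down sketch of that pattern (not the real ParallelManager, which also handles init/term phases and progress callbacks):

#include <atomic>
#include <condition_variable>
#include <mutex>

// Stripped-down sketch of a cancel-aware task tracker
class task_tracker
{
public:
	explicit task_tracker(unsigned int task_count)
		: m_task_count(task_count) { }

	// Stop assigning new tasks; in-flight tasks still complete normally,
	// and their completion is what releases any thread blocked in wait()
	void cancel()
	{
		m_is_cancelled = true;
	}

	// Hand out up to granule tasks; returns a zero count once cancelled or drained
	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
	{
		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
		if (m_is_cancelled || base >= m_task_count)
		{
			count = 0;
			return 0;
		}

		count = granule < (m_task_count - base) ? granule : (m_task_count - base);
		return base;
	}

	// Mark tasks as finished, waking waiters when cancelled or all done
	void complete_tasks(unsigned int count)
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_done_count += count;
		if (m_is_cancelled || m_done_count == m_task_count)
		{
			lck.unlock();
			m_complete.notify_all();
		}
	}

	// Block until every task completed or the operation was cancelled
	void wait()
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
	}

private:
	std::atomic<bool> m_is_cancelled { false };
	std::atomic<unsigned int> m_start_count { 0 };
	unsigned int m_done_count { 0 };
	unsigned int m_task_count;
	std::mutex m_lock;
	std::condition_variable m_complete;
};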

+ 22 - 5
thirdparty/astcenc/astcenc_mathlib.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -48,7 +48,7 @@
     #define ASTCENC_SSE 42
   #elif defined(__SSE4_1__)
     #define ASTCENC_SSE 41
-  #elif defined(__SSE2__)
+  #elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
     #define ASTCENC_SSE 20
   #else
     #define ASTCENC_SSE 0
@@ -58,25 +58,42 @@
 #ifndef ASTCENC_AVX
   #if defined(__AVX2__)
     #define ASTCENC_AVX 2
+    #define ASTCENC_X86_GATHERS 1
   #elif defined(__AVX__)
     #define ASTCENC_AVX 1
+    #define ASTCENC_X86_GATHERS 1
   #else
     #define ASTCENC_AVX 0
   #endif
 #endif
 
 #ifndef ASTCENC_NEON
-  #if defined(__aarch64__)
+  #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     #define ASTCENC_NEON 1
   #else
     #define ASTCENC_NEON 0
   #endif
 #endif
 
+#ifndef ASTCENC_SVE
+  #if defined(__ARM_FEATURE_SVE)
+    #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+      #define ASTCENC_SVE 8
+    // Auto-detected SVE can only assume vector width of 4 is available, but
+    // must also allow for hardware being longer and so all use of intrinsics
+    // must explicitly use predicate masks to limit to 4-wide.
+    #else
+      #define ASTCENC_SVE 4
+    #endif
+    #else
+    #define ASTCENC_SVE 0
+  #endif
+#endif
+
 // Force vector-sized SIMD alignment
-#if ASTCENC_AVX
+#if ASTCENC_AVX || ASTCENC_SVE == 8
   #define ASTCENC_VECALIGN 32
-#elif ASTCENC_SSE || ASTCENC_NEON
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
   #define ASTCENC_VECALIGN 16
 // Use default alignment for non-SIMD builds
 #else
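
To confirm which backend a particular build actually selected (for example, to verify the MSVC fix), the detection macros above can be printed from any translation unit that already includes astcenc_mathlib.h. The diagnostic itself is an illustrative throwaway, not part of the codec:

#include <cstdio>

// Assumes this is compiled with the same flags as the codec and that
// astcenc_mathlib.h (which defines these macros) has already been included.
static void print_selected_backend()
{
	std::printf("ASTCENC_SSE      = %d\n", ASTCENC_SSE);
	std::printf("ASTCENC_AVX      = %d\n", ASTCENC_AVX);
	std::printf("ASTCENC_NEON     = %d\n", ASTCENC_NEON);
	std::printf("ASTCENC_SVE      = %d\n", ASTCENC_SVE);
	std::printf("ASTCENC_VECALIGN = %d\n", ASTCENC_VECALIGN);
}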

+ 3 - 3
thirdparty/astcenc/astcenc_partition_tables.cpp

@@ -304,9 +304,9 @@ static bool generate_one_partition_info_entry(
 	// Fill loop tail so we can overfetch later
 	for (unsigned int i = 0; i < partition_count; i++)
 	{
-		int ptex_count = counts[i];
-		int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
-		for (int j = ptex_count; j < ptex_count_simd; j++)
+		size_t ptex_count = counts[i];
+		size_t ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
+		for (size_t j = ptex_count; j < ptex_count_simd; j++)
 		{
 			pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
 		}

+ 16 - 13
thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
 	vint lane_ids = vint::lane_id();
 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		vint tix(texel_indexes + i);
+		const uint8_t* tix = texel_indexes + i;
 
 		vmask mask = lane_ids < vint(texel_count);
 		lane_ids += vint(ASTCENC_SIMD_WIDTH);
 
 		// Compute the error that arises from just ditching alpha
-		vfloat data_a = gatherf(blk.data_a, tix);
+		vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
 		vfloat alpha_diff = data_a - default_a;
 		alpha_diff = alpha_diff * alpha_diff;
 
 		haccumulate(a_drop_errv, alpha_diff, mask);
 
-		vfloat data_r = gatherf(blk.data_r, tix);
-		vfloat data_g = gatherf(blk.data_g, tix);
-		vfloat data_b = gatherf(blk.data_b, tix);
+		vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
+		vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
+		vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);
 
 		// Compute uncorrelated error
 		vfloat param = data_r * uncor_bs0
@@ -1135,13 +1135,13 @@ unsigned int compute_ideal_endpoint_formats(
 	vfloat clear_error(ERROR_CALC_DEFAULT);
 	vint clear_quant(0);
 
-	unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
+	size_t packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
 	storea(clear_error, errors_of_best_combination + packed_start_block_mode);
 	store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
 	store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
 
 	// Ensure that last iteration overstep contains data that will never be picked
-	unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
+	size_t packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
 	storea(clear_error, errors_of_best_combination + packed_end_block_mode);
 	store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
 	store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
@@ -1292,9 +1292,12 @@ unsigned int compute_ideal_endpoint_formats(
 		vint vbest_error_index(-1);
 		vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
 
-		start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
-		vint lane_ids = vint::lane_id() + vint(start_block_mode);
-		for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
+		// TODO: This should use size_t for the inputs of start/end_block_mode
+		// to avoid some of this type conversion, but that propagates and will
+		// need a bigger PR to fix
+		size_t start_mode = round_down_to_simd_multiple_vla(start_block_mode);
+		vint lane_ids = vint::lane_id() + vint_from_size(start_mode);
+		for (size_t j = start_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
 		{
 			vfloat err = vfloat(errors_of_best_combination + j);
 			vmask mask = err < vbest_ep_error;
@@ -1306,8 +1309,8 @@ unsigned int compute_ideal_endpoint_formats(
 		// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
 		vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
 		vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
-		vbest_error_index = hmin(vbest_error_index);
-		int best_error_index = vbest_error_index.lane<0>();
+
+		int best_error_index = hmin_s(vbest_error_index);
 
 		best_error_weights[i] = best_error_index;
 

+ 78 - 40
thirdparty/astcenc/astcenc_vecmathlib.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2025 Arm Limited
 // Copyright 2008 Jose Fonseca
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
@@ -42,11 +42,12 @@
  *
  * With the current implementation ISA support is provided for:
  *
- *     * 1-wide for scalar reference.
- *     * 4-wide for Armv8-A NEON.
- *     * 4-wide for x86-64 SSE2.
- *     * 4-wide for x86-64 SSE4.1.
- *     * 8-wide for x86-64 AVX2.
+ *     * 1-wide for scalar reference
+ *     * 4-wide for Armv8-A NEON
+ *     * 4-wide for x86-64 SSE2
+ *     * 4-wide for x86-64 SSE4.1
+ *     * 8-wide for Armv8-A SVE
+ *     * 8-wide for x86-64 AVX2
  */
 
 #ifndef ASTC_VECMATHLIB_H_INCLUDED
@@ -54,7 +55,14 @@
 
 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 	#include <immintrin.h>
-#elif ASTCENC_NEON != 0
+#endif
+
+#if ASTCENC_SVE != 0
+	#include <arm_sve.h>
+	#include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
 	#include <arm_neon.h>
 #endif
 
@@ -69,8 +77,10 @@
 	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
 #endif
 
+template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
+
 #if ASTCENC_AVX >= 2
-	/* If we have AVX2 expose 8-wide VLA. */
+	// If we have AVX2 expose 8-wide VLA.
 	#include "astcenc_vecmathlib_sse_4.h"
 	#include "astcenc_vecmathlib_common_4.h"
 	#include "astcenc_vecmathlib_avx2_8.h"
@@ -88,11 +98,16 @@
 	using vint = vint8;
 	using vmask = vmask8;
 
+	using vtable_16x8 = vtable8_16x8;
+	using vtable_32x8 = vtable8_32x8;
+	using vtable_64x8 = vtable8_64x8;
+
 	constexpr auto loada = vfloat8::loada;
 	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
 
 #elif ASTCENC_SSE >= 20
-	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
+	// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
 	#include "astcenc_vecmathlib_sse_4.h"
 	#include "astcenc_vecmathlib_common_4.h"
 
@@ -103,11 +118,48 @@
 	using vint = vint4;
 	using vmask = vmask4;
 
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
+
+#elif ASTCENC_SVE == 8
+	// Check the compiler is configured with fixed-length 256-bit SVE.
+	#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
+		#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
+	#endif
+
+	// If we have SVE configured as 8-wide, expose 8-wide VLA.
+	#include "astcenc_vecmathlib_neon_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+	#include "astcenc_vecmathlib_sve_8.h"
+
+	#define ASTCENC_SIMD_WIDTH 8
+
+	using vfloat = vfloat8;
+
+	#if defined(ASTCENC_NO_INVARIANCE)
+		using vfloatacc = vfloat8;
+	#else
+		using vfloatacc = vfloat4;
+	#endif
+
+	using vint = vint8;
+	using vmask = vmask8;
+
+	using vtable_16x8 = vtable8_16x8;
+	using vtable_32x8 = vtable8_32x8;
+	using vtable_64x8 = vtable8_64x8;
+
+	constexpr auto loada = vfloat8::loada;
+	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
 
 #elif ASTCENC_NEON > 0
-	/* If we have NEON expose 4-wide VLA. */
+	// If we have NEON expose 4-wide VLA.
 	#include "astcenc_vecmathlib_neon_4.h"
 	#include "astcenc_vecmathlib_common_4.h"
 
@@ -118,8 +170,13 @@
 	using vint = vint4;
 	using vmask = vmask4;
 
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 
 #else
 	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
@@ -150,34 +207,15 @@
 	using vint = vint4;
 	using vmask = vmask4;
 
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 #endif
 
-/**
- * @brief Round a count down to the largest multiple of 8.
- *
- * @param count   The unrounded value.
- *
- * @return The rounded value.
- */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
-{
-	return count & static_cast<unsigned int>(~(8 - 1));
-}
-
-/**
- * @brief Round a count down to the largest multiple of 4.
- *
- * @param count   The unrounded value.
- *
- * @return The rounded value.
- */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
-{
-	return count & static_cast<unsigned int>(~(4 - 1));
-}
-
 /**
  * @brief Round a count down to the largest multiple of the SIMD width.
  *
@@ -187,9 +225,9 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int coun
  *
  * @return The rounded value.
  */
-ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
+ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
 {
-	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
+	return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
 }
 
 /**
@@ -201,9 +239,9 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int co
  *
  * @return The rounded value.
  */
-ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
+ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
 {
-	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
+	size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
 	return multiples * ASTCENC_SIMD_WIDTH;
 }
 
@@ -239,8 +277,8 @@ ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
 ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
 {
 	vfloat z = atan(abs(y / x));
-	vmask xmask = vmask(float_as_int(x).m);
-	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+	vmask xmask = x < vfloat::zero();
+	return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
 }
 
 /*

+ 163 - 157
thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2024 Arm Limited
+// Copyright 2019-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -54,7 +54,7 @@ struct vfloat8
 	ASTCENC_SIMD_INLINE vfloat8() = default;
 
 	/**
-	 * @brief Construct from 4 values loaded from an unaligned address.
+	 * @brief Construct from 8 values loaded from an unaligned address.
 	 *
 	 * Consider using loada() which is better with vectors if data is aligned
 	 * to vector length.
@@ -74,18 +74,6 @@ struct vfloat8
 		m = _mm256_set1_ps(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vfloat8(
-		float a, float b, float c, float d,
-		float e, float f, float g, float h)
-	{
-		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -94,20 +82,6 @@ struct vfloat8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar value of a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256_f32[l];
-	#else
-		union { __m256 m; float f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -132,14 +106,6 @@ struct vfloat8
 		return vfloat8(_mm256_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
-	{
-		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief The vector ...
 	 */
@@ -183,25 +149,13 @@ struct vint8
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint8(int a)
 	{
 		m = _mm256_set1_epi32(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vint8(
-		int a, int b, int c, int d,
-		int e, int f, int g, int h)
-	{
-		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -210,20 +164,6 @@ struct vint8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar from a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE int lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256i_i32[l];
-	#else
-		union { __m256i m; int f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -518,31 +458,54 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
  */
 ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 {
-	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+	// Build min within groups of 2, then 4, then 8
+	__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+	m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+	m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
 
-	__m256i r = astcenc_mm256_set_m128i(m, m);
-	vint8 vmin(r);
+	vint8 vmin(m);
 	return vmin;
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmin(a).m);
+}
+
 /**
  * @brief Return the horizontal maximum of a vector.
  */
 ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 {
-	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
-	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
-	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
-	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+	// Build max within groups of 2, then 4, then 8
+	__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
+	m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
+	m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));
 
-	__m256i r = astcenc_mm256_set_m128i(m, m);
-	vint8 vmax(r);
+	vint8 vmax(m);
 	return vmax;
 }
 
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmax(a).m);
+}
+
+/**
+ * @brief Generate a vint8 from a size_t.
+ */
+ ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+ {
+	assert(a <= std::numeric_limits<int>::max());
+	return vint8(static_cast<int>(a));
+ }
+
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */
@@ -570,18 +533,10 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
 	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
-{
-	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
 {
 	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0,  0,  0,  0,  0,
 	                               0, 0, 0, 0, 28, 24, 20, 16,
@@ -593,7 +548,8 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
 	__m128i b = _mm_unpacklo_epi32(a0, a1);
 
 	__m256i r = astcenc_mm256_set_m128i(b, b);
-	return vint8(r);
+
+	store_nbytes(vint8(r), p);
 }
 
 /**
@@ -606,7 +562,7 @@ ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
 }
 
 // ============================================================================
-// vfloat4 operators and functions
+// vfloat8 operators and functions
 // ============================================================================
 
 /**
@@ -674,7 +630,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
 	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
 }
 
-
 /**
  * @brief Overload: scalar by vector division.
  */
@@ -683,7 +638,6 @@ ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
 	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
 }
 
-
 /**
  * @brief Overload: vector by vector equality.
  */
@@ -786,19 +740,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
 	return a;
 }
 
-/**
- * @brief Return a clamped value between 0.0f and max.
- *
- * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
-{
-	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
-	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
-	return a;
-}
-
 /**
  * @brief Return a clamped value between 0.0f and 1.0f.
  *
@@ -857,7 +798,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
 {
-	return hmin(a).lane<0>();
+	return _mm256_cvtss_f32(hmin(a).m);
 }
 
 /**
@@ -887,7 +828,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
 {
-	return hmax(a).lane<0>();
+	return _mm256_cvtss_f32(hmax(a).m);
 }
 
 /**
@@ -909,14 +850,6 @@ ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
 	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
-{
-	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
-}
-
 /**
  * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
  *
@@ -979,6 +912,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
 	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_X86_GATHERS == 0
+	// Perform manual gather using scalar loads in two separate dependency chains,
+	// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
+	// into a bunch of memory-operand inserts on 128-bit halves then merges late,
+	// which performs significantly worse in tests.
+	__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
+	__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
+	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
+	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
+	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
+	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
+	m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
+	m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
+
+	return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
+#else
+	vint8 inds(indices);
+	return gatherf(base, inds);
+#endif
+}
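The dependency-chain comment above is the heart of this path: the eight scalar broadcasts feed two independent blend chains (m0 and m1) that only meet in the final blend, roughly halving the serial latency compared with one long chain. The same idea reduced to a scalar sum, shown only as an illustration:

#include <cstdint>

// Illustrative only: two independent accumulators merged late shorten the
// critical path in the same way the m0/m1 blend chains above do.
static inline float gather_sum_two_chains(const float* base, const uint8_t* idx)
{
	float acc0 = base[idx[0]] + base[idx[2]] + base[idx[4]] + base[idx[6]];
	float acc1 = base[idx[1]] + base[idx[3]] + base[idx[5]] + base[idx[7]];
	return acc0 + acc1; // merge the two chains at the end
}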
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1045,98 +1005,140 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
 	return vfloat8(_mm256_castsi256_ps(a.m));
 }
 
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable8_16x8 {
+	vint8 t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable8_32x8 {
+	vint8 t0;
+	vint8 t1;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable8_64x8 {
+	vint8 t0;
+	vint8 t1;
+	vint8 t2;
+	vint8 t3;
+};
+
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
-{
-	// AVX2 duplicates the table within each 128-bit lane
-	__m128i t0n = t0.m;
-	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_16x8& table,
+	const uint8_t* data
+) {
+	// AVX2 tables duplicate table entries in each 128-bit half-register
+	vint4 d0 = vint4::load(data);
+
+	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
-{
-	// AVX2 duplicates the table within each 128-bit lane
-	__m128i t0n = t0.m;
-	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_32x8& table,
+	const uint8_t* data
+) {
+	// AVX2 tables duplicate table entries in each 128-bit half-register
+	vint4 d0 = vint4::load(data);
+	vint4 d1 = vint4::load(data + 16);
 
-	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
+	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+
+	// XOR chain the high rows to allow table emulation
+	table.t1 = table.t1 ^ table.t0;
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
-{
-	// AVX2 duplicates the table within each 128-bit lane
-	__m128i t0n = t0.m;
-	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
-
-	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
-	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
+	vtable8_64x8& table,
+	const uint8_t* data
+) {
+	// AVX2 tables duplicate table entries in each 128-bit half-register
+	vint4 d0 = vint4::load(data);
+	vint4 d1 = vint4::load(data + 16);
+	vint4 d2 = vint4::load(data + 32);
+	vint4 d3 = vint4::load(data + 48);
 
-	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
-	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));
+	table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
+	table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
+	table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
+	table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));
 
-	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
-	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
+	// XOR chain the high rows to allow table emulation
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_16x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_32x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	return vint8(result);
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
-{
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_64x8& tbl,
+	vint8 idx
+) {
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
+	__m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
+	__m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t2.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
 
-	result2 = _mm256_shuffle_epi8(t3.m, idxx);
+	result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm256_xor_si256(result, result2);
 
 	return vint8(result);
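The XOR chaining applied in vtable_prepare() is what makes these stacked shuffles correct: _mm256_shuffle_epi8 uses only the low four bits of an in-range index byte and returns zero once the repeatedly decremented index byte has its MSB set, so the redundant reads from rows below the target cancel pairwise and only the intended row's byte survives. A scalar model of the 64-entry case, for illustration only:

#include <cstdint>

// Models _mm256_shuffle_epi8 on one 16-byte row: a negative (MSB-set) index
// byte yields zero, otherwise only the low four bits select a byte.
static inline uint8_t shuffle_row_model(const uint8_t row[16], int i)
{
	return (i < 0) ? 0 : row[i & 15];
}

// Scalar model of vtable_prepare() plus vtable_lookup_32bit() for a 64-entry
// table; the real code does this for 32 index bytes at a time.
static inline uint8_t vtable64_model(const uint8_t table[64], int idx)
{
	// Prepare: XOR-chain each row with the row below it
	uint8_t r[4][16];
	for (int i = 0; i < 16; i++)
	{
		r[0][i] = table[i];
		r[1][i] = table[16 + i] ^ table[i];
		r[2][i] = table[32 + i] ^ table[16 + i];
		r[3][i] = table[48 + i] ^ table[32 + i];
	}

	// Lookup: rows below the target read the same column and cancel through
	// the XOR chain; rows above it contribute zero once the index is negative.
	uint8_t result = shuffle_row_model(r[0], idx);
	result ^= shuffle_row_model(r[1], idx - 16);
	result ^= shuffle_row_model(r[2], idx - 32);
	result ^= shuffle_row_model(r[3], idx - 48);
	return result; // equals table[idx] for idx in [0, 63]
}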
@@ -1146,7 +1148,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
  * @brief Return a vector of interleaved RGBA data.
  *
  * Input vectors have the value stored in the bottom 8 bits of each lane,
- * with high  bits set to zero.
+ * with high bits set to zero.
  *
  * Output vector stores a single RGBA texel packed in each lane.
  */
@@ -1183,8 +1185,12 @@ ASTCENC_SIMD_INLINE void printx(vint8 a)
 {
 	alignas(32) int v[8];
 	storea(a, v);
+
+	unsigned int uv[8];
+	std::memcpy(uv, v, sizeof(int) * 8);
+
 	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
-	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+		uv[0], uv[1], uv[2], uv[3], uv[4], uv[5], uv[6], uv[7]);
 }
 
 /**

+ 32 - 34
thirdparty/astcenc/astcenc_vecmathlib_common_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2024 Arm Limited
+// Copyright 2020-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -31,26 +31,7 @@
 #endif
 
 #include <cstdio>
-
-// ============================================================================
-// vmask4 operators and functions
-// ============================================================================
-
-/**
- * @brief True if any lanes are enabled, false otherwise.
- */
-ASTCENC_SIMD_INLINE bool any(vmask4 a)
-{
-	return mask(a) != 0;
-}
-
-/**
- * @brief True if all lanes are enabled, false otherwise.
- */
-ASTCENC_SIMD_INLINE bool all(vmask4 a)
-{
-	return mask(a) == 0xF;
-}
+#include <limits>
 
 // ============================================================================
 // vint4 operators and functions
@@ -129,6 +110,31 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
 	return a.lane<0>() + a.lane<1>() + a.lane<2>();
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Generate a vint4 from a size_t.
+ */
+ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
+{
+	assert(a <= std::numeric_limits<int>::max());
+	return vint4(static_cast<int>(a));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+	return hmax(a).lane<0>();
+}
+
 // ============================================================================
 // vfloat4 operators and functions
 // ============================================================================
@@ -222,18 +228,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
 	return min(max(a, minv), maxv);
 }
 
-/**
- * @brief Return the clamped value between 0.0f and max.
- *
- * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
-{
-	// Do not reorder - second operand will return if either is NaN
-	return min(max(a, vfloat4::zero()), maxv);
-}
-
 /**
  * @brief Return the clamped value between 0.0f and 1.0f.
  *
@@ -396,8 +390,12 @@ ASTCENC_SIMD_INLINE void printx(vint4 a)
 {
 	ASTCENC_ALIGNAS int v[4];
 	storea(a, v);
+
+	unsigned int uv[4];
+	std::memcpy(uv, v, sizeof(int) * 4);
+
 	printf("v4_i32:\n  %08x %08x %08x %08x\n",
-	       v[0], v[1], v[2], v[3]);
+		uv[0], uv[1], uv[2], uv[3]);
 }
 
 /**

+ 120 - 98
thirdparty/astcenc/astcenc_vecmathlib_neon_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -115,7 +115,7 @@ struct vfloat4
 	 */
 	static ASTCENC_SIMD_INLINE vfloat4 zero()
 	{
-		return vfloat4(vdupq_n_f32(0.0f));
+		return vfloat4(0.0f);
 	}
 
 	/**
@@ -134,15 +134,6 @@ struct vfloat4
 		return vfloat4(vld1q_f32(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
-		return vfloat4(vld1q_f32(data));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -203,16 +194,21 @@ struct vint4
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
 	{
-		// Cast is safe - NEON loads are allowed to be unaligned
-		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
-		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#if ASTCENC_SVE == 0
+	// Cast is safe - NEON loads are allowed to be unaligned
+	uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
+	uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
+	m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#else
+	svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
+	m = svget_neonq(data);
+#endif
 	}
 
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -420,6 +416,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return vaddvq_u32(vshlq_u32(tmp, shift));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return vmaxvq_u32(a.m) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return vminvq_u32(a.m) != 0;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -570,15 +582,6 @@ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 	return vint4(vmaxvq_s32(a.m));
 }
 
-/**
- * @brief Return the horizontal sum of a vector.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
-	return vget_lane_s32(vpadd_s32(t, t), 0);
-}
-
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */
@@ -612,31 +615,17 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 }
 
 /**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	alignas(16) int vals[4];
-	vals[0] = base[idx[0]];
-	vals[1] = base[idx[1]];
-	vals[2] = base[idx[2]];
-	vals[3] = base[idx[3]];
-	return vint4(vals);
-}
-
-/**
- * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ * @brief Pack and store low 8 bits of each vector lane.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
 {
 	alignas(16) uint8_t shuf[16] {
 		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
 	};
 	uint8x16_t idx = vld1q_u8(shuf);
 	int8x16_t av = vreinterpretq_s8_s32(a.m);
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+	store_nbytes(a, data);
 }
 
 /**
@@ -814,21 +803,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
-	uint32x4_t mask = vcgeq_u32(cond.m, msb);
-	return vfloat4(vbslq_f32(mask, b.m, a.m));
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
+#if ASTCENC_SVE == 0
 	alignas(16) int idx[4];
 	storea(indices, idx);
 	alignas(16) float vals[4];
@@ -837,8 +817,32 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	vals[2] = base[idx[2]];
 	vals[3] = base[idx[3]];
 	return vfloat4(vals);
+#else
+	svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_SVE == 0
+	alignas(16) float vals[4];
+	vals[0] = base[indices[0]];
+	vals[1] = base[indices[1]];
+	vals[2] = base[indices[2]];
+	vals[3] = base[indices[3]];
+	return vfloat4(vals);
+#else
+	svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
+	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+	return vfloat4(svget_neonq_f32(data));
+#endif
+}
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -950,87 +954,105 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(vreinterpretq_f32_s32(v.m));
 }
 
-/**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+/*
+ * Table structure for a 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
-}
+struct vtable4_16x8 {
+	uint8x16_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	uint8x16x2_t t01;
+};
 
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	uint8x16x4_t t0123;
+};
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.t0 = vld1q_u8(data);
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.t01 = uint8x16x2_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16)
+	};
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	int8x16_t table {
-		vreinterpretq_s8_s32(t0.m)
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.t0123 = uint8x16x4_t {
+		vld1q_u8(data),
+		vld1q_u8(data + 16),
+		vld1q_u8(data + 32),
+		vld1q_u8(data + 48)
 	};
+}
 
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	int8x16x2_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m)
-	};
-
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	int8x16x4_t table {
-		vreinterpretq_s8_s32(t0.m),
-		vreinterpretq_s8_s32(t1.m),
-		vreinterpretq_s8_s32(t2.m),
-		vreinterpretq_s8_s32(t3.m)
-	};
-
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 	// Set index byte above max index for unused bytes so table lookup returns zero
 	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
 	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
 
-	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
+	return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
 }
 
 /**

+ 111 - 102
thirdparty/astcenc/astcenc_vecmathlib_none_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2024 Arm Limited
+// Copyright 2019-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -139,14 +139,6 @@ struct vfloat4
 		return vfloat4(p);
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -233,7 +225,7 @@ struct vint4
 	/**
 	 * @brief Construct from 4 scalar values replicated across all lanes.
 	 *
-	 * Consider using vint4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -354,7 +346,7 @@ struct vmask4
 	/**
 	 * @brief Get the scalar value of a single lane.
 	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
 	{
 		return m[l] != 0;
 	}
@@ -420,10 +412,26 @@ ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
  */
 ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 {
-	return ((a.m[0] >> 31) & 0x1) |
-	       ((a.m[1] >> 30) & 0x2) |
-	       ((a.m[2] >> 29) & 0x4) |
-	       ((a.m[3] >> 28) & 0x8);
+	return (a.m[0] & 0x1) |
+	       (a.m[1] & 0x2) |
+	       (a.m[2] & 0x4) |
+	       (a.m[3] & 0x8);
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
 }
 
 // ============================================================================
@@ -638,14 +646,6 @@ ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 	return vint4(std::max(b, c));
 }
 
-/**
- * @brief Return the horizontal sum of vector lanes as a scalar.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	return a.m[0] + a.m[1] + a.m[2] + a.m[3];
-}
-
 /**
  * @brief Store a vector to an aligned memory address.
  */
@@ -684,29 +684,23 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	return vint4(base[indices.m[0]],
-	             base[indices.m[1]],
-	             base[indices.m[2]],
-	             base[indices.m[3]]);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 	int b0 = a.m[0] & 0xFF;
 	int b1 = a.m[1] & 0xFF;
 	int b2 = a.m[2] & 0xFF;
 	int b3 = a.m[3] & 0xFF;
 
+#if !defined(ASTCENC_BIG_ENDIAN)
 	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
-	return vint4(b, 0, 0, 0);
+#else
+	int b = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
+#endif
+	a = vint4(b, 0, 0, 0);
+	store_nbytes(a, p);
 }
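Both branches target the same memory layout, namely the low byte of lane 0 at p[0] through lane 3 at p[3]; only the order in which the 32-bit word is assembled changes, so that the host's native store serialises it correctly. An endian-agnostic scalar equivalent, shown only for illustration:

#include <cstdint>

// Illustrative only: writing each lane's low byte directly produces the same
// four bytes as either branch above, regardless of host byte order.
static inline void pack_low_bytes_model(const int v[4], uint8_t* p)
{
	for (int i = 0; i < 4; i++)
	{
		p[i] = static_cast<uint8_t>(v[i] & 0xFF);
	}
}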
 
 /**
@@ -934,17 +928,6 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
-	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
-	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
-	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
@@ -956,6 +939,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 	               base[indices.m[3]]);
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	return vfloat4(base[indices[0]],
+	               base[indices[1]],
+	               base[indices[2]],
+	               base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1080,84 +1075,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
 	return r;
 }
 
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+	const uint8_t* data;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+	const uint8_t* data;
+};
+
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
-	t0p = t0;
-	t1p = t1;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
+	table.data = data;
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
-	uint8_t table[16];
-
-	std::memcpy(table +  0, t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
-
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
-	uint8_t table[32];
-
-	std::memcpy(table +  0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
-	uint8_t table[64];
-
-	std::memcpy(table +  0, t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& table,
+	vint4 idx
+) {
+	return vint4(table.data[idx.lane<0>()],
+	             table.data[idx.lane<1>()],
+	             table.data[idx.lane<2>()],
+	             table.data[idx.lane<3>()]);
 }
 
 /**
@@ -1170,7 +1175,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3
  */
 ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
 {
+#if !defined(ASTCENC_BIG_ENDIAN)
 	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+#else
+	return a + lsl<8>(b) + lsl<16>(g) + lsl<24>(r);
+#endif
 }
 
 /**

+ 155 - 130
thirdparty/astcenc/astcenc_vecmathlib_sse_4.h

@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -142,14 +142,6 @@ struct vfloat4
 		return vfloat4(_mm_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(_mm_set_ps(3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -229,7 +221,7 @@ struct vint4
 	/**
 	 * @brief Construct from 1 scalar value replicated across all lanes.
 	 *
-	 * Consider using vfloat4::zero() for constexpr zeros.
+	 * Consider using zero() for constexpr zeros.
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(int a)
 	{
@@ -436,6 +428,22 @@ ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
 	return static_cast<unsigned int>(_mm_movemask_ps(a.m));
 }
 
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
 // ============================================================================
 // vint4 operators and functions
 // ============================================================================
@@ -598,9 +606,9 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
  */
 ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
 {
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /*
@@ -608,25 +616,9 @@ ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
  */
 ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
 {
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
-	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
-	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
-}
-
-/**
- * @brief Return the horizontal sum of a vector as a scalar.
- */
-ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
-{
-	// Add top and bottom halves, lane 1/0
-	__m128i fold = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(a.m),
-	                                              _mm_castsi128_ps(a.m)));
-	__m128i t = _mm_add_epi32(a.m, fold);
-
-	// Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
-	t = _mm_add_epi32(t, _mm_shuffle_epi32(t, 0x55));
-
-	return _mm_cvtsi128_si32(t);
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
+	return a;
 }
 
 /**
@@ -663,32 +655,20 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-#if ASTCENC_AVX >= 2
-	return vint4(_mm_i32gather_epi32(base, indices.m, 4));
-#else
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
-#endif
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
-ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
 {
 #if ASTCENC_SSE >= 41
 	__m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
-	return vint4(_mm_shuffle_epi8(a.m, shuf));
+	a = vint4(_mm_shuffle_epi8(a.m, shuf));
+	store_nbytes(a, p);
 #else
 	__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
 	__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
-	return vint4(_mm_unpacklo_epi16(va, vb));
+	a = vint4(_mm_unpacklo_epi16(va, vb));
+	store_nbytes(a, p);
 #endif
 }
 
@@ -899,25 +879,12 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
 #endif
 }
 
-/**
- * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
- */
-ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
-{
-#if ASTCENC_SSE >= 41
-	return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
-#else
-	__m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
-	return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)));
-#endif
-}
-
 /**
  * @brief Load a vector of gathered results from an array;
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
-#if ASTCENC_AVX >= 2
+#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
 	return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
 #else
 	alignas(16) int idx[4];
@@ -926,6 +893,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 #endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+	// Experimentally, in this particular use case (byte indices in memory),
+	// using 4 separate scalar loads is appreciably faster than using gathers
+	// even if they're available, on every x86 uArch tried, so always do the
+	// separate loads even when ASTCENC_X86_GATHERS is enabled.
+	//
+	// Tested on:
+	//   - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
+	//   - AMD Zen 2, Zen 4
+	return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
@@ -1054,136 +1038,173 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
 	return vfloat4(_mm_castsi128_ps(v.m));
 }
 
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable4_16x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable4_32x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+#else
+	const uint8_t* data;
+#endif
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable4_64x8 {
+#if ASTCENC_SSE >= 41
+	vint4 t0;
+	vint4 t1;
+	vint4 t2;
+	vint4 t3;
+#else
+	const uint8_t* data;
+#endif
+};
+
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
-{
-	t0p = t0;
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_16x8& table,
+	const uint8_t* data
+) {
+#if ASTCENC_SSE >= 41
+	table.t0 = vint4::load(data);
+#else
+	table.data = data;
+#endif
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
  */
-ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
-{
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable4_32x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+
+	table.t1 = table.t1 ^ table.t0;
 #else
-	t0p = t0;
-	t1p = t1;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
  */
 ASTCENC_SIMD_INLINE void vtable_prepare(
-	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
-	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
-{
+	vtable4_64x8& table,
+	const uint8_t* data
+) {
 #if ASTCENC_SSE >= 41
-	t0p = t0;
-	t1p = t0 ^ t1;
-	t2p = t1 ^ t2;
-	t3p = t2 ^ t3;
+	table.t0 = vint4::load(data);
+	table.t1 = vint4::load(data + 16);
+	table.t2 = vint4::load(data + 32);
+	table.t3 = vint4::load(data + 48);
+
+	table.t3 = table.t3 ^ table.t2;
+	table.t2 = table.t2 ^ table.t1;
+	table.t1 = table.t1 ^ table.t0;
 #else
-	t0p = t0;
-	t1p = t1;
-	t2p = t2;
-	t3p = t3;
+	table.data = data;
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_16x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	return vint4(result);
 #else
-	uint8_t table[16];
-
-	std::memcpy(table +  0, &t0.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_32x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 
 	return vint4(result);
 #else
-	uint8_t table[32];
-
-	std::memcpy(table +  0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
 
 /**
- * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
  */
-ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
-{
+ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
+	const vtable4_64x8& tbl,
+	vint4 idx
+) {
 #if ASTCENC_SSE >= 41
 	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
 	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
 
-	__m128i result = _mm_shuffle_epi8(t0.m, idxx);
+	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t2.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
 	result = _mm_xor_si128(result, result2);
 	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
 
-	result2 = _mm_shuffle_epi8(t3.m, idxx);
+	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
 	result = _mm_xor_si128(result, result2);
 
 	return vint4(result);
 #else
-	uint8_t table[64];
-
-	std::memcpy(table +  0, &t0.m, 4 * sizeof(int));
-	std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
-	std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
-	std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
-
-	return vint4(table[idx.lane<0>()],
-	             table[idx.lane<1>()],
-	             table[idx.lane<2>()],
-	             table[idx.lane<3>()]);
+	return vint4(tbl.data[idx.lane<0>()],
+	             tbl.data[idx.lane<1>()],
+	             tbl.data[idx.lane<2>()],
+	             tbl.data[idx.lane<3>()]);
 #endif
 }
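Across all backends the reworked interface is the same: callers build an opaque per-ISA table struct once from the raw byte table and then reuse it for lookups, instead of passing four pre-split vint4 registers per call. A minimal usage sketch, assuming astcenc_vecmathlib.h is included; quant_table is a made-up name, not an identifier from this diff:

// Illustrative only.
static vint4 remap_indices(const uint8_t quant_table[64], vint4 idx)
{
	vtable4_64x8 table;
	vtable_prepare(table, quant_table);     // one-time setup per table
	return vtable_lookup_32bit(table, idx); // per-lane byte lookup
}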
 
@@ -1307,7 +1328,11 @@ ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
  */
 ASTCENC_SIMD_INLINE int popcount(uint64_t v)
 {
+#if !defined(__x86_64__) && !defined(_M_AMD64)
+	return static_cast<int>(__builtin_popcountll(v));
+#else
 	return static_cast<int>(_mm_popcnt_u64(v));
+#endif
 }
 
 #endif // ASTCENC_POPCNT >= 1

+ 1101 - 0
thirdparty/astcenc/astcenc_vecmathlib_sve_8.h

@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 8x32-bit vectors, implemented using SVE.
+ *
+ * This module implements 8-wide 32-bit float, int, and mask vectors for Arm
+ * SVE.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ */
+
+#ifndef ASTC_VECMATHLIB_SVE_8_H_INCLUDED
+#define ASTC_VECMATHLIB_SVE_8_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+typedef svbool_t svbool_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint8_t svuint8_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint16_t svuint16_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint32_t svuint32_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svint32_t svint32_8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svfloat32_t svfloat32_8_t __attribute__((arm_sve_vector_bits(256)));
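These typedefs pin the scalable SVE types to a fixed 256-bit width, which is only valid when the translation unit is built for a known vector length (for example -msve-vector-bits=256 on GCC and Clang, which defines __ARM_FEATURE_SVE_BITS). A guard along these lines would make that assumption explicit; it is a sketch, not part of the upstream header:

#if defined(__ARM_FEATURE_SVE_BITS)
	static_assert(__ARM_FEATURE_SVE_BITS == 256,
	              "The 8-wide SVE backend assumes a fixed 256-bit SVE vector length");
#endif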
+
+// ============================================================================
+// vfloat8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide floats.
+ */
+struct vfloat8
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
+	{
+		m = svld1_f32(svptrue_b32(), p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
+	{
+		m = svdup_f32(a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(svfloat32_8_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 zero()
+	{
+		return vfloat8(0.0f);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
+	{
+		return vfloat8(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 32B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
+	{
+		return vfloat8(p);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	svfloat32_8_t m;
+};
+
+// ============================================================================
+// vint8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide ints.
+ */
+struct vint8
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
+	{
+		m = svld1_s32(svptrue_b32(), p);
+	}
+
+	/**
+	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
+	{
+		// Load 8-bit values and expand to 32-bits
+		m = svld1ub_s32(svptrue_b32(), p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(int a)
+	{
+		m = svdup_s32(a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint8(svint32_8_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 zero()
+	{
+		return vint8(0);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
+	{
+		return vint8(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from unaligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
+	{
+		svuint8_8_t data = svld1_u8(svptrue_b8(), p);
+		return vint8(svreinterpret_s32_u8(data));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 32B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
+	{
+		return vint8(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint8 lane_id()
+	{
+		return vint8(svindex_s32(0, 1));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	 svint32_8_t m;
+};
+
+// ============================================================================
+// vmask8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide control plane masks.
+ */
+struct vmask8
+{
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(svbool_8_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
+	{
+		m = svdup_b32(a);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	svbool_8_t m;
+};
+
+// ============================================================================
+// vmask8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
+{
+	return vmask8(svorr_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
+{
+	return vmask8(svand_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
+{
+	return vmask8(sveor_z(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
+{
+	return vmask8(svnot_z(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a 8-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
+{
+	alignas(32) const int shifta[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
+	svint32_8_t template_vals = svld1_s32(svptrue_b32(), shifta);
+	svint32_8_t active_vals = svsel_s32(a.m, template_vals, svdup_s32(0));
+	return static_cast<unsigned int>(svaddv_s32(svptrue_b32(), active_vals));
+}
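SVE has no direct movemask equivalent, so the function above weights each active lane by its bit value and horizontally adds the survivors. The same computation in scalar form, for illustration only:

// Illustrative scalar model of mask(): svsel keeps the weight of active lanes
// (zero otherwise) and svaddv sums them into the final bitmask.
static inline unsigned int mask8_model(const bool lane_active[8])
{
	static const int weights[8] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
	int sum = 0;
	for (int i = 0; i < 8; i++)
	{
		sum += lane_active[i] ? weights[i] : 0;
	}
	return static_cast<unsigned int>(sum);
}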
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask8 a)
+{
+	return svptest_any(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask8 a)
+{
+	return !svptest_any(svptrue_b32(), (~a).m);
+}
+
+// ============================================================================
+// vint8 operators and functions
+// ============================================================================
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
+{
+	return vint8(svadd_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
+{
+	return vint8(svsub_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
+{
+	return vint8(svmul_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
+{
+	return vint8(svnot_s32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
+{
+	return vint8(svorr_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
+{
+	return vint8(svand_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
+{
+	return vint8(sveor_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
+{
+	return vmask8(svcmpeq_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
+{
+	return vmask8(svcmpne_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
+{
+	return vmask8(svcmplt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
+{
+	return vmask8(svcmpgt_s32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
+{
+	return vint8(svlsl_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
+{
+	return vint8(svasr_n_s32_x(svptrue_b32(), a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
+{
+	svuint32_8_t r = svreinterpret_u32_s32(a.m);
+	r = svlsr_n_u32_x(svptrue_b32(), r, s);
+	return vint8(svreinterpret_s32_u32(r));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
+{
+	return vint8(svmin_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
+{
+	return vint8(svmax_s32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
+{
+	return vint8(svminv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+	return svminv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
+{
+	return vint8(svmaxv_s32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+	return svmaxv_s32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Generate a vint8 from a size_t.
+ */
+ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+{
+	assert(a <= std::numeric_limits<int>::max());
+	return vint8(static_cast<int>(a));
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
+{
+	svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
+{
+	svst1_s32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
+{
+	svuint8_8_t r = svreinterpret_u8_s32(a.m);
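+	// SV_VL8 limits the predicate to the first 8 byte lanes of the register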
+	svst1_u8(svptrue_pat_b8(SV_VL8), p, r);
+}
+
+/**
+ * @brief Pack and store the low 8 bits of each lane to an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
+{
+	svuint32_8_t data = svreinterpret_u32_s32(v.m);
+	svst1b_u32(svptrue_b32(), p, data);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
+{
+	return vint8(svsel_s32(cond.m, b.m, a.m));
+}
+
+// ============================================================================
+// vfloat8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svadd_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
+{
+	a = a + b;
+	return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svsub_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svmul_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
+{
+	return vfloat8(svmul_f32_x(svptrue_b32(), a.m, svdup_f32(b)));
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
+{
+	return vfloat8(svmul_f32_x(svptrue_b32(), svdup_f32(a), b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
+{
+	return vfloat8(svdiv_f32_x(svptrue_b32(), a.m, svdup_f32(b)));
+}
+
+/**
+ * @brief Overload: scalar by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
+{
+	return vfloat8(svdiv_f32_x(svptrue_b32(), svdup_f32(a), b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpeq_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpne_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmplt_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpgt_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmple_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
+{
+	return vmask8(svcmpge_f32(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svminnm_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
+{
+	return min(a, vfloat8(b));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
+{
+	return vfloat8(svmaxnm_f32_x(svptrue_b32(), a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, the other lane will be returned.
+ */
+ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
+{
+	return max(a, vfloat8(b));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c minv nor @c maxv is a NaN. If @c a is NaN
+ * then @c minv will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clamp(float minv, float maxv, vfloat8 a)
+{
+	return min(max(a, minv), maxv);
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
+{
+	return clamp(0.0f, 1.0f, a);
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
+{
+	return vfloat8(svabs_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ */
+ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
+{
+	return vfloat8(svrintn_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
+{
+	return vfloat8(svminnmv_f32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
+{
+	return svminnmv_f32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
+{
+	return vfloat8(svmaxnmv_f32(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
+{
+	return svmaxnmv_f32(svptrue_b32(), a.m);
+}
+
+/**
+ * @brief Return the horizontal sum of a vector.
+ */
+ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
+{
+	// Can't use svaddv - it's not invariant
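+	// (svaddv is a recursive pairwise reduction, so its rounding order
+	// differs from the folded 4-wide sum that keeps results bit-exact
+	// across SIMD widths)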
+	vfloat4 lo(svget_neonq_f32(a.m));
+	vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4)));
+	return hadd_s(lo) + hadd_s(hi);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
+{
+	return vfloat8(svsel_f32(cond.m, b.m, a.m));
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
+ *
+ * This is invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
+{
+	vfloat4 lo(svget_neonq_f32(a.m));
+	haccumulate(accum, lo);
+
+	vfloat4 hi(svget_neonq_f32(svext_f32(a.m, a.m, 4)));
+	haccumulate(accum, hi);
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector.
+ *
+ * This is NOT invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
+{
+	accum += a;
+}
+
+/**
+ * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
+ *
+ * This is invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
+{
+	a = select(vfloat8::zero(), a, m);
+	haccumulate(accum, a);
+}
+
+/**
+ * @brief Accumulate masked lane-wise sums for a vector.
+ *
+ * This is NOT invariant with 4-wide implementations.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
+{
+	accum.m = svadd_f32_m(m.m, accum.m, a.m);
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
+{
+	return vfloat8(svsqrt_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array.
+ */
+ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
+{
+	return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory.
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
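+	// svld1ub zero-extends each 8-bit index into a 32-bit gather offset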
+	svint32_t offsets = svld1ub_s32(svptrue_b32(), indices);
+	return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets));
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
+{
+	svst1_f32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
+{
+	svst1_f32(svptrue_b32(), p, a.m);
+}
+
+/**
+ * @brief Return an integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
+{
+	return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return an integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
+{
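+	// Adding 0.5 before the truncating convert rounds to nearest for the
+	// non-negative values this codec feeds in (ties round upwards)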
+	a = a + vfloat8(0.5f);
+	return vint8(svcvt_s32_f32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a float value for an integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
+{
+	return vfloat8(svcvt_f32_s32_x(svptrue_b32(), a.m));
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
+{
+	return vint8(svreinterpret_s32_f32(a.m));
+}
+
+/**
+ * @brief Return an integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
+{
+	return vfloat8(svreinterpret_f32_s32(a.m));
+}
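+
+// Illustrative sketch, not part of the upstream header: the reinterpret pair
+// above enables branch-free IEEE 754 bit tricks, for example an absolute
+// value computed by clearing the sign bit in the integer domain.
+ASTCENC_SIMD_INLINE vfloat8 example_abs_via_bits(vfloat8 a)
+{
+	// Clearing bit 31 of every lane leaves the bit pattern of |a|
+	return int_as_float(float_as_int(a) & vint8(0x7FFFFFFF));
+}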
+
+/*
+ * Table structure for a 16x 8-bit entry table.
+ */
+struct vtable8_16x8 {
+	svuint8_8_t t0;
+};
+
+/*
+ * Table structure for a 32x 8-bit entry table.
+ */
+struct vtable8_32x8 {
+	svuint8_8_t t0;
+};
+
+/*
+ * Table structure for a 64x 8-bit entry table.
+ */
+struct vtable8_64x8 {
+	svuint8_8_t t0;
+	svuint8_8_t t1;
+};
+
+/**
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_16x8& table,
+	const uint8_t* data
+) {
+	// Top half of register will be zeros
+	table.t0 = svld1_u8(svptrue_pat_b8(SV_VL16), data);
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_32x8& table,
+	const uint8_t* data
+) {
+	table.t0 = svld1_u8(svptrue_b8(), data);
+}
+
+/**
+ * @brief Prepare a vtable lookup table for 64x 8-bit entry table.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vtable8_64x8& table,
+	const uint8_t* data
+) {
+	table.t0 = svld1_u8(svptrue_b8(), data);
+	table.t1 = svld1_u8(svptrue_b8(), data + 32);
+}
+
+/**
+ * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_16x8& tbl,
+	vint8 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
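+	// (OR-ing with 0xFFFFFF00 pushes the three high bytes of every 32-bit
+	// index out of range, so svtbl returns zero for them and only the low
+	// byte of each lane selects a table entry)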
+	svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+	svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+
+	svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
+	return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_32x8& tbl,
+	vint8 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+	svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
+
+	svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
+	return vint8(svreinterpret_s32_u8(result));
+}
+
+/**
+ * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
+ *
+ * Future: SVE2 can directly do svtbl2_u8() for a two register table.
+ */
+ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
+	const vtable8_64x8& tbl,
+	vint8 idx
+) {
+	// Set index byte above max index for unused bytes so table lookup returns zero
+	svint32_8_t idxm = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
+
+	svuint8_8_t idxm8 = svreinterpret_u8_s32(idxm);
+	svuint8_8_t t0_lookup = svtbl_u8(tbl.t0, idxm8);
+
+	idxm8 = svsub_u8_x(svptrue_b8(), idxm8, svdup_u8(32));
+	svuint8_8_t t1_lookup = svtbl_u8(tbl.t1, idxm8);
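+	// Lanes with indices 0-31 hit t0 and read zero from t1 (the subtract
+	// wraps them out of range); lanes with 32-63 read zero from t0 and hit
+	// t1, so OR-ing the two partial lookups merges the results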
+
+	svuint8_8_t result = svorr_u8_x(svptrue_b8(), t0_lookup, t1_lookup);
+	return vint8(svreinterpret_s32_u8(result));
+}
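+
+// Illustrative usage sketch, not part of the upstream header: prepare a
+// 64-entry byte table once and gather one byte per 32-bit lane. The helper
+// name and table contents are hypothetical.
+ASTCENC_SIMD_INLINE vint8 example_table_gather(const uint8_t table_data[64], vint8 idx)
+{
+	vtable8_64x8 tbl;
+	vtable_prepare(tbl, table_data);
+	return vtable_lookup_32bit(tbl, idx);
+}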
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
+{
+	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
+}
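+
+// Illustrative usage sketch, not part of the upstream header: pack eight
+// decoded texels and write them to an RGBA8 buffer with a single unaligned
+// 32-byte store. The helper name is an assumption for illustration.
+ASTCENC_SIMD_INLINE void example_write_rgba8(uint8_t* dst, vint8 r, vint8 g, vint8 b, vint8 a)
+{
+	vint8 rgba = interleave_rgba8(r, g, b, a);
+	store(rgba, reinterpret_cast<int*>(dst));
+}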
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of the vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
+{
+	svst1_s32(mask.m, reinterpret_cast<int32_t*>(base), data.m);
+}
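+
+// Illustrative usage sketch, not part of the upstream header: build a mask
+// covering the first count lanes and store only those lanes (vint8::lane_id()
+// is assumed available here, since the generic code relies on vint::lane_id()).
+ASTCENC_SIMD_INLINE void example_store_partial(uint8_t* dst, vint8 data, int count)
+{
+	vmask8 mask = vint8::lane_id() < vint8(count);
+	store_lanes_masked(dst, data, mask);
+}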
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint8 a)
+{
+	alignas(32) int v[8];
+	storea(a, v);
+	printf("v8_i32:\n  %8d %8d %8d %8d %8d %8d %8d %8d\n",
+	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void printx(vint8 a)
+{
+	alignas(32) int v[8];
+	storea(a, v);
+	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat8 a)
+{
+	alignas(32) float v[8];
+	storea(a, v);
+	printf("v8_f32:\n  %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
+	       static_cast<double>(v[0]), static_cast<double>(v[1]),
+	       static_cast<double>(v[2]), static_cast<double>(v[3]),
+	       static_cast<double>(v[4]), static_cast<double>(v[5]),
+	       static_cast<double>(v[6]), static_cast<double>(v[7]));
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ */
+ASTCENC_SIMD_INLINE void print(vmask8 a)
+{
+	print(select(vint8(0), vint8(1), a));
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_SVE_8_H_INCLUDED

+ 37 - 20
thirdparty/astcenc/astcenc_weight_align.cpp

@@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>
 
 static constexpr unsigned int ANGULAR_STEPS { 32 };
 
@@ -104,14 +105,17 @@ static void compute_angular_offsets(
 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		// Add 2^23 and interpreting bits extracts round-to-nearest int
-		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
-		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		// Ideal weight can be outside [0, 1] range, so clamp to fit table
+		vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
+
+		// Convert a weight to a sincos table index
+		vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
+		vint isample = float_to_int_rtn(sample);
 		storea(isample, isamplev + i);
 	}
 
 	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
-	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+	vfloat mult(1.0f / (2.0f * astc::PI));
 
 	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
 	{
@@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);
 
-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
+
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+
+	vint lane_id = vint::lane_id();
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vmask active = lane_id < vint(weight_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, select(min_weight, weights, active));
+		max_weight = max(max_weight, select(max_weight, weights, active));
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
 
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);
 
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
@@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight(
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;
 
-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
 
-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);